/* make_random_seq
Program:   make_random_seq
Version:   1.0.1
Date:      03-AUG-2008
Author:    David Mathog, Biology Division, Caltech
email:     mathog@caltech.edu
Copyright: 2008 David Mathog and California Institute of Technology (Caltech)

Description:

    This is an C implementation of the make_random_dna program that was mentioned
    in:
    
    Sequence analysis of the cis-regulatory regions of the bithorax complex of Drosophila.
    E B Lewis, J D Knafels, D R Mathog, and S E Celniker
    Proc Natl Acad Sci U S A. 1995 August 29; 92(18)
	
    The original was written in Fortran for use with the GCG package and so could not be
    distributed.  The new version also handles the protein, ASCII, and custom alphabets
    (subject to run size limitations).
    
    This program is a Markov chain generator for character sequences.  
    It first obtains a transition table for the selected order.
    This is either read directly from a data table, or derived on the fly from an
    input sequence.  It then picks a starting sequence, and using a random number generator
    emits jumps around in the transition table, emitting as many sequences/lengths as 
    desired.  Input and output sequence format is only fasta.  The format for the
    transition tables is described in the emit_hformat() function.

    Compiles cleanly with:
    
    gcc -Wall -std=c99 -lm -o make_random_seq make_random_seq.c
    
Changes:

  1.0.1 03-AUG-2008, fixed typo in -h output
  1.0.0 05-JUN-2008, first release
  
License:  GNU General Public License 2
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

*/

#include <stdlib.h>
#include <stdio.h>
#include <limits.h>           /* for INT_MAX            */
#include <ctype.h>            /* for toupper()          */
#include <string.h>           /* for strlen() and such  */
#include <math.h>             /* for pow()              */
#include <time.h>             /* for time()             */
#include <sys/time.h>         /* for gettimeofday()     */
#include <unistd.h>

/* definitions */
#define EXVERSTRING "1.0.1 03-AUG-2008"
#define COPYSTRING  "2008 David Mathog and California Institute of Technology"
#define BUGSTRING   "mathog@caltech.edu"
#define LICSTRING   "GNU General Public License 2"


#define INTNOTSET        -1

#define TYPE_NONE         0
#define TYPE_NUCLEIC      1
#define TYPE_PROTEIN      2
#define TYPE_ASCII        3
#define TYPE_CUSTOM       4

#define PROC_FASTA_FIRST  0
#define PROC_FASTA_ALL    1
#define PROC_ALL          2
#define PROC_MIN          PROC_FASTA_FIRST
#define PROC_MAX          PROC_ALL
    
#define IN_SEQ            1
#define IN_DATA           2

#define NOT_IN_ALPHABET 255  /* Reserved lookup value, indicates matching letter is not defined */
#define MAX_ALPHABET    255  /* Theoretically could use every single ascii letter except one, reserved for "not valid" */
#define ALPHABET_SPACE  256
#define MAX_HEADERLINE 1024  /* this is the longest comment or data header line */
#define MAX_ORDER        16  /* the largest order, allowed */
#define DEF_PROTEIN_ORDER     1
#define DEF_NUCLEIC_ORDER     3
#define DEF_ASCII_ORDER       2
#define PROTEIN_ALPHABET     "ACDEFGHIKLMNPQRSTVWY"
#define PROTEIN_ALPHABET_LEN  20
#define NUCLEIC_ALPHABET     "ACGT"
#define NUCLEIC_ALPHABET_LEN  4
#define ASCII_ALPHABET_LEN    NOT_IN_ALPHABET-1

typedef struct is_transition TRANSITION_NEXUS;
typedef struct is_table      TABLE_NEXUS;

struct is_transition {
  unsigned int *counts;          /* There will be C+1 of these, where C is the number of letters in the alphabet */
};

struct is_table {
  unsigned char      *alphabet;       /* alphabet used to construct the table. This is NOT a null terminated string! */
  unsigned char       lookup[ALPHABET_SPACE];    /* provide a fast way to convert from a letter to its alphabet position value  */
                                      /* value 255 means "not set"                                                   */
  unsigned int        anum;           /* characters in alphabet                                                      */
  unsigned int        order;          /* markov order                                                                */
  unsigned int        most_likely;    /* largest value (not position of value) in transitions->counts, used for emit */
  unsigned long long  entries;        /* total number of lines in transitions and totals == pow(anum,order)          */
  unsigned long long  sum_total;      /* sum of all totals                                                           */
  TRANSITION_NEXUS   *transitions;    /* there will be anum^order of these                                           */
  unsigned int       *totals;         /* totals for each set of transitions                                          */
};

/* function prototypes */
        int  compare_uchar(const void *s1, const void *s2);
TABLE_NEXUS *create_transition_table(void);
       void  emit_help(void);
       void  emit_hexamples(void);
       void  emit_hformat(void);
       void  emit_hinput(void);
       void  emit_random(TABLE_NEXUS *table);
       void  emit_transition_table(TABLE_NEXUS *table);
       void  insane(char *string);
        int  lcl_strcasecmp(char *s1, char *s2);
TABLE_NEXUS *load_transition_table(void);
       void  process_command_line_args(int argc,char **argv);
       void  read_data_header_line(void);
TABLE_NEXUS *read_into_transition_table(void);
       void  read_table_lines(TABLE_NEXUS *table);
       void  setintrange(int *val,int *numarg,int imin, int imax, int argc,char **argv,char * label);
       void  setstring(char **val,int *numarg,int argc,char **argv,char * label);
       void  sort_alphabet(unsigned char *string, int slen);
       void  transition_table_totals(TABLE_NEXUS *table);

/* global variables */
int   gbl_anum     = INTNOTSET;        /* length of custom alphabet                                     */
char *gbl_custom   = NULL;             /* Custom alphabet (only Null terminated from command line       */
FILE *gbl_in       = NULL;             /* set by process_command_line                                   */
char *gbl_indata   = NULL;
char *gbl_infile   = NULL;
int   gbl_inmode   = INTNOTSET;        /* set by process_command_line                                   */
int   gbl_len      = INTNOTSET;        /* take length from input                                        */
int   gbl_order    = INTNOTSET;        /* default is 3 for DNA, 1 for protein                           */
FILE *gbl_out      = NULL;             /* MAY be set by process_command_line                            */
FILE *gbl_outdata  = NULL;             /* MAY be set by process_command_line                            */
char *gbl_outdfile = NULL;
char *gbl_outsfile = NULL;
int   gbl_proc     = PROC_FASTA_FIRST; /* process only first entry in a fasta file                      */
int   gbl_rep      = 1;                /* default is one sequence in output                             */
int   gbl_seed      =INTNOTSET;        /* initial by time if not set                                    */
int   gbl_type     = TYPE_NONE;        /* default varies with other inputs                              */
int   gbl_wrap     = 50;               /* line wrap                                                     */
int   gbl_ucase    = 1;                /* Default, uppercase all letters                                */

/* functions */

void  insane(char *string){
 (void) fprintf(stderr,"%s\n",string);
 exit(EXIT_FAILURE);
}

int  lcl_strcasecmp(char *s1, char *s2){
int c1;
int c2;
  for(; ;s1++,s2++){
    c1=toupper(*s1);
    c2=toupper(*s2);
    if(c1 < c2)return -1;
    if(c1 > c2)return  1;
    if(c1 == 0)return  0;  /*c2 also is 0 in this case */
  }
}

/*Return an integer within the specified min/max range (inclusive) */
void  setintrange(int *val,int *numarg,int imin, int imax, int argc,char **argv,char * label){
      (*numarg)++;
      if( ( *numarg >= argc ) || (argv[*numarg] == NULL)){
        (void) fprintf( stderr, "%s: missing argument\n",label);
        exit(EXIT_FAILURE);
      }
      if(sscanf(argv[*numarg],"%d",val) != 1){
        (void) fprintf(stderr,"Bad integer argument/parameter [%s %s] \n",label,argv[*numarg]);
        exit(EXIT_FAILURE);
      }
      if(*val < imin || *val > imax){
        (void) fprintf(stderr,"Illegal integer argument/parameter. out of range [%s %s] \n",label,argv[*numarg]);
        exit(EXIT_FAILURE);
      }
}

void  setstring(char **val,int *numarg,int argc,char **argv,char * label){
      (*numarg)++;
      if( ( *numarg >= argc ) || (argv[*numarg] == NULL)){
        (void) fprintf( stderr, "%s: missing argument\n",label);
        exit(EXIT_FAILURE);
      }
      *val = argv[*numarg];
}

/* used in qsort to sort custom alphabet */
int  compare_uchar(const void *s1, const void *s2){
unsigned const char *p1 = s1, *p2 = s2;
  if( *p1 < *p2)return -1;
  if( *p1 > *p2)return  1;
  return 0;
}


/* custom alphabet put letters into order.  If any duplications are found it is a fatal error. */
void  sort_alphabet(unsigned char *string, int slen){
unsigned char c;
  qsort((void *)string,slen,sizeof(unsigned char),compare_uchar);
  for(c = *string, string++, slen--; slen ; string++, slen-- ){
    if(c==*string)insane("make_random_seq: fatal error: custom alphabet contained duplicate letters");
    c=*string;
  }
}

void  process_command_line_args(int argc,char **argv){
int numarg;


  for(numarg=1;numarg<argc;numarg++){
    if(0==lcl_strcasecmp(argv[numarg], "-a")){
      if(gbl_type != TYPE_NONE)insane("make_random_seq: fatal error: alphabet type specified twice");
      gbl_type = TYPE_ASCII;
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-c")){
      if(gbl_type != TYPE_NONE)insane("make_random_seq: fatal error: alphabet type specified twice");
      gbl_type = TYPE_CUSTOM;
      numarg++;
      if(strlen(gbl_custom) > MAX_ALPHABET)insane("make_random_seq: fatal error: custom alphabet is too long");
      gbl_custom=argv[numarg];
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-case")){
      gbl_ucase=0;
    }
    else if( (lcl_strcasecmp(argv[numarg], "-h")==0)     ||
             (lcl_strcasecmp(argv[numarg], "--h")==0)    ||
             (lcl_strcasecmp(argv[numarg], "-?")==0)     ||
             (lcl_strcasecmp(argv[numarg], "--?")==0)    ||
             (lcl_strcasecmp(argv[numarg], "-help")==0)  ||
             (lcl_strcasecmp(argv[numarg], "--help")==0) ){
       emit_help();
       exit(EXIT_SUCCESS);
    }
    else if( lcl_strcasecmp(argv[numarg], "-hexamples")==0){
      emit_hexamples();
      exit(EXIT_FAILURE);
    }
    else if( lcl_strcasecmp(argv[numarg], "-hformat")==0){
      emit_hformat();
      exit(EXIT_FAILURE);
    }
    else if( lcl_strcasecmp(argv[numarg], "-hinput")==0){
      emit_hinput();
      exit(EXIT_FAILURE);
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-i")){
      (void)fprintf(stderr,"Version:   %s\n",EXVERSTRING);
      (void)fprintf(stderr,"bugs to:   %s\n",BUGSTRING);
      (void)fprintf(stderr,"Copyright: %s\n",COPYSTRING);
      (void)fprintf(stderr,"License:   %s\n",LICSTRING);
      exit(EXIT_SUCCESS);
    }
    else if( 0==lcl_strcasecmp(argv[numarg], "-in")){
      if(gbl_infile)insane("make_random_seq: fatal error: -in specified twice");
      setstring(&gbl_infile,&numarg,argc,argv,"-in");
    }
    else if( 0==lcl_strcasecmp(argv[numarg], "-indata")){
      if(gbl_indata)insane("make_random_seq: fatal error: -indata specified twice");
      setstring(&gbl_indata,&numarg,argc,argv,"-indata");
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-inproc")){
      setintrange(&gbl_proc,&numarg,PROC_MIN,PROC_MAX,argc,argv,"-inproc");
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-len")){
      setintrange(&gbl_len,&numarg,0,INT_MAX,argc,argv,"-len");
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-n")){
      if(gbl_type != TYPE_NONE)insane("make_random_seq: fatal error: alphabet type specified twice");
      gbl_type = TYPE_NUCLEIC;
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-order")){
      setintrange(&gbl_order,&numarg,0,INT_MAX,argc,argv,"-order");
    }
    else if( 0==lcl_strcasecmp(argv[numarg], "-out")){
      if(gbl_outsfile)insane("make_random_seq: fatal error: -out specified twice");
      setstring(&gbl_outsfile,&numarg,argc,argv,"-out");
    }
    else if( 0==lcl_strcasecmp(argv[numarg], "-outdata")){
      if(gbl_outdfile)insane("make_random_seq: fatal error: -outdata specified twice");
      setstring(&gbl_outdfile,&numarg,argc,argv,"-outdata");
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-p")){
      if(gbl_type != TYPE_NONE)insane("make_random_seq: fatal error: alphabet type specified twice");
      gbl_type = TYPE_PROTEIN;
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-rep")){
      setintrange(&gbl_rep,&numarg,0,INT_MAX,argc,argv,"-rep");
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-seed")){
      setintrange(&gbl_seed,&numarg,INT_MIN,INT_MAX,argc,argv,"-seed");
    }
    else if(0==lcl_strcasecmp(argv[numarg], "-wrap")){
      setintrange(&gbl_wrap,&numarg,1,INT_MAX,argc,argv,"-wrap");
    }
    else{
      (void) fprintf(stderr,"make_random_seq: fatal error:no such parameter [%s]\n",argv[numarg]);
      emit_help();
      exit(EXIT_FAILURE);
    }
  }
  
  /* sanity checking */
  
  /* what is the input?  If nothing was specified then we are reading sequence from stdin */
  if(gbl_infile){
    if(gbl_indata){ insane("make_random_seq: fatal error: both -in and -indata were specified"); }
    else {          gbl_inmode = IN_SEQ;  }
  }
  else {
    if(gbl_indata){ gbl_inmode = IN_DATA; gbl_infile=gbl_indata; }
    else {          gbl_inmode = IN_SEQ;  }
  }
  
  if(!gbl_infile || (0==lcl_strcasecmp(gbl_infile,"-"))){ gbl_in = stdin; }
  else {
    gbl_in = fopen(gbl_infile,"r");
    if(!gbl_in)insane("make_random_seq: fatal error: could not open the specified input file"); 
  }

  /* what is the output?  It can be sequence, data, or both */
  if(gbl_rep){
    if(gbl_outsfile){
      if(0==lcl_strcasecmp(gbl_outsfile,"-")){ gbl_out = stdout; }
      else {
        gbl_out = fopen(gbl_outsfile,"w");
        if(!gbl_out)insane("make_random_seq: fatal error: could not open the specified output file"); 
      }
    }
    else {
      gbl_out = stdout;
    }
  }
  
  if(gbl_outdfile){
    if(0==lcl_strcasecmp(gbl_outdfile,"-")){ gbl_outdata = stdout; }
    else {
      gbl_outdata = fopen(gbl_outdfile,"w");
      if(!gbl_outdata)insane("make_random_seq: fatal error: could not open the specified output file"); 
    }
  }

  if(gbl_outdata== stdout && gbl_out == stdout){
    (void) fprintf(stderr,"fata_make_random: warning: both generated sequence and data are directed to stdout\n");
  }
  
  if(gbl_seed == INTNOTSET){
	struct timeval thetv;
        struct timeval *tp=&thetv;
        if (gettimeofday(tp,(struct timezone *)NULL) != 0)insane("make_random_seq: fatal error: could not get time to set random seed"); 
        gbl_seed = time(NULL)*1000000 + tp->tv_usec;
  }
  srand((unsigned int) gbl_seed);
  
  if(!gbl_ucase && gbl_type!=TYPE_CUSTOM)insane("make_random_seq: fatal error: -case may only be used with -c"); 
  
  /* if a table file is specified, the order comes from there.  If not, set the order based on -p, -n, or
     check that it was specified with -c */
  if(gbl_indata){
    if( gbl_order!=INTNOTSET)insane("make_random_seq: fatal error: -indata defines order, cannot also use -order"); 
    gbl_type=TYPE_CUSTOM;  /* read from table file */
  }
  else {
    if(gbl_order == INTNOTSET){
      switch(gbl_type){
        case TYPE_PROTEIN:
          gbl_order = DEF_PROTEIN_ORDER;
          break;
        case TYPE_NUCLEIC:
          gbl_order = DEF_NUCLEIC_ORDER;
          break;
        case TYPE_ASCII:
          gbl_order = DEF_ASCII_ORDER;
          break;
        case TYPE_CUSTOM:
          insane("make_random_seq: fatal error: -order must be specified with -c");    
          break;
        default:
          insane("make_random_seq: fatal programming error: invalid alphabet type");   
      }
    }
  }
  
  /* command line specified alphabet will be a null terminated string */
  if(gbl_custom)gbl_anum=strlen(gbl_custom);
  
  
}

void  emit_help(void){
  (void) fprintf(stderr,"make_random_seq command summary:\n\n");
  (void) fprintf(stderr,"   -in      file   read sequence from file, default or \"-\" from stdin,  Not with -indata.\n");
  (void) fprintf(stderr,"   -indata  file   read transition table, \"-\" from stdin.  Not with -in.\n");
  (void) fprintf(stderr,"   -out     file   write fasta sequence to file, default or \"-\" to stdout.\n");
  (void) fprintf(stderr,"   -outdata file   write transition table, \"-\" to stdout, default is not emitted.\n");
  (void) fprintf(stderr,"   -order N        Markov chain order, default is %d for Nucleic, %d for Protein.\n" ,DEF_NUCLEIC_ORDER,DEF_PROTEIN_ORDER);
  (void) fprintf(stderr,"                     N must be zero (composition analysis) or higher.\n");
  (void) fprintf(stderr,"   -a              Ascii alphabet (254 characters, hex 0->FE).\n");
  (void) fprintf(stderr,"   -p              Protein alphabet (20 characters: %s ).\n",PROTEIN_ALPHABET);
  (void) fprintf(stderr,"   -n              Nucleic acid alphabet (4 characters: %s, U is translated to T).\n", NUCLEIC_ALPHABET);
  (void) fprintf(stderr,"   -c LETTERS      Custom alphabet, specify up to 254 unique characters.\n");
  (void) fprintf(stderr,"   -case           Retain case for custom alphabet, default is force to upper.\n");
  (void) fprintf(stderr,"   -len N          Number of characters in each output sequence, must be at least order + 1\n");
  (void) fprintf(stderr,"                     Defaults to the length of the input file\n");
  (void) fprintf(stderr,"   -rep N          Number of sequences in sequence output, defaults to 1. Zero suppresses sequence output.\n");
  (void) fprintf(stderr,"   -inproc         Processing of sequence input, collect statistics from: \n");
  (void) fprintf(stderr,"                   0 first entry in a Fasta file (default)\n");
  (void) fprintf(stderr,"                   1 all entries in a Fasta file\n");
  (void) fprintf(stderr,"                   2 all of input file (raw sequence)\n");
  (void) fprintf(stderr,"   -wrap N         Wrap output sequence lines every Nth character.  Default is 50.\n");
  (void) fprintf(stderr,"   -seed N         Random number generator seed.  Default uses timeofday in microseconds.\n");
  (void) fprintf(stderr,"   -h              print this help message (also -help --h --help -? --?)\n");
  (void) fprintf(stderr,"   -hinput         print a description of the input sequenceformat.\n");
  (void) fprintf(stderr,"   -hexamples      print examples.\n");
  (void) fprintf(stderr,"   -hformat        print a description of the data file format.\n");
  (void) fprintf(stderr,"   -i              emit version, copyright, license and contact information.\n");
  exit(EXIT_FAILURE);
}

void  emit_hexamples(void){
  (void) fprintf(stderr,"%% make_random_seq -in dna.nfa -n -outdata dna.table                         \n");
  (void) fprintf(stderr,"    Read a Fasta formatted file containing DNA sequence, using a 3rd        \n");
  (void) fprintf(stderr,"    order Markov chain generator, create an equal sized random output       \n");
  (void) fprintf(stderr,"    sequence which is sent to stdout.  Additionally, save the transition    \n");
  (void) fprintf(stderr,"    data derived from the input in the file dna.table.                      \n");
  (void) fprintf(stderr,"                                                                            \n");
  (void) fprintf(stderr,"%% make_random_seq -indata dna.table -len 10000 -rep 5 -out dna_5_long.nfa   \n");
  (void) fprintf(stderr,"    Using the transition table from the previous command, generate a Fasta  \n");
  (void) fprintf(stderr,"    formatted file containing 5 random sequences of length  10000.          \n");
  (void) fprintf(stderr,"                                                                            \n");
  (void) fprintf(stderr,"%% make_random_seq -in special.txt -c ABCDEFGH -order 2                      \n");
  (void) fprintf(stderr,"    Generate a random sequence based on the custom alphabet shown           \n");
  (void) fprintf(stderr,"    using a 2nd order Markov chain generator with transitions determined    \n");
  (void) fprintf(stderr,"    from the input file.                                                    \n");
  (void) fprintf(stderr,"                                                                            \n");
  (void) fprintf(stderr,"%% make_random_seq -in macbeth.txt -a -inproc 2 -out gibberish_macbeth.txt   \n");
  (void) fprintf(stderr,"    Make a randomized version of Macbeth, using a 2nd order Markov chain    \n");
  (void) fprintf(stderr,"    generator.  The input is a text file consisting of ASCII characters.    \n");
  exit(EXIT_SUCCESS);
}

void  emit_hformat(void){
  (void) fprintf(stderr,"A transition table is formmated like this:                      \n");
  (void) fprintf(stderr,"                                                                \n");
  (void) fprintf(stderr,"!comments, if any, which are ignored                            \n");
  (void) fprintf(stderr,"3,4,ACGT                                                        \n");
  (void) fprintf(stderr,"AAA   4  24 4  23                                               \n");
  (void) fprintf(stderr,"AAC   0   4 21 13                                               \n");
  (void) fprintf(stderr,"...                                                             \n");
  (void) fprintf(stderr,"GGG   5   5 11 21                                               \n");
  (void) fprintf(stderr,"                                                                \n");
  (void) fprintf(stderr,"The definition line contains exactly these elements:            \n");
  (void) fprintf(stderr,"  R, the Markov order, here 3.                                  \n");
  (void) fprintf(stderr,"  a comma separator          .                                  \n");
  (void) fprintf(stderr,"  C, the number of allowed characters, here 4.                  \n");
  (void) fprintf(stderr,"  a comma separator          .                                  \n");
  (void) fprintf(stderr,"  The character set, here {A,C,G,T}.  Maximum length: %d        \n",MAX_ALPHABET);
  (void) fprintf(stderr,"                                                                \n");
  (void) fprintf(stderr,"C^R data lines, each containing:                                \n");
  (void) fprintf(stderr,"   If R is greater than zero, a sequence prefix (for instance,  \n");
  (void) fprintf(stderr,"     AAA)   The order must be as indicated.                     \n");
  (void) fprintf(stderr,"   The number of observed transitions from that prefix          \n");
  (void) fprintf(stderr,"     sequence to each of the subsequent letters.  These values  \n");
  (void) fprintf(stderr,"     are entered in the same order as they were listed on the   \n");
  (void) fprintf(stderr,"     definition line.  Here, A,C,G,T.                           \n");
  (void) fprintf(stderr,"                                                                \n");
  (void) fprintf(stderr,"All C^R data lines must be present, even if some of the sequence\n");
  (void) fprintf(stderr,"prefixes were never observed, in which case numeric values on   \n");
  (void) fprintf(stderr,"that line are all zero.                                         \n");
  (void) fprintf(stderr,"                                                                \n");
  (void) fprintf(stderr,"Definition and comment lines must be less than %d characters.   \n",MAX_HEADERLINE);
  exit(EXIT_SUCCESS);
}

void  emit_hinput(void){
  (void) fprintf(stderr,"Input is normally in Fasta format (-inproc 0 or default).  That is, it is     \n");
  (void) fprintf(stderr,"comprised of one or more sequence entries of:                                 \n");
  (void) fprintf(stderr,"  A header line which begins with \">\" and is followed by text               \n");
  (void) fprintf(stderr,"  One or more data lines containing letters from the specified alphabet.      \n");
  (void) fprintf(stderr,"                                                                              \n");
  (void) fprintf(stderr,"If the fasta file contains more than one sequence entry only the first        \n");
  (void) fprintf(stderr,"will be processed, unless \"-inproc 1\" is specified, in which case all       \n");
  (void) fprintf(stderr,"entries will be processed.  Nucleic acid and protein Sequence letters are     \n");
  (void) fprintf(stderr,"converted to uppercase, ASCII and custom alphabets retain their case.  For    \n");
  (void) fprintf(stderr,"the nucleic acid alphabet U is converted to T.  Letters which are not         \n");
  (void) fprintf(stderr,"part of the alphabet are ignored.  For instance, if the input sequence is     \n");
  (void) fprintf(stderr,"\"actgACGZACGgcgat\", the Markov order is three, and the alphabet is Nucleic, \n");
  (void) fprintf(stderr,"the uppercased region in the input would be ignored because of the Z.         \n");
  (void) fprintf(stderr,"                                                                              \n");
  (void) fprintf(stderr,"When \"-inproc 2\" is specified all input characters are part of the sequence.\n");
  exit(EXIT_SUCCESS);
}

/* create an empty transition table */
TABLE_NEXUS *create_transition_table(void){
TABLE_NEXUS *table;
TRANSITION_NEXUS *ptrans;
unsigned long long entries;
double dentries;
unsigned int *counts;
int i;
unsigned char alphabet[MAX_ALPHABET];

  table = malloc(sizeof(TABLE_NEXUS));
  if(!table)insane("make_random_seq: fatal error: could not create a transition table");
  table->order       = gbl_order;
  table->most_likely = 0;
  memset(table->lookup,NOT_IN_ALPHABET,ALPHABET_SPACE); 
  table->alphabet=malloc(sizeof(char)*MAX_ALPHABET);
  if(!table->alphabet)insane("make_random_seq: fatal error: could not create an alphabet entry in the transition table");

  switch(gbl_type){
    case TYPE_PROTEIN:
      table->anum=PROTEIN_ALPHABET_LEN;
      memcpy(table->alphabet,PROTEIN_ALPHABET,table->anum);
      gbl_ucase=1;
      break;
    case TYPE_NUCLEIC:
      table->anum=NUCLEIC_ALPHABET_LEN;
      memcpy(table->alphabet,NUCLEIC_ALPHABET,table->anum);
      table->lookup['U'] = 3;  /* Map U to T. */
      table->lookup['u'] = 3;  /* Map u to T. */
      gbl_ucase=1;
      break;
    case TYPE_ASCII:
      table->anum=ASCII_ALPHABET_LEN;
      for(i=0;i<table->anum;i++){ table->alphabet[i] = i; }
      gbl_ucase=0;  /* For ascii case matters*/
      break;
    case TYPE_CUSTOM:
      table->anum= gbl_anum;
      memcpy(table->alphabet,gbl_custom,table->anum);
      break;
    default:
      insane("make_random_seq: fatal programming error: invalid alphabet type, use -h for help");    
  }


  if(gbl_ucase){
    for(i=0;i<table->anum;i++){
      table->alphabet[i]=toupper(table->alphabet[i]);;
    }
  }

  /* this could result in the alphabet now having duplications, as in sS -> SS */

  memcpy(alphabet,table->alphabet,table->anum);
  sort_alphabet(alphabet,table->anum);   /* Force an error if duplicate characters are present, harmless otherwise */

  for(i=0;i<table->anum;i++){  /* Fill in the lookup table */
    table->lookup[table->alphabet[i]]=i;
    if(gbl_ucase){
      table->lookup[tolower(table->alphabet[i])]=i;
    }
  } /* Fill in the lookup table */
  
  
  dentries = pow((double) table->anum, (double) table->order);
  entries = dentries;
  if((double) entries != dentries)insane("make_random_seq: fatal error: accuracy problem calculating number of entries"); 
  table->transitions=calloc(entries,sizeof(TRANSITION_NEXUS));
  if(!table->transitions)insane("make_random_seq: fatal error: could not allocate space for tansition pointers"); 
  table->totals=calloc(entries,sizeof(unsigned int));
  if(!table->totals)insane("make_random_seq: fatal error: could not allocate space for tansition totals"); 
  table->entries = entries;
  /* allocate one huge block of unsigned int to hold counts */
  counts=calloc(entries * table->anum, sizeof(unsigned int));
  if(!counts)insane("make_random_seq: fatal error: could not allocate space for tansition counts"); 
  /* then counts pointer in each transitions block to the right place in the big block */
  ptrans = table->transitions;
  for( ;entries;entries--){
    ptrans->counts = counts;
    ptrans++;
    counts += table->anum;
  }
  
  return(table);
} 

/* The data header line is a bit tricky.  The first two fields are numeric, and the remainder of that
line may contain any character, including EOL.  Consequently it has to be processed a character at a time
since fgets() or something like that could truncate the alphabet.*/
void  read_data_header_line(void){
int count;
char *alphabet;
char *buffer, *pbuf;
char c;
int i;
  /* read lines until one is found which is not a comment */
  buffer   = malloc(sizeof(char) * MAX_HEADERLINE);
  if(!buffer)insane("make_random_seq: fatal error: allocation error for buffer");
  alphabet = malloc(sizeof(char) * (MAX_ALPHABET + 1));
  if(!alphabet)insane("make_random_seq: fatal error: allocation error for alphabet");
  while(1){
    c=fgetc(gbl_in);
    if(c==EOF)insane("make_random_seq: fatal error: data file is invalid");
    if(c != '!')break;
    while(1){  /* Eat the remainder of the line */
      c=fgetc(gbl_in);
      if(c=='\n')break;
      if(c==EOF)insane("make_random_seq: fatal error: data file is invalid");
    }
  }
  
  /* pull out the order value */
  pbuf = buffer;
  while(1){
    if(!isdigit((int) c))insane("make_random_seq: fatal error: order field contains non digit characters");
    *pbuf++ = c;
    c=fgetc(gbl_in);
    if(c==EOF)insane("make_random_seq: fatal error: data file is invalid");
    if(c == ',')break;
  }
  *pbuf='\0';
  count = sscanf(buffer,"%d",&gbl_order);
  if(count != 1    )insane("make_random_seq: fatal error: invalid order field");
  
  /* pull out the gbl_anum value */
  pbuf = buffer;
  while(1){
    c=fgetc(gbl_in);
    if(c==EOF)insane("make_random_seq: fatal error: data file is invalid");
    if(c == ',')break;
    if(!isdigit((int) c))insane("make_random_seq: fatal error: order field contains non digit characters");
    *pbuf++ = c;
  }
  *pbuf='\0';
  count = sscanf(buffer,"%d",&gbl_anum);
  if(count != 1     )insane("make_random_seq: fatal error: invalid alphabet size field");
  if(gbl_anum == 0  )insane("make_random_seq: fatal error: invalid alphabet size field");
  
  /* load the alphabet */
  for(i=0;i<gbl_anum;i++){
    c = fgetc(gbl_in);
    if(c==EOF)insane("make_random_seq: fatal error: truncated alphabet field");
    alphabet[i]=c;
  }
  gbl_custom=malloc(sizeof(char)*gbl_anum);  /* This is NOT a null terminated string */
  if(!gbl_custom             )insane("make_random_seq: fatal error: allocation error for gbl_custom");
  memcpy(gbl_custom,alphabet,gbl_anum);
  free(alphabet);
  free(buffer);
  
  /* Eat the EOL character before leaving */
  c = fgetc(gbl_in);
  if(c!='\n')insane("make_random_seq: fatal error: count mismatch on definition line, final character not EOL");
}

/* reads all of the transitions lines from the data file.  Verifies that the prefix string is that expected, stores
the transitions values, calculates totals */
void  read_table_lines(TABLE_NEXUS *table){
char prefix[MAX_ORDER]; /* should be large enough for any reasonable operation */
TRANSITION_NEXUS *ptrans;
unsigned int *pcounts;
unsigned long long which;
unsigned long long nextwhich;
unsigned long long lastwhich;
int count,i;
unsigned int divisor;

  for(which=0; which < table->entries; which++){
  
    if(table->order > 0){
      count = fscanf(gbl_in,"%s",&prefix[0]);
      if(count!=1){
        exit(EXIT_FAILURE);
      }
      /* verify that the prefix is correct, essentially we are just counting by base table->anum, using
         the characters in table->alphabet as the digits. So repeatedly dividing the number by the base
         spits out the digits one by one.  */
      divisor = table->anum;
      for(lastwhich=which, i=table->order-1; i>=0; i--){
        nextwhich = lastwhich/divisor;
        if(prefix[i] != table->alphabet[lastwhich - divisor*nextwhich]){
          (void) fprintf(stderr,"make_random_seq: fatal error: Invalid prefix string %s at line %lld\n",prefix,which);
          exit(EXIT_FAILURE);
        }
        lastwhich=nextwhich;
      }
    }
    
    ptrans =  &(table->transitions[which]);
    pcounts = ptrans->counts;
    for(i=0; i<table->anum; i++,pcounts++){
      count = fscanf(gbl_in,"%ud",pcounts);
      if(count!=1){
        (void) fprintf(stderr,"make_random_seq: fatal error: Invalid transition value at line %lld value %d\n",which,i);
        exit(EXIT_FAILURE);
      }
    }
  }
}

/* This is pretty simple.  It reads in the first line*/
TABLE_NEXUS *load_transition_table(void){
TABLE_NEXUS *table;
  read_data_header_line();  /* this sets gbl_custom and gbl_order */ 
  gbl_type = TYPE_CUSTOM;
  table = create_transition_table();  /* this creates the empty table */
  read_table_lines(table);            /* this fills it, reading data from the data file*/
  return table;
}

TABLE_NEXUS *read_into_transition_table(void){
TABLE_NEXUS *table=NULL;
unsigned char iprefix[MAX_ORDER];
unsigned long long which;
int  c,ok;
int  good;
int  order1,order,anum;
unsigned char value;
int j,k;
unsigned int rlen=0;

 
  
  table = create_transition_table();  /* this creates the empty table */
  order  = table->order;
  order1 = table->order + 1;
  anum   = table-> anum;
  
 
  /*read the first character */
  if(gbl_proc != PROC_ALL){
    c=fgetc(gbl_in);
    if(c==EOF || c!='>')insane("make_random_seq: fatal error: fasta input file is invalid");
  }

  /* enter the main loop */
  ok=1;
  while(ok){
    /*eat the remainder of the FASTA header */
    if(gbl_proc != PROC_ALL){
      while(ok){
        c=fgetc(gbl_in);
        if(c==EOF){
          if(gbl_proc == PROC_FASTA_FIRST)insane("make_random_seq: fatal error: fasta input file is invalid");
          ok=0;
        }
        if(c=='\n')break;
      }
    }

    /* read in characters and add into the transition table if iprefix is full.  If we hit
    an unknown character flush iprefix */
    good=0;
    while(ok){
      c=fgetc(gbl_in);
      if(c==EOF ){
        ok = 0;
        break;
      }
      if(gbl_proc != PROC_ALL && c=='>')break;
      if(gbl_ucase)c = toupper(c);  /* uppercase the character */
      value=table->lookup[c];
      if(value==NOT_IN_ALPHABET){
        if(c=='\n')continue;  /* EOL is ignored if it isn't in the alphabet */
        good=0;   /* not in alphabet, and not an EOL, so refill iprefix from next good one */
      }
      else {
        iprefix[good++] = value;
        rlen++;
        if(good==order1){
          for(which=j=0; j<order; j++){  which = anum * which + iprefix[j]; }
          good--;
          table->transitions[which].counts[iprefix[order]]++;
          
          /* shift the prefix letters one left */
          for(j=0,k=1; j<order;){  iprefix[j++] = iprefix[k++]; }
        }
      }
    }
    if(gbl_proc != PROC_FASTA_ALL)ok=0;  /* this line must remain just before the end of the outer while loop */
  }
  
  if(gbl_len == INTNOTSET)gbl_len = rlen;   /* If not specified, use value from file  */

  return table;
}

void  transition_table_totals(TABLE_NEXUS *table){
unsigned long long  which;
unsigned long long  sum_total;  
TRANSITION_NEXUS   *ptrans=NULL;
unsigned int       *pcounts=NULL;
unsigned int       *totals=NULL;
unsigned int        most_likely=0;  /* the number of transitions in the transition which is observed most */   
int i;
  /* Fill in the totals */
  
  sum_total=0;
  totals = table->totals;  
  for(which=0; which<table->entries; which++,ptrans++,totals++){
    ptrans =  &(table->transitions[which]);
    pcounts = ptrans->counts;
    for(i=0; i<table->anum; i++,pcounts++){
        *totals += *pcounts;
        if(*pcounts > most_likely)most_likely=*pcounts;
    };
    sum_total += *totals;
  }
  table->sum_total   = sum_total;
  table->most_likely = most_likely;  /* This is used to minimize padding in the table */
}

void  emit_transition_table(TABLE_NEXUS *table){
int i;
unsigned long long which;
char prefix[MAX_ORDER]; /* should be large enough for any reasonable operation */
unsigned long long nextwhich;
unsigned long long lastwhich;
unsigned int divisor;
TRANSITION_NEXUS *ptrans;
unsigned int *pcounts;
unsigned int most_likely;
int fw;
char ffield[16];

  divisor = table->anum;

  /* figure out what field width to use for numeric values.  It must be wide enough to hold most_likely */
  for(fw=0,most_likely=table->most_likely; most_likely; fw++){
    most_likely /= 10;
  }
  (void) sprintf(ffield," %c%dd",'%',fw);

  /* table->alphabet is not a null terminated string, so emit it one character at a time */

  (void) fprintf(gbl_outdata,"%d,%d,",table->order,table->anum);
  for(i=0;i<table->anum;i++){ (void) fputc(table->alphabet[i],gbl_outdata); }
  (void) fputc('\n',gbl_outdata);

  for(which=0; which<table->entries;which++){
    if(table->order > 0){
       /* construct the prefix from the position number. Eessentially we are just counting by base table->anum, using
          the characters in table->alphabet as the digits. So repeatedly dividing the number by the base
          spits out the digits one by one.  */
      prefix[table->order]='\0';  /* terminate the prefix string, then work left */
      for(lastwhich=which, i=table->order-1; i>=0; i--){
        nextwhich = lastwhich/divisor;
        prefix[i] = table->alphabet[lastwhich - divisor*nextwhich];
        lastwhich=nextwhich;
      }
      (void) fprintf(gbl_outdata,"%s",prefix);
    }
    

    /* emit the counts, try to keep it neat.  If they are wider than 6 digits the columns will
       be messy.
    */
    ptrans =  &(table->transitions[which]);
    pcounts = ptrans->counts;
    for(i=0; i<divisor; i++,pcounts++){
      (void) fprintf(gbl_outdata,ffield,*pcounts);
    }
    (void) fprintf(gbl_outdata,"\n");
  }
}

void  emit_random(TABLE_NEXUS *table){
float  frand;
double drand;
unsigned long long which;
unsigned long long sum_total;
unsigned int *totals;
int i,j,k;
int anum;
int order;
int count,lcount;
unsigned int ptotal;
int iprefix[MAX_ORDER]; /* should be big enough, here use integer index values rather than character values */
TRANSITION_NEXUS *ptrans=NULL;
unsigned int     *pcounts=NULL;

unsigned long long lastwhich;
unsigned long long nextwhich;
unsigned int divisor;
double hold_frac;

  order   = table->order;
  anum    = table->anum;
  divisor = anum;
  for(i=0; i<gbl_rep; i++){
    (void) fprintf(gbl_out,">random_sequence_%d\n",i);

    /* pick a starting value.  This is done by the none too efficent method of picking
    a random number between 0 and sum_totals, then scanning up the list adding totals()
    until this target is reached. This does have the advantage though that it weights each starting
    tuple according to its original fraction in the training set.  Note also that only a bin
    which has something in it can match. */

    drand = (double) rand() * (double) table->sum_total / (double) RAND_MAX;
    sum_total = 0;
    totals = table->totals;
    for(which=0; which < table->entries; which++,totals++){
      if( (drand >= (double) sum_total) && (drand <= (double)(sum_total  + *totals)))break;
      sum_total += *totals;
    }
    
    /* (pre)load the iprefix array, based on which */

    for(lastwhich=which, j=table->order-1; j>=0; j--){
      nextwhich = lastwhich/divisor;
      iprefix[j] = lastwhich - divisor*nextwhich;
      lastwhich=nextwhich;
    }

    for(lcount=count=1;count<=gbl_len;lcount++,count++){

      /* pick the next letter and append it to prefix, also emit it.  The method is similar to how we
      found which, except the total for these transitions is used and we march along the transition
      counts.  Again, only a transition with something in it can match. */

      ptrans =  &(table->transitions[which]);
      pcounts = ptrans->counts;

      hold_frac = (((float) rand())/ ((float) RAND_MAX));
      frand = ((float) table->totals[which]) *  hold_frac;
      for(ptotal=j=0; j < anum; j++, pcounts++ ){
        if((frand >= (float) ptotal) && (frand <= (float) (ptotal + *pcounts)))break;
        ptotal += *pcounts;
      }
            
      (void) fprintf(gbl_out,"%c",table->alphabet[j]);
      if(lcount==gbl_wrap){
        (void) fprintf(gbl_out,"\n");
        lcount=0;
      }
      
      /* store the letter one slot above the top in iprefix, then shift the iprefix array */
      iprefix[order]=j;
      for(j=0,k=1;j<order;){ iprefix[j++] = iprefix[k++]; }
      
      /* generate a new "which" from the iprefix */
      for(which=j=0; j<order; j++){  which = anum * which + iprefix[j]; }
      
    }
    if(lcount>1)(void) fprintf(gbl_out,"\n");
  }
}

int  main(int argc, char *argv[]){
TABLE_NEXUS *table=NULL;


  process_command_line_args(argc,argv);
  
  /* load or create the transition table.  Side effect of both, will set gbl_alphabet */
  if(gbl_inmode == IN_SEQ){ 
          table = read_into_transition_table();   /* will set gbl_len if it was not provided on command line */
  }
  else {  table=load_transition_table();   }
  transition_table_totals(table);

  if(gbl_out)emit_random(table);
  if(gbl_outdata)emit_transition_table(table);
  
  exit(EXIT_SUCCESS);
}
