/*******************************************************************/
/*      File: bitpar.C                                             */
/*    Author: Helmut Schmid                                        */
/*   Purpose: Command-line driver of the BitPar parser             */
/*   Created: Tue Oct 29 10:59:29 2002                             */
/*  Modified: Thu Mar 15 09:58:38 2007 (schmid)                    */
/* Copyright: Institut fuer maschinelle Sprachverarbeitung         */
/*            Universitaet Stuttgart                               */
/*******************************************************************/

#include <time.h>

#include <iostream>
using std::cerr;

#include "parser.h"
#include "vitparser.h"
#include "retagging.h"

/* Command-line settings. All of them are filled in by get_flags() and */
/* read by main(); the option that sets each one is given in the       */
/* comment.                                                            */

/* -v, -mf, -mike: compute the Viterbi (best) parse */
static bool Viterbi=false;
/* -o and every option that prints parses (-v -vp -ip -f -mf -b -mike): */
/* main() prints the parse result of each sentence when this is set     */
static bool PrintOpt=false;
/* -y: main() calls print_YAP_parse() for each sentence */
static bool PrintYAP=false;
/* -vp, -mike: passed on to the parser as ViterbiProbs / PrintProbs */
static bool ViterbiProbs=false;
/* -ip: passed on to the parser as InsideProbs */
static bool InsideProbs=false;
/* -f: passed on to the parser as EstimatedFreqs */
static bool EstimatedFreqs=false;
/* -t: main() calls print_trace_probs() for each sentence */
static bool PrintTraces=false;
/* -r: main() runs the retagging mode (init_tagging + yyparse) */
static bool Tagging=false;
/* -em: after parsing, main() stores grammar and lexicon in */
/* <Tfileprefix>.gram and <Tfileprefix>.lex                 */
static bool Training=false;
/* -rn: passed on to the parser as PrintRuleNumbers */
static bool PrintRuleNumbers=false;
/* -as: passed on to the parser only if a score file (-a) was given */
static bool PrintLexScores=false;
/* -mf: passed on to the Viterbi parser as MaxFScore */
static bool MaxFScore=false;
/* -mike (not listed in usage()): main() prints a startup message and */
/* the cpu time of each sentence to the output file                   */
static bool MikeOption=false;
/* -prune: value in (0,1]; 0.0 means that no threshold was given.     */
/* get_flags() replaces 0.0 with 0.00000001 if a score file is used.  */
static double PruningThreshold=0.0;
/* -b: number of parses to print (1..1000); 0 means option not given */
static int  NBest=0;

/* -s: start symbol; NULL if not given */
static char *StartSymbol=NULL;
/* -em: prefix of the files written in training mode */
static char *Tfileprefix=NULL;
/* -u: name of the file with the possible tags of unknown words */
static char *OCfilename=NULL;
/* -w: name of the file with the word classification automaton */
static char *WCfilename=NULL;
/* -a: name of the file with head-governor association scores */
static char *Scorefilename=NULL;
/* -ts: first and second character of the 2-letter option argument */
static char TraceStartSymbol='*';
static char TraceEndSymbol='*';

/* Used by main() in retagging mode: yyin is set to the input file and */
/* yyparse() is called. Presumably provided by the generated retagging */
/* parser - TODO confirm.                                              */
extern FILE  *yyin;
extern int yyparse (void);

/*FA****************************************************************/
/*                                                                 */
/*  usage                                                          */
/*                                                                 */
/*FE****************************************************************/

/* Prints the command-line synopsis and the list of options to stderr */
/* and terminates the program with exit status 1. It never returns.   */
/* The notes about implied options at the end mirror what get_flags() */
/* actually sets.                                                     */

void usage()

{
  cerr << "\nUsage:  bitpar grammar-file lexicon-file [infile [outfile]]\n";
  cerr << "OPTIONS\n";
  cerr << "-o   print parse forest\n";
  cerr << "-s s Use s as start symbol rather than the first symbol in the grammar file  \n";
  cerr << "-p   grammar is a PCFG\n";
  cerr << "-v   print Viterbi parse\n";
  cerr << "-u   file containing possible tags of unknown words\n";
  cerr << "-w   file containing an automaton for word classification\n";
  cerr << "-S w set wordclass smoothing weight to w (default is 1)\n";
  cerr << "-tg  grammar contains trace symbols of the form *XY*\n";
  cerr << "-ts xy change the trace start symbol to x and the trace end symbol to y; traces have the form x...y\n";
  cerr << "-H   heads of grammar rules are marked with a preceding ^\n";
  cerr << "-l   lexicon entries with lemmata\n";
  cerr << "-a f read head-governor association scores from file f\n";
  cerr << "-b n print the n best parse trees\n";
  cerr << "-vp  print parse forests with Viterbi probabilities\n";
  cerr << "-ip  print parse forests with Inside probabilities\n";
  cerr << "-f   print parse forests with estimated frequencies\n";
  cerr << "-em f EM training. f is the prefix of the files in which the output is stored.\n";
  cerr << "-prune t pruning of the parse forest with threshold t\n";
  cerr << "-t   print trace probabilities\n";
  cerr << "-r   retag parse trees\n";
  cerr << "-rn  print parse forests with rule numbers\n";
  cerr << "-mf  print the maximum estimated f-score parse\n";
  cerr << "-as  print the lexical association scores used for each sentence\n";
  cerr << "-y   print parse forest in YAP format\n";
  cerr << "-q   suppress status messages\n";
  cerr << "-i   verbose mode\n";
  cerr << "-h   this message\n";
  cerr << "\nOptions -v -vp -ip -f -b -em -prune -r -mf and -t imply option -p.\n";
  cerr << "Options -t and -ts also imply option -tg.\n";
  exit(1);
}


/*FA****************************************************************/
/*                                                                 */
/*  option_takes_argument                                          */
/*                                                                 */
/*FE****************************************************************/

/* Returns true if opt is one of the options which are followed by */
/* a value on the command line.                                    */

static bool option_takes_argument( const char *opt )

{
  static const char *const options[] = {
    "-s", "-a", "-u", "-w", "-S", "-ts", "-b", "-em", "-prune"
  };

  for( size_t i=0; i<sizeof(options)/sizeof(options[0]); i++ )
    if (strcmp(opt, options[i]) == 0)
      return true;
  return false;
}


/*FA****************************************************************/
/*                                                                 */
/*  parse_double_arg                                               */
/*                                                                 */
/*FE****************************************************************/

/* Converts the complete string s to a double and stores it in     */
/* value. Returns false if s is empty or if it is followed by      */
/* characters which are not part of the number.                    */

static bool parse_double_arg( const char *s, double &value )

{
  char *end;
  value = strtod(s, &end);
  return end != s && *end == '\0';
}


/*FA****************************************************************/
/*                                                                 */
/*  parse_long_arg                                                 */
/*                                                                 */
/*FE****************************************************************/

/* Converts the complete string s to a decimal integer and stores  */
/* it in value. Returns false if s is empty or if it is followed   */
/* by characters which are not part of the number.                 */

static bool parse_long_arg( const char *s, long &value )

{
  char *end;
  value = strtol(s, &end, 10);
  return end != s && *end == '\0';
}


/*FA****************************************************************/
/*                                                                 */
/*  get_flags                                                      */
/*                                                                 */
/*FE****************************************************************/

/* Scans argv[1..argc-1] for options, stores their values in the   */
/* option variables and removes them from argv, so that only the   */
/* program name and the positional (file) arguments remain. argc   */
/* is updated to the new number of arguments.                      */
/*                                                                 */
/* Arguments which are not recognised as options are left in argv  */
/* unchanged. The program is terminated with exit status 1 if an   */
/* option value is missing or invalid.                             */

static void get_flags( int &argc, char **argv )

{
  for( int i=1; i<argc; i++ ) {
    const char *opt = argv[i];

    if (strcmp(opt,"-q") == 0) {
      Quiet = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-p") == 0) {
      WithProbs = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-vp") == 0) {
      WithProbs = true;
      ViterbiProbs = true;
      PrintOpt = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-ip") == 0) {
      WithProbs = true;
      InsideProbs = true;
      PrintOpt = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-f") == 0) {
      WithProbs = true;
      EstimatedFreqs = true;
      PrintOpt = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-tg") == 0) {
      WithTraces = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-r") == 0) {
      WithProbs = true;
      Tagging = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-rn") == 0) {
      PrintRuleNumbers = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-mf") == 0) {
      MaxFScore = true;
      Viterbi = true;
      WithProbs = true;
      PrintOpt = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-as") == 0) {
      PrintLexScores = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-t") == 0) {
      WithProbs = true;
      PrintTraces = true;
      WithTraces = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-v") == 0) {
      Viterbi = true;
      WithProbs = true;
      PrintOpt = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-o") == 0) {
      PrintOpt = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-y") == 0) {
      PrintYAP = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-H") == 0) {
      WithHeads = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-l") == 0) {
      WithLemmas = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-h") == 0) {
      usage();
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-i") == 0) {
      Verbose = true;
      argv[i] = NULL;
    }
    else if (strcmp(opt,"-mike") == 0) {
      MikeOption = true;
      Viterbi = true;
      ViterbiProbs = true;
      WithProbs = true;
      PrintOpt = true;
      argv[i] = NULL;
    }
    else if (option_takes_argument(opt)) {
      // Without this check, a trailing option such as "-s" would
      // silently be treated as a file name.
      if (i >= argc-1) {
        fprintf(stderr,"Error: option %s requires an argument!\n", opt);
        exit(1);
      }
      char *value = argv[i+1];

      if (strcmp(opt,"-s") == 0)
        StartSymbol = value;
      else if (strcmp(opt,"-a") == 0) {
        Scorefilename = value;
        WithHeads = true;
      }
      else if (strcmp(opt,"-u") == 0)
        OCfilename = value;
      else if (strcmp(opt,"-w") == 0)
        WCfilename = value;
      else if (strcmp(opt,"-S") == 0) {
        double weight;
        if (!parse_double_arg(value, weight)) {
          fprintf(stderr, "Error: invalid smoothing weight \"%s\"!\n", value);
          exit(1);
        }
        SmoothingWeight = weight;
      }
      else if (strcmp(opt,"-ts") == 0) {
        if (strlen(value) != 2) {
          fprintf(stderr,"Error: option -ts requires a 2-letter argument!\n");
          exit(1);
        }
        TraceStartSymbol = value[0];
        TraceEndSymbol = value[1];
        WithTraces = true;
      }
      else if (strcmp(opt,"-b") == 0) {
        long n;
        if (!parse_long_arg(value, n) || n < 1 || n > 1000) {
          fprintf(stderr,"Error: argument of option -b is out of range!\n");
          exit(1);
        }
        NBest = (int)n;
        WithProbs = true;
        PrintOpt = true;
      }
      else if (strcmp(opt,"-em") == 0) {
        WithProbs = true;
        Training = true;
        Tfileprefix = value;
      }
      else if (strcmp(opt,"-prune") == 0) {
        double threshold;
        WithProbs = true;
        // The negated form also rejects NaN.
        if (!parse_double_arg(value, threshold) ||
            !(threshold > 0.0 && threshold <= 1.0)) {
          fprintf(stderr, "Error: invalid pruning threshold \"%s\"!\n",
                  value);
          exit(1);
        }
        PruningThreshold = threshold;
      }

      // remove the option and its value
      argv[i] = NULL;
      argv[++i] = NULL;
    }
  }

  // remove flags from the argument list
  int k;
  for( int i=k=1; i<argc; i++)
    if (argv[i] != NULL)
      argv[k++] = argv[i];
  argc = k;

  // Use a small default pruning threshold whenever association scores
  // are read (-a) and no threshold was given.
  if (Scorefilename && PruningThreshold == 0.0)
    PruningThreshold = 0.00000001;
}



/*FA****************************************************************/
/*                                                                 */
/*  open_training_file                                             */
/*                                                                 */
/*FE****************************************************************/

/* Opens the file "<prefix><suffix>" for writing. Prints an error  */
/* message and returns NULL if the name does not fit into the      */
/* name buffer or if the file cannot be opened.                    */

static FILE *open_training_file( const char *prefix, const char *suffix )

{
  char buffer[1000];
  int n = snprintf(buffer, sizeof(buffer), "%s%s", prefix, suffix);
  if (n < 0 || (size_t)n >= sizeof(buffer)) {
    fprintf(stderr, "Error: file name \"%s%s\" is too long!\n",
            prefix, suffix);
    return NULL;
  }

  FILE *file = fopen(buffer, "wt");
  if (file == NULL)
    fprintf(stderr, "Error: unable to open file \"%s\"!\n", buffer);
  return file;
}


/*******************************************************************/
/*                                                                 */
/*  main                                                           */
/*                                                                 */
/*******************************************************************/

/* Usage: bitpar [options] grammar-file lexicon-file [infile [outfile]] */
/*                                                                      */
/* After the options have been removed by get_flags(), the grammar and  */
/* lexicon files are opened, and the input/output files default to      */
/* stdin/stdout. One of three modes is then run:                        */
/*  - retagging (-r): init_tagging() followed by yyparse()              */
/*  - Viterbi parsing with VitParser, if a Viterbi parse was requested  */
/*    and neither a score file (-a) nor n-best output (-b) is used      */
/*  - general parsing with Parser otherwise, optionally followed by     */
/*    writing the trained grammar and lexicon (-em)                     */
/*                                                                      */
/* Returns 0 on success and 1 if an error was reported.                 */

int main( int argc, char *argv[] )

{
  int status = 0;

  get_flags( argc, argv );

  if (argc < 3)
    usage();

  FILE *gfile;
  if ((gfile = fopen(argv[1], "rt")) == NULL) {
    fprintf(stderr, "\nError: unable to open file \"%s\"\n\n", argv[1]);
    exit(1);
  }

  FILE *lfile;
  if ((lfile = fopen(argv[2], "rt")) == NULL) {
    fprintf(stderr, "\nError: unable to open file \"%s\"\n\n", argv[2]);
    exit(1);
  }

  FILE *infile;
  if (argc < 4)
    infile = stdin;
  else if ((infile = fopen(argv[3], "rt")) == NULL) {
    fprintf(stderr, "\nError: unable to open file \"%s\"\n\n", argv[3]);
    exit(1);
  }

  FILE *outfile;
  if (argc < 5)
    outfile = stdout;
  else if ((outfile = fopen(argv[4], "wt")) == NULL) {
    fprintf(stderr, "\nError: unable to open file \"%s\"\n\n", argv[4]);
    exit(1);
  }

  try {
    // Every mode restarts the clock once its setup is complete, so the
    // reported time excludes loading the grammar and lexicon.
    clock_t start = clock();

    if (Tagging) {
      Parser parser(gfile, lfile, StartSymbol, OCfilename,
                    WCfilename, TraceStartSymbol, TraceEndSymbol);
      fclose(gfile);
      fclose(lfile);
      yyin = infile;
      init_tagging(parser.grammar, parser.lexicon);
      start = clock();
      yyparse();
    }
    else if (Viterbi && !Scorefilename && !NBest) {
      VitParser parser(gfile, lfile, StartSymbol, OCfilename,
                       WCfilename, TraceStartSymbol, TraceEndSymbol);
      parser.PrintProbs = ViterbiProbs;
      parser.MaxFScore = MaxFScore;
      fclose(gfile);
      fclose(lfile);
      if (MikeOption) {
        fputs("Startup finished. Ready for processing.\n", outfile);
        fflush(outfile);
      }
      parser.verbose = Verbose;
      start = clock();
      for(;;) {
        // with -mike the cpu time is measured for each sentence
        if (MikeOption)
          start = clock();
        parser.next_parse(infile);
        if (parser.finished)
          break;
        parser.print_parse(outfile);
        if (MikeOption) {
          fprintf( outfile, "cpu-time=%.3fs\n",
                   (double) (clock() - start) / CLOCKS_PER_SEC);
          fflush(outfile);
        }
      }
    }
    else {
      Parser parser(gfile, lfile, StartSymbol, OCfilename,
                    WCfilename, TraceStartSymbol, TraceEndSymbol);
      parser.verbose = Verbose;
      parser.Viterbi = Viterbi;
      parser.ViterbiProbs = ViterbiProbs;
      parser.PruningThreshold = (Prob)PruningThreshold;
      parser.InsideProbs = InsideProbs;
      parser.NBest = NBest;
      parser.EstimatedFreqs = EstimatedFreqs;
      parser.Training = Training;
      parser.PrintRuleNumbers = PrintRuleNumbers;
      fclose(gfile);
      fclose(lfile);
      if (Scorefilename) {
        parser.Lexicalized = true;
        parser.lexicon.read_scores(Scorefilename);
        parser.PrintLexScores = PrintLexScores;
      }

      start = clock();
      for(;;) {
        parser.next_parse(infile);
        if (parser.finished)
          break;
        if (PrintTraces)
          parser.print_trace_probs(outfile);
        if (PrintOpt) {
          if (NBest)
            parser.print_nbest_parses( outfile );
          else if (Viterbi)
            parser.print_best_parse(outfile);
          else
            parser.print_parse(outfile);
        }
        if (PrintYAP)
          parser.print_YAP_parse(outfile);
      }

      if (Training) {
        // A file which cannot be opened is skipped (and reported via
        // the exit status) instead of being passed on as a NULL pointer.
        FILE *file;
        if ((file = open_training_file(Tfileprefix, ".gram")) == NULL)
          status = 1;
        else {
          parser.grammar.store(file);
          fclose(file);
        }

        if ((file = open_training_file(Tfileprefix, ".lex")) == NULL)
          status = 1;
        else {
          parser.lexicon.store(file);
          fclose(file);
        }
      }
    }

    if (!Quiet)
      fprintf( stderr, "\nraw cpu time %.3f\n",
               (double) (clock() - start) / CLOCKS_PER_SEC);
  }
  catch(const char* p) {
    cerr << "\nError: " << p << "\n\n";
    status = 1;
  }

  if (!Quiet)
    cerr << "finished\n";

  return status;
}
