/*******************************************************************/
/*      File: lexicon.h                                            */
/*    Author: Helmut Schmid                                        */
/*   Purpose:                                                      */
/*   Created: Tue Nov  5 12:02:15 2002                             */
/*  Modified: Thu Jan 15 17:31:26 2009 (schmid)                    */
/* Copyright: Institut fuer maschinelle Sprachverarbeitung         */
/*            Universitaet Stuttgart                               */
/*******************************************************************/

#ifdef SGIext

#include <ext/hash_map>
using __gnu_cxx::hash_map;
using __gnu_cxx::hash;

#else

#include <hash_map>
using std::hash_map;
using std::hash;

#endif

#include "string-set.h"
#include "grammar.h"
#include "wordclass.h"

// String table used by ScoreTable (below) to map words to the pointers
// that serve as hash keys.
// NOTE(review): this object is declared static in a header, so every
// translation unit that includes this file gets its own separate
// instance -- confirm that this is intended.
static StringSet RefString;
// Declared here, defined in another source file.
extern const char *DefaultHead;


// Forward declaration; the class is defined further down in this file.
class Lexicon;
// Declared here, defined in another source file.
extern bool WithLemmas;


/*****************  class Tag  *************************************/

class Tag {

public:
  SymNum symbol;
  float  prob;
  double freq;
  const char *lemma;

  Tag() { lemma = NULL; };
};


/*****************  class Tags  ************************************/

class Tags {

 private:
  unsigned int l;
  Tag *tag;
  float &getprob( size_t i ) { return tag[i].prob; };
  double &getfreq( size_t i ) { return tag[i].freq; };

 public:
  Tags() { l=0; tag = NULL; };
  ~Tags() { delete[] tag; };
  void init( vector<SymNum> &symbols, vector<float> &probs, vector<const char*> &lem );
  void init( vector<SymNum> &symbols, vector<float> &probs );
  void init( vector<SymNum> &symbols );

  size_t size() { return (size_t)l; };
  SymNum operator[]( size_t i ) const { return tag[i].symbol; };
  float prob( size_t i ) const { return tag[i].prob; };
  double freq( size_t i ) const { return tag[i].freq; };
  const char *lemma( size_t i ) const { return tag[i].lemma; };

  void incr_freq( size_t tn, double f ) { tag[tn].freq += f; };

  friend class Lexicon;
};


/*****************  class ScoreTable  ******************************/

class ScoreTable {

  // The dafult scores are also stored in the table
  // with a negative rule number
  // rule number = -(symbolID + 1)

 public:
  class Tuple {
  public:
    const char *word;
    RuleNumber source_rule;

    Tuple( const char *w, RuleNumber sr ) 
      : word(w), source_rule(sr) {};
  };

  struct hashf {
    size_t operator()(const Tuple &p) const { 
      return (size_t)p.word ^ p.source_rule;
    }
  };
  
  struct eqf {
    bool operator()(const Tuple &t1, const Tuple &t2) const {
      return (t1.word == t2.word && t1.source_rule == t2.source_rule);
    }
  };

  typedef hash_map<Tuple, float, hashf, eqf> ST;
  ST Score;

 public:
  typedef ST::iterator iterator;

  iterator begin() { return Score.begin(); }
  iterator end()   { return Score.end(); }
  size_t   size()  { return Score.size(); }

  void add( char *w, RuleNumber sr, float score ) {
    Score[Tuple(RefString(w), sr)] = score;
  }

  float operator()( const char *h, RuleNumber rn ) {
    if ((h = RefString.lookup(h)) == NULL)
      return 1.0;
    iterator it = Score.find(Tuple(h, rn));
    if (it != Score.end())
      return it->second;
    else
      return -1.0;
  }
};


/*****************  class Lexicon  *********************************/

// The lexicon: a hash table from word strings to their Tags, plus
// additional tag tables (OCTags, WCTags), an optional word-class
// automaton and a table of association scores.
class Lexicon {

 private:

  // Key equality for the hash table: compares string contents.
  struct eqstr {
      bool operator()(const char* s1, const char* s2) const {
          return strcmp(s1, s2) == 0;
      }
  };

  typedef hash_map<const char*, Tags, hash<const char*>, eqstr> Lex;

  typedef Lex::iterator iterator;

  Lex lex;                 // word -> tags
  Grammar &grammar;        // grammar used for symbol names and rules
  Tags OCTags;
  Automaton *WCAutomaton;  // may be NULL (see the two helpers below)
  Tags *WCTags;
  ScoreTable Score;        // association and default scores

  // Helpers defined elsewhere.
  void add_entry( char *word, vector<SymNum> &symbols, vector<float> &prob,
                  vector<const char*> &lemmas);
  void read_oc( char *ocf, Tags &tags );
  void read_wc( char *ocf );
  void compute_priors( vector<double> &prior_prob );
  void smooth_with_wordclass( vector<double> &prior_prob );
  void smooth_lexical_frequencies( vector<double> &prior_prob );
  void estimate_probs();

  // Number of word classes; 0 when no automaton is present.
  int number_of_wordclasses()
    { return (WCAutomaton == NULL)? 0: WCAutomaton->number_of_classes; };

  // Word class of s as reported by the automaton; -1 when no automaton
  // is present.
  int wordclass( const char *s ) {
    if (WCAutomaton == NULL)
      return -1;
    return WCAutomaton->wordclass(s);
  };

 public:
  Lexicon( Grammar&, FILE*, char*, char* );
  iterator begin() { return lex.begin(); };
  iterator end()   { return lex.end();   };
  size_t   size()  { return lex.size();  };

  // Writes one entry to file as a single line: the word, a tab, then
  // each tag's symbol name, followed by " (prob)" when WithProbs is set.
  void print_entry( iterator it, FILE *file=stdout ) {
    fprintf(file, "%s\t", it->first);
    Tags &tags=it->second;
    for( size_t i=0; i<tags.size(); i++ ) {
      fprintf(file, " %s", grammar.symbol_name(tags[i]));
      if (WithProbs)
        fprintf(file, " (%g)", tags.prob(i));
    }
    fputc('\n', file);
  };

  // Writes every entry to file.
  // The stream is forwarded to print_entry(); previously it was dropped,
  // so the output always went to stdout regardless of the argument.
  void print( FILE *file=stdout ) {
    for( iterator it=lex.begin(); it!=lex.end(); ++it )
      print_entry( it, file );
  };

  Tags *lookup( const char *word, bool sstart );

  void store( FILE *file );
  void store_oc( FILE *file );

  // Returns the score for word w and rule rn.
  // The rule-specific score is tried first, then the default score
  // stored under -(symbol + 1), where symbol is symbol(0) of rule rn.
  // A negative lookup result means "no entry"; if both lookups
  // fail, 1.0 is returned.
  float score( const char *w, RuleNumber rn ) {
    // look up the association score
    float result = Score( w, rn );
    if (result >= 0.0)
      return result;

    // look up the default score
    rn = -(grammar.rules[rn].symbol(0) + 1);
    result = Score( w, rn );
    if (result >= 0.0)
      return result;

    return 1.0;
  };

  void read_scores( char *filename );
};
