package szte.csd;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.DefaultLemmatizer;
import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.BreakIteratorSentenceSplitter;
import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.SentenceSplitter;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.DefaultWordTokenizer;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;

/**
 * Tokenizer is a utility class for task-dependent tokenization and lemmatization.
 *
 * <p>Two strategies are implemented and {@link #tokenise(String)} chooses between them:
 * <ul>
 *   <li>when a {@link DocumentPreprocessor} is set, the text is cut into sentences with the
 *       configured {@link SentenceSplitter} and every sentence is tokenised by the
 *       preprocessor ({@link #tokeniseStanford(String)});</li>
 *   <li>when the preprocessor is {@code null}, a line-oriented, whitespace based tokenisation
 *       is used ({@link #tokeniseObes(String)}).</li>
 * </ul>
 *
 * <p>Lemmatisation is off by default and only affects the preprocessor-based strategy; it is
 * switched on with {@link #lemmatiseOn()}.
 */
public class Tokenizer {
  /** Matches a run of decimal digits; compiled once instead of once per token. */
  private static final Pattern DIGITS = Pattern.compile("[0-9]+");
  /** Placeholder that replaces every digit run in a token. */
  private static final String NUMBER_PLACEHOLDER = "NUM";

  protected WordTokenizer tokenizer = new DefaultWordTokenizer();
  protected SentenceSplitter splitter = new BreakIteratorSentenceSplitter();
  /** {@code null} until {@link #lemmatiseOn()} succeeds; {@code null} means "do not lemmatise". */
  protected DefaultLemmatizer lemmatizer = null;
  /** {@code null} selects the line-oriented strategy in {@link #tokenise(String)}. */
  protected DocumentPreprocessor docPreprocessor = null;

  /**
   * Creates a tokenizer whose preprocessor uses the tokenizer factory of a
   * {@link PennTreebankLanguagePack}. Lemmatisation is off.
   */
  public Tokenizer(){
    docPreprocessor = new DocumentPreprocessor(new PennTreebankLanguagePack().getTokenizerFactory());
  }

  /**
   * Creates a tokenizer around the given preprocessor. Lemmatisation is off.
   *
   * @param preproc preprocessor used for word tokenisation; passing {@code null} makes
   *                {@link #tokenise(String)} use the line-oriented strategy instead
   */
  public Tokenizer(DocumentPreprocessor preproc){
    docPreprocessor = preproc;
  }

  /**
   * Switches lemmatisation on by creating a {@link DefaultLemmatizer}.
   *
   * <p>This is best effort: if the lemmatizer cannot be created the failure is reported on
   * standard error and the {@code lemmatizer} field is left unchanged.
   */
  public void lemmatiseOn(){
    try {
      lemmatizer = new DefaultLemmatizer();
    } catch (Exception e) {
      System.err.println("Could not create DefaultLemmatizer, lemmatizer left unchanged: " + e);
      e.printStackTrace();
    }
  }

  /**
   * Line-oriented tokenisation: every non-trivial line becomes one {@link Sentence}.
   *
   * <p>A line that ends with {@code ':'} and does not start with a space is a section header.
   * It produces no sentence itself; its lower-cased, trimmed text is prepended as the first
   * token of every following sentence until the next header. Lines whose trimmed length is
   * below 2 are skipped. The remaining lines are split on single spaces; each token is
   * trimmed, lower-cased and has its digit runs replaced by {@code NUM}; empty tokens are
   * dropped.
   *
   * @param text the text to tokenise, lines separated by {@code \n} or {@code \r\n}
   * @return the sentences in input order; each one is built from its token list and the
   *         unmodified source line
   */
  protected List<Sentence> tokeniseObes(String text){
    List<Sentence> res = new ArrayList<Sentence>();
    String header = null;
    // "\r?\n" rather than "\n": with CRLF input a trailing '\r' would otherwise defeat the
    // endsWith(":") header test below.
    for (String line : text.split("\r?\n")) {
      if (line.endsWith(":") && !line.startsWith(" ")) {
        header = line.toLowerCase().trim();
        continue;
      }
      if (line.trim().length() < 2) {
        continue;
      }
      List<String> sentence = new ArrayList<String>();
      if (header != null) {
        sentence.add(header);
      }
      for (String rawToken : line.split(" ")) {
        String token = DIGITS.matcher(rawToken.trim().toLowerCase()).replaceAll(NUMBER_PLACEHOLDER);
        if (token.length() > 0) {
          sentence.add(token);
        }
      }
      res.add(new Sentence(sentence, line));
    }
    return res;
  }

  /**
   * Tokenises the text with the strategy selected by the presence of a preprocessor.
   *
   * @param text the text to tokenise; {@code null} yields an empty list
   * @return the sentences found in {@code text}, never {@code null}
   */
  public List<Sentence> tokenise(String text){
    if (text == null) {
      // Both strategies dereference the text; treat "no text" as "no sentences".
      return new ArrayList<Sentence>();
    }
    if (docPreprocessor == null) {
      return tokeniseObes(text);
    }
    return tokeniseStanford(text);
  }

  /**
   * Sentence-splits the text with {@code splitter} and tokenises every sentence with
   * {@code docPreprocessor}. Tokens are lower-cased and trimmed.
   *
   * <p>Only when lemmatisation is on, each token additionally has its digit runs replaced by
   * {@code NUM} and is reduced by a suffix heuristic or the lemmatizer (see inline comments).
   *
   * <p>Any exception stops processing: it is reported on standard error together with the
   * last token seen, and the sentences completed so far are returned.
   *
   * @param text the text to tokenise
   * @return the sentences completed before the end of the text or the first failure
   */
  protected List<Sentence> tokeniseStanford(String text){
    List<Sentence> res = new ArrayList<Sentence>();
    String token = null;
    try {
      List<List<String>> sentences = splitter.extractSentences(text, tokenizer);
      int[] offsets = splitter.findSentenceOffsets(text, sentences);
      for (int k = 0; k < sentences.size(); ++k) {
        // Guard the look-ahead: if the splitter supplies no end offset after the last
        // sentence, that sentence runs to the end of the text.
        int end = (k + 1 < offsets.length) ? offsets[k + 1] : text.length();
        String sentence = text.substring(offsets[k], end).trim();
        List<Word> words = docPreprocessor.getWordsFromString(sentence);
        List<String> sent = new ArrayList<String>(words.size());
        for (Word word : words) {
          token = word.word().toLowerCase().trim();
          if (lemmatizer != null) {
            token = DIGITS.matcher(token).replaceAll(NUMBER_PLACEHOLDER);
            // Tokens ending in "red" lose their last two characters ("ed").
            // NOTE(review): this branch is not chained with the next one, so the shortened
            // token still goes through the "ing"/lemmatizer step below - confirm intent.
            if (token.endsWith("red")) {
              token = token.substring(0, token.length() - 2);
            }
            // Tokens ending in "ing" lose that suffix; everything else is lemmatised.
            if (token.endsWith("ing")) {
              token = token.substring(0, token.length() - 3);
            } else {
              token = lemmatizer.lemmatize(token);
            }
          }
          sent.add(token);
        }
        res.add(new Sentence(sent, sentence));
      }
    } catch (Exception e) {
      // The label is kept for log compatibility, although the failure may also come from
      // sentence splitting or word tokenisation; the exception itself is now reported too.
      System.err.println("LEMMATIZER ERROR: " + token + " (" + e + ")");
    }
    return res;
  }

  /** Currently a no-op. */
  public void serialize(){
  }
}
