package szte.csd;

import java.io.*;
import java.util.HashMap;
import java.util.zip.*;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.*;

/**
 * 
 * SyntaxParser is a utility class for parsing sentences; it wraps the Stanford parser (http://nlp.stanford.edu/software/lex-parser.shtml).
 * It uses a serializable cache of parsed sentences, so each dataset needs to be parsed only once.
 *
 */
public class SyntaxParser {
  /** Stanford parser instance, loaded from "englishPCFG.ser.gz" in the constructor. */
  protected LexicalizedParser lexicalizedParser;
  /** Factory that turns a parse tree into a GrammaticalStructure. */
  protected GrammaticalStructureFactory gsf;
  /** Initialised to the empty string; not otherwise used inside this class. */
  protected String prev_sentence;
  /** Cache mapping the raw sentence text to its parsed structure. */
  protected HashMap<String,GrammaticalStructure> parsedSentences;
  /** Path of the gzipped, serialized cache; overwritten per task in the constructor. */
  protected String PARSEDSENTENCES_FILE = "parsedSentences.ser.gz";
  
  /**
   * Creates the parser and, when a cache file for the given task exists, loads
   * the previously parsed sentences from it.
   *
   * Loading the cache is best-effort: any failure is reported on stderr and the
   * instance simply starts with an empty cache.
   *
   * @param task prefix of the cache file name (task + "_parsedSentences.ser.gz")
   */
  public SyntaxParser(String task){
    lexicalizedParser = new LexicalizedParser("englishPCFG.ser.gz");
    lexicalizedParser.setOptionFlags(new String[]{"-outputFormat", "penn,typedDependenciesCollapsed", "-retainTmpSubcategories"});
    gsf = new PennTreebankLanguagePack().grammaticalStructureFactory();
    prev_sentence = "";
    parsedSentences = new HashMap<String,GrammaticalStructure>();
    PARSEDSENTENCES_FILE = task+"_parsedSentences.ser.gz";
    if(new File(PARSEDSENTENCES_FILE).exists())
    {
      FileInputStream fis = null;
      GZIPInputStream gs = null;
      ObjectInputStream ois = null;
      try {
        fis = new FileInputStream(PARSEDSENTENCES_FILE);
        gs = new GZIPInputStream(fis);
        ois = new ObjectInputStream(gs);
        @SuppressWarnings("unchecked")
        HashMap<String,GrammaticalStructure> cached = (HashMap<String,GrammaticalStructure>)ois.readObject();
        // Only replace the empty cache when something usable was read, so a
        // null or broken file can never leave the field in an unusable state.
        if(cached != null)
        {
          parsedSentences = cached;
          System.out.println("Read "+parsedSentences.size()+" processed sentences.");
        }
      } catch (Exception e) {
        // Deliberately broad: a corrupt or incompatible cache must not prevent parsing.
        e.printStackTrace();
      } finally {
        // Close outermost first; the inner streams are closed again in case a
        // wrapping constructor failed before taking ownership of them.
        closeQuietly(ois);
        closeQuietly(gs);
        closeQuietly(fis);
      }
    }
  }
  
  /**
   * @return a new DocumentPreprocessor built on the tokenizer factory of the
   *         parser's treebank language pack
   */
  public DocumentPreprocessor getDocProcessor(){
    return new DocumentPreprocessor(lexicalizedParser.getOp().tlpParams.treebankLanguagePack().getTokenizerFactory());
  }
  
  /**
   * Returns the grammatical structure of a sentence, parsing it only when it is
   * not already in the cache.
   *
   * @param sent the sentence text, which is also the cache key
   * @return the grammatical structure, or null when parsing threw an exception
   *         (the stack trace and the sentence are printed to stderr)
   */
  public GrammaticalStructure parseSentence(String sent){
    try{
      GrammaticalStructure cached = parsedSentences.get(sent);
      if(cached != null)
        return cached;
      lexicalizedParser.parse(sent);
      GrammaticalStructure gstruct = gsf.newGrammaticalStructure(lexicalizedParser.getBestParse());
      parsedSentences.put(sent, gstruct);
      // Progress report every 1000 cached sentences.
      if(parsedSentences.size()%1000==0)
        System.out.println(parsedSentences.size()+" sentences parsed so far!");
      return gstruct;
    }catch(Exception e){
      e.printStackTrace();
      System.err.println(sent);
      return null;
    }      
  }
  
  /**
   * Writes the cache of parsed sentences to PARSEDSENTENCES_FILE as a gzipped
   * Java-serialized HashMap. I/O errors are printed to stderr and not rethrown.
   * (The method name keeps its historical spelling for existing callers.)
   */
  public void serailizeParsedSentences() {
    FileOutputStream fos = null;
    GZIPOutputStream gz = null;
    try {
      fos = new FileOutputStream(PARSEDSENTENCES_FILE);
      gz = new GZIPOutputStream(fos);
      ObjectOutputStream oos = new ObjectOutputStream(gz);

      oos.writeObject(parsedSentences);
      oos.flush();
      // Closing the outermost stream finishes the gzip trailer and closes the
      // file; done inside the try so that a failure here is reported.
      oos.close();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Releases the handles when something above failed; harmless otherwise.
      closeQuietly(gz);
      closeQuietly(fos);
    }
  }

  /** Closes the given stream if it is non-null, ignoring any IOException from close(). */
  private static void closeQuietly(Closeable c) {
    if(c == null)
      return;
    try {
      c.close();
    } catch (IOException ignored) {
      // Nothing useful can be done about a failure while cleaning up.
    }
  }

  /** Small smoke test: parses one fixed sentence with the "q" task cache and prints the result. */
  public static void main(String[] a){
    System.out.println(new SyntaxParser("q").parseSentence("Born in Bo'ness, McDonald started his career with Townside before joining Woolwich Arsenal in April 1909, but had to wait until October 30 1909 to make his debut, against Manchester United."));
  }
}
