package szte.io;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.BreakIteratorSentenceSplitter;
import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.SentenceSplitter;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.DefaultWordTokenizer;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer;

/**
 * CorpusStat computes basic statistics (corpus size, number of distinct labels,
 * average number of words and labels per document, etc.) for multi-label corpora
 * that are accessed through the DocumentSet interface.
 */
public class CorpusStat {

  /**
   * Counts the tokens of all documents in the given set.
   * <p>
   * The text of each document is split into sentences with a
   * {@link BreakIteratorSentenceSplitter} that tokenizes with a
   * {@link DefaultWordTokenizer}; the sizes of the resulting token lists are summed.
   *
   * @param docset the documents to count tokens in
   * @return the total number of tokens over all documents
   */
  public static int getTotalTokenNum(DocumentSet docset) {
    WordTokenizer tokenizer = new DefaultWordTokenizer();
    SentenceSplitter splitter = new BreakIteratorSentenceSplitter();
    int tokenCount = 0;
    Iterator<Document> itr = docset.iterator();
    while (itr.hasNext()) {
      List<List<String>> sentences = splitter.extractSentences(itr.next().getText(), tokenizer);
      for (List<String> sentence : sentences) {
        tokenCount += sentence.size();
      }
    }
    return tokenCount;
  }

  /**
   * Counts the label assignments of the given set, i.e. the sum of the number
   * of labels attached to each document.
   *
   * @param docset the documents whose label assignments are counted
   * @return the total number of (document, label) assignments
   */
  public static int getTotalAssignmentNum(DocumentSet docset) {
    int assignmentCount = 0;
    Iterator<Document> itr = docset.iterator();
    while (itr.hasNext()) {
      assignmentCount += itr.next().getLabels().size();
    }
    return assignmentCount;
  }

  /**
   * Collects the distinct labels used by the documents of the given set.
   * <p>
   * This method only computes the set; it does not print anything.
   *
   * @param docset the documents whose labels are collected
   * @return a newly created, modifiable set holding every label that occurs in the set
   */
  public static Set<String> getLabelSet(DocumentSet docset) {
    Set<String> labels = new HashSet<String>();
    Iterator<Document> itr = docset.iterator();
    while (itr.hasNext()) {
      labels.addAll(itr.next().getLabels());
    }
    return labels;
  }

  /**
   * Prints the statistics of a single document set to standard output:
   * its size, the average number of tokens per document, the number of
   * distinct labels and the average number of labels per document.
   * <p>
   * For an empty set the averages are 0.0/0.0 and are therefore printed as NaN.
   *
   * @param docset the documents to report on
   */
  public static void printStatistics(DocumentSet docset) {
    System.out.println("Size: " + docset.size());
    // Averages are computed in floating point.
    double docCount = docset.size();
    System.out.println("#words/doc: " + getTotalTokenNum(docset) / docCount);
    System.out.println("#labels: " + getLabelSet(docset).size());
    System.out.println("#labels/doc: " + getTotalAssignmentNum(docset) / docCount);
  }

  /**
   * Prints the joint statistics of two document sets (e.g. a train and an
   * evaluation split) to standard output: both sizes, and then the average
   * number of tokens per document, the number of distinct labels and the
   * average number of labels per document computed over the union of the two sets.
   * <p>
   * If both sets are empty the averages are 0.0/0.0 and are printed as NaN.
   *
   * @param docseta the first document set
   * @param docsetb the second document set
   */
  public static void printStatistics(DocumentSet docseta, DocumentSet docsetb) {
    System.out.println("SizeA: " + docseta.size());
    System.out.println("SizeB: " + docsetb.size());
    double docCount = docseta.size() + docsetb.size();

    // Sum the two int totals in double arithmetic so that a large combined
    // corpus cannot overflow the int range before the division.
    double tokenTotal = (double) getTotalTokenNum(docseta) + getTotalTokenNum(docsetb);
    System.out.println("#words/doc: " + tokenTotal / docCount);

    // getLabelSet returns a fresh set, so it is safe to merge into it.
    Set<String> labels = getLabelSet(docseta);
    labels.addAll(getLabelSet(docsetb));
    System.out.println("#labels: " + labels.size());

    double assignmentTotal =
        (double) getTotalAssignmentNum(docseta) + getTotalAssignmentNum(docsetb);
    System.out.println("#labels/doc: " + assignmentTotal / docCount);
  }

  /**
   * Reads the train and evaluation splits of the CMC, OBES, WIKI and REUTERS
   * corpora from their hard-coded locations under {@code corpus/} and prints
   * the joint statistics of each corpus.
   *
   * @param args command line arguments (unused)
   */
  public static void main(String[] args) {
    DocumentSet train = new CMCDataHolder();
    train.readDocumentSet("corpus/2007ChallengeTrainData.xml");
    DocumentSet eval = new CMCDataHolder();
    eval.readDocumentSet("corpus/2007ChallengeTestDataCodes.xml");
    System.out.println("--------  CMC:");
    printStatistics(train, eval);

    train = new ObesDocumentSet();
    train.readDocumentSet("corpus/obes/|corpus/obesity_standoff_annotations_training_all.xml");
    eval = new ObesDocumentSet();
    eval.readDocumentSet("corpus/obes_test/|corpus/obesity_standoff_annotations_test.xml");
    System.out.println("\n--------  OBES:");
    printStatistics(train, eval);

    train = new WikiDocSet();
    train.readDocumentSet("corpus/wiki.txt|corpus/id_cat_train_filter");
    eval = new WikiDocSet();
    eval.readDocumentSet("corpus/wiki.txt|corpus/id_cat_eval_filter");
    System.out.println("\n--------  WIKI:");
    printStatistics(train, eval);

    train = new ReutersDocSet();
    train.readDocumentSet("corpus/reuters.xml|TRAIN");
    eval = new ReutersDocSet();
    eval.readDocumentSet("corpus/reuters.xml|TEST");
    System.out.println("\n--------  REUTERS:");
    printStatistics(train, eval);
  }
}
