package szte.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

/**
 * 
 * The reader and container class for the Reuters corpus.
 * We use the 10 most frequent labels.
 *
 */
public class ReutersDocSet extends HashSet<Document> implements DocumentSet {

  /**
   * The labels that are kept, lower-cased and padded with single spaces so that a
   * whole-word membership test can be done with {@code contains(" " + label + " ")}.
   */
  private static final String SELECTED_LABELS =
      " acq corn crude earn grain interest money ship trade wheat ";

  /**
   * Reads the documents of one split from a Reuters file and adds them to this set.
   *
   * <p>The input is processed line by line: a line starting with {@code <REUTERS}
   * opens a new document, a line containing {@code <TOPICS>} supplies its labels and
   * a line containing {@code <TEXT} starts the body, which runs up to the next line
   * containing {@code </TEXT>}. A document is only added when it belongs to the
   * requested split and carries at least one of the selected labels.
   *
   * <p>An {@link IOException} raised while reading is printed to standard error and
   * not propagated; documents read before the failure stay in the set.
   *
   * @param file the path for the reuters.xml and TRAIN or TEST for indicating the
   *        Lewis split, separated by a pipe character, e.g.
   *        {@code "corpus/reuters.xml|TRAIN"}
   */
  public void readDocumentSet(String file) {
    String[] parts = file.split("\\|");
    String filename = parts[0];
    String split = parts[1];
    try {
      BufferedReader br = new BufferedReader(new FileReader(filename));
      try {
        String line;
        String id = "";
        Set<String> labels = new HashSet<String>();
        while ((line = br.readLine()) != null) {
          if (line.startsWith("<REUTERS")) {
            // The header is split on double quotes: field 3 is compared with the
            // requested split and field 9 is used as the document id (presumably
            // the LEWISSPLIT and NEWID attribute values - depends on attribute order).
            String[] attrs = line.split("\"");
            String candidateId = attrs[9];
            id = attrs[3].equals(split) ? candidateId : "";
            // Start every document with a fresh label set so that a document
            // without a <TOPICS> line cannot inherit the previous one's labels.
            labels = new HashSet<String>();
          } else if (line.contains("<TOPICS>")) {
            labels = parseLabels(line);
          } else if (line.contains("<TEXT")) {
            String text = cleanText(readTextBody(br));
            if (!id.equals("") && !labels.isEmpty()) {
              super.add(new SimpleDocument(id, text, labels));
            }
          }
        }
        System.out.println(size() + " docs read.");
      } finally {
        // Release the file handle even when parsing fails half way through.
        br.close();
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Extracts the selected labels from a {@code <TOPICS>} line.
   *
   * <p>Each {@code <D>} entry is lower-cased and everything from the first hyphen on
   * is dropped before it is checked against the selected labels.
   */
  private static Set<String> parseLabels(String topicsLine) {
    String stripped = topicsLine.replaceAll("</?TOPICS>", "").replaceAll("</D>", "");
    String[] entries = stripped.split("<D>");
    Set<String> labels = new HashSet<String>();
    // entries[0] is whatever precedes the first <D>, so the labels start at index 1.
    for (int i = 1; i < entries.length; ++i) {
      String label = entries[i].toLowerCase(java.util.Locale.ENGLISH).replaceAll("-.*", "");
      if (SELECTED_LABELS.contains(" " + label + " ")) {
        labels.add(label);
      }
    }
    return labels;
  }

  /**
   * Collects the lines following a {@code <TEXT} line up to (not including) the next
   * line that contains {@code </TEXT>}, each terminated by a newline.
   */
  private static String readTextBody(BufferedReader br) throws IOException {
    StringBuilder body = new StringBuilder();
    String line;
    while ((line = br.readLine()) != null && !line.contains("</TEXT>")) {
      body.append(line).append('\n');
    }
    return body.toString();
  }

  /**
   * Turns the raw content of a {@code TEXT} element into plain text: the title is
   * converted into a sentence, single-line datelines and all markup are removed and
   * unwanted characters are replaced by spaces.
   */
  private static String cleanText(String raw) {
    String text = raw;
    int b = text.indexOf("<TITLE>");
    if (b >= 0) {
      int e = text.indexOf("</TITLE>");
      // Only rewrite a well-formed, non-empty title; otherwise the tags are simply
      // removed by the generic markup stripping below instead of crashing here.
      if (e >= b + 8) {
        // Keep the first title character as is, lower-case the rest and close the
        // title with a full stop so that it reads as a sentence.
        text = text.substring(0, b) + " " + text.charAt(b + 7)
            + text.substring(b + 8, e).toLowerCase(java.util.Locale.ENGLISH)
            + ". " + text.substring(e + 8);
      }
    }
    // '.' does not match line breaks, so only single-line datelines are removed.
    text = text.replaceAll("<DATELINE>.*</DATELINE>", "");
    // Unescape "&lt;" first so that escaped tags are stripped together with real ones.
    text = text.replaceAll("&lt;", "<");
    text = text.replaceAll("<[^>]+>", "");

    // Blank out the code points 160..191 and everything above 65532.
    StringBuilder cleaned = new StringBuilder(text);
    for (int i = 0; i < cleaned.length(); ++i) {
      char c = cleaned.charAt(i);
      if ((c >= 160 && c < 192) || c > 65532) {
        cleaned.setCharAt(i, ' ');
      }
    }
    return cleaned.toString().trim();
  }

  /**
   * Reads the TRAIN split of {@code corpus/reuters.xml} and writes the text of every
   * document into one file per label, at {@code <label>/<docID>} relative to the
   * working directory. Label directories are created when missing.
   *
   * @param args ignored
   */
  public static void main(String args[]) {
    ReutersDocSet train = new ReutersDocSet();
    train.readDocumentSet("corpus/reuters.xml|TRAIN");
    for (Document d : train) {
      for (String l : d.getLabels()) {
        File dir = new File(l);
        if (!dir.exists()) {
          dir.mkdir();
        }
        try {
          PrintWriter log = new PrintWriter(l + "/" + d.getDocID());
          try {
            log.println(d.getText());
            log.flush();
          } finally {
            log.close();
          }
        } catch (FileNotFoundException e) {
          e.printStackTrace();
        }
      }
    }
  }
}
