package szte.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

/**
 *  
 *   The reader and container class for the Obesity 2008 challenge corpus.
 *   To obtain it please visit: https://www.i2b2.org/NLP/Obesity/
 *   
 */
public class ObesDocumentSet extends HashSet<Document> implements DocumentSet {

  /**
   * Maps a document id to the set of normalized disease names whose judgment
   * for that document is "Y". Every document id seen in the textual section of
   * the label file gets an entry, possibly with an empty set.
   * Populated by {@link #readLabels(String)}.
   */
  protected Map<String, Set<String>> doclabels;

  /**
   * Reads the gold-standard labels from the label XML file into {@link #doclabels}.
   * Only the section between the line <code>&lt;diseases source="textual"&gt;</code>
   * and the next line starting with <code>&lt;/diseases</code> is processed.
   * The file is parsed line by line: tags are recognised only when they start
   * at the very beginning of a line.
   *
   * @param file path of the label XML file
   * @throws IOException if the file cannot be opened or read
   */
  protected void readLabels(String file) throws IOException
  {
    String line, disease = "";
    doclabels = new HashMap<String, Set<String>>();
    BufferedReader inp = new BufferedReader(new FileReader(file));
    try
    {
      // Skip everything up to (and including) the opening tag of the textual judgments.
      while ((line = inp.readLine()) != null && !line.equals("<diseases source=\"textual\">")) {}

      while ((line = inp.readLine()) != null && !line.startsWith("</diseases"))
      {
        if (line.startsWith("<disease name"))
        {
          // 15 == length of `<disease name="`; the name runs up to the last quote.
          disease = line.substring(15, line.lastIndexOf('\"'));
          disease = disease.toLowerCase().replaceAll(" ", "_");
        }
        else if (line.startsWith("<doc id="))
        {
          // 9 == length of `<doc id="`; the id runs up to the next quote.
          String docId = line.substring(9, line.indexOf("\"", 9));
          Set<String> labels = doclabels.get(docId);
          if (labels == null)
          {
            labels = new HashSet<String>();
            doclabels.put(docId, labels);
          }
          // The judgment is the single character right before the last quote of the line.
          int lastQuote = line.lastIndexOf("\"");
          if (line.substring(lastQuote - 1, lastQuote).equals("Y"))
          {
            labels.add(disease);
          }
        }
      }
    }
    finally
    {
      // Release the file handle even if parsing fails half-way.
      inp.close();
    }
  }

  /**
   *  The documents are in a directory (txt files) and the gold-standard labels are in an XML file.
   *  I/O errors are caught here and their stack trace is printed; they are not propagated.
   *  @param file consists of the path to the document directory (*.txt files will be read from this directory)
   *         and the path of the label XML file, separated by a |
   */
  public void readDocumentSet(String file) {
    String[] parts = file.split("\\|");
    if (parts.length != 2)
    {
      System.err.println("The documentset acces string must contain the corpus directory and the label xml separated by |");
      return;
    }
    try {
      // Labels first: readCorpus only keeps documents that have a label entry.
      readLabels(parts[1]);
      readCorpus(parts[0]);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /**
   * Clears this set and fills it with one {@code SimpleDocument} per *.txt file
   * of the given directory whose id (the file name up to its first '.') has an
   * entry in {@link #doclabels}. The document text is lower-cased and every
   * line is terminated by a single '\n'.
   *
   * @param dir path of the directory holding the *.txt documents
   * @throws FileNotFoundException if {@code dir} cannot be listed
   *         (it does not exist or is not a directory)
   * @throws IOException if a document cannot be opened or read
   */
  protected void readCorpus(String dir) throws IOException {
    System.out.println("Reading corpus...");
    String[] fileNames = new File(dir).list();
    // File.list() yields null instead of throwing when the path is not a listable directory.
    if (fileNames == null)
      throw new FileNotFoundException("Corpus directory cannot be listed: " + dir);
    super.clear();
    for (int i = 0; i < fileNames.length; i++)
    {
      if (!fileNames[i].endsWith(".txt")) continue;
      String docId = fileNames[i].substring(0, fileNames[i].indexOf('.'));
      // Check the label before opening the file so unlabeled documents cost no file handle.
      if (!doclabels.containsKey(docId))
        continue;

      StringBuilder text = new StringBuilder();
      BufferedReader inp = new BufferedReader(new FileReader(dir + "/" + fileNames[i]));
      try
      {
        String line;
        while ((line = inp.readLine()) != null)
        {
          text.append(line.toLowerCase()).append('\n');
        }
      }
      finally
      {
        inp.close();
      }
      super.add(new SimpleDocument(docId, text.toString(), doclabels.get(docId)));
    }
  }
}
