package szte.io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * 
 * The reader and container class for the Wikipedia soccer corpus.
 *
 */
public class WikiDocSet extends HashSet<Document> implements DocumentSet {
  /** Prefix of the header line that opens every document in the corpus file. */
  private static final String DOC_START = "-DOCSTART-";

  /**
   * Loads the corpus and its gold-standard labels into this set.
   *
   * <p>The corpus is a single UTF-8 text file in which every document is introduced by a
   * header line of the form {@code -DOCSTART-<TAB>documentid}; all lines up to the next
   * header (or the end of the file) form the text of that document. The labels are read
   * from a text file with one {@code documentid<TAB>label} pair per line; a document id
   * may occur on several lines, in which case all of its labels are collected.
   *
   * <p>Only documents that have at least one label are added. An {@link IOException}
   * raised while reading is printed to standard error and reading stops; documents added
   * up to that point are kept. The number of documents in the set is printed to standard
   * output at the end.
   *
   * @param file the corpus file path and the label file path, separated by a {@code |}
   * @throws IllegalArgumentException if {@code file} does not contain both paths
   */
  public void readDocumentSet(String file) {
    String[] paths = file.split("\\|");
    if (paths.length < 2) {
      throw new IllegalArgumentException(
          "Expected \"<corpus file>|<label file>\" but got: " + file);
    }
    try {
      Map<String, Set<String>> labels = readLabels(paths[1]);
      readDocuments(paths[0], labels);
    } catch (IOException e) {
      e.printStackTrace();
    }
    System.out.println(size()+" docs");
  }

  /** Reads the "documentid TAB label" file into a map from document id to its labels. */
  private static Map<String, Set<String>> readLabels(String labelPath) throws IOException {
    Map<String, Set<String>> labels = new HashMap<String, Set<String>>();
    // NOTE(review): FileReader decodes with the platform default charset, whereas the
    // corpus file is read as UTF-8 - confirm that the label file is plain ASCII.
    BufferedReader reader = new BufferedReader(new FileReader(labelPath));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] fields = line.split("\t");
        if (fields.length < 2) {
          // Blank line or a line without a label: there is nothing to record.
          continue;
        }
        Set<String> docLabels = labels.get(fields[0]);
        if (docLabels == null) {
          docLabels = new HashSet<String>();
          labels.put(fields[0], docLabels);
        }
        docLabels.add(fields[1]);
      }
    } finally {
      reader.close();
    }
    return labels;
  }

  /** Splits the corpus file at its header lines and adds every labelled document. */
  private void readDocuments(String corpusPath, Map<String, Set<String>> labels)
      throws IOException {
    FileInputStream in = new FileInputStream(corpusPath);
    try {
      BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
      // No document is open until the first header has been seen, so any text that
      // precedes it is not attributed to a document.
      String id = null;
      StringBuilder body = new StringBuilder();
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.startsWith(DOC_START)) {
          // A new header closes the document collected so far.
          addDocument(id, body.toString(), labels);
          String[] header = line.split("\t");
          id = header.length > 1 ? header[1] : null;
          body.setLength(0);
          continue;
        }
        body.append(line).append('\n');
      }
      // The last document is terminated by the end of the file, not by another header.
      addDocument(id, body.toString(), labels);
    } finally {
      // Closing the underlying stream releases the file handle held by the readers.
      in.close();
    }
  }

  /** Adds the document to this set if it has an id for which labels are known. */
  private void addDocument(String id, String rawText, Map<String, Set<String>> labels)
      throws IOException {
    if (id == null) {
      return;
    }
    Set<String> docLabels = labels.get(id);
    if (docLabels == null) {
      return;
    }
    add(new SimpleDocument(id, cleanText(rawText), docLabels));
  }

  /**
   * Round-trips the text through ISO-8859-1 and blanks out the characters U+00A0 to U+00BF.
   *
   * <p>NOTE(review): the round trip presumably exists to squash characters that
   * ISO-8859-1 cannot represent (the JDK substitutes a replacement byte for them in
   * practice) - confirm that this lossy step is intended.
   */
  private static String cleanText(String raw) throws IOException {
    char[] chars = new String(raw.getBytes("ISO8859_1"), "ISO8859_1").toCharArray();
    for (int i = 0; i < chars.length; ++i) {
      if (chars[i] >= 160 && chars[i] < 192) {
        chars[i] = ' ';
      }
    }
    return new String(chars);
  }
}
