package szte.nlputils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Locale;

/**
 * Converts a Wikipedia XML dump into plain text, one kept paragraph per line.
 *
 * <p>The dump is read line by line. A line containing {@code <id>} that directly
 * follows a {@code <title>} line starts a new document, written as
 * {@code -DOCSTART-<TAB>id}. Every other line is stripped of XML tags, wiki
 * templates, links and similar markup and is kept only if it still looks like
 * running text.
 *
 * <p>The class keeps its working state in static fields, so only one dump can be
 * processed at a time.
 */
public class WikiText {

  /** Last line seen by {@link #processLine} that did not start a new document. */
  static String prevline = "";

  /** Text collected for the document that is currently being assembled. */
  static StringBuffer sb = new StringBuffer();

  /** Lower-case markers; a line containing any of them is skipped entirely. */
  private static final String[] SKIP_MARKERS = {
    "talk:", "wikipedia:", "template:", "portal:",
    "user:mathbot", "user talk:", "user:", "image:",
    "#redirect", "redirects here", "this article is being considered"
  };

  /**
   * Entity replacements, applied strictly in this order. {@code &amp;} comes
   * first so that doubly escaped entities are resolved by the later entries.
   * The bare {@code #NNN;} forms are kept next to the full {@code &#NNN;} forms
   * so that input lacking the ampersand is still handled.
   */
  private static final String[][] ENTITIES = {
    {"&amp;", "&"},
    {"&#123;", "{"}, {"#123;", "{"},
    {"&#124;", "|"}, {"#124;", "|"},
    {"&#125;", "}"}, {"#125;", "}"},
    {"&#126;", "~"}, {"#126;", "~"},
    {"&#91;", "["},
    {"&#93;", "]"},
    {"&eacute;", "e"},
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&ndash;", "-"},
    {"&mdash;", "-"},
    {"&minus;", "-"},
    {"&nbsp;", " "},
    {"&quot;", "\""},
    {"&or;", "OR"},
    {"&bull;", "*"},
    {"&beta;", "B"},
    {"&times;", "*"},
    {"&sup2;", "^"},
    {"&#8212;", "-"},
    {"&#8216;", "'"},
    {"&#8217;", "'"},
    {"&#8220;", "\""},
    {"&#8221;", "\""},
    {"&#7717;", "h"},
    {"&#596;", "c"},
    {"&#803;", ""},
    {"&#9786;", ""},
    {"&#x200b;", ""},
    {"&#X200b;", ""},
    {"&#363;", "u"}, {"#363;", "u"}
  };

  /** Cleaned lines shorter than this are dropped. */
  private static final int MIN_LINE_LENGTH = 40;

  /** Maximum tolerated share of characters that are neither a-z nor a space. */
  private static final double MAX_NOISE_RATIO = 0.12;

  /** Input used by {@link #main} when no argument is given. */
  private static final String DEFAULT_INPUT =
      "w:/project/LocalTextClassifier/Wikipedia-20100119222814.xml";

  /** Output used by {@link #main} when fewer than two arguments are given. */
  private static final String DEFAULT_OUTPUT = "w:/project/LocalTextClassifier/wiki.txt";

  /**
   * Tells whether a raw dump line should be ignored altogether.
   *
   * @param line raw line from the dump
   * @return {@code true} if the lower-cased line contains one of the skip markers
   */
  protected static boolean skipheader(String line) {
    // Locale.ROOT keeps the match independent of the default locale
    // (in a Turkish locale "WIKIPEDIA:" would not lower-case to "wikipedia:").
    String titleToLower = line.toLowerCase(Locale.ROOT);
    for (String marker : SKIP_MARKERS) {
      if (titleToLower.contains(marker)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Processes one line of the dump.
   *
   * <p>An {@code <id>} line directly after a {@code <title>} line flushes the
   * buffered document to {@code out} and starts a new one. Any other line is
   * cleaned and, if it passes the filters, appended to the current document.
   *
   * @param line raw line from the dump
   * @param out  destination for finished documents
   * @throws IOException if writing a finished document fails
   */
  protected static void processLine(String line, OutputStreamWriter out) throws IOException {
    int idStart = line.indexOf("<id>");
    if (idStart >= 0 && prevline != null && prevline.contains("<title>")) {
      flushDocument(out);
      sb.append("-DOCSTART-\t" + extractId(line, idStart) + "\n");
      return;
    }
    prevline = line;
    if (skipheader(line)) {
      return;
    }
    // Applied twice, as in the original; presumably to resolve entities that
    // only appear after the first pass has unescaped "&amp;".
    line = replaceEntities(line);
    line = replaceEntities(line);
    line = clearLine(line);
    if (line.length() < MIN_LINE_LENGTH || isMarkupLine(line)) {
      return;
    }
    // The division must be done in floating point: with int operands the
    // quotient is 0 for every line that contains at least one a-z or space.
    double noiseRatio = (double) getNumOfShitChars(line) / line.length();
    if (noiseRatio > MAX_NOISE_RATIO) {
      return;
    }
    line = removeLinkTag(line);
    sb.append(line + "\n");
  }

  /** Writes the buffered document if it has more than its header line, then clears the buffer. */
  private static void flushDocument(OutputStreamWriter out) throws IOException {
    String document = sb.toString();
    if (document.split("\n").length > 1) {
      out.write(document);
    }
    sb = new StringBuffer();
  }

  /** Returns the text between {@code <id>} and {@code </id>}, or the trimmed rest of the line if unclosed. */
  private static String extractId(String line, int idStart) {
    int from = idStart + "<id>".length();
    int to = line.indexOf("</id>", from);
    return to >= 0 ? line.substring(from, to) : line.substring(from).trim();
  }

  /** Tells whether a cleaned line starts like list/table markup or still carries key=value|... content. */
  private static boolean isMarkupLine(String line) {
    return line.startsWith("!") || line.startsWith("|") || line.startsWith("*")
        || line.startsWith("?") || line.startsWith(";")
        || line.startsWith("[[Category:") || line.matches("\\[\\[..:.*")
        || (line.contains("=") && line.contains("|"));
  }

  /**
   * Replaces every {@code [[target]]} by {@code target} and every
   * {@code [[target|label]]} by the text after the first {@code |}.
   *
   * <p>If an opening {@code [[} has no closing {@code ]]}, the line is printed
   * to standard error and returned as it is at that point.
   *
   * @param line line possibly containing wiki links
   * @return the line with link brackets removed
   */
  protected static String removeLinkTag(String line) {
    int open = line.indexOf("[[");
    while (open >= 0) {
      int close = line.indexOf("]]", open);
      if (close < 0) {
        System.err.println(line);
        return line;
      }
      int pipe = line.indexOf("|", open);
      String text = (pipe > 0 && pipe < close)
          ? line.substring(pipe + 1, close)
          : line.substring(open + 2, close);
      line = line.substring(0, open) + text + line.substring(close + 2);
      open = line.indexOf("[[", open);
    }
    return line;
  }

  /**
   * Counts the characters that are neither a lower-case ASCII letter nor a space.
   *
   * @param s text to inspect
   * @return number of such characters
   */
  protected static int getNumOfShitChars(String s) {
    int notChar = 0;
    for (int i = 0; i < s.length(); i++) {
      char ch = s.charAt(i);
      boolean plain = (ch >= 'a' && ch <= 'z') || ch == ' ';
      if (!plain) {
        notChar++;
      }
    }
    return notChar;
  }

  /**
   * Counts the non-overlapping occurrences of {@code needle} in {@code subject}.
   *
   * @param subject text to search
   * @param needle  text to look for; an empty needle yields 0
   * @return number of occurrences
   */
  protected static int getInstancesOf(String subject, String needle) {
    if (needle.length() == 0) {
      // indexOf("") never advances, which would loop forever.
      return 0;
    }
    int count = 0;
    for (int pos = subject.indexOf(needle); pos != -1;
        pos = subject.indexOf(needle, pos + needle.length())) {
      count++;
    }
    return count;
  }

  /**
   * Removes every span from {@code from} to {@code to}, using the same markers
   * for the nesting check.
   *
   * @see #removeFromTo(String, String, String, String, String)
   */
  protected static String removeFromTo(String s, String from, String to) {
    return removeFromTo(s, from, to, from, to);
  }

  /**
   * Removes every span that starts with {@code from} and ends with {@code to},
   * markers included.
   *
   * <p>{@code from} is searched in a lower-cased copy of the text, so it has to
   * be given in lower case; {@code to} is searched case-sensitively. The closing
   * marker is pushed outwards until the removed span contains as many
   * {@code checkFrom} as {@code checkTo}. If no suitable closing marker exists,
   * everything from the opening marker onwards is dropped.
   *
   * @param s         text to clean
   * @param from      lower-case opening marker; an empty marker leaves {@code s} unchanged
   * @param to        closing marker
   * @param checkFrom marker counted as "opening" inside a candidate span
   * @param checkTo   marker counted as "closing" inside a candidate span
   * @return the text without the matched spans
   */
  protected static String removeFromTo(String s, String from, String to, String checkFrom, String checkTo) {
    if (from.length() == 0) {
      // An empty opening marker matches at index 0 forever.
      return s;
    }
    int openTagPos = s.toLowerCase(Locale.ROOT).indexOf(from);
    while (openTagPos != -1) {
      int bodyStart = openTagPos + from.length();
      int closeTagPos = s.indexOf(to, bodyStart);
      while (closeTagPos != -1) {
        String inner = s.substring(bodyStart, closeTagPos);
        if (getInstancesOf(inner, checkFrom) == getInstancesOf(inner, checkTo)) {
          break;
        }
        closeTagPos = s.indexOf(to, closeTagPos + to.length());
      }
      if (closeTagPos != -1) {
        s = s.substring(0, openTagPos) + s.substring(closeTagPos + to.length());
      } else {
        s = s.substring(0, openTagPos);
      }
      openTagPos = s.toLowerCase(Locale.ROOT).indexOf(from);
    }
    return s;
  }

  /**
   * Removes XML/HTML elements from a line.
   *
   * <p>A line that consists of a single tag becomes empty. Otherwise, for each
   * {@code <}: the tag name is taken up to the first {@code >} or space
   * (inclusive); if a closing tag built from that name exists, the element is
   * removed via {@link #removeFromTo(String, String, String)}, else the text is
   * cut off at the {@code <}. An unterminated {@code <} ends processing.
   *
   * @param content raw line
   * @return trimmed line without the elements
   */
  protected static String removeXML(String content) {
    content = content.trim();
    if (content.matches("<[^>]+>")) {
      return "";
    }
    int lt = content.indexOf('<');
    while (lt >= 0) {
      int end = content.indexOf('>', lt);
      int space = content.indexOf(' ', lt);
      if (space > 0 && space < end) {
        end = space;
      }
      if (end < 0) {
        return content;
      }
      String tag = content.substring(lt + 1, end + 1).toLowerCase(Locale.ROOT);
      // NOTE(review): tag is the text AFTER '<', so an ordinary closing tag gives
      // "/name>" and does not trigger this check; only input such as "<</" does.
      // Kept as is because widening it would terminate the JVM on stray closing tags.
      if (tag.startsWith("</")) {
        System.err.println(content);
        System.err.println("Closing tag without open");
        System.exit(1);
      }
      if (content.contains("</" + tag)) {
        content = removeFromTo(content, "<" + tag, "</" + tag);
      } else {
        content = content.substring(0, lt);
      }
      lt = content.indexOf('<');
    }
    return content;
  }

  /**
   * Runs all markup-removal steps on one line: XML elements, templates, image
   * links, style/colspan attributes, URLs, unclosed templates, CSS brace groups
   * and image tags, followed by whitespace normalisation.
   *
   * @param content line with entities already replaced
   * @return cleaned, trimmed line
   */
  protected static String clearLine(String content) {
    content = removeXML(content);
    content = removeFromTo(content, "{{", "}}");
    content = removeFromTo(content, "[[image:", "]]", "[[", "]]");
    content = removeFromTo(content, "style=\"", "\"");
    content = removeFromTo(content, "colspan=", " ");
    content = removeLink(content, "http://");
    content = removeLink(content, "ftp://");
    content = removeLink(content, "https://");
    content = removeLink(content, "www.");
    content = removeUnclosedWikiTags(content, "{{", "}}");
    content = removeCssStyles(content);
    content = handleImageTags(content);
    content = content.replaceAll("\\(\\)", "");
    content = content.replaceAll("(\\s){2,}", " ");
    return content.trim();
  }

  /**
   * Cuts the text at the last {@code open} marker if no {@code close} marker
   * follows it.
   *
   * @param s     text to check
   * @param open  opening marker
   * @param close closing marker
   * @return {@code s}, possibly truncated at the last unclosed {@code open}
   */
  protected static String removeUnclosedWikiTags(String s, String open, String close) {
    int lastOpen = s.lastIndexOf(open);
    if (lastOpen != -1 && s.indexOf(close, lastOpen) == -1) {
      return s.substring(0, lastOpen);
    }
    return s;
  }

  /**
   * Removes <code>{...}</code> groups that contain a {@code style="} attribute.
   *
   * <p>Scanning stops as soon as no {@code style="} follows the current
   * <code>{</code>. If a <code>{</code> is followed by {@code style="} but by no
   * <code>}</code>, the text is cut at that <code>{</code>.
   *
   * @param s text to clean
   * @return text without the styled brace groups
   */
  protected static String removeCssStyles(String s) {
    int open = s.indexOf("{");
    while (open != -1) {
      // Absolute index, so that it can be compared with the position of '}'.
      int style = s.indexOf("style=\"", open + 1);
      if (style == -1) {
        break;
      }
      int close = s.indexOf("}", open);
      if (close == -1) {
        s = s.substring(0, open);
        break;
      }
      if (style < close) {
        s = s.substring(0, open) + s.substring(close + 1);
        // The text shifted left, so a new '{' may now sit exactly at 'open'.
        open = s.indexOf("{", open);
      } else {
        open = s.indexOf("{", open + 1);
      }
    }
    return s;
  }

  /**
   * Removes the text from each case-insensitive {@code image:} up to and
   * including a following {@code |}.
   *
   * <p>The {@code |} used is found by walking backwards from the last one in
   * the text until it lies at most 100 characters after {@code image:}.
   * Processing stops when no such {@code |} exists.
   *
   * @param c text to clean
   * @return text without the matched image prefixes
   */
  protected static String handleImageTags(String c) {
    String copy = c.toLowerCase(Locale.ROOT);
    int s = copy.indexOf("image:");
    while (s != -1) {
      int end = copy.lastIndexOf("|");
      if (end == -1 || end <= s) {
        break;
      }
      int maxIter = 2000;
      while (end != -1 && end - s > 100) {
        end = copy.lastIndexOf("|", end - 1);
        if (maxIter-- == 0) {
          break;
        }
      }
      if (end == -1 || end - s > 100 || end - s < 0) {
        break;
      }
      c = c.substring(0, s) + c.substring(end + 1);
      copy = copy.substring(0, s) + copy.substring(end + 1);
      s = copy.indexOf("image:");
    }
    return c;
  }

  /**
   * Drops a leading template remainder, or the whole text if it looks like
   * template parameters.
   *
   * <p>If one of the endings <code>|}}}}</code>, <code>|}}}</code>,
   * <code>|}}</code>, <code>|}</code> or <code>}}</code> occurs (ignoring
   * occurrences directly preceded by {@code 'w'}), the text after the right-most
   * match position plus two characters is returned. Otherwise {@code =} counts
   * double and {@code |} single; if their share exceeds 3% the result is empty.
   *
   * @param s text to check
   * @return the shortened text, an empty string, or {@code s} unchanged
   */
  protected static String cutFirstPartIfSuspicious(String s) {
    String[] ends = {"|}}}}", "|}}}", "|}}", "|}", "}}"};
    int bestEndPos = -1;
    for (String ending : ends) {
      int actEndPos = s.length();
      do {
        actEndPos = s.lastIndexOf(ending, actEndPos - 1);
      } while (actEndPos > 0 && s.charAt(actEndPos - 1) == 'w');
      if (actEndPos > bestEndPos) {
        bestEndPos = actEndPos;
      }
    }
    if (bestEndPos != -1) {
      return s.substring(bestEndPos + 2);
    }
    int length = s.length();
    int susp = 0;
    for (int i = 0; i < length; i++) {
      char ch = s.charAt(i);
      if (ch == '=') {
        susp += 2;
      } else if (ch == '|') {
        susp++;
      }
    }
    double rate = susp * 1.0 / length;
    return rate > 0.03 ? "" : s;
  }

  /**
   * Removes every link that starts with {@code url}.
   *
   * <p>A link ends at the first space, {@code ]} or <code>}</code> after its
   * start. A bare link is deleted together with that terminator. A link written
   * as {@code [url label]} is replaced by its label, and {@code [url]} is
   * removed completely.
   *
   * @param s   text to clean
   * @param url link prefix such as {@code "http://"}; an empty prefix leaves {@code s} unchanged
   * @return text without the links
   */
  protected static String removeLink(String s, String url) {
    if (url.length() == 0) {
      return s;
    }
    int l = s.indexOf(url);
    while (l != -1) {
      int openBracket = (l > 0 && s.charAt(l - 1) == '[') ? l - 1 : -1;
      int endOfLink = firstLinkTerminator(s, l);
      if (openBracket == -1) {
        // Not a wiki link: delete just the link, up to its terminator.
        s = (endOfLink != -1)
            ? s.substring(0, l) + s.substring(endOfLink + 1)
            : s.substring(0, l);
      } else {
        int closeBracket = s.indexOf("]", l);
        if (closeBracket != -1) {
          // The link is presumably inside [ ]: keep only the label, if any.
          if (endOfLink != -1 && endOfLink < closeBracket) {
            s = s.substring(0, openBracket) + s.substring(endOfLink + 1, closeBracket)
                + s.substring(closeBracket + 1);
          } else {
            s = s.substring(0, openBracket) + s.substring(closeBracket + 1);
          }
        } else {
          // There is an opening [ but no closing ].
          s = (endOfLink != -1)
              ? s.substring(0, l - 1) + s.substring(endOfLink + 1)
              : s.substring(0, l - 1);
        }
      }
      l = s.indexOf(url);
    }
    return s;
  }

  /** Returns the smallest index at or after {@code from} holding a space, ']' or '}', or -1 if none exists. */
  private static int firstLinkTerminator(String s, int from) {
    int best = -1;
    int[] candidates = {s.indexOf(" ", from), s.indexOf("]", from), s.indexOf("}", from)};
    for (int candidate : candidates) {
      // A result of -1 means "not found" and must not win the minimum.
      if (candidate != -1 && (best == -1 || candidate < best)) {
        best = candidate;
      }
    }
    return best;
  }

  /**
   * Replaces the known character entities by plain replacements.
   *
   * <p>Replacements are literal (no regular expressions) and run in a fixed
   * order, starting with {@code &amp;}.
   *
   * @param s text possibly containing entities
   * @return text with the known entities replaced
   */
  protected static String replaceEntities(String s) {
    for (String[] entity : ENTITIES) {
      s = s.replace(entity[0], entity[1]);
    }
    return s;
  }

  /**
   * Converts the dump {@code infile} into the text file {@code outfile}; both
   * are read and written as UTF-8.
   *
   * <p>If {@code infile} does not end with {@code .xml} a message is printed to
   * standard error and the JVM exits with status 1.
   *
   * @param infile  path of the XML dump
   * @param outfile path of the text file to create
   * @throws IOException if reading or writing fails
   */
  public static void process(String infile, String outfile) throws IOException {
    if (!infile.endsWith(".xml")) {
      System.err.println("Input should be an XML");
      System.exit(1);
    }
    // Start from a clean state so that an earlier run cannot leak into this one.
    prevline = "";
    sb = new StringBuffer();
    BufferedReader br =
        new BufferedReader(new InputStreamReader(new FileInputStream(infile), "UTF-8"));
    try {
      OutputStreamWriter out =
          new OutputStreamWriter(new FileOutputStream(new File(outfile)), "UTF-8");
      try {
        String line;
        while ((line = br.readLine()) != null) {
          processLine(line, out);
        }
        // The last document has no following <id> line that would flush it.
        flushDocument(out);
      } finally {
        out.close();
      }
    } finally {
      br.close();
    }
  }

  /**
   * Command-line entry point.
   *
   * @param args optional input path and output path; the built-in default paths
   *             are used for whichever is missing
   * @throws Exception if processing fails
   */
  public static void main(String[] args) throws Exception {
    String infile = args.length > 0 ? args[0] : DEFAULT_INPUT;
    String outfile = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
    process(infile, outfile);
  }
}
