/*
 * Decompiled with CFR 0.152.
 */
package tsg.corpora;

import java.io.File;
import java.io.PrintWriter;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Scanner;
import java.util.TreeSet;
import settings.Parameters;
import tesniere.Box;
import tesniere.Word;
import tsg.Label;
import tsg.TSNode;
import tsg.TSNodeLabel;
import tsg.TSNodeLabelStructure;
import tsg.corpora.Auxify;
import tsg.corpora.ConstCorpus;
import util.FileUtil;
import util.Utility;

public abstract class Wsj
extends ConstCorpus {
    private static final long serialVersionUID = 0L;
    // Run-time switches selecting the corpus configuration. They are read by
    // retriveTrainingAndTestCorpus() and writeParam(); no value is assigned to them in the
    // static initializer below, so they are expected to be set by the caller.
    // testSet is appended to "wsj-" to form the test section file name.
    public static String testSet;
    // When true, the sentences listed in collinsSkipArr are removed from the training corpus.
    public static boolean skip120TrainingSentences;
    // Enable the NP -> NPB and S -> SG transformations on training trees (see finalCleaning).
    public static boolean transformNPbasal;
    public static boolean transformSG;
    // Index into initialHeadLabels / possibleInitialHeadPath.
    public static int initialHeads;
    // Directory paths of the different corpus variants; all are assigned in the static
    // initializer as sub-directories of WsjBase / WsjConstBase / WsjFlatBase.
    public static String WsjBase;
    public static String WsjConstBase;
    public static String WsjOriginal;
    public static String WsjOriginalReadable;
    public static String WsjOriginalReadableAuxify;
    public static String WsjOriginalCleaned;
    public static String WsjOriginalCleanedTop;
    public static String WsjOriginalCleanedSemTagsOff;
    public static String WsjOriginalCleanedTopSemTagsOff;
    public static String WsjOriginalCleanedTopSemTagsOffCharBased;
    public static String WsjOriginalCleanedSemTagsOffAuxify;
    public static String WsjOriginalCleanedCollins97;
    public static String WsjOriginalCleanedCollins99;
    public static String WsjOriginalCleanedMagerman;
    public static String WsjOriginalCleanedLeft;
    public static String WsjOriginalCleanedRight;
    public static String WsjOriginalCleanedRandom;
    public static String WsjOriginalCleanedYM;
    public static String WsjOriginalCollins97;
    public static String WsjOriginalCollins99;
    public static String WsjOriginalMagerman;
    public static String WsjOriginalNoTraces;
    public static String WsjOriginalNPBraketing;
    public static String WsjOriginalNPBraketingCleaned;
    public static String WsjOriginalNPBraketingCleanedFixCC;
    public static String WsjOriginalNPBraketingCleanedConll07;
    public static String Wsj10;
    public static String WsjFlatBase;
    public static String WsjFlatTraces;
    public static String WsjFlatNoTraces;
    public static String WsjFlatNoTracesCharniak;
    public static String WsjOriginalBikelPreprocessed;
    // Human-readable names of the head-annotation schemes, and the corpus directory used for
    // each one; both arrays are indexed by initialHeads.
    public static final String[] initialHeadLabels;
    public static String[] possibleInitialHeadPath;
    // Directory under which the cached (binary) training/test corpora are stored.
    public static String WsjBinary;
    // Base name (without extension) of the training section file.
    public static String WsjTrainingSecFile;
    // Labels used when pruning traces and when renaming basal NPs / subject-less sentences.
    public static String traceTag;
    public static String NPbasalTag;
    public static String SubjectLessTag;
    // Categories passed to removeTreesLongerThan(); presumably not counted towards sentence
    // length — TODO confirm against ConstCorpus.
    public static String[] nonCountCatInLength;
    // Labels pruned by finalCleaning() when Parameters.removeNonNecessaryLables is set.
    public static final String[] nonNecessaryLabels;
    // NOTE(review): the three tables below are not used in the visible part of this file;
    // their names suggest complement/adjunct detection rules — confirm against their users.
    public static final String[] semTagNonComplement;
    public static final String[] complementParentCat;
    public static final String[][] complementChildCat;
    // Sentence indexes removed from the training corpus when skip120TrainingSentences is set.
    public static final int[] collinsSkipArr;
    public static BigInteger multiplyFactor;

    // Static initializer: assigns the corpus directory layout (all paths are derived from the
    // hard-coded WsjBase root) and the constant label tables.
    static {
        WsjBase = "/scratch/fsangati/CORPUS/WSJ/";
        WsjConstBase = String.valueOf(WsjBase) + "CONSTITUENCY/";
        WsjOriginal = String.valueOf(WsjConstBase) + "ORIGINAL/";
        WsjOriginalReadable = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE/";
        WsjOriginalReadableAuxify = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_AUXIFY/";
        WsjOriginalCleaned = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED/";
        WsjOriginalCleanedTop = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_TOP/";
        WsjOriginalCleanedSemTagsOff = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_SEMTAGSOFF/";
        WsjOriginalCleanedTopSemTagsOff = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_TOP_SEMTAGSOFF/";
        WsjOriginalCleanedTopSemTagsOffCharBased = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_TOP_SEMTAGSOFF_CHARBASED/";
        WsjOriginalCleanedSemTagsOffAuxify = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_SEMTAGSOFF_AUXIFY/";
        WsjOriginalCleanedCollins97 = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_COLLINS_97/";
        WsjOriginalCleanedCollins99 = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_COLLINS_99/";
        WsjOriginalCleanedMagerman = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_MAGERMAN/";
        WsjOriginalCleanedLeft = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_LEFT/";
        WsjOriginalCleanedRight = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_RIGHT/";
        WsjOriginalCleanedRandom = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_RANDOM/";
        WsjOriginalCleanedYM = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_CLEANED_YM/";
        WsjOriginalCollins97 = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_COLLINS_97/";
        WsjOriginalCollins99 = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_COLLINS_99/";
        WsjOriginalMagerman = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_MAGERMAN/";
        WsjOriginalNoTraces = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_NOTRACES/";
        WsjOriginalNPBraketing = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_NP_BRACKETING/";
        WsjOriginalNPBraketingCleaned = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_NP_BRACKETING_CLEANED/";
        WsjOriginalNPBraketingCleanedFixCC = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_NP_BRACKETING_CLEANED_FIXCC/";
        WsjOriginalNPBraketingCleanedConll07 = String.valueOf(WsjConstBase) + "ORIGINAL_READABLE_NP_BRACKETING_CLEANED_CONLL07/";
        Wsj10 = String.valueOf(WsjConstBase) + "WSJ10/";
        WsjFlatBase = String.valueOf(WsjBase) + "FLAT/";
        WsjFlatTraces = String.valueOf(WsjFlatBase) + "FLAT_TRACES/";
        WsjFlatNoTraces = String.valueOf(WsjFlatBase) + "FLAT_NOTRACES/";
        WsjFlatNoTracesCharniak = String.valueOf(WsjFlatBase) + "FLAT_NOTRACES_Charniak/";
        WsjOriginalBikelPreprocessed = String.valueOf(WsjConstBase) + "BikelPreProcessed/";
        // The next two arrays are parallel: entry i of possibleInitialHeadPath is the corpus
        // directory for the head-annotation scheme named by entry i of initialHeadLabels.
        initialHeadLabels = new String[]{"No Heads", "Magerman", "Collins 97", "Collins 99", "YM", "FirstLeft", "FirstRight", "Random"};
        possibleInitialHeadPath = new String[]{WsjOriginalCleaned, WsjOriginalCleanedMagerman, WsjOriginalCleanedCollins97, WsjOriginalCleanedCollins99, WsjOriginalCleanedYM, WsjOriginalCleanedLeft, WsjOriginalCleanedRight, WsjOriginalCleanedRandom};
        WsjBinary = String.valueOf(WsjConstBase) + "BINARY/";
        WsjTrainingSecFile = "wsj-02-21";
        traceTag = "-NONE-";
        NPbasalTag = "NPB";
        SubjectLessTag = "SG";
        nonCountCatInLength = new String[]{"#", "$", "''", ",", "-LCB-", "-LRB-", "-NONE-", "-RCB-", "-RRB-", ".", ":", "``"};
        nonNecessaryLabels = new String[]{".", "''", "``"};
        semTagNonComplement = new String[]{"ADV", "VOC", "BNF", "DIR", "EXT", "LOC", "MNR", "TMP", "CLR", "PRP"};
        // NOTE(review): complementChildCat has one row per entry of complementParentCat;
        // presumably row i lists the child categories allowed under parent i — confirm.
        complementParentCat = new String[]{"S", "VP", "SBAR"};
        complementChildCat = new String[][]{{"NP", "SBAR", "S"}, {"NP", "SBAR", "S", "VP"}, {"S"}};
        // Presumably 1-based sentence indexes: retriveTrainingAndTestCorpus() passes this array
        // through Utility.removeOneToIntArray before removing the sentences — TODO confirm.
        collinsSkipArr = new int[]{167, 557, 581, 687, 698, 863, 914, 1358, 1406, 1869, 1873, 1884, 1887, 2617, 2700, 2957, 3241, 3939, 3946, 3959, 4613, 4645, 4669, 5021, 5312, 5401, 6151, 6161, 6165, 6173, 6340, 6342, 6347, 6432, 6704, 6850, 7162, 7381, 7778, 7941, 8053, 8076, 8229, 10110, 10525, 10676, 11361, 11593, 11716, 11727, 11737, 12286, 12871, 12902, 13182, 13409, 13426, 13868, 13909, 13918, 14252, 14255, 16488, 16489, 16822, 17112, 17566, 17644, 18414, 19663, 20105, 20213, 20308, 20653, 22565, 23053, 23226, 23483, 24856, 24928, 24930, 25179, 25193, 25200, 26821, 26967, 27051, 27862, 28081, 28680, 28827, 29254, 29261, 29348, 30110, 30142, 31287, 31739, 31940, 32001, 32010, 32015, 32378, 34173, 34544, 34545, 34573, 35105, 35247, 35390, 35865, 35868, 36281, 37653, 38403, 38545, 39182, 39197, 39538, 39695};
        multiplyFactor = BigInteger.valueOf(10L);
    }

    /**
     * Runs {@link #makeReadableFile(File, File)} on every regular file found directly inside
     * {@code startDir}, writing each result to a file of the same name inside
     * {@code outputDir}. Files are processed in sorted order; sub-directories are skipped.
     *
     * @param startDir  directory containing the input files
     * @param outputDir directory receiving the converted files (created if missing)
     * @throws IllegalArgumentException if the content of {@code startDir} cannot be listed
     */
    public static void makeReadable(File startDir, File outputDir) {
        outputDir.mkdirs();
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null (instead of throwing) when the path is not a directory
            // or cannot be read; fail with a message that names the offending path.
            throw new IllegalArgumentException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            Wsj.makeReadableFile(inputFile, new File(outputDir, inputFile.getName()));
        }
    }

    /**
     * Applies {@code ConstCorpus.addTop(File, File)} to every regular file found directly
     * inside {@code startDir}, writing each result to a file of the same name inside
     * {@code outputDir}. Files are processed in sorted order; sub-directories are skipped.
     *
     * @param startDir  directory containing the input files
     * @param outputDir directory receiving the processed files (created if missing)
     * @throws IllegalArgumentException if the content of {@code startDir} cannot be listed
     */
    public static void addTop(File startDir, File outputDir) {
        outputDir.mkdirs();
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            ConstCorpus.addTop(inputFile, new File(outputDir, inputFile.getName()));
        }
    }

    /**
     * Rewrites a bracketed treebank file so that every tree occupies exactly one line.
     * Non-empty input lines are accumulated until the running total returned by
     * {@code Utility.countParenthesis} reaches zero (presumably meaning that the brackets are
     * balanced — TODO confirm); the accumulated text is then whitespace-normalised, passed
     * through {@code ConstCorpus.adjustParenthesisation} and written out.
     *
     * @param inputFile file to read
     * @param newFile   file to write; one tree per line
     */
    public static void makeReadableFile(File inputFile, File newFile) {
        Scanner reader = FileUtil.getScanner(inputFile);
        try {
            PrintWriter writer = FileUtil.getPrintWriter(newFile);
            try {
                int parenthesisBalance = 0;
                StringBuilder sentence = new StringBuilder();
                while (reader.hasNextLine()) {
                    String line = reader.nextLine();
                    if (line.length() == 0) {
                        continue;
                    }
                    sentence.append(line);
                    parenthesisBalance += Utility.countParenthesis(line);
                    if (parenthesisBalance != 0) {
                        // The current tree continues on the next line.
                        continue;
                    }
                    String oneLine = sentence.toString().trim();
                    oneLine = oneLine.replaceAll("\n", "");
                    oneLine = oneLine.replaceAll("\\s+", " ");
                    writer.println(ConstCorpus.adjustParenthesisation(oneLine));
                    sentence.setLength(0);
                }
            } finally {
                // Flush and release the output even if a line could not be processed.
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Reads a bracketed treebank file and returns its trees as one-line strings.
     * Lines containing neither '(' nor ')' are ignored. The remaining lines are accumulated
     * until the running total returned by {@code Utility.countParenthesis} reaches zero
     * (presumably meaning that the brackets are balanced — TODO confirm); the accumulated
     * text is then stripped of backslashes, whitespace-normalised and passed through
     * {@code ConstCorpus.adjustParenthesisation}.
     *
     * @param inputFile file to read
     * @return the trees in file order, one string per tree
     */
    public static ArrayList<String> makeReadableFileToArray(File inputFile) {
        ArrayList<String> result = new ArrayList<String>();
        Scanner reader = FileUtil.getScanner(inputFile);
        try {
            int parenthesisBalance = 0;
            StringBuilder sentence = new StringBuilder();
            while (reader.hasNextLine()) {
                String line = reader.nextLine();
                // An empty line contains no parenthesis either, so one test covers both cases.
                if (line.indexOf('(') == -1 && line.indexOf(')') == -1) {
                    continue;
                }
                sentence.append(line);
                parenthesisBalance += Utility.countParenthesis(line);
                if (parenthesisBalance != 0) {
                    // The current tree continues on the next line.
                    continue;
                }
                String oneLine = sentence.toString().trim();
                oneLine = oneLine.replaceAll("\\\\", "");
                oneLine = oneLine.replaceAll("\n", "");
                oneLine = oneLine.replaceAll("\\s+", " ");
                result.add(ConstCorpus.adjustParenthesisation(oneLine));
                sentence.setLength(0);
            }
        } finally {
            // Release the file handle even if a line could not be processed.
            reader.close();
        }
        return result;
    }

    /**
     * Copies every regular file of {@code WsjOriginalReadable} into
     * {@code WsjOriginalNoTraces}, pruning the sub-trees labelled {@code traceTag} from each
     * tree and calling {@code removeNumberInLabels()} on it. Input files are expected to hold
     * one tree per line; empty lines are skipped.
     *
     * @throws IllegalStateException if the content of the input directory cannot be listed
     */
    private static void makeOriginalNoTraces() {
        File startDir = new File(WsjOriginalReadable);
        File outputDir = new File(WsjOriginalNoTraces);
        outputDir.mkdirs();
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalStateException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            File newFile = new File(outputDir, inputFile.getName());
            Scanner reader = FileUtil.getScanner(inputFile);
            try {
                PrintWriter writer = FileUtil.getPrintWriter(newFile);
                try {
                    while (reader.hasNextLine()) {
                        String line = reader.nextLine();
                        if (line.length() == 0) {
                            continue;
                        }
                        TSNode treeLine = new TSNode(line);
                        treeLine.pruneSubTrees(traceTag);
                        treeLine.removeNumberInLabels();
                        writer.println(treeLine);
                    }
                } finally {
                    // Close both streams even when a malformed tree aborts the conversion.
                    writer.close();
                }
            } finally {
                reader.close();
            }
        }
    }

    /**
     * Applies the configured normalisation steps, in a fixed order, to every tree of
     * {@code corpus}. Each step is switched on by a flag of {@code Parameters} or of this
     * class; the NPB and SG transformations are applied to training corpora only.
     *
     * @param corpus   corpus whose trees are modified in place
     * @param lexFreq  lexicon frequency table handed to {@code updateUnknown}
     * @param training whether {@code corpus} is the training corpus
     * @param traces   whether traces are kept; when false they are pruned and
     *                 {@code removeNumberInLabels()} is called
     */
    private static void finalCleaning(ConstCorpus corpus, Hashtable<String, Integer> lexFreq, boolean training, boolean traces) {
        for (TSNode tree : corpus.treeBank) {
            if (Parameters.removeNonNecessaryLables) {
                tree.pruneSubTrees(nonNecessaryLabels);
            }
            if (!traces) {
                tree.pruneSubTrees(traceTag);
                tree.removeNumberInLabels();
            }
            if (Parameters.removePunctStartEnd) {
                tree.prunePunctuationBeginning();
                tree.prunePunctuationEnd();
            }
            if (Parameters.raisePunctuation) {
                tree.raisePunctuation();
            }
            if (!Parameters.semanticTags) {
                tree.removeSemanticTags();
            }
            if (training) {
                // These two label transformations are never applied to test trees.
                if (transformNPbasal) {
                    tree.transformNodebasal("NP", "NPB");
                }
                if (transformSG) {
                    tree.transformSubjectlessSentences(SubjectLessTag);
                }
            }
            if (Parameters.removeRedundantRules) {
                tree.removeRedundantRules();
            }
            if (Parameters.replaceNumbers) {
                tree.replaceNumbers(numberTag);
            }
            if (Parameters.ukLimit > 0) {
                tree.updateUnknown(Parameters.ukLimit, lexFreq, unknownTag, numberTag);
            }
        }
    }

    /**
     * For every regular file of {@code WsjOriginalReadable} writes two files of the same
     * name: one in {@code WsjFlatTraces} holding {@code toFlat()} of each tree as read, and
     * one in {@code WsjFlatNoTraces} holding {@code toFlat()} of the same tree after the
     * sub-trees labelled {@code traceTag} have been pruned. Empty input lines are skipped.
     *
     * @throws IllegalStateException if the content of the input directory cannot be listed
     */
    private static void makeFlats() {
        File startDir = new File(WsjOriginalReadable);
        File outputDirTraces = new File(WsjFlatTraces);
        File outputDirNoTraces = new File(WsjFlatNoTraces);
        outputDirTraces.mkdirs();
        outputDirNoTraces.mkdirs();
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalStateException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            File newFileTraces = new File(outputDirTraces, inputFile.getName());
            File newFileNoTraces = new File(outputDirNoTraces, inputFile.getName());
            Scanner reader = FileUtil.getScanner(inputFile);
            PrintWriter writerTraces = null;
            PrintWriter writerNoTraces = null;
            try {
                writerTraces = FileUtil.getPrintWriter(newFileTraces);
                writerNoTraces = FileUtil.getPrintWriter(newFileNoTraces);
                while (reader.hasNextLine()) {
                    String line = reader.nextLine();
                    if (line.length() == 0) {
                        continue;
                    }
                    TSNode treeLine = new TSNode(line);
                    // The flat form with traces must be written before the tree is pruned.
                    writerTraces.println(treeLine.toFlat());
                    treeLine.pruneSubTrees(traceTag);
                    writerNoTraces.println(treeLine.toFlat());
                }
            } finally {
                // Release all three streams even when a malformed tree aborts the loop.
                reader.close();
                if (writerTraces != null) {
                    writerTraces.close();
                }
                if (writerNoTraces != null) {
                    writerNoTraces.close();
                }
            }
        }
    }

    /**
     * Writes {@code toFlatSentence()} of every tree of {@code inputFile} to
     * {@code outputFile}, one per line.
     *
     * @param inputFile  treebank file read with {@code TSNodeLabel.getTreebank}
     * @param outputFile file receiving one flat sentence per line
     * @throws Exception if the treebank cannot be read
     */
    private static void makeFlatFile(File inputFile, File outputFile) throws Exception {
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
        PrintWriter pw = FileUtil.getPrintWriter(outputFile);
        try {
            for (TSNodeLabel t : treebank) {
                pw.println(t.toFlatSentence());
            }
        } finally {
            // Flush and release the output even if a tree cannot be flattened.
            pw.close();
        }
    }

    /**
     * Calls {@code renameAllConstituentLabels} with the label "X" on every tree of
     * {@code inputFile} and writes the resulting trees to {@code outputFile}, one per line.
     *
     * @param inputFile  treebank file read with {@code TSNodeLabel.getTreebank}
     * @param outputFile file receiving the relabelled trees
     * @throws Exception if the treebank cannot be read
     */
    private static void makeSingleLabelX(File inputFile, File outputFile) throws Exception {
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
        PrintWriter pw = FileUtil.getPrintWriter(outputFile);
        try {
            Label labelX = Label.getLabel("X");
            for (TSNodeLabel t : treebank) {
                t.renameAllConstituentLabels(labelX);
                pw.println(t.toString());
            }
        } finally {
            // Flush and release the output even if relabelling fails part-way.
            pw.close();
        }
    }

    /**
     * Prints coordination statistics for the files of {@code WsjOriginalNPBraketingCleaned}
     * whose name starts with "wsj": the number of sentences, the number of sentences for
     * which {@code countDaughtersWithLabel(Word.conjunctionsSorted)} is positive, and the sum
     * of those counts. Empty lines are skipped; every other line is parsed as one tree.
     *
     * @throws Exception             if a line cannot be parsed as a tree
     * @throws IllegalStateException if the content of the input directory cannot be listed
     */
    private static void countCC() throws Exception {
        File startDir = new File(WsjOriginalNPBraketingCleaned);
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalStateException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        int sentenceCounter = 0;
        int coordSentenceCounter = 0;
        int coordCounter = 0;
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory() || !inputFile.getName().startsWith("wsj")) {
                continue;
            }
            Scanner reader = FileUtil.getScanner(inputFile);
            try {
                while (reader.hasNextLine()) {
                    String line = reader.nextLine();
                    if (line.length() == 0) {
                        continue;
                    }
                    TSNodeLabel treeLine = new TSNodeLabel(line);
                    int ccCount = treeLine.countDaughtersWithLabel(Word.conjunctionsSorted);
                    ++sentenceCounter;
                    if (ccCount > 0) {
                        coordCounter += ccCount;
                        ++coordSentenceCounter;
                    }
                }
            } finally {
                // Release the file handle even when a line fails to parse.
                reader.close();
            }
        }
        System.out.println("Total sentences: " + sentenceCounter);
        System.out.println("Total sentences with coordinations: " + coordSentenceCounter);
        System.out.println("Total coordinations: " + coordCounter);
    }

    /**
     * Populates {@code Parameters.trainingCorpus}, {@code Parameters.testCorpus},
     * {@code Parameters.lexiconTable} and the label tables for the current configuration.
     * <p>
     * All artefacts live in a sub-directory of {@code WsjBinary} whose name encodes the
     * configuration flags. If both binary corpora already exist there they are loaded
     * together with the cached label and lexicon tables; otherwise the corpora are built
     * from the ".mrg" files of the directory selected by {@code initialHeads}, filtered,
     * cleaned with {@code finalCleaning}, and written to that sub-directory (binary caches,
     * statistics reports and ".mrg" dumps).
     * <p>
     * Side effects: sorts {@code nonNecessaryLabels} in place and appends progress messages
     * to {@code Parameters.logFile}.
     */
    public static void retriveTrainingAndTestCorpus() {
        Arrays.sort(nonNecessaryLabels);
        String dirParam = WsjBinary
            + "LLtr" + Parameters.lengthLimitTraining
            + "_LLts" + Parameters.lengthLimitTest
            + "_TS" + testSet
            + "_UK" + Parameters.ukLimit
            + "_H" + initialHeads
            + "_TR" + Utility.booleanToOnOff(Parameters.traces)
            + "_ST" + Utility.booleanToOnOff(Parameters.semanticTags)
            + "_RN" + Utility.booleanToOnOff(Parameters.replaceNumbers)
            + "_SK" + Utility.booleanToOnOff(skip120TrainingSentences)
            + "_RR" + Utility.booleanToOnOff(Parameters.removeRedundantRules)
            + "_NPB" + Utility.booleanToOnOff(transformNPbasal)
            + "_SG" + Utility.booleanToOnOff(transformSG)
            + "/";
        String WsjTestSecFile = "wsj-" + testSet;
        File trainingBinaryCorpus = new File(dirParam + WsjTrainingSecFile + ".binary");
        File testBinaryCorpus = new File(dirParam + WsjTestSecFile + ".binary");
        String trainingStatPath = dirParam + "TrainingStat/";
        File trainingStatDir = new File(trainingStatPath);
        File internalLabels = new File(trainingStatPath + "internalLabels");
        File posTagLabelsPath = new File(trainingStatPath + "posTagLabels");
        File lexiconTablePath = new File(trainingStatPath + "lexiconTable");
        // The lexicon dump used to share the path "labelReports.txt" with the label report
        // written right after it; it now gets a file of its own.
        File lexiconTable = new File(trainingStatPath + "lexiconTable.txt");
        File labelReport = new File(trainingStatPath + "labelReports.txt");
        File internalLablesTable = new File(trainingStatPath + "internalLabelsTable.txt");
        File posTagTable = new File(trainingStatPath + "posTagTable.txt");
        if (trainingBinaryCorpus.exists() && testBinaryCorpus.exists()) {
            FileUtil.appendReturn("Trainig and test corpora read from binary", Parameters.logFile);
            Parameters.trainingCorpus = ConstCorpus.fromBinaryFile(trainingBinaryCorpus);
            Parameters.testCorpus = ConstCorpus.fromBinaryFile(testBinaryCorpus);
            Parameters.internalLabels = (String[])FileUtil.fromBinaryFile(internalLabels);
            Parameters.posTagLabels = (String[])FileUtil.fromBinaryFile(posTagLabelsPath);
            Parameters.lexiconTable = (Hashtable)FileUtil.fromBinaryFile(lexiconTablePath);
        } else {
            String initialDir = possibleInitialHeadPath[initialHeads];
            File trainingFile = new File(initialDir + WsjTrainingSecFile + ".mrg");
            File testFile = new File(initialDir + WsjTestSecFile + ".mrg");
            File trainingCompleteCorpus = new File(dirParam + WsjTrainingSecFile + ".mrg");
            File testCompleteCorpus = new File(dirParam + WsjTestSecFile + ".mrg");
            Parameters.trainingCorpus = new ConstCorpus(trainingFile, WsjTrainingSecFile);
            Parameters.testCorpus = new ConstCorpus(testFile, WsjTestSecFile);
            if (skip120TrainingSentences) {
                // Utility.removeOneToIntArray appears to adjust the array in place (its result
                // is not used), so work on a copy: otherwise every call of this method would
                // shift the shared collinsSkipArr once more.
                int[] skipIndexes = collinsSkipArr.clone();
                Utility.removeOneToIntArray(skipIndexes);
                Parameters.trainingCorpus.removeIndexes(skipIndexes);
            }
            Parameters.trainingCorpus.removeTreesLongerThan(Parameters.lengthLimitTraining, nonCountCatInLength);
            // The test corpus is bounded by its own limit, the one encoded as "_LLts" in the
            // cache directory name (the training limit was used here before).
            Parameters.testCorpus.removeTreesLongerThan(Parameters.lengthLimitTest, nonCountCatInLength);
            // The lexicon is built from the training corpus only and reused for the test corpus.
            Parameters.lexiconTable = Parameters.trainingCorpus.buildLexFreq();
            Wsj.finalCleaning(Parameters.trainingCorpus, Parameters.lexiconTable, true, Parameters.traces);
            Wsj.finalCleaning(Parameters.testCorpus, Parameters.lexiconTable, false, Parameters.traces);
            trainingStatDir.mkdirs();
            Utility.hashtableOrderedToFile(Parameters.lexiconTable, lexiconTable);
            Parameters.trainingCorpus.buildLabelsStatistics(labelReport, internalLablesTable, posTagTable);
            FileUtil.toBinaryFile(internalLabels, Parameters.internalLabels);
            FileUtil.toBinaryFile(posTagLabelsPath, Parameters.posTagLabels);
            FileUtil.toBinaryFile(lexiconTablePath, Parameters.lexiconTable);
            Parameters.trainingCorpus.toBinaryFile(trainingBinaryCorpus);
            Parameters.testCorpus.toBinaryFile(testBinaryCorpus);
            Parameters.trainingCorpus.toFile_Complete(trainingCompleteCorpus, true);
            Parameters.testCorpus.toFile_Complete(testCompleteCorpus, true);
            FileUtil.appendReturn("Built binary training and test corpus", Parameters.logFile);
        }
        FileUtil.appendReturn("The training corpus has # sentences: " + Parameters.trainingCorpus.size(), Parameters.logFile);
        FileUtil.appendReturn("The test corpus has # sentences: " + Parameters.testCorpus.size(), Parameters.logFile);
    }

    /**
     * Prunes the sub-trees labelled with the closing ({@code ''}) and opening ({@code ``})
     * quotation tags from every tree of {@code corpus}.
     *
     * @param corpus corpus whose trees are modified in place
     */
    public static void removeQuotations(ConstCorpus corpus) {
        final String closingQuoteTag = "''";
        final String openingQuoteTag = "``";
        for (TSNode sentenceTree : corpus.treeBank) {
            sentenceTree.pruneSubTrees(closingQuoteTag);
            sentenceTree.pruneSubTrees(openingQuoteTag);
        }
    }

    /**
     * Post-processing hook for a test line. This implementation applies no
     * transformation.
     *
     * @param line the line to post-process
     * @return {@code line} itself, unchanged
     */
    public static String posProcessTestLine(String line) {
        return line;
    }

    /**
     * Loads one fixed binary corpus and writes its head-annotation statistics next to it.
     * Both paths are hard-coded.
     */
    public static void reportCollinsStatistics() {
        File statisticsTarget = new File("/home/fsangati/CORPUS/WSJ/BINARY/22_LLtr40_LLts40_UK0_STon_RNoff_SKoff_KTFoff.Statistic");
        ConstCorpus loadedCorpus = ConstCorpus.fromBinaryFile(
            new File("/home/fsangati/CORPUS/WSJ/BINARY/22_LLtr40_LLts40_UK0_STon_RNoff_SKoff_KTFoff.binary"));
        loadedCorpus.checkHeadAnnotationStatistics(statisticsTarget);
    }

    /**
     * Builds a multi-line, human-readable summary of the WSJ-specific settings
     * (test section, head-annotation scheme, sentence skipping and label transformations).
     *
     * @return the summary; it starts with two newlines and has no trailing newline
     */
    public static String writeParam() {
        StringBuilder summary = new StringBuilder("\n\nCorpus: Wsj");
        summary.append("\nTest Corpus Section: ").append(testSet);
        summary.append("\nInitial Heads: ").append(initialHeadLabels[initialHeads]);
        summary.append("\nSkip 120 Collins sentences: ").append(skip120TrainingSentences);
        summary.append("\nTransform NP basal (NP --> NPB): ").append(transformNPbasal);
        summary.append("\nTransform subjectless sentence (S --> SG): ").append(transformSG);
        return summary.toString();
    }

    /**
     * Runs {@link #makeCleanFile(File, File)} on every regular file found directly inside
     * {@code startDir}, writing each result to a file of the same name inside
     * {@code outputDirClean}. Files are processed in sorted order; sub-directories are
     * skipped.
     *
     * @param startDir       directory containing the input files
     * @param outputDirClean directory receiving the cleaned files (created if missing)
     * @throws IllegalArgumentException if the content of {@code startDir} cannot be listed
     */
    public static void makeClean(File startDir, File outputDirClean) {
        outputDirClean.mkdirs();
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            Wsj.makeCleanFile(inputFile, new File(outputDirClean, inputFile.getName()));
        }
    }

    /**
     * Loads {@code startFile} as a corpus, removes the traces labelled {@code traceTag},
     * the numbers in labels and the redundant rules (in that order), and writes the result
     * to {@code outputFile}.
     *
     * @param startFile  corpus file to clean
     * @param outputFile destination of the cleaned corpus
     */
    public static void makeCleanFile(File startFile, File outputFile) {
        final ConstCorpus cleaned = new ConstCorpus(startFile, FileUtil.defaultEncoding);
        // The three clean-up passes run in this fixed order.
        cleaned.removeTraces(traceTag);
        cleaned.removeNumbersInLables();
        cleaned.removeRedundantRules();
        cleaned.toFile_Complete(outputFile, false, false);
    }

    /**
     * Runs {@link #removeSemTagFile(File, File)} on every regular, non-hidden file (name
     * not starting with ".") found directly inside {@code startDir}, writing each result to
     * a file of the same name inside {@code outputDirClean}. Files are processed in sorted
     * order.
     *
     * @param startDir       directory containing the input files
     * @param outputDirClean directory receiving the processed files (created if missing)
     * @throws Exception                if a file cannot be processed
     * @throws IllegalArgumentException if the content of {@code startDir} cannot be listed
     */
    public static void removeSemTagDir(File startDir, File outputDirClean) throws Exception {
        outputDirClean.mkdirs();
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.getName().startsWith(".") || inputFile.isDirectory()) {
                continue;
            }
            Wsj.removeSemTagFile(inputFile, new File(outputDirClean, inputFile.getName()));
        }
    }

    /**
     * Pairs every regular, non-hidden file (name not starting with ".") found directly
     * inside {@code startDir} with a file of the same name inside {@code outputDir}.
     *
     * @param startDir  directory containing the input files
     * @param outputDir directory of the output files (created if missing; the output files
     *                  themselves are not created)
     * @return a two-row array: row 0 holds the input files in sorted order, row 1 holds the
     *         corresponding output files at the same indexes
     * @throws IllegalArgumentException if the content of {@code startDir} cannot be listed
     */
    public static File[][] collectFiles(File startDir, File outputDir) {
        outputDir.mkdirs();
        File[] candidates = startDir.listFiles();
        if (candidates == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(candidates);
        ArrayList<File> inputFiles = new ArrayList<File>();
        ArrayList<File> outputFiles = new ArrayList<File>();
        for (File candidate : candidates) {
            if (candidate.getName().startsWith(".") || candidate.isDirectory()) {
                continue;
            }
            inputFiles.add(candidate);
            outputFiles.add(new File(outputDir, candidate.getName()));
        }
        File[][] result = new File[2][];
        result[0] = inputFiles.toArray(new File[inputFiles.size()]);
        // Row 1 must come from the output list; it used to be a second copy of the inputs.
        result[1] = outputFiles.toArray(new File[outputFiles.size()]);
        return result;
    }

    /**
     * Reads the treebank in {@code inputFile}, calls {@code removeSemanticTags()} and
     * {@code removeRedundantRules()} on every tree, and writes the trees to
     * {@code ouputFile}, one per line.
     * <p>
     * The input is read completely before the output is opened, so a failure while reading
     * leaves no truncated output file behind.
     *
     * @param inputFile treebank file read with {@code TSNodeLabel.getTreebank}
     * @param ouputFile file receiving the processed trees
     * @throws Exception if the treebank cannot be read
     */
    public static void removeSemTagFile(File inputFile, File ouputFile) throws Exception {
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
        PrintWriter pw = FileUtil.getPrintWriter(ouputFile);
        try {
            for (TSNodeLabel t : treebank) {
                t.removeSemanticTags();
                t.removeRedundantRules();
                pw.println(t.toString());
            }
        } finally {
            // Flush and release the output even if a tree cannot be processed.
            pw.close();
        }
    }

    /**
     * For every regular file of {@code WsjOriginalCleaned} loads the corpus, calls
     * {@code assignRandomHeads()} on it, and writes it under the same name into
     * {@code WsjOriginalCleanedRandom}. Files are processed in sorted order.
     *
     * @throws IllegalStateException if the content of the input directory cannot be listed
     */
    public static void makeNaive() {
        File startDir = new File(WsjOriginalCleaned);
        File outputDirNaive = new File(WsjOriginalCleanedRandom);
        outputDirNaive.mkdirs();
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalStateException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            ConstCorpus corpus = new ConstCorpus(inputFile, FileUtil.defaultEncoding, false);
            File outputCleanedFile = new File(outputDirNaive, inputFile.getName());
            corpus.assignRandomHeads();
            corpus.toFile_Complete(outputCleanedFile, true, false);
        }
    }

    /**
     * For every regular file found directly inside {@code startDir} loads the corpus,
     * calls {@code removeTop()} on it, and writes its head-annotation statistics to a
     * sibling file named {@code <input name>.headCorrection} in the same directory.
     * <p>
     * NOTE(review): the reports are written into {@code startDir} itself, so a second run
     * over the same directory will also pick up the previously generated reports.
     *
     * @param startDir directory containing the corpus files
     * @throws IllegalArgumentException if the content of {@code startDir} cannot be listed
     */
    public static void checkHeads(File startDir) {
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            ConstCorpus corpus = new ConstCorpus(inputFile, "noname");
            corpus.removeTop();
            corpus.checkHeadAnnotationStatistics(new File(startDir, inputFile.getName() + ".headCorrection"));
        }
    }

    /**
     * For every regular file of {@code WsjOriginalCleanedCollins99} loads the corpus,
     * calls {@code removeTop()} and {@code removeArgumentInHeads()} on it, and writes the
     * result next to the input as {@code <input path>_fixed}.
     * <p>
     * NOTE(review): the "_fixed" files land in the input directory, so a second run will
     * process them as inputs too.
     *
     * @throws IllegalStateException if the content of the input directory cannot be listed
     */
    public static void removeArgumentInHeadsInCollins99() {
        File startDir = new File(WsjOriginalCleanedCollins99);
        File[] inputFiles = startDir.listFiles();
        if (inputFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IllegalStateException("Cannot list files in directory: " + startDir);
        }
        Arrays.sort(inputFiles);
        for (File inputFile : inputFiles) {
            if (inputFile.isDirectory()) {
                continue;
            }
            File outputFile = new File(inputFile.getPath() + "_fixed");
            ConstCorpus corpus = new ConstCorpus(inputFile, "noname");
            corpus.removeTop();
            corpus.removeArgumentInHeads();
            corpus.toFile_Complete(outputFile, true);
        }
    }

    /**
     * Loads one fixed, hard-coded corpus file and writes it back to the same path with
     * {@code toFile_Complete(file, false, false)}.
     * NOTE(review): presumably the first {@code false} disables head marking, which would
     * explain the method name — confirm against {@code ConstCorpus.toFile_Complete}.
     */
    public static void removeHeads() {
        final File processedCorpus = new File("/scratch/fsangati/RESULTS/TSG/LTSG/Collins/NoHeads/TrainingCorpus.processed");
        // The file is fully loaded before being overwritten in place.
        ConstCorpus loaded = new ConstCorpus(processedCorpus, "");
        loaded.toFile_Complete(processedCorpus, false, false);
    }

    /**
     * Prints the sum, over all trees of the cleaned training file "wsj-02-21.mrg", of
     * element 1 of the array returned by {@code countTotalFragments()}.
     *
     * @throws Exception if the treebank cannot be read
     */
    public static void countAllFragments() throws Exception {
        ArrayList<TSNodeLabel> trees = TSNodeLabel.getTreebank(new File(String.valueOf(WsjOriginalCleaned) + "wsj-02-21.mrg"));
        BigInteger fragmentTotal = BigInteger.ZERO;
        for (TSNodeLabel tree : trees) {
            BigInteger fragmentsOfTree = tree.countTotalFragments()[1];
            fragmentTotal = fragmentTotal.add(fragmentsOfTree);
        }
        System.out.println("Total fragments: " + fragmentTotal);
    }

    /**
     * Prints, for the cleaned training file "wsj-02-21.mrg", the per-depth totals obtained
     * by summing row 0 of {@code countTotalFragmentsDepth()} over all trees, followed by the
     * grand total. Depths are printed 1-based.
     *
     * @throws Exception if the treebank cannot be read
     */
    public static void countAllFragmentsDepths() throws Exception {
        File trainingCorpus = new File(String.valueOf(WsjOriginalCleaned) + "wsj-02-21.mrg");
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(trainingCorpus);
        int maxDepthTreebank = TSNodeLabel.maxDepthTreebank(treebank);
        BigInteger[] fragmentsPerDepth = new BigInteger[maxDepthTreebank];
        Arrays.fill(fragmentsPerDepth, BigInteger.ZERO);
        for (TSNodeLabel tree : treebank) {
            BigInteger[] treeFragmentsPerDepth = tree.countTotalFragmentsDepth()[0];
            for (int depth = 0; depth < treeFragmentsPerDepth.length; depth++) {
                fragmentsPerDepth[depth] = fragmentsPerDepth[depth].add(treeFragmentsPerDepth[depth]);
            }
        }
        BigInteger totalSum = BigInteger.ZERO;
        for (int depth = 0; depth < maxDepthTreebank; depth++) {
            BigInteger depthTotal = fragmentsPerDepth[depth];
            System.out.println("Depth " + (depth + 1) + ": " + depthTotal);
            totalSum = totalSum.add(depthTotal);
        }
        System.out.println("Total fragments: " + totalSum);
    }

    /**
     * Prints the maximum depth and maximum branching of the cleaned training file
     * "wsj-02-21.mrg", then a tab-separated table whose cell (row d, column b) is the
     * number of trees with maximum depth d+1 and maximum branching b+1. Trees whose maximum
     * branching is exactly 51 are also printed while counting.
     *
     * @throws Exception if the treebank cannot be read
     */
    public static void maxDepthMaxBranchingStatistics() throws Exception {
        File trainingCorpus = new File(String.valueOf(WsjOriginalCleaned) + "wsj-02-21.mrg");
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(trainingCorpus);
        int maxDepthTreebank = TSNodeLabel.maxDepthTreebank(treebank);
        int maxBranchingTreebank = TSNodeLabel.maxBranchingTreebank(treebank);
        System.out.println("Max depth: " + maxDepthTreebank);
        System.out.println("Max branching: " + maxBranchingTreebank);
        int[][] treesByDepthAndBranching = new int[maxDepthTreebank][maxBranchingTreebank];
        for (TSNodeLabel tree : treebank) {
            int depth = tree.maxDepth();
            int branching = tree.maxBranching();
            if (branching == 51) {
                System.out.println(tree);
            }
            // Depth and branching are shifted by one to index the table.
            treesByDepthAndBranching[depth - 1][branching - 1]++;
        }
        for (int row = 0; row < maxDepthTreebank; row++) {
            for (int column = 0; column < maxBranchingTreebank; column++) {
                System.out.print(treesByDepthAndBranching[row][column] + "\t");
            }
            System.out.println();
        }
    }

    /**
     * Prints a table of fragment totals per depth and branching value for the training corpus
     * {@code wsj-02-21.mrg} under {@code WsjOriginalCleaned}.
     *
     * <p>Trees whose {@code maxBranching()} exceeds 7 or whose {@code maxDepth()} exceeds 25 are
     * skipped. For each remaining tree and each branching value 0..7, element {@code k} of
     * {@code countTotalFragmentsDepthMaxBranching(branching)[0]} is added to row {@code k + 1}
     * of the totals. A branching column (1..7) is printed only when, in at least one row, its
     * total is greater than the previous column's total multiplied by {@code multiplyFactor}.
     *
     * @throws Exception propagated from loading the treebank or counting fragments
     */
    public static void countAllFragmentsDepthsMaxBranching() throws Exception {
        File corpusFile = new File(WsjOriginalCleaned + "wsj-02-21.mrg");
        ArrayList<TSNodeLabel> trees = TSNodeLabel.getTreebank(corpusFile);
        final int depthLimit = 25;
        final int branchingLimit = 7;
        System.out.println("Max depth: " + depthLimit);
        System.out.println("Max branching: " + branchingLimit);
        // The table is one larger than each limit; row 0 is never accumulated into because
        // per-tree index k lands in row k + 1.
        final int rows = depthLimit + 1;
        final int cols = branchingLimit + 1;
        BigInteger[][] totals = new BigInteger[rows][cols];
        for (BigInteger[] row : totals) {
            Arrays.fill(row, BigInteger.ZERO);
        }
        for (TSNodeLabel tree : trees) {
            if (tree.maxBranching() > branchingLimit || tree.maxDepth() > depthLimit) {
                continue;
            }
            for (int branching = 0; branching < cols; ++branching) {
                BigInteger[] treeCounts = tree.countTotalFragmentsDepthMaxBranching(branching)[0];
                for (int k = 0; k < treeCounts.length; ++k) {
                    totals[k + 1][branching] = totals[k + 1][branching].add(treeCounts[k]);
                }
            }
        }
        // A column stays suppressed until some row shows it exceeding the scaled previous column.
        boolean[] suppressed = new boolean[cols];
        Arrays.fill(suppressed, true);
        suppressed[0] = false;
        for (int depth = 0; depth < rows; ++depth) {
            for (int branching = 1; branching < cols; ++branching) {
                if (!suppressed[branching]) {
                    continue;
                }
                BigInteger scaledPrevious = totals[depth][branching - 1].multiply(multiplyFactor);
                if (totals[depth][branching].compareTo(scaledPrevious) > 0) {
                    suppressed[branching] = false;
                }
            }
        }
        // Header line: the branching values of the columns that will be shown.
        for (int branching = 1; branching < cols; ++branching) {
            if (!suppressed[branching]) {
                System.out.print("\t" + branching);
            }
        }
        System.out.println();
        // One line per depth, starting at 1.
        for (int depth = 1; depth < rows; ++depth) {
            System.out.print(depth);
            for (int branching = 1; branching < cols; ++branching) {
                if (!suppressed[branching]) {
                    System.out.print("\t" + totals[depth][branching]);
                }
            }
            System.out.println();
        }
    }

    /**
     * Collects the node labels and POS labels of every treebank file directly inside
     * {@code WsjOriginalCleanedSemTagsOff} (skipping dot-files and sub-directories) and writes
     * each sorted label set, one label per line, to a file under {@code WsjBase + "statistics/"}.
     *
     * <p>Each writer is closed in a {@code finally} block so the output is flushed and the file
     * handle released even if writing fails part-way.
     *
     * @throws Exception propagated from loading a treebank file
     */
    public static void corpusStatistics() throws Exception {
        TreeSet<String> nodeLabels = new TreeSet<String>();
        TreeSet<String> posLabels = new TreeSet<String>();
        File baseDir = new File(WsjOriginalCleanedSemTagsOff);
        File[] corpusFiles = baseDir.listFiles();
        Arrays.sort(corpusFiles);
        for (File corpusFile : corpusFiles) {
            if (corpusFile.getName().startsWith(".") || corpusFile.isDirectory()) {
                continue;
            }
            for (TSNodeLabel t : TSNodeLabel.getTreebank(corpusFile)) {
                t.collectNodesPos(nodeLabels, posLabels);
            }
        }
        File nodesOutputFile = new File(String.valueOf(WsjBase) + "statistics/wsj-00-24_originalReadableCleaned_noSemTags_nodesStat.txt");
        PrintWriter nodesWriter = FileUtil.getPrintWriter(nodesOutputFile);
        try {
            for (String s : nodeLabels) {
                nodesWriter.println(s);
            }
        } finally {
            nodesWriter.close();
        }
        File posOutputFile = new File(String.valueOf(WsjBase) + "statistics/wsj-00-24_originalReadableCleaned_noSemTags_posStat.txt");
        PrintWriter posWriter = FileUtil.getPrintWriter(posOutputFile);
        try {
            for (String s : posLabels) {
                posWriter.println(s);
            }
        } finally {
            posWriter.close();
        }
    }

    /**
     * Runs {@link #markCircumstantial(File, File)} on every regular, non-hidden file directly
     * inside {@code startDir}, in sorted order, writing each result to a file of the same name
     * inside {@code outputDir}.
     *
     * @param startDir  directory whose files are processed
     * @param outputDir directory receiving the output; created (with parents) if missing
     * @throws Exception propagated from processing a file
     */
    public static void markCircumstantialDir(File startDir, File outputDir) throws Exception {
        outputDir.mkdirs();
        File[] candidates = startDir.listFiles();
        Arrays.sort(candidates);
        for (File candidate : candidates) {
            String fileName = candidate.getName();
            // Skip dot-files and sub-directories.
            if (fileName.startsWith(".") || candidate.isDirectory()) {
                continue;
            }
            File target = new File(outputDir + "/" + fileName);
            Wsj.markCircumstantial(candidate, target);
        }
    }

    /**
     * Reads the treebank in {@code inputFile}, calls
     * {@code markCircumstantial(Box.advSemTagSorted, "_C")} on every tree and writes each tree's
     * {@code toStringExtraParenthesis()} form on its own line to {@code outputFile}.
     *
     * <p>The writer is closed in a {@code finally} block so that already-written trees are
     * flushed and the file handle is released even when a tree fails to process.
     *
     * @param inputFile  treebank file to read
     * @param outputFile file to write the marked trees to
     * @throws Exception propagated from loading or transforming the treebank
     */
    public static void markCircumstantial(File inputFile, File outputFile) throws Exception {
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
        PrintWriter pw = FileUtil.getPrintWriter(outputFile);
        try {
            for (TSNodeLabel t : treebank) {
                t.markCircumstantial(Box.advSemTagSorted, "_C");
                pw.println(t.toStringExtraParenthesis());
            }
        } finally {
            pw.close();
        }
    }

    /**
     * Converts {@code inputFile} to a list of tree strings via
     * {@code makeReadableFileToArray}, builds a {@code TSNodeLabel} from each, applies
     * {@code makeCleanWsj()} and writes the resulting tree, one per line, to {@code outputFile}.
     *
     * <p>The input is read before the output is opened, so a failure while reading does not
     * leave behind an opened (and possibly truncated) output file; the writer is closed in a
     * {@code finally} block so partial output is flushed if a tree fails.
     *
     * @param inputFile  source treebank file
     * @param outputFile destination file
     * @throws Exception propagated from reading or parsing the input
     */
    public static void makeCleanWSJ(File inputFile, File outputFile) throws Exception {
        ArrayList<String> treebankString = Wsj.makeReadableFileToArray(inputFile);
        PrintWriter pw = FileUtil.getPrintWriter(outputFile);
        try {
            for (String treeString : treebankString) {
                TSNodeLabel t = new TSNodeLabel(treeString);
                t.makeCleanWsj();
                pw.println(t.toString());
            }
        } finally {
            pw.close();
        }
    }

    /**
     * Applies {@code makeTreeCharBased()} to every tree of a treebank and writes the trees, one
     * per line, to the output.
     *
     * <p>When {@code inputFile} is a directory, the method pairs its files with files of the
     * same name under {@code outputFile} (via {@code getFilePairs}) and processes each pair.
     * Otherwise the single file is converted. The treebank is loaded before the output is
     * opened, and the writer is closed in a {@code finally} block so it is never leaked.
     *
     * @param inputFile  treebank file, or a directory of treebank files
     * @param outputFile output file, or output directory when {@code inputFile} is a directory
     * @throws Exception propagated from loading or transforming a treebank
     */
    public static void makeCharBased(File inputFile, File outputFile) throws Exception {
        if (inputFile.isDirectory()) {
            File[][] srcDstFiles = Wsj.getFilePairs(inputFile, outputFile);
            int size = srcDstFiles[0].length;
            for (int i = 0; i < size; ++i) {
                Wsj.makeCharBased(srcDstFiles[0][i], srcDstFiles[1][i]);
            }
            return;
        }
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
        PrintWriter pw = FileUtil.getPrintWriter(outputFile);
        try {
            for (TSNodeLabel t : treebank) {
                t.makeTreeCharBased();
                pw.println(t.toString());
            }
        } finally {
            pw.close();
        }
    }

    /**
     * Runs {@link #convertNMLBracketingToNormal(File, File)} on every regular, non-hidden file
     * directly inside {@code startDir}, in sorted order, writing each result to a file of the
     * same name inside {@code outputDir}.
     *
     * @param startDir  directory whose files are converted
     * @param outputDir directory receiving the output; created (with parents) if missing
     * @throws Exception propagated from converting a file
     */
    private static void convertNMLBracketingToNormalDir(File startDir, File outputDir) throws Exception {
        outputDir.mkdirs();
        File[] candidates = startDir.listFiles();
        Arrays.sort(candidates);
        for (File candidate : candidates) {
            String fileName = candidate.getName();
            // Skip dot-files and sub-directories.
            if (fileName.startsWith(".") || candidate.isDirectory()) {
                continue;
            }
            File target = new File(outputDir + "/" + fileName);
            Wsj.convertNMLBracketingToNormal(candidate, target);
        }
    }

    /**
     * Reads the treebank in {@code inputFile}, relabels non-terminals {@code NML} to {@code NP}
     * and {@code JJP} to {@code ADJP} in every tree, and writes each tree's
     * {@code toStringExtraParenthesis()} form on its own line to {@code outputFile}.
     *
     * <p>The writer is closed in a {@code finally} block so it is flushed and released even if
     * a tree fails to process.
     *
     * @param inputFile  treebank file to read
     * @param outputFile file to write the relabelled trees to
     * @throws Exception propagated from loading or transforming the treebank
     */
    private static void convertNMLBracketingToNormal(File inputFile, File outputFile) throws Exception {
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
        PrintWriter pw = FileUtil.getPrintWriter(outputFile);
        try {
            for (TSNodeLabel t : treebank) {
                t.replaceAllNonTerminalLabels("NML", "NP");
                t.replaceAllNonTerminalLabels("JJP", "ADJP");
                pw.println(t.toStringExtraParenthesis());
            }
        } finally {
            pw.close();
        }
    }

    /**
     * Loads {@code inputFile} through {@code makeReadableFileToArray}, parses every resulting
     * string into a {@code TSNodeLabel} and applies {@code makeCleanWsj()} to it.
     *
     * @param inputFile treebank file to read
     * @return the cleaned trees, in file order
     * @throws Exception propagated from reading or parsing the input
     */
    public static ArrayList<TSNodeLabel> getTreebankReadableAndClean(File inputFile) throws Exception {
        ArrayList<String> bracketings = Wsj.makeReadableFileToArray(inputFile);
        ArrayList<TSNodeLabel> cleanedTrees = new ArrayList<TSNodeLabel>();
        for (int idx = 0, total = bracketings.size(); idx < total; ++idx) {
            TSNodeLabel tree = new TSNodeLabel(bracketings.get(idx));
            tree.makeCleanWsj();
            cleanedTrees.add(tree);
        }
        return cleanedTrees;
    }

    /**
     * Loads {@code inputFile} through {@code makeReadableFileToArray} and parses every
     * resulting string into a {@code TSNodeLabel}.
     *
     * @param inputFile treebank file to read
     * @return the parsed trees, in file order
     * @throws Exception propagated from reading or parsing the input
     */
    public static ArrayList<TSNodeLabel> getTreebank(File inputFile) throws Exception {
        ArrayList<TSNodeLabel> result = new ArrayList<TSNodeLabel>();
        ArrayList<String> treebankString = Wsj.makeReadableFileToArray(inputFile);
        for (String treeString : treebankString) {
            result.add(new TSNodeLabel(treeString));
        }
        return result;
    }

    /**
     * Loads {@code inputFile} through {@code makeReadableFileToArray} and wraps every resulting
     * string in a {@code TSNodeLabelStructure}.
     *
     * @param inputFile treebank file to read
     * @return the parsed structures, in file order
     * @throws Exception propagated from reading or parsing the input
     */
    public static ArrayList<TSNodeLabelStructure> getTreebankStructure(File inputFile) throws Exception {
        ArrayList<String> bracketings = Wsj.makeReadableFileToArray(inputFile);
        ArrayList<TSNodeLabelStructure> structures = new ArrayList<TSNodeLabelStructure>();
        for (int idx = 0, total = bracketings.size(); idx < total; ++idx) {
            structures.add(new TSNodeLabelStructure(bracketings.get(idx)));
        }
        return structures;
    }

    /**
     * Returns the regular, non-hidden files directly inside {@code startDir}, sorted by the
     * natural ordering of {@link File}. Entries whose name starts with {@code "."} and
     * sub-directories are excluded.
     *
     * @param startDir directory to list
     * @return the matching files in sorted order; empty if there are none
     * @throws IllegalArgumentException if {@code startDir} cannot be listed (it is not a
     *         directory or an I/O error occurred); previously this surfaced as an unexplained
     *         {@code NullPointerException} from {@code Arrays.sort}
     * @throws Exception declared for compatibility with existing callers
     */
    public static ArrayList<File> getFileInDir(File startDir) throws Exception {
        File[] entries = startDir.listFiles();
        if (entries == null) {
            // File.listFiles() signals "not a directory" or an I/O error with null.
            throw new IllegalArgumentException("Cannot list directory: " + startDir);
        }
        Arrays.sort(entries);
        ArrayList<File> result = new ArrayList<File>();
        for (File entry : entries) {
            if (!entry.getName().startsWith(".") && !entry.isDirectory()) {
                result.add(entry);
            }
        }
        return result;
    }

    /**
     * Applies {@code Auxify.auxify} to every tree of every file returned by
     * {@code getFileInDir(startDir)} and writes each transformed treebank to a file of the same
     * name inside {@code outputDir} via {@code TSNodeLabel.printTreebankToFile}.
     *
     * @param startDir  directory containing the treebank files
     * @param outputDir directory receiving the transformed files
     * @throws Exception propagated from loading, transforming or writing a treebank
     */
    public static void auxifyWsj(File startDir, File outputDir) throws Exception {
        Auxify.orderArrays();
        for (File source : Wsj.getFileInDir(startDir)) {
            ArrayList<TSNodeLabel> trees = TSNodeLabel.getTreebank(source);
            File target = new File(outputDir + "/" + source.getName());
            for (int idx = 0; idx < trees.size(); ++idx) {
                Auxify.auxify(trees.get(idx));
            }
            TSNodeLabel.printTreebankToFile(target, trees, false, false);
        }
    }

    /**
     * For every file returned by {@code getFileInDir(startDir)}, writes to a file of the same
     * name inside {@code outputDir} only those trees whose
     * {@code countLexicalNodesExcludingCatLabels(nonCountCatInLength)} is at most
     * {@code length}.
     *
     * <p>Each writer is closed in a {@code finally} block, so a failure on one tree no longer
     * leaks the file handle or drops buffered output.
     *
     * @param length    maximum sentence length (inclusive) to keep
     * @param startDir  directory containing the treebank files
     * @param outputDir directory receiving the filtered files
     * @throws Exception propagated from loading a treebank or opening an output file
     */
    public static void filterSentenceUpToLength(int length, File startDir, File outputDir) throws Exception {
        ArrayList<File> fileList = Wsj.getFileInDir(startDir);
        for (File inputFile : fileList) {
            ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
            File outputFile = new File(outputDir + "/" + inputFile.getName());
            PrintWriter pw = new PrintWriter(outputFile);
            try {
                for (TSNodeLabel t : treebank) {
                    int sentenceLength = t.countLexicalNodesExcludingCatLabels(nonCountCatInLength);
                    if (sentenceLength <= length) {
                        pw.println(t.toString());
                    }
                }
            } finally {
                pw.close();
            }
        }
    }

    /**
     * Compares, tree by tree, {@code wsj-02-21.mrg} under {@code WsjOriginalCleanedSemTagsOff}
     * (after {@code addTop()}) with the same file under {@code WsjOriginalCleanedTop} (after
     * {@code removeSemanticTags()}), printing the 1-based index and both trees for every pair
     * that is not equal.
     *
     * <p>The comparison stops at the end of the shorter corpus instead of failing with a
     * {@code NoSuchElementException}, and a final line reports when the two corpora differ in
     * size.
     *
     * @throws Exception propagated from loading either treebank
     */
    private static void checkDiff() throws Exception {
        File file1 = new File(String.valueOf(WsjOriginalCleanedSemTagsOff) + "wsj-02-21.mrg");
        File file2 = new File(String.valueOf(WsjOriginalCleanedTop) + "wsj-02-21.mrg");
        ArrayList<TSNodeLabel> corpus1 = TSNodeLabel.getTreebank(file1);
        ArrayList<TSNodeLabel> corpus2 = TSNodeLabel.getTreebank(file2);
        Iterator<TSNodeLabel> iter1 = corpus1.iterator();
        Iterator<TSNodeLabel> iter2 = corpus2.iterator();
        int index = 0;
        while (iter1.hasNext() && iter2.hasNext()) {
            ++index;
            TSNodeLabel tree1 = iter1.next().addTop();
            TSNodeLabel tree2 = iter2.next();
            tree2.removeSemanticTags();
            if (tree1.equals(tree2)) {
                continue;
            }
            System.out.println(index + ":");
            System.out.println(tree1.toString());
            System.out.println(tree2.toString());
            System.out.println();
        }
        if (corpus1.size() != corpus2.size()) {
            // Trailing trees of the longer corpus have no counterpart to compare against.
            System.out.println("Corpus sizes differ: " + corpus1.size() + " vs " + corpus2.size());
        }
    }

    /**
     * Copies {@code wsj-24.mrg} from {@code WsjFlatNoTraces} to {@code WsjFlatNoTracesCharniak},
     * wrapping every line as {@code "<s> " + line + " </s>"}.
     *
     * <p>Scanner and writer are closed in {@code finally} blocks so neither is leaked if
     * reading or writing fails.
     */
    private static void makeTreesFlatForCharniack() {
        File inputFile = new File(String.valueOf(WsjFlatNoTraces) + "wsj-24.mrg");
        File outputFile = new File(String.valueOf(WsjFlatNoTracesCharniak) + "wsj-24.mrg");
        Scanner scan = FileUtil.getScanner(inputFile);
        try {
            PrintWriter pw = FileUtil.getPrintWriter(outputFile);
            try {
                while (scan.hasNextLine()) {
                    pw.println("<s> " + scan.nextLine() + " </s>");
                }
            } finally {
                pw.close();
            }
        } finally {
            scan.close();
        }
    }

    /**
     * Filters {@code wsj-22_1000best_noAux.mrg} in a hard-coded working directory into
     * {@code wsj-22_1000best_noAux_cleaned.mrg}: lines starting with {@code "("} are written
     * with every occurrence of {@code S1} replaced by {@code TOP}, empty lines are written as
     * empty lines, and all other lines are dropped.
     *
     * <p>Scanner and writer are closed in {@code finally} blocks so neither is leaked on
     * failure. The literal replacement uses {@code String.replace}, which gives the same result
     * as the former {@code replaceAll} for these literals without compiling a regex per line.
     */
    private static void cleanCharniackOutput() {
        // NOTE(review): machine-specific absolute path; only valid on the original author's setup.
        String workingDir = "/Users/fedja/Work/SOFTWARE/CharniakParser/reranking-parser/first-stage/Results/EN_noAux/";
        File inputFile = new File(String.valueOf(workingDir) + "wsj-22_1000best_noAux.mrg");
        File outputFile = new File(String.valueOf(workingDir) + "wsj-22_1000best_noAux_cleaned.mrg");
        Scanner scan = FileUtil.getScanner(inputFile);
        try {
            PrintWriter pw = FileUtil.getPrintWriter(outputFile);
            try {
                while (scan.hasNextLine()) {
                    String line = scan.nextLine();
                    if (line.startsWith("(")) {
                        pw.println(line.replace("S1", "TOP"));
                    } else if (line.equals("")) {
                        pw.println();
                    }
                }
            } finally {
                pw.close();
            }
        } finally {
            scan.close();
        }
    }

    /**
     * Runs {@link #makeCharBased(File, File)} on every regular, non-hidden file directly inside
     * {@code startDir}, in sorted order, writing each result to a file of the same name inside
     * {@code outputDir}.
     *
     * @param startDir  directory whose files are transformed
     * @param outputDir directory receiving the output; created (with parents) if missing
     * @throws Exception propagated from transforming a file
     */
    private static void customTransform(File startDir, File outputDir) throws Exception {
        outputDir.mkdirs();
        File[] candidates = startDir.listFiles();
        Arrays.sort(candidates);
        for (File candidate : candidates) {
            String fileName = candidate.getName();
            // Skip dot-files and sub-directories.
            if (fileName.startsWith(".") || candidate.isDirectory()) {
                continue;
            }
            File target = new File(outputDir + "/" + fileName);
            Wsj.makeCharBased(candidate, target);
        }
    }

    /**
     * Prints the labels that occur both on lexical and on non-lexical nodes in
     * {@code wsj-02-21.mrg} under {@code WsjOriginalCleanedTopSemTagsOff}.
     *
     * @throws Exception propagated from loading the treebank
     */
    private static void checkNodesOverlap() throws Exception {
        File corpusFile = new File(WsjOriginalCleanedTopSemTagsOff + "wsj-02-21.mrg");
        HashSet<Label> internalLabels = new HashSet<Label>();
        HashSet<Label> lexicalLabels = new HashSet<Label>();
        for (TSNodeLabel tree : Wsj.getTreebank(corpusFile)) {
            for (TSNodeLabel node : tree.collectAllNodes()) {
                HashSet<Label> bucket = node.isLexical ? lexicalLabels : internalLabels;
                bucket.add(node.label);
            }
        }
        System.out.println("Overlapping between lexical and non-lexical nodes:");
        // Keep only the labels seen in both roles.
        internalLabels.retainAll(lexicalLabels);
        System.out.println(internalLabels.toString());
    }

    /**
     * Pairs every regular, non-hidden file directly inside {@code srcDir} with a file of the
     * same name inside {@code dstDir}.
     *
     * <p>{@code dstDir} is created together with any missing parent directories, matching the
     * other directory-processing methods of this class (the former {@code mkdir()} silently
     * failed for nested output paths). Entries whose name starts with {@code "."} and
     * sub-directories are skipped; sources are sorted by the natural ordering of {@link File}.
     *
     * @param srcDir directory to list
     * @param dstDir directory the destination files are placed in
     * @return a two-row array: row 0 holds the source files, row 1 the destination files at the
     *         same index
     * @throws IllegalArgumentException if {@code srcDir} cannot be listed (not a directory or
     *         I/O error); previously an unexplained {@code NullPointerException}
     */
    public static File[][] getFilePairs(File srcDir, File dstDir) {
        dstDir.mkdirs();
        File[] entries = srcDir.listFiles();
        if (entries == null) {
            // File.listFiles() signals "not a directory" or an I/O error with null.
            throw new IllegalArgumentException("Cannot list directory: " + srcDir);
        }
        Arrays.sort(entries);
        ArrayList<File> srcFiles = new ArrayList<File>();
        ArrayList<File> dstFiles = new ArrayList<File>();
        for (File entry : entries) {
            if (entry.getName().startsWith(".") || entry.isDirectory()) {
                continue;
            }
            srcFiles.add(entry);
            dstFiles.add(new File(dstDir + "/" + entry.getName()));
        }
        File[][] result = new File[2][];
        result[0] = srcFiles.toArray(new File[srcFiles.size()]);
        result[1] = dstFiles.toArray(new File[dstFiles.size()]);
        return result;
    }

    /**
     * Entry point: runs {@code removeSemTagDir} from the {@code WsjOriginalCleaned} directory
     * into the {@code WsjOriginalCleanedSemTagsOff} directory.
     *
     * @param args ignored
     * @throws Exception propagated from {@code removeSemTagDir}
     */
    public static void main(String[] args) throws Exception {
        File cleanedDir = new File(WsjOriginalCleaned);
        File semTagsOffDir = new File(WsjOriginalCleanedSemTagsOff);
        Wsj.removeSemTagDir(cleanedDir, semTagsOffDir);
    }
}

