/*
 * Decompiled with CFR 0.152.
 */
package main.lexinduct;

import babel.content.corpora.accessors.CorpusAccessor;
import babel.content.corpora.accessors.CrawlCorpusAccessor;
import babel.content.corpora.accessors.EuroParlCorpusAccessor;
import babel.content.corpora.accessors.LexCorpusAccessor;
import babel.content.eqclasses.EquivalenceClass;
import babel.content.eqclasses.SimpleEquivalenceClass;
import babel.content.eqclasses.collectors.EquivalenceClassCollector;
import babel.content.eqclasses.collectors.SimpleEquivalenceClassCollector;
import babel.content.eqclasses.comparators.NumberComparator;
import babel.content.eqclasses.filters.DictionaryFilter;
import babel.content.eqclasses.filters.EquivalenceClassFilter;
import babel.content.eqclasses.filters.GarbageFilter;
import babel.content.eqclasses.filters.LengthFilter;
import babel.content.eqclasses.filters.NumOccurencesFilter;
import babel.content.eqclasses.filters.RomanizationFilter;
import babel.content.eqclasses.filters.StopWordsFilter;
import babel.content.eqclasses.properties.context.PhraseContextCollector;
import babel.content.eqclasses.properties.lshcontext.LSHContextCollector;
import babel.content.eqclasses.properties.lshtime.LSHTimeDistributionCollector;
import babel.content.eqclasses.properties.number.Number;
import babel.content.eqclasses.properties.number.NumberCollector;
import babel.content.eqclasses.properties.time.PhraseTimeDistributionCollector;
import babel.content.eqclasses.properties.time.TimeDistribution;
import babel.content.eqclasses.properties.type.Type;
import babel.ranking.scorers.Scorer;
import babel.util.config.Configurator;
import babel.util.dict.Dictionary;
import babel.util.dict.SimpleDictionary;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Prepares the inputs of a lexicon-induction run.
 *
 * <p>{@link #prepare()} collects the source/target context equivalence classes,
 * loads the seed and transliteration dictionaries, samples the source candidates
 * from bins, selects the target candidates and finally prunes the context classes.
 * All settings are read from {@code Configurator.CONFIG}.
 *
 * <p>Instances hold mutable state and perform no synchronization.
 */
public class FreqBinInductPreparer {
    protected static final Log LOG = LogFactory.getLog(FreqBinInductPreparer.class);
    /** Charset used by {@link #writeSelectedCandidates(String)}. */
    protected static final String DEFAULT_CHARSET = "UTF-8";
    /** Lower occurrence threshold handed to the source candidate filters. */
    protected static final int SRC_LOW_THRESH = 10;
    /** Upper occurrence threshold handed to the source candidate filters. */
    protected static final int SRC_HI_THRESH = 5000;
    /** Lower occurrence threshold handed to the target candidate filters. */
    protected static final int TRG_LOW_THRESH = 10;
    /** Upper occurrence threshold handed to the target candidate filters. */
    protected static final int TRG_HI_THRESH = Integer.MAX_VALUE;

    /** Seed dictionary; {@code null} until {@link #prepareSeedDictionary} has run. */
    protected Dictionary m_seedDict = null;
    /** Transliteration dictionary; stays {@code null} when none is configured. */
    protected SimpleDictionary m_translitDict = null;
    /** Sampled source candidates, one set per bin, in bin order. */
    protected List<Set<EquivalenceClass>> m_binnedSrcEqs = null;
    /** Union of all sampled source candidates. */
    protected Set<EquivalenceClass> m_srcEqs = null;
    /** Selected target candidates. */
    protected Set<EquivalenceClass> m_trgEqs = null;
    /** Source context equivalence classes. */
    protected Set<EquivalenceClass> m_contextSrcEqs = null;
    /** Target context equivalence classes. */
    protected Set<EquivalenceClass> m_contextTrgEqs = null;
    /** Largest occurrence count seen among the source context classes. */
    protected long m_maxTokCountInSrc = 0L;
    /** Largest occurrence count seen among the target context classes. */
    protected long m_maxTokCountInTrg = 0L;

    /**
     * Runs the whole preparation pipeline. The order matters: the dictionaries are built
     * from the unfiltered context classes, and the candidate selection reads the context
     * classes before {@link #filterContextEqs()} replaces them with the pruned sets.
     *
     * @throws Exception if any of the preparation steps fails
     */
    public void prepare() throws Exception {
        this.collectContextEqs();
        this.prepareSeedDictionary(this.m_contextSrcEqs, this.m_contextTrgEqs);
        this.prepareTranslitDictionary(this.m_contextSrcEqs);
        this.selectSrcCandidatesByNum();
        this.selectTrgCandidates();
        this.filterContextEqs();
    }

    /** @return all sampled source candidates, or {@code null} before selection has run */
    public Set<EquivalenceClass> getSrcEqsToInduct() {
        return this.m_srcEqs;
    }

    /** @return the sampled source candidates grouped per bin, or {@code null} before selection has run */
    public List<Set<EquivalenceClass>> getBinnedSrcEqs() {
        return this.m_binnedSrcEqs;
    }

    /** @return the selected target candidates, or {@code null} before selection has run */
    public Set<EquivalenceClass> getTrgEqs() {
        return this.m_trgEqs;
    }

    /** @return the seed dictionary, or {@code null} before it has been prepared */
    public Dictionary getSeedDict() {
        return this.m_seedDict;
    }

    /** @return the transliteration dictionary, or {@code null} if none was configured or prepared */
    public SimpleDictionary getTranslitDict() {
        return this.m_translitDict;
    }

    /** @return the largest occurrence count among the source context classes */
    public long getMaxSrcTokCount() {
        return this.m_maxTokCountInSrc;
    }

    /** @return the largest occurrence count among the target context classes */
    public long getMaxTrgTokCount() {
        return this.m_maxTokCountInTrg;
    }

    /**
     * Collects context and time-distribution properties for the given phrases and, when
     * {@code preprocessing.time.Align} is set, aligns the time bins of both sides.
     *
     * @param srcPhrases source phrases to collect properties for
     * @param trgPhrases target phrases to collect properties for
     * @throws Exception if a collector or corpus accessor fails
     */
    public void collectContextAndTimeProps(Set<? extends EquivalenceClass> srcPhrases, Set<? extends EquivalenceClass> trgPhrases) throws Exception {
        LOG.info(" - Collecting context and time phrase properties for " + srcPhrases.size() + " source and " + trgPhrases.size() + " target phrases " + " ...");
        int maxPhraseLength = Configurator.CONFIG.getInt("preprocessing.phrases.MaxPhraseLength");
        boolean caseSensitive = Configurator.CONFIG.getBoolean("preprocessing.phrases.CaseSensitive");
        int contextWindowSize = Configurator.CONFIG.getInt("preprocessing.context.Window");
        boolean alignDistros = Configurator.CONFIG.getBoolean("preprocessing.time.Align");

        Set<Integer> srcBins = this.collectContextAndTimeProps(true, srcPhrases, maxPhraseLength, this.m_contextSrcEqs, contextWindowSize, caseSensitive);
        Set<Integer> trgBins = this.collectContextAndTimeProps(false, trgPhrases, maxPhraseLength, this.m_contextTrgEqs, contextWindowSize, caseSensitive);

        if (alignDistros) {
            LOG.info(" - Aligning temporal distributions...");
            this.alignDistributions(srcBins, trgBins, srcPhrases, trgPhrases);
        }
    }

    /**
     * Removes from every source distribution the bins that occur only on the source side,
     * and from every target distribution the bins that occur only on the target side.
     * The bin sets passed in are not modified.
     *
     * @param srcBins bins collected for the source side
     * @param trgBins bins collected for the target side
     * @param srcEqs  source classes whose {@code TimeDistribution} property is trimmed
     * @param trgEqs  target classes whose {@code TimeDistribution} property is trimmed
     */
    protected void alignDistributions(Set<Integer> srcBins, Set<Integer> trgBins, Set<? extends EquivalenceClass> srcEqs, Set<? extends EquivalenceClass> trgEqs) {
        HashSet<Integer> srcOnlyBins = new HashSet<Integer>(srcBins);
        srcOnlyBins.removeAll(trgBins);
        this.removeBinsFromAll(srcEqs, srcOnlyBins);

        HashSet<Integer> trgOnlyBins = new HashSet<Integer>(trgBins);
        trgOnlyBins.removeAll(srcBins);
        this.removeBinsFromAll(trgEqs, trgOnlyBins);

        // Only needed for the summary below.
        Set<Integer> commonBins = new HashSet<Integer>(srcBins);
        commonBins.retainAll(trgBins);

        LOG.info("There are " + srcBins.size() + " days in src distributions.");
        LOG.info("There are " + trgBins.size() + " days in trg distributions.");
        LOG.info("There are " + commonBins.size() + " common days between src and trg distributions.");
    }

    /**
     * Drops the given bins from the time distribution of every class that has one.
     * The parameter stays a {@code HashSet} so the argument handed to
     * {@code removeBins} has the same static type as before.
     */
    private void removeBinsFromAll(Set<? extends EquivalenceClass> eqs, HashSet<Integer> binsToRemove) {
        for (EquivalenceClass eq : eqs) {
            TimeDistribution timeProp = (TimeDistribution) eq.getProperty(TimeDistribution.class.getName());
            if (timeProp != null) {
                timeProp.removeBins(binsToRemove);
            }
        }
    }

    /**
     * Lets both scorers prepare every class and optionally runs the LSH context and
     * LSH time-distribution collectors over the classes.
     *
     * @param src           {@code true} for the source side (only affects log messages here)
     * @param eqs           classes to prepare
     * @param contextScorer scorer whose {@code prepare} is invoked first for each class
     * @param timeScorer    scorer whose {@code prepare} is invoked second for each class
     * @param mapToLSH      whether to run the two LSH collectors afterwards
     * @throws Exception if a scorer or collector fails
     */
    public void prepareContextAndTimeProps(boolean src, Set<? extends EquivalenceClass> eqs, Scorer contextScorer, Scorer timeScorer, boolean mapToLSH) throws Exception {
        LOG.info(" - " + (src ? "Projecting and scoring source" : "Scoring target") + " contextual items with " + contextScorer.toString() + " and time distributions with " + timeScorer.toString() + "...");
        for (EquivalenceClass eq : eqs) {
            contextScorer.prepare(eq);
            timeScorer.prepare(eq);
        }
        if (!mapToLSH) {
            return;
        }
        String side = src ? "source" : "target";
        LOG.info(" - Mapping " + side + " context into LSH space...");
        new LSHContextCollector(true).collectProperty(eqs);
        LOG.info(" - Mapping " + side + " temporal into LSH space...");
        new LSHTimeDistributionCollector(true).collectProperty(eqs);
    }

    /**
     * Collects context properties from the corpus named by {@code preprocessing.input.Context}
     * and time distributions from the corpus named by {@code preprocessing.input.Time}.
     *
     * @param src               {@code true} to read the source side of the corpora
     * @param phrases           phrases to collect properties for
     * @param maxPhraseLength   passed to both collectors
     * @param contextEqs        context classes passed to the context collector
     * @param contextWindowSize passed twice to the context collector
     *                          (presumably left and right window - confirm)
     * @param caseSensitive     passed to both collectors
     * @return the bins reported by the time-distribution collector
     * @throws Exception if an accessor or collector fails
     */
    protected Set<Integer> collectContextAndTimeProps(boolean src, Set<? extends EquivalenceClass> phrases, int maxPhraseLength, Set<EquivalenceClass> contextEqs, int contextWindowSize, boolean caseSensitive) throws Exception {
        CorpusAccessor contextAccessor = this.getAccessor(Configurator.CONFIG.getString("preprocessing.input.Context"), src);
        new PhraseContextCollector(maxPhraseLength, caseSensitive, contextWindowSize, contextWindowSize, contextEqs).collectProperty(contextAccessor, phrases);

        CorpusAccessor timeAccessor = this.getAccessor(Configurator.CONFIG.getString("preprocessing.input.Time"), src);
        PhraseTimeDistributionCollector distCollector = new PhraseTimeDistributionCollector(maxPhraseLength, caseSensitive);
        distCollector.collectProperty(timeAccessor, phrases);
        return distCollector.binsCollected();
    }

    /**
     * Selects source candidates by splitting the filtered, sorted source classes into
     * {@code experiments.NumSourceBins} consecutive bins of equal size and drawing the same
     * number of classes at random, without replacement, from each bin.
     *
     * <p>Classes left over after the integer division into bins are never sampled. A
     * configured bin count of zero fails with an {@code ArithmeticException}.
     *
     * @throws Exception if filtering the source classes fails
     */
    protected void selectSrcCandidatesByNum() throws Exception {
        // ArrayList rather than LinkedList: the bins are taken as index ranges below.
        List<EquivalenceClass> sortedSrcEqs = new ArrayList<EquivalenceClass>(this.createAndFilterSrcEqs(this.m_contextSrcEqs, SRC_LOW_THRESH, SRC_HI_THRESH));
        // NOTE(review): ordering is whatever NumberComparator(false) defines - presumably by count; confirm.
        Collections.sort(sortedSrcEqs, new NumberComparator(false));

        int numSource = Configurator.CONFIG.getInt("experiments.NumSource");
        int numBins = Configurator.CONFIG.getInt("experiments.NumSourceBins");
        int numInBin = sortedSrcEqs.size() / numBins;
        // Never larger than numInBin, so a bin cannot run empty while sampling.
        int numToSampleFromBin = Math.min(numSource, sortedSrcEqs.size()) / numBins;

        this.m_srcEqs = new HashSet<EquivalenceClass>();
        this.m_binnedSrcEqs = new LinkedList<Set<EquivalenceClass>>();
        Random rand = new Random();

        LOG.info(" - Selecting " + numSource + " source candidates from " + numBins + " bins ...");

        for (int binNum = 0; binNum < numBins; binNum++) {
            // The elements stem from a Set and are therefore distinct, so slicing by offset
            // yields the same bins as repeatedly removing the head of the list.
            int binStart = binNum * numInBin;
            List<EquivalenceClass> bin = new ArrayList<EquivalenceClass>(sortedSrcEqs.subList(binStart, binStart + numInBin));

            Set<EquivalenceClass> binSample = new HashSet<EquivalenceClass>();
            this.m_binnedSrcEqs.add(binSample);

            double countSum = 0.0;
            double minCount = Double.MAX_VALUE;
            // Double.MIN_VALUE is the smallest POSITIVE double and must not seed a maximum.
            double maxCount = -Double.MAX_VALUE;

            for (int i = 0; i < numToSampleFromBin; i++) {
                EquivalenceClass eq = bin.remove(rand.nextInt(bin.size()));
                binSample.add(eq);
                this.m_srcEqs.add(eq);

                double count = ((Number) eq.getProperty(Number.class.getName())).getNumber();
                minCount = Math.min(minCount, count);
                maxCount = Math.max(maxCount, count);
                countSum += count;
            }

            LOG.info(" - Bin " + binNum + ": counts between " + minCount + " and " + maxCount + ", average = " + countSum / (double) binSample.size() + " and " + binSample.size() + " source candidates ...");
        }

        LOG.info(" - Selected " + this.m_srcEqs.size() + " source candidates ...");
    }

    /**
     * Selects source candidates from equally wide occurrence-count bands between
     * {@link #SRC_LOW_THRESH} and the smaller of {@link #SRC_HI_THRESH} and the maximal
     * source count, walking from the highest band down. From each band at most
     * {@code NumSource / NumSourceBins} classes are drawn at random without replacement.
     *
     * <p>The band scan stops at the first class whose count is at or below the band's lower
     * bound, which relies on the sorted list being in descending count order
     * (NOTE(review): presumably what {@code NumberComparator(false)} yields - confirm).
     *
     * @throws Exception if filtering the source classes fails
     */
    protected void selectSrcCandidatesByFreq() throws Exception {
        int numSource = Configurator.CONFIG.getInt("experiments.NumSource");
        int numBins = Configurator.CONFIG.getInt("experiments.NumSourceBins");
        int numInBin = numSource / numBins;

        long upperCount = Math.min((long) SRC_HI_THRESH, this.m_maxTokCountInSrc);
        double binSize = (double) (upperCount - SRC_LOW_THRESH) / (double) numBins;
        assert (binSize > 0.0);

        LOG.info(" - Selecting " + numSource + " source candidates from " + numBins + " frequency bins (size " + (int) binSize + ") ...");

        this.m_srcEqs = new HashSet<EquivalenceClass>();
        this.m_binnedSrcEqs = new LinkedList<Set<EquivalenceClass>>();

        // ArrayList: the scan below uses positional access, which is linear per call on a LinkedList.
        List<EquivalenceClass> sortedSrcEqs = new ArrayList<EquivalenceClass>(this.createAndFilterSrcEqs(this.m_contextSrcEqs, SRC_LOW_THRESH, SRC_HI_THRESH));
        Collections.sort(sortedSrcEqs, new NumberComparator(false));

        Random rand = new Random();
        double to = upperCount;
        // Scan position; deliberately kept across bands so each class is visited once.
        int j = 0;

        while (to > SRC_LOW_THRESH) {
            double from = to;
            to = Math.max(to - binSize, SRC_LOW_THRESH);

            // Gather the classes whose count lies in (to, from].
            List<EquivalenceClass> band = new ArrayList<EquivalenceClass>();
            while (j < sortedSrcEqs.size()) {
                EquivalenceClass candidate = sortedSrcEqs.get(j);
                long eqNum = ((Number) candidate.getProperty(Number.class.getName())).getNumber();
                if ((double) eqNum <= from && (double) eqNum > to) {
                    band.add(candidate);
                } else if ((double) eqNum <= to) {
                    // Belongs to a lower band; resume from here on the next pass.
                    break;
                }
                ++j;
            }

            Set<EquivalenceClass> bandSet = new HashSet<EquivalenceClass>();
            this.m_binnedSrcEqs.add(bandSet);

            int numCollected = 0;
            while (!band.isEmpty() && numCollected < numInBin) {
                EquivalenceClass picked = band.remove(rand.nextInt(band.size()));
                this.m_srcEqs.add(picked);
                bandSet.add(picked);
                ++numCollected;
            }

            LOG.info(" - Bin " + (this.m_binnedSrcEqs.size() - 1) + ": [" + to + "," + from + "] has " + numCollected + " source candidates ...");
        }

        LOG.info(" - Selected " + this.m_srcEqs.size() + " source candidates ...");
    }

    /**
     * Writes the binned source candidates to a UTF-8 text file: one header line per bin
     * followed by one {@code count<TAB>stem} line per candidate.
     *
     * @param fileName path of the file to (over)write
     * @throws Exception if the file cannot be opened or written
     */
    public void writeSelectedCandidates(String fileName) throws Exception {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName), DEFAULT_CHARSET));
        try {
            int num = 0;
            for (Set<EquivalenceClass> bin : this.m_binnedSrcEqs) {
                writer.write("-------------- Bin " + num + " --------------\n");
                num++;
                for (EquivalenceClass eq : bin) {
                    long count = ((Number) eq.getProperty(Number.class.getName())).getNumber();
                    writer.write(count + "\t" + eq.getStem() + "\n");
                }
            }
        } finally {
            // Release the file handle even when a write fails part-way through.
            writer.close();
        }
    }

    /**
     * Selects the target candidates by filtering the target context classes.
     *
     * @throws Exception if filtering fails
     */
    protected void selectTrgCandidates() throws Exception {
        LOG.info(" - Selecting target candidates ...");
        this.m_trgEqs = this.createAndFilterTrgEqs(this.m_contextTrgEqs, TRG_LOW_THRESH, TRG_HI_THRESH);
        LOG.info(" - Selected " + this.m_trgEqs.size() + " target candidates ...");
    }

    /**
     * Replaces both context class sets with their filtered versions, using the pruning
     * bounds from {@code preprocessing.context.PruneEqIfOccursFewerThan} and
     * {@code preprocessing.context.PruneEqIfOccursMoreThan}.
     *
     * @throws Exception if filtering fails
     */
    protected void filterContextEqs() throws Exception {
        int pruneContEqIfOccursFewerThan = Configurator.CONFIG.getInt("preprocessing.context.PruneEqIfOccursFewerThan");
        int pruneContEqIfOccursMoreThan = Configurator.CONFIG.getInt("preprocessing.context.PruneEqIfOccursMoreThan");
        this.m_contextSrcEqs = this.filterContextEqs(true, this.m_contextSrcEqs, pruneContEqIfOccursFewerThan, pruneContEqIfOccursMoreThan);
        this.m_contextTrgEqs = this.filterContextEqs(false, this.m_contextTrgEqs, pruneContEqIfOccursFewerThan, pruneContEqIfOccursMoreThan);
    }

    /**
     * Filters context classes with a seed-dictionary filter and two occurrence-count filters.
     *
     * @param src                          {@code true} for the source side
     * @param eqs                          classes to filter
     * @param pruneContEqIfOccursFewerThan bound given to the first occurrence filter
     * @param pruneContEqIfOccursMoreThan  bound given to the second occurrence filter
     * @return the set produced by {@code EquivalenceClassCollector.filter}
     * @throws Exception if filtering fails
     */
    protected Set<EquivalenceClass> filterContextEqs(boolean src, Set<EquivalenceClass> eqs, int pruneContEqIfOccursFewerThan, int pruneContEqIfOccursMoreThan) throws Exception {
        String side = src ? "source" : "target";
        LOG.info(" - Filtering " + side + " contextual words: keeping those in dict [" + this.m_seedDict.toString() + "] and occuring (" + pruneContEqIfOccursFewerThan + "," + pruneContEqIfOccursMoreThan + ") times...");

        List<EquivalenceClassFilter> filters = new LinkedList<EquivalenceClassFilter>();
        filters.add(new DictionaryFilter(this.m_seedDict, true, src));
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursFewerThan, true));
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursMoreThan, false));

        Set<EquivalenceClass> filtContextEqs = EquivalenceClassCollector.filter(eqs, filters);
        LOG.info(" - Filtered context " + side + " classes: " + filtContextEqs.size());
        return filtContextEqs;
    }

    /**
     * Filters the source context classes with a seed-dictionary filter, two occurrence-count
     * filters and, if {@code resources.stopwords.SrcStopWords} is configured, a stop-word filter.
     *
     * @param contSrcEqs                   source context classes to filter
     * @param pruneContEqIfOccursFewerThan bound given to the first occurrence filter
     * @param pruneContEqIfOccursMoreThan  bound given to the second occurrence filter
     * @return the set produced by {@code EquivalenceClassCollector.filter}
     * @throws Exception if reading the stop words or filtering fails
     */
    protected Set<EquivalenceClass> createAndFilterSrcEqs(Set<EquivalenceClass> contSrcEqs, int pruneContEqIfOccursFewerThan, int pruneContEqIfOccursMoreThan) throws Exception {
        String stopWordsDir = Configurator.CONFIG.getString("resources.stopwords.Path");
        String srcStopFileName = Configurator.CONFIG.containsKey("resources.stopwords.SrcStopWords") ? Configurator.CONFIG.getString("resources.stopwords.SrcStopWords") : null;

        LOG.info(" - Filtering source words: keeping those in dict [" + this.m_seedDict.toString() + "] and occuring (" + pruneContEqIfOccursFewerThan + "," + pruneContEqIfOccursMoreThan + ") times" + (srcStopFileName == null ? " ..." : " and not in the stop word list ..."));

        List<EquivalenceClassFilter> filters = new LinkedList<EquivalenceClassFilter>();
        filters.add(new DictionaryFilter(this.m_seedDict, true, true));
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursFewerThan, true));
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursMoreThan, false));
        if (srcStopFileName != null) {
            this.addStopWordsFilter(filters, stopWordsDir, srcStopFileName);
        }

        Set<EquivalenceClass> filtSrcEqs = EquivalenceClassCollector.filter(contSrcEqs, filters);
        LOG.info(" - Filtered source classes: " + filtSrcEqs.size());
        return filtSrcEqs;
    }

    /**
     * Filters the target context classes with two occurrence-count filters and, if
     * {@code resources.stopwords.TrgStopWords} is configured, a stop-word filter.
     *
     * @param contTrgEqs                   target context classes to filter
     * @param pruneContEqIfOccursFewerThan bound given to the first occurrence filter
     * @param pruneContEqIfOccursMoreThan  bound given to the second occurrence filter
     * @return the set produced by {@code EquivalenceClassCollector.filter}
     * @throws Exception if reading the stop words or filtering fails
     */
    protected Set<EquivalenceClass> createAndFilterTrgEqs(Set<EquivalenceClass> contTrgEqs, int pruneContEqIfOccursFewerThan, int pruneContEqIfOccursMoreThan) throws Exception {
        String stopWordsDir = Configurator.CONFIG.getString("resources.stopwords.Path");
        String trgStopFileName = Configurator.CONFIG.containsKey("resources.stopwords.TrgStopWords") ? Configurator.CONFIG.getString("resources.stopwords.TrgStopWords") : null;

        LOG.info(" - Filtering target words: keeping those occuring (" + pruneContEqIfOccursFewerThan + "," + pruneContEqIfOccursMoreThan + ") times" + (trgStopFileName == null ? " ..." : " and not in the stop word list ..."));

        List<EquivalenceClassFilter> filters = new LinkedList<EquivalenceClassFilter>();
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursFewerThan, true));
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursMoreThan, false));
        if (trgStopFileName != null) {
            this.addStopWordsFilter(filters, stopWordsDir, trgStopFileName);
        }

        Set<EquivalenceClass> filtTrgEqs = EquivalenceClassCollector.filter(contTrgEqs, filters);
        LOG.info(" - Filtered target classes: " + filtTrgEqs.size());
        return filtTrgEqs;
    }

    /**
     * Appends a stop-word filter to {@code filters}. The stop words are read from
     * {@code stopWordsDir + stopFileName} when that file exists; otherwise the filter is
     * built from an empty set.
     */
    private void addStopWordsFilter(List<EquivalenceClassFilter> filters, String stopWordsDir, String stopFileName) throws Exception {
        // NOTE(review): the collector receives the filters accumulated so far, so they appear
        // to be applied to the stop-word file as well - confirm this is intended.
        SimpleEquivalenceClassCollector collector = new SimpleEquivalenceClassCollector(filters, false);
        Set<EquivalenceClass> stopEqs;
        if (new File(stopWordsDir + stopFileName).exists()) {
            stopEqs = collector.collect(new LexCorpusAccessor(stopFileName, stopWordsDir, true).getCorpusReader(), -1);
        } else {
            stopEqs = new HashSet<EquivalenceClass>();
        }
        filters.add(new StopWordsFilter(stopEqs));
    }

    /**
     * Loads the seed dictionary, either from the single file named by
     * {@code resources.dictionary.Dictionary} or from the two halves named by
     * {@code resources.dictionary.SrcName} / {@code TrgName}, prunes it and wraps it into
     * {@link #m_seedDict} together with the given context classes.
     *
     * @param srcContEqs source context classes handed to the {@code Dictionary}
     * @param trgContEqs target context classes handed to the {@code Dictionary}
     * @throws Exception if the dictionary cannot be read
     */
    protected void prepareSeedDictionary(Set<EquivalenceClass> srcContEqs, Set<EquivalenceClass> trgContEqs) throws Exception {
        String dictDir = Configurator.CONFIG.getString("resources.dictionary.Path");
        // -1 is used when no pruning value is configured.
        int ridDictNumTrans = Configurator.CONFIG.containsKey("experiments.DictionaryPruneNumTranslations") ? Configurator.CONFIG.getInt("experiments.DictionaryPruneNumTranslations") : -1;

        LOG.info(" - Reading/preparing seed dictionary ...");

        SimpleDictionary simpSeedDict;
        if (Configurator.CONFIG.containsKey("resources.dictionary.Dictionary")) {
            String dictFileName = Configurator.CONFIG.getString("resources.dictionary.Dictionary");
            simpSeedDict = new SimpleDictionary(dictDir + dictFileName, "SeedDictionary");
        } else {
            String srcDictFileName = Configurator.CONFIG.getString("resources.dictionary.SrcName");
            String trgDictFileName = Configurator.CONFIG.getString("resources.dictionary.TrgName");
            simpSeedDict = new SimpleDictionary(new SimpleDictionary.DictHalves(dictDir + srcDictFileName, dictDir + trgDictFileName), "SeedDictionary");
        }

        simpSeedDict.pruneCounts(ridDictNumTrans);
        this.m_seedDict = new Dictionary(srcContEqs, trgContEqs, simpSeedDict, "SeedDictionary");
        LOG.info(" - Seed dictionary: " + this.m_seedDict.toString());
    }

    /**
     * Loads the optional transliteration dictionary into {@link #m_translitDict}. Nothing is
     * loaded when {@code resources.translit.Path} is missing or blank.
     *
     * @param srcContEqs not used by this implementation
     * @throws Exception if the dictionary cannot be read
     */
    protected void prepareTranslitDictionary(Set<EquivalenceClass> srcContEqs) throws Exception {
        LOG.info(" - Reading/preparing transliteration dictionary ...");
        String dictDir = Configurator.CONFIG.containsKey("resources.translit.Path") ? Configurator.CONFIG.getString("resources.translit.Path") : null;

        if (dictDir == null || dictDir.trim().length() == 0) {
            LOG.info(" - No transliteration dictionary specified");
            return;
        }

        if (Configurator.CONFIG.containsKey("resources.translit.Dictionary")) {
            String dictFileName = Configurator.CONFIG.getString("resources.translit.Dictionary");
            this.m_translitDict = new SimpleDictionary(dictDir + dictFileName, "Translit");
        } else {
            String srcDictFileName = Configurator.CONFIG.getString("resources.translit.SrcName");
            String trgDictFileName = Configurator.CONFIG.getString("resources.translit.TrgName");
            this.m_translitDict = new SimpleDictionary(new SimpleDictionary.DictHalves(dictDir + srcDictFileName, dictDir + trgDictFileName), "TranslitDictionary");
        }
        LOG.info(" - Transliteration dictionary: " + this.m_translitDict.toString());
    }

    /**
     * Builds the source and target context classes and records the largest occurrence count
     * on each side. The equivalence class implementations are loaded by name from
     * {@code preprocessing.context.SrcEqClass} and {@code preprocessing.context.TrgEqClass}.
     *
     * @throws Exception if a class cannot be loaded or the corpus cannot be read
     */
    protected void collectContextEqs() throws Exception {
        LOG.info(" - Constructing contextual equivalence classes...");
        boolean filterRomanSrc = Configurator.CONFIG.containsKey("preprocessing.FilterRomanSrc") && Configurator.CONFIG.getBoolean("preprocessing.FilterRomanSrc");
        boolean filterRomanTrg = Configurator.CONFIG.containsKey("preprocessing.FilterRomanTrg") && Configurator.CONFIG.getBoolean("preprocessing.FilterRomanTrg");

        // Class.forName yields Class<?>; the configured names are expected to denote
        // EquivalenceClass implementations, which the compiler cannot verify.
        @SuppressWarnings("unchecked")
        Class<EquivalenceClass> srcContClassClass = (Class<EquivalenceClass>) Class.forName(Configurator.CONFIG.getString("preprocessing.context.SrcEqClass"));
        @SuppressWarnings("unchecked")
        Class<EquivalenceClass> trgContClassClass = (Class<EquivalenceClass>) Class.forName(Configurator.CONFIG.getString("preprocessing.context.TrgEqClass"));

        this.m_contextSrcEqs = this.collectContextEqs(true, true, filterRomanSrc, srcContClassClass);
        this.m_contextTrgEqs = this.collectContextEqs(false, true, filterRomanTrg, trgContClassClass);
        this.m_maxTokCountInSrc = this.collectMaxOccurrenceCount(this.m_contextSrcEqs);
        this.m_maxTokCountInTrg = this.collectMaxOccurrenceCount(this.m_contextTrgEqs);

        LOG.info(" - Source context classes = " + this.m_contextSrcEqs.size() + ", max occurrences = " + this.m_maxTokCountInSrc);
        LOG.info(" - Target context classes = " + this.m_contextTrgEqs.size() + ", max occurrences = " + this.m_maxTokCountInTrg);
    }

    /**
     * Collects the words of the context corpus for one side, counts them, converts them into
     * instances of {@code contextClassClass} and tags them with the side's type.
     *
     * @param src               {@code true} for the source side
     * @param caseSensitive     passed to the word collector and the count collector
     * @param filterRoman       whether to add a {@code RomanizationFilter}
     * @param contextClassClass equivalence class implementation to instantiate
     * @return the constructed context classes
     * @throws Exception if the corpus cannot be read or a class cannot be instantiated
     */
    protected Set<EquivalenceClass> collectContextEqs(boolean src, boolean caseSensitive, boolean filterRoman, Class<EquivalenceClass> contextClassClass) throws Exception {
        List<EquivalenceClassFilter> filters = new ArrayList<EquivalenceClassFilter>(3);
        filters.add(new GarbageFilter());
        filters.add(new LengthFilter(2));
        if (filterRoman) {
            filters.add(new RomanizationFilter());
        }

        CorpusAccessor accessor = this.getAccessor(Configurator.CONFIG.getString("preprocessing.input.Context"), src);
        SimpleEquivalenceClassCollector collector = new SimpleEquivalenceClassCollector(filters, caseSensitive);
        Set<EquivalenceClass> words = collector.collect(accessor.getCorpusReader(), -1);
        new NumberCollector(caseSensitive).collectProperty(accessor, words);

        Set<EquivalenceClass> eqs = this.constructEqClasses(src, words, contextClassClass);
        this.assignTypeProp(eqs, src ? Type.EqType.SOURCE : Type.EqType.TARGET);
        return eqs;
    }

    /**
     * Converts simple word classes into instances of {@code eqClassClass}, merging words
     * that map to the same stem and summing their occurrence counts.
     *
     * @param src          {@code true} to tag new classes as source, otherwise as target
     * @param allEqs       word classes; each is cast to {@code SimpleEquivalenceClass} and must
     *                     carry a {@code Number} property
     * @param eqClassClass implementation to instantiate through its no-argument constructor
     * @return one class per distinct stem
     * @throws Exception if an instance cannot be created
     */
    protected Set<EquivalenceClass> constructEqClasses(boolean src, Set<EquivalenceClass> allEqs, Class<? extends EquivalenceClass> eqClassClass) throws Exception {
        HashMap<String, EquivalenceClass> eqsByStem = new HashMap<String, EquivalenceClass>();
        for (EquivalenceClass eq : allEqs) {
            String newWord = ((SimpleEquivalenceClass) eq).getWord();
            long newCount = ((Number) eq.getProperty(Number.class.getName())).getNumber();

            EquivalenceClass newEq = eqClassClass.newInstance();
            newEq.init(newWord, true);

            EquivalenceClass foundEq = eqsByStem.get(newEq.getStem());
            if (foundEq != null) {
                // Stem already known: fold the word and its count into the existing class.
                foundEq.merge(newEq);
                ((Number) foundEq.getProperty(Number.class.getName())).increment(newCount);
            } else {
                newEq.assignId();
                newEq.setProperty(new Number(newCount));
                newEq.setProperty(new Type(src ? Type.EqType.SOURCE : Type.EqType.TARGET));
                eqsByStem.put(newEq.getStem(), newEq);
            }
        }
        return new HashSet<EquivalenceClass>(eqsByStem.values());
    }

    /**
     * Finds the largest {@code Number} property value among the given classes.
     *
     * @param eqs classes to inspect; classes without a {@code Number} property are skipped
     * @return the largest count found, or {@code 0} if there is none above zero
     */
    protected long collectMaxOccurrenceCount(Set<? extends EquivalenceClass> eqs) {
        long maxOccurCount = 0L;
        for (EquivalenceClass eq : eqs) {
            Number num = (Number) eq.getProperty(Number.class.getName());
            if (num != null) {
                maxOccurCount = Math.max(maxOccurCount, num.getNumber());
            }
        }
        return maxOccurCount;
    }

    /**
     * Sets one shared {@code Type} instance as a property on every given class.
     *
     * @param eqClasses classes to tag
     * @param type      type value to wrap
     */
    protected void assignTypeProp(Set<? extends EquivalenceClass> eqClasses, Type.EqType type) {
        Type commonType = new Type(type);
        for (EquivalenceClass eq : eqClasses) {
            eq.setProperty(commonType);
        }
    }

    /**
     * Creates the corpus accessor for the given corpus kind.
     *
     * @param kind one of {@code "europarl"}, {@code "wiki"}, {@code "crawls"}, {@code "dev"}
     *             or {@code "test"}
     * @param src  {@code true} for the source side of the corpus
     * @return the accessor, or {@code null} (after logging an error) for any other kind
     * @throws Exception if the accessor cannot be created
     */
    protected CorpusAccessor getAccessor(String kind, boolean src) throws Exception {
        if ("europarl".equals(kind)) {
            return this.getEuroParlAccessor(src);
        }
        if ("wiki".equals(kind)) {
            return this.getWikiAccessor(src);
        }
        if ("crawls".equals(kind)) {
            return this.getCrawlsAccessor(src);
        }
        if ("dev".equals(kind)) {
            return this.getDevAccessor(src);
        }
        if ("test".equals(kind)) {
            return this.getTestAccessor(src);
        }
        LOG.error("Could not find corpus accessor for " + kind);
        return null;
    }

    /**
     * Creates the accessor for the dev corpus from the {@code corpora.dev.*} settings.
     *
     * @param src {@code true} to use {@code SrcName}, otherwise {@code TrgName}
     * @return the accessor
     * @throws Exception if the accessor cannot be created
     */
    protected LexCorpusAccessor getDevAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.dev.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.dev.OneSentPerLine");
        String name = Configurator.CONFIG.getString(src ? "corpora.dev.SrcName" : "corpora.dev.TrgName");
        return new LexCorpusAccessor(name, this.appendSep(path), oneSentPerLine);
    }

    /**
     * Creates the accessor for the test corpus from the {@code corpora.test.*} settings.
     *
     * @param src {@code true} to use {@code SrcName}, otherwise {@code TrgName}
     * @return the accessor
     * @throws Exception if the accessor cannot be created
     */
    protected LexCorpusAccessor getTestAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.test.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.test.OneSentPerLine");
        String name = Configurator.CONFIG.getString(src ? "corpora.test.SrcName" : "corpora.test.TrgName");
        return new LexCorpusAccessor(name, this.appendSep(path), oneSentPerLine);
    }

    /**
     * Creates the accessor for the EuroParl corpus from the {@code corpora.europarl.*}
     * settings. The date bounds are parsed with the pattern {@code yy-MM-dd}.
     *
     * @param src {@code true} to use {@code SrcSubDir}, otherwise {@code TrgSubDir}
     * @return the accessor
     * @throws Exception if a date cannot be parsed or the accessor cannot be created
     */
    protected EuroParlCorpusAccessor getEuroParlAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.europarl.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.europarl.OneSentPerLine");
        String subDir = Configurator.CONFIG.getString(src ? "corpora.europarl.SrcSubDir" : "corpora.europarl.TrgSubDir");
        SimpleDateFormat sdf = new SimpleDateFormat("yy-MM-dd");
        Date fromDate = sdf.parse(Configurator.CONFIG.getString("corpora.europarl.DateFrom"));
        Date toDate = sdf.parse(Configurator.CONFIG.getString("corpora.europarl.DateTo"));
        return new EuroParlCorpusAccessor(this.appendSep(path) + subDir, fromDate, toDate, oneSentPerLine);
    }

    /**
     * Creates the accessor for the crawls corpus from the {@code corpora.crawls.*} settings.
     * The date bounds are parsed with the pattern {@code yy-MM-dd}.
     *
     * @param src {@code true} to use {@code SrcSubDir}, otherwise {@code TrgSubDir}
     * @return the accessor
     * @throws Exception if a date cannot be parsed or the accessor cannot be created
     */
    protected CrawlCorpusAccessor getCrawlsAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.crawls.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.crawls.OneSentPerLine");
        String subDir = Configurator.CONFIG.getString(src ? "corpora.crawls.SrcSubDir" : "corpora.crawls.TrgSubDir");
        SimpleDateFormat sdf = new SimpleDateFormat("yy-MM-dd");
        Date fromDate = sdf.parse(Configurator.CONFIG.getString("corpora.crawls.DateFrom"));
        Date toDate = sdf.parse(Configurator.CONFIG.getString("corpora.crawls.DateTo"));
        return new CrawlCorpusAccessor(this.appendSep(path) + subDir, fromDate, toDate, oneSentPerLine);
    }

    /**
     * Creates the accessor for the wiki corpus from the {@code corpora.wiki.*} settings.
     *
     * @param src {@code true} to use {@code SrcRegExp}, otherwise {@code TrgRegExp}
     * @return the accessor
     */
    protected LexCorpusAccessor getWikiAccessor(boolean src) {
        String path = Configurator.CONFIG.getString("corpora.wiki.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.wiki.OneSentPerLine");
        String fileRegExp = Configurator.CONFIG.getString(src ? "corpora.wiki.SrcRegExp" : "corpora.wiki.TrgRegExp");
        return new LexCorpusAccessor(fileRegExp, this.appendSep(path), oneSentPerLine);
    }

    /**
     * Trims the given path and makes sure a non-empty result ends with
     * {@link File#separator}.
     *
     * @param str path to normalize; may be {@code null}
     * @return {@code null} for {@code null} input, the empty string for blank input,
     *         otherwise the trimmed path ending with the separator
     */
    protected String appendSep(String str) {
        if (str == null) {
            return null;
        }
        String trimmed = str.trim();
        if (trimmed.length() > 0 && !trimmed.endsWith(File.separator)) {
            return trimmed + File.separator;
        }
        return trimmed;
    }
}

