/*
 * Decompiled with CFR 0.152.
 */
package main.lexinduct;

import babel.content.corpora.accessors.CorpusAccessor;
import babel.content.corpora.accessors.CrawlCorpusAccessor;
import babel.content.corpora.accessors.EuroParlCorpusAccessor;
import babel.content.corpora.accessors.LexCorpusAccessor;
import babel.content.eqclasses.EquivalenceClass;
import babel.content.eqclasses.SimpleEquivalenceClass;
import babel.content.eqclasses.collectors.EquivalenceClassCollector;
import babel.content.eqclasses.collectors.SimpleEquivalenceClassCollector;
import babel.content.eqclasses.filters.DictionaryFilter;
import babel.content.eqclasses.filters.EquivalenceClassFilter;
import babel.content.eqclasses.filters.GarbageFilter;
import babel.content.eqclasses.filters.LengthFilter;
import babel.content.eqclasses.filters.NoContextFilter;
import babel.content.eqclasses.filters.NoTimeDistributionFilter;
import babel.content.eqclasses.filters.NumOccurencesFilter;
import babel.content.eqclasses.filters.RomanizationFilter;
import babel.content.eqclasses.properties.context.ContextCollector;
import babel.content.eqclasses.properties.number.Number;
import babel.content.eqclasses.properties.number.NumberCollector;
import babel.content.eqclasses.properties.time.TimeDistribution;
import babel.content.eqclasses.properties.time.TimeDistributionCollector;
import babel.content.eqclasses.properties.type.Type;
import babel.ranking.scorers.Scorer;
import babel.util.config.Configurator;
import babel.util.dict.Dictionary;
import babel.util.dict.SimpleDictionary;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Prepares the inputs for lexicon induction, driven entirely by {@link Configurator#CONFIG}:
 * collects source/target equivalence classes from the configured corpora, builds the context
 * classes, reads the candidate vocabularies, loads the seed dictionary, collects context and
 * time-distribution properties, and finally computes token-count statistics.
 *
 * <p>Call {@link #prepare()} first; the getters return {@code null} / {@code 0.0} until then.
 */
public class InductPreparer {
    protected static final Log LOG = LogFactory.getLog(InductPreparer.class);
    protected static final String DEFAULT_ENCODING = "UTF-8";

    /** Pattern used for the {@code DateFrom}/{@code DateTo} configuration values. */
    private static final String CONFIG_DATE_PATTERN = "yy-MM-dd";

    /** Seed dictionary built by {@link #prepareSeedDictionary(Set, Set)}. */
    protected Dictionary m_seedDict = null;
    /** Source-side context equivalence classes built by {@link #constructEqClasses}. */
    protected Set<EquivalenceClass> m_contextSrcEqs = null;
    /** Target-side context equivalence classes built by {@link #constructEqClasses}. */
    protected Set<EquivalenceClass> m_contextTrgEqs = null;
    /** Source candidate classes (subset of the source context classes). */
    protected Set<EquivalenceClass> m_srcEqs = null;
    /** Target candidate classes (subset of the target context classes). */
    protected Set<EquivalenceClass> m_trgEqs = null;
    /** Sum of the Number property over all source context classes. */
    protected double m_numToksInSrc = 0.0;
    /** Sum of the Number property over all target context classes. */
    protected double m_numToksInTrg = 0.0;
    /** Largest Number property value among source context classes. */
    protected double m_maxTokCountInSrc = 0.0;
    /** Largest Number property value among target context classes. */
    protected double m_maxTokCountInTrg = 0.0;

    /** @return the seed dictionary, or {@code null} before {@link #prepare()} has run */
    public Dictionary getSeedDict() {
        return this.m_seedDict;
    }

    /** @return total count summed over the source context classes */
    public double getNumSrcToks() {
        return this.m_numToksInSrc;
    }

    /** @return total count summed over the target context classes */
    public double getNumTrgToks() {
        return this.m_numToksInTrg;
    }

    /** @return largest single count among the source context classes */
    public double getMaxSrcTokCount() {
        return this.m_maxTokCountInSrc;
    }

    /** @return largest single count among the target context classes */
    public double getMaxTrgTokCount() {
        return this.m_maxTokCountInTrg;
    }

    /** @return the source candidate classes, or {@code null} before {@link #prepare()} has run */
    public Set<EquivalenceClass> getSrcEqsToInduct() {
        return this.m_srcEqs;
    }

    /** @return the target candidate classes, or {@code null} before {@link #prepare()} has run */
    public Set<EquivalenceClass> getTrgEqs() {
        return this.m_trgEqs;
    }

    /**
     * Runs the whole preparation pipeline and populates every field of this object.
     *
     * @throws ClassCastException if a configured context class does not implement/extend
     *         {@link EquivalenceClass}
     * @throws Exception on any configuration, reflection, or I/O failure in a pipeline step
     */
    public void prepare() throws Exception {
        boolean filterRomanSrc = Configurator.CONFIG.containsKey("preprocessing.FilterRomanSrc") && Configurator.CONFIG.getBoolean("preprocessing.FilterRomanSrc");
        boolean filterRomanTrg = Configurator.CONFIG.containsKey("preprocessing.FilterRomanTrg") && Configurator.CONFIG.getBoolean("preprocessing.FilterRomanTrg");
        String srcContEqClassName = Configurator.CONFIG.getString("preprocessing.context.SrcEqClass");
        String trgContEqClassName = Configurator.CONFIG.getString("preprocessing.context.TrgEqClass");
        boolean alignDistros = Configurator.CONFIG.getBoolean("preprocessing.time.Align");

        // asSubclass yields a properly typed Class token (a bare Class<?> cannot be passed to
        // constructEqClasses) and rejects a misconfigured class name up front.
        Class<? extends EquivalenceClass> srcContClassClass = Class.forName(srcContEqClassName).asSubclass(EquivalenceClass.class);
        Class<? extends EquivalenceClass> trgContClassClass = Class.forName(trgContEqClassName).asSubclass(EquivalenceClass.class);

        LOG.info(" - Collecting from scratch ...");
        Set<EquivalenceClass> allSrcEqs = this.collectInitEqClasses(true, filterRomanSrc);
        Set<EquivalenceClass> allTrgEqs = this.collectInitEqClasses(false, filterRomanTrg);
        LOG.info(" - All source types: " + allSrcEqs.size() + (filterRomanSrc ? " (without romanization) " : ""));
        LOG.info(" - All target types: " + allTrgEqs.size() + (filterRomanTrg ? " (without romanization) " : ""));

        LOG.info(" - Constructing context classes...");
        this.m_contextSrcEqs = this.constructEqClasses(true, allSrcEqs, srcContClassClass);
        this.m_contextTrgEqs = this.constructEqClasses(false, allTrgEqs, trgContClassClass);
        LOG.info(" - Context source classes: " + this.m_contextSrcEqs.size());
        LOG.info(" - Context target classes: " + this.m_contextTrgEqs.size());

        LOG.info(" - Reading source and target candidates...");
        this.m_srcEqs = this.prepareCandidates(true, this.m_contextSrcEqs);
        this.m_trgEqs = this.prepareCandidates(false, this.m_contextTrgEqs);
        LOG.info(" - Source candidates: " + this.m_srcEqs.size());
        LOG.info(" - Target candidates: " + this.m_trgEqs.size());

        this.prepareSeedDictionary(this.m_contextSrcEqs, this.m_contextTrgEqs);

        LOG.info(" - Collecting properties...");
        Set<Integer> srcBins = this.collectProps(true, this.m_srcEqs, this.m_contextSrcEqs, this.m_seedDict);
        Set<Integer> trgBins = this.collectProps(false, this.m_trgEqs, this.m_contextTrgEqs, this.m_seedDict);
        if (alignDistros) {
            LOG.info(" - Aligning temporal distributions...");
            this.alignDistributions(srcBins, trgBins, this.m_srcEqs, this.m_trgEqs);
        }

        LOG.info(" - Cleaning up candidate classes...");
        this.m_srcEqs = this.cleanUpEqClasses(this.m_srcEqs, true);
        this.m_trgEqs = this.cleanUpEqClasses(this.m_trgEqs, false);
        LOG.info(" - Source candidates: " + this.m_srcEqs.size());
        LOG.info(" - Target candidates: " + this.m_trgEqs.size());

        this.collectTokenCounts(this.m_contextSrcEqs, this.m_contextTrgEqs);
    }

    /**
     * Reads the induction vocabulary file and returns the context classes it mentions.
     *
     * <p>The file is located at {@code resources.inductvocab.Path} concatenated with
     * {@code resources.inductvocab.SrcName} or {@code resources.inductvocab.TrgName}. Only the first
     * whitespace-delimited token of each line is looked up; lines whose token matches no word
     * of any class in {@code contEqs} are ignored.
     *
     * @param src     {@code true} for the source vocabulary, {@code false} for the target one
     * @param contEqs context classes whose words are matched against the vocabulary
     * @return the classes from {@code contEqs} that contain at least one listed word
     * @throws Exception if the vocabulary file cannot be opened or read
     */
    protected HashSet<EquivalenceClass> prepareCandidates(boolean src, Set<EquivalenceClass> contEqs) throws Exception {
        String dictDir = Configurator.CONFIG.getString("resources.inductvocab.Path");
        String fileName = src ? Configurator.CONFIG.getString("resources.inductvocab.SrcName") : Configurator.CONFIG.getString("resources.inductvocab.TrgName");
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(dictDir + fileName), DEFAULT_ENCODING));
        try {
            HashSet<EquivalenceClass> eqSet = new HashSet<EquivalenceClass>();
            HashMap<String, EquivalenceClass> eqMap = new HashMap<String, EquivalenceClass>();
            LOG.info("Reading " + (src ? "source" : "target") + " candidates ...");

            // Index every word of every context class so each vocabulary line is one lookup.
            for (EquivalenceClass eq : contEqs) {
                for (String sWord : eq.getAllWords()) {
                    assert (!eqMap.containsKey(sWord));
                    eqMap.put(sWord, eq);
                }
            }

            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = line.split("\\s");
                // split() drops trailing empty strings, so a whitespace-only line yields an
                // empty array; indexing [0] there would throw.
                if (tokens.length == 0) {
                    continue;
                }
                EquivalenceClass foundEq = eqMap.get(tokens[0]);
                if (foundEq != null) {
                    eqSet.add(foundEq);
                }
            }
            return eqSet;
        } finally {
            // Close even when reading fails so the file handle is not leaked.
            reader.close();
        }
    }

    /**
     * Filters candidate classes through a {@link NoContextFilter} and a
     * {@link NoTimeDistributionFilter}.
     *
     * @param eqClasses candidate classes to filter
     * @param src       only used to label the log message
     * @return the result of {@link EquivalenceClassCollector#filter} with those two filters
     * @throws Exception if filtering fails
     */
    protected Set<EquivalenceClass> cleanUpEqClasses(Set<EquivalenceClass> eqClasses, boolean src) throws Exception {
        LOG.info("Throwing out " + (src ? "source" : "target") + " candidate classes without context or time properties...");
        LinkedList<EquivalenceClassFilter> filters = new LinkedList<EquivalenceClassFilter>();
        filters.add(new NoContextFilter());
        filters.add(new NoTimeDistributionFilter());
        return EquivalenceClassCollector.filter(eqClasses, filters);
    }

    /**
     * Loads the seed dictionary and stores it in {@link #m_seedDict}.
     *
     * <p>If {@code resources.dictionary.Dictionary} is configured, a single dictionary file is read;
     * otherwise the dictionary is assembled from the {@code SrcName}/{@code TrgName} halves. The
     * raw dictionary is pruned with {@code experiments.DictionaryPruneNumTranslations} (or
     * {@code -1} when that key is absent) before being wrapped around the given classes.
     *
     * @param srcContEqs source context classes the dictionary is built over
     * @param trgContEqs target context classes the dictionary is built over
     * @throws Exception if the dictionary cannot be read or constructed
     */
    protected void prepareSeedDictionary(Set<EquivalenceClass> srcContEqs, Set<EquivalenceClass> trgContEqs) throws Exception {
        String dictDir = Configurator.CONFIG.getString("resources.dictionary.Path");
        int ridDictNumTrans = Configurator.CONFIG.containsKey("experiments.DictionaryPruneNumTranslations") ? Configurator.CONFIG.getInt("experiments.DictionaryPruneNumTranslations") : -1;
        LOG.info("Reading/preparing seed dictionary ...");

        SimpleDictionary simpSeedDict;
        if (Configurator.CONFIG.containsKey("resources.dictionary.Dictionary")) {
            String dictFileName = Configurator.CONFIG.getString("resources.dictionary.Dictionary");
            simpSeedDict = new SimpleDictionary(dictDir + dictFileName, "SeedDictionary");
        } else {
            String srcDictFileName = Configurator.CONFIG.getString("resources.dictionary.SrcName");
            String trgDictFileName = Configurator.CONFIG.getString("resources.dictionary.TrgName");
            simpSeedDict = new SimpleDictionary(new SimpleDictionary.DictHalves(dictDir + srcDictFileName, dictDir + trgDictFileName), "SeedDictionary");
        }
        simpSeedDict.pruneCounts(ridDictNumTrans);

        this.m_seedDict = new Dictionary(srcContEqs, trgContEqs, simpSeedDict, "SeedDictionary");
        LOG.info("Seed dictionary: " + this.m_seedDict.toString());
    }

    /**
     * Restricts source and target time distributions to the bins both sides have in common.
     *
     * <p>Bins present only in {@code srcBins} are removed from every source class, and bins
     * present only in {@code trgBins} from every target class. Classes without a
     * {@link TimeDistribution} property are skipped. The bin sets themselves are not modified.
     *
     * @param srcBins bins collected on the source side
     * @param trgBins bins collected on the target side
     * @param srcEqs  source classes whose distributions are trimmed
     * @param trgEqs  target classes whose distributions are trimmed
     */
    protected void alignDistributions(Set<Integer> srcBins, Set<Integer> trgBins, Set<EquivalenceClass> srcEqs, Set<EquivalenceClass> trgEqs) {
        HashSet<Integer> srcOnly = new HashSet<Integer>(srcBins);
        srcOnly.removeAll(trgBins);
        removeBinsFromAll(srcEqs, srcOnly);

        HashSet<Integer> trgOnly = new HashSet<Integer>(trgBins);
        trgOnly.removeAll(srcBins);
        removeBinsFromAll(trgEqs, trgOnly);

        HashSet<Integer> common = new HashSet<Integer>(srcBins);
        common.retainAll(trgBins);

        LOG.info("There are " + srcBins.size() + " days in src distributions.");
        LOG.info("There are " + trgBins.size() + " days in trg distributions.");
        LOG.info("There are " + common.size() + " common days between src and trg distributions.");
    }

    /** Removes {@code bins} from the time distribution of every class that has one. */
    private static void removeBinsFromAll(Set<EquivalenceClass> eqs, HashSet<Integer> bins) {
        for (EquivalenceClass eq : eqs) {
            TimeDistribution timeProp = (TimeDistribution)eq.getProperty(TimeDistribution.class.getName());
            if (timeProp != null) {
                timeProp.removeBins(bins);
            }
        }
    }

    /**
     * Collects the initial equivalence classes from the corpus named by
     * {@code preprocessing.input.Context}, attaches counts via a {@link NumberCollector}, and tags
     * each class with the source/target {@link Type}.
     *
     * @param src         {@code true} for the source side, {@code false} for the target side
     * @param filterRoman whether to add a {@link RomanizationFilter} to the collection filters
     * @return the collected classes
     * @throws Exception if the corpus cannot be accessed or collection fails
     */
    protected Set<EquivalenceClass> collectInitEqClasses(boolean src, boolean filterRoman) throws Exception {
        ArrayList<EquivalenceClassFilter> filters = new ArrayList<EquivalenceClassFilter>(3);
        filters.add(new GarbageFilter());
        filters.add(new LengthFilter(2));
        if (filterRoman) {
            filters.add(new RomanizationFilter());
        }

        CorpusAccessor accessor = this.getAccessor(Configurator.CONFIG.getString("preprocessing.input.Context"), src);
        SimpleEquivalenceClassCollector collector = new SimpleEquivalenceClassCollector(filters, true);
        Set<EquivalenceClass> eqClasses = collector.collect(accessor.getCorpusReader(), -1);
        new NumberCollector(true).collectProperty(accessor, eqClasses);
        this.assignTypeProp(eqClasses, src ? Type.EqType.SOURCE : Type.EqType.TARGET);
        return eqClasses;
    }

    /**
     * Re-groups simple classes into classes of type {@code eqClassClass}, keyed by stem.
     *
     * <p>For each input class a new instance is initialised from its word. The first instance seen
     * for a stem is kept (with an id, the input's count, and a source/target {@link Type});
     * later instances with the same stem are merged into it and their counts added.
     *
     * @param src          whether the resulting classes are typed SOURCE ({@code true}) or TARGET
     * @param allEqs       input classes; each is cast to {@link SimpleEquivalenceClass} and is
     *                     expected to carry a {@link Number} property
     * @param eqClassClass class to instantiate through its no-argument constructor
     * @return one class per distinct stem
     * @throws Exception if {@code eqClassClass} cannot be instantiated
     */
    protected Set<EquivalenceClass> constructEqClasses(boolean src, Set<EquivalenceClass> allEqs, Class<? extends EquivalenceClass> eqClassClass) throws Exception {
        HashMap<String, EquivalenceClass> eqsMap = new HashMap<String, EquivalenceClass>();
        for (EquivalenceClass eq : allEqs) {
            String newWord = ((SimpleEquivalenceClass)eq).getWord();
            long newCount = ((Number)eq.getProperty(Number.class.getName())).getNumber();

            // Class.newInstance() is deprecated; go through the no-arg constructor explicitly.
            EquivalenceClass newEq = eqClassClass.getDeclaredConstructor().newInstance();
            newEq.init(newWord, false);

            EquivalenceClass foundEq = eqsMap.get(newEq.getStem());
            if (foundEq != null) {
                foundEq.merge(newEq);
                ((Number)foundEq.getProperty(Number.class.getName())).increment(newCount);
            } else {
                newEq.assignId();
                newEq.setProperty(new Number(newCount));
                newEq.setProperty(new Type(src ? Type.EqType.SOURCE : Type.EqType.TARGET));
                eqsMap.put(newEq.getStem(), newEq);
            }
        }
        return new HashSet<EquivalenceClass>(eqsMap.values());
    }

    /**
     * Collects context and time-distribution properties for the candidate classes.
     *
     * <p>The context vocabulary is first narrowed with a {@link DictionaryFilter} over
     * {@code contextDict} and two {@link NumOccurencesFilter}s configured from
     * {@code preprocessing.context.PruneEqIfOccursFewerThan} / {@code ...MoreThan}. Context is then
     * collected from the {@code preprocessing.input.Context} corpus with a symmetric window of
     * {@code preprocessing.context.Window}, and time distributions from the
     * {@code preprocessing.input.Time} corpus.
     *
     * @param src         {@code true} for the source side, {@code false} for the target side
     * @param eqClasses   candidate classes that receive the properties
     * @param contextEqs  full set of context classes (not modified; a filtered copy is used)
     * @param contextDict dictionary used to filter the context classes
     * @return the bins reported by the {@link TimeDistributionCollector}
     * @throws Exception if a corpus cannot be accessed or collection fails
     */
    protected Set<Integer> collectProps(boolean src, Set<EquivalenceClass> eqClasses, Set<EquivalenceClass> contextEqs, Dictionary contextDict) throws Exception {
        int pruneContEqIfOccursFewerThan = Configurator.CONFIG.getInt("preprocessing.context.PruneEqIfOccursFewerThan");
        int pruneContEqIfOccursMoreThan = Configurator.CONFIG.getInt("preprocessing.context.PruneEqIfOccursMoreThan");
        int contextWindowSize = Configurator.CONFIG.getInt("preprocessing.context.Window");

        LOG.info("Preparing contextual words for " + (src ? "source" : "target") + ": keeping those in dict [" + contextDict.toString() + "] and occuring (" + pruneContEqIfOccursFewerThan + "," + pruneContEqIfOccursMoreThan + ") times...");
        LinkedList<EquivalenceClassFilter> filters = new LinkedList<EquivalenceClassFilter>();
        filters.add(new DictionaryFilter(contextDict, true, src));
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursFewerThan, true));
        filters.add(new NumOccurencesFilter(pruneContEqIfOccursMoreThan, false));
        Set<EquivalenceClass> filtContextEqs = EquivalenceClassCollector.filter(new HashSet<EquivalenceClass>(contextEqs), filters);
        LOG.info("Context " + (src ? "source" : "target") + " classes: " + filtContextEqs.size());

        // Use the filtered context vocabulary: previously the unfiltered set was passed here,
        // which silently discarded the dictionary/frequency pruning performed just above.
        CorpusAccessor contextAccessor = this.getAccessor(Configurator.CONFIG.getString("preprocessing.input.Context"), src);
        new ContextCollector(true, contextWindowSize, contextWindowSize, filtContextEqs).collectProperty(contextAccessor, eqClasses);

        CorpusAccessor timeAccessor = this.getAccessor(Configurator.CONFIG.getString("preprocessing.input.Time"), src);
        TimeDistributionCollector distCollector = new TimeDistributionCollector(true);
        distCollector.collectProperty(timeAccessor, eqClasses);

        this.assignTypeProp(eqClasses, src ? Type.EqType.SOURCE : Type.EqType.TARGET);
        return distCollector.binsCollected();
    }

    /**
     * Recomputes the maximum and total count fields from the {@link Number} property of the
     * given classes. Classes without that property are ignored.
     *
     * @param srcEqs source classes to aggregate
     * @param trgEqs target classes to aggregate
     */
    protected void collectTokenCounts(Set<? extends EquivalenceClass> srcEqs, Set<? extends EquivalenceClass> trgEqs) {
        double[] srcStats = maxAndTotalCount(srcEqs);
        double[] trgStats = maxAndTotalCount(trgEqs);

        this.m_maxTokCountInSrc = srcStats[0];
        this.m_numToksInSrc = srcStats[1];
        this.m_maxTokCountInTrg = trgStats[0];
        this.m_numToksInTrg = trgStats[1];

        LOG.info("Maximum occurrences: src = " + this.m_maxTokCountInSrc + ", trg = " + this.m_maxTokCountInTrg + ".");
        LOG.info("Total Counts: src = " + this.m_numToksInSrc + ", trg = " + this.m_numToksInTrg + ".");
    }

    /** Returns {@code {max, total}} of the Number property over {@code eqs}; both start at 0. */
    private static double[] maxAndTotalCount(Set<? extends EquivalenceClass> eqs) {
        double max = 0.0;
        double total = 0.0;
        for (EquivalenceClass eq : eqs) {
            Number count = (Number)eq.getProperty(Number.class.getName());
            if (count == null) {
                continue;
            }
            double value = (double)count.getNumber();
            if (value > max) {
                max = value;
            }
            total += value;
        }
        return new double[]{max, total};
    }

    /**
     * Sets one shared {@link Type} property instance on every class in the set.
     *
     * @param eqClasses classes to tag
     * @param type      the type to assign
     */
    protected void assignTypeProp(Set<? extends EquivalenceClass> eqClasses, Type.EqType type) {
        Type commonType = new Type(type);
        for (EquivalenceClass eq : eqClasses) {
            eq.setProperty(commonType);
        }
    }

    /**
     * Calls {@code prepare} on both scorers for every class in {@code eqs}.
     *
     * @param src           only used to label the log message
     * @param eqs           classes to prepare
     * @param contextScorer scorer prepared first for each class
     * @param timeScorer    scorer prepared second for each class
     */
    public void prepareProperties(boolean src, Set<? extends EquivalenceClass> eqs, Scorer contextScorer, Scorer timeScorer) {
        LOG.info("Projecting and scoring " + (src ? "source" : "target") + " contextual items with " + contextScorer.toString() + " and time distributions with " + timeScorer.toString() + "...");
        for (EquivalenceClass eq : eqs) {
            contextScorer.prepare(eq);
            timeScorer.prepare(eq);
        }
    }

    /**
     * Creates the corpus accessor for the given corpus kind.
     *
     * @param kind one of {@code "europarl"}, {@code "wiki"}, {@code "crawls"}, {@code "dev"},
     *             {@code "test"}
     * @param src  {@code true} for the source side, {@code false} for the target side
     * @return a new accessor; never {@code null}
     * @throws IllegalArgumentException if {@code kind} is not one of the supported values
     * @throws Exception if the accessor cannot be constructed from the configuration
     */
    protected CorpusAccessor getAccessor(String kind, boolean src) throws Exception {
        if ("europarl".equals(kind)) {
            return this.getEuroParlAccessor(src);
        }
        if ("wiki".equals(kind)) {
            return this.getWikiAccessor(src);
        }
        if ("crawls".equals(kind)) {
            return this.getCrawlsAccessor(src);
        }
        if ("dev".equals(kind)) {
            return this.getDevAccessor(src);
        }
        if ("test".equals(kind)) {
            return this.getTestAccessor(src);
        }
        // Fail here with a clear message rather than returning null, which callers
        // dereference immediately and would surface as an unexplained NullPointerException.
        String message = "Could not find corpus accessor for " + kind;
        LOG.error(message);
        throw new IllegalArgumentException(message);
    }

    /**
     * Builds the accessor for the development corpus from the {@code corpora.dev.*} settings.
     *
     * @param src {@code true} to use {@code SrcName}, {@code false} to use {@code TrgName}
     * @throws Exception if the accessor cannot be constructed
     */
    protected LexCorpusAccessor getDevAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.dev.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.dev.OneSentPerLine");
        String name = src ? Configurator.CONFIG.getString("corpora.dev.SrcName") : Configurator.CONFIG.getString("corpora.dev.TrgName");
        return new LexCorpusAccessor(name, this.appendSep(path), oneSentPerLine);
    }

    /**
     * Builds the accessor for the test corpus from the {@code corpora.test.*} settings.
     *
     * @param src {@code true} to use {@code SrcName}, {@code false} to use {@code TrgName}
     * @throws Exception if the accessor cannot be constructed
     */
    protected LexCorpusAccessor getTestAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.test.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.test.OneSentPerLine");
        String name = src ? Configurator.CONFIG.getString("corpora.test.SrcName") : Configurator.CONFIG.getString("corpora.test.TrgName");
        return new LexCorpusAccessor(name, this.appendSep(path), oneSentPerLine);
    }

    /**
     * Builds the EuroParl accessor from the {@code corpora.europarl.*} settings.
     *
     * @param src {@code true} to use {@code SrcSubDir}, {@code false} to use {@code TrgSubDir}
     * @throws Exception if a configured date does not parse as {@code yy-MM-dd} or the accessor
     *         cannot be constructed
     */
    protected EuroParlCorpusAccessor getEuroParlAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.europarl.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.europarl.OneSentPerLine");
        String subDir = src ? Configurator.CONFIG.getString("corpora.europarl.SrcSubDir") : Configurator.CONFIG.getString("corpora.europarl.TrgSubDir");
        Date fromDate = parseConfigDate("corpora.europarl.DateFrom");
        Date toDate = parseConfigDate("corpora.europarl.DateTo");
        return new EuroParlCorpusAccessor(this.appendSep(path) + subDir, fromDate, toDate, oneSentPerLine);
    }

    /**
     * Builds the crawls accessor from the {@code corpora.crawls.*} settings.
     *
     * @param src {@code true} to use {@code SrcSubDir}, {@code false} to use {@code TrgSubDir}
     * @throws Exception if a configured date does not parse as {@code yy-MM-dd} or the accessor
     *         cannot be constructed
     */
    protected CrawlCorpusAccessor getCrawlsAccessor(boolean src) throws Exception {
        String path = Configurator.CONFIG.getString("corpora.crawls.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.crawls.OneSentPerLine");
        String subDir = src ? Configurator.CONFIG.getString("corpora.crawls.SrcSubDir") : Configurator.CONFIG.getString("corpora.crawls.TrgSubDir");
        Date fromDate = parseConfigDate("corpora.crawls.DateFrom");
        Date toDate = parseConfigDate("corpora.crawls.DateTo");
        return new CrawlCorpusAccessor(this.appendSep(path) + subDir, fromDate, toDate, oneSentPerLine);
    }

    /**
     * Builds the wiki accessor from the {@code corpora.wiki.*} settings.
     *
     * @param src {@code true} to use {@code SrcRegExp}, {@code false} to use {@code TrgRegExp}
     */
    protected LexCorpusAccessor getWikiAccessor(boolean src) {
        String path = Configurator.CONFIG.getString("corpora.wiki.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.wiki.OneSentPerLine");
        String fileRegExp = src ? Configurator.CONFIG.getString("corpora.wiki.SrcRegExp") : Configurator.CONFIG.getString("corpora.wiki.TrgRegExp");
        return new LexCorpusAccessor(fileRegExp, this.appendSep(path), oneSentPerLine);
    }

    /**
     * Parses the configuration value under {@code key} as a {@code yy-MM-dd} date. A fresh
     * formatter is created per call because {@link SimpleDateFormat} is not thread-safe.
     */
    private static Date parseConfigDate(String key) throws Exception {
        return new SimpleDateFormat(CONFIG_DATE_PATTERN).parse(Configurator.CONFIG.getString(key));
    }

    /**
     * Trims {@code str} and makes sure a non-empty result ends with {@link File#separator}.
     *
     * @param str path to normalise; may be {@code null}
     * @return {@code null} for {@code null} input, the trimmed string unchanged if it is empty or
     *         already ends with the separator, otherwise the trimmed string plus the separator
     */
    protected String appendSep(String str) {
        if (str == null) {
            return null;
        }
        String ret = str.trim();
        if (ret.length() > 0 && !ret.endsWith(File.separator)) {
            ret = ret + File.separator;
        }
        return ret;
    }
}

