/*
 * Decompiled with CFR 0.152.
 */
package main.phrases;

import babel.content.corpora.accessors.CorpusAccessor;
import babel.content.corpora.accessors.CrawlCorpusAccessor;
import babel.content.corpora.accessors.EuroParlCorpusAccessor;
import babel.content.corpora.accessors.LexCorpusAccessor;
import babel.content.eqclasses.EquivalenceClass;
import babel.content.eqclasses.SimpleEquivalenceClass;
import babel.content.eqclasses.collectors.EquivalenceClassCollector;
import babel.content.eqclasses.collectors.SimpleEquivalenceClassCollector;
import babel.content.eqclasses.comparators.LexComparator;
import babel.content.eqclasses.filters.DictionaryFilter;
import babel.content.eqclasses.filters.EquivalenceClassFilter;
import babel.content.eqclasses.filters.GarbageFilter;
import babel.content.eqclasses.filters.LengthFilter;
import babel.content.eqclasses.filters.NumOccurencesFilter;
import babel.content.eqclasses.filters.RomanizationFilter;
import babel.content.eqclasses.phrases.Phrase;
import babel.content.eqclasses.properties.context.PhraseContextCollector;
import babel.content.eqclasses.properties.lshcontext.LSHContext;
import babel.content.eqclasses.properties.lshcontext.LSHContextCollector;
import babel.content.eqclasses.properties.lshtime.LSHTimeDistribution;
import babel.content.eqclasses.properties.lshtime.LSHTimeDistributionCollector;
import babel.content.eqclasses.properties.number.Number;
import babel.content.eqclasses.properties.number.NumberCollector;
import babel.content.eqclasses.properties.number.PhraseNumberCollector;
import babel.content.eqclasses.properties.time.PhraseTimeDistributionCollector;
import babel.content.eqclasses.properties.type.Type;
import babel.ranking.scorers.Scorer;
import babel.util.config.Configurator;
import babel.util.dict.Dictionary;
import babel.util.dict.SimpleDictionary;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class BenPhrasePreparer {
    /** Logger shared by all instances of this class. */
    protected static final Log LOG = LogFactory.getLog(BenPhrasePreparer.class);
    /** Character set used when reading the phrase file and writing the phrase text output. */
    protected static final String DEFAULT_CHARSET = "UTF-8";
    /** Field delimiter constant; not referenced by any method visible in this class. */
    protected static final String FIELD_DELIM = " ||| ";
    /** Name of the phrase file currently being read chunk by chunk; reset to "" once the file is exhausted. */
    protected String m_phraseFileName = "";
    /** Reader over the file named by m_phraseFileName; created lazily by readPhraseFileChunk. */
    protected BufferedReader m_phraseFileReader;
    /** Largest phrase count found by prepareForChunkCollection (0 until it has run). */
    protected long m_maxPhrCount = 0L;
    /** Seed dictionary built by prepareSeedDictionary; null until then. */
    protected Dictionary m_seedDict = null;
    /** Source-side contextual equivalence classes; set by collectContextEqs and replaced by filterContextEqs. */
    protected Set<EquivalenceClass> m_contextSrcEqs = null;
    /** Target-side contextual equivalence classes; set by collectContextEqs and replaced by filterContextEqs. */
    protected Set<EquivalenceClass> m_contextTrgEqs = null;
    /** Largest occurrence count among source context classes, computed in collectContextEqs before filtering. */
    protected long m_maxTokCountInSrc = 0L;
    /** Largest occurrence count among target context classes, computed in collectContextEqs before filtering. */
    protected long m_maxTokCountInTrg = 0L;
    /** Text output for phrase stems; opened by openFiles, written by saveChunk, closed by closeFiles. */
    protected BufferedWriter m_phraseStrWriter;
    /** Binary output for context signatures; same lifecycle as m_phraseStrWriter. */
    protected BufferedOutputStream m_binContextStream;
    /** Binary output for time signatures; same lifecycle as m_phraseStrWriter. */
    protected BufferedOutputStream m_binTimeStream;

    /**
     * Returns the seed dictionary held by this preparer.
     *
     * @return the current value of {@code m_seedDict}; {@code null} until
     *         {@code prepareSeedDictionary} has assigned it
     */
    public Dictionary getSeedDict() {
        return m_seedDict;
    }

    /**
     * Returns the largest occurrence count recorded for the source context classes.
     *
     * @return the current value of {@code m_maxTokCountInSrc} (0 until
     *         {@code collectContextEqs()} has run)
     */
    public long getMaxSrcTokCount() {
        return m_maxTokCountInSrc;
    }

    /**
     * Returns the largest occurrence count recorded for the target context classes.
     *
     * @return the current value of {@code m_maxTokCountInTrg} (0 until
     *         {@code collectContextEqs()} has run)
     */
    public long getMaxTrgTokCount() {
        return m_maxTokCountInTrg;
    }

    /**
     * Returns the largest phrase count found by the last call to
     * {@code prepareForChunkCollection}.
     *
     * @return the current value of {@code m_maxPhrCount} (0 if never computed)
     */
    public long getMaxPhrCount() {
        return m_maxPhrCount;
    }

    /**
     * Opens (truncating any existing content) the three output files that
     * {@code saveChunk} writes to: the UTF-8 phrase text file and the two
     * binary signature files.
     *
     * <p>The three streams are opened into locals first and only published to
     * the instance fields once all of them have been opened. If any open
     * fails, the streams that were already opened are closed again so no file
     * handle is leaked, and the original exception is rethrown.
     *
     * @param phrasesFile    path of the phrase text output file
     * @param contextSigFile path of the binary context signature output file
     * @param timeSigFile    path of the binary time signature output file
     * @throws Exception if any of the files cannot be opened for writing
     */
    public void openFiles(String phrasesFile, String contextSigFile, String timeSigFile) throws Exception {
        BufferedWriter phraseWriter = null;
        BufferedOutputStream contextStream = null;
        BufferedOutputStream timeStream = null;
        try {
            phraseWriter = new BufferedWriter(new OutputStreamWriter((OutputStream)new FileOutputStream(phrasesFile, false), DEFAULT_CHARSET));
            contextStream = new BufferedOutputStream(new FileOutputStream(contextSigFile, false));
            timeStream = new BufferedOutputStream(new FileOutputStream(timeSigFile, false));
        } catch (Exception e) {
            // Release whatever was opened before the failure; the original
            // exception is the one the caller needs to see.
            if (contextStream != null) {
                try {
                    contextStream.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; the open failure is reported instead
                }
            }
            if (phraseWriter != null) {
                try {
                    phraseWriter.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; the open failure is reported instead
                }
            }
            throw e;
        }
        this.m_phraseStrWriter = phraseWriter;
        this.m_binContextStream = contextStream;
        this.m_binTimeStream = timeStream;
    }

    /**
     * Closes the three output streams opened by {@code openFiles}.
     *
     * <p>Every stream is closed even if closing an earlier one throws, so a
     * failure on one file cannot leak the handles of the others. Streams that
     * were never opened ({@code null}) are skipped. If more than one close
     * fails, the exception from the last failing close is the one propagated.
     *
     * @throws Exception if closing any of the streams fails
     */
    public void closeFiles() throws Exception {
        try {
            if (this.m_phraseStrWriter != null) {
                this.m_phraseStrWriter.close();
            }
        } finally {
            try {
                if (this.m_binContextStream != null) {
                    this.m_binContextStream.close();
                }
            } finally {
                if (this.m_binTimeStream != null) {
                    this.m_binTimeStream.close();
                }
            }
        }
    }

    /**
     * Writes one chunk of phrases to the three output files.
     *
     * <p>The phrases are copied into a list and sorted with
     * {@code new LexComparator(true)}. For each phrase, in that order, the stem
     * followed by a newline goes to the phrase text writer, and the signatures
     * of its {@code LSHContext} and {@code LSHTimeDistribution} properties go
     * to the respective binary streams. All three outputs are flushed after
     * every phrase.
     *
     * @param phrases phrases to write; each is expected to carry both LSH properties
     * @throws Exception if writing or flushing fails
     */
    public void saveChunk(Set<Phrase> phrases) throws Exception {
        ArrayList<Phrase> ordered = new ArrayList<Phrase>(phrases);
        Collections.sort(ordered, new LexComparator(true));

        String contextKey = LSHContext.class.getName();
        String timeKey = LSHTimeDistribution.class.getName();

        for (Phrase current : ordered) {
            LSHContext contextSig = (LSHContext)current.getProperty(contextKey);
            LSHTimeDistribution timeSig = (LSHTimeDistribution)current.getProperty(timeKey);

            m_phraseStrWriter.write(current.getStem() + "\n");
            m_binContextStream.write(contextSig.getSignature());
            m_binTimeStream.write(timeSig.getSignature());

            // Flush per phrase so the three files advance together.
            m_phraseStrWriter.flush();
            m_binContextStream.flush();
            m_binTimeStream.flush();
        }
    }

    /**
     * Collects context and time properties for a chunk of phrases, using the
     * configured maximum phrase length and context window.
     *
     * <p>Delegates to the six-argument overload with the source or target
     * context classes (depending on {@code src}) and with case sensitivity
     * fixed to {@code true}.
     *
     * @param src   {@code true} for the source side, {@code false} for the target side
     * @param chunk phrases to annotate
     * @throws Exception if property collection fails
     */
    public void collectContextAndTimeProps(boolean src, Set<Phrase> chunk) throws Exception {
        String side = src ? " source " : " target ";
        LOG.info(" - Collecting context and time phrase properties for " + chunk.size() + side + "phrases ...");

        int maxPhraseLength = Configurator.CONFIG.getInt("preprocessing.phrases.MaxPhraseLength");
        int windowSize = Configurator.CONFIG.getInt("preprocessing.context.Window");

        Set<EquivalenceClass> contextEqs;
        if (src) {
            contextEqs = m_contextSrcEqs;
        } else {
            contextEqs = m_contextTrgEqs;
        }
        collectContextAndTimeProps(src, chunk, maxPhraseLength, contextEqs, windowSize, true);
    }

    /**
     * Collects context and time-distribution properties for the given phrases.
     *
     * <p>Context is collected from the corpus named by
     * {@code preprocessing.input.Context}, using the same window size on both
     * sides of the phrase; time distributions are collected from the corpus
     * named by {@code preprocessing.input.Time}.
     *
     * @param src               {@code true} for the source side, {@code false} for the target side
     * @param phrases           phrases to annotate
     * @param maxPhraseLength   maximum phrase length passed to both collectors
     * @param contextEqs        contextual equivalence classes passed to the context collector
     * @param contextWindowSize window size, used for both window arguments of the context collector
     * @param caseSensitive     case-sensitivity flag passed to both collectors
     * @throws Exception if an accessor cannot be created or collection fails
     */
    protected void collectContextAndTimeProps(boolean src, Set<Phrase> phrases, int maxPhraseLength, Set<EquivalenceClass> contextEqs, int contextWindowSize, boolean caseSensitive) throws Exception {
        // Context pass.
        String contextKind = Configurator.CONFIG.getString("preprocessing.input.Context");
        CorpusAccessor contextAccessor = getAccessor(contextKind, src);
        PhraseContextCollector contextCollector = new PhraseContextCollector(maxPhraseLength, caseSensitive, contextWindowSize, contextWindowSize, contextEqs);
        contextCollector.collectProperty(contextAccessor, phrases);

        // Time-distribution pass.
        String timeKind = Configurator.CONFIG.getString("preprocessing.input.Time");
        CorpusAccessor timeAccessor = getAccessor(timeKind, src);
        new PhraseTimeDistributionCollector(maxPhraseLength, caseSensitive).collectProperty(timeAccessor, phrases);
    }

    /**
     * Reads the next chunk of phrases from the phrase file and annotates it
     * with number and type properties.
     *
     * @param src       {@code true} for the source side, {@code false} for the target side
     * @param fileName  phrase file to read from
     * @param chunkSize maximum number of lines to read; passed through to
     *                  {@code readPhraseFileChunk}
     * @return the phrases read; empty once the file is exhausted
     * @throws Exception if reading or property collection fails
     */
    public Set<Phrase> prepareNextChunk(boolean src, String fileName, int chunkSize) throws Exception {
        boolean caseSensitive = Configurator.CONFIG.getBoolean("preprocessing.phrases.CaseSensitive");
        int maxPhraseLength = Configurator.CONFIG.getInt("preprocessing.phrases.MaxPhraseLength");

        Set<Phrase> phrases = readPhraseFileChunk(src, fileName, chunkSize, caseSensitive);
        collectNumberProps(src, phrases, maxPhraseLength, caseSensitive);

        Type.EqType side;
        if (src) {
            side = Type.EqType.SOURCE;
        } else {
            side = Type.EqType.TARGET;
        }
        assignTypeProp(phrases, side);
        return phrases;
    }

    /**
     * Prepares every equivalence class with both scorers and then collects the
     * LSH context and LSH time-distribution properties for the whole set.
     *
     * @param src           {@code true} for the source side, {@code false} for the target side
     *                      (only affects log messages here)
     * @param eqs           equivalence classes to prepare
     * @param contextScorer scorer whose {@code prepare} is called on each class first
     * @param timeScorer    scorer whose {@code prepare} is called on each class second
     * @throws Exception if preparation or collection fails
     */
    public void prepareContextAndTimeProps(boolean src, Set<? extends EquivalenceClass> eqs, Scorer contextScorer, Scorer timeScorer) throws Exception {
        String action = src ? "Preparing source" : "Projecting and preparing target";
        LOG.info(" - " + action + " contextual items with " + contextScorer.toString() + " and time distributions with " + timeScorer.toString() + "...");

        for (EquivalenceClass eq : eqs) {
            contextScorer.prepare(eq);
            timeScorer.prepare(eq);
        }

        String side = src ? "source" : "target";
        LOG.info(" - Mapping " + side + " context into LSH space...");
        LSHContextCollector lshContext = new LSHContextCollector(true);
        lshContext.collectProperty(eqs);

        LOG.info(" - Mapping " + side + " temporal into LSH space...");
        LSHTimeDistributionCollector lshTime = new LSHTimeDistributionCollector(true);
        lshTime.collectProperty(eqs);
    }

    /**
     * Scans the whole phrase file once, in chunks of {@code numLines} lines,
     * and stores the largest phrase count found in {@code m_maxPhrCount}.
     *
     * @param src      {@code true} for the source side, {@code false} for the target side
     * @param fileName phrase file to scan
     * @param numLines chunk size used while scanning
     * @throws Exception if reading or counting fails
     */
    public void prepareForChunkCollection(boolean src, String fileName, int numLines) throws Exception {
        // Arguments are evaluated left to right, so the length is read before the case flag.
        m_maxPhrCount = findMaxPhraseCount(
                src,
                fileName,
                numLines,
                Configurator.CONFIG.getInt("preprocessing.phrases.MaxPhraseLength"),
                Configurator.CONFIG.getBoolean("preprocessing.phrases.CaseSensitive"));
    }

    /**
     * Builds everything needed for context collection, in order: the
     * contextual equivalence classes, the seed dictionary over them, and
     * finally the filtered context classes.
     *
     * @throws Exception if any of the three steps fails
     */
    public void prepareContextForChunkCollection() throws Exception {
        // 1. Populate m_contextSrcEqs / m_contextTrgEqs.
        collectContextEqs();
        // 2. The dictionary is built over the unfiltered classes.
        prepareSeedDictionary(m_contextSrcEqs, m_contextTrgEqs);
        // 3. Filtering uses the dictionary, so it must come last.
        filterContextEqs();
    }

    /**
     * Filters both the source and the target context classes using the
     * occurrence thresholds from the configuration, replacing
     * {@code m_contextSrcEqs} and {@code m_contextTrgEqs} with the results.
     *
     * @throws Exception if filtering fails
     */
    protected void filterContextEqs() throws Exception {
        int fewerThan = Configurator.CONFIG.getInt("preprocessing.context.PruneEqIfOccursFewerThan");
        int moreThan = Configurator.CONFIG.getInt("preprocessing.context.PruneEqIfOccursMoreThan");

        Set<EquivalenceClass> keptSrc = filterContextEqs(true, m_contextSrcEqs, fewerThan, moreThan);
        m_contextSrcEqs = keptSrc;

        Set<EquivalenceClass> keptTrg = filterContextEqs(false, m_contextTrgEqs, fewerThan, moreThan);
        m_contextTrgEqs = keptTrg;
    }

    /**
     * Applies a dictionary filter and two occurrence-count filters to the
     * given context classes.
     *
     * <p>NOTE(review): judging by the parameter names and the log message, the
     * two {@code NumOccurencesFilter} instances presumably act as lower and
     * upper bounds; the exact semantics of the boolean flag live in that class.
     *
     * @param src                          {@code true} for the source side, {@code false} for the target side
     * @param eqs                          classes to filter
     * @param pruneContEqIfOccursFewerThan threshold for the first occurrence filter
     * @param pruneContEqIfOccursMoreThan  threshold for the second occurrence filter
     * @return the classes returned by {@code EquivalenceClassCollector.filter}
     * @throws Exception if filtering fails
     */
    protected Set<EquivalenceClass> filterContextEqs(boolean src, Set<EquivalenceClass> eqs, int pruneContEqIfOccursFewerThan, int pruneContEqIfOccursMoreThan) throws Exception {
        String side = src ? "source" : "target";
        LOG.info(" - Filtering " + side + " contextual words: keeping those in dict [" + m_seedDict.toString() + "] and occuring (" + pruneContEqIfOccursFewerThan + "," + pruneContEqIfOccursMoreThan + ") times...");

        LinkedList<EquivalenceClassFilter> chain = new LinkedList<EquivalenceClassFilter>();
        chain.add(new DictionaryFilter(m_seedDict, true, src));
        chain.add(new NumOccurencesFilter(pruneContEqIfOccursFewerThan, true));
        chain.add(new NumOccurencesFilter(pruneContEqIfOccursMoreThan, false));

        Set<EquivalenceClass> kept = EquivalenceClassCollector.filter(eqs, chain);
        LOG.info(" - Filtered context " + side + " classes: " + kept.size());
        return kept;
    }

    /**
     * Loads the seed dictionary from disk, prunes it, and stores a
     * {@code Dictionary} built over the given context classes in
     * {@code m_seedDict}.
     *
     * <p>If {@code resources.dictionary.Dictionary} is configured, the
     * dictionary is read from that single file; otherwise it is read from the
     * two files named by {@code resources.dictionary.SrcName} and
     * {@code resources.dictionary.TrgName}. File names are appended directly
     * to {@code resources.dictionary.Path} without inserting a separator.
     *
     * @param srcContEqs source-side context classes
     * @param trgContEqs target-side context classes
     * @throws Exception if the dictionary cannot be read or built
     */
    protected void prepareSeedDictionary(Set<EquivalenceClass> srcContEqs, Set<EquivalenceClass> trgContEqs) throws Exception {
        String dictDir = Configurator.CONFIG.getString("resources.dictionary.Path");

        // -1 is what gets passed to pruneCounts when no limit is configured.
        int pruneNumTranslations = -1;
        if (Configurator.CONFIG.containsKey("experiments.DictionaryPruneNumTranslations")) {
            pruneNumTranslations = Configurator.CONFIG.getInt("experiments.DictionaryPruneNumTranslations");
        }

        LOG.info(" - Reading/preparing seed dictionary ...");

        SimpleDictionary rawDict;
        if (!Configurator.CONFIG.containsKey("resources.dictionary.Dictionary")) {
            String srcName = Configurator.CONFIG.getString("resources.dictionary.SrcName");
            String trgName = Configurator.CONFIG.getString("resources.dictionary.TrgName");
            SimpleDictionary.DictHalves halves = new SimpleDictionary.DictHalves(dictDir + srcName, dictDir + trgName);
            rawDict = new SimpleDictionary(halves, "SeedDictionary");
        } else {
            String singleName = Configurator.CONFIG.getString("resources.dictionary.Dictionary");
            rawDict = new SimpleDictionary(dictDir + singleName, "SeedDictionary");
        }

        rawDict.pruneCounts(pruneNumTranslations);
        m_seedDict = new Dictionary(srcContEqs, trgContEqs, rawDict, "SeedDictionary");
        LOG.info(" - Seed dictionary: " + m_seedDict.toString());
    }

    /**
     * Collects the source and target contextual equivalence classes and
     * records the largest occurrence count on each side.
     *
     * <p>The equivalence class implementations are loaded by name from
     * {@code preprocessing.context.SrcEqClass} and
     * {@code preprocessing.context.TrgEqClass}. Romanization filtering is
     * enabled per side only when the corresponding
     * {@code preprocessing.FilterRomanSrc} / {@code preprocessing.FilterRomanTrg}
     * key is present and true. Collection is always case sensitive.
     *
     * @throws Exception if a configured class cannot be loaded or collection fails
     */
    protected void collectContextEqs() throws Exception {
        LOG.info((Object)" - Constructing contextual equivalence classes...");
        boolean filterRomanSrc = Configurator.CONFIG.containsKey("preprocessing.FilterRomanSrc") && Configurator.CONFIG.getBoolean("preprocessing.FilterRomanSrc");
        boolean filterRomanTrg = Configurator.CONFIG.containsKey("preprocessing.FilterRomanTrg") && Configurator.CONFIG.getBoolean("preprocessing.FilterRomanTrg");
        // Class.forName returns Class<?>, which cannot be assigned to
        // Class<EquivalenceClass> without a cast. The cast is unchecked: a
        // configured class that is not an EquivalenceClass only fails later,
        // when an instance is created and used as one.
        @SuppressWarnings("unchecked")
        Class<EquivalenceClass> srcContClassClass = (Class<EquivalenceClass>)Class.forName(Configurator.CONFIG.getString("preprocessing.context.SrcEqClass"));
        @SuppressWarnings("unchecked")
        Class<EquivalenceClass> trgContClassClass = (Class<EquivalenceClass>)Class.forName(Configurator.CONFIG.getString("preprocessing.context.TrgEqClass"));
        this.m_contextSrcEqs = this.collectContextEqs(true, true, filterRomanSrc, srcContClassClass);
        this.m_contextTrgEqs = this.collectContextEqs(false, true, filterRomanTrg, trgContClassClass);
        this.m_maxTokCountInSrc = this.findMaxCount(this.m_contextSrcEqs);
        this.m_maxTokCountInTrg = this.findMaxCount(this.m_contextTrgEqs);
        LOG.info((Object)(" - Source context classes = " + this.m_contextSrcEqs.size() + ", max occurrences = " + this.m_maxTokCountInSrc));
        LOG.info((Object)(" - Target context classes = " + this.m_contextTrgEqs.size() + ", max occurrences = " + this.m_maxTokCountInTrg));
    }

    /**
     * Collects contextual equivalence classes for one side from the corpus
     * named by {@code preprocessing.input.Context}.
     *
     * <p>Simple classes are collected through a garbage filter, a
     * {@code LengthFilter(2)} and, optionally, a romanization filter; their
     * occurrence counts are collected; they are then converted to instances of
     * {@code contextClassClass} and tagged with the side's type.
     *
     * @param src               {@code true} for the source side, {@code false} for the target side
     * @param caseSensitive     case-sensitivity flag for the collector and the number collector
     * @param filterRoman       whether to add a {@code RomanizationFilter}
     * @param contextClassClass equivalence class implementation to convert to
     * @return the converted, typed equivalence classes
     * @throws Exception if the corpus cannot be read or conversion fails
     */
    protected Set<EquivalenceClass> collectContextEqs(boolean src, boolean caseSensitive, boolean filterRoman, Class<EquivalenceClass> contextClassClass) throws Exception {
        ArrayList<EquivalenceClassFilter> wordFilters = new ArrayList<EquivalenceClassFilter>(3);
        wordFilters.add(new GarbageFilter());
        wordFilters.add(new LengthFilter(2));
        if (filterRoman) {
            wordFilters.add(new RomanizationFilter());
        }

        String corpusKind = Configurator.CONFIG.getString("preprocessing.input.Context");
        CorpusAccessor corpus = getAccessor(corpusKind, src);

        SimpleEquivalenceClassCollector simpleCollector = new SimpleEquivalenceClassCollector(wordFilters, caseSensitive);
        Set<EquivalenceClass> simpleEqs = simpleCollector.collect(corpus.getCorpusReader(), -1);
        new NumberCollector(caseSensitive).collectProperty(corpus, simpleEqs);

        Set<EquivalenceClass> converted = constructEqClasses(src, simpleEqs, contextClassClass);
        Type.EqType side = src ? Type.EqType.SOURCE : Type.EqType.TARGET;
        assignTypeProp(converted, side);
        return converted;
    }

    /**
     * Sets the same {@code Type} property on every given equivalence class.
     * A single {@code Type} instance is created and shared by all of them.
     *
     * @param eqClasses classes to tag
     * @param type      type value to wrap in the shared {@code Type} property
     */
    protected void assignTypeProp(Set<? extends EquivalenceClass> eqClasses, Type.EqType type) {
        final Type shared = new Type(type);
        for (EquivalenceClass eq : eqClasses) {
            eq.setProperty(shared);
        }
    }

    /**
     * Converts simple equivalence classes into instances of
     * {@code eqClassClass}, merging those that end up with the same stem.
     *
     * <p>For each input class a new instance is created and initialised with
     * the input's word. The first instance seen for a stem is assigned an id
     * and receives {@code Number} and {@code Type} properties; later instances
     * with the same stem are merged into it and their count is added to its
     * {@code Number}.
     *
     * @param src          {@code true} to tag new classes as SOURCE, {@code false} for TARGET
     * @param allEqs       input classes; each must be a {@code SimpleEquivalenceClass}
     *                     carrying a {@code Number} property
     * @param eqClassClass implementation to instantiate via its no-arg constructor
     * @return a new set with one class per distinct stem
     * @throws Exception if instantiation or initialisation fails
     */
    protected Set<EquivalenceClass> constructEqClasses(boolean src, Set<EquivalenceClass> allEqs, Class<? extends EquivalenceClass> eqClassClass) throws Exception {
        String numberKey = Number.class.getName();
        HashMap<String, EquivalenceClass> byStem = new HashMap<String, EquivalenceClass>();

        for (EquivalenceClass raw : allEqs) {
            String word = ((SimpleEquivalenceClass)raw).getWord();
            long count = ((Number)raw.getProperty(numberKey)).getNumber();

            EquivalenceClass candidate = eqClassClass.newInstance();
            candidate.init(word, true);

            EquivalenceClass existing = byStem.get(candidate.getStem());
            if (existing != null) {
                // Stem already seen: fold this word into the existing class.
                existing.merge(candidate);
                ((Number)existing.getProperty(numberKey)).increment(count);
            } else {
                // First class with this stem: register it with fresh properties.
                candidate.assignId();
                candidate.setProperty(new Number(count));
                candidate.setProperty(new Type(src ? Type.EqType.SOURCE : Type.EqType.TARGET));
                byStem.put(candidate.getStem(), candidate);
            }
        }
        return new HashSet<EquivalenceClass>(byStem.values());
    }

    /**
     * Reads the next chunk of lines from the phrase file and turns each line
     * into a {@code Phrase}.
     *
     * <p>The reader is kept open between calls so that successive calls with
     * the same {@code fileName} continue where the previous one stopped. A
     * call with a different file name closes the current reader and opens the
     * new file (through a GZIP stream when the name ends in "gz", ignoring
     * case). When a call reads nothing because the file is exhausted, the
     * reader is closed and the state reset, so the next call with the same
     * name starts again from the beginning of the file.
     *
     * <p>The remembered file name is only updated after the file was opened
     * successfully, and a partially constructed stream is closed if wrapping
     * it fails, so a failed open neither leaks a handle nor leaves this object
     * pointing at a closed reader.
     *
     * @param src           not used by this method
     * @param fileName      phrase file to read from
     * @param numLines      maximum number of lines to read; negative means no limit
     * @param caseSensitive case-sensitivity flag passed to {@code Phrase.init}
     * @return the phrases read (duplicates collapse, since a set is returned);
     *         empty when nothing was read
     * @throws IOException if the file cannot be opened or read
     */
    protected Set<Phrase> readPhraseFileChunk(boolean src, String fileName, int numLines, boolean caseSensitive) throws IOException {
        String line = null;
        int numLinesRead = 0;
        HashSet<Phrase> phraseChunk = new HashSet<Phrase>();
        if (this.m_phraseFileReader == null || !fileName.equals(this.m_phraseFileName)) {
            if (this.m_phraseFileReader != null) {
                // Forget the old reader before closing it so a failing close
                // cannot leave a closed reader registered under the old name.
                BufferedReader previous = this.m_phraseFileReader;
                this.m_phraseFileReader = null;
                this.m_phraseFileName = "";
                previous.close();
            }
            InputStream is = new FileInputStream(fileName);
            try {
                if (fileName.toLowerCase().endsWith("gz")) {
                    is = new GZIPInputStream(is);
                }
                this.m_phraseFileReader = new BufferedReader(new InputStreamReader(is, DEFAULT_CHARSET));
            } catch (IOException e) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // best-effort cleanup; the open failure is reported instead
                }
                throw e;
            }
            this.m_phraseFileName = fileName;
        }
        while ((numLines < 0 || numLinesRead < numLines) && (line = this.m_phraseFileReader.readLine()) != null) {
            ++numLinesRead;
            Phrase phrase = new Phrase();
            phrase.init(line, caseSensitive);
            phrase.assignId();
            phraseChunk.add(phrase);
        }
        if (line == null && phraseChunk.size() == 0) {
            // Nothing was read: release the file handle instead of just
            // dropping the reference to the reader.
            BufferedReader finished = this.m_phraseFileReader;
            this.m_phraseFileName = "";
            this.m_phraseFileReader = null;
            try {
                finished.close();
            } catch (IOException e) {
                // The chunk itself is valid (empty); do not fail the caller's
                // end-of-file handling because of a close problem.
                LOG.warn((Object)("Could not close phrase file reader for " + fileName), e);
            }
        }
        return phraseChunk;
    }

    /**
     * Reads the phrase file chunk by chunk until an empty chunk is returned,
     * collecting number properties for each chunk and tracking the largest
     * count seen.
     *
     * @param src             {@code true} for the source side, {@code false} for the target side
     * @param fileName        phrase file to scan
     * @param chunkSize       number of lines per chunk
     * @param maxPhraseLength maximum phrase length passed to the number collector
     * @param caseSensitive   case-sensitivity flag for reading and counting
     * @return the largest count found, or 0 if no phrase carried a count
     * @throws Exception if reading or counting fails
     */
    protected long findMaxPhraseCount(boolean src, String fileName, int chunkSize, int maxPhraseLength, boolean caseSensitive) throws Exception {
        LOG.info(" - Collecting max count for " + (src ? "source" : "target") + " phrases");

        long best = 0L;
        Set<Phrase> current = readPhraseFileChunk(src, fileName, chunkSize, caseSensitive);
        while (current.size() > 0) {
            collectNumberProps(src, current, maxPhraseLength, caseSensitive);
            long chunkMax = findMaxCount(current);
            if (chunkMax > best) {
                best = chunkMax;
            }
            current = readPhraseFileChunk(src, fileName, chunkSize, caseSensitive);
        }

        LOG.info(" - " + (src ? "Source" : "Target") + " phrases max count = " + best);
        return best;
    }

    /**
     * Returns the largest {@code Number} property value among the given
     * equivalence classes.
     *
     * @param eqs classes to inspect; classes without a {@code Number} property are skipped
     * @return the largest count found, or 0 if there is none above zero
     */
    protected long findMaxCount(Set<? extends EquivalenceClass> eqs) {
        String numberKey = Number.class.getName();
        long best = 0L;
        for (EquivalenceClass eq : eqs) {
            Number occurrences = (Number)eq.getProperty(numberKey);
            if (occurrences != null) {
                best = Math.max(best, occurrences.getNumber());
            }
        }
        return best;
    }

    /**
     * Collects occurrence-count properties for the given phrases from the
     * corpus named by {@code preprocessing.input.Context}.
     *
     * @param src             {@code true} for the source side, {@code false} for the target side
     * @param phrases         phrases to annotate
     * @param maxPhraseLength maximum phrase length passed to the collector
     * @param caseSensitive   case-sensitivity flag passed to the collector
     * @throws Exception if the accessor cannot be created or collection fails
     */
    protected void collectNumberProps(boolean src, Set<Phrase> phrases, int maxPhraseLength, boolean caseSensitive) throws Exception {
        String corpusKind = Configurator.CONFIG.getString("preprocessing.input.Context");
        CorpusAccessor corpus = getAccessor(corpusKind, src);
        PhraseNumberCollector counter = new PhraseNumberCollector(maxPhraseLength, caseSensitive);
        counter.collectProperty(corpus, phrases);
    }

    /**
     * Creates the corpus accessor for the given corpus kind and side.
     *
     * <p>Recognised kinds are {@code "europarl"}, {@code "wiki"},
     * {@code "crawls"}, {@code "dev"} and {@code "test"}. Any other value
     * (including {@code null}) is logged as an error and rejected with an
     * exception, rather than returning {@code null} and letting the caller
     * fail later with an uninformative {@code NullPointerException}.
     *
     * @param kind corpus kind, normally taken from the configuration
     * @param src  {@code true} for the source side, {@code false} for the target side
     * @return a new accessor for the requested corpus; never {@code null}
     * @throws IllegalArgumentException if {@code kind} is not one of the recognised values
     * @throws Exception                if the accessor cannot be constructed
     */
    protected CorpusAccessor getAccessor(String kind, boolean src) throws Exception {
        if ("europarl".equals(kind)) {
            return this.getEuroParlAccessor(src);
        }
        if ("wiki".equals(kind)) {
            return this.getWikiAccessor(src);
        }
        if ("crawls".equals(kind)) {
            return this.getCrawlsAccessor(src);
        }
        if ("dev".equals(kind)) {
            return this.getDevAccessor(src);
        }
        if ("test".equals(kind)) {
            return this.getTestAccessor(src);
        }
        String message = "Could not find corpus accessor for " + kind;
        LOG.error((Object)message);
        throw new IllegalArgumentException(message);
    }

    /**
     * Creates the accessor for the development corpus from the
     * {@code corpora.dev.*} configuration keys.
     *
     * @param src {@code true} to use {@code corpora.dev.SrcName}, {@code false} for {@code corpora.dev.TrgName}
     * @return a new accessor over the configured path (with a trailing separator ensured)
     * @throws Exception if the accessor cannot be constructed
     */
    protected LexCorpusAccessor getDevAccessor(boolean src) throws Exception {
        String baseDir = Configurator.CONFIG.getString("corpora.dev.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.dev.OneSentPerLine");

        String corpusName;
        if (src) {
            corpusName = Configurator.CONFIG.getString("corpora.dev.SrcName");
        } else {
            corpusName = Configurator.CONFIG.getString("corpora.dev.TrgName");
        }
        return new LexCorpusAccessor(corpusName, appendSep(baseDir), oneSentPerLine);
    }

    /**
     * Creates the accessor for the test corpus from the
     * {@code corpora.test.*} configuration keys.
     *
     * @param src {@code true} to use {@code corpora.test.SrcName}, {@code false} for {@code corpora.test.TrgName}
     * @return a new accessor over the configured path (with a trailing separator ensured)
     * @throws Exception if the accessor cannot be constructed
     */
    protected LexCorpusAccessor getTestAccessor(boolean src) throws Exception {
        String baseDir = Configurator.CONFIG.getString("corpora.test.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.test.OneSentPerLine");

        String corpusName;
        if (src) {
            corpusName = Configurator.CONFIG.getString("corpora.test.SrcName");
        } else {
            corpusName = Configurator.CONFIG.getString("corpora.test.TrgName");
        }
        return new LexCorpusAccessor(corpusName, appendSep(baseDir), oneSentPerLine);
    }

    /**
     * Creates the accessor for the EuroParl corpus from the
     * {@code corpora.europarl.*} configuration keys.
     *
     * <p>The date range is parsed from {@code DateFrom} and {@code DateTo}
     * using the pattern {@code yy-MM-dd}.
     *
     * @param src {@code true} to use {@code SrcSubDir}, {@code false} for {@code TrgSubDir}
     * @return a new accessor over the configured sub-directory and date range
     * @throws Exception if a date cannot be parsed or the accessor cannot be constructed
     */
    protected EuroParlCorpusAccessor getEuroParlAccessor(boolean src) throws Exception {
        String baseDir = Configurator.CONFIG.getString("corpora.europarl.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.europarl.OneSentPerLine");

        String sideDir;
        if (src) {
            sideDir = Configurator.CONFIG.getString("corpora.europarl.SrcSubDir");
        } else {
            sideDir = Configurator.CONFIG.getString("corpora.europarl.TrgSubDir");
        }

        SimpleDateFormat dateFormat = new SimpleDateFormat("yy-MM-dd");
        Date from = dateFormat.parse(Configurator.CONFIG.getString("corpora.europarl.DateFrom"));
        Date to = dateFormat.parse(Configurator.CONFIG.getString("corpora.europarl.DateTo"));

        String corpusDir = appendSep(baseDir) + sideDir;
        return new EuroParlCorpusAccessor(corpusDir, from, to, oneSentPerLine);
    }

    /**
     * Creates the accessor for the crawls corpus from the
     * {@code corpora.crawls.*} configuration keys.
     *
     * <p>The date range is parsed from {@code DateFrom} and {@code DateTo}
     * using the pattern {@code yy-MM-dd}.
     *
     * @param src {@code true} to use {@code SrcSubDir}, {@code false} for {@code TrgSubDir}
     * @return a new accessor over the configured sub-directory and date range
     * @throws Exception if a date cannot be parsed or the accessor cannot be constructed
     */
    protected CrawlCorpusAccessor getCrawlsAccessor(boolean src) throws Exception {
        String baseDir = Configurator.CONFIG.getString("corpora.crawls.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.crawls.OneSentPerLine");

        String sideDir;
        if (src) {
            sideDir = Configurator.CONFIG.getString("corpora.crawls.SrcSubDir");
        } else {
            sideDir = Configurator.CONFIG.getString("corpora.crawls.TrgSubDir");
        }

        SimpleDateFormat dateFormat = new SimpleDateFormat("yy-MM-dd");
        Date from = dateFormat.parse(Configurator.CONFIG.getString("corpora.crawls.DateFrom"));
        Date to = dateFormat.parse(Configurator.CONFIG.getString("corpora.crawls.DateTo"));

        String corpusDir = appendSep(baseDir) + sideDir;
        return new CrawlCorpusAccessor(corpusDir, from, to, oneSentPerLine);
    }

    /**
     * Creates the accessor for the wiki corpus from the
     * {@code corpora.wiki.*} configuration keys.
     *
     * @param src {@code true} to use {@code corpora.wiki.SrcRegExp}, {@code false} for {@code corpora.wiki.TrgRegExp}
     * @return a new accessor over the configured path (with a trailing separator ensured)
     */
    protected LexCorpusAccessor getWikiAccessor(boolean src) {
        String baseDir = Configurator.CONFIG.getString("corpora.wiki.Path");
        boolean oneSentPerLine = Configurator.CONFIG.getBoolean("corpora.wiki.OneSentPerLine");

        String filePattern;
        if (src) {
            filePattern = Configurator.CONFIG.getString("corpora.wiki.SrcRegExp");
        } else {
            filePattern = Configurator.CONFIG.getString("corpora.wiki.TrgRegExp");
        }
        return new LexCorpusAccessor(filePattern, appendSep(baseDir), oneSentPerLine);
    }

    /**
     * Trims the given string and makes sure a non-empty result ends with
     * {@link File#separator}.
     *
     * @param str path-like string; may be {@code null}
     * @return {@code null} for {@code null} input, the empty string if the
     *         trimmed input is empty, otherwise the trimmed input ending with
     *         the platform file separator
     */
    protected String appendSep(String str) {
        if (str == null) {
            return null;
        }
        String trimmed = str.trim();
        if (trimmed.length() == 0 || trimmed.endsWith(File.separator)) {
            // Nothing to append: empty stays empty, and a trailing separator is kept as is.
            return trimmed;
        }
        return trimmed + File.separator;
    }
}

