/*
 * Decompiled with CFR 0.152.
 */
package babel.prep.datedcorpus;

import babel.content.pages.Page;
import babel.content.pages.PageVersion;
import babel.prep.datedcorpus.DatedCorpusGenerator;
import babel.util.language.Language;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

/**
 * Mapper that re-keys every usable version of a page by its language and
 * modification time.
 *
 * <p>For each {@link PageVersion} that has a non-zero modification time and
 * non-empty content, emits the pair
 * {@code (<language> + DATE_LANG_SEP + <modification time>, version)} and
 * updates the per-language counters in {@code DatedCorpusGenerator.Stats}.
 * Pages without a language are skipped entirely.
 */
public class DatedCorpusGenMapper
extends MapReduceBase
implements Mapper<Text, Page, Text, PageVersion> {
    /** Separator placed between the language and the modification time in output keys. */
    public static final String DATE_LANG_SEP = "-";

    /** One or more whitespace characters; used to tokenize content for the word count. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Matches normalized (see {@link #removeProtocolAndPrefix(String)}) URLs under
     * {@code bbc.co.uk/english/}, {@code bbc.co.uk/hi/english/},
     * {@code bbc.co.uk/low/english/}, {@code bbc.co.uk/local/},
     * {@code bbc.co.uk/1/} and {@code bbc.co.uk/2/}. Dots are escaped so that
     * only the literal host name matches.
     */
    private static final Pattern BBC_ENGLISH_URL =
            Pattern.compile("bbc\\.co\\.uk/(?:(?:hi/|low/)?english|local|[12])/.*");

    /**
     * Emits one record per usable version of {@code page}.
     *
     * @param url      input key; not read by this method
     * @param page     page whose versions are examined
     * @param output   collector receiving {@code (language-modTime, version)} pairs
     * @param reporter not used by this method
     * @throws IOException if the collector fails to accept a record
     */
    public void map(Text url, Page page, OutputCollector<Text, PageVersion> output, Reporter reporter) throws IOException {
        Language lang = page.getLanguage();
        if (lang == null) {
            return;
        }
        String langName = lang.toString();
        for (PageVersion ver : page.pageVersions()) {
            Long modTime = ver.getModificationTime();
            String content = ver.getContent();
            // Skip versions that carry no date or no text.
            if (modTime == null || modTime == 0L || content == null || content.isEmpty()) {
                continue;
            }
            output.collect(new Text(langName + DATE_LANG_SEP + modTime), ver);
            DatedCorpusGenerator.Stats.incLangPageVerCount(langName);
            DatedCorpusGenerator.Stats.incLangWordCount(langName, countWords(content));
        }
    }

    /**
     * Tells whether the page URL falls under one of the BBC paths matched by
     * {@link #BBC_ENGLISH_URL}.
     *
     * @param page page whose URL is tested
     * @return {@code true} if the normalized URL matches, {@code false} otherwise
     */
    protected boolean isBBCEnglish(Page page) {
        String url = this.removeProtocolAndPrefix(page.pageURL());
        return BBC_ENGLISH_URL.matcher(url).matches();
    }

    /**
     * Drops everything up to and including the first {@code '.'} of {@code url}
     * and lower-cases the remainder. If {@code url} contains no dot, the whole
     * string is lower-cased and returned.
     *
     * @param url URL to normalize
     * @return the lower-cased remainder of {@code url}
     */
    protected String removeProtocolAndPrefix(String url) {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. Turkish would map 'I' to a dotless 'ı' and break URL matching).
        return url.substring(url.indexOf(".") + 1).toLowerCase(Locale.ROOT);
    }

    /**
     * Counts the whitespace-separated tokens in {@code text}. Runs of
     * whitespace count as a single separator and leading whitespace does not
     * produce an extra token.
     */
    private static int countWords(String text) {
        int count = 0;
        for (String token : WHITESPACE_RUN.split(text)) {
            if (!token.isEmpty()) {
                ++count;
            }
        }
        return count;
    }
}

