/*
 * Decompiled with CFR 0.152.
 */
package babel.prep.extract;

import babel.content.pages.Page;
import babel.prep.PrepStep;
import babel.prep.extract.NutchChunk;
import babel.prep.extract.PageExtMapper;
import babel.prep.extract.PageExtReducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/**
 * Driver for the page-extraction preprocessing step.
 *
 * <p>Given a crawl directory, it builds a job whose inputs are the selected
 * parts of every segment directory under {@code <crawl_dir>/segments}, which
 * is mapped by {@link PageExtMapper}, reduced by {@link PageExtReducer}, and
 * writes {@link Text} / {@link Page} pairs as a sequence file to
 * {@code <crawl_dir>/pages/pages.extract.<timestamp>}.
 */
public class NutchPageExtractor extends PrepStep {
    protected static final Log LOG = LogFactory.getLog(NutchPageExtractor.class);

    /** Job property that carries the name of the segments directory. */
    static final String JOB_PROP_SEGMENTS_DIR = "pageextractor.segments.dir";
    /** Job property that carries the time stamp used in the output directory name. */
    protected static final String JOB_PROP_JOB_TIMESTAMP = "pageextractor.timestamp";
    /** Sub-directory of the crawl directory that receives the extracted pages. */
    protected static final String PAGES_SUBDIR = "pages";
    /** Sub-directory of the crawl directory that holds the segments. */
    protected static final String SEGMENTS_SUBDIR = "segments";

    // Configuration keys for the per-part switches; each one guards the
    // segment sub-directory named in its comment.
    private static final String PROP_READ_CONTENT = "segment.reader.co";        // content
    private static final String PROP_READ_FETCH = "segment.reader.fe";          // crawl_fetch
    private static final String PROP_READ_GENERATE = "segment.reader.ge";       // crawl_generate
    private static final String PROP_READ_PARSE = "segment.reader.pa";          // crawl_parse
    private static final String PROP_READ_PARSE_DATA = "segment.reader.pd";     // parse_data
    private static final String PROP_READ_PARSE_TEXT = "segment.reader.pt";     // parse_text

    /** Whether {@code crawl_generate} of each segment is added as job input. */
    protected boolean m_ge = true;
    /** Whether {@code crawl_fetch} of each segment is added as job input. */
    protected boolean m_fe = true;
    /** Whether {@code crawl_parse} of each segment is added as job input. */
    protected boolean m_pa = true;
    /** Whether {@code content} of each segment is added as job input. */
    protected boolean m_co = true;
    /** Whether {@code parse_data} of each segment is added as job input. */
    protected boolean m_pd = true;
    /** Whether {@code parse_text} of each segment is added as job input. */
    protected boolean m_pt = true;

    /**
     * Creates an extractor backed by a fresh Nutch configuration.
     *
     * @throws Exception if the superclass constructor fails
     */
    public NutchPageExtractor() throws Exception {
        super(NutchConfiguration.create());
    }

    /**
     * Delegates to the superclass and then reloads the six segment-part
     * switches, each defaulting to {@code true} when unset.
     *
     * <p>NOTE(review): the switches are read from {@code getConf()}, not from
     * the {@code job} argument - confirm that this is intended.
     *
     * @param job the job configuration handed to the superclass
     * @throws Exception if the superclass configuration fails
     */
    @Override
    public void configure(JobConf job) throws Exception {
        super.configure(job);
        m_co = getConf().getBoolean(PROP_READ_CONTENT, true);
        m_fe = getConf().getBoolean(PROP_READ_FETCH, true);
        m_ge = getConf().getBoolean(PROP_READ_GENERATE, true);
        m_pa = getConf().getBoolean(PROP_READ_PARSE, true);
        m_pd = getConf().getBoolean(PROP_READ_PARSE_DATA, true);
        m_pt = getConf().getBoolean(PROP_READ_PARSE_TEXT, true);
    }

    /**
     * Command-line entry point. Expects exactly one argument, the crawl
     * directory; otherwise prints the usage message and returns.
     *
     * @param args command-line arguments
     * @throws Exception if building or running the job fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            usage();
            return;
        }

        NutchPageExtractor extractor = new NutchPageExtractor();
        JobConf job = extractor.createJobConf(args[0]);

        if (LOG.isInfoEnabled()) {
            LOG.info("NutchPageExtractor: " + job.getJobName());
        }

        extractor.runPrepStep(job);

        if (LOG.isInfoEnabled()) {
            LOG.info(Stats.dumpStats() + "\n");
            LOG.info("Output: " + FileOutputFormat.getOutputPath(job));
            LOG.info("NutchPageExtractor: done");
        }
    }

    /**
     * Builds the job configuration for extracting pages from a crawl directory.
     *
     * <p>Any existing output directory with the same time-stamped name is
     * deleted recursively before it is registered as the job output.
     *
     * @param crawlDir crawl directory containing the {@code segments} sub-directory
     * @return the configured job
     * @throws IOException if the file system cannot be queried or modified
     */
    protected JobConf createJobConf(String crawlDir) throws IOException {
        Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);
        List<Path> segments = allSegmentDirs(segmentsPath);

        if (segments.isEmpty() && LOG.isWarnEnabled()) {
            // Without segments no input path is registered below; say so up front.
            LOG.warn("NutchPageExtractor: no segment directories found under " + segmentsPath);
        }

        // Job name is "read segments" followed by every segment name, space separated.
        StringBuilder jobName = new StringBuilder("read segments");
        for (Path segment : segments) {
            jobName.append(' ').append(segment.getName());
        }

        String timeStamp = getCurTimeStamp();
        JobConf job = new NutchJob(getConf());
        job.setJobName(jobName.toString());

        // Publish the switches so that tasks see the same selection.
        job.setBoolean(PROP_READ_CONTENT, m_co);
        job.setBoolean(PROP_READ_FETCH, m_fe);
        job.setBoolean(PROP_READ_GENERATE, m_ge);
        job.setBoolean(PROP_READ_PARSE, m_pa);
        job.setBoolean(PROP_READ_PARSE_DATA, m_pd);
        job.setBoolean(PROP_READ_PARSE_TEXT, m_pt);

        for (Path segment : segments) {
            addSegmentInputs(job, segment);
        }

        // Only the last path component is stored here, not the full path.
        job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
        job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(PageExtMapper.class);
        job.setReducerClass(PageExtReducer.class);
        job.setMapOutputValueClass(NutchChunk.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Page.class);

        Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.extract." + timeStamp);
        // Clear a left-over directory of the same name before using it as output.
        m_fs.delete(outDir, true);
        FileOutputFormat.setOutputPath(job, outDir);

        setUniqueTempDir(job);
        return job;
    }

    /**
     * Registers the enabled parts of one segment as job inputs, in the order
     * generate, fetch, parse, content, parse_data, parse_text.
     */
    private void addSegmentInputs(JobConf job, Path segment) {
        addInputIf(m_ge, job, segment, "crawl_generate");
        addInputIf(m_fe, job, segment, "crawl_fetch");
        addInputIf(m_pa, job, segment, "crawl_parse");
        addInputIf(m_co, job, segment, "content");
        addInputIf(m_pd, job, segment, "parse_data");
        addInputIf(m_pt, job, segment, "parse_text");
    }

    /** Adds {@code segment/part} as a job input when {@code enabled} is set. */
    private static void addInputIf(boolean enabled, JobConf job, Path segment, String part) {
        if (enabled) {
            FileInputFormat.addInputPath(job, new Path(segment, part));
        }
    }

    /**
     * Lists the directories directly under the given segments directory.
     *
     * @param segmentsDir directory whose sub-directories are the segments
     * @return a new, possibly empty, list of segment directories; never {@code null}
     * @throws IOException if the listing fails
     */
    protected List<Path> allSegmentDirs(Path segmentsDir) throws IOException {
        FileStatus[] statuses =
            m_fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(m_fs));
        Path[] segments = HadoopFSUtil.getPaths(statuses);

        List<Path> dirs = new ArrayList<Path>();
        // A null result is treated the same as "no segments".
        if (segments != null) {
            dirs.addAll(Arrays.asList(segments));
        }
        return dirs;
    }

    /** Prints the command-line usage message to standard error. */
    protected static void usage() {
        System.err.println("Usage: NutchPageExtractor crawl_dir\n");
    }

    /**
     * Process-wide counters for the extraction step. All access goes through
     * methods synchronized on this class, so a report never mixes values from
     * before and after a concurrent increment.
     *
     * <p>NOTE(review): the counters are plain static fields of this JVM. If the
     * code that increments them runs in a different JVM than the one calling
     * {@link #dumpStats()}, the report will not include those increments -
     * confirm against the callers.
     */
    static class Stats {
        private static int numPages = 0;
        private static int numIgnoredPages = 0;
        private static int numVersions = 0;

        /** Counts one extracted page. */
        public static synchronized void incPages() {
            ++numPages;
        }

        /** Counts one ignored page. */
        public static synchronized void incIgnoredPages() {
            ++numIgnoredPages;
        }

        /** Counts one page version. */
        public static synchronized void incVersions() {
            ++numVersions;
        }

        /**
         * Adds to the page version count.
         *
         * @param inc amount added to the version counter
         */
        public static synchronized void incVersions(int inc) {
            numVersions += inc;
        }

        /**
         * Formats the current counters as a two-line report.
         *
         * <p>Synchronized like the incrementing methods so that it observes
         * their latest writes.
         *
         * @return the report text, without a trailing newline
         */
        public static synchronized String dumpStats() {
            StringBuilder report = new StringBuilder();
            report.append("Extracted pages = ").append(numPages)
                  .append(" (").append(numVersions).append(" versions)\n");
            report.append("Ignored pages = ").append(numIgnoredPages);
            return report.toString();
        }
    }
}

