/*
 * Decompiled with CFR 0.152.
 */
package babel.content.pages;

import babel.content.pages.MetaData;
import babel.content.pages.Page;
import babel.prep.extract.NutchChunk;
import babel.util.language.Language;
import babel.util.persistence.XMLPersistable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;

/**
 * One fetched version of a {@link Page}: version properties (segment id,
 * title, fetch/modification times), content and parse metadata, outgoing
 * links and the parsed text content.
 *
 * <p>Instances can be built from the Nutch chunks of a segment, serialized
 * through the Hadoop {@link Writable} contract, and written to / read from XML
 * through {@link XMLPersistable}. In XML the parsed content is stored
 * Base64-encoded (UTF-8 bytes).
 *
 * <p>Equality and hashing are based solely on the parsed content.
 */
public class PageVersion implements XMLPersistable, Writable {
    public static final Log LOG = LogFactory.getLog(PageVersion.class);

    private static final String DEFAULT_CHARSET = "utf-8";
    /** Charset used for raw page bytes and for the Base64 payload in XML. */
    private static final Charset CHARSET = Charset.forName(DEFAULT_CHARSET);

    private static final String XML_TAG_PAGEVERSION = "PageVersion";
    private static final String XML_TAG_OUT_LINKS = "OutgoingLinks";
    private static final String XML_TAG_LINK = "Link";
    private static final String XML_ATTRIB_ANCHOR = "Anchor";
    private static final String XML_TAG_CONTENT = "ParsedContent";
    private static final String XML_TAG_METADATA = "MetaData";

    private static final String META_VERSION_PROPS = "VersionProperties";
    private static final String META_CONTENT = "ContentMetadata";
    private static final String META_PARSE = "ParseMetadata";

    private static final String PROP_TITLE = "Title";
    private static final String PROP_FETCH_TIME = "Fetched";
    private static final String PROP_MODIFIED_TIME = "Modified";
    private static final String PROP_SEGMENT_ID = "NutchSegment";

    /** Matches the opening {@code <html ...>} tag, case-insensitively. */
    private static final Pattern HTML_TAG_PATTERN =
        Pattern.compile("<html[^>]*>", Pattern.CASE_INSENSITIVE);
    /**
     * Captures the value of a {@code lang} attribute. Quotes (single or double)
     * and the tag-closing {@code >} are excluded from the captured value so that
     * {@code lang=en>} and {@code lang='en'} both yield {@code en}.
     */
    private static final Pattern LANG_ATTRIB_PATTERN =
        Pattern.compile("\\slang\\s*=\\s*[\"']*([^\\s=\"'>]+)[\"']*", Pattern.CASE_INSENSITIVE);

    protected MetaData m_verProps = new MetaData(META_VERSION_PROPS);
    protected MetaData m_contentMeta = new MetaData(META_CONTENT);
    protected MetaData m_parseMeta = new MetaData(META_PARSE);
    /** Outgoing links, or {@code null} when there are none. */
    protected Outlink[] m_outLinks = null;
    /** Parsed text content; never {@code null}, empty when absent. */
    protected String m_content = "";

    /** Creates an empty version; required for {@link Writable} deserialization. */
    public PageVersion() {
    }

    /**
     * Builds a version from the Nutch chunks belonging to one page in one segment.
     *
     * <ul>
     *   <li>{@link CrawlDatum} with a successful fetch status supplies fetch and
     *       modification times;</li>
     *   <li>{@link Content} is only inspected for an {@code <html lang=...>}
     *       attribute, which is pushed to {@code page};</li>
     *   <li>successful {@link ParseData} supplies title, metadata and outlinks;</li>
     *   <li>{@link ParseText} supplies the parsed content.</li>
     * </ul>
     * Any other chunk value is reported with a warning and skipped.
     *
     * @param segmentId id of the Nutch segment, stored as a version property
     * @param chunks    chunks to assemble the version from
     * @param page      page whose language is set when one is detected
     */
    public PageVersion(String segmentId, List<NutchChunk> chunks, Page page) {
        this();
        this.m_verProps.set(PROP_SEGMENT_ID, segmentId);
        for (NutchChunk chunk : chunks) {
            Writable curVal = chunk.get();
            if (curVal instanceof CrawlDatum) {
                CrawlDatum datum = (CrawlDatum)curVal;
                if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
                    this.m_verProps.set(PROP_FETCH_TIME, Long.toString(datum.getFetchTime()));
                    this.m_verProps.set(PROP_MODIFIED_TIME, Long.toString(datum.getModifiedTime()));
                }
            } else if (curVal instanceof Content) {
                this.detectLanguage((Content)curVal, page);
            } else if (curVal instanceof ParseData) {
                ParseData parseData = (ParseData)curVal;
                if (parseData.getStatus().isSuccess()) {
                    this.m_verProps.set(PROP_TITLE, parseData.getTitle());
                    this.m_parseMeta.setAll(parseData.getParseMeta());
                    this.m_contentMeta.setAll(parseData.getContentMeta());
                    this.m_outLinks = parseData.getOutlinks();
                }
            } else if (curVal instanceof ParseText) {
                this.m_content = this.setStr(((ParseText)curVal).getText());
            } else if (LOG.isWarnEnabled()) {
                // Null-safe: a chunk without a value must not abort construction.
                LOG.warn("Unrecognized type: " + (curVal == null ? null : curVal.getClass()));
            }
        }
    }

    /**
     * Best-effort language detection from the {@code lang} attribute of the raw
     * page's {@code <html>} tag. Failures are logged at debug level and otherwise
     * ignored, leaving the page's language untouched.
     */
    private void detectLanguage(Content content, Page page) {
        try {
            String raw = new String(content.getContent(), CHARSET);
            Matcher htmlTag = HTML_TAG_PATTERN.matcher(raw);
            if (!htmlTag.find()) {
                return;
            }
            Matcher langAttrib = LANG_ATTRIB_PATTERN.matcher(htmlTag.group());
            if (!langAttrib.find()) {
                return;
            }
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
            String langCode = langAttrib.group(1).trim().toLowerCase(Locale.ROOT);
            if (langCode.length() > 0) {
                page.setLanguage(Language.fromString(langCode));
            }
        }
        catch (Exception e) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Could not detect page language from raw content", e);
            }
        }
    }

    /**
     * @return {@code true} when parsed content, a segment id and a fetch time are
     *         all present
     */
    public boolean isNutchComplete() {
        return this.m_content.length() > 0
            && this.m_verProps.hasKey(PROP_SEGMENT_ID)
            && this.m_verProps.hasKey(PROP_FETCH_TIME);
    }

    /** @return the parsed text content; empty when none is present */
    public String getContent() {
        return this.m_content;
    }

    /**
     * @return the fetch time stored in the version properties, or {@code null}
     *         when none is set
     * @throws NumberFormatException if the stored value is not a valid long
     */
    public Long getFetchTime() {
        return parseLongOrNull(this.m_verProps.getFirst(PROP_FETCH_TIME));
    }

    /**
     * @return the modification time stored in the version properties, or
     *         {@code null} when none is set
     * @throws NumberFormatException if the stored value is not a valid long
     */
    public Long getModificationTime() {
        return parseLongOrNull(this.m_verProps.getFirst(PROP_MODIFIED_TIME));
    }

    /**
     * Stores the modification time in the version properties.
     *
     * @param time the modification time; must not be {@code null}
     * @throws NullPointerException if {@code time} is {@code null}
     */
    public void setModificationTime(Long time) {
        this.m_verProps.set(PROP_MODIFIED_TIME, Long.toString(time));
    }

    /** Returns a multi-line, human-readable summary of this version. */
    @Override
    public String toString() {
        StringBuilder strBld = new StringBuilder();
        SimpleDateFormat dft = new SimpleDateFormat();

        String segment = this.m_verProps.getFirst(PROP_SEGMENT_ID);
        strBld.append("PageVersion from Segment: ").append(segment != null ? segment : "-").append("\n");

        String title = this.m_verProps.getFirst(PROP_TITLE);
        strBld.append("  Title: ").append(title != null ? title : "-").append("\n");

        String fetched = this.m_verProps.getFirst(PROP_FETCH_TIME);
        strBld.append("  Fetched: ")
            .append(fetched != null ? dft.format(new Date(Long.parseLong(fetched))) : "-")
            .append("\n");

        strBld.append("  Content Metadata: ").append(presence(this.m_contentMeta.numKeys() > 0)).append("\n");
        strBld.append("  Parse Metadata: ").append(presence(this.m_parseMeta.numKeys() > 0)).append("\n");
        strBld.append("  Outgoing Links: ").append(this.m_outLinks == null ? 0 : this.m_outLinks.length).append("\n");
        strBld.append("  Parsed Content: ").append(presence(this.m_content.length() > 0)).append("\n");
        return strBld.toString();
    }

    /** Two versions are equal when their parsed content is equal. */
    @Override
    public boolean equals(Object obj) {
        return obj instanceof PageVersion && this.m_content.equals(((PageVersion)obj).m_content);
    }

    /** Hash of the parsed content, consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        return this.m_content.hashCode();
    }

    /**
     * Writes this version as a {@code PageVersion} element. Empty metadata blocks,
     * absent outlinks and empty content are omitted. The content is written as
     * Base64 of its UTF-8 bytes.
     */
    @Override
    public void persist(XMLStreamWriter writer) throws XMLStreamException {
        writer.writeStartElement(XML_TAG_PAGEVERSION);
        if (this.m_verProps.numKeys() > 0) {
            this.m_verProps.persist(writer);
        }
        if (this.m_parseMeta.numKeys() > 0) {
            this.m_parseMeta.persist(writer);
        }
        if (this.m_contentMeta.numKeys() > 0) {
            this.m_contentMeta.persist(writer);
        }
        if (this.m_outLinks != null) {
            writer.writeStartElement(XML_TAG_OUT_LINKS);
            for (Outlink outlink : this.m_outLinks) {
                writer.writeStartElement(XML_TAG_LINK);
                String anchor = outlink.getAnchor();
                if (anchor != null && anchor.length() != 0) {
                    writer.writeAttribute(XML_ATTRIB_ANCHOR, anchor);
                }
                writer.writeCharacters(outlink.getToUrl());
                writer.writeEndElement();
            }
            writer.writeEndElement();
        }
        if (this.m_content.length() > 0) {
            writer.writeStartElement(XML_TAG_CONTENT);
            // Explicit charset: the platform default must not influence the stored bytes.
            byte[] encoded = Base64.encodeBase64(this.m_content.getBytes(CHARSET));
            writer.writeCharacters(new String(encoded, CHARSET));
            writer.writeEndElement();
        }
        writer.writeEndElement();
    }

    /**
     * Reads the state written by {@link #write(DataOutput)}: the three metadata
     * blocks, the outlink count followed by that many outlinks, and the content.
     * A count of zero leaves the outlinks {@code null}.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.m_verProps.readFields(in);
        this.m_contentMeta.readFields(in);
        this.m_parseMeta.readFields(in);
        int numLinks = WritableUtils.readVInt(in);
        this.m_outLinks = numLinks == 0 ? null : new Outlink[numLinks];
        for (int i = 0; i < numLinks; i++) {
            Outlink link = new Outlink();
            link.readFields(in);
            this.m_outLinks[i] = link;
        }
        this.m_content = Text.readString(in);
    }

    /**
     * Writes the three metadata blocks, the outlink count (zero when there are
     * none) followed by the outlinks, and finally the content.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        this.m_verProps.write(out);
        this.m_contentMeta.write(out);
        this.m_parseMeta.write(out);
        int numLinks = this.m_outLinks == null ? 0 : this.m_outLinks.length;
        WritableUtils.writeVInt(out, numLinks);
        for (int i = 0; i < numLinks; i++) {
            this.m_outLinks[i].write(out);
        }
        Text.writeString(out, this.m_content);
    }

    /**
     * Resets this version and re-populates it from XML, consuming events up to
     * and including the closing {@code PageVersion} element. Metadata blocks are
     * dispatched by the value of their first attribute; the content element is
     * Base64-decoded, mirroring {@link #persist(XMLStreamWriter)}.
     *
     * <p>NOTE(review): outgoing links are written by {@code persist} but are not
     * restored here; {@code m_outLinks} stays {@code null} after this call.
     */
    @Override
    public void unpersist(XMLStreamReader reader) throws XMLStreamException {
        this.m_verProps.clear();
        this.m_contentMeta.clear();
        this.m_parseMeta.clear();
        this.m_outLinks = null;
        this.m_content = "";

        while (true) {
            int event = reader.next();
            if (event == XMLStreamReader.END_ELEMENT
                    && XML_TAG_PAGEVERSION.equals(reader.getName().toString())) {
                break;
            }
            if (event != XMLStreamReader.START_ELEMENT) {
                continue;
            }
            String elemTag = reader.getName().toString();
            if (XML_TAG_METADATA.equals(elemTag)) {
                // Guarded: index 0 on an attribute-less element is implementation-dependent.
                String elemAttrib = reader.getAttributeCount() > 0 ? reader.getAttributeValue(0) : null;
                if (META_VERSION_PROPS.equals(elemAttrib)) {
                    this.m_verProps.unpersist(reader);
                } else if (META_CONTENT.equals(elemAttrib)) {
                    this.m_contentMeta.unpersist(reader);
                } else if (META_PARSE.equals(elemAttrib)) {
                    this.m_parseMeta.unpersist(reader);
                }
            } else if (XML_TAG_CONTENT.equals(elemTag)) {
                byte[] decoded = Base64.decodeBase64(reader.getElementText().getBytes(CHARSET));
                this.m_content = new String(decoded, CHARSET);
            }
        }
    }

    /** Maps {@code null} to the empty string and returns any other value as is. */
    protected String setStr(String val) {
        return val == null ? "" : val;
    }

    /** Parses a stored long property, passing {@code null} through. */
    private static Long parseLongOrNull(String prop) {
        return prop != null ? Long.valueOf(Long.parseLong(prop)) : null;
    }

    /** Label used by {@link #toString()} for optional parts. */
    private static String presence(boolean present) {
        return present ? "present" : "absent";
    }
}

