/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.ie.machinereading.domains.ace.reader;

import edu.stanford.nlp.ie.machinereading.common.DomReader;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.RobustTokenizer;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.Generics;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;

/**
 * Tokenizes an ACE source document and groups the resulting tokens into sentences.
 *
 * <p>A sentence boundary is placed after each sentence-final punctuation token
 * ({@code "."}, {@code "!"} or {@code "?"}) and around every token that
 * {@link AceToken#isSgml(String)} reports as SGML, so each SGML token ends up in a
 * sentence of its own.
 */
public class AceSentenceSegmenter
extends DomReader {
    /** Token texts that terminate a sentence. */
    private static final String[] sentenceFinalPunc = new String[]{".", "!", "?"};

    /** Set view of {@link #sentenceFinalPunc}; filled once by the static initializer below. */
    private static final Set<String> sentenceFinalPuncSet = Generics.newHashSet();

    /** File prefix used by {@link #main(String[])} when no command-line argument is supplied. */
    private static final String DEFAULT_TEST_FILENAME_PREFIX = "/home/mcclosky/data/ACE2005/English/nw/timex2norm/AFP_ENG_20030502.0614";

    /**
     * Reads the file {@code filenamePrefix + ".sgm"}, tokenizes its content with
     * {@link RobustTokenizer} and splits the token stream into sentences.
     *
     * <p>Segmentation rules:
     * <ul>
     *   <li>An SGML token closes any pending sentence and is then emitted as a
     *       one-token sentence.</li>
     *   <li>A sentence-final punctuation token closes the current sentence. If an odd
     *       number of {@code "} tokens has been seen in that sentence and the next token
     *       is {@code "}, that closing quote is pulled into the same sentence.</li>
     *   <li>Tokens remaining after the last boundary form a final sentence rather than
     *       being discarded.</li>
     * </ul>
     *
     * <p>Every token carries the index of the sentence it is placed in.
     *
     * @param filenamePrefix path of the document without the {@code ".sgm"} extension
     * @return the sentences in document order, each a non-empty list of tokens
     * @throws IOException if the input file cannot be read
     * @throws SAXException declared for interface compatibility; not raised by the code in this method
     * @throws ParserConfigurationException declared for interface compatibility; not raised by the code in this method
     */
    public static List<List<AceToken>> tokenizeAndSegmentSentences(String filenamePrefix) throws IOException, SAXException, ParserConfigurationException {
        List<List<AceToken>> sentences = new ArrayList<List<AceToken>>();
        File inputFile = new File(filenamePrefix + ".sgm");
        String input = IOUtils.slurpFile(inputFile);
        RobustTokenizer tokenizer = new RobustTokenizer(input);
        List<RobustTokenizer.WordToken> tokenList = tokenizer.tokenizeToWordTokens();

        List<AceToken> currentSentence = new ArrayList<AceToken>();
        // Number of '"' tokens seen so far in the current sentence; odd means a quote is still open.
        int quoteCount = 0;
        for (int i = 0; i < tokenList.size(); ++i) {
            RobustTokenizer.WordToken token = tokenList.get(i);
            String tokenText = token.getWord();
            boolean sgml = AceToken.isSgml(tokenText);

            // An SGML token never belongs to the preceding sentence: close that one first.
            if (sgml) {
                if (!currentSentence.isEmpty()) {
                    sentences.add(currentSentence);
                    currentSentence = new ArrayList<AceToken>();
                }
                quoteCount = 0;
            }

            // Build the token only now, so that sentences.size() is the index of the
            // sentence the token is actually stored in (it may just have advanced above).
            currentSentence.add(AceSentenceSegmenter.wordTokenToAceToken(token, sentences.size()));
            if (tokenText.equals("\"")) {
                ++quoteCount;
            }

            boolean endOfSentence = sgml;
            if (sentenceFinalPuncSet.contains(tokenText)) {
                endOfSentence = true;
                // Keep a closing quote that directly follows the punctuation with its sentence.
                if (i < tokenList.size() - 1 && quoteCount % 2 == 1 && tokenList.get(i + 1).getWord().equals("\"")) {
                    currentSentence.add(AceSentenceSegmenter.wordTokenToAceToken(tokenList.get(i + 1), sentences.size()));
                    ++i; // the quote has been consumed
                }
            }

            if (endOfSentence) {
                // currentSentence holds at least the token added above.
                sentences.add(currentSentence);
                currentSentence = new ArrayList<AceToken>();
                quoteCount = 0;
            }
        }

        // Do not lose tokens that follow the last sentence boundary.
        if (!currentSentence.isEmpty()) {
            sentences.add(currentSentence);
        }
        return sentences;
    }

    /**
     * Converts a tokenizer word token into an {@link AceToken}.
     *
     * <p>The word text, the start and end offsets (rendered as decimal strings) and the
     * sentence index are passed through; the four remaining constructor arguments are
     * supplied as empty strings. NOTE(review): the meaning of those four arguments is
     * defined by {@code AceToken} and is not visible here.
     *
     * @param wordToken the tokenizer output to convert
     * @param sentence index of the sentence the token belongs to
     * @return a new {@link AceToken} for {@code wordToken}
     */
    public static AceToken wordTokenToAceToken(RobustTokenizer.WordToken wordToken, int sentence) {
        return new AceToken(wordToken.getWord(), "", "", "", "", Integer.toString(wordToken.getStart()), Integer.toString(wordToken.getEnd()), sentence);
    }

    /**
     * Manual smoke test: segments one document and prints each sentence on its own line.
     *
     * @param args optional; {@code args[0]} is the file prefix (without {@code ".sgm"}) to
     *             process. When absent, a built-in default path is used.
     * @throws IOException if the input file cannot be read
     * @throws SAXException propagated from {@link #tokenizeAndSegmentSentences(String)}
     * @throws ParserConfigurationException propagated from {@link #tokenizeAndSegmentSentences(String)}
     */
    public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
        String testFilename = args != null && args.length > 0 ? args[0] : DEFAULT_TEST_FILENAME_PREFIX;
        List<List<AceToken>> sentences = AceSentenceSegmenter.tokenizeAndSegmentSentences(testFilename);
        for (List<AceToken> sentence : sentences) {
            System.out.println("s: [" + sentence + "]");
        }
    }

    static {
        // Populate the lookup set from the punctuation array.
        for (String punc : sentenceFinalPunc) {
            sentenceFinalPuncSet.add(punc);
        }
    }
}

