# import nltk
# nltk.download()

# def tag_phrase(phrase):
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk import RegexpParser
from nltk.stem.porter import *

# Sample phrases used as input by this script.
phrases = [
    "out of print",
    "out of print material",
]

# Chunk-grammar rule for a noun phrase (NP).
# This is a deliberately simple rule; real-world use may need richer ones.
grammar = (
    "NP: {"
    "<DT>?"      # optional determiner
    "<JJ.*>*"    # any number of adjective tags
    "<NN.*>+"    # one or more noun tags
    "}"
)

# Build a regular-expression chunk parser from the noun-phrase grammar.
# NOTE(review): in the visible part of this file `cp` is only referenced by
# the commented-out loop; the active loop parses with `phrase_parser`.
cp = RegexpParser(grammar)

# for phrase in phrases:
#     # part-of-speech tagging
#     tokens = word_tokenize(phrase)
#     tagged = pos_tag(tokens)

#     # apply the chunk grammar
#     result = cp.parse(tagged)
#     print(result)

#     # check whether the phrase contains a noun-phrase chunk
#     is_np = any(isinstance(subtree, nltk.Tree) and subtree.label() == 'NP' for subtree in result)
    
#     print(f"Phrase: '{phrase}' - Is noun phrase: {is_np}")


# Tag alternation that may open a PHRASE chunk. The continuation group of the
# rule accepts the same tags plus CC and RP.
_PHRASE_HEAD_TAGS = "IN|CD|DT|FW|GW|AFX|POS|HYPH|LS|ADD|:|NN.*|VB.*|JJ.*|RB.*"

# Chunk grammar: one or more head tags followed by any number of
# continuation tags.
PHRASE_GRAMMAR = (
    "\n    PHRASE: {<" + _PHRASE_HEAD_TAGS + ">+"
    "<CC|RP|" + _PHRASE_HEAD_TAGS + ">*}\n"
)

# The sets below hold tag prefixes of at most two characters; they are
# compared against the first two characters of a POS tag.

# Independent tags.
indep_pos_set = {
    "CD",  # cardinal digit
    "FW",  # foreign word
    "GW",  # additional word
    "NN",  # noun
    "VB",  # verb
    "JJ",  # adjective
    "RB",  # adverb
    "AD",  # ADD: email (for <digit>)
}

# A phrase can start with these but not end with them.
end_dep_pos_set = {
    "DT",  # determiner
    "AF",  # AFX: affix
    "LS",  # list item marker
}

# A phrase can neither start nor end with these.
dep_pos_set = {
    "CC",  # coordinating conjunction
    "PO",  # POS: possessive
    "HY",  # HYPH: hyphen
    ":",
    "IN",  # subordinating conjunction or preposition
}

# A phrase cannot start with these but may end with them.
start_dep_pos_set = {
    "RP",  # particle adverb (e.g., put it "back")
}

# Chunk parser for the PHRASE grammar and the stemmer used by the loop below.
phrase_parser = nltk.RegexpParser(PHRASE_GRAMMAR)
stemmer = PorterStemmer()

# For each sample phrase: tokenize, POS-tag and chunk it, then walk the tagged
# leaves, try to line each word up with a span of source tokens, and record
# stemmed unigrams as candidate kp spans.
#
# NOTE(review): `j`, `not_aligned`, `src_spans`, `src_tokens`, `tokenizer` and
# `candidate_kp_spans` are read below but never assigned anywhere in the
# visible part of this file. Unless something outside this view defines them,
# the first leaf that passes the tag filter raises NameError at `j+k`.
# Presumably this loop was lifted from a function that initialised them
# (e.g. j = 0, not_aligned = False, a defaultdict(list)) -- confirm and
# restore that setup before running.
for phrase in phrases:
    result = phrase_parser.parse(pos_tag(word_tokenize(phrase)))
    # k is the position of the (word, tag) leaf within the chunk tree.
    for k, (w1, t1) in enumerate(result.leaves()):
        # Skip leaves whose two-character tag prefix is in neither
        # indep_pos_set nor end_dep_pos_set.
        if t1[:2] not in indep_pos_set and t1[:2] not in end_dep_pos_set:
            continue

        # Lower-case the word and replace non-breaking spaces with plain ones.
        word = w1.lower().replace(u'\xa0', u' ')
        span = stemmer.stem(word)
        orig_span = word
        
        # backtrack pointer j in case we point to wrong idx
        # this happens b/c phrase parser parses more words than word tokenizer does
        while j+k >= len(src_spans):
            j -= 1
        # src_spans[j+k] is unpacked as a (start, end) pair that slices
        # src_tokens; presumably `tokenizer` is a subword tokenizer exposing
        # convert_tokens_to_string -- TODO confirm.
        start, end = src_spans[j+k]
        curr_span = tokenizer.convert_tokens_to_string(src_tokens[start:end]).strip().lower()
        st_curr_span = stemmer.stem(curr_span)
        # Keep stepping j backwards while neither the stemmed nor the raw
        # forms contain one another (substring test in both directions).
        while (span not in st_curr_span and st_curr_span not in span and
            word not in curr_span and curr_span not in word):
            j -= 1
            if j+k < 0:
                # Ran off the front of src_spans without finding a match.
                not_aligned = True
                break
            start, end = src_spans[j+k]
            curr_span = tokenizer.convert_tokens_to_string(src_tokens[start:end]).strip().lower()
            st_curr_span = stemmer.stem(curr_span)

        # Give up on the remaining leaves of this phrase once alignment fails.
        # NOTE(review): not_aligned is not reset in the visible code.
        if not_aligned:
            break

        # count the number of hyphens to correct ngram length for hyphenated phrases
        # NOTE(review): num_hyphens and orig_span are assigned but not read in
        # the visible portion of the loop.
        num_hyphens = 0

        # independent unigrams can be candidate kps (except subordinating conjunction)
        if t1[:2] in indep_pos_set:
            num_hyphens += word.count("-")
            # Stem each hyphen-separated part and join the stems with spaces.
            _word = word.replace("-", " ")
            span = " ".join([stemmer.stem(w) for w in _word.strip().split()])
            orig_span = word
            # NOTE(review): "IN" is not a member of indep_pos_set, so inside
            # this branch the check below is always true.
            if t1[:2] != "IN":  # sub conj itself cannot be a candidate
                candidate_kp_spans[span].append((start, end))