import jieba

# Switch jieba into paddle mode as soon as this module is loaded.
jieba.enable_paddle()

# Module-level accumulators filled per document by the __main__ script:
# event names, title lines and tab-split content rows.
events, titles, contents = [], [], []


def get_spans(text, language='english'):
    """Break *text* into a list of tokens.

    Args:
        text: The string to tokenize.
        language: ``'chinese'`` segments the text with ``jieba.cut``; any
            other value falls back to whitespace splitting (``str.split``).

    Returns:
        The tokens as a list.
    """
    # Everything that is not explicitly Chinese is split on whitespace.
    if language != 'chinese':
        return text.split()
    return list(jieba.cut(text))


def data_preprocess(file_name):
    """Extract opinion segments from a tagged corpus file.

    The file is read as UTF-8 and cut into passages on a run of 54 hyphens.
    Within a passage, the text before the first tab of the first line is the
    event, the second line is skipped, and each remaining line is expected to
    hold three tab-separated fields: sentence, label and aspect. Rows that do
    not yield exactly three non-empty fields are ignored.

    Label handling:
        * ``'B'`` starts a new opinion, first closing the open one (if any).
        * ``'I'`` extends the open opinion, or starts one when none is open.
        * ``'O'`` closes the open opinion (if any).
        * Any other label leaves the current state untouched.

    Args:
        file_name: Path of the corpus file to read.

    Returns:
        A list of ``[sentences, event, aspect]`` entries, where ``sentences``
        is the list of sentence strings forming one opinion and ``aspect`` is
        the (stripped) aspect of the last sentence added to that opinion.

    Side effects:
        Prints the file name and the number of extracted opinions,
        tab-separated.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        data = f.read()

    insts = []
    for passage in data.split('-' * 54):
        psg_lines = passage.strip().split('\n')
        event = psg_lines[0].split('\t')[0]
        opinion = []
        # Aspect of the most recent sentence appended to ``opinion``. Tracking
        # it here avoids re-parsing the previous raw line, which may be a
        # malformed row that was skipped and has no aspect column at all.
        opinion_aspect = None
        for line in psg_lines[2:]:
            raw_fields = line.strip().split('\t')
            if len(raw_fields) != 3:
                continue
            fields = [ele.strip() for ele in raw_fields if ele]
            if len(fields) != 3:
                # Three columns but an empty one (e.g. "a\t\tb"): skip the row
                # instead of failing on the unpacking below.
                continue
            sent, label, aspect = fields

            if label == 'I' or (label == 'B' and not opinion):
                opinion.append(sent)
                opinion_aspect = aspect
            elif label == 'B':
                # A new opinion begins while another is open: flush the old one.
                insts.append([opinion, event, opinion_aspect])
                opinion = [sent]
                opinion_aspect = aspect
            elif label == 'O' and opinion:
                insts.append([opinion, event, opinion_aspect])
                opinion = []
        # Flush an opinion that runs up to the end of the passage.
        if opinion:
            insts.append([opinion, event, opinion_aspect])
    print(file_name, 'Num of opinions:', len(insts), sep='\t')
    return insts


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


if __name__ == '__main__':
    # Data statistics: prints document, event and opinion counts and averages
    # for the corpus file below.
    file_name = '../data/ECOB-ZH.txt'
    # NOTE(review): the path looks like a Chinese corpus, yet 'english' makes
    # get_spans split event names on whitespace -- confirm this is intended.
    lang = 'english'

    # get docs: events, titles, contents
    with open(file_name, 'r', encoding='utf-8') as f:
        data = f.read()
    # Passages are separated by a run of 54 hyphens; drop blank passages.
    documents = [doc.strip() for doc in data.split('-' * 54) if doc.strip()]
    print('document num: ', len(documents))
    for document in documents:
        lines = document.split('\n')
        events.append(lines[0].strip().split('\t')[0])
        # A document consisting of a single line has no title row.
        titles.append(lines[1].strip() if len(lines) > 1 else '')
        contents.append([line.strip().split('\t') for line in lines[2:]])

    # get opinions: opinion_text, event, aspect
    opinions = data_preprocess(file_name)

    # basic data
    print('Num of events: ', len(set(events)))
    event_lens = [len(get_spans(event, language=lang)) for event in events]
    print('Len of events: ', _ratio(sum(event_lens), len(events)))

    # doc data (a document's length is its number of content rows)
    doc_lens = [len(content) for content in contents]
    print('doc avg len: ', _ratio(sum(doc_lens), len(doc_lens)))
    print('doc avg opinion: ', _ratio(len(opinions), len(doc_lens)))

    # opinion data (an opinion's length is its number of sentences)
    opinion_lens = [len(opinion[0]) for opinion in opinions]
    total_opinion_len = sum(opinion_lens)
    print('opinion avg len: ', _ratio(total_opinion_len, len(opinion_lens)), total_opinion_len)
    print('opinion ratio: ', _ratio(total_opinion_len, sum(doc_lens)))