import sys
from statistics import mean, median
from collections import Counter, defaultdict

# Corpus directory read with `iter_best_files` further down in this script.
new_path = 'data/ldc2016e114/data/eng'
# Corpus directory read with `iter_best_old_files` further down in this script.
old_path = 'data/ldc2016e27v2/data/eng'

from best.data_iterators import iter_best_files, iter_best_old_files
from best.test_split import test_ids, valid_ids

# Select the documents under `new_path` whose id belongs to the test or the
# validation split.
# NOTE(review): this selection is thrown away by the rebinding of `docs`
# right below, so only the documents under `old_path` are analysed.  It looks
# like a manual toggle between the two corpora -- confirm which is intended.
docs = [
    candidate
    for candidate in iter_best_files(new_path)
    if candidate.doc_id in test_ids or candidate.doc_id in valid_ids
]
docs = list(iter_best_old_files(old_path))

def _target_offset(target):
    """Return the source offset that locates *target*, or None if there is none.

    The lookup tries, in order, ``target.offset``, ``target.trigger.offset``
    and ``target.rel_arg1.entity_mention.offset``.
    """
    if hasattr(target, 'offset'):
        return target.offset
    trigger = getattr(target, 'trigger', None)
    if trigger is not None:
        return trigger.offset
    rel_arg1 = getattr(target, 'rel_arg1', None)
    if rel_arg1 is not None:
        return rel_arg1.entity_mention.offset
    return None


# Counters filled by the loop below:
#   total           -- annotations with a source that are not 'na' / 'none'
#   explicit_author -- those whose source mention directly follows ' author='
#   self_author     -- those explicit-author annotations whose source and
#                      target resolve to the same post id
total = 0
explicit_author = 0
self_author = 0


for doc in docs:
    # Only forum documents are analysed.
    if doc.data_type != 'forum':
        continue

    for opin in doc.evaluator_best.beliefs:
        if opin.source is None:
            continue

        # Annotations exposing `belief_type` are skipped when it is 'na';
        # the others are skipped when their `polarity` is 'none'.
        if hasattr(opin, 'belief_type'):
            if opin.belief_type == 'na':
                continue
        elif opin.polarity == 'none':
            continue

        src_offset = opin.source.offset
        # The eight characters ending one character before the mention must
        # read ' author=' (the skipped character is presumably the opening
        # quote of the attribute value).  The `>= 9` guard keeps the slice
        # from using negative indices, which would silently read from the end
        # of the document instead of from before the mention.
        # A mention preceded by 'orig_author=' is not counted as explicit.
        author = (src_offset >= 9 and
                  doc.source[src_offset - 9:src_offset - 1] == ' author=')

        total += 1
        if not author:
            continue

        explicit_author += 1
        src_id = doc.get_post_id(src_offset)
        assert src_id.startswith('p')

        trg_offset = _target_offset(opin.target)
        if trg_offset is None:
            # Without an offset the target cannot be mapped to a post; report
            # it and move on instead of passing None to the document lookups.
            print("warning: no target offset in", doc.doc_id,
                  "for source at", src_offset, file=sys.stderr)
            continue

        trg_id = doc.get_post_id(trg_offset)
        assert trg_id.startswith('p')

        if src_id == trg_id:
            self_author += 1
        else:
            # Source and target are in different posts: show both authors.
            print(doc.doc_id, src_id, trg_id)
            print("source author ", doc.get_author(src_offset))
            print("target author ", doc.get_author(trg_offset))
            print()

def _percent(part, whole):
    """Return *part* as a percentage of *whole*, or NaN when *whole* is 0."""
    if not whole:
        # Nothing was counted: the ratio is undefined, so report NaN rather
        # than raising ZeroDivisionError at the very end of the run.
        return float('nan')
    return part * 100 / whole


# Summary: raw counts followed by the share of the enclosing count.
print("total", total)
print("explicit author", explicit_author, _percent(explicit_author, total))
print("self author", self_author, _percent(self_author, explicit_author))
