#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import os
import shutil
import copy
import time
import codecs
import datetime
import logging
import numpy as np
import argparse

def checkEntity(doc, summ, language):
    """Check that every key pattern found in a summary also occurs in its document.

    Key patterns are extracted from ``summ`` with the regular expression
    returned by ``zhEntityPattern()`` when ``language`` is ``'zh'`` and by
    ``enEntityPattern()`` for any other value. Each extracted string is then
    looked up in ``doc`` with a plain substring test. When at least one is
    absent, the document, the summary and the absent strings are printed.

    Args:
        doc: Source document text.
        summ: Summary text to verify against ``doc``.
        language: ``'zh'`` selects the Chinese patterns; anything else
            selects the English ones.

    Returns:
        A pair ``(has_patterns, has_missing)`` of ints, each 0 or 1:
        ``has_patterns`` is 1 when ``summ`` contains at least one key
        pattern, ``has_missing`` is 1 when at least one of them is not a
        substring of ``doc``.
    """
    if language == 'zh':
        patterns = zhEntityPattern()
    else:
        patterns = enEntityPattern()
    parts = re.findall(patterns, summ)
    missing = [p for p in parts if p not in doc]
    has_patterns = int(bool(parts))
    if not missing:
        return has_patterns, 0

    print("[Sample]\t" + doc)
    # Print the summary that was passed in; do not rely on a global set by
    # the calling script.
    print("[Reference]\t" + summ)
    print("[Missing]\t" + " ".join(missing))
    print("\n")
    return has_patterns, 1
    

################# tools #################

def confir(s):
    """Return ``s`` with every character whose code point is in 0..31 removed."""
    # Mapping a code point to None tells str.translate to delete it.
    strip_table = {code: None for code in range(32)}
    return s.translate(strip_table)

def zhEntityPattern():
    """Build the regular expression for key patterns in Chinese text.

    The expression is an alternation of three groups of alternatives:
    digit runs followed by 年, 月 or 日; weekday names written as 星期 or 周
    followed by one of 一二三四五六日; and titles enclosed in 《 》.

    Returns:
        The pattern as a string. It contains no capture groups, so
        ``re.findall`` yields the full matched substrings.
    """
    date = "一二三四五六日"
    times_pattern = r"[0-9]+年|[0-9]+月|[0-9]+日"
    week_pattern = r"星期[%s]|周[%s]" % (date, date)
    # The title body excludes the closing mark so a match ends at the first
    # 》. A greedy \S+ would swallow everything between the first 《 and the
    # last 》 of an unspaced sentence, merging several titles into one.
    book_pattern = r"《[^\s》]+》"
    return "|".join((times_pattern, week_pattern, book_pattern))

def enEntityPattern():
    """Build the regular expression for key patterns in English text.

    Two alternatives are combined: a general numeric token (a digit,
    optionally followed by digits, ``.``, ``,`` or ``%``, ending in a digit
    or ``%``) and a four-digit year starting with 1 or 2.

    Returns:
        The pattern as a string. It contains no capture groups, so
        ``re.findall`` yields the full matched substrings.
    """
    times_pattern = r"[1-2][0-9]{3}"
    num_pattern = r"[0-9][0-9\.,%]*[0-9%]"
    # Regex alternation is ordered, so the numeric token is tried first.
    # With the year first, "12345" was cut to "1234" and "2500.5" to "2500",
    # and the truncated prefix could still be found in a document holding a
    # different number. Any standalone year still matches via num_pattern.
    return num_pattern + "|" + times_pattern

################# main #################

if __name__ == "__main__":
    # -i is used as a path prefix: documents are read from "<i>.doc" and
    # summaries from "<i>.sum", one example per line in each file.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', type=str, default="candidate.txt", help='candidate file')
    parser.add_argument('-l', type=str, default="en", help='language')
    args = parser.parse_args()
    print(args)

    # Context managers close both files even if reading fails.
    with open(args.i + ".doc", encoding="utf-8") as doc_file:
        # U+2028 and carriage returns are removed before splitting on "\n".
        context = doc_file.read().replace(u'\u2028', '').replace('\r', '').strip()
    samples = [line.strip() for line in context.split("\n")]

    with codecs.open(args.i + ".sum", encoding="utf-8") as sum_file:
        references = [line.strip() for line in sum_file]

    # Raise explicitly instead of asserting: asserts are stripped under -O,
    # and zip() below would then silently drop the unmatched lines.
    if len(samples) != len(references):
        raise ValueError("%d:%d" % (len(samples), len(references)))

    missing = 0
    found = 0
    for sample, refer in zip(samples, references):
        f, m = checkEntity(sample, refer, args.l)
        found += f
        missing += m

    print("\n\nTotal Missing %d Examples, %d has key patterns" % (missing, found))



