import myutils
from transformers import AutoTokenizer

# Tokenizer for the 'bert-base-multilingual-cased' checkpoint, created once at
# module level and reused for every dataset in the analysis loop below, where
# it re-tokenizes gold sentences to check for unknown-token output.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

def read_conll(path):
    """Read the second tab-separated column of a CoNLL-style file, per sentence.

    Lines shorter than two characters (e.g. a bare newline) close the current
    sentence and start a new one. Lines whose first character is '#' are
    skipped. From every other line the second tab-separated field is kept
    (in CoNLL-U this is the FORM column).

    Args:
        path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A list of sentences, each a list of strings. Because every blank line
        opens a new sentence, a file ending in a blank line yields a trailing
        empty list, and an empty file yields ``[[]]``.

    Raises:
        IndexError: If a non-comment line of two or more characters has fewer
            than two tab-separated fields.
    """
    data = [[]]
    # The context manager guarantees the handle is closed, and the explicit
    # encoding keeps the result independent of the platform's default locale.
    with open(path, encoding='utf-8') as conll_file:
        for line in conll_file:
            if len(line) < 2:
                data.append([])
            elif line[0] != '#':
                data[-1].append(line.strip().split('\t')[1])
    return data

# For each treebank, write every sentence whose predicted tokenization differs
# from the gold one to 'analysis/<dataset>.out'.
for dataset in [
    'UD_Chinese-GSD',
    'UD_Chinese-GSDSimp',
    'UD_Classical_Chinese-Kyoto',
    'UD_Estonian-EWT',
    'UD_Japanese-GSD',
    'UD_Japanese-GSDLUW',
    'UD_Old_East_Slavic-Birchbark',
    'UD_Old_East_Slavic-TOROT',
    'UD_Romanian-Nonstandard',
    'UD_Russian-Taiga',
    'UD_Swedish_Sign_Language-SSLC',
    'UD_Turkish-Penn',
    'UD_Uyghur-UDT',
    'UD_Vietnamese-VTB',
]:
    gold_dir = 'data/ud-treebanks-v2.10.singleToken/' + dataset + '/'
    # Only the second path returned by the helper is used as the gold file
    # (presumably the dev split, judging by the helper's name -- confirm).
    _, gold_path, _ = myutils.getTrainDevTest(gold_dir)
    pred_model_name = 'tok.bert-base-multilingual-cased.' + dataset + '.False.False.2.10'
    model_path = myutils.getModel(pred_model_name)
    # The prediction file sits next to the model checkpoint.
    pred_path = model_path.replace('model.pt', dataset + '.out')

    gold_data = read_conll(gold_path)
    pred_data = read_conll(pred_path)

    # Open the report only after the inputs were read, so a failed lookup does
    # not leave an empty file behind; 'with' closes it even if writing fails,
    # and UTF-8 is required because the sentences are not ASCII.
    with open('analysis/' + dataset + '.out', 'w', encoding='utf-8') as outFile:
        # NOTE: zip() stops at the shorter input, so a sentence-count mismatch
        # between gold and prediction is silently truncated.
        for gold_sent, pred_sent in zip(gold_data, pred_data):
            if gold_sent == pred_sent:
                continue
            # Wordpieces re-joined into words (' ##' marks a continuation).
            encoded = ' '.join(tokenizer.tokenize(' '.join(gold_sent))).replace(' ##', '')
            # Show the tokenizer's view only when it emitted its unknown
            # token; matching the bare substring 'UNK' would also fire on
            # ordinary text that happens to contain those letters.
            if tokenizer.unk_token in encoded:
                outFile.write(encoded + '\n')
            outFile.write(' '.join(gold_sent) + '\n')
            outFile.write(' '.join(pred_sent) + '\n')
            outFile.write('\n')

