import myutils
import os
# Version suffix used in the treebank directory and result file names.
v = '2.10'
scriptFinder = myutils.ScriptFinder()

# How many treebanks with a training split were seen per guessed script and
# per language name. Only key membership is consulted later in this file.
scripts = {}
langs = {}
udPath = 'data/ud-treebanks-v' + v + '.singleToken/'
for UDdir in sorted(os.listdir(udPath)):
    treebankDir = os.path.join(udPath, UDdir)
    if not UDdir.startswith("UD") or not os.path.isdir(treebankDir):
        continue
    train, dev, test = myutils.getTrainDevTest(treebankDir)
    if train == '':
        # An empty train path means this treebank contributes nothing to the
        # "seen" statistics.
        continue

    # Guess the script from the first 100 lines of the file obtained by
    # replacing 'conllu' with 'txt' in the train path. The context manager
    # closes the handle right away instead of leaking one per treebank, and the
    # explicit encoding keeps the guess independent of the platform locale.
    train = train.replace('conllu', 'txt')
    with open(train, encoding='utf-8') as trainFile:
        sample = '\n'.join(trainFile.readlines()[:100])
    script = scriptFinder.guess_script(sample)
    scripts[script] = scripts.get(script, 0) + 1

    # 'UD_<Language>-<Name>' -> '<Language>'
    lang = UDdir.split('-')[0].replace('UD_', '')
    langs[lang] = langs.get(lang, 0) + 1

# treebank name -> tokenization score (second tab-separated column of the tok file).
results = {}
# treebank name -> (language seen in training, script seen in training).
seenInTraining = {}
categories = ['all', 'in-lang', 'in-script', 'new-script']
tokScores = {category: [] for category in categories}
lasScores = {category: [] for category in categories}

tokPath = 'results/cross-tok.multi-ling-' + v + '.csv'
lasPath = 'results/cross-las.multi-ling-' + v + '.csv'
# The two result files are read in lockstep: line i of each file is taken to
# describe the same treebank (the treebank name is read from the tok file only).
with open(tokPath, encoding='utf-8') as tokFile, open(lasPath, encoding='utf-8') as lasFile:
    for tokLine, lasLine in zip(tokFile, lasFile):
        tok = tokLine.strip().split('\t')
        treebank = tok[0]
        tokScore = float(tok[1])
        lasScore = float(lasLine.strip().split('\t')[1])
        results[treebank] = tokScore

        train, dev, test = myutils.getTrainDevTest('data/ud-treebanks-v' + v + '.singleToken/' + treebank)
        # Guess the script from the first 100 lines of the file obtained by
        # replacing 'conllu' with 'txt' in the test path; close the handle
        # immediately rather than leaking one per treebank.
        test = test.replace('conllu', 'txt')
        with open(test, encoding='utf-8') as testFile:
            sample = '\n'.join(testFile.readlines()[:100])
        script = scriptFinder.guess_script(sample)
        lang = treebank.split('-')[0].replace('UD_', '')

        langSeen = lang in langs
        scriptSeen = script in scripts
        seenInTraining[treebank] = (langSeen, scriptSeen)

        # Categories overlap: a treebank may count for both 'in-lang' and
        # 'in-script'. 'new-script' requires that neither the language nor the
        # script was seen in training.
        matching = ['all']
        if langSeen:
            matching.append('in-lang')
        if scriptSeen:
            matching.append('in-script')
        if not langSeen and not scriptSeen:
            matching.append('new-script')
        for category in matching:
            tokScores[category].append(tokScore)
            lasScores[category].append(lasScore)


# One LaTeX table row per category: name & avg tok & avg las & #treebanks.
for category in categories:
    tokList = tokScores[category]
    lasList = lasScores[category]
    # A category can be empty (e.g. no treebank with an unseen script); print
    # a dash instead of dividing by zero.
    avgTok = '{:.2f}'.format(sum(tokList) / len(tokList)) if tokList else '-'
    avgLas = '{:.2f}'.format(sum(lasList) / len(lasList)) if lasList else '-'
    print(category + ' & ' + avgTok + ' & ' + avgLas + ' & ' + str(len(tokList)) + ' \\\\')


# Per-treebank overview, lowest tokenization score first, with flags telling
# whether the treebank's language and script occurred in the training data.
# (The flags are looked up per treebank; testing the raw treebank name against
# the language/script tables does not match how those tables are keyed.)
for treebank, score in sorted(results.items(), key=lambda x: x[1]):
    langSeen, scriptSeen = seenInTraining[treebank]
    print(score, treebank, langSeen, scriptSeen)


