import os
import myutils

def getText(html):
    """Return the text that follows the first '<p>' in html.

    The result stops at the next '<p>' or '</p>', whichever comes first, or at
    the end of the string when neither follows.

    Raises IndexError when html contains no '<p>' at all.
    """
    # Element 1 of the split is everything between the first and second '<p>'.
    afterOpenTag = html.split('<p>')[1]
    # Keep only what precedes the closing tag (the whole piece if there is none).
    text, _, _ = afterOpenTag.partition('</p>')
    return text
# Extract per-treebank scores from a saved copy of the page below and write
# them as "treebank<TAB>score" rows into two files (presumably tokenization
# and LAS scores, going by the output file names).
# https://trankit.readthedocs.io/en/latest/performance.html
# Context managers guarantee that all three files are closed, even when
# getText() or the fixed-offset indexing below raises part-way through.
with open('results/tok-sota-2.5.csv', 'w') as tokFile, \
        open('results/las-sota-2.5.csv', 'w') as lasFile:
    with open('scripts/performance.html') as htmlFile:
        # Only this window of lines of the saved page is scanned.
        data = htmlFile.readlines()[570:4900]
    # A table row starts with one of these prefixes; its first cell is the
    # treebank name (without the 'UD_' prefix).
    rowPrefixes = ('<tr class="row-odd"><td><p>', '<tr class="row-even"><td><p>')
    for lineIdx, line in enumerate(data):
        if line.startswith(rowPrefixes):
            treebank = 'UD_' + getText(line)
            # The two scores are read from fixed line offsets below the row
            # start; this relies on the exact layout of the saved HTML.
            tok = getText(data[lineIdx+2])
            las = getText(data[lineIdx+11])
            tokFile.write(treebank + '\t' + tok + '\n')
            lasFile.write(treebank + '\t' + las + '\n')

# conv maps a treebank's short name (the part of its test file name before
# the first '-') to the name of its directory under rootDir.
conv = {}
rootDir = 'data/ud-treebanks-v2.2.singleToken/'
for udDir in os.listdir(rootDir):
    train, dev, test = myutils.getTrainDevTest(rootDir + udDir)
    if test == '':
        # An empty test path means there is nothing to derive a short name from.
        continue
    # Drop the directory part of the path, then cut at the first '-'.
    shortName = test.rsplit('/', 1)[-1].partition('-')[0]
    conv[shortName] = udDir

def readConllResults(inPath, outPath, nameMap=None):
    """Write the first-ranked score of every treebank in inPath to outPath.

    The input is scanned line by line. A line starting with '<h3 id="' sets
    the current treebank to the value of that id attribute. Every later line
    starting with '  1' contributes its last tab-separated field as the score
    for the current treebank. Such lines are ignored until a heading has been
    seen.

    Each output row has the form "<mapped name>\\t<score>\\n".

    Parameters:
        inPath: path of the saved results page to read.
        outPath: path of the file to (over)write.
        nameMap: mapping from the heading id to the name written to the
            output. Defaults to the module-level ``conv`` dictionary.

    Raises:
        KeyError: if a treebank id found in the input is missing from nameMap.
    """
    if nameMap is None:
        nameMap = conv
    treebank = ''
    # Both files are managed by the with-statement so they are closed even if
    # the name lookup below fails.
    with open(outPath, 'w') as outFile, open(inPath) as inFile:
        for line in inFile:
            if line.startswith('<h3 id="'):
                # The id is the text between the first pair of double quotes.
                treebank = line.split('"')[1]
            elif line.startswith('  1') and treebank != '':
                best_score = line.strip().split('\t')[-1]
                outFile.write(nameMap[treebank] + '\t' + best_score + '\n')
# Convert both saved result pages, in this order. Sources of the pages:
#https://universaldependencies.org/conll18/results-tokens.html
#https://universaldependencies.org/conll18/results-las.html
for conllInPath, conllOutPath in (
    ('scripts/results-tokens.html', 'results/tok-sota-2.2.csv'),
    ('scripts/results-las.html', 'results/las-sota-2.2.csv'),
):
    readConllResults(conllInPath, conllOutPath)
