import myutils
import os
from transformers import tokenization_utils

# Module-level ScriptFinder instance; getScript() calls its guess_script()
# on a text sample of each treebank.
scriptFinder = myutils.ScriptFinder()

def getScript(path):
    """Guess the script of a treebank from a sample of its training text.

    Args:
        path: Treebank directory, passed on to ``myutils.getTrainDevTest``.

    Returns:
        Whatever ``scriptFinder.guess_script`` returns for the first 100
        lines of the text file whose path is the training-file path with
        ``'conllu'`` replaced by ``'txt'``; ``None`` when no training file
        is reported for the treebank.
    """
    train, _dev, _test = myutils.getTrainDevTest(path)
    if not train:
        # No training split reported (presumably '' for a missing file --
        # confirm against myutils.getTrainDevTest).
        return None
    txt_path = train.replace('conllu', 'txt')
    # Context manager closes the handle; the encoding is pinned so decoding
    # does not depend on the machine's locale (assumes the data is UTF-8).
    with open(txt_path, encoding='utf-8') as handle:
        sample = handle.readlines()[:100]
    # Lines keep their own trailing '\n'; the extra '\n' separator is kept
    # as in the original so the guesser sees the same input as before.
    return scriptFinder.guess_script('\n'.join(sample))
    

def getWhitespaceRatio(path):
    """Return the fraction of whitespace characters in a sample of test text.

    The sample is the first 100 lines of the text file whose path is the
    test-file path with ``'conllu'`` replaced by ``'txt'``. Newlines are
    never counted as whitespace, but they do count towards the total
    number of characters.

    Args:
        path: Treebank directory, passed on to ``myutils.getTrainDevTest``.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the sample is empty; ``None``
        when no test file is reported for the treebank.
    """
    _train, _dev, test = myutils.getTrainDevTest(path)
    if not test:
        # No test split reported (presumably '' for a missing file --
        # confirm against myutils.getTrainDevTest).
        return None
    txt_path = test.replace('conllu', 'txt')
    # Context manager closes the handle; the encoding is pinned so decoding
    # does not depend on the machine's locale (assumes the data is UTF-8).
    with open(txt_path, encoding='utf-8') as handle:
        # NOTE(review): readlines() keeps each line's '\n', so this join
        # doubles the newlines and inflates the denominator. Kept as-is so
        # previously reported numbers stay reproducible.
        text = '\n'.join(handle.readlines()[:100])
    if not text:
        # Empty file: avoid dividing by zero.
        return 0.0
    white = sum(
        1
        for char in text
        if char != '\n' and tokenization_utils._is_whitespace(char)
    )
    return white / len(text)

# Report, for every treebank directory of the last listed UD version, its
# guessed script and the whitespace percentage of its text sample.
for udVersion in myutils.udVersions[-1:]:
    udPath = 'data/ud-treebanks-v' + udVersion + '.singleToken/'
    for UDdir in sorted(os.listdir(udPath)):
        treebank_path = udPath + UDdir
        # Only treebank directories (named "UD...") are of interest.
        if not UDdir.startswith("UD") or not os.path.isdir(treebank_path):
            continue
        script = getScript(treebank_path)
        whitespace_ratio = getWhitespaceRatio(treebank_path)
        if whitespace_ratio is None:
            # getWhitespaceRatio() returns None when there is no test file;
            # print it like a missing script instead of raising TypeError
            # on the multiplication and aborting the whole run.
            ratio_text = str(None)
        else:
            ratio_text = '{:.2f}'.format(whitespace_ratio*100)
        print(UDdir, script, ratio_text)

