import argparse
import conllu
import json
from glob import glob
from numpy.lib import index_tricks
from tqdm import tqdm

def collect_deprels(paths):
    """Collect the dependency relation labels used in CoNLL-U files.

    Args:
        paths: iterable of CoNLL-U file paths to scan.

    Returns:
        The set of distinct ``deprel`` values found on tokens, excluding
        the ``'_'`` placeholder.
    """
    labels = set()
    for path in paths:
        # CoNLL-U files are UTF-8 by specification, so decode explicitly
        # instead of relying on the platform default; the ``with`` block
        # guarantees the handle is closed once the file has been parsed.
        with open(path, encoding='utf-8') as fin:
            for sent in conllu.parse_incr(fin):
                for token in sent:
                    if token['deprel'] != '_':
                        labels.add(token['deprel'])
    return labels


def index_labels(labels):
    """Map each label to its position in the sorted order of ``labels``."""
    return {label: i for i, label in enumerate(sorted(labels))}


def strip_subtypes(labels):
    """Reduce labels to the part before the first ``':'`` (e.g. ``acl:relcl`` -> ``acl``)."""
    return {label.split(':')[0] for label in labels}


def dump_label_dict(label_dict, path):
    """Write ``label_dict`` to ``path`` as indented JSON followed by a newline."""
    with open(path, 'w', encoding='utf-8') as fout:
        json.dump(label_dict, fout, indent=4)
        fout.write('\n')


def main(argv=None):
    """Build the full and simplified deprel label maps for a treebank.

    Scans every file matched by ``--input-template`` and writes
    ``<output-prefix>.full.json`` (all labels) and
    ``<output-prefix>.simp.json`` (labels with subtypes stripped).

    Args:
        argv: command-line arguments; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    # The value is passed to glob(), so it is a glob pattern rather than a
    # regular expression.
    parser.add_argument('--input-template', '-i', type=str, required=True, help='glob pattern matching all files in the treebank')
    parser.add_argument('--output-prefix', '-o', type=str, required=True, help='output prefix for metadata json file')
    args = parser.parse_args(argv)

    label_set = collect_deprels(tqdm(glob(args.input_template)))
    dump_label_dict(index_labels(label_set), f'{args.output_prefix}.full.json')
    dump_label_dict(index_labels(strip_subtypes(label_set)), f'{args.output_prefix}.simp.json')


if __name__ == '__main__':
    main()
    