"""Parse a sample using stanford stanza tool."""

import stanza
# Shared stanza pipeline for language code 'en', constructed once when this
# module is imported and reused by parse_str_item for every string it parses.
nlp = stanza.Pipeline('en')

def parse_str_item(s, vocab_counter=None):
    """Tokenize one string with the module-level stanza pipeline.

    Surrounding whitespace is stripped from `s` before it is handed to
    `nlp`. The text of every word of every sentence is then stripped and
    lower-cased, and words that end up empty are dropped.

    args:
        s: the raw string to tokenize.
        vocab_counter: optional object exposing `update`; when it is not
            None it is updated with the resulting token list.
    returns:
        the list of normalized tokens, in document order.
    """
    parsed = nlp(s.strip())
    tokens = []
    for sentence in parsed.sentences:
        for word in sentence.words:
            token = word.text.strip().lower()
            if token:
                tokens.append(token)
    if vocab_counter is not None:
        vocab_counter.update(tokens)
    return tokens

def parse_str_list(string_list, vocab_counter=None):
    """Tokenize every string of `string_list` with `parse_str_item`.

    args:
        string_list: iterable of raw strings.
        vocab_counter: forwarded unchanged to `parse_str_item`.
    returns:
        a list with one token list per input string, in input order.
    """
    return [parse_str_item(text, vocab_counter) for text in string_list]

def parse_fielded_list(fielded_list, vocab_counter=None):
    """Tokenize the value of every (attr, value) pair with `parse_str_item`.

    args:
        fielded_list: iterable of two-item (attr, value) pairs.
        vocab_counter: forwarded unchanged to `parse_str_item`.
    returns:
        a list of (attr, value_tokens) tuples in input order; the attr
        element is passed through untouched.
    """
    return [
        (field_name, parse_str_item(field_value, vocab_counter))
        for field_name, field_value in fielded_list
    ]

def parse_sample_dict(sample, vocab_counter=None):
    """Tokenize the text-bearing fields of one sample.

    'table_id', 'sub_sent_id' and 'operations' are copied as-is; 'source'
    goes through `parse_str_list`, 'target' through `parse_str_item` and
    'table_parent' through `parse_fielded_list`.

    args:
        sample: {'table_id', 'sub_sent_id', 'source', 'target', 'table_parent', 'operations'}
        vocab_counter: forwarded unchanged to each of the parsers.
    returns:
        a new dict holding the copied and the tokenized fields.
    """
    # Values are evaluated top to bottom, so the pass-through lookups happen
    # before any parsing and the keys keep their original insertion order.
    return {
        'table_id': sample['table_id'],
        'sub_sent_id': sample['sub_sent_id'],
        'operations': sample['operations'],
        'source': parse_str_list(sample['source'], vocab_counter),
        'target': parse_str_item(sample['target'], vocab_counter),
        'table_parent': parse_fielded_list(
            sample['table_parent'], vocab_counter),
    }


import json

def parse_datafile(infile: str, outfile: str, vocab_counter=None):
    """Parse the in-file dataset, write into the out-file, update the vocab-counter.

    Every non-blank line of `infile` is decoded as one JSON document, run
    through `parse_sample_dict`, and the results are written to `outfile`
    as one JSON document per line. A progress message naming both paths is
    printed at the end.

    args:
        infile: path of the input file, read as UTF-8.
        outfile: path of the output file, written as UTF-8 (opened in 'w'
            mode, so existing content is replaced).
        vocab_counter: forwarded to `parse_sample_dict`; may be None.
    raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # All samples are parsed before the output file is opened, so passing
    # the same path for `infile` and `outfile` does not truncate the input
    # while it is still being read.
    output_instances = []
    with open(infile, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            # Blank lines (e.g. a trailing empty line) are not JSON
            # documents; skip them instead of failing in json.loads.
            if not line:
                continue
            output_instances.append(
                parse_sample_dict(json.loads(line), vocab_counter))

    with open(outfile, 'w', encoding='utf-8') as fw:
        fw.writelines(
            json.dumps(outins) + '\n' for outins in output_instances)

    print(f'from [{infile}] to [{outfile}]')
