import argparse
import os
import ipdb
from tqdm import tqdm
import stanza
import time

# Ordered (old, new) substitutions applied to every input line.
# Order matters: two-character quote sequences must be rewritten before their
# single-character counterparts, and doubled HTML apostrophe entities before
# the single entity, otherwise the longer patterns would never match.
_QUOTE_REPLACEMENTS = (
    ("``", '"'),
    ("''", '"'),
    ('“', '"'),
    ('”', '"'),
    ('‘‘', '"'),
    ('’’', '"'),
    ('`', "'"),
    ('‘', "'"),
    ('’', "'"),
    # Embedded newlines would be confused with the "\n\n" sentence separator
    # used when the lines are joined for the tokenizer.
    ("\n", ""),
    ("&#39;&#39;", '"'),
    ("&#39; &#39;", '"'),
    ("&#39;", "'"),
    # NOTE(review): "&quot;" is the HTML entity for a double quote, yet it is
    # mapped to a single quote here. Kept as-is to preserve existing output;
    # confirm whether '"' was intended.
    ("&quot;", "'"),
)


def normalize_sentence(line):
    """Return *line* stripped of surrounding whitespace with quotes normalized.

    Curly quotes, backticks, LaTeX-style double quotes and a few HTML quote
    entities are rewritten to plain ASCII ``"`` / ``'`` by applying
    ``_QUOTE_REPLACEMENTS`` in order.

    Args:
        line: One raw line of input text (may include a trailing newline).

    Returns:
        The cleaned line; an empty string if the line was blank.
    """
    sent = line.strip()
    for old, new in _QUOTE_REPLACEMENTS:
        sent = sent.replace(old, new)
    return sent


def main():
    """Clean an input file and write one space-joined tokenized line per input line.

    Reads ``--fp``, normalizes quotes on every line, tokenizes all non-empty
    lines with a Stanza pipeline for ``--lang`` and writes the tokens of each
    line, separated by single spaces, to ``--out``. Blank input lines are
    written back as blank output lines so input and output stay line-aligned.

    Raises:
        RuntimeError: If the tokenizer returns a different number of
            sentences than non-empty input lines. Nothing is written to
            ``--out`` in that case.
    """
    parser = argparse.ArgumentParser('clean input file')
    parser.add_argument('--fp', type=str, required=True, help='input file')
    parser.add_argument('--lang', type=str, required=True,
                        help='language code passed to the stanza pipeline')
    parser.add_argument('--out', type=str, required=True, help='output file')
    args = parser.parse_args()

    # Only token text is consumed below, so restrict the pipeline to the
    # tokenizer instead of loading every default processor.
    # tokenize_no_ssplit=True makes "\n\n" the sentence boundary, i.e. each
    # input line is treated as exactly one sentence.
    nlp = stanza.Pipeline(
        lang=args.lang,
        processors='tokenize',
        tokenize_no_ssplit=True,
        tokenize_batch_size=2048,
    )

    # Explicit UTF-8: the normalization handles non-ASCII quote characters,
    # so decoding must not depend on the platform's locale encoding.
    with open(args.fp, 'r', encoding='utf-8') as f1:
        sentences = [normalize_sentence(line) for line in tqdm(f1.readlines())]

    # Blank lines are kept out of the tokenizer input: they would add extra
    # newlines to the "\n\n"-joined text and break the one-sentence-per-line
    # mapping.
    non_empty = [sent for sent in sentences if sent]

    start_time = time.time()
    tokenized = []
    if non_empty:
        doc = nlp("\n\n".join(non_empty))
        tokenized = [
            " ".join(token.text.strip() for token in sentence.tokens)
            for sentence in doc.sentences
        ]

    # Validate before touching the output file so a mismatch never leaves a
    # misaligned file behind. An explicit raise (unlike assert) survives -O.
    if len(tokenized) != len(non_empty):
        raise RuntimeError(
            'tokenizer returned %d sentences for %d non-empty input lines'
            % (len(tokenized), len(non_empty))
        )

    remaining = iter(tokenized)
    with open(args.out, 'w', encoding='utf-8') as f2:
        f2.writelines(
            (next(remaining) if sent else '') + '\n' for sent in sentences
        )
    end_time = time.time()
    print('time taken in sec ', end_time-start_time)


if __name__ == '__main__':
    main()
