import csv
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the labeled training entries. Only the first tab-separated column of
# each row is kept; every entry is expected to read "<fine_label> <utterance>".
data = []
with open(
    "data-augmentation-for-trec/data/labeled.tsv",
    newline="",  # let the csv module handle line endings itself
    encoding="utf-8",
) as file:
    tsv_file = csv.reader(file, delimiter="\t")
    next(tsv_file, None)  # the first row is the header, not data
    for line in tsv_file:
        # Blank lines are parsed as empty rows; indexing them would fail.
        if line:
            data.append(line[0])

# Separate each entry at its first space into [fine_label, utterance].
split_data = [entry.split(" ", 1) for entry in data]

# Show one sample as a sanity check; skipped when the file holds no data rows.
if split_data:
    print("Example of data:", split_data[0])

# Checkpoint identifier shared by the tokenizer and the seq2seq model, so the
# two are always loaded from the same source.
_PARAPHRASE_CHECKPOINT = "Vamsi/T5_Paraphrase_Paws"

tokenizer = AutoTokenizer.from_pretrained(_PARAPHRASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_PARAPHRASE_CHECKPOINT)


# Single-sentence augmentation: every utterance is paraphrased by the model
# and each generated variant is stored with the fine label of its source row.
syn_data = []

# Number of paraphrases sampled per input sentence.
# 9 (low-resource) or 2 (half)
num = 1

for pair in split_data:
    # An entry without a space splits into a single element, so there is no
    # utterance to paraphrase; the original entry is still kept via `data`.
    if len(pair) < 2:
        continue
    label, txt = pair

    # Prompt format: task prefix, the utterance, then an explicit </s> marker.
    text = "paraphrase: " + txt + " </s>"

    # Only one sequence is encoded per call, so no padding is requested
    # (the deprecated pad_to_max_length flag padded it to the maximum length).
    encoding = tokenizer(text, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]

    # Sampling (top-k / top-p) is used so repeated runs yield varied wording.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=256,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=num,
    )

    for output in outputs:
        new_txt = tokenizer.decode(
            output,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        syn_data.append(label + " " + new_txt)

# The final set is the generated paraphrases followed by the original entries.
syn_data = syn_data + data

print("size of train data:", len(split_data))
print("size of augmented data:", len(syn_data))

# Write the augmented set: a header line followed by one
# "<fine_label> <utterance>" entry per line.
# The entries are written verbatim in a single pass. The previous approach
# passed quotechar='' to csv.writer, which Python 3.11+ rejects with a
# TypeError, and produced mixed \r\n / \n line endings.
with open(
    "data-augmentation-for-trec/aug_data/labeled.tsv",
    "w",
    encoding="utf-8",
) as file:
    file.write("fine_label utterance\n")
    file.writelines(row + "\n" for row in syn_data)

