import csv
import re
import nlpaug.augmenter.word as naw


# Read the first tab-separated column of every row of the labeled TSV file.
# The code below treats each entry as "<label> <utterance>", split on the
# first space.
data = []
# newline="" is what the csv module documents for files handed to csv.reader.
with open("data-augmentation-for-trec/data/labeled.tsv", newline="") as file:
    tsv_file = csv.reader(file, delimiter="\t")
    for line in tsv_file:
        # csv.reader yields an empty list for a blank line; indexing it would
        # raise IndexError, so blank rows are skipped.
        if line:
            data.append(line[0])
data = data[1:]  # drop the first (header) row

# Each element is the result of splitting on the first space only:
# [label, utterance], or just [label] when the entry contains no space.
split_data = [entry.split(" ", 1) for entry in data]

# Only show a sample when there is one; an empty dataset must not crash here.
if split_data:
    print("Example of data:", split_data[0])

# Back-translation model pairs, one per pivot language, in the order
# es, fr, de, zh, it, ru. Each entry is
# [English -> pivot model name, pivot -> English model name].
lang = [
    [f'Helsinki-NLP/opus-mt-en-{code}', f'Helsinki-NLP/opus-mt-{code}-en']
    for code in ('es', 'fr', 'de', 'zh', 'it', 'ru')
]



# Single sentence aug
syn_data = []

# Number of back-translated copies generated per sample:
# 9 (low-resource) or 2 (half)
num = 1

# Build each augmenter once instead of constructing a new one for every
# sample and every copy; the construction does not depend on the sample.
# Only the first `num` language pairs are ever used (all of them once
# num >= len(lang)), and nothing is built when there is no data.
augmenters = []
if split_data:
    for from_model, to_model in lang[:max(num, 0)]:
        augmenters.append(
            naw.BackTranslationAug(from_model_name=from_model, to_model_name=to_model)  # cpu
        )

for sample in split_data:
    # An entry with no space after the label has no utterance to augment;
    # it still appears in the output through `data` below.
    if len(sample) < 2:
        continue
    label, txt = sample

    for j in range(num):
        # Cycle through the language pairs when num exceeds their count.
        aug = augmenters[j % len(augmenters)]
        # NOTE(review): [0] assumes augment() returns a list of strings —
        # confirm against the installed nlpaug version.
        syn_data.append(label + ' ' + aug.augment(txt)[0])

# Augmented samples come first, followed by the original samples.
syn_data = syn_data + data

print("size of train data:", len(split_data))
print("size of augmented data:", len(syn_data))

# Write the header followed by one "<label> <utterance>" record per line.
# Records are written verbatim (no quoting or escaping). Plain line writes
# replace the csv.writer configured with delimiter='\n' and quotechar='':
# an empty quotechar raises TypeError on Python 3.11+, and that setup mixed
# '\n' (between records) with '\r\n' (after the header and the last record).
with open("data-augmentation-for-trec/aug_data/labeled.tsv", "w") as file:
    file.write("fine_label utterance\n")
    file.writelines(record + "\n" for record in syn_data)

