import numpy as np
from transformers import pipeline, set_seed
import nlpaug.augmenter.word as naw

# Read the archive a single time and close the file handle afterwards.
# The 'store' entry is a 0-d object array wrapping a dict of splits, hence the
# `[()]` unwrap before keying into it.
# NOTE(review): allow_pickle=True runs the pickle machinery on load -- only use
# this with .npz files from a trusted source.
with np.load('data-augmentation-for-sts/data/low_data.npz', allow_pickle=True) as archive:
  splits = archive['store'][()]

train_data = splits['train']
dev_data = splits['dev']
test_data = splits['test']
print("Example of data:", train_data[0])


# Back-translation model pairs, one per pivot language.  Each entry is
# [English -> pivot model, pivot -> English model].
lang = [
    [f'Helsinki-NLP/opus-mt-en-{pivot}', f'Helsinki-NLP/opus-mt-{pivot}-en']
    for pivot in ('es', 'fr', 'de', 'zh', 'it', 'ru')
]
# Single sentence aug - back-translation
# Only the first field of each training example is back-translated; the other
# two fields are copied unchanged into the synthetic example.
syn_data = []

# Number of augmented copies generated per training example.
num = 1

# Build the augmenters once up front instead of once per training example.
# Copy j uses language pair j (wrapping around the table when num exceeds its
# length), so at most len(lang) distinct augmenters are needed.
augmenters = [
  naw.BackTranslationAug(from_model_name=src_model, to_model_name=back_model)  # cpu
  for src_model, back_model in lang[:num]
]

for example in train_data:
  txt = example[0]

  for j in range(num):
    aug = augmenters[j % len(augmenters)]
    # NOTE(review): `[0]` assumes augment() returns a list of strings; if the
    # installed nlpaug returns a bare str this keeps only its first
    # character -- confirm against the installed version.
    syn_data.append([aug.augment(txt)[0], example[1], example[2]])



# Put the synthetic examples first, followed by the original training examples.
syn_data = syn_data + train_data
print("size of train data:", len(train_data))
print("size of augmented data:", len(syn_data))

# Only the training split is enlarged; dev and test are passed through as-is.
store = {
    'train': syn_data,
    'dev': dev_data,
    'test': test_data,
}

# The dict is saved under the 'store' key; numpy wraps it in a 0-d object
# array, so readers need allow_pickle=True and a `[()]` unwrap.
np.savez('data-augmentation-for-sts/low_data_back.npz', store=store)

