import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Read the archive once and pull all three splits out of it, instead of
# re-opening and re-unpickling the same file for every split.  The `with`
# block closes the underlying file handle as soon as the payload is in memory.
#
# allow_pickle=True is needed because the splits are read from a pickled
# Python object stored under the 'store' key.  Unpickling can execute
# arbitrary code, so only point this at an archive you trust.
with np.load('data-augmentation-for-sts/data/low_data.npz', allow_pickle=True) as _archive:
    # `[()]` unwraps the object held by the stored array so it can be indexed
    # by split name.
    _splits = _archive['store'][()]

train_data = _splits['train']
dev_data = _splits['dev']
test_data = _splits['test']
print("Example of data:", train_data[0])

# Checkpoint id shared by the model and the tokenizer loaded below.
PARAPHRASE_CHECKPOINT = "Vamsi/T5_Paraphrase_Paws"

model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASE_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(PARAPHRASE_CHECKPOINT)


# Single sentence aug - paraphrase
# For every training example, sample `num` paraphrases of its first field and
# pair each paraphrase with the example's second and third fields unchanged.
syn_data = []

# Number of paraphrases sampled per training example.
num = 1

for example in train_data:
    # NOTE(review): the explicit " </s>" is kept exactly as in the original
    # prompt; recent T5 tokenizers may append the end-of-sequence token on
    # their own - confirm whether it is still required.
    text = "paraphrase: " + example[0] + " </s>"

    # Calling the tokenizer directly replaces the deprecated
    # `encode_plus(..., pad_to_max_length=True)`.  Only one sequence is encoded
    # per call, so there is nothing to pad it against.
    encoding = tokenizer(text, return_tensors="pt")

    outputs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        max_length=256,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=num,
    )

    for output in outputs:
        new_txt = tokenizer.decode(
            output,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        syn_data.append([new_txt, example[1], example[2]])


# Augmented training set: generated paraphrases first, then the originals.
# list() keeps this a plain concatenation whatever sequence type train_data
# was loaded as (a tuple would raise, an ndarray would trigger element-wise
# `+`); for a list it is just a shallow copy.
syn_data = syn_data + list(train_data)
print("size of train data:", len(train_data))
print("size of augmented data:", len(syn_data))

# Same layout as the input archive: one dict keyed by split name.  Only the
# training split is augmented; dev and test are passed through untouched.
store = {
    'train': syn_data,
    'dev': dev_data,
    'test': test_data,
}

# NOTE(review): this writes next to the `data/` directory the input was read
# from, not inside it - confirm that is the intended location.
np.savez('data-augmentation-for-sts/low_data_para.npz', store=store)