import pandas as pd

# Load the paraphrase pairs; the `sentence1` and `sentence2` columns are read
# below.
df = pd.read_csv('paraphrase.csv')

# Build one training line per pair: <s>sentence1</s>>>>><p>sentence2</p>.
# Only newline characters at the two ends of each sentence are stripped.
first_sentence = df.sentence1.str.strip('\n')
second_sentence = df.sentence2.str.strip('\n')
df['combined'] = (
    '<s>' + first_sentence + '</s>' + '>>>>' + '<p>' + second_sentence + '</p>'
)

# Write the lines directly instead of going through `to_csv`:
#   * `to_csv(path)` returns None, so assigning its result back to
#     df['combined'] wiped the column that was just built;
#   * `to_csv` emits a "combined" header line and applies CSV quoting
#     (wrapping/doubling `"`), both of which would end up in the training text.
# Rows where either sentence is missing produce NaN and are skipped.
with open('combined.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(line + '\n' for line in df['combined'].dropna())

from transformers import (
AutoModelWithLMHead,
AutoConfig,
Trainer,
AutoTokenizer,
TextDataset,
DataCollatorForLanguageModeling,
TrainingArguments)

import os

# Fine-tuning configuration.
# `model_name` is the single checkpoint identifier used for BOTH the model and
# its tokenizer, so the two can never drift apart when it is changed.
model_name = "gpt2"
batch_size = 8
# NOTE(review): cache_dir is not passed to any call in this part of the
# script — confirm whether it was meant for from_pretrained()/TextDataset.
cache_dir = "cache"
text_path = 'combined.txt'
epochs = 5

# Keep the checkpoint string and the loaded model under separate names
# (previously `model` was first a str and then rebound to the model object).
model = AutoModelWithLMHead.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# mlm=False: the collator does not apply masked-language-modeling masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training examples are read from `text_path` with a block size of 256.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=text_path,
    block_size=256,
)

# Checkpoints and the final model go under a directory named after the
# training file (its base name is appended to the fixed prefix).
run_output_dir = "gpt2 fine tune/{}".format(os.path.basename(text_path))

# Collect the hyper-parameters first, then build the arguments object in one go.
training_options = {
    "output_dir": run_output_dir,
    "num_train_epochs": epochs,
    "per_device_train_batch_size": batch_size,
    "warmup_steps": 500,
    "save_steps": 2000,
    "logging_steps": 10,
    "prediction_loss_only": True,
}
training_args = TrainingArguments(**training_options)

# Wire the model, training arguments, collator and dataset into a Trainer.
trainer_components = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": train_dataset,
}
trainer = Trainer(**trainer_components)

# Run the fine-tuning loop, then save the resulting model.
trainer.train()
trainer.save_model()
