import json
import random

# Locations of the standard positive / negative training sets (JSON files).
spt_positive_train_dataset_path = '/home/hadoop-aipnlp/nazarite/reverse_curse/reversal_curse-main/data/final_data/author_work_final/standard_positive_train_dataset.json'
spt_negative_train_dataset_path = '/home/hadoop-aipnlp/nazarite/reverse_curse/reversal_curse-main/data/final_data/author_work_final/standard_negative_train_dataset.json'

# Load both datasets inside context managers so the file handles are closed
# deterministically instead of being left for the garbage collector.
with open(spt_positive_train_dataset_path, 'r') as positive_file:
    spt_positive_train_dataset = json.load(positive_file)
with open(spt_negative_train_dataset_path, 'r') as negative_file:
    spt_negative_train_dataset = json.load(negative_file)

# Only the 'prompt' field of every record is used from here on.
spt_positive_train_dataset = [sample_data['prompt'] for sample_data in spt_positive_train_dataset]
spt_negative_train_dataset = [sample_data['prompt'] for sample_data in spt_negative_train_dataset]

# Output containers, filled further down with the randomly segmented samples.
rsp_positive_train_dataset = []
rsp_negative_train_dataset = []

def random_split_sentence(origin_str_sentence, max_len=5):
    """Cut a sentence into random-length word segments, each tagged '[REV] '.

    The sentence is split on whitespace. While at least ``max_len`` words
    remain, a segment length is drawn uniformly from ``1..max_len`` and that
    many words are cut off the front. Whatever is left afterwards (fewer than
    ``max_len`` words) becomes one final segment.

    Args:
        origin_str_sentence: The sentence to segment.
        max_len: Largest number of words a segment may contain. Defaults to 5,
            which reproduces the previously hard-coded behaviour.

    Returns:
        A list of strings of the form ``'[REV] w1 w2 ...'`` in original word
        order. Empty when the sentence contains no words.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError('max_len must be at least 1, got %r' % (max_len,))

    # Split the sentence into a list of words first.
    words = origin_str_sentence.split()
    split_result = []

    # Walk a cursor over the word list instead of re-slicing the remainder on
    # every iteration; the RNG is consulted exactly as often as before.
    start = 0
    while len(words) - start >= max_len:
        # max_len bounds the length of each randomly drawn segment.
        split_len = random.randint(1, max_len)
        split_result.append('[REV] ' + ' '.join(words[start:start + split_len]))
        start += split_len

    # Fewer than max_len words are left: emit them as one trailing segment.
    if start < len(words):
        split_result.append('[REV] ' + ' '.join(words[start:]))
    return split_result

# Segment every prompt and wrap it in a record with an empty completion.
# Positives are processed before negatives so the random number generator is
# consumed in the same order as the datasets are listed.
rsp_positive_train_dataset.extend(
    {'prompt': random_split_sentence(sentence), 'completion': ''}
    for sentence in spt_positive_train_dataset
)
rsp_negative_train_dataset.extend(
    {'prompt': random_split_sentence(sentence), 'completion': ''}
    for sentence in spt_negative_train_dataset
)

# Persist both segmented datasets as JSON.
# NOTE(review): "positve" in the first file name looks like a typo; it is kept
# unchanged because other tooling may rely on this exact path.
with open('/home/hadoop-aipnlp/nazarite/reverse_curse/reversal_curse-main/data/final_data/author_work_final/rsp_5_positve_train_dataset.json', 'w') as positive_out:
    json.dump(rsp_positive_train_dataset, positive_out)

with open('/home/hadoop-aipnlp/nazarite/reverse_curse/reversal_curse-main/data/final_data/author_work_final/rsp_5_negative_train_dataset.json', 'w') as negative_out:
    json.dump(rsp_negative_train_dataset, negative_out)

