#!/bin/sh
# Shell options for the whole script:
#   -e  stop at the first command that exits non-zero
#   -x  print each command (after expansion) before running it
set -ex
# Data from Million Reddit User Dataset (MUD) https://arxiv.org/abs/2105.07263
# Access form: https://docs.google.com/forms/d/e/1FAIpQLSesc-0HI2DRYjFqlpPo2hTh9OJ53jtWjYQiIfAtmzSVUCxiLA/viewform

# Once the data is downloaded and extracted, rename it to data.jsonl and place it in the directory that DATAFOLDER (set below) points to.
# Directory that holds the raw MUD download (data.jsonl).
# Defaults to the original hard-coded location but may be overridden from the
# environment, e.g. `DATAFOLDER=/some/dir sh <this script>`.
# `-` (not `:-`) is used on purpose: an explicitly empty DATAFOLDER is kept
# empty so that the guard below can reject it instead of silently falling
# back to the default.
DATAFOLDER="${DATAFOLDER-/mnt/reddit_mud/raw_all/}"
if [ -z "$DATAFOLDER" ]; then
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  echo "Please specify a location where the downloaded Reddit MUD data is located." >&2
  exit 1
fi
# Full path of the renamed dataset file. A trailing slash on DATAFOLDER is
# stripped first so the join does not produce "//".
PATH_TO_MUD_JSONL="${DATAFOLDER%/}/data.jsonl"
# export CUDA_VISIBLE_DEVICES='4'  #'4,3,6'


# python split_authors.py --path $DATAFOLDER/'data.jsonl' --out_dir $DATAFOLDER/'emnlp' --protected_authors_path 'styll_data/protected_authors.txt'
# python subsample_data.py --path  $DATAFOLDER/'data.jsonl' --author_splits_path $DATAFOLDER/'emnlp/author_splits.json'  --per_author 10
# python split_files.py --dir $DATAFOLDER/'emnlp' --max_per_file 50000

# WORKERS=7

# seahorse
# CUDA_VISIBLE_DEVICES=4 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 0 --num_workers $WORKERS --batch_size 32 --max_input_length 60 --max_output_length 60 &
# CUDA_VISIBLE_DEVICES=3 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 1 --num_workers $WORKERS --batch_size 32 --max_input_length 60 --max_output_length 60 &
# CUDA_VISIBLE_DEVICES=6 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 2 --num_workers $WORKERS --batch_size 32 --max_input_length 60 --max_output_length 60 &
# CUDA_VISIBLE_DEVICES=0 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 6 --num_workers $WORKERS --batch_size 32 --max_input_length 60 --max_output_length 60 &

# piranha
# CUDA_VISIBLE_DEVICES=0 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 3 --num_workers $WORKERS --batch_size 32 --max_input_length 60 --max_output_length 60 &
# CUDA_VISIBLE_DEVICES=1 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 4 --num_workers $WORKERS --batch_size 32 --max_input_length 60 --max_output_length 60 &
# CUDA_VISIBLE_DEVICES=3 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 5 --num_workers $WORKERS --batch_size 32 --max_input_length 60 --max_output_length 60 &

# python convert_to_dataset_format.py --in_dir  $DATAFOLDER/'emnlp/paraphrased_files/topp0.8_tmp1.5' --out_dir $DATAFOLDER/'emnlp/full_dataset' 
# python preprocess_reddit_add_embeddings.py --dataset_path $DATAFOLDER/'emnlp/full_dataset/reddit_emnlp' 
# python test_tokenizers.py --dataset_path $DATAFOLDER/'emnlp/full_dataset/reddit_emnlp' 


# python sample_authorship_pairings.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/authorship_pairings' --shard train --num_samples 100000
# python split_files.py --dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune' --max_per_file 5000

# For fine-tune dataset

# piranha
# CUDA_VISIBLE_DEVICES=0 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 0 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 #&
# CUDA_VISIBLE_DEVICES=2 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 1 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=3 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 2 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &

# seahorse
# CUDA_VISIBLE_DEVICES=3 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 3 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=4 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 4 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=5 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 5 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=6 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 6 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &

# make transfer dataset

# piranha
# CUDA_VISIBLE_DEVICES=3 python tinystyle_data_generation.py --worker_idx 0 --num_workers $WORKERS

# tigerfish
# CUDA_VISIBLE_DEVICES=3 python tinystyle_data_generation.py --worker_idx 1 --num_workers $WORKERS

# seahorse
# CUDA_VISIBLE_DEVICES=3 python tinystyle_data_generation.py --worker_idx 2 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=4 python tinystyle_data_generation.py --worker_idx 3 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=5 python tinystyle_data_generation.py --worker_idx 4 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=6 python tinystyle_data_generation.py --worker_idx 5 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=7 python tinystyle_data_generation.py --worker_idx 6 --num_workers $WORKERS &


# python sample_authorship_pairings.py --in_dir $DATAFOLDER/'emnlp/split_files' --out_dir $DATAFOLDER/'emnlp/authorship_pairings_2' --shard train --num_samples 100000 --skip_author_dict_path /mnt/reddit_mud/raw_all/emnlp/authorship_pairings/texts_by_author_for_finetune.json
# python split_files.py --dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune' --max_per_file 5000

# seahorse
# CUDA_VISIBLE_DEVICES=1 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 0 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=2 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 1 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=3 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 2 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=4 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 3 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=5 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 4 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=6 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 5 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &
# CUDA_VISIBLE_DEVICES=7 python generate_paraphrases.py --in_dir $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/split_files' --out_dir  $DATAFOLDER/'emnlp/authorship_pairings_2/transfers_for_finetune/paraphrased_files' --temp 1.5 --top_p 0.8 --idx 6 --num_workers $WORKERS --batch_size 16 --max_input_length 60 --max_output_length 60 --num_return_sequences 5 &


# seahorse
# CUDA_VISIBLE_DEVICES=1 python tinystyle_data_generation.py --worker_idx 0 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=2 python tinystyle_data_generation.py --worker_idx 1 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=3 python tinystyle_data_generation.py --worker_idx 2 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=4 python tinystyle_data_generation.py --worker_idx 3 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=5 python tinystyle_data_generation.py --worker_idx 4 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=6 python tinystyle_data_generation.py --worker_idx 5 --num_workers $WORKERS &
# CUDA_VISIBLE_DEVICES=7 python tinystyle_data_generation.py --worker_idx 6 --num_workers $WORKERS &



# GENERATED_FT_DATA='/mnt/reddit_mud/raw_all/emnlp/authorship_pairings' 
# GENERATED_FT_DATA='/mnt/reddit_mud/raw_all/emnlp/authorship_pairings_2'

# python combine_authorship_sets.py

# Value substituted for --num_workers in the (currently commented-out)
# data_filtering.py invocations that follow.
WORKERS=4

# Directory handed to data_filtering.py as --in_dir in those same invocations.
GENERATED_FT_DATA="/mnt/reddit_mud/raw_all/emnlp/authorship_pairings_combined"

#CUDA_VISIBLE_DEVICES=3 python data_filtering.py --in_dir $GENERATED_FT_DATA --worker_idx 0 --num_workers $WORKERS &
#CUDA_VISIBLE_DEVICES=4 python data_filtering.py --in_dir $GENERATED_FT_DATA --worker_idx 1 --num_workers $WORKERS &
#CUDA_VISIBLE_DEVICES=6 python data_filtering.py --in_dir $GENERATED_FT_DATA --worker_idx 2 --num_workers $WORKERS &
#CUDA_VISIBLE_DEVICES=7 python data_filtering.py --in_dir $GENERATED_FT_DATA --worker_idx 3 --num_workers $WORKERS &



# [Some work to copy over and rename files, pulling two out for validation]

# python convert_to_dataset_format.py --in_dir  '/mnt/reddit_mud/raw_all/emnlp/combined_authorship_FT_data' --out_dir $DATAFOLDER/'emnlp_ft_dataset_v2/full_dataset' 

# Final stage: convert the filtered fine-tuning data to dataset format, then
# run the embedding preprocessing step on the result.
# All expansions are double-quoted so that a path containing spaces or glob
# characters is still passed to Python as a single argument.

# Shared output location; both commands below derive their paths from it.
FT_DATASET_DIR="$DATAFOLDER/emnlp_ft_dataset_v2_fixed/full_dataset"

# --in_dir is built from GENERATED_FT_DATA (defined earlier in this script),
# which expands to the same path the previous hard-coded literal spelled out.
python convert_to_dataset_format.py \
  --in_dir "$GENERATED_FT_DATA/filtered_results" \
  --out_dir "$FT_DATASET_DIR"

# NOTE(review): assumes the conversion step above creates the reddit_emnlp/
# subdirectory inside FT_DATASET_DIR — confirm against convert_to_dataset_format.py.
python preprocess_reddit_add_embeddings_target_texts.py \
  --dataset_path "$FT_DATASET_DIR/reddit_emnlp/"
