# Directory that receives the final binarized dataset; it is also the data
# argument handed to fairseq-train further down.
DATA_FOLDER=./data/summarization

# Encode the raw text of every split into BPE pieces with SentencePiece.
#   reads:  ./data/raw/<split>/in_100.tsv, ./data/raw/<split>/expected_100.tsv
#   writes: ./data/bpe/<split>/in.tsv,     ./data/bpe/<split>/expected.tsv
for set in test-A train dev-0
do
        # -p creates missing parent directories and keeps a re-run from failing
        # when the directory is already there.
        mkdir -p "./data/bpe/${set}/"
        python fairseq/scripts/spm_encode.py \
                --model spm/spm.model \
                --inputs \
                        "./data/raw/${set}/in_100.tsv" \
                        "./data/raw/${set}/expected_100.tsv" \
                --outputs \
                        "./data/bpe/${set}/in.tsv" \
                        "./data/bpe/${set}/expected.tsv" \
                || { echo "spm_encode failed for split '${set}'" >&2; exit 1; }
done

# Binarize the BPE files of every split.
# Each file is run through fairseq-preprocess on its own, as a source-only
# "train" corpus (--only-source --trainpref), which leaves train.bin/train.idx
# in ./data/bin/<split>/. Those two files are then moved to their final names:
#   in.tsv       -> ./data/summarization/<split>.src-tgt.src.{bin,idx}
#   expected.tsv -> ./data/summarization/<split>.src-tgt.tgt.{bin,idx}
# The output directory is created up front so the moves cannot fail on a
# missing target directory.
mkdir -p ./data/summarization
for set in train dev-0 test-A
do
        echo "$set"
        for file in in.tsv expected.tsv
        do
                # Short name (progress output only) and side suffix of the final file.
                case "$file" in
                        in.tsv) file_short="in";       side="src" ;;
                        *)      file_short="expected"; side="tgt" ;;
                esac
                echo "$file_short"
                fairseq-preprocess \
                        --destdir "./data/bin/${set}/" \
                        --joined-dictionary \
                        --task translation \
                        --dataset-impl mmap \
                        --workers 16 \
                        --seed 31337 \
                        --srcdict ./spm/spm.vocab \
                        --only-source \
                        --trainpref "./data/bpe/${set}/${file}" \
                        || { echo "fairseq-preprocess failed for ${set}/${file}" >&2; exit 1; }
                # Move the result away immediately: the next preprocess run writes
                # train.bin/train.idx into the same directory again.
                mv "./data/bin/${set}/train.bin" "./data/summarization/${set}.src-tgt.${side}.bin"
                mv "./data/bin/${set}/train.idx" "./data/summarization/${set}.src-tgt.${side}.idx"
        done
done

# Publish the dictionary under both the source and the target name.
# The original moved the same file twice, so the second move always failed and
# dict.tgt.txt was never created; copy first, then move.
# The path is spelled out instead of relying on the leftover loop variable
# (${set} is "test-A" once the binarization loop has finished).
dict_src=./data/bin/test-A/dict.txt
cp "$dict_src" ./data/summarization/dict.src.txt
mv "$dict_src" ./data/summarization/dict.tgt.txt

# Prepare the dev-0 reference text used for validation during training.
# The BPE segmentation is undone in two ordered steps: first drop the spaces
# that separate the pieces, then turn every "▁" marker into a space.
# Both substitutions run inside ONE sed process that reads the BPE file and
# writes the gold file. The original piped two in-place sed commands that
# edited the same file concurrently, so the result depended on timing.
sed -e 's/ //g' -e 's/▁/ /g' \
        ./data/bpe/dev-0/expected.tsv > ./data/summarization/dev-0-gold.tsv


# Train one model per experiment; experiment N uses N_config.yaml and saves
# into results/train/N.
#
# GPU selection: CUDA_VISIBLE_DEVICES is set from DEVICE only when DEVICE is
# non-empty. Passing an empty value (what happened before when DEVICE was
# unset) hides every GPU from the process; now the inherited environment is
# used instead.
train_env=()
if [ -n "${DEVICE:-}" ]; then
	train_env=(CUDA_VISIBLE_DEVICES="${DEVICE}")
fi

for EXP_NAME in {1..33}
do
	# Every argument line must end in a backslash: previously the line break
	# right after "fairseq-train" made it run without any arguments, and the
	# remaining lines were executed as a separate command.
	# NOTE(review): --valid-subset is given a path here; it looks like fairseq
	# expects the split name ("dev-0") relative to the data folder - confirm.
	env "${train_env[@]}" fairseq-train \
		"${DATA_FOLDER}" \
		-c "${EXP_NAME}_config.yaml" \
		--arch transformer \
		--task translation \
		--valid-subset "${DATA_FOLDER}/dev-0" \
		--save-dir "results/train/${EXP_NAME}" \
		--dataset-impl mmap
done
