#!/usr/bin/env bash

# Absolute directory containing this script (symlinks resolved by readlink -f).
# Quoted so that a script path containing spaces is handled correctly.
pwd=$(dirname "$(readlink -f "$0")")

# Root directory of the CC100 data and a scratch directory for intermediates.
# NOTE(review): TMPDIR is also a conventional environment variable name; if it
# is already exported in the caller's environment, child processes started by
# this script will see the value assigned here.
ccDIR=~/cc100
TMPDIR=~/tmp_cc100

# -p makes re-runs quiet: no "File exists" error when the directories are
# already present, and missing parent directories are created as needed.
mkdir -p "$TMPDIR"
mkdir -p "$ccDIR"

# # download dataset
# hdfs dfs -get hdfs://haruna/home/byte_arnold_hl_mlnlc/user/wuxianze.0/Datasets/cc100/* ${ccDIR}
# xz -d $ccDIR/* # decompress the .xz files

# # sample a subset of data
# cd $ccDIR

# sampleDIR=$ccDIR/sample_train
# echo "write sample into ${sampleDIR}"
# if [ ! -e $sampleDIR ]; then
#     mkdir $sampleDIR
# fi
# head -n 2000000 de.txt > $sampleDIR/de.sample
# head -n 2000000 es.txt > $sampleDIR/es.sample
# head -n 1000000 zh-Hans.txt > $sampleDIR/zh.sample

# cd $ccDIR
# sampleDIR=$ccDIR/sample_dev
# if [ ! -e $sampleDIR ]; then
#     mkdir $sampleDIR
# fi
# echo "write sample into ${sampleDIR}"
# tail -n 20000 de.txt > $sampleDIR/de.sample
# tail -n 20000 es.txt > $sampleDIR/es.sample
# tail -n 10000 zh-Hans.txt > $sampleDIR/zh.sample

# merge sentence into documents

#######################################
# Run the two mergeDoc.py passes for every language of one sample split.
# For each language in "de es zh" this invokes mergeDoc.py first in mode
# "mergeSent" (input: <split>/<lang>.sample, output: a scratch file under
# $TMPDIR) and then in mode "mergeDoc" (input: that scratch file, output:
# <split>_doc/<lang>.doc.sample), both with the delimiter "<q>".
# Globals:   TMPDIR (read)
# Arguments: $1 - path of the split directory (e.g. $ccDIR/sample_train)
# Returns:   0 on success; 1 as soon as any step fails, so that the second
#            pass never consumes a stale scratch file left by an earlier run.
#######################################
merge_split() {
    local split_dir=$1
    local doc_dir="${split_dir}_doc"
    local lang merged

    # -p: do not complain when the output directory exists from a prior run.
    mkdir -p "$doc_dir" || return 1

    for lang in de es zh
    do
        merged="$TMPDIR/${lang}.merged.sample"
        python3 mergeDoc.py -m mergeSent -i "$split_dir/${lang}.sample" -o "$merged" -d "<q>" || return 1
        python3 mergeDoc.py -m mergeDoc -i "$merged" -o "$doc_dir/${lang}.doc.sample" -d "<q>" || return 1
    done
}

# mergeDoc.py is invoked by relative name from the parent of the current
# working directory.
# NOTE(review): this presumably expects the script to be launched from its own
# directory, with mergeDoc.py one level up -- confirm.
cd .. || { echo "cannot cd to parent directory" >&2; exit 1; }

for split in sample_train sample_dev
do
    sampleDIR=$ccDIR/$split
    if ! merge_split "$sampleDIR"; then
        echo "merging failed for ${sampleDIR}" >&2
        exit 1
    fi
done

# Return to the script's directory, where tokenizeCC100.sh is run from.
cd "${pwd}" || { echo "cannot cd to ${pwd}" >&2; exit 1; }

bash tokenizeCC100.sh

