#!/usr/bin/env bash

# Prepares the lemma/sentence corpora and runs OpenNMT preprocessing.
#
# Steps:
#   1. Pipe each raw lemma/output file through scripts/attach_features.py,
#      writing the result to a matching "*-clean.txt" file.
#   2. Run scripts/clean-corpus.py on the dev and test file pairs.
#   3. Run preprocess.lua from inside OPENNMT_DIR on the train/dev data.
#
# All data/ and scripts/ paths are relative to the working directory, so the
# script must be started from the directory that contains them.

# Stop at the first failing step: without this, a failed Python step would
# leave an empty or stale "*-clean.txt" file that preprocess.lua would then
# consume without complaint.
set -euo pipefail

readonly OPENNMT_DIR=OpenNMT

# Absolute path of the starting directory (no trailing slash, so the joined
# paths below do not contain "//"). Needed because preprocess.lua is run
# from inside OPENNMT_DIR.
CURRENT_DIR=$(pwd)
readonly CURRENT_DIR

# Prints a message to stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Runs scripts/attach_features.py with $1 as stdin and $2 as stdout.
attach_features() {
  python3 scripts/attach_features.py < "$1" > "$2"
}

attach_features data/train-lemmas.txt data/train-lemmas-clean.txt
attach_features data/dev-lemmas.txt data/dev-lemmas-clean.txt
attach_features data/test-output.txt data/test-output-clean.txt

# NOTE(review): clean-corpus.py is only applied to the dev and test pairs,
# not to the train pair — confirm this is intentional.
python3 scripts/clean-corpus.py \
  data/dev-sentences.txt \
  data/dev-lemmas-clean.txt

python3 scripts/clean-corpus.py \
  data/test-sentences.txt \
  data/test-output-clean.txt

cd "${OPENNMT_DIR}" || die "cannot cd to ${OPENNMT_DIR}"

th preprocess.lua \
  -train_src "${CURRENT_DIR}/data/train-lemmas-clean.txt" \
  -train_tgt "${CURRENT_DIR}/data/train-sentences.txt" \
  -valid_src "${CURRENT_DIR}/data/dev-lemmas-clean.txt" \
  -valid_tgt "${CURRENT_DIR}/data/dev-sentences.txt" \
  -save_data "${CURRENT_DIR}/data/processed-data" \
  -idx_files true
  # Currently disabled options:
  # -src_words_min_frequency 1 \
  # -tgt_words_min_frequency 1

cd "${CURRENT_DIR}" || die "cannot cd back to ${CURRENT_DIR}"
