# Installation
1. Install fairseq
```
git clone https://github.com/pytorch/fairseq
cd fairseq
pip install --editable ./

```

2. Install awesome-align
```
git clone https://github.com/neulab/awesome-align
cd awesome-align
pip install -r requirements.txt
python setup.py install
```

# Download Data
The shared data contains English OpenIE data and generated AACT data for the Spanish language.
```
unzip data.zip
```

Download a parallel translation corpus for Spanish, Portuguese, Hindi, Telugu or Chinese.

For example, for Spanish, set `LANGUAGE=es` and `LANGUAGE_ID=es_XX`, and download the Europarl corpus from https://www.statmt.org/europarl/

Place the corpus files in the `data/${LANGUAGE}/mbart` folder as `train.en_XX`, `valid.en_XX`, `train.${LANGUAGE_ID}` and `valid.${LANGUAGE_ID}`.

All the commands below are to be run from the `code` folder.

# Train aligner
```
mkdir -p ../data/${LANGUAGE}/align
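paste <(sed 's/$/ |||/' ../data/${LANGUAGE}/mbart/train.en_XX) ../data/${LANGUAGE}/mbart/train.${LANGUAGE_ID} > ../data/${LANGUAGE}/align/train
paste <(sed 's/$/ |||/' ../data/${LANGUAGE}/mbart/valid.en_XX) ../data/${LANGUAGE}/mbart/valid.${LANGUAGE_ID} > ../data/${LANGUAGE}/align/valid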

python aligner_train.py  --output_dir=../models/${LANGUAGE}/model_without_co  --model_name_or_path=bert-base-multilingual-cased --extraction 'softmax' --do_train --train_tlm  --train_so --train_data_file=../data/${LANGUAGE}/align/train  --per_gpu_train_batch_size 2   --gradient_accumulation_steps 4 --num_train_epochs 5 --learning_rate 2e-5 --save_steps 5000 --max_steps 40000 --do_eval  --eval_data_file=../data/${LANGUAGE}/align/valid --train_mlm --train_tlm_full --train_psi
```

# Make training data files for consistent translation
```
bash ctranslate.sh ${LANGUAGE} ${LANGUAGE_ID} 
```

# Repeat vanilla translated sentences 
```
python repeater.py --fp1 ../data/${LANGUAGE}/mbart/train.sentences --fp2 ../data/openie6/train.count_extractions --out ../data/${LANGUAGE}/mbart/train.sentences_repeated
```

# Inference for consistent translation
For example: `bash ctranslate_inference.sh hi hi_IN`
```
bash ctranslate_inference.sh ${LANGUAGE} ${LANGUAGE_ID} 
```

# Train sorter model
```
bash sort_train.sh ${LANGUAGE}
```

# Generate OpenIE data
(CLP, Translate, CTranslate)
```
bash clp.sh ${LANGUAGE}
```

# Fairseq preprocess training files
```
bash preprocess_train.sh ${LANGUAGE} genoie ctranslate_clp
bash preprocess_train.sh ${LANGUAGE} gen2oie_s1 ctranslate_clp
bash preprocess_train.sh ${LANGUAGE} gen2oie_s2 ctranslate_clp
```

# Train model (HPC or GCloud)
```
BIN_DIR=./models/${LANGUAGE}/genoie/ctranslate_clp-bin SAVE_DIR=./models/${LANGUAGE}/genoie/ctranslate_clp-checkpoints bash fairseq_openie.sh
BIN_DIR=./models/${LANGUAGE}/gen2oie_s1/ctranslate_clp-bin SAVE_DIR=./models/${LANGUAGE}/gen2oie_s1/ctranslate_clp-checkpoints bash fairseq_openie.sh
BIN_DIR=./models/${LANGUAGE}/gen2oie_s2/ctranslate_clp-bin SAVE_DIR=./models/${LANGUAGE}/gen2oie_s2/ctranslate_clp-checkpoints bash fairseq_openie.sh
```


# Run genoie in test mode
```
bash preprocess_test.sh ${LANGUAGE} genoie
bash genoie_test.sh ${LANGUAGE} ctranslate_clp
```

# Run gen2oie in test mode
```
python make_test_s1.py --lang ${LANGUAGE}
bash preprocess_test.sh ${LANGUAGE} gen2oie_s1
bash gen2oie_test.sh ${LANGUAGE} ctranslate_clp
```

# Evaluate CaRB performance 
```
python carb/carb.py --allennlp ../models/${LANGUAGE}/genoie/ctranslate_clp-data/test.predicted.allennlp --gold carb/data/gold/${LANGUAGE}_test.tsv --out /dev/null
```

