average.py                                                                                          0000644 0075125 0072461 00000002662 13563513733 013652  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/usr/bin/env python
"""
This script takes multiple Marian *.npz model files and outputs an elementwise average of the model,
meant to do check-point averaging from: 

https://www.aclweb.org/anthology/W16-2316

usage:

./average.py -m model.1.npz model.2.npz --output model.avg.npz
"""

from __future__ import print_function

import os
import sys
import argparse

import numpy as np

def average_models(filenames):
    """Return the element-wise average of several Marian ``*.npz`` models.

    Every array is summed over the checkpoints that contain it with a
    matching shape and is divided by the number of checkpoints that actually
    contributed, so a skipped checkpoint no longer skews the result.

    The ``history_errs`` entry is dropped.  Entries whose name contains
    ``special`` are copied unchanged from the first checkpoint that has them.

    Args:
        filenames: Paths of the ``*.npz`` files to average.

    Returns:
        A dict mapping parameter names to the averaged arrays.
    """
    # Running sums and the number of checkpoints that contributed to each.
    totals = dict()
    counts = dict()

    for filename in filenames:
        print("Loading {}".format(filename))
        with open(filename, "rb") as mfile:
            m = np.load(mfile)
            for k in m:
                if k == "history_errs":
                    continue
                # Read the array once; every access re-reads it from the
                # archive.
                value = m[k]
                if k not in totals:
                    totals[k] = value
                    counts[k] = 1
                elif "special" in k:
                    # Not averaged, the first occurrence is kept as it is.
                    continue
                elif totals[k].shape == value.shape:
                    totals[k] += value
                    counts[k] += 1
                else:
                    print("Warning: skipping '{}' from {}: shape {} does not "
                          "match {}".format(k, filename, value.shape,
                                            totals[k].shape),
                          file=sys.stderr)

    for k in totals:
        if "special" in k:
            continue
        if counts[k] != len(filenames):
            print("Warning: '{}' was averaged over {} of {} models".format(
                k, counts[k], len(filenames)), file=sys.stderr)
        # In-place division keeps the dtype of the stored parameters.
        totals[k] /= counts[k]

    return totals


def main():
    """Parse the command line, average the models and save the result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--model', nargs='+', required=True,
                        help="models to average")
    parser.add_argument('-o', '--output', required=True,
                        help="output path")
    args = parser.parse_args()

    average = average_models(args.model)

    # Save averaged model to file
    print("Saving to {}".format(args.output))
    np.savez(args.output, **average)


if __name__ == "__main__":
    main()
                                                                              evaluate_model.sh                                                                                   0000755 0075125 0072461 00000011362 13664725537 015221  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

set -ex

MARIAN_HOME=$HOME/marian

# Print the command-line help. The option parser below calls this for
# -h/--help and for unknown options.
function usage {
    echo "Evaluate a trained Marian model."
    echo "usage: ./evaluate_model.sh -m <model_dir>"
    echo "   -m, --model    Directory with the trained model."
    echo "   -h, --help     Print this message and exit."
}

# Consume the arguments one option at a time; options with a value shift once
# more to reach the value.
while [ "$1" != "" ]; do
    case $1 in
        -m | --model )   shift
                       MODEL=$1
                       ;;
        -h | --help )  usage
                       exit
                       ;;
        * )            usage
                       exit 1
    esac
    shift
done

# Everything below is derived from the model path, so fail early with a clear
# message when none was given.
if [[ -z $MODEL ]]; then
    echo "No model directory was given, use -m <model_dir>." >&2
    exit 1
fi

# Drop one trailing slash so that the offset-based slicing below is stable.
if [[ $MODEL =~ /$ ]]; then
    MODEL=${MODEL:0:-1}
fi

# The language codes are read from fixed character offsets of the path
# (two characters from offset 7 and two from offset 9).
# NOTE(review): this presumably matches paths such as "models/ende_..." --
# confirm against the naming used by the training scripts.
SRC=${MODEL:7:2}
TGT=${MODEL:9:2}

# Data directories are always named "en<other language>".
if [[ $SRC == en ]]; then
    DATA_DIR=en${TGT}
else
    DATA_DIR=en${SRC}
fi

if [[ ! $MODEL =~ 'bpew' ]]; then
    echo "This script only works with wordpiece-like BPE models." >&2
    exit 1
fi

if [[ ! -d $MODEL ]]; then
    echo "Model directory \"$MODEL\" does not exist." >&2
    exit 1
fi

# CHECKPOINT AVERAGING
# Build the averaged model when it is missing or older than model.npz; the
# five most recently modified iteration checkpoints are averaged.
if [ ! -f $MODEL/model.avg.npz ] || [ $MODEL/model.npz -nt $MODEL/model.avg.npz ]; then
    python3 average.py -m $(ls -t $MODEL/model.iter*.npz | head -n 5) -o $MODEL/model.avg.npz
fi

# Find the input segmentation in the model name: the part of the path from
# offset 12 on has all "x" characters removed and is split at underscores; the
# last piece that starts with "bpew" wins.
for TOK in $(echo ${MODEL:12:1000} | sed -e 's/x//g;s/_/ /g'); do
    case $TOK in
        bpew* ) INPUT_TYPE=$TOK
                ;;
    esac
done
echo $INPUT_TYPE

TEST_TARGET=data/$DATA_DIR/test/$TGT
TEST_FILE=data/${DATA_DIR}/test/${SRC}.${INPUT_TYPE}

# TRANSLATE AND SCORE THE CLEAN TEST SET
# Runs only when there is no detokenized output yet or when the averaged model
# is newer than the existing output.
if [ ! -f $MODEL/test.txt -o $MODEL/model.avg.npz -nt $MODEL/test.txt ]; then
    $MARIAN_HOME/build/marian-decoder -c $MODEL/model.npz.decoder.yml -m $MODEL/model.avg.npz --beam-size 12 --normalize 0.4 --mini-batch 8 < $TEST_FILE > $MODEL/test.output
    # Undo the segmentation: remove every space, turn the "▁" markers into
    # spaces and strip the space at the beginning of the line.
    sed -e 's/ //g;s/▁/ /g;s/^ //' $MODEL/test.output > $MODEL/test.txt

    # Each score is stored in its own file next to the model.
    sacrebleu $TEST_TARGET --score-only --width 2 --metrics bleu < $MODEL/test.txt > $MODEL/test_bleu
    sacrebleu $TEST_TARGET --score-only --width 4 --metrics chrf < $MODEL/test.txt > $MODEL/test_chrf
    # Keep only the last space-separated field of the last line METEOR prints.
    java -jar meteor/meteor-1.5.jar $TEST_TARGET $MODEL/test.txt -l $TGT -norm | tail -n 1 | sed -e 's/.* //' > $MODEL/test_meteor

fi

echo -n 'Test BLEU score: '
cat $MODEL/test_bleu

# TODO do the noise evaluation
# Translate the noisy test data and score every noise level separately. Runs
# only when the output is missing or older than the averaged model.
if [ ! -f $MODEL/test.noisy.txt -o $MODEL/model.avg.npz -nt $MODEL/test.noisy.txt ]; then
    # The noisy files carry only the part of the input type after its first
    # four characters as suffix (e.g. "32k" for "bpew32k").
    NOISY_TEST_FILE=data/${DATA_DIR}/test/${SRC}.wtok.noisy.${INPUT_TYPE:4:1000}
    $MARIAN_HOME/build/marian-decoder -c $MODEL/model.npz.decoder.yml -m $MODEL/model.avg.npz --beam-size 12 --normalize 0.4  --mini-batch 8 < $NOISY_TEST_FILE > $MODEL/test.noisy.output
    # Same desegmentation as for the clean test set.
    sed -e 's/ //g;s/▁/ /g;s/^ //' $MODEL/test.noisy.output > $MODEL/test.noisy.txt

    # Cut the output into chunks with as many lines as the clean test set;
    # the chunks get numeric suffixes (test.noisy.00, test.noisy.01, ...).
    split -l $(wc -l < $TEST_FILE) -d $MODEL/test.noisy.txt $MODEL/test.noisy.

    # The first entry of every score series is the score on the clean test set.
    cat $MODEL/test_bleu > $MODEL/noisy_bleu
    cat $MODEL/test_chrf > $MODEL/noisy_chrf
    cat $MODEL/test_meteor > $MODEL/noisy_meteor

    # Append the scores of the ten chunks, one line per chunk.
    for FILE in $MODEL/test.noisy.{00..09}; do
        sacrebleu $TEST_TARGET --score-only --width 2 --metrics bleu < $FILE >> $MODEL/noisy_bleu
        sacrebleu $TEST_TARGET --score-only --width 4 --metrics chrf < $FILE >> $MODEL/noisy_chrf
        java -jar meteor/meteor-1.5.jar $TEST_TARGET $FILE -l $TGT -norm | tail -n 1 | sed -e 's/.* //' >> $MODEL/noisy_meteor
    done

    # Summarize every score series with noisy_slope.py and keep a copy of its
    # output.
    python3 noisy_slope.py $MODEL/noisy_{bleu,chrf,meteor} | tee $MODEL/noisy_correlation
fi

# DO ALL THE FANCY STUFF FOR MORPHEVAL
# Only done for the en-de, en-cs and en-fr directions, and only when the
# tokenized output is missing or older than the averaged model.
if [[ $SRC$TGT =~ en(de|cs|fr) ]]; then
    if [ ! -f $MODEL/morpheval.tok -o $MODEL/model.avg.npz -nt $MODEL/morpheval.tok ]; then
        # The segmented MorphEval sentences are named "wbpe..." instead of
        # "bpew...".
        INPUT_TYPE=${INPUT_TYPE/bpew/wbpe}
        TEST_FILE=morpheval/segmented/${SRC}${TGT}/sents.${SRC}${TGT}.${INPUT_TYPE}
        $MARIAN_HOME/build/marian-decoder -c $MODEL/model.npz.decoder.yml -m $MODEL/model.avg.npz  --beam-size 12 --normalize 0.4 --mini-batch 12 < $TEST_FILE > $MODEL/morpheval.output
        # Desegment the output and tokenize it for the analyzers.
        sed -e 's/ //g;s/▁/ /g;s/^ //' $MODEL/morpheval.output | sacremoses tokenize -l $TGT -x > $MODEL/morpheval.tok

        if [[ $TGT == "de" ]]; then
            # smor is started from its own directory and gets the unique
            # tokens, one per line.
            cd morpheval/SMOR
            tr ' ' '\n' < ../../$MODEL/morpheval.tok | sort | uniq | ./smor > ../../$MODEL/morpheval.smored
            cd ../..
            python3 morpheval/morpheval_v2/evaluate_de.py -i $MODEL/morpheval.tok -n morpheval/morpheval.limsi.v2.en.info -d $MODEL/morpheval.smored | tee $MODEL/morpheval.analysis
        fi

        if [[ $TGT == "cs" ]]; then
            # One token per line with an empty line after every sentence.
            # NOTE(review): prepare.sh downloads czech-morfflex-pdt-161115,
            # but the dictionary path below refers to 131112 -- confirm which
            # version is meant.
            sed 's/$/\n/' $MODEL/morpheval.tok | tr ' ' '\n' | morpheval/morphodita-1.3.0-bin/bin-linux64/run_morpho_analyze --input=vertical --output=vertical  morpheval/czech-morfflex-pdt-131112/czech-morfflex-131112.dict 1  > $MODEL/morpheval.morphodita
            python3 morpheval/morpheval_v2/evaluate_cs.py -i $MODEL/morpheval.morphodita -n morpheval/morpheval.limsi.v2.en.info | tee $MODEL/morpheval.analysis
        fi

        if [[ $TGT == "fr" ]]; then
            # Use the model output and the info file in morpheval/, and keep
            # the analysis, the same way as for German and Czech.
            python3 morpheval/morpheval_v2/evaluate_fr.py -i $MODEL/morpheval.tok -n morpheval/morpheval.limsi.v2.en.info -d morpheval/lefff.pkl | tee $MODEL/morpheval.analysis
        fi
    fi
fi

echo -n 'Test BLEU score: '
cat $MODEL/test_bleu
                                                                                                                                                                                                                                                                              finetune_transformer.sh                                                                             0000755 0075125 0072461 00000006371 13664727711 016472  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

set -ex

# Defaults; most of them can be overridden on the command line.
SRC=en
TGT=de
PAIR=$SRC$TGT
VALID_SCRIPT=valid_bpe_${SRC}${TGT}.sh
VALID_BATCH=16
BATCH_PER_STEP=200000000000000
MARIAN_HOME=$HOME/marian
BATCH=9500
LEARNING_RATE=1e-5

# Print the command-line help. The option parser below calls this for
# -h/--help and for unknown options.
function usage {
    echo "Continue training of an existing Marian model on data with the given BPE segmentation."
    echo "usage: ./finetune_transformer.sh --orig-model <dir> --bpe <bpe> --gpus <ids> ..."
    echo "   -s, --src             Source language, default: $SRC"
    echo "   -t, --tgt             Target language, default: $TGT"
    echo "   --orig-model          Directory of the model to start from."
    echo "   -b, --bpe             Suffix of the training data after 'bpe'; prefix 'w' selects the wbpe validation script."
    echo "   --batches-per-step    Number of batches after which the training stops, default: $BATCH_PER_STEP"
    echo "   --learning-rate       Learning rate, default: $LEARNING_RATE"
    echo "   --gpus                Comma-separated IDs of GPUs that will be used."
    echo "   -h, --help            Print this message and exit."
}

# Consume the arguments one option at a time; options with a value shift once
# more to reach the value.
while [ "$1" != "" ]; do
    case $1 in
        -s | --src )   shift
                       SRC=$1
                       ;;
        -t | --tgt )   shift
                       TGT=$1
                       ;;
        --orig-model ) shift
                       ORIG_MODEL=$1
                       ;;
        --batches-per-step )  shift
                       BATCH_PER_STEP=$1
                       ;;
        --learning-rate )  shift
                       LEARNING_RATE=$1
                       ;;
        -b | --bpe )   shift
                       BPE=$1
                       ;;
        --gpus )       shift
                       GPUS=$1
                       ;;
        -h | --help )  usage
                       exit
                       ;;
        * )            usage
                       exit 1
    esac
    shift
done

# CHECK IF GPUS ARE SET AND FORMAT GPUS
if [ ! -v GPUS ]; then
    echo No GPUs were set. >&2
    exit 1
fi
# Marian expects the device IDs separated by spaces.
GPUS=$(echo $GPUS | sed -e 's/,/ /g')
GPU_COUNT=$(echo $GPUS | wc -w)
if [ $GPU_COUNT -gt 1 ]; then
    SUFFIX="${SUFFIX}_gpu${GPU_COUNT}"
fi

# CHECK IF ORIGINAL MODEL EXISTS
# The quotes matter: with an empty value an unquoted test degenerates to
# "[ ! -d ]", which is false, so a missing --orig-model went unnoticed.
if [ ! -d "$ORIG_MODEL" ]; then
    echo Original model \"$ORIG_MODEL\" does not exist. >&2
    exit 1
fi

# Data directories are always named "en<other language>".
if [[ $SRC == en ]]; then
    PAIR=en${TGT}
else
    PAIR=en${SRC}
fi


# Check the vocabulary of the original model; VOCAB_FILE (the copy in the new
# model directory) is only defined further below.
ORIG_VOCAB_FILE=$ORIG_MODEL/vocab.$PAIR.yml
if [ ! -e "$ORIG_VOCAB_FILE" ]; then
    echo Vocabulary of original model \"$ORIG_VOCAB_FILE\" was not found. >&2
    exit 1
fi

DATA_SUFFIX=bpe${BPE}

# Pick the validation script that matches the tokenization: BPE settings that
# start with "w" (word-piece-like) use the wbpe variant.
case $BPE in
    w* ) VALID_PREFIX=valid_wbpe
         ;;
    * )  VALID_PREFIX=valid_bpe
         ;;
esac
VALID_SCRIPT=${VALID_PREFIX}_${SRC}${TGT}.sh

# The fine-tuned model lives next to the original one; its directory name is
# the original name plus the data suffix.
MODEL_DIR=${ORIG_MODEL}_${DATA_SUFFIX}
VOCAB_FILE=$MODEL_DIR/vocab.$PAIR.yml
mkdir -p $MODEL_DIR

# Copy the vocabulary only once.
[ -e $VOCAB_FILE ] || cp $ORIG_VOCAB_FILE $MODEL_DIR

if [ -e $MODEL_DIR/model.npz ]; then
    # Training was already started here: drop the stored config, otherwise
    # Marian complains about a conflict with the config file.
    rm $MODEL_DIR/model.npz.yml || echo
else
    # Fresh directory: start from the original model.
    cp $ORIG_MODEL/model.npz $MODEL_DIR
fi

# RE-TRAIN THE MODEL
# Continues training of $MODEL_DIR/model.npz on the data with the selected
# segmentation. The same vocabulary file is passed for both languages, the
# maxi-batch grows with the number of GPUs, and training stops after
# $BATCH_PER_STEP batches or when early stopping triggers. Logs and the
# validation translations are written into $MODEL_DIR.
$MARIAN_HOME/build/marian \
    --model $MODEL_DIR/model.npz \
    --train-sets data/$PAIR/train/{$SRC,$TGT}.$DATA_SUFFIX \
    --max-length 400 \
    --vocabs $VOCAB_FILE $VOCAB_FILE \
    --mini-batch-fit -w $BATCH --maxi-batch $((1000 * $GPU_COUNT)) \
    --early-stopping 5 \
    --after-batches $BATCH_PER_STEP \
    --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
    --valid-metrics cross-entropy perplexity translation \
    --valid-sets data/$PAIR/val/{$SRC,$TGT}.$DATA_SUFFIX \
    --valid-script-path validation_scripts/$VALID_SCRIPT \
    --valid-translation-output $MODEL_DIR/valid.output --quiet-translation \
    --valid-mini-batch $VALID_BATCH \
    --beam-size 6 --normalize 0.6 \
    --log $MODEL_DIR/train.log --valid-log $MODEL_DIR/valid.log \
    --optimizer adam --learn-rate $LEARNING_RATE --clip-norm 5 \
    --devices $GPUS --sync-sgd --seed 12674 \
    --exponential-smoothing 0.1 --transformer-dropout 0.1
                                                                                                                                                                                                                                                                       get_vocabulary.py                                                                                   0000755 0075125 0072461 00000004677 13563513733 015261  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/usr/bin/env python3

import argparse
import multiprocessing
import sys
import yaml

def get_vocab(lines):
    """Count the whitespace-separated tokens in an iterable of lines.

    Returns a dict mapping every token to its number of occurrences, with
    the tokens in order of first appearance.
    """
    counts = {}
    for text in lines:
        for word in text.rstrip().split():
            counts[word] = counts.get(word, 0) + 1
    return counts


def merge_in_vocab(orig, new):
    """Add the token counts from *new* to *orig* in place."""
    for word in new:
        orig[word] = orig.get(word, 0) + new[word]

def main():
    """Count the tokens of the input files and print the vocabulary.

    The lines are collected into chunks that are counted by a pool of worker
    processes.  By default the output is a tab-separated list of tokens and
    counts sorted by decreasing count; with ``--marian-yaml`` it is a YAML
    mapping from tokens to indices.
    """
    # The text is the description; the first positional parameter of
    # ArgumentParser would be the program name.
    parser = argparse.ArgumentParser(
        description="Get vocabulary from plain text files.")
    parser.add_argument(
        "input", nargs="+",  type=argparse.FileType('r'),
        help="List of input files, use - for stdin.")
    parser.add_argument(
        "--min-count", type=int, default=10,
        help="Only tokens that occur more often than this are included in "
             "the Marian YAML vocabulary.")
    parser.add_argument(
        "--num-threads", type=int, default=4,
        help="Number of threads")
    parser.add_argument(
        "--marian-yaml", action="store_true", default=False,
        help="Get the vocabulary in the Marian YAML format.")
    args = parser.parse_args()

    vocabulary = {}

    # The context manager shuts the worker processes down when counting ends.
    with multiprocessing.Pool(processes=args.num_threads) as pool:
        for input_file in args.input:
            print(f"Reading file '{input_file.name}'", file=sys.stderr)
            line_buffers = []
            current_buffer = []

            for line in input_file:
                current_buffer.append(line)

                if len(current_buffer) > 200:
                    line_buffers.append(current_buffer)
                    current_buffer = []

                # Process the chunks as soon as there are more of them than
                # workers, so the whole file is never held in memory.
                if len(line_buffers) > args.num_threads:
                    for vocab in pool.map(get_vocab, line_buffers):
                        merge_in_vocab(vocabulary, vocab)
                    line_buffers = []

            # Whatever is left after the last full round.
            if current_buffer:
                line_buffers.append(current_buffer)
            for vocab in pool.map(get_vocab, line_buffers):
                merge_in_vocab(vocabulary, vocab)

            if input_file is not sys.stdin:
                input_file.close()

    by_frequency = sorted(
        vocabulary.items(), key=lambda x: x[1], reverse=True)

    if args.marian_yaml:
        yaml_vocabulary = {
            "</s>": 0,
            "<unk>": 1}

        for token, count in by_frequency:
            # Keep the reserved entries: re-assigning one of them would not
            # grow the dict, so the next token would get the same index.
            if count > args.min_count and token not in yaml_vocabulary:
                yaml_vocabulary[token] = len(yaml_vocabulary)

        yaml.dump(yaml_vocabulary, sys.stdout, default_flow_style=False, allow_unicode=True)
    else:
        for token, count in by_frequency:
            print("{}\t{}".format(token, count))


if __name__ == "__main__":
    main()
                                                                 noisy_slope.py                                                                                      0000755 0075125 0072461 00000002434 13567266604 014611  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/usr/bin/env python3

"""Coefficient for noise sensitivity evaluation."""

import argparse
import math
import os
import sys

import numpy as np
from scipy.stats import linregress


def load_file(path):
    """Read a file with one number per line into a NumPy array."""
    values = []
    with open(path) as handle:
        for raw in handle:
            values.append(float(raw.strip()))
    return np.array(values)


def main():
    """Print the slope-to-intercept ratio of every score series.

    Every file holds the scores measured at the noise probabilities
    0.0, 0.1, ..., 1.0, one number per line.  A line is fitted through each
    series and the slope divided by the intercept is printed, one value per
    input file.
    """
    # The docstring is the description; the first positional parameter of
    # ArgumentParser would be the program name.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "files", nargs="+", type=str, help="File with numbers.")
    args = parser.parse_args()

    non_existing = [path for path in args.files if not os.path.exists(path)]
    if non_existing:
        print(f"Files do not exist: {', '.join(non_existing)}",
              file=sys.stderr)
        sys.exit(1)

    if len(args.files) < 2:
        print("Provide at least two series of numbers", file=sys.stderr)
        sys.exit(1)

    all_series = [load_file(path) for path in args.files]

    # Eleven noise levels; every series needs one measurement per level.
    noise_probabilities = np.arange(0, 1.1, 0.1)

    for path, series in zip(args.files, all_series):
        if len(series) != len(noise_probabilities):
            print(f"Missing measurements in {path}.", file=sys.stderr)
            sys.exit(1)

    for series in all_series:
        fit = linregress(noise_probabilities, series)
        print(fit.slope / fit.intercept)


if __name__ == "__main__":
    main()
                                                                                                                                                                                                                                    prepare.sh                                                                                          0000755 0075125 0072461 00000017704 13664726004 013665  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# #############################################################################
# Requirements:
#  * sacremoses
#  * fastBPE
# #############################################################################
FAST_BPE=$HOME/local/fastBPE/bin


# =============================================================================
# Training Data
# - en-de WMT14 data
# - en-cs WMT17 data
# =============================================================================
mkdir tmp_data
cd tmp_data
# One staging directory per language pair. enfr is needed as well: the
# CommonCrawl, Europarl and News Commentary v9 steps move French data into it.
mkdir ende encs enfr

wget http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz

# Extract only the German and Czech parts of the archive.
tar -zxvf training-parallel-nc-v13.tgz training-parallel-nc-v13/news-commentary-v13.de-en.en
tar -zxvf training-parallel-nc-v13.tgz training-parallel-nc-v13/news-commentary-v13.de-en.de
tar -zxvf training-parallel-nc-v13.tgz training-parallel-nc-v13/news-commentary-v13.cs-en.en
tar -zxvf training-parallel-nc-v13.tgz training-parallel-nc-v13/news-commentary-v13.cs-en.cs
rm training-parallel-nc-v13.tgz

mv training-parallel-nc-v13/news-commentary-v13.de-en.{en,de} ende/
mv training-parallel-nc-v13/news-commentary-v13.cs-en.{en,cs} encs/

rmdir training-parallel-nc-v13

# # -----------------------------------------------------------------------------

wget http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz

# Extract the English side and the other side for German, French and Czech.
for CC_LANG in de fr cs; do
    for CC_SIDE in en $CC_LANG; do
        tar -zxvf training-parallel-commoncrawl.tgz commoncrawl.$CC_LANG-en.$CC_SIDE
    done
done
rm training-parallel-commoncrawl.tgz

# Sort the extracted files into the staging directories of the pairs.
for CC_LANG in de fr cs; do
    mv commoncrawl.$CC_LANG-en.{en,$CC_LANG} en$CC_LANG/
done

# -----------------------------------------------------------------------------

wget http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz

# Extract the English side and the other side for German, Czech and French.
for EP_LANG in de cs fr; do
    for EP_SIDE in en $EP_LANG; do
        tar -zxvf training-parallel-europarl-v7.tgz training/europarl-v7.$EP_LANG-en.$EP_SIDE
    done
done
rm training-parallel-europarl-v7.tgz

# Sort the extracted files into the staging directories of the pairs.
for EP_LANG in de cs fr; do
    mv training/europarl-v7.$EP_LANG-en.{en,$EP_LANG} en$EP_LANG/
done
rmdir training

# -----------------------------------------------------------------------------

wget http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz
# Only the French-English part of News Commentary v9 is used.
for NC_SIDE in en fr; do
    tar -zxvf training-parallel-nc-v9.tgz training/news-commentary-v9.fr-en.$NC_SIDE
done
rm training-parallel-nc-v9.tgz

# Remove carriage returns from the extracted files before moving them.
sed -i 's/\r//g' training/*
mv training/news-commentary-v9.fr-en.{en,fr} enfr/
rmdir training

# -----------------------------------------------------------------------------

# Czech-English data in plain-text format from the LINDAT repository.
wget https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1458/data-plaintext-format.tar
tar xvf data-plaintext-format.tar
# Concatenate all training parts into one tab-separated file.
zcat data.plaintext-format/*train.gz > czeng.tsv
# The third column goes to the Czech side, the fourth to the English side.
cut -f 3 czeng.tsv > encs/czeng.cs
cut -f 4 czeng.tsv > encs/czeng.en
# Make the extracted tree writable before deleting it
# (presumably it is extracted read-only -- not visible from here).
chmod -R +w data.plaintext-format
rm -rf data.plaintext-format

# -----------------------------------------------------------------------------

# COLLECT AND TOKENIZE EVERYTHING

for TGT in de cs fr; do
    FINAL_DIR=en${TGT}_final
    mkdir $FINAL_DIR
    cd en$TGT
    dos2unix *
    # Glue the two sides together line by line so that shuffling keeps the
    # sentence pairs aligned, then split them again.
    paste <(cat *.en) <(cat *.$TGT) | shuf > all.en$TGT
    cut -f1 all.en$TGT > ../$FINAL_DIR/en
    cut -f2 all.en$TGT > ../$FINAL_DIR/$TGT

    cd ../$FINAL_DIR
    sacremoses tokenize -l en < en > en.tok
    sacremoses tokenize -l $TGT < $TGT > $TGT.tok
    cd ..
done

cd ..
mkdir -p data/en{de,cs,fr}/{train,val,test}


for TGT in de cs fr; do
    # Move the files, not the directory: data/en${TGT}/train already exists,
    # so moving the directory would nest the data in train/en${TGT}_final/.
    mv tmp_data/en${TGT}_final/* data/en${TGT}/train/
    rmdir tmp_data/en${TGT}_final
done

# =============================================================================
# Download Turkish
# =============================================================================

# The directories for the Turkish data are not created anywhere else.
mkdir -p data/entr/{train,val,test}

# The URL is quoted because it contains "?", which the shell would otherwise
# treat as a glob character.
wget "http://opus.nlpl.eu/download.php?f=SETIMES/v2/moses/en-tr.txt.zip" -O en-tr.txt.zip

unzip -p en-tr.txt.zip SETIMES.en-tr.en > data/entr/train/en
unzip -p en-tr.txt.zip SETIMES.en-tr.tr > data/entr/train/tr
rm en-tr.txt.zip

# =============================================================================
# Validation and test data
# =============================================================================

# Write source and reference of a sacrebleu test set into a data split.
# Arguments: <test set> <non-English language> <split directory>
function echo_testset {
    sacrebleu -t $1 -l en-$2 --echo src > data/en$2/$3/en
    sacrebleu -t $1 -l en-$2 --echo ref > data/en$2/$3/$2
}

echo_testset wmt13 de val
echo_testset wmt14 de test

echo_testset wmt17 cs val
echo_testset wmt18 cs test

echo_testset wmt17 tr val
echo_testset wmt18 tr test

# =============================================================================
# Apply BPE on everything
# =============================================================================

for F in data/*/{train,test,val,val_full}/??; do
    # Patterns that match nothing are passed through literally (e.g. when a
    # split directory does not exist); skip them.
    [ -f "$F" ] || continue
    echo $F
    ./wordpiece_tokenize.py $F > $F.wtok
done

# Learn the 32k merges for every pair whose codes are read later on,
# including entr, which the loops below truncate and apply.
for PAIR in ende enfr encs entr; do
    $FAST_BPE/fast learnbpe 32000 data/$PAIR/train/??.wtok > data/$PAIR/wbpe32k;
done

# Smaller merge tables are prefixes of the 32k table.
for PAIR in ende encs entr; do
    for KSIZE in 16 8 4 2 1; do
        head -n ${KSIZE}000 data/$PAIR/wbpe32k > data/$PAIR/wbpe${KSIZE}k
    done

    # Use SIZE here, not the KSIZE left over from the loop above, which
    # would cut every table to the same single line.
    for SIZE in {950..0..50}; do
        head -n ${SIZE} data/$PAIR/wbpe1k > data/$PAIR/wbpe${SIZE}
    done
done

# Segment every pre-tokenized file with every merge table. The "@@ "
# continuation markers are replaced by plain spaces afterwards.
for PAIR in ende encs entr; do
    for SIZE in 32k 16k 8k 4k 2k 1k {950..0..50}; do
        CODES=data/$PAIR/wbpe${SIZE}
        for FILE in data/$PAIR/*/*.wtok; do
            echo $FILE
            # Same base name with ".wtok" replaced by ".bpew<size>".
            OUTFILE=${FILE%.wtok}.bpew${SIZE}
            $FAST_BPE/fast applybpe $OUTFILE $FILE $CODES
            sed -i 's/@@ / /g' $OUTFILE
        done
    done
done

# Prepare the data with sampled noise
git clone https://github.com/ybisk/charNMT-noise

for TGT in de cs tr; do
    PAIR=en${TGT}
    for LNG in en $TGT; do
        NOISY_FILE=data/$PAIR/test/$LNG.wtok.noisy
        # Start from an empty file: the loop below appends, so a re-run would
        # otherwise add ten more copies of the test set.
        : > $NOISY_FILE
        # Ten noisy copies of the test set, one per noise probability.
        for NOISE in 0.{1..9} 1.0; do
            ./sample_natural_noise.py charNMT-noise/noise/$LNG.natural $NOISE data/$PAIR/test/$LNG.wtok >> $NOISY_FILE
        done
        for SIZE in 32k 16k 8k 4k 2k 1k {950..0..50}; do
            CODES=data/$PAIR/wbpe${SIZE}
            $FAST_BPE/fast applybpe $NOISY_FILE.$SIZE $NOISY_FILE $CODES
            sed -i 's/@@ / /g' $NOISY_FILE.$SIZE
        done
    done
done

# Download everything for Morpheval evaluation

mkdir morpheval
cd morpheval

wget https://morpheval.limsi.fr/morpheval.limsi.v2.en.info
wget https://morpheval.limsi.fr/morpheval.limsi.v2.en.sents
wget https://github.com/ufal/morphodita/releases/download/v1.3.0/morphodita-1.3.0-bin.zip
unzip morphodita-1.3.0-bin.zip

wget https://www.cis.uni-muenchen.de/~schmid/tools/SMOR/data/SMOR-linux.tar.gz
tar zxvf SMOR-linux.tar.gz

wget https://morpheval.limsi.fr/lefff.pkl

../wordpiece_tokenize.py morpheval.limsi.v2.en.sents > morpheval.limsi.v2.en.sents.wtok

for PAIR in ende encs enfr; do
    # The output directory is not created anywhere else.
    mkdir -p segmented/$PAIR
    for BPE in ../data/$PAIR/wbpe*; do
        echo $BPE
        # ${BPE:13:1000} is the file name of the merge table, i.e. the path
        # without the 13-character "../data/<pair>/" prefix.
        $FAST_BPE/fast applybpe_stream $BPE < morpheval.limsi.v2.en.sents.wtok | sed -e 's/@@ / /g' > segmented/$PAIR/sents.$PAIR.${BPE:13:1000}
    done
done

git clone https://github.com/franckbrl/morpheval_v2

# NOTE(review): evaluate_model.sh reads the dictionary from
# czech-morfflex-pdt-131112, but the 161115 release is downloaded here --
# confirm which version is meant.
curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1836{/czech-morfflex-pdt-161115.zip}
unzip czech-morfflex-pdt-161115.zip

# evaluate_model.sh runs meteor/meteor-1.5.jar relative to the project root,
# so METEOR is cloned there and not inside morpheval/.
cd ..
git clone https://github.com/cmu-mtlab/meteor
cd meteor
ant
                                                            sample_natural_noise.py                                                                             0000755 0075125 0072461 00000003156 13563532265 016447  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/usr/bin/env python3

"""Sample natrual noise into tokenized text ."""

import argparse
import random
import sys


SPACE = "▁"


def load_table(table_file):
    """Load the table of typos from an open file and close the file.

    Every line holds a word followed by its whitespace-separated variants.
    Blank lines and words without any variant are skipped, so every entry of
    the result has at least one replacement to sample from.

    Args:
        table_file: Open text file with the table; it is closed on return.

    Returns:
        A dict mapping each word to the list of its variants.
    """
    error_table = {}
    # The context manager closes the file even when reading fails.
    with table_file:
        for line in table_file:
            words = line.split()
            if len(words) < 2:
                continue
            error_table[words[0]] = words[1:]

    return error_table


def main():
    """Replace tokens of the input by sampled typos and print the result.

    A token is looked up in the typo table without its first character; when
    it is found, it is replaced with the given probability by the word
    boundary marker followed by a randomly chosen variant.  The noisy text
    goes to standard output, the replacement statistics to standard error.

    Raises:
        ValueError: If the probability is not between 0 and 1.
    """
    # The docstring is the description; the first positional parameter of
    # ArgumentParser would be the program name.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "dictionary", type=argparse.FileType('r'),
        help="File from 'charNMT-noise' tabulating the frequent typos.")
    parser.add_argument(
        "probability", type=float, help="Sampling probability")
    parser.add_argument(
        "input", nargs="?", default=sys.stdin, type=argparse.FileType('r'))
    args = parser.parse_args()

    if args.probability < 0 or args.probability > 1:
        raise ValueError("Probability must be between 0 and 1.")

    error_table = load_table(args.dictionary)

    total_tokens = 0
    replacements = 0

    for line in args.input:
        new_tokens = []
        for token in line.split():
            total_tokens += 1
            # The first character is dropped for the lookup; presumably it is
            # always the SPACE marker -- verify against the tokenizer output.
            word = token[1:]
            if word in error_table and random.random() < args.probability:
                new_tokens.append(SPACE + random.choice(error_table[word]))
                replacements += 1
            else:
                new_tokens.append(token)
        print(" ".join(new_tokens))

    # An empty input has no tokens; report a zero rate instead of dividing
    # by zero.
    rate = replacements / total_tokens if total_tokens else 0.0

    print(f"Total tokens: {total_tokens}, replaced: {replacements}",
          file=sys.stderr)
    print(f"Replacement rate: {rate:.2f}",
          file=sys.stderr)


if __name__ == "__main__":
    main()
                                                                                                                                                                                                                                                                                                                                                                                                                  train_transformer.sh                                                                                0000755 0075125 0072461 00000012536 13664725234 015770  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# Stop at the first failing command and print every command before it runs.
set -ex

MARIAN_HOME=$HOME/marian

# Defaults of the command-line options; the option parser further below
# overrides them.
# Source and target language (-s/--src, -t/--tgt).
SRC=en
TGT=de
# Training batch size in number of words (--batch).
BATCH=9500
# Validation batch size in number of sentences (--val-batch).
VALID_BATCH=64
# Set by --ff-layer.
FF_LAYER=2048
# Set by --suffix; empty by default.
SUFFIX=
# Random seed (--seed).
SEED=1111
# Number of the model layers (--depth).
DEPTH=6
# Initial learning rate (--lr).
LEARNING_RATE=0.0003
# Set by --dim.
DIM=512
# Empty by default; --dropout sets it to "drop".
DROP=

# Print the command-line help to stdout. Values shown as defaults are the
# current values of the corresponding variables at the time of the call.
function usage {
    echo "Train MT system using Marian."
    echo "usage: $0 -s <lng1> -t <lng2> --bpe <bpe> --gpus <ids> ..."
    echo "   -s, --src      Source language (en|cs|fr|de|et|tr|ru), default: $SRC"
    echo "   -t, --tgt      Target language (en|cs|fr|de|et|tr|ru), default: $TGT"
    echo "   -b, --bpe      Size of BPE vocabulary: 32k|16k|8k|4k|2k|1k|900|800|700|600|500|250|125|63|0"
    echo "                  Prefix 'w' means word-piece-like tokenization."
    echo "   -c, --char     Use character n-gram data (data suffix 'char<value>')."
    echo "   --words        Use word-level data (data suffix 'words<value>')."
    echo "   --depth        Number of the model layers, default: $DEPTH"
    echo "   --dim          Embedding dimension, default: $DIM"
    echo "   --ff-layer     Feed-forward layer dimension, default: $FF_LAYER"
    echo "   --batch        Training batch size in number of words, default: $BATCH"
    echo "   --val-batch    Validation batch size in number of sentences, default: $VALID_BATCH"
    echo "   --seed         Random seed, default: $SEED"
    echo "   --lr           Initial learning rate, default: $LEARNING_RATE"
    echo "   --suffix       Text appended to the model directory name."
    echo "   --gpus         Comma-separated IDs of GPUs that will be used (required)."
    echo "   --dropout      Turns on BPE-dropout on training data."
    echo "   -h, --help     Show this help and exit."
}


# Walk over the command-line arguments. An option that takes a value reads it
# from $2 and shifts once itself; the shift at the bottom of the loop then
# drops the option name, so each iteration starts at the next option.
while [ "$1" != "" ]; do
    case $1 in
        -s | --src )    SRC=$2;            shift ;;
        -t | --tgt )    TGT=$2;            shift ;;
        -b | --bpe )    BPE=$2;            shift ;;
        -c | --char )   CHARNGRAM=$2;      shift ;;
        --words )       WORDS=$2;          shift ;;
        --batch )       BATCH=$2;          shift ;;
        --val-batch )   VALID_BATCH=$2;    shift ;;
        --suffix )      SUFFIX=$2;         shift ;;
        --seed )        SEED=$2;           shift ;;
        --depth )       DEPTH=$2;          shift ;;
        --ff-layer )    FF_LAYER=$2;       shift ;;
        --dim )         DIM=$2;            shift ;;
        --lr )          LEARNING_RATE=$2;  shift ;;
        --gpus )        GPUS=$2;           shift ;;
        # Flag without a value.
        --dropout )     DROP=drop ;;
        -h | --help )   usage
                        exit ;;
        # Anything else is an error: show the help and fail.
        * )             usage
                        exit 1 ;;
    esac
    shift
done

# Require a GPU list and convert the comma-separated IDs into the
# space-separated form that is later passed to marian.
# An empty value (e.g. --gpus "") is rejected as well as an unset one.
if [ -z "${GPUS:-}" ]; then
    echo "No GPUs were set." >&2
    exit 1
fi

# Split on commas/whitespace into an array so the IDs can be counted directly
# instead of spawning `wc -w` twice.
read -ra GPU_LIST <<< "${GPUS//,/ }"
GPU_COUNT=${#GPU_LIST[@]}
if [ "$GPU_COUNT" -eq 0 ]; then
    # The value consisted only of separators, e.g. --gpus ","
    echo "No GPUs were set." >&2
    exit 1
fi
GPUS="${GPU_LIST[*]}"

# Multi-GPU runs get their own model directory suffix.
if [ "$GPU_COUNT" -gt 1 ]; then
    SUFFIX="${SUFFIX}_gpu${GPU_COUNT}"
fi


# Validate the language pair. Error messages go to stderr (">&2"); the former
# "2> /dev/stderr" only redirected echo's own stderr, so they ended up on stdout.
if [[ ! $SRC =~ ^(en|cs|fr|de|et|tr|ru)$ ]]; then
    echo "Unknown source language $SRC" >&2
    exit 1
fi
if [[ ! $TGT =~ ^(en|cs|fr|de|et|tr|ru)$ ]]; then
    echo "Unknown target language $TGT" >&2
    exit 1
fi

if [[ $SRC != en && $TGT != en ]]; then
    echo "One of the languages must be English." >&2
    exit 1
fi

# After the check above this can only trigger for en-en.
if [[ $SRC == "$TGT" ]]; then
    echo "Source and target languages must differ." >&2
    exit 1
fi


# PAIR always starts with "en", followed by the other language of the pair,
# regardless of the translation direction.
case $SRC in
    en ) PAIR=en${TGT} ;;
    *  ) PAIR=en${SRC} ;;
esac

# At most one of --bpe, --char and --words may be given. The check used to
# print a message and carry on; it now aborts, and reports on stderr.
if [[ -v BPE && -v CHARNGRAM || -v BPE && -v WORDS || -v CHARNGRAM && -v WORDS ]]; then
    echo "You can use only one of --bpe, --char and --words at a time." >&2
    exit 1
fi

# Only the listed BPE sizes are accepted, optionally prefixed with 'w'.
if [[ -v BPE && ! $BPE =~ ^w{0,1}(32k|16k|8k|4k|2k|1k|900|800|700|600|500|250|125|63|0)$ ]]; then
    echo "Available BPE sizes are 32/16/8/4/2/1k/900/800/700/600/500/250/125/63/0, was ${BPE}" >&2
    exit 1
fi

# Derive the data file suffix and the validation script from the chosen
# segmentation. The three blocks are independent on purpose: if several
# options were set, the later block wins, exactly as before.
DATA_SUFFIX=
if [[ -v BPE ]]; then
    DATA_SUFFIX=bpe${BPE}
    PREFIX=valid_bpe
    # A leading 'w' selects the word-piece variant of the validation script.
    if [[ $BPE =~ ^w.*$ ]]; then
        PREFIX=valid_wbpe
    fi
    VALID_SCRIPT=${PREFIX}_${SRC}${TGT}.sh
fi
if [[ -v WORDS ]]; then
    DATA_SUFFIX=words${WORDS}
    VALID_SCRIPT=valid_bpe_${SRC}${TGT}.sh
fi
if [[ -v CHARNGRAM ]]; then
    DATA_SUFFIX=char$CHARNGRAM
    VALID_SCRIPT=valid_ngram_${SRC}${TGT}.sh
fi

# Without any segmentation option there is neither a data suffix nor a
# validation script; stop here instead of starting training with broken paths.
if [[ -z $DATA_SUFFIX ]]; then
    echo "One of --bpe, --char or --words must be given." >&2
    exit 1
fi


# Model directory name encodes direction, segmentation, dropout and suffix.
MODEL_DIR=models/${SRC}${TGT}_${DATA_SUFFIX}${DROP}${SUFFIX}
mkdir -p "$MODEL_DIR"
# Remove a model.npz.yml left over from a previous run, if there is one.
rm -f "$MODEL_DIR/model.npz.yml"

# Build the joint vocabulary once. It is generated into temporary files and
# renamed only at the end: with the former "python ... | sort > $VOCAB_FILE"
# a failing generator still left an (empty) vocabulary file behind, which the
# existence check then reused forever.
VOCAB_FILE=$MODEL_DIR/vocab.$PAIR.yml
if [[ ! -e $VOCAB_FILE ]]; then
    # {$SRC,$TGT} must stay unquoted so that it brace-expands to both files.
    python3 ./get_vocabulary.py data/${PAIR}/train/{$SRC,$TGT}.$DATA_SUFFIX --marian-yaml > "$VOCAB_FILE.unsorted"
    sort -nk2 "$VOCAB_FILE.unsorted" > "$VOCAB_FILE.tmp"
    rm "$VOCAB_FILE.unsorted"
    mv "$VOCAB_FILE.tmp" "$VOCAB_FILE"
fi


# Start Marian training of a transformer model. The model, the training and
# validation logs and the validation translations are written to $MODEL_DIR.
#
# * {$SRC,$TGT} brace-expands to two paths (source first, then target), so
#   --train-sets and --valid-sets each receive two files.
# * The same vocabulary file is passed twice to --vocabs, matching
#   --tied-embeddings-all.
# * --maxi-batch is scaled by the number of GPUs.
# * $GPUS is left unquoted so that it splits into one argument per GPU ID.
# * Validation translations are scored by validation_scripts/$VALID_SCRIPT.
# NOTE(review): $BATCH is passed as -w, which looks like Marian's workspace
# option rather than a batch size in words - confirm against the Marian docs.
$MARIAN_HOME/build/marian \
    --model $MODEL_DIR/model.npz --type transformer \
    --train-sets data/$PAIR/train/{$SRC,$TGT}.$DATA_SUFFIX$DROP \
    --max-length 400 \
    --vocabs $VOCAB_FILE $VOCAB_FILE \
    --mini-batch-fit -w $BATCH --maxi-batch $((1000 * $GPU_COUNT)) \
    --early-stopping 5 \
    --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
    --valid-metrics cross-entropy perplexity translation \
    --valid-sets data/$PAIR/val/{$SRC,$TGT}.$DATA_SUFFIX \
    --valid-script-path validation_scripts/$VALID_SCRIPT \
    --valid-translation-output $MODEL_DIR/valid.output --quiet-translation \
    --valid-mini-batch $VALID_BATCH \
    --beam-size 6 --normalize 0.6 \
    --log $MODEL_DIR/train.log --valid-log $MODEL_DIR/valid.log \
    --dim-emb $DIM \
    --enc-depth $DEPTH --dec-depth $DEPTH \
    --transformer-heads 8 \
    --transformer-postprocess-emb d \
    --transformer-postprocess dan \
    --transformer-dim-ffn $FF_LAYER \
    --transformer-dropout 0.1 --label-smoothing 0.1 \
    --learn-rate $LEARNING_RATE --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
    --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
    --tied-embeddings-all \
    --devices $GPUS --sync-sgd --seed $SEED \
    --exponential-smoothing
                                                                                                                                                                  validation_scripts/                                                                                 0000755 0075125 0072461 00000000000 13664725277 015573  5                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              validation_scripts/valid_wbpe_encs.sh                                                               0000755 0075125 0072461 00000000160 13563513735 021243  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/encs/val/cs --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_enfr.sh                                                               0000755 0075125 0072461 00000000160 13563513735 021245  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/enfr/val/fr --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_ende.sh                                                               0000755 0075125 0072461 00000000160 13563513735 021226  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/ende/val/de --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_deen.sh                                                               0000755 0075125 0072461 00000000160 13563513735 021226  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/ende/val/en --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_enet.sh                                                               0000755 0075125 0072461 00000000160 13571502133 021233  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/enet/val/et --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_fren.sh                                                               0000755 0075125 0072461 00000000160 13563513735 021245  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/enfr/val/en --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_eten.sh                                                               0000755 0075125 0072461 00000000160 13567734644 021256  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/enet/val/en --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_entr.sh                                                               0000755 0075125 0072461 00000000160 13572211642 021253  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/entr/val/tr --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_enru.sh                                                               0000755 0075125 0072461 00000000160 13571706067 021265  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/enru/val/ru --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_tren.sh                                                               0000755 0075125 0072461 00000000160 13570435070 021254  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/entr/val/en --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                validation_scripts/valid_wbpe_csen.sh                                                               0000755 0075125 0072461 00000000160 13563513735 021243  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/bin/bash

# File with the translations to score, given as the first argument.
OUTPUT=$1

# Undo the word-piece segmentation (drop all spaces, turn each "▁" into a
# space, strip the leading space) and score the result with sacrebleu.
sed -e 's/ //g;s/▁/ /g;s/^ //' "$OUTPUT" | sacrebleu data/encs/val/en --score-only --width 2
                                                                                                                                                                                                                                                                                                                                                                                                                wordpiece_tokenize.py                                                                               0000755 0075125 0072461 00000003413 13563513735 016131  0                                                                                                    ustar   libovicky                       cisintern                                                                                                                                                                                                              #!/usr/bin/env python3

"""Tokenize text in a word-piece style."""


import argparse
import multiprocessing
import sys
import unicodedata


# Marker prepended to the first piece of every space-separated token.
SPACE = "▁"


# All characters whose Unicode general category is a letter (L*) or a
# number (N*). Categories are two-letter codes, so testing the first letter
# is enough. The scan covers code points 0 .. sys.maxunicode - 1.
ALNUM_CHARSET = {
    char
    for char in map(chr, range(sys.maxunicode))
    if unicodedata.category(char)[0] in "LN"
}


def tokenize(string):
    """Split a line into word-piece-like tokens.

    The stripped line is split on single spaces. Every resulting chunk is cut
    wherever membership in ``ALNUM_CHARSET`` changes between neighbouring
    characters, and the first piece of each chunk is prefixed with ``SPACE``.
    A chunk made only of ``ALNUM_CHARSET`` characters (including an empty
    chunk) yields a single ``SPACE``-prefixed token.

    Returns the list of token strings.
    """
    pieces = []

    for chunk in string.strip().split(" "):
        is_alnum = [char in ALNUM_CHARSET for char in chunk]

        # Purely alphanumeric (or empty) chunk: keep it in one piece.
        if all(is_alnum):
            pieces.append(SPACE + chunk)
            continue

        # Cut the chunk at every position where the character class flips.
        start = 0
        for pos in range(1, len(chunk)):
            if is_alnum[pos] != is_alnum[pos - 1]:
                marker = SPACE if start == 0 else ""
                pieces.append(marker + chunk[start:pos])
                start = pos

        # Flush the trailing run; it carries the marker only if it is also
        # the first run of the chunk.
        marker = SPACE if start == 0 else ""
        pieces.append(marker + chunk[start:])

    return pieces


def main():
    """Tokenize the input line by line and print the result to stdout.

    Lines are read from the optional positional ``input`` file (stdin when it
    is omitted), collected into buffers of ``--buffer-size`` lines and
    tokenized in parallel by a pool of ``--num-threads`` worker processes.
    Each line is printed as its tokens joined by single spaces, in input
    order.
    """
    # ``description=`` is needed here: the first positional parameter of
    # ArgumentParser is ``prog``, so passing the docstring positionally used
    # it as the program name in the usage line.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "input", nargs="?", default=sys.stdin, type=argparse.FileType('r'))
    parser.add_argument(
        "--num-threads", type=int, default=8,
        help="number of worker processes")
    parser.add_argument(
        "--buffer-size", type=int, default=100000,
        help="number of lines tokenized in one batch")
    args = parser.parse_args()

    def process_buffer(pool, lines):
        # pool.map preserves the order of ``lines``.
        for pieces in pool.map(tokenize, lines):
            print(" ".join(pieces))

    # The context manager shuts the worker processes down on exit, including
    # when reading or tokenizing raises.
    with multiprocessing.Pool(args.num_threads) as pool:
        line_buffer = []
        for line in args.input:
            line_buffer.append(line)

            if len(line_buffer) >= args.buffer_size:
                process_buffer(pool, line_buffer)
                line_buffer = []

        # Flush whatever is left after the last full buffer.
        if line_buffer:
            process_buffer(pool, line_buffer)


if __name__ == "__main__":
    main()
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     