# Training and Testing of SMT SMS normalisation

# Step 1: Before you run this script, split the data into a source (ill-formed, ".ill") corpus and a target (normalised, ".norm") corpus.
# Note: in the training and tuning data, odd lines hold the correct message and even lines hold the ill-formed message; for SMS and Tweets the order is reversed.

# Step 2: To run this script, make sure the following paths are correct on the testing machine, and replace XXX with your folder name everywhere it appears in this script.
# Directory that holds the Moses tokenizer scripts (tokenizer.perl, lowercase.perl).
TOK='/home/XXX/smt/moses/scripts/tokenizer'
# Corpus folder containing the training, tuning and testing data.
CORPUS='/home/XXX/smt/corpus'
# File-name prefix of the training and tuning data.
ART='artificial'
# File-name prefix of the testing data.
PRE='corpus.tweet2'
# Tokenization: run the Moses tokenizer over both sides (ill, norm) of the
# training, tuning and testing data.
#   reads   $CORPUS/<prefix>.<set>.<side>
#   writes  $CORPUS/<prefix>.<set>.tok.<side>
# All expansions are quoted so paths containing spaces or glob characters
# work, and the script stops at the first failure so that later stages never
# run on a missing or truncated file.
for stem in "${ART}.train" "${ART}.tuning" "${PRE}.test"; do
  for side in ill norm; do
    "$TOK/tokenizer.perl" < "$CORPUS/$stem.$side" > "$CORPUS/$stem.tok.$side" || {
      echo "tokenization failed for $CORPUS/$stem.$side" >&2
      exit 1
    }
  done
done

# Sentence pruning is skipped because our data set is small.

# Lowercasing: run lowercase.perl over both sides (ill, norm) of the
# tokenized training, tuning and testing data.
#   reads   $CORPUS/<prefix>.<set>.tok.<side>
#   writes  $CORPUS/<prefix>.<set>.tok.lower.<side>
# Expansions are quoted, and the script stops at the first failure so that
# training/tuning/decoding never consume a missing or truncated file.
for stem in "${ART}.train" "${ART}.tuning" "${PRE}.test"; do
  for side in ill norm; do
    "$TOK/lowercase.perl" < "$CORPUS/$stem.tok.$side" > "$CORPUS/$stem.tok.lower.$side" || {
      echo "lowercasing failed for $CORPUS/$stem.tok.$side" >&2
      exit 1
    }
  done
done

# Phrase table generation with GIZA++ and mkcls (time consuming).
# Trains on the lowercased, tokenized training pair
# $CORPUS/$ART.train.tok.lower.{ill,norm}, with "ill" as the source side (--f)
# and "norm" as the target side (--e). The training script's standard output
# is saved to $CORPUS/train.out.
# The corpus location comes from $CORPUS (instead of a second hard-coded copy
# of the path) so it always matches where the preprocessing steps wrote their
# output. Abort on failure: tuning and decoding need the trained model.
/home/XXX/smt/moses/scripts/training/train-model.perl \
  -root-dir /home/XXX/smt \
  --corpus "$CORPUS/$ART.train.tok.lower" \
  --f ill \
  --e norm \
  --lm 0:2:/home/XXX/smt/model/gold.lm \
  > "$CORPUS/train.out" || {
  echo "train-model.perl failed" >&2
  exit 1
}

# Tuning: run MERT with /home/XXX/smt/mert as its working directory.
# Arguments, in order: tuning input (ill side), tuning reference (norm side),
# the moses decoder binary, and the moses.ini produced by training.
# The tuning files are taken from $CORPUS (instead of a second hard-coded copy
# of the path) so they always match where the preprocessing steps wrote them.
# Abort on failure so that later steps do not run after a broken tuning run.
/home/XXX/smt/moses/scripts/training/mert-moses.pl \
  --working-dir=/home/XXX/smt/mert \
  "$CORPUS/$ART.tuning.tok.lower.ill" \
  "$CORPUS/$ART.tuning.tok.lower.norm" \
  /home/XXX/smt/moses/moses-cmd/src/moses \
  /home/XXX/smt/model/moses.ini || {
  echo "mert-moses.pl failed" >&2
  exit 1
}

# filtering phrase table
# (this step is skipped)

# Decoding: translate the lowercased, tokenized ill-formed test messages and
# write the decoder's standard output to $CORPUS/$PRE.test.tok.lower.predict.
# Uses the moses.ini from the tuning working directory (/home/XXX/smt/mert):
# mert-moses.pl leaves the configuration with the tuned weights there, whereas
# /home/XXX/smt/model/moses.ini still holds the untuned weights from training,
# so decoding with it would silently discard the tuning step.
# Test files are taken from $CORPUS so they match the preprocessing output.
# Abort on failure so BLEU is never computed on a partial prediction file.
/home/XXX/smt/moses/moses-cmd/src/moses \
  -config /home/XXX/smt/mert/moses.ini \
  -input-file "$CORPUS/$PRE.test.tok.lower.ill" \
  1> "$CORPUS/$PRE.test.tok.lower.predict" || {
  echo "moses decoding failed" >&2
  exit 1
}

# Calculate BLEU: score the decoder output (read from stdin) against the
# lowercased, tokenized reference normalisation, and save the report to
# $CORPUS/$PRE.result.
# Files are taken from $CORPUS (instead of a second hard-coded copy of the
# path) so they match where the earlier steps wrote them; a scoring failure
# is reported on stderr and makes the script exit non-zero.
/home/XXX/smt/moses/scripts/generic/multi-bleu.perl \
  "$CORPUS/$PRE.test.tok.lower.norm" \
  < "$CORPUS/$PRE.test.tok.lower.predict" \
  > "$CORPUS/$PRE.result" || {
  echo "multi-bleu.perl failed" >&2
  exit 1
}
