#!/usr/bin/perl

# ------------------------
# -------- files ---------
# ------------------------

# implements buildmodel(GEN_DATA, TAR_DATA, TAR_ANNOTATED, TAR_DIC, MODEL_DIR, PROGRAM, SOLVER, HYPERPARAM)
require "build-model.pl";

# implements analyze(MODEL_DIR, DIC, INPUT_RAW, OUTPUT_PREFIX, PROGRAM)
require "analyze.pl";

# implements evaluate(REFERENCE, TEST, OUTPUT_PREFIX)
require "evaluate.pl";

# implements findannot(TAR_DATA, TAR_ANALYZED, WS_PROB, POS_PROB, CURR_ANNOT, NEXT_ANNOT, NUM_WORDS, PROGRAM, ACTIVE, CRITERION) -- argument names inferred from the call in this script
require "find-annotation.pl";

use strict;
use File::Path;
# Put the :utf8 layer on all three standard streams.
foreach my $stream (\*STDIN, \*STDOUT, \*STDERR) {
    binmode $stream, ":utf8";
}

# --------------------------------
# -------- parameters ------------
# --------------------------------

# Exactly four positional arguments are required: PROGRAM SOLVER TYPE CRITERION.
# The usage text lists every value accepted by the checks below.
if(@ARGV != 4) {
    print STDERR "Usage: process.pl PROGRAM={liblinear,crfpp,crfsuite,mecab,classias,kytea} SOLVER={svm,lrprimal,lrdual,lbfgs,mira,sgd} TYPE={part,full,sent,dict} CRITERION={margin,tot,avg}\n";
    exit 1;
}
my ($PROGRAM, $SOLVER, $ACTIVE, $CRITERION) = @ARGV;

# all output of this run goes below exp/<PROGRAM>-<SOLVER>-<TYPE>-<CRITERION>
my $id = "exp/".join("-", @ARGV);

# The arguments end up inside file names and shell command lines, so each one
# is matched against a closed list and anchored with \A...\z (a plain `$`
# would also accept a trailing newline).

# program type
#  point
#   liblinear
#  sequence
#   crfsuite
#   mecab
#  (crfpp, classias and kytea are accepted as well)
$PROGRAM =~ /\A(?:liblinear|crfpp|crfsuite|mecab|classias|kytea)\z/ or die "Bad program $PROGRAM";

# solver type
#  lrprimal (LIBLINEAR)
#  lrdual (LIBLINEAR)
#  lbfgs (CRF++)
#  mira (CRF++)
#  (svm and sgd are accepted as well)
$SOLVER =~ /\A(?:svm|lrprimal|lrdual|lbfgs|sgd|mira)\z/ or die "Bad solver $SOLVER";

# active learning type
#  part: annotate the word with the lowest confidence
#  sent: annotate the sentence with the lowest marginal confidence
#  dict: add the word with the lowest confidence to the dictionary
#  full: annotate sentences starting at the beginning
$ACTIVE =~ /\A(?:part|full|sent|dict)\z/ or die "Bad active learning type $ACTIVE";

# selection criterion, passed through to findannot()
$CRITERION =~ /\A(?:margin|tot|avg)\z/ or die "Bad criterion $CRITERION";


# constants
my $ITERATIONS = 100;   # number of active learning rounds
my $WORDS = 100;        # multiplied by the round number when calling findannot()
my $COVERAGE = ($PROGRAM eq "mecab") ? 0.9999 : 0.99;   # passed to builddict()
# absolute working directory, used to build symlink targets
my $wd = `pwd`; chomp $wd;

# --------------------------
# -------- process ---------
# --------------------------

# 0) build original files (if necessary)
# refuse to run over the output directory of an earlier experiment
die "Directory $id already exists" if -e $id;
# The following steps are placeholders; no code for them is present here:
# split corpus
# combine corpora into training data
# make dictionaries

# 1) tune parameters
# Known-good hyper-parameters, keyed by "PROGRAM-SOLVER". An entry found here
# skips the search below; all entries are currently commented out.
my %bestweights = (
    # "liblinear-lrprimal" => 1000,
    # "crfsuite-lbfgs" => 0.002,
    # "liblinear-lrdual" => 100,
    # "liblinear-svm" => 1,
    # "kytea-lrdual" => 100,
    # "kytea-svm" => 5,
    # "kytea-lrprimal" => 100,
    # "classias-lbfgs" => 0.2,
    # "mecab-lbfgs" => 1
);
my ($bestweight, $bestscore) = ($bestweights{"$PROGRAM-$SOLVER"});
# candidate hyper-parameter values; classias and crfsuite get a different grid
my @lineweight = qw(0.01 0.02 0.05 0.1 0.2 0.5 1 2 5 10 100 1000 10000 1000000);
@lineweight = qw(0.0001 0.001 0.002 0.005 0.01 0.02 0.05 0.1 0.2 0.5 1 2 5 10 100 1000) if $PROGRAM =~ /(classias|crfsuite)/;
if(not $bestweight) {
    my $tunegentrain = "gen-01-08";
    my $tunetartrain = "tar-01-08";
    my $tunegentest = "gen-09";
    my $dname = "data/$tunegentrain-$COVERAGE.dic";
    # derive the raw text: drop every "/..." run up to the next space, then all spaces
    `sed 's/\\/[^ ]*//g; s/ //g' < data/$tunegentest.wordpart > data/$tunegentest.raw` if not -e "data/$tunegentest.raw";
    builddict("/dev/null","data/$tunegentrain.wordpart",0,$dname,$COVERAGE);
    foreach my $weight ( @lineweight ) {
        buildmodel("data/$tunegentrain.wordpart", "data/$tunetartrain.wordpart", "/dev/null", $dname, 
                    "$id/tune-$weight/model", $PROGRAM, $SOLVER, $weight);
        analyze("$id/tune-$weight/model", $dname, "data/$tunegentest.raw", 
                    "$id/tune-$weight/$tunegentest", $PROGRAM);
        my $currscore = evaluate("data/$tunegentest.wordpart", "$id/tune-$weight/$tunegentest.wordpart", "$id/tune-$weight/$tunegentest");
        print "weight: $weight, score: $currscore\n";
        # The first evaluated weight always becomes the incumbent, so a weight
        # is selected even when no score exceeds zero; ties keep the earlier one.
        if(not defined $bestscore or $currscore > $bestscore) {
            $bestweight = $weight;
            $bestscore = $currscore;
        }
        # Remove the bulky intermediate files. Done with Perl's glob (which
        # expands the braces itself) because backticks run /bin/sh, where
        # brace expansion may not be available.
        my @leftovers = glob("$id/tune-$weight/*{feat,idx,model,prob}");
        rmtree(\@leftovers) if @leftovers;
    }
    print STDERR "Found best score $bestscore for $id at $bestweight\n";
} else {
    # no search was run, so there is no score to report
    print STDERR "Using preset weight $bestweight for $id\n";
}

# 2) prepare data for the original model
my $gentrain = "gen-01-09"; #TODO fix
my $tartrain = "tar-01-09"; #TODO fix
my $gentest = "gen-10";
my $tartest = "tar-10";
# For each corpus lacking a .raw file, derive it from the .wordpart file by
# dropping every "/..." run up to the next space and then all spaces.
foreach my $corpus ($gentest, $tartest, $tartrain) {
    next if -e "data/$corpus.raw";
    `sed 's/\\/[^ ]*//g; s/ //g' < data/$corpus.wordpart > data/$corpus.raw`;
}
my $dname = "data/$gentrain-$COVERAGE.dic";
# directory of iteration 000, the starting point of the active learning loop
unless (-e "$id/000") {
    mkpath("$id/000") or die "$id/000: $!";
}
builddict("/dev/null","data/$gentrain.wordpart",0,$dname,$COVERAGE);
# iteration 000 starts with an empty annotation file and a link to the dictionary built above
`touch $id/000/000.annot`;
`ln -s $wd/$dname $id/000/000.dic`;

# 3) do iterations of active learning
#
# Each round trains on the files in $id/<curr>, scores the general and target
# test sets, analyzes the target training text, and then writes the annotation
# file and dictionary for the following round into $id/<next>.

foreach my $iter (1 .. $ITERATIONS) {
    print STDERR "Starting Iteration $iter\n";
    my $curr = sprintf("%03i",$iter-1);
    my $next = sprintf("%03i",$iter);
    `ln -s $wd/data/$tartrain.wordpart $id/$curr/$curr.wordpart`;
    # in "dict" mode the annotation file is not given to the trainer
    buildmodel("data/$gentrain.wordpart", "$id/$curr/$curr.wordpart", 
                ($ACTIVE eq "dict")  ? "/dev/null" : "$id/$curr/$curr.annot", 
                "$id/$curr/$curr.dic", 
                "$id/$curr/model", $PROGRAM, $SOLVER, $bestweight);
    # grade the in-domain and out of domain test sets
    analyze("$id/$curr/model", "$id/$curr/$curr.dic", "data/$gentest.raw", "$id/$curr/$gentest", $PROGRAM);
    my $genf = evaluate("data/$gentest.wordpart", "$id/$curr/$gentest.wordpart", "$id/$curr/$gentest");
    analyze("$id/$curr/model", "$id/$curr/$curr.dic", "data/$tartest.raw", "$id/$curr/$tartest", $PROGRAM);
    my $tarf = evaluate("data/$tartest.wordpart", "$id/$curr/$tartest.wordpart", "$id/$curr/$tartest");
    print STDERR " F-measure: general=$genf, target=$tarf\n";
    analyze("$id/$curr/model", "$id/$curr/$curr.dic", "data/$tartrain.raw", "$id/$curr/$tartrain", $PROGRAM);
    # find the spots that should be annotated
    # The path must be quoted: unquoted, the file test is applied to the
    # numeric quotient $id / $next instead of the directory name.
    (mkpath "$id/$next" or die "$id/$next: $!") if not -e "$id/$next";
    # the requested amount grows by $WORDS every round
    findannot("$id/$curr/$curr.wordpart", "$id/$curr/$tartrain.wordpart", 
                "$id/$curr/$tartrain-ws.prob", "$id/$curr/$tartrain-pos.prob",
                "$id/$curr/$curr.annot", "$id/$next/$next.annot", 
                $WORDS*$iter, $PROGRAM, $ACTIVE, $CRITERION);
    builddict($dname, "data/$tartrain.wordpart", "$id/$next/$next.annot", "$id/$next/$next.dic",1);
    # Remove the bulky intermediate files of this round. Done with Perl's glob
    # (which expands the braces itself) because backticks run /bin/sh, where
    # brace expansion may not be available.
    my @leftovers = glob("$id/$curr/*{feat,idx,model,prob}");
    rmtree(\@leftovers) if @leftovers;
}
