#!/usr/bin/perl

use strict;
use List::Util qw(min max shuffle sum);
use utf8;

# Names of the external training executables; they are run by name through
# the shell (backticks) in buildmodel.
my $TRAIN_LIBLINEAR = "ll-train";
my $TRAIN_KYTEA     = "train-kytea";
my $CRFSUITE        = "crfsuite";
my $CRFPP_LEARN     = "crf_learn";
my $CLASSIAS_TRAIN  = "classias-train";

# Feature-extraction settings.
#   $XWIND / $XGRAM  window width and maximum n-gram order for characters
#   $TWIND / $TGRAM  window width and maximum n-gram order for character types
#   $DGRAM           dictionary-word lengths are capped at this value when
#                    they are turned into feature names (see dictfeat)
#   $MAXCOL          number of feature columns written for CRF++
my $XWIND  = 2;
my $XGRAM  = 3;
my $TWIND  = 3;
my $TGRAM  = 3;
my $DGRAM  = 5;
my $MAXCOL = 100;

# Write an id table to $file, one "name<TAB>id" line per entry, ordered by
# ascending numeric id (entries with equal ids are ordered by name so the
# output is reproducible).
#   $file - path of the file to create or overwrite
#   $ids  - hash reference mapping names to numeric ids
# Dies if the file cannot be opened or cannot be completely written.
sub writeids {
    my ($file, $ids) = @_;
    # A lexical handle is used because a bareword handle is a package global
    # that any other sub using the same name would silently share.
    open my $out, ">:utf8", $file or die "$file: $!";
    my @names = sort { $ids->{$a} <=> $ids->{$b} or $a cmp $b } keys %$ids;
    foreach my $name (@names) {
        print {$out} "$name\t".$ids->{$name}."\n";
    }
    # Buffered write errors only become visible when the handle is closed.
    close $out or die "$file: $!";
}
# Read an id table written by writeids and return it as a hash (flat list
# of name => id pairs).
#   $IDIN - path of the table; a false value yields an empty hash
# Each line is split on the first space or tab into name and id.  Lines that
# do not provide both fields (for instance blank lines) are skipped.
# Dies if the file cannot be opened.
sub loadids {
    my ($IDIN) = @_;
    my %idx;
    return %idx if not $IDIN;
    open my $in, "<:utf8", $IDIN or die "$IDIN: $!";
    # A named line variable keeps the caller's $_ untouched; a bare
    # while(<FH>) assigns to the global $_ without localizing it.
    while (my $line = <$in>) {
        chomp $line;
        my ($name, $id) = split(/[ \t]/, $line);
        next if not defined $name or not defined $id;
        $idx{$name} = $id;
    }
    close $in;
    return %idx;
}

# Load a corpus and its optional word-level annotation.
#   $DATA  - corpus file, one sentence per line, tokens separated by a space
#   $ANNOT - annotation file, or a false value; every line holds
#            space-separated "SENTENCE-WORD" pairs of zero-based indices
#            naming the annotated words
# Returns two array references of equal length:
#   - the chomped corpus lines
#   - per sentence, a string with one flag character per token ("1" for an
#     annotated token, "0" otherwise)
# Without an annotation file every token is flagged "1".
# Annotation entries that are empty, malformed or point outside the corpus
# are ignored (a warning is printed for the latter two) so that the flag
# strings always stay aligned with the tokens.
# Dies if a file cannot be opened.
sub loadcorpus {
    my ($DATA, $ANNOT) = @_;
    my (@data, @annot);
    my $default = ($ANNOT ? 0 : 1);
    # Lexical handles: the bareword DATA would reuse Perl's special DATA
    # handle, and barewords are shared between all subs of the package.
    open my $corpus, "<:utf8", $DATA or die "$DATA: $!";
    while (my $line = <$corpus>) {
        chomp $line;
        push @data, $line;
        push @annot, [ map { $default } split(/ /, $line) ];
    }
    close $corpus;
    if ($ANNOT) {
        open my $marks, "<:utf8", $ANNOT or die "$ANNOT: $!";
        while (my $line = <$marks>) {
            chomp $line;
            foreach my $mark (split(/ /, $line)) {
                # doubled or leading spaces produce empty tokens
                next if $mark eq '';
                my ($sent, $word) = split(/-/, $mark);
                my $valid = (defined $sent and defined $word
                             and $sent =~ /^\d+$/ and $word =~ /^\d+$/
                             and $sent <= $#annot
                             and $word <= $#{ $annot[$sent] });
                if (not $valid) {
                    # storing it anyway would create holes that shift the
                    # flags of the following tokens
                    warn "$ANNOT: ignoring annotation '$mark' (malformed or out of range)\n";
                    next;
                }
                $annot[$sent]->[$word] = 1;
            }
        }
        close $marks;
    }
    my @newannot = map { join('', @$_) } @annot;
    return (\@data, \@newannot);
}

# Build a dictionary with the given coverage and write it out.
#   $ORIGINAL      - file with one general dictionary entry per line
#   $TAR_DATA      - target corpus (see loadcorpus)
#   $TAR_ANNOTATED - annotation file for the target corpus, or false
#   $outfile       - dictionary file to create; nothing is done if it exists
#   $coverage      - fraction of the annotated tokens that should remain
#                    covered after randomly dropping singleton entries
# Every annotated token of the target corpus is counted.  Randomly chosen
# entries seen exactly once are then removed, at most (1-$coverage) times the
# number of annotated tokens.  The remaining entries are merged with the
# lines of $ORIGINAL and written to $outfile in sorted order.
# Progress is reported on STDERR.  Dies if a file cannot be opened or the
# output cannot be completely written.
sub builddict {
    my ($ORIGINAL,$TAR_DATA,$TAR_ANNOTATED,$outfile,$coverage) = @_;
    return if -e $outfile;
    print STDERR "Making dictionary $outfile\n";
    my ($tdata,$tannot) = loadcorpus($TAR_DATA,$TAR_ANNOTATED);
    my $tot = 0;
    my %words;
    foreach my $i (0 .. $#$tannot) {
        my @myd = split(/ /, $tdata->[$i]);
        my @mya = split(//, $tannot->[$i]);
        foreach my $j (0 .. $#mya) {
            next if not $mya[$j];
            $tot += $mya[$j];
            $words{$myd[$j]} += $mya[$j];
        }
    }
    print STDERR " $tot words annotated in dictionary construction\n";
    # count and remove singletons until the desired coverage is obtained
    my @singletons = grep { $words{$_} == 1 } keys %words;
    print STDERR "DICT before words=".(scalar keys %words)." singletons=".(scalar @singletons);
    @singletons = shuffle(@singletons);
    # The small epsilon compensates for binary rounding: (1-0.9)*100 is
    # 9.999... and would otherwise be truncated to 9 instead of 10.
    my $ndrop = int((1-$coverage)*$tot + 1e-9);
    foreach my $k ( 0 .. min($ndrop-1, $#singletons) ) {
        delete $words{$singletons[$k]};
    }
    print STDERR ", after words=".(scalar keys %words);
    open my $in, "<:utf8", $ORIGINAL or die "$ORIGINAL: $!";
    while (my $line = <$in>) {
        chomp $line;
        $words{$line}++;
    }
    close $in;
    delete $words{""};
    print STDERR ", adding general words=".(scalar keys %words)."\n";
    open my $out, ">:utf8", $outfile or die "$outfile: $!";
    print {$out} "$_\n" for (sort keys %words);
    # buffered write errors only become visible at close
    close $out or die "$outfile: $!";
}

# Find dictionary words in the sentence and turn them into features.
#   $str    - the sentence as one string without separators
#   $dict   - hash reference; a substring counts as a word when its entry is
#             true
#   $maxlen - longest substring length to look up (undefined is treated as 0)
# Returns one array reference per character of $str.  For a dictionary word
# of length $j starting at character $i, with n = min($j, $DGRAM):
#   slot $i-1         receives "Rn" (not for a word at the string start)
#   slots $i..$i+$j-2 receive  "In"
#   slot $i+$j-1      receives "Ln" (not for a word at the string end)
# Each feature is emitted once per slot as "NAME|NAME".  The features of a
# slot are sorted by name so that the output does not depend on hash order.
sub dictfeat {
    my ($str, $dict, $maxlen) = @_;
    my $len = length($str);
    $maxlen = 0 if not defined $maxlen;
    my @feat = map { +{} } ( 1 .. $len );
    foreach my $i (0 .. $len-1) {
        foreach my $j (1 .. min($maxlen,$len-$i)) {
            next if not $dict->{ substr($str, $i, $j) };
            my $size = min($j,$DGRAM);
            if($i != 0) { $feat[$i-1]->{"R".$size}++; }
            foreach my $k ( 1 .. $j-1 ) { $feat[$i+$k-1]->{"I".$size}++; }
            if($i+$j != $len) { $feat[$i+$j-1]->{"L".$size}++; }
        }
    }
    return map { my @arr = map { "$_|$_" } sort keys %$_; \@arr } @feat;
}

# Map every character of a string to a one-letter character type:
#   R  ASCII or full-width Latin letter
#   N  ASCII or full-width digit
#   T  character of the Katakana block
#   H  character of the Hiragana block
#   K  character of the CJK Unified Ideographs block
#   O  anything else
# Takes the string as its only argument and returns the mapped copy.  The
# work is done on a private copy, so the caller's $_ is left untouched
# (assigning to the global $_ would also overwrite the current element of
# any enclosing "for" loop).
sub maptype {
    my ($str) = @_;
    $str =~ s/[a-zA-Zａ-ｚＡ-Ｚ]/R/g;
    $str =~ s/[0-9０-９]/N/g;
    $str =~ s/\p{InKatakana}/T/g;
    $str =~ s/\p{InHiragana}/H/g;
    $str =~ s/\p{InCJKUnifiedIdeographs}/K/g;
    # whatever has not been turned into one of the type letters is "other"
    $str =~ s/[^RNTHK]/O/g;
    return $str;
}

# Make n-gram features around a position of a string.
#   $str   - string the n-grams are taken from
#   $wind  - number of characters used on each side
#   $gram  - longest n-gram order
#   $left  - index of the last character of the left context
#   $right - index of the first character of the right context
#   $head  - prefix of the feature names
# The left and right contexts are padded with '_' to $wind characters each
# and joined.  Every n-gram of order 1..$gram that touches the junction is
# emitted as "<head><offset>,<order>|<n-gram>", where the offset is counted
# from the junction.  Features containing '_' are dropped.
sub ngramfeat {
    my ($str,$wind,$gram,$left,$right,$head) = @_;

    # left context, padded at the front
    my $lfrom  = max($left-$wind+1, 0);
    my $llen   = min($wind, $left+1);
    my $before = substr($str, $lfrom, $llen);
    $before = ('_' x ($wind - length($before))) . $before;

    # right context, padded at the back
    my $rlen  = min($wind, length($str)-$right);
    my $after = substr($str, $right, $rlen);
    $after .= '_' x ($wind - length($after));

    my $context = $before . $after;
    my @feats;
    for my $order (1 .. $gram) {
        for my $offset (0 .. length($context) - $order) {
            next unless $offset + $order >= $wind and $offset <= $wind;
            my $feat = $head . ($offset - $wind) . ",$order|" . substr($context, $offset, $order);
            push @feats, $feat unless $feat =~ /_/;
        }
    }
    return @feats;
}

# Build the word-segmentation training instances for a corpus.
#   $CORP_FILE  - corpus file (see loadcorpus); "/TAG" suffixes are stripped
#   $ANNOT_FILE - annotation file, or false
#   $dict       - hash reference of dictionary words
# Returns a list of strings.  Every labelled gap between two characters
# yields "<label> <feature> <feature> ..." where the label is 1 for a word
# boundary next to an annotated word and -1 for a gap inside an annotated
# word; all other gaps are left out.  After each processed sentence the
# string "EOS" is added.  Sentences whose annotation string is false are
# skipped entirely.
sub buildwsfeat {
    my ($CORP_FILE,$ANNOT_FILE,$dict) = @_;
    my ($sentences,$flags) = loadcorpus($CORP_FILE, $ANNOT_FILE);
    my $maxlen = max( map { length($_) } keys %$dict );
    my @featlist;
    for my $sid (0 .. $#$flags) {
        my $flagstr = $flags->[$sid];
        next unless $flagstr;
        # drop the tags, remember the words, then glue the characters
        (my $text = $sentences->[$sid]) =~ s/\/\S*//g;
        my @words = split(/ /, $text);
        $text =~ s/ //g;
        my @marked = split(//, $flagstr);
        # one label per gap; the gap after the final character is removed
        my @label;
        for my $w (0 .. $#words) {
            my $inside = $marked[$w] ? -1 : 0;
            push @label, ($inside) x (length($words[$w]) - 1);
            push @label, (($marked[$w] or $marked[$w+1]) ? 1 : 0);
        }
        pop @label;
        my $types = maptype($text);
        my @feats = dictfeat($text,$dict,$maxlen);
        for my $gap (0 .. $#label) {
            next if $label[$gap] == 0;
            push @{$feats[$gap]},
                ngramfeat($text,$XWIND,$XGRAM,$gap,$gap+1,"X"),
                ngramfeat($types,$TWIND,$TGRAM,$gap,$gap+1,"T");
            push @featlist, "$label[$gap] ".join(' ',@{$feats[$gap]});
        }
        push @featlist, "EOS";
    }
    return @featlist;
}

# Write the word-segmentation instances in sparse "label id:1 id:1 ..."
# form.
#   $CORP_FILE, $ANNOT_FILE, $dict - passed on to buildwsfeat
#   $IDIN    - prefix of an existing id table ("$IDIN-fidx") to extend, or
#              false to start with an empty table
#   $prefix, $abbrv - output goes to "$prefix.$abbrv-feat" (instances) and
#              "$prefix.$abbrv-fidx" (feature name to id table)
#   $addline - when true an empty line is written after every sentence
#   $sep     - separator placed between the label and the features
# Nothing is done when the feature file already exists.  Unknown features
# receive the next free id (table size + 1); the ids of one instance are
# written in ascending order.  Dies if the feature file cannot be opened or
# completely written.
sub buildllwsfeat {
    my ($CORP_FILE,$ANNOT_FILE,$IDIN,$dict,$prefix,$abbrv,$addline,$sep) = @_;
    my $featfile = "$prefix.$abbrv-feat";
    return if -e $featfile;
    # build the features
    my @featlist = buildwsfeat($CORP_FILE,$ANNOT_FILE,$dict);
    # get the feature names
    my %fidx = loadids($IDIN?"$IDIN-fidx":$IDIN);
    # parse the features into the required format
    open my $out, ">:utf8", $featfile or die "$featfile: $!";
    foreach my $line (@featlist) {
        if ($line =~ /^EOS$/) {
            print {$out} "\n" if $addline;
            next;
        }
        my ($wb, @feats) = split(/ /, $line);
        my @ids;
        foreach my $feat (@feats) {
            $fidx{$feat} = keys(%fidx)+1 if(not $fidx{$feat});
            push @ids, $fidx{$feat};
        }
        print {$out} join($sep,$wb,map { "$_:1" } sort { $a <=> $b } @ids)."\n";
    }
    # buffered write errors only become visible at close
    close $out or die "$featfile: $!";
    writeids("$prefix.$abbrv-fidx", \%fidx);
}

# Write the word-segmentation instances as a CRF++ column file plus the
# matching feature template.
#   $CORP_FILE, $ANNOT_FILE, $dict - passed on to buildwsfeat
#   $IDIN    - prefix of an existing column table ("$IDIN-cidx") to extend,
#              or false to start with an empty table
#   $prefix, $abbrv - output goes to "$prefix.$abbrv-feat" (data),
#              "$prefix.$abbrv-cidx" (feature name to column table) and
#              "$prefix.$abbrv-temp" (template)
# Nothing is done when the data file already exists.  Every row has $MAXCOL
# feature columns followed by the label; columns without a value hold "NUL"
# and sentences are separated by an empty line.
# Dies when a feature would need a column beyond $MAXCOL (the rows would no
# longer have equal width and the template would not cover the column), or
# when a file cannot be opened or completely written.
sub buildcrfppwsfeat {
    my ($CORP_FILE,$ANNOT_FILE,$IDIN,$dict,$prefix,$abbrv) = @_;
    my $featfile = "$prefix.$abbrv-feat";
    return if -e $featfile;
    # build the features
    my @featlist = buildwsfeat($CORP_FILE,$ANNOT_FILE,$dict);
    # get the column names
    my %cidx = loadids($IDIN?"$IDIN-cidx":$IDIN);
    # Lay the rows out in memory first: if the column check fails, no partial
    # data file is left behind to be mistaken for a finished one later.
    my @rows;
    foreach my $line (@featlist) {
        if ($line =~ /^EOS$/) { push @rows, ""; next; }
        my ($wb, @feats) = split(/ /, $line);
        my @outfeats = ("NUL") x $MAXCOL;
        foreach my $feat (@feats) {
            my ($name,$val) = split(/\|/, $feat);
            $cidx{$name} = keys(%cidx)+1 if(not $cidx{$name});
            die "$featfile: feature '$name' needs column $cidx{$name} but only $MAXCOL columns are available\n"
                if $cidx{$name} > $MAXCOL;
            $outfeats[$cidx{$name}-1] = $val;
        }
        push @rows, "@outfeats $wb";
    }
    open my $out, ">:utf8", $featfile or die "$featfile: $!";
    print {$out} "$_\n" for (@rows);
    close $out or die "$featfile: $!";
    writeids("$prefix.$abbrv-cidx", \%cidx);
    # the template declares one unigram feature per column and a bare bigram
    my $tempfile = "$prefix.$abbrv-temp";
    open my $temp, ">:utf8", $tempfile or die "$tempfile: $!\n";
    print {$temp} "# Unigram\n";
    print {$temp} "U$_:%x[0,$_]\n" for(0 .. $MAXCOL-1);
    print {$temp} "\n# Bigram\nB\n";
    close $temp or die "$tempfile: $!\n";
}

# Build the POS-tagging training instances for a corpus.
#   $CORP_FILE  - corpus file of "word/TAG" tokens (see loadcorpus)
#   $ANNOT_FILE - annotation file, or false
#   $dict       - hash reference mapping a word to an array reference of tags
# Returns a list of strings.  Every word of a processed sentence yields
# "<tag> <feature> <feature> ..." ("NONE" stands in for a missing tag) and
# each sentence is followed by the string "EOS".  The features are the word
# itself (W), its character types (Z), its dictionary tags (P...) and the
# character / character-type n-grams around the word.  Sentences whose
# annotation string is false are skipped.
# NOTE(review): the per-word annotation flags are not consulted here, so
# every word of a processed sentence becomes an instance - confirm that this
# is intended for partially annotated sentences.
sub buildposfeat {
    my ($CORP_FILE,$ANNOT_FILE,$dict) = @_;
    my ($sentences,$flags) = loadcorpus($CORP_FILE, $ANNOT_FILE);
    my @featlist;
    for my $sid (0 .. $#$flags) {
        next unless $flags->[$sid];
        my $line = $sentences->[$sid];
        # separate the words from their tags
        my (@words, @tags);
        for my $token (split(/ /, $line)) {
            my ($w, $t) = split(/\//, $token);
            push @words, $w;
            push @tags, ($t ? $t : "NONE");
        }
        # the sentence as a plain character string, without tags and spaces
        (my $text = $line) =~ s/\/\S*//g;
        $text =~ s/ //g;
        my $types = maptype($text);
        my $start = 0;
        for my $k (0 .. $#words) {
            my $word = $words[$k];
            my $end  = $start + length($word);
            my @feats = ( "W|$word", "Z|".maptype($word) );
            if ($dict->{$word}) {
                foreach my $tag (@{$dict->{$word}}) {
                    push @feats, "P$tag|P$tag";
                }
            }
            # context on both sides of the word: left of $start, from $end on
            push @feats,
                ngramfeat($text,$XWIND,$XGRAM,$start-1,$end,"X"),
                ngramfeat($types,$TWIND,$TGRAM,$start-1,$end,"T");
            push @featlist, "$tags[$k] @feats";
            $start = $end;
        }
        push @featlist, "EOS";
    }
    return @featlist;
}

# Write the POS-tagging instances as a CRF++ column file plus the matching
# feature template.
#   $CORP_FILE, $ANNOT_FILE, $dict - passed on to buildposfeat
#   $IDIN    - prefix of existing tables ("$IDIN-fidx", "$IDIN-cidx") to
#              start from, or false to start with empty tables
#   $prefix, $abbrv - output goes to "$prefix.$abbrv-feat" (data),
#              "$prefix.$abbrv-fidx" and "$prefix.$abbrv-cidx" (tables) and
#              "$prefix.$abbrv-temp" (template)
# Nothing is done when the data file already exists.  Every row has $MAXCOL
# feature columns followed by the tag; columns without a value hold "NUL"
# and sentences are separated by an empty line.  The fidx table is only
# loaded and written back unchanged.
# Dies when a feature would need a column beyond $MAXCOL (the rows would no
# longer have equal width and the template would not cover the column), or
# when a file cannot be opened or completely written.
sub buildcrfppposfeat {
    my ($CORP_FILE,$ANNOT_FILE,$IDIN,$dict,$prefix,$abbrv) = @_;
    my $featfile = "$prefix.$abbrv-feat";
    return if -e $featfile;
    # load the dictionaries
    my %fidx = loadids($IDIN?"$IDIN-fidx":0);
    my %cidx = loadids($IDIN?"$IDIN-cidx":0);
    # build the features
    my @featlist = buildposfeat($CORP_FILE,$ANNOT_FILE,$dict);
    # Lay the rows out in memory first: if the column check fails, no partial
    # data file is left behind to be mistaken for a finished one later.
    my @rows;
    foreach my $line (@featlist) {
        if ($line =~ /^EOS$/) { push @rows, ""; next; }
        my ($pos, @feats) = split(/ /, $line);
        my @outfeats = ("NUL") x $MAXCOL;
        foreach my $feat (@feats) {
            my ($name,$val) = split(/\|/, $feat);
            $cidx{$name} = keys(%cidx)+1 if(not $cidx{$name});
            die "$featfile: feature '$name' needs column $cidx{$name} but only $MAXCOL columns are available\n"
                if $cidx{$name} > $MAXCOL;
            $outfeats[$cidx{$name}-1] = $val;
        }
        push @rows, "@outfeats $pos";
    }
    open my $out, ">:utf8", $featfile or die "$featfile: $!";
    print {$out} "$_\n" for (@rows);
    close $out or die "$featfile: $!";
    writeids("$prefix.$abbrv-fidx", \%fidx);
    writeids("$prefix.$abbrv-cidx", \%cidx);
    # the template declares one unigram feature per column and a bare bigram
    my $tempfile = "$prefix.$abbrv-temp";
    open my $temp, ">:utf8", $tempfile or die "$tempfile: $!\n";
    print {$temp} "# Unigram\n";
    print {$temp} "U$_:%x[0,$_]\n" for(0 .. $MAXCOL-1);
    print {$temp} "\n# Bigram\nB\n";
    close $temp or die "$tempfile: $!\n";
}

# Write the POS-tagging instances in sparse "label id:1 id:1 ..." form.
#   $CORP_FILE, $ANNOT_FILE, $dict - passed on to buildposfeat
#   $IDIN    - prefix of existing id tables ("$IDIN-fidx", "$IDIN-pidx") to
#              extend, or false to start with empty tables
#   $prefix, $abbrv - output goes to "$prefix.$abbrv-feat" (instances),
#              "$prefix.$abbrv-fidx" (feature ids) and "$prefix.$abbrv-pidx"
#              (tag ids)
#   $addline - when true an empty line is written after every sentence
#   $sep     - separator placed between the label and the features
# Nothing is done when the feature file already exists.  Unknown tags and
# features receive the next free id (table size + 1); the feature ids of one
# instance are written in ascending order.  Dies if the feature file cannot
# be opened or completely written.
sub buildllposfeat {
    my ($CORP_FILE,$ANNOT_FILE,$IDIN,$dict,$prefix,$abbrv,$addline,$sep) = @_;
    my $featfile = "$prefix.$abbrv-feat";
    return if -e $featfile;
    # load the dictionaries
    my %fidx = loadids($IDIN?"$IDIN-fidx":0);
    my %pidx = loadids($IDIN?"$IDIN-pidx":0);
    # build the features
    my @featlist = buildposfeat($CORP_FILE,$ANNOT_FILE,$dict);
    # parse the features into the required format
    open my $out, ">:utf8", $featfile or die "$featfile: $!";
    foreach my $line (@featlist) {
        if ($line =~ /^EOS$/) {
            print {$out} "\n" if $addline;
            next;
        }
        my ($pos, @feats) = split(/ /, $line);
        $pidx{$pos} = keys(%pidx)+1 if($pos and not $pidx{$pos});
        my $label = $pidx{$pos};
        my @ids;
        foreach my $feat (@feats) {
            $fidx{$feat} = keys(%fidx)+1 if(not $fidx{$feat});
            push @ids, $fidx{$feat};
        }
        print {$out} join($sep,$label,map { "$_:1" } sort { $a <=> $b } @ids)."\n";
    }
    # buffered write errors only become visible at close
    close $out or die "$featfile: $!";
    writeids("$prefix.$abbrv-fidx", \%fidx);
    writeids("$prefix.$abbrv-pidx", \%pidx);
}

# Convert a corpus into partially annotated lines, one per sentence (they
# are written to the file passed to the KyTea trainer via -part).
#   $CORP_FILE  - corpus file of "word/TAG" tokens (see loadcorpus)
#   $ANNOT_FILE - annotation file, or false (then every word is annotated)
# Returns the list of converted lines.  The characters of a sentence are
# joined with one separator per gap:
#   "-"  gap inside an annotated word
#   "|"  word boundary next to at least one annotated word
#   " "  any other gap
# and "/TAG" is appended after the last character of every annotated word.
# A sentence without characters yields an empty line.
sub kyteapartial {
    my ($CORP_FILE,$ANNOT_FILE) = @_;
    my($data,$annot) = loadcorpus($CORP_FILE, $ANNOT_FILE);
    my @ret;
    foreach my $s (0 .. $#$data) {
        my (@myw,@myp);
        foreach my $token (split(/ /,$data->[$s])) {
            my ($w,$p) = split(/\//, $token);
            push @myw, $w;
            push @myp, $p;
        }
        my @mya = split(//, $annot->[$s]);
        # for every character, the index of the word it belongs to
        my @mybelong;
        foreach my $i (0 .. $#myw) {
            push @mybelong, ($i) x length($myw[$i]);
        }
        my @myx = split(//,join('', @myw));
        my $out = (@myx ? $myx[0] : '');
        foreach my $i (1 .. $#myx) {
            my ($p,$n) = ( $mybelong[$i-1], $mybelong[$i] );
            # close the previous word with its tag when it is annotated
            $out .= "/$myp[$p]" if($p != $n and $mya[$p]);
            if(($mya[$p] or $mya[$n])) {
                $out .= ($p==$n)?"-":"|";
            } else {
                $out .= " ";
            }
            $out .= $myx[$i];
        }
        $out .= "/$myp[-1]" if($mya[-1]);
        push @ret, $out;
    }
    return @ret;
}

# Read a dictionary file with one "word/TAG" entry per line.
#   $TAR_DICT - path of the dictionary file
# Returns a hash (flat list) mapping every word to an array reference of its
# tags in file order; an entry without a "/" contributes an undefined tag.
# Lines without a word (for instance blank lines) are skipped so that no
# empty-string entry is created.
# Dies if the file cannot be opened.
sub readdict {
    my ($TAR_DICT) = (@_);
    open my $in, "<:utf8", $TAR_DICT or die "$TAR_DICT: $!";
    my %dict;
    # a named line variable keeps the caller's $_ untouched
    while (my $line = <$in>) {
        chomp $line;
        my ($w,$p) = split(/\//, $line);
        next if not defined $w or $w eq '';
        push @{$dict{$w}}, $p;
    }
    close $in;
    return %dict;
}

# Build a model for mecab.
#   $GEN_DATA   - general corpus (treated as fully annotated)
#   $TAR_DATA   - target corpus
#   $TAR_ANNOT  - annotation file for the target corpus, or false
#   $dict       - hash reference mapping a word to an array reference of tags
#   $MODEL_DIR  - model directory, used relative to the current directory
#   $HYPERPARAM - value passed to mecab-cost-train as -c
# Steps: copy ./mecab-seed to $MODEL_DIR/seed, write the dictionary CSV and
# the training corpus (only sentences in which every word is annotated),
# create rewrite.def with script/lexicalizetopn.pl, then run
# mecab-dict-index, mecab-cost-train, mecab-dict-gen and mecab-dict-index
# again for $MODEL_DIR/final.  The mecab commands are echoed to STDERR;
# their exit status is not checked.
# Dies if the dictionary or corpus file cannot be opened or completely
# written.
sub buildmecab {
    my ($GEN_DATA, $TAR_DATA, $TAR_ANNOT, $dict, $MODEL_DIR, $HYPERPARAM) = @_;

    # copy the definition files
    my $wd = `pwd`; chomp $wd;
    `cp -r $wd/mecab-seed $MODEL_DIR/seed`;

    # make the dictionary
    my $dicfile = "$MODEL_DIR/seed/mecab-dic.csv";
    open my $dic, ">:utf8", $dicfile or die "$dicfile: $!\n";
    foreach my $w (sort keys %$dict) {
        foreach my $p (@{$dict->{$w}}) {
            print {$dic} "$w,0,0,0,$p,$w\n";
        }
    }
    close $dic or die "$dicfile: $!\n";

    # make the corpus
    my ($gdata,$gannot) = loadcorpus($GEN_DATA, 0);
    my ($tdata,$tannot) = loadcorpus($TAR_DATA, $TAR_ANNOT);
    my @data = ( @$gdata, @$tdata );
    my @annot = ( @$gannot, @$tannot );
    my $corpfile = "$MODEL_DIR/seed/corpus";
    open my $corp, ">:utf8", $corpfile or die "$corpfile: $!\n";
    foreach my $s (0 .. $#data) {
        my @myd = split(/ /,$data[$s]);
        my @mya = split(//,$annot[$s]);
        # only sentences in which every word is annotated are usable
        next if sum(@mya) != @myd;
        foreach my $str (@myd) {
            my($w,$p) = split(/\//, $str);
            print {$corp} "$w\t$p,$w\n";
        }
        print {$corp} "EOS\n";
    }
    close $corp or die "$corpfile: $!\n";

    # lexicalize model
    `script/lexicalizetopn.pl 5000 < $MODEL_DIR/seed/corpus > $MODEL_DIR/seed/rewrite.def`;

    # echo a command to STDERR, then run it through the shell
    my $run = sub {
        my ($cmd) = @_;
        print STDERR "$cmd\n";
        `$cmd`;
        return;
    };

    # build the dictionary and the model
    $run->("cd $wd/$MODEL_DIR/seed; mecab-dict-index -t utf-8 -f utf-8");
    # "> file 2>&1" instead of "&>": backticks run /bin/sh, and a shell that
    # does not know the bash-only "&>" starts the command in the background
    # and merely truncates the log file
    $run->("cd $wd/$MODEL_DIR/seed; mecab-cost-train -c $HYPERPARAM corpus model > log.txt 2>&1");
    `mkdir $MODEL_DIR/final` if not -e "$MODEL_DIR/final";
    $run->("cd $wd/$MODEL_DIR/seed; mecab-dict-gen -o ../final -m model");
    $run->("cd $wd/$MODEL_DIR/final; mecab-dict-index -t utf-8 -f utf-8");
    return;
}

# implements buildmodel(GEN_DATA, TAR_DATA, TAR_ANNOT, TAR_DICT, MODEL_DIR, PROGRAM, SOLVER, HYPERPARAM)
#   $GEN_DATA   - general corpus (treated as fully annotated)
#   $TAR_DATA   - target corpus
#   $TAR_ANNOT  - annotation file for the target corpus, or false
#   $TAR_DICT   - dictionary file (see readdict); must exist
#   $MODEL_DIR  - directory receiving the model files; created when missing
#   $PROGRAM    - "crfpp", "liblinear", "classias", "crfsuite", "kytea" or
#                 "mecab"
#   $SOLVER     - solver name; the accepted names depend on $PROGRAM (not
#                 used for mecab)
#   $HYPERPARAM - regularisation value handed to the trainer
# For kytea a partial-annotation corpus is written and train-kytea is run;
# for mecab the work is delegated to buildmecab.  For the other programs
# word-segmentation and POS features are generated for both corpora,
# concatenated into $MODEL_DIR and the external trainer is run once per
# task with its output redirected to a log file.  Commands are echoed to
# STDERR; their exit status is not checked.
# Dies on a missing dictionary, an unknown program or an unknown solver
# (checked before any training data is produced).
sub buildmodel {
    my ($GEN_DATA, $TAR_DATA, $TAR_ANNOT, $TAR_DICT, $MODEL_DIR, $PROGRAM, $SOLVER, $HYPERPARAM) = @_;

    # make the directories
    if (not -e $MODEL_DIR) {
        # loaded here because nothing in this file imports File::Path
        require File::Path;
        File::Path::mkpath($MODEL_DIR) or die "$MODEL_DIR: $!";
    }

    # make and load the dictionary
    my $wd = `pwd`; chomp $wd;
    die "Dictionary $TAR_DICT not found\n" if(not -e $TAR_DICT);
    my %dict = readdict($TAR_DICT);

    # mecab takes no solver and has its own pipeline
    if ($PROGRAM eq "mecab") {
        buildmecab($GEN_DATA, $TAR_DATA, $TAR_ANNOT, \%dict, $MODEL_DIR, $HYPERPARAM);
        return;
    }

    # solver names accepted by each program and their command-line values
    my %solvers_of = (
        "crfpp"     => { "lbfgs"=>"CRF", "mira"=>"MIRA" },
        "liblinear" => { "lrprimal"=>0, "lrdual"=>7, "svm"=>1 },
        "classias"  => { "lbfgs"=>"lbfgs.logistic", "pegasos"=>"pegasos.logistic" },
        "crfsuite"  => { "lbfgs"=>"lbfgs", "sgd"=>"l2sgd" },
        "kytea"     => { "lrprimal"=>0, "lrdual"=>7, "svm"=>1 },
    );
    die "bad program $PROGRAM" if(not exists $solvers_of{$PROGRAM});
    my %solvers = %{ $solvers_of{$PROGRAM} };
    # validated before the solver is interpolated into any command line
    die "bad solver $SOLVER" if(not exists $solvers{$SOLVER});

    # per-program settings
    my ($abbrv,$options,$wsfunc,$posfunc,$addspace,$sep);
    if ($PROGRAM eq "crfpp") {
        $abbrv = "crfpp";
        $options = "-a $solvers{$SOLVER} -c $HYPERPARAM";
        $wsfunc = \&buildcrfppwsfeat;
        $posfunc = \&buildcrfppposfeat;
    } elsif ($PROGRAM eq "liblinear") {
        $abbrv = "ll";
        $options = "-s $solvers{$SOLVER} -c $HYPERPARAM -B 1";
        $options .= " -e 0.0001" if $SOLVER eq "lrprimal";
        $wsfunc = \&buildllwsfeat;
        $posfunc = \&buildllposfeat;
        $sep = " ";
    } elsif ($PROGRAM eq "classias") {
        $abbrv = "cls";
        $options = "-a $solvers{$SOLVER} -p c2=$HYPERPARAM -p delta=1e-4 -p epsilon=1e-4 -b 1 -p max_iterations=100";
        $wsfunc = \&buildllwsfeat;
        $posfunc = \&buildllposfeat;
        $sep = " ";
    } elsif ($PROGRAM eq "crfsuite") {
        $abbrv = "suite";
        $options = "-p algorithm=$solvers{$SOLVER} -p epsilon=0.0001 -p delta=0.0001 -p c2=$HYPERPARAM";
        $wsfunc = \&buildllwsfeat;
        $posfunc = \&buildllposfeat;
        $addspace = 1;
        $sep = "\t";
    } else {
        # kytea: write the partial corpus, train and stop
        $abbrv = "kytea";
        $options = "-solver $solvers{$SOLVER} -cost $HYPERPARAM";
        print STDERR "Building KyTea partial corpus\n";
        my @part = ( kyteapartial($GEN_DATA,0), kyteapartial($TAR_DATA,$TAR_ANNOT) );
        my $partfile = "$MODEL_DIR/train.part";
        open my $out, ">:utf8", $partfile or die "$partfile: $!";
        print {$out} "$_\n" for (@part);
        close $out or die "$partfile: $!";
        my $cmd = "$TRAIN_KYTEA $options -charn $XGRAM -charw $XWIND -typen $TGRAM -typew $TWIND -part $partfile -dict $TAR_DICT -model $MODEL_DIR/kytea.mod";
        print STDERR "$cmd\n";
        `$cmd`;
        return;
    }

    # NOTE(review): the dot is unescaped, so any single character followed by
    # "wordpart" is removed; kept as is because existing file names depend on it
    my $gen_pref = $GEN_DATA; $gen_pref =~ s/.wordpart//g;
    my $tar_pref = $TAR_DATA; $tar_pref =~ s/.wordpart//g;

    # the two tasks: file stem, label for messages, feature writer and the
    # task-type switch given to classias-train
    my @tasks = (
        [ "ws",  "WS",  $wsfunc,  "-tb" ],
        [ "pos", "POS", $posfunc, "-tn" ],
    );
    foreach my $task (@tasks) {
        my ($stem, $label, $featfunc, $clstype) = @$task;

        # build the features; the target corpus extends the id tables that
        # were created for the general corpus
        print STDERR "Building $label features\n";
        $featfunc->($GEN_DATA,0,0,\%dict,"$gen_pref-$stem",$abbrv,$addspace,$sep);
        $featfunc->($TAR_DATA,$TAR_ANNOT,"$gen_pref-$stem.$abbrv",\%dict,"$tar_pref-$stem",$abbrv,$addspace,$sep);
        `cat $gen_pref-$stem.$abbrv-feat $tar_pref-$stem.$abbrv-feat > $MODEL_DIR/$stem-mod.$abbrv-feat`;
        foreach my $f ( qw(fidx pidx cidx temp) ) {
            `ln -s $wd/$tar_pref-$stem.$abbrv-$f $MODEL_DIR/$stem-mod.$abbrv-$f` if -e "$wd/$tar_pref-$stem.$abbrv-$f";
        }

        # train
        print STDERR "Training $label model\n";
        my $model = "$MODEL_DIR/$stem-mod";
        my $cmd;
        if($PROGRAM eq "liblinear") {
            $cmd = "$TRAIN_LIBLINEAR $options $model.ll-feat $model.mod";
        } elsif($PROGRAM eq "crfpp") {
            $cmd = "$CRFPP_LEARN $options $model.crfpp-temp $model.crfpp-feat $model.mod";
        } elsif($PROGRAM eq "classias") {
            $cmd = "$CLASSIAS_TRAIN $options $clstype -m $model.mod $model.$abbrv-feat";
        } elsif($PROGRAM eq "crfsuite") {
            $cmd = "$CRFSUITE learn $options -m $model.mod $model.$abbrv-feat";
        }
        # "> file 2>&1" instead of "&>": backticks run /bin/sh, and a shell
        # that does not know the bash-only "&>" starts the trainer in the
        # background and merely truncates the log file
        $cmd .= " > $model.log 2>&1";
        # the echoed text and the executed command are the same string
        print STDERR "$cmd\n";
        `$cmd`;
    }
    return;
}

# The file ends with a true value, which Perl requires of a file that is
# loaded with "require".
1
