
# require "build-model.pl"
use strict;
use utf8;

# program paths
my $RUN_LIBLINEAR = "ll-predict";
my $RUN_MECAB = "mecab";
my $RUN_CRFSUITE = "crfsuite";
my $RUN_CRFPP = "crf_test";
my $RUN_CLASSIAS = "classias-tag";
my $RUN_KYTEA = "kytea";

my $MAXCOL=100;

# implements analyze(MODEL_DIR, DICT, IN_DATA, OUT_PREFIX, PROGRAM)
sub analyze {
    my ($MODEL_DIR, $DICT, $IN_DATA, $OUT_PREFIX, $PROGRAM) = @_;
    my (@data,@annot);
    my %dict = readdict($DICT);

    # analyze using KyTea
    if ($PROGRAM eq "kytea") {
        print STDERR "$RUN_KYTEA -out conf -deftag \"名詞\" -model $MODEL_DIR/kytea.mod < $IN_DATA > $OUT_PREFIX.wordprob\n";
        `$RUN_KYTEA -out conf -deftag "名詞" -model $MODEL_DIR/kytea.mod < $IN_DATA > $OUT_PREFIX.wordprob`;
        my($a,$b,$c,$d);
        open INFILE, "<:utf8", "$OUT_PREFIX.wordprob" or die "$OUT_PREFIX.wordprob: $!";
        open WPFILE, ">:utf8", "$OUT_PREFIX.wordpart" or die "$OUT_PREFIX.wordpart: $!";
        open WSFILE, ">:utf8", "$OUT_PREFIX-ws.prob"  or die "$OUT_PREFIX-ws.prob: $!";
        open POSFILE, ">:utf8","$OUT_PREFIX-pos.prob" or die "$OUT_PREFIX-pos.prob: $!";
        while($a = <INFILE> and $b = <INFILE> and $c = <INFILE> and $d = <INFILE>) {
            chomp $a; $a =~ s/&\S*//g; print WPFILE "$a\n";
            chomp $b; for(split(/ /,$b)) { print WSFILE "1 $_\n"; }
            chomp $c; for(split(/ /,$c)) { s/&/ /g; $_ = 1 if(not $_); print POSFILE "1 $_\n"; }
        }
        close INFILE; close WPFILE; close WSFILE; close POSFILE;
        return;
    }

    # analyze using MeCab
    if ($PROGRAM eq "mecab") {
        print STDERR "$RUN_MECAB -l2 -F\"%m %H %pP\\n\" -E\"EOS\\n\" -d $MODEL_DIR/final < $IN_DATA > $OUT_PREFIX.wordprob\n";
        `$RUN_MECAB -l2 -F"%m %H %pP\\n" -E"EOS\\n" -d $MODEL_DIR/final < $IN_DATA > $OUT_PREFIX.wordprob`;
        my($a,$b,$c,$d);
        open INFILE, "<:utf8", "$OUT_PREFIX.wordprob" or die "$OUT_PREFIX.wordprob: $!";
        open WPFILE, ">:utf8", "$OUT_PREFIX.wordpart" or die "$OUT_PREFIX.wordpart: $!";
        open WSFILE, ">:utf8", "$OUT_PREFIX-ws.prob"  or die "$OUT_PREFIX-ws.prob: $!";
        open POSFILE, ">:utf8","$OUT_PREFIX-pos.prob" or die "$OUT_PREFIX-pos.prob: $!";
        my (@wps,$len);
        while(<INFILE>) {
            chomp; my ($word,$pos,$prob) = split(/ /);
            if($word eq "EOS") {
                print WPFILE "@wps\n";
                print WSFILE "1 1\n" while(--$len > 0);
                $len = 0; @wps = ();
            } else {
                $len += length($word);
                print POSFILE "$pos $prob\n";
                $pos =~ s/,[^ ]*//g;
                push @wps, "$word/$pos";
            }
        }
        close INFILE; close WPFILE; close WSFILE; close POSFILE;
        return;
    }
    
    # analyze using other programs
    my ($abbrv,$wsfunc,$posfunc,$usecol,$addline,%posarr,$sep);
    if ($PROGRAM eq "crfpp") { 
        $abbrv = "crfpp";
        $wsfunc = \&buildcrfppwsfeat;
        $posfunc = \&buildcrfppposfeat;
        $usecol = $MAXCOL+1;
    } else {
        $sep = " ";
        if($PROGRAM eq "liblinear") {
            $abbrv = "ll";
        } elsif ($PROGRAM eq "classias") {
            $abbrv = "cls";
        } elsif ($PROGRAM eq "crfsuite") {
            $abbrv = "suite";
            $addline = 1;
            $sep = "\t";
        } else {
            die "Cannot handle program $PROGRAM in analyze yet";
        }
        $wsfunc = \&buildllwsfeat;
        $posfunc = \&buildllposfeat;
        $usecol = 0;
        # load the POS ids 
        open FILE, "<:utf8", "$MODEL_DIR/pos-mod.$abbrv-pidx" or die $!;
        while(<FILE>) { chomp; my($w,$p)=split(/[ \t]/); $posarr{$p} = $w; }
        close FILE;
    }

    # build WS features
    print STDERR "Building WS features\n";
    $wsfunc->($IN_DATA,0,"$MODEL_DIR/ws-mod.$abbrv",\%dict,"$OUT_PREFIX-ws",$abbrv,$addline,$sep);
    # train WS
    print STDERR "Using WS model\n";
    if($PROGRAM eq "liblinear") {
        print STDERR "$RUN_LIBLINEAR -b 1 $OUT_PREFIX-ws.$abbrv-feat $MODEL_DIR/ws-mod.mod $OUT_PREFIX-ws.prob\n";
        `$RUN_LIBLINEAR -b 1 $OUT_PREFIX-ws.$abbrv-feat $MODEL_DIR/ws-mod.mod $OUT_PREFIX-ws.prob`;
    } elsif($PROGRAM eq "crfpp") {
        print STDERR "$RUN_CRFPP -v 2 -m $MODEL_DIR/ws-mod.mod $OUT_PREFIX-ws.$abbrv-feat > $OUT_PREFIX-ws.prob\n";
        `$RUN_CRFPP -v 2 -m $MODEL_DIR/ws-mod.mod $OUT_PREFIX-ws.$abbrv-feat > $OUT_PREFIX-ws.prob`;
    } elsif($PROGRAM eq "classias") {
        print STDERR "$RUN_CLASSIAS -ap -m $MODEL_DIR/ws-mod.mod < $OUT_PREFIX-ws.$abbrv-feat > $OUT_PREFIX-ws.prob\n";
        `$RUN_CLASSIAS -ap -m $MODEL_DIR/ws-mod.mod < $OUT_PREFIX-ws.$abbrv-feat > $OUT_PREFIX-ws.prob`;
    } elsif($PROGRAM eq "crfsuite") {
        print STDERR "$RUN_CRFSUITE tag -i -m $MODEL_DIR/ws-mod.mod < $OUT_PREFIX-ws.$abbrv-feat > $OUT_PREFIX-ws.prob\n";
        `$RUN_CRFSUITE tag -i -m $MODEL_DIR/ws-mod.mod < $OUT_PREFIX-ws.$abbrv-feat > $OUT_PREFIX-ws.prob`;
    }
    
    print STDERR "Splitting words\n";

    # split words
    open FILE, "<:utf8", "$OUT_PREFIX-ws.prob" or die $!;
    open OUTFILE, ">:utf8", "$OUT_PREFIX.word" or die $!;
    my ($data, $annot) = loadcorpus($IN_DATA,0);
    my (@newdata,@newannot);
    $_ = <FILE> if ($PROGRAM eq "liblinear");
    for(@$data) {
        my @myd = split(//);
        my $curr = shift(@myd);
        my $ann = "1";
        my @words;
        for(@myd) {
            my $val = "";
            while($val =~ /^#/ or (length($val) == 0)) { 
                $val = <FILE>; 
                die "No value in file $OUT_PREFIX-ws.prob\n" if not $val;
                chomp $val;
            }
            my @va = split(/[ \t:]+/,$val); $val = $va[$usecol]; $val =~ s/\/\S*//g;
            # print "USING WS $val out of @va\n";
            if($val > 0) { push @words, $curr; $ann .= 1; $curr = ""; }
            $curr .= $_;
        }
        push @words, $curr;
        push @newdata, "@words";
        print OUTFILE "@words\n";
    }
    close FILE;
    close OUTFILE;

    # build POS features
    print STDERR "Building POS features\n";
    $posfunc->("$OUT_PREFIX.word",0,"$MODEL_DIR/pos-mod.$abbrv",\%dict,"$OUT_PREFIX-pos",$abbrv,$addline,$sep);
    # train POS
    print STDERR "Using POS model\n";
    if($PROGRAM eq "liblinear") {
        print STDERR "$RUN_LIBLINEAR -b 1 $OUT_PREFIX-pos.$abbrv-feat $MODEL_DIR/pos-mod.mod $OUT_PREFIX-pos.prob\n";
        `$RUN_LIBLINEAR -b 1 $OUT_PREFIX-pos.$abbrv-feat $MODEL_DIR/pos-mod.mod $OUT_PREFIX-pos.prob`;
    } elsif($PROGRAM eq "crfpp") {
        print STDERR "$RUN_CRFPP -v 2 -m $MODEL_DIR/pos-mod.mod $OUT_PREFIX-pos.$abbrv-feat > $OUT_PREFIX-pos.prob\n";
        `$RUN_CRFPP -v 2 -m $MODEL_DIR/pos-mod.mod $OUT_PREFIX-pos.$abbrv-feat > $OUT_PREFIX-pos.prob`;
    } elsif($PROGRAM eq "classias") {
        print STDERR "$RUN_CLASSIAS -ap -m $MODEL_DIR/pos-mod.mod < $OUT_PREFIX-pos.$abbrv-feat > $OUT_PREFIX-pos.prob\n";
        `$RUN_CLASSIAS -ap -m $MODEL_DIR/pos-mod.mod < $OUT_PREFIX-pos.$abbrv-feat > $OUT_PREFIX-pos.prob`;
    } elsif($PROGRAM eq "crfsuite") {
        print STDERR "$RUN_CRFSUITE tag -i -m $MODEL_DIR/pos-mod.mod < $OUT_PREFIX-pos.$abbrv-feat > $OUT_PREFIX-pos.prob\n";
        `$RUN_CRFSUITE tag -i -m $MODEL_DIR/pos-mod.mod < $OUT_PREFIX-pos.$abbrv-feat > $OUT_PREFIX-pos.prob`;
    }

    # annotate POS
    open FILE, "<:utf8", "$OUT_PREFIX-pos.prob" or die $!;
    open OUTFILE, ">:utf8", "$OUT_PREFIX.wordpart" or die $!;
    $_ = <FILE> if ($PROGRAM eq "liblinear");
    foreach my $j (0 .. $#newdata) {
        my @myd = split(/ /, $newdata[$j]);
        foreach my $i (0 .. $#myd) {
            my $val = "";
            if($PROGRAM eq "classias") {
                while(<FILE>) {
                    # print "$_, val=$val\n";
                    if(/^([\+\-])(\d*):/) {
                        $val = $2 if $1 eq "+";
                    } elsif(/eoi/) {
                        last;
                    } elsif(not /boi/) {
                        print STDERR "bad line in $PROGRAM file\n";
                    }
                }
            } else {
                while($val =~ /^#/ or (length($val) == 0)) { $val = <FILE>; chomp $val; }
                my @va = split(/[ \t:]+/, $val); $val = $va[$usecol]; $val =~ s/\/\S*//g;
            }
            die "empty POS at $myd[$i] in @myd" if not length($val);
            # print "USING POS $val -> $posarr{$val}\n"; 
            $myd[$i] .= "/".((exists $posarr{$val})?$posarr{$val}:$val);
        }
        $newdata[$j] = "@myd";
        print OUTFILE "@myd\n";
    }
    close FILE;
    close OUTFILE;

}

1
