use strict;
use List::Util qw(min max);

sub fullannot {
    my($ref,$ann,$words) = @_;
    foreach my $i (0 .. @$ref-1) {
        my @arr = split(/ /, $ref->[$i]);
        foreach my $j (0 .. $#arr) {
            $ann->{"$i-$j"}++;
        }
        last if (keys %$ann) >= $words;
    }
}

sub sentannot {
    my ($ref, $test, $ann, $wsprobs, $posprobs, $TYPE, $WORDS) = @_;
    my @allpoints;
    foreach my $i (0 .. @$ref-1) {
        my $str = $ref->[$i]; $str =~ s/\/\S*//g;
        my @rarr = split(/ /, $str);
        my $rstr = $str; $rstr =~ s/ //g;
        my $str = $test->[$i]; $str =~ s/\/\S*//g;
        my @tarr = split(/ /, $str);
        # make the test array
        my @point = (0);
        my $sentconf = 1;
        my $senttot = 0;
        # add the ws values
        foreach my $j (2 .. length($rstr)) {
            $sentconf *= shift(@$wsprobs);
            $senttot++;
        }
        for(@tarr) {
            $sentconf *= shift(@$posprobs);
            $senttot++;
        }
        die "no more scores (".scalar(@$wsprobs).",".scalar(@$posprobs).") at sentence $i: $rstr\n" if not $sentconf;
        if($TYPE eq "avg") {
            $sentconf = exp(log($sentconf)/$senttot);
        } elsif($TYPE ne "tot") {
            die "cannot handle criterion $TYPE\n";
        }
        my @point = ( $sentconf, map { "$i-$_" } ( 0 .. $#rarr ) );
        push @allpoints, \@point;
    }
    @allpoints = sort { $a->[0] <=> $b->[0] } @allpoints;
    while(keys(%$ann) < $WORDS) {
        my ($conf, @val) = @{shift(@allpoints)};
        $ann->{$_}++ for(@val);
    }
}

sub partannot {
    my ($ref, $test, $ann, $wsprobs, $posprobs, $WORDS) = @_;
    my @allpoints;
    foreach my $i (0 .. @$ref-1) {
        my $str = $ref->[$i]; $str =~ s/\/\S*//g;
        my @rarr = split(/ /, $str);
        my $str = $test->[$i]; $str =~ s/\/\S*//g;
        my @tarr = split(/ /, $str);
        # make the test array
        my @rxarr;
        foreach my $j (0 .. $#rarr) {
            push @rxarr, $j for(1 .. length($rarr[$j]));
        }
        # add the WS values
        foreach my $j (1 .. $#rxarr) {
            my $conf = shift @$wsprobs;
            my @point = ($conf, "$i-".$rxarr[$j]);
            push @point, "$i-".$rxarr[$j-1] if ($rxarr[$j] != $rxarr[$j-1]);
            push @allpoints, \@point;
        }
        # add the POS values
        my $xpos = 0;
        foreach my $j (0 .. $#tarr) {
            my %active;
            my $conf = shift @$posprobs;
            for(1 .. length($tarr[$j])) {
                $active{"$i-".$rxarr[$xpos++]}++;
            }
            my @point = ($conf, keys %active);
            push @allpoints, \@point;
        }
    }
    my @allpoints = sort { $a->[0] <=> $b->[0] } @allpoints;
    while(keys(%$ann) < $WORDS) {
        my ($conf, @val) = @{shift(@allpoints)};
        $ann->{$_}++ for(@val);
    }
}

sub wordprobcrit {
    my $ret = max(@_);
    die "Bad wordprob probability" if($ret == 0);
    return $ret;
}
sub margincrit {
    @_ = sort { $b <=> $a } @_;
    my $ret = $_[0]-$_[1];
    die "Bad margin probability" if not $_[0];
    return $ret;
}
sub entropycrit {
    my $ret = 0;
    $ret += $_*log($_) for @_;
    die "Bad entropy" if($ret == 0);
    return $ret;
}

sub findannot {
    my ($REF, $TEST, $WSPROB, $POSPROB, $PREVANN, $NEXTANN, $WORDS, $PROGRAM, $ACTIVE, $CRITERION) = @_;

    # load the reference and previous annotation, get the previous
    #  annotated words
    open FILE, "<:utf8", $REF or die "$REF: $!";
    my @ref = map { chomp; $_ } <FILE>;
    close FILE;
    open FILE, "<:utf8", $PREVANN or die "$PREVANN: $!";
    my %ann = map { chomp; $_ => 1 } <FILE>;
    close FILE;
     
    if($ACTIVE eq "full") {
        fullannot(\@ref, \%ann, $WORDS);
    }
    else {

        # test value 
        open FILE, "<:utf8", $TEST or die "$TEST: $!";
        my @test = map { chomp; $_ } <FILE>;
        close FILE;

        # get the criterion function, that converts a probability array to a 
        #  a confidence measure
        my $critfunc;
        if(($CRITERION eq "wordprob") or ($ACTIVE eq "sent")) {
            $critfunc = \&wordprobcrit;
        } elsif($CRITERION eq "margin") {
            $critfunc = \&margincrit;
        }elsif($CRITERION eq "entropy") {
            $critfunc = \&entropycrit;
        }

        # get the confidences for WS and POS, higher number = higher confidence
        open FILE, "<:utf8", $WSPROB or die "$WSPROB: $!";
        my (@wsprobs,@posprobs,@tmp);
        $_ = <FILE> if ($PROGRAM eq "liblinear");
        while(<FILE>) {
            chomp;
            next if not length($_);
            my ($lab, @arr) = split(/[ :]/);
            push @wsprobs, $critfunc->(@arr);
        }
        close FILE;
        open FILE, "<:utf8", $POSPROB or die "$POSPROB: $!";
        $_ = <FILE> if ($PROGRAM eq "liblinear");
        while(<FILE>) {
            chomp;
            next if not length($_);
            if($PROGRAM =~ /(liblinear|kytea|mecab|)/) {
                @tmp = split(/[: ]/); shift @tmp;
                push @posprobs, $critfunc->(@tmp);
            } elsif($PROGRAM eq "classias") {
                    if(/^([\+\-])(\d*):(.*)/) {
                        push @tmp, $3;
                    } elsif(/eoi/) {
                        push @posprobs, $critfunc->(@tmp);
                        @tmp = ();
                    } elsif(not /boi/) {
                        print STDERR "bad line in $PROGRAM file\n";
                    }
            } else {
                die "Could not handle POSs for program $PROGRAM\n";
            }
        }
        close FILE;

        # do confidences
        if ($ACTIVE =~ /^(part|dict)$/) {
            partannot(\@ref, \@test, \%ann, \@wsprobs, \@posprobs, $WORDS);
        }
        elsif ($ACTIVE eq "sent") {
            sentannot(\@ref, \@test, \%ann, \@wsprobs, \@posprobs, $CRITERION, $WORDS);
        }
        else {
            die "can't use active learning technique $ACTIVE yet";
        }
    }
    open FILE, ">:utf8", $NEXTANN or die "$NEXTANN: $!";
    for(keys %ann) {
        print FILE "$_\n";
    }
    close FILE;
}


1
