###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

use Getopt::Std;

# Command-line options:
#   -c N  minimum total count a condition needs before its distribution is
#         printed (defaults to 0, i.e. print everything)
#   -u N  conditions and targets whose total count is <= N are folded into
#         the unknown token (defaults to 0, i.e. no folding)
#   -d    delete the rare entries after they have been folded
#   -l    spell the unknown token "unk" instead of "UNK"
getopts('u:c:dl');

# Give the numeric thresholds a defined value so that they print and compare
# cleanly when the corresponding switch is omitted.
$mincount  = defined $opt_c ? $opt_c : 0;
$unknowns  = defined $opt_u ? $opt_u : 0;
$delete    = $opt_d;
$lowercase = $opt_l;
$unk       = $lowercase ? "unk" : "UNK";

# STDERR is the predefined handle; the lowercase spelling only works through
# a historical special case in package main.
print STDERR "mincount = $mincount\n";

# Read events from the files named on the command line (or standard input)
# and accumulate them into the global tables %Cond, %Targ and %Val.
while ( my $line = <> ) {
  chomp $line;
  tally_event($line);
}

# Add the event described by one input line to the count tables.
#
# Two line formats are recognised:
#   "COND : TARG = COUNT"  adds COUNT (an integer or decimal number)
#   "COND : TARG"          adds 1
# Lines without a " : " separator are ignored.
#
# $Cond{COND} accumulates the total per condition, $Targ{TARG} the total per
# target and $Val{COND}{TARG} the total per pair.
sub tally_event {
  my ($line) = @_;

  # The count must contain at least one digit.  A pattern that also accepts
  # an empty count would take an uncounted line such as "X : a = b" for the
  # target "a" with count 0, and would truncate "0.5" to 0.
  if ( $line =~ /^(.*) : (.*) = ([0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)/ ) {
    my ($cond, $targ, $count) = ($1, $2, $3);
    $Cond{$cond}       += $count;
    $Targ{$targ}       += $count;
    $Val{$cond}{$targ} += $count;
  }
  elsif ( $line =~ /^(.*) : (.*)/ ) {
    my ($cond, $targ) = ($1, $2);
    $Cond{$cond}++;
    $Targ{$targ}++;
    $Val{$cond}{$targ}++;
  }
  return;
}

# For backoff, treat low-frequency words as a model of the unknown word.
# This does not work in normalization mode, only in count-then-normalize mode.
fold_rare_events_into_unknown() if $unknowns;

# Fold rare events into unknown-word buckets.  Works on the globals %Cond,
# %Targ and %Val and is controlled by $unknowns, $unk and $delete.
#
# Pass 1: every condition whose total is <= $unknowns adds one count per
#   distinct target to the bucket condition "TYPE $unk", where TYPE is the
#   text before the first space of the condition.
# Pass 2: every target whose total is <= $unknowns adds one count to the
#   $unk target of each condition it occurs with, and one count to
#   $Targ{$unk}.
# When $delete is set, the rare conditions and targets are removed after
# they have been folded.
#
# An entry that already is an unknown bucket is never folded into itself:
# doing so (and deleting it under $delete) would make the result depend on
# the order in which the hash keys happen to be visited.
sub fold_rare_events_into_unknown {

  # Pass 1: rare conditions.
  foreach my $c (keys %Cond) {
    next if $Cond{$c} > $unknowns;
    my ($type) = split( / /, $c );
    my $bucket = $type." $unk";
    next if $c eq $bucket;
    foreach my $v (keys %{$Val{$c}}) {
      $Val{$bucket}{$v}++;
      $Cond{$bucket}++;
    }
    if ($delete) {
      delete $Cond{$c};
      delete $Val{$c};
    }
  }

  # Pass 2: rare targets.  Decide up front which targets are rare, then walk
  # only the (condition, target) pairs that exist instead of probing every
  # condition for every rare target.
  my %rare;
  foreach my $v (keys %Targ) {
    $rare{$v} = 1 if $v ne $unk && $Targ{$v} <= $unknowns;
  }
  foreach my $c (keys %Cond) {
    my $row = $Val{$c} or next;
    foreach my $v (grep { $rare{$_} } keys %$row) {
      if ($row->{$v}) {
        $row->{$unk}++;
        $Targ{$unk}++;
      }
      delete $row->{$v} if $delete;
    }
  }
  delete @Targ{keys %rare} if $delete;

  return;
}


# Regular print-out.
print_model();

# Print the relative frequency $Val{COND}{TARG} / $Cond{COND} of every pair
# as "COND : TARG = %.8f" on the currently selected output handle.
#
# Conditions whose total is below $mincount are not printed; a note goes to
# STDERR instead.  Conditions with a total of 0 are skipped as well, since
# their relative frequencies are undefined (0/0).  Keys are visited in sorted
# order so that the output is reproducible from run to run.
sub print_model {
  my $threshold = defined $mincount ? $mincount : 0;

  foreach my $c (sort keys %Cond) {
    my $total = $Cond{$c};

    if ( $total < $threshold ) {
      print STDERR " P(.|$c); fewer than $threshold examples were found.  not printing.\n";
    }
    elsif ( $total == 0 ) {
      print STDERR " P(.|$c); total count is zero.  not printing.\n";
    }
    else {
      foreach my $v (sort keys %{$Val{$c}}) {
        # The keys are passed as arguments rather than interpolated into the
        # format: tokens taken from text may contain '%'.
        printf( "%s : %s = %.8f\n", $c, $v, $Val{$c}{$v}/$total );
      }
    }
  }
  return;
}

