###################################################
#
# rlnclust-EM.pl
#
#  does E-M on for referents and relations to reduce dimensionality
#  USAGE: cat genmodel/testrlnclust.freqdat | perl scripts/rlnclust-EM.pl
#
#  OPTIONS:
#    -e <number of referents>
#    -o <output log file>
#    -i <max # iterations>
#    -p <frequency of log/stderr printing>
#    -r <seed for random number generation in debugging>
#    -v = verbose mode
#
###################################################
use strict;
use Getopt::Std;

getopts('c:e:o:i:p:vr:'); # will print out M instead of GG
my $cutoff = 0; #1.000001;
my $VERBOSE = 0;
my $MAX_ITER   = 1000; #for E-M finishing
my $PRINT_ITER = 100;
my $domEsize = 10;
my $seed = -99;
our($opt_c,$opt_e,$opt_o,$opt_i,$opt_p,$opt_v,$opt_r);

########### OPTIONS
if ($opt_e ne "") {
    if ($cutoff!=0) { $cutoff = 1+$opt_c; }
}
if ($opt_e ne "") {
	$domEsize = $opt_e;
}
if ($opt_o ne "") {
    my $filename = $opt_o.".log";
    open( LOG, '>', $filename) or die $!;
}
if ($opt_i ne "") {
    $MAX_ITER = $opt_i;
}
if ($opt_p ne "") {
    $PRINT_ITER = $opt_p;
}
if ($opt_v ne "") {
    $VERBOSE = 1;
}
if ($opt_r ne "") {
	srand($opt_r);
}
###


########### READ IN DATA

my %F_all;   # frequency counts for l h1 h2, from .freqdat file
my $DF_all;  # total counts for all rules in .freqdat file
my %domL;    # domain of L relations, to be determined by file
my %domH;    # domain of H headwords, to be determined by file

# set the domain of hidden referent variables
my %domE;
for (my $i=1;$i<=$domEsize;$i++) { $domE{"e$i"}=1; }


while ( <> ) {
  chomp;

  if ( m/^\#/ ) { 
    next; 

  # read in a conditional rule
  } elsif ( m/^([^ ]+) +(.*) +: +([^=]*)( = (.*))?$/ ) {
    #my $model = $1; my $cond = $2; my $targ = $3; my $prob = $5;

	# for relational clustering, we'll only use L rules from cNNhw freqdat
	if ($1 eq "L") {
		$F_all{"$2 $3"}  = defined($5) ? $5 : $F_all{"$2 $3"}+1;
		$DF_all         += defined($5) ? $5 : 1;
		my @le = split(/ +/,$2);
		$domL{$le[0]}++;
		$domH{$le[1]}++;
		$domH{$3}++;
	}

  } elsif ( m/^([^ ]+) +: +([^=]*)( = (.*))?$/ ) {
    $F_all{"$2"} = defined($4) ? $4 : $F_all{"$2"}+1;
  }

}

#foreach my $cond (sort keys %F_all) {
#  print "Counts: $cond = $F_all{$cond}\n";
#}
#foreach my $l (sort keys %domL) { print "read L $l\n"; }
#foreach my $h (sort keys %domH) { print "read H $h\n"; }


##################################
#### Parameter Initialization ####
##################################
my %Params;

# global variable to identify the models
my $thetaL = "L";  
my $thetaH = "H";
my $piE = "E";

# Random initial distribution for thetaL, P(e'|le)
my %DParams;
foreach my $l (keys %domL) {
	foreach my $e1 (keys %domE) {
		foreach my $e2 (keys %domE) {
			$Params{$thetaL}{"$l $e1 $e2"} = rand;
			$DParams{$thetaL}{"$l $e1"} += $Params{$thetaL}{"$l $e1 $e2"};
		}
	}
}
foreach my $l (keys %domL) {
	foreach my $e1 (keys %domE) {
		foreach my $e2 (keys %domE) {
			$Params{$thetaL}{"$l $e1 $e2"} /= $DParams{$thetaL}{"$l $e1"};
		}
	}
}
# Random initial distribution for thetaH, P(h|e)
my %DParams;
foreach my $e (keys %domE) {
	foreach my $h (keys %domH) {
		$Params{$thetaH}{"$e $h"} = rand;
		$DParams{$thetaH}{$e} += $Params{$thetaH}{"$e $h"};
	}
}
foreach my $e (keys %domE) {
	foreach my $h (keys %domH) {
		$Params{$thetaH}{"$e $h"} /= $DParams{$thetaH}{$e};
	}
}
# Random initial distribution for thetaE, P(e)
my %DParams;
foreach my $e (keys %domE) {
	$Params{$piE}{$e} = rand;
	$DParams{$piE} += $Params{$piE}{$e};
}
foreach my $e (keys %domE) {
	$Params{$piE}{$e} /= $DParams{$piE};
}
# Calculate empirical P(l), will remain unchanged
my %piL;
my $DpiL;
foreach my $rule (keys %F_all) {
	my @vars = split(/ +/,$rule); 
	$piL{shift(@vars)} += $F_all{$rule};
	$DpiL += $F_all{$rule};
}
foreach my $l (keys %piL) {
	$piL{$l} /= $DpiL;
}

###
# Print of the initialized parameters
if ( $VERBOSE==1 && $opt_o ne "" ) {
	print LOG "Starting distributions\n";
	foreach my $l (sort keys %domL) {
		foreach my $e1 (sort keys %domE) {
			foreach my $e2 (sort keys %domE) {
				print LOG "$thetaL $l $e1 : $e2 = ".$Params{$thetaL}{"$l $e1 $e2"}."\n";
			}
		}
	}
	foreach my $e (sort keys %domE) {
		foreach my $h (sort keys %domH) {
			print LOG "$thetaH $e : $h = ".$Params{$thetaH}{"$e $h"}."\n";
		}
	}
	foreach my $e (sort keys %domE) {
		print LOG "$piE : $e = ".$Params{$piE}{$e}."\n";
	}
}

###
# Set up conveniences for better looping, etc
my %CONST_Params = %Params;
#my %domALL;  # not faster
#foreach my $l (keys %domL) {
#	foreach my $h1 (keys %domH) {
#		foreach my $h2 (keys %domH) {
#			foreach my $e1 (keys %domE) {
#				foreach my $e2 (keys %domE) {
#					$domALL{"$e1 $e2 $l $h1 $h2"} = 1;
#					print "match data: $e1 $e2 $l $h1 $h2 = ".$F_all{"$l $h1 $h2"}."\n";
#				}
#			}
#		}
#	}
#}
#exit;

sub comp {
  return (1.0-@_[0]);
}




## Main Loop
my $prevloglik;
for (my $iter=1; $iter<=$MAX_ITER; $iter++) {
  if ($iter % $PRINT_ITER == 0) {
	  print stderr "==== E-M iteration $iter... ";
	  if ($opt_o ne "") {
		  print LOG "==== E-M iteration $iter ====\n";
	  }
  }

  ###################################
  ##### E-Step ######################
  ###################################

  my %P_joint;
  my %DP_joint;  # this is also the likelihood function
  
#  foreach my $l (keys %domL) {
#	  foreach my $h1 (keys %domH) {
#		  foreach my $h2 (keys %domH) {
  foreach my $inrule (keys %F_all) {
	  our($l,$h1,$h2) = split(/ +/,$inrule);
			  foreach my $e1 (keys %domE) {
				  foreach my $e2 (keys %domE) {
					  $P_joint{"$e1 $e2 $l $h1 $h2"} = 
						  $piL{$l} * $Params{$piE}{$e1} * $Params{$thetaL}{"$l $e1 $e2"}
					      * $Params{$thetaH}{"$e1 $h1"} * $Params{$thetaH}{"$e2 $h2"};
					  $DP_joint{"$l $h1 $h2"} += $P_joint{"$e1 $e2 $l $h1 $h2"};
				  }
			  }
  }
#		  }
#	  }
#  }
  
#  foreach my $l (keys %domL) {
#	  foreach my $h1 (keys %domH) {
#		  foreach my $h2 (keys %domH) {
  foreach my $inrule (keys %F_all) {
	  our($l,$h1,$h2) = split(/ +/,$inrule);
			  foreach my $e1 (keys %domE) {
				  foreach my $e2 (keys %domE) {
					  # gives P(Hidden|Observed)
					  if ($DP_joint{"$l $h1 $h2"}!=0) {
						  $P_joint{"$e1 $e2 $l $h1 $h2"} /= $DP_joint{"$l $h1 $h2"};
					  }
					  # gives empirical P(Hidden,Observed)
					  $P_joint{"$e1 $e2 $l $h1 $h2"} *= ($F_all{"$l $h1 $h2"}); #/ $DF_all);

					  if ($iter % $PRINT_ITER==0 && $iter!=$MAX_ITER && $opt_o ne "" && $VERBOSE==1) {
						  print LOG "Empir: $e1 $e2 $l $h1 $h2 = ".$P_joint{"$e1 $e2 $l $h1 $h2"}."\n";
					  }
				  }
			  }
  }
#		  }
#	  }
#  }
  
  if ($VERBOSE==1 || $iter % $PRINT_ITER==0 || $iter==$MAX_ITER) {
	  my $loglik;
	  foreach my $rule (sort keys %DP_joint) {
		  if ($F_all{$rule}>0) {
			  $loglik += log($DP_joint{$rule})*log($F_all{$rule});
		  }
	  }
	  print stderr "Loglik: ".$loglik." ====\n";
	  if ($opt_o ne "") {
		  print LOG "Loglik: ".$loglik."\n";
	  }
          if ($loglik<($prevloglik*$cutoff) && $cutoff!=0) { last; }
          $prevloglik = $loglik;
  }
  foreach my $key (keys %DP_joint) { delete $DP_joint{$key}; }

  
  ###################################
  ##### M-Step ######################
  ###################################
  foreach my $z (keys %CONST_Params) { delete $Params{$z}; }
  my %DParams;

  foreach my $rule (keys %P_joint) {
	  our($e1,$e2,$l,$h1,$h2) = split(/ +/,$rule);
	  $Params{$thetaL}{"$l $e1 $e2"} += $P_joint{$rule};
	  $DParams{$thetaL}{"$l $e1"} += $P_joint{$rule};
	  $Params{$thetaH}{"$e1 $h1"} += $P_joint{$rule};
	  $DParams{$thetaH}{$e1} += $P_joint{$rule};
	  $Params{$thetaH}{"$e2 $h2"} += $P_joint{$rule};
	  $DParams{$thetaH}{$e2} += $P_joint{$rule};
	  $Params{$piE}{$e1} += $P_joint{$rule};
	  $DParams{$piE}{""} += $P_joint{$rule};
  }

  foreach my $model (sort keys %Params) {
	  foreach my $rule (sort keys %{$Params{$model}}) {
		  my @vars = split(/ +/,$rule);
		  pop @vars;
		  eval {
			  $Params{$model}{$rule} /= $DParams{$model}{join(' ',@vars)};
		  };
		  if ($@) {
			  print stderr "ERROR: division of P_$model($rule) = $Params{$model}{$rule}/".$DParams{$model}{join(' ',@vars)}."\n";
			  print LOG "ERROR: division of P_$model($rule) = $Params{$model}{$rule}/".$DParams{$model}{join(' ',@vars)}."\n";
		  }

		  if ($iter % $PRINT_ITER==0 && $iter!=$MAX_ITER && $opt_o ne "") {
			  print LOG "Distr: $model $rule = $Params{$model}{$rule}\n";
		  }
	  }
  }

  foreach my $key (keys %DParams) { delete $DParams{$key}; }


}
## (END Main Loop)

# final printout
foreach my $model (sort keys %Params) {
	foreach my $rule (sort keys %{$Params{$model}}) {
      if ($Params{$model}{$rule}>0) {
		print LOG "$model $rule = $Params{$model}{$rule}\n";
		print "$model $rule = $Params{$model}{$rule}\n";
      }
	}
}
