#!/usr/bin/perl -w

# vectorize.pl
# Shane Bergsma
# July 5, 2009

# This program takes in:
#  1) A set of word-labelled features
#  2) A list of the features, in order
#
# It outputs:
#  1) A set of feature vectors

##########################################
# Initialization:
##########################################
# Require exactly two arguments: the "-f" flag, then the feature-list path (examples come on STDIN).
if (@ARGV != 2) {
  die "Must pass examples at STDIN and feature-list
E.g. cat EXAMPLES | vectorize.pl -f FEATURELIST\n";
}
my ($fflag, $featureListFN) = @ARGV;
$fflag eq "-f" or die "Bad input.";

# Open the required input.  Three-argument open with a lexical handle is
# used so that a file name containing characters such as '|' or '>' is
# always treated as a plain path, never as an open mode or a command.
open(my $featureListFH, '<', $featureListFN)
  or die "Cannot open $featureListFN: $!";

# First, read the feature list.  Each line holds one feature name, and the
# name on line N is assigned index N (1-based).  If a name occurs more than
# once, the index of its last occurrence is the one that is kept.
my %featureMap;
my $idx = 1;
# Read line by line rather than slurping the whole list into memory.
while (my $featureName = <$featureListFH>) {
  chomp $featureName;
  # Tolerate CRLF line endings: a stray "\r" left on the name would stop it
  # from ever matching a feature label parsed from the examples.
  $featureName =~ s/\r\z//;
  $featureMap{$featureName} = $idx++;
}
close($featureListFH);

##########################################
# Generate the FVs:
##########################################

# Added to a value before its log is taken; in effect, we add one to all
# counts:
my $SMOOTHING = 1.0;

# Convert every example line on STDIN into one feature-vector line on STDOUT.
#
# Input line:   LABEL FEAT[:VALUE] FEAT[:VALUE] ... [# comment]
# Output line:  LABEL INDEX:VALUE INDEX:VALUE ...   (ascending INDEX)
#
# INDEX is looked up in %featureMap; features that are not in the map are
# dropped.  A VALUE of the form "X_L" is emitted literally as X; any other
# VALUE is emitted as log(VALUE + $SMOOTHING) with three decimals.  A feature
# without a VALUE is treated as "1_L", i.e. a binary feature with value 1.
# Dies if a line does not have the expected shape or if a log argument is
# not positive.
while (my $example = <STDIN>) {
  chomp $example;

  ##########################################
  # Parse the example:
  ##########################################
  # Capture 1 is the label (leading run of non-space characters), capture 2
  # is everything after the single separating space up to the first '#'.
  # The optional trailing "# ..." comment (capture 3) is ignored.
  my ($label, $feats) = $example =~ /^([^ ][^ ]*) ([^\#]*)(\# .*)?/
    or die "Example $example not in right format.";

  ##########################################
  # Build the FV:
  ##########################################
  # Each element is [index, "index:value"], so that sorting does not have to
  # re-parse the formatted entry.
  my @fv;

  # Split on any sequence of whitespace:
  for my $featPair (split(/\s+/, $feats)) {
    # Leading whitespace yields one blank entry; skip it.
    next if $featPair eq "";

    # Strip out the value part of the field.  If there was no value, we
    # assume 1_L: binary.
    # NOTE(review): because of /g every ":chunk" is removed and $1 holds the
    # last one, so a label that itself contains ':' can never be matched.
    # Kept as-is to preserve existing output -- confirm whether intended.
    my $flabel = $featPair;
    my $value  = "1_L";
    if ($flabel =~ s/:([^:][^:]*)//g) {
      $value = $1;
    }

    # Essentially, check if this feature was observed in the training set:
    my $featIdx = $featureMap{$flabel};
    next unless defined($featIdx);

    # See if it was labeled for us NOT to take the log:
    my ($numericalValue, $typeTag) = split(/_/, $value);
    my $entry;
    if (defined($typeTag) && $typeTag eq "L") {
      $entry = "$featIdx:$numericalValue";
    } else {
      # Otherwise, take the log.  Only the part before the first '_' is
      # used as the number; that is the same number Perl would extract from
      # the whole tagged string, but without a "isn't numeric" warning.
      my $number  = defined($numericalValue) ? $numericalValue : 0;
      my $shifted = $number + $SMOOTHING;
      if ($shifted <= 0) {
        # log() is fatal for such arguments; fail with enough context to
        # locate the offending feature.
        die "Cannot take log of $shifted (feature $flabel, value $value) in example $example";
      }
      $entry = sprintf("%i:%.3f", $featIdx, log($shifted));
    }
    push(@fv, [$featIdx, $entry]);
  }

  # Once you've collected all the feats, sort by index and print:
  my @sortedEntries = map { $_->[1] } sort { $a->[0] <=> $b->[0] } @fv;
  print "$label @sortedEntries\n";
}

