#!/usr/bin/ruby
###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################


####################
# words2ints.rb
#
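# Purpose: This script takes a corpus of sentences on stdin, one sentence per
# line, words separated by whitespace (no tree structure), and maps each word
# to an int.  This is useful for things like reading in words in matlab.
#
# Any word containing a digit is treated as the single word "NUM".  Words seen
# fewer than 50 times, and words listed in the optional stop-word file (first
# command-line argument, one word per line), are left out of the output.
# The int sequences go to stdout; each new "word int" pair goes to stderr.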
#
# WARNING: Saves all sentences in array for 2nd pass, so may be memory intensive
# 
# Sample usage: cat trees.txt | ./scripts/trees2words.sh | ruby scripts/words2ints.rb > ints.txt
#
# Author: Tim Miller
####################

require 'scripts/umnlp.rb'

## Words seen fewer than this many times in the corpus are left out of the output
min_count = 50

## Optional stop-word file path, taken from the first command-line argument
stop_word_file = ARGV[0]
stop_words = {}

## First-pass state: the raw sentences (kept for the 2nd pass) and per-word counts
sentences = []
counts = CounterHash.new

## Second-pass state: word -> int table and the next int to hand out (starts at 1)
words_to_ints = {}
word_num = 1

## First pass: remember every input line and count how often each word occurs.
$stdin.each_line do |raw|
  sentence = raw.chomp
  ## Save the sentence for 2nd pass
  sentences << sentence

  sentence.split.each do |token|
    ## Any token containing a digit is counted under the shared key "NUM"
    key = (token =~ /\d/) ? "NUM" : token
    counts[key] += 1
  end
end

## Load the optional stop-word list, one word per line, from the file named by
## the first command-line argument.  Each line (minus its line ending) becomes
## a key in stop_words; the value 1 is only a placeholder for membership tests.
##
## File.foreach closes the file itself once iteration finishes (or raises), so
## no file handle is left open for the rest of the run.
if stop_word_file
  File.foreach(stop_word_file) do |entry|
    stop_words[entry.chomp] = 1
  end
end

## Second pass: for every stored sentence print one output line holding the
## ints of its surviving words, each followed by a single space.
sentences.each do |sentence|
  sentence.split.each do |token|
    ## Same normalization as in the counting pass
    token = "NUM" if token =~ /\d/

    ## Rare words and stop words produce no output at all
    next if counts[token] < min_count || stop_words.has_key?(token)

    ## First sighting of a word: give it the next int and report the
    ## "word int" pair on stderr
    unless words_to_ints.has_key?(token)
      words_to_ints[token] = word_num
      $stderr.puts "#{token} #{words_to_ints[token]}"
      word_num += 1
    end

    print "#{words_to_ints[token]} "
  end
  print "\n"
end

