###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

#!/usr/bin/ruby

########################
# Find minimum-cost alignments (copy, substitute, insert, delete)
# between the reparanda and alterations of speech repairs,
# then build a distribution for repair building, printed to
# stdout as "BB" and "BM" lines.
# Usage: cat /project/nlp/data/treebank/dysfl/dps/swbd/[23]/* | ruby scripts/alignRepairs.rb
#############################################


## Penalties charged by align for each edit operation:
##   $subs_p - pairing two words that differ (substitution)
##   $ins_p  - pairing a reparandum word with the "@" gap marker
##   $del_p  - pairing an alteration word with the "@" gap marker
##   $cp_p   - pairing two identical words (copy)
$subs_p, $ins_p, $del_p, $cp_p = 7, 4, 4, 0

## Aligns a reparandum and an alteration with a minimum-penalty edit alignment.
##
## rep, alt - arrays of words (reparandum and alteration).
##
## Returns a three-element array [score, rep_aligned, alt_aligned]:
##   score       - total penalty, using $cp_p / $subs_p / $ins_p / $del_p
##   rep_aligned - rep with "@" inserted wherever an alteration word is unmatched
##   alt_aligned - alt with "@" inserted wherever a reparandum word is unmatched
## The two aligned arrays always have the same length.
##
## Ties are broken in a fixed order: pairing the two head words wins when it is
## no more expensive than either alternative; otherwise "insert" (consume a
## reparandum word) wins only when strictly cheaper than "delete" (consume an
## alteration word).
##
## The best alignment of every pair of suffixes is computed exactly once in a
## table, so the running time is proportional to rep.size * alt.size rather
## than exponential in the input length.
def align(rep, alt)
  rep_len = rep.size
  alt_len = alt.size

  # cost[i][j] / move[i][j] describe the best alignment of rep[i..-1] with
  # alt[j..-1].  cost[rep_len][alt_len] stays 0: two empty suffixes.
  cost = Array.new(rep_len + 1) { Array.new(alt_len + 1, 0) }
  move = Array.new(rep_len + 1) { Array.new(alt_len + 1) }

  rep_len.downto(0) do |i|
    alt_len.downto(0) do |j|
      next if i == rep_len && j == alt_len

      if i == rep_len
        # Reparandum exhausted: the remaining alteration words can only be gapped.
        cost[i][j] = $del_p + cost[i][j + 1]
        move[i][j] = :del
      elsif j == alt_len
        # Alteration exhausted: the remaining reparandum words can only be gapped.
        cost[i][j] = $ins_p + cost[i + 1][j]
        move[i][j] = :ins
      else
        pair_p = rep[i] == alt[j] ? $cp_p : $subs_p
        align_s = pair_p + cost[i + 1][j + 1]
        ins_s = $ins_p + cost[i + 1][j]
        del_s = $del_p + cost[i][j + 1]

        if align_s <= ins_s && align_s <= del_s
          cost[i][j] = align_s
          move[i][j] = :align
        elsif ins_s < del_s
          cost[i][j] = ins_s
          move[i][j] = :ins
        else
          cost[i][j] = del_s
          move[i][j] = :del
        end
      end
    end
  end

  # Follow the recorded moves from the full inputs down to the empty suffixes.
  rep_out = []
  alt_out = []
  i = 0
  j = 0
  while i < rep_len || j < alt_len
    case move[i][j]
    when :align
      rep_out << rep[i]
      alt_out << alt[j]
      i += 1
      j += 1
    when :ins
      rep_out << rep[i]
      alt_out << "@"
      i += 1
    else
      rep_out << "@"
      alt_out << alt[j]
      j += 1
    end
  end

  [cost[0][0], rep_out, alt_out]
end

## Tallies for the first word of each repair: does the alteration start with
## the same word as the reparandum (copy) or a different one (substitution)?
## Only the disabled debug output further down reads these.
alt_cp = alt_sub = 0

## Per-position operation tallies taken from the alignments of the remaining words.
ins_c = del_c = align_c = 0
align_cp_c = align_sub_c = 0

longestSoFar = 0  # referenced only by the disabled debug code below
line_num = 0
pastIntro = true  # not read anywhere in the visible part of this script

## del_dist[n] counts runs of exactly n consecutive "@" markers on the
## alteration side of an alignment.
del_dist = Array.new(20, 0)

## A repair of the form "[ reparandum + alteration ]" with no nested bracket.
repair_re = /\[ ([^\[\+]+[^ ]) *\+ *([^\[\]]+) \]/

while (line = gets)
  line_num += 1
  $stderr.puts "Line #{line_num}" if line_num % 100000 == 0

  # Remove punctuation and editing terms from repairs
  line.gsub!(/[\.\,\?]\/[\.\,\?]/, '')
  line.gsub!(/\{[^{}]*\}/, '')

  # Consume the repairs on this line one at a time, leftmost first.
  while (found = repair_re.match(line))
    rep = found[1].split(/ /)
    alt = found[2].split(/ /)
    line.sub!(repair_re, '')
    next if rep.empty? || alt.empty?

    # The first word pair is tallied separately and left out of the alignment.
    if rep.shift == alt.shift
      alt_cp += 1
    else
      alt_sub += 1
    end

    # Skip very long repairs, and repairs with nothing left to align.
    next if rep.size > 10 || alt.size > 10
    next if rep.empty? && alt.empty?

#    if rep.size + alt.size > longestSoFar
#      longestSoFar = rep.size + alt.size
#      $stderr.puts "Length of #{longestSoFar} found: " + rep.join(" ")
#    end

#    $stderr.puts "Calling align with rep: " + rep.join(',') + " and alt: " + alt.join(',')
    alignment = align(rep, alt)
#    puts "Score: #{alignment[0]}"
    rep_side = alignment[1]
    alt_side = alignment[2]

    unless rep_side.size == alt_side.size
      $stderr.puts "ERROR: Alignment arrays do not match up!"
      $stderr.puts "Alignment1 = " + rep_side.join(',')
      $stderr.puts "Alignment2 = " + alt_side.join(',')
    end

    # Record the length of every run of "@" on the alteration side.  The words
    # are joined without a separator, so a word that itself contains "@" (or an
    # empty word) would blend into a neighbouring run.
    alt_side.join('').scan(/@+/) { |gap| del_dist[gap.length] += 1 }

    # Here "@" on the reparandum side counts as an insertion and "@" on the
    # alteration side as a deletion; everything else is an aligned pair.
    rep_side.zip(alt_side).each do |rep_tok, alt_tok|
      if rep_tok == "@"
        ins_c += 1
      elsif alt_tok == "@"
        del_c += 1
      else
        align_c += 1
        if rep_tok == alt_tok
          align_cp_c += 1
        else
          align_sub_c += 1
        end
      end
    end
  end
end

## Relative frequency of each operation over all aligned positions.
## With no aligned positions total_c is 0 and all three come out as NaN.
total_c = ins_c + del_c + align_c

ins_prob, del_prob, align_prob = [ins_c, del_c, align_c].map { |op_count|
  op_count.to_f / total_c.to_f
}

#puts "alt_cp = #{alt_cp}, alt_sub = #{alt_sub}"
#$stderr.puts "1st word copy probability: #{alt_cp.to_f / (alt_cp.to_f + alt_sub.to_f)}"
#$stderr.puts "1st word subs probability: #{alt_sub.to_f / (alt_cp.to_f + alt_sub.to_f)}"
#$stderr.puts "Alteration delete probability: #{del_c.to_f / total_c.to_f}"
#$stderr.puts "Alteration insert probability: #{ins_c.to_f / total_c.to_f}"
#$stderr.puts "Alteration align  probability: #{align_c.to_f / total_c.to_f}"
#$stderr.puts "Align sub probability: #{align_sub_c.to_f / align_c.to_f}"
#$stderr.puts "Align cp  probability: #{align_cp_c.to_f / align_c.to_f}"

## Number of buffer positions modelled by the BB / BM output below.
bufferSize = 4

## Fold the "@"-run length histogram so that it fits the buffer:
##   * sum[i] is the running total of del_dist[0..i] for i <= bufferSize-2;
##   * every count for a run longer than bufferSize-2 is moved into the
##     del_dist[bufferSize-2] bucket, and the same count is added to
##     sum[bufferSize-2] so the bucket and its normaliser stay in step;
##   * del_dist[i] and sum[i] are cleared for i > bufferSize-2.
## sum ends up one element longer than del_dist; that last element holds the
## grand total of all counts.
sum = Array.new
sum[0] = 0.0
del_dist.each_index{ |i|
  sum[i] += del_dist[i]
  sum[i+1] = sum[i]   # seed the next running total before sum[i] may be cleared
  if i > bufferSize-2
    del_dist[bufferSize-2] += del_dist[i]
    # Add the count to the normaliser BEFORE zeroing it; doing it afterwards
    # always adds 0 and leaves sum[bufferSize-2] smaller than its bucket.
    sum[bufferSize-2] += del_dist[i]
    del_dist[i]= 0
    sum[i] = 0
  end
}

## Generate Buffer Back model:
## probs[k] is the unnormalised weight 1.22 * 0.45**k for going back k places.
## back_sum[k] is the running total probs[0] + ... + probs[k], so that the
## weights reachable from buffer position k can be normalised by back_sum[k].
probs = []
back_sum = [0.0]
bufferSize.times do |k|
  probs << 1.22 * 0.45**(k.to_f)
  back_sum[k] += probs[k]
  back_sum << back_sum[k]   # seed the next running total
end

## Renormalize
#0.upto(bufferSize-1){ |i|
#  probs[i] = probs[i] / sum
## One "BB <position> : <target> = <probability>" line per reachable target,
## from the last buffer position down to the first.
(0...bufferSize).to_a.reverse.each do |pos|
  normaliser = back_sum[pos]
  pos.downto(0) do |target|
    printf "BB %d : %d = %f\n", pos, target, probs[pos - target] / normaliser
  end
end

## Generate Buffer Move model:
#0.upto(bufferSize-1){ |i|
#  1.upto(bufferSize-2){ |j|
#    printf "BM %d : %d = %0.6f\n", i, (i+j+1) % bufferSize, del_prob * del_dist[j].to_f / sum.to_f
#  }
#  printf "BM %d : %d = %0.6f\n", i, i, ins_prob
#  printf "BM %d : %d = %0.6f\n", i, (i+1) % bufferSize, align_prob
#}

#del_dist.each_index{ |i| 
#  puts "del_dist[#{i}] = #{del_dist[i].to_f / sum.to_f}"
#}

## One "BM <from> : <to> = <probability>" line for every move from a buffer
## position to itself or to a later position:
##   staying put         -> ins_prob
##   one step forward    -> align_prob (plus del_prob from the last start
##                          position, bufferSize-2)
##   more than one step  -> del_prob scaled by the folded "@"-run histogram
##                          entry for the number of positions skipped
## NOTE(review): the entries printed for one start position are not obviously
## normalised to sum to 1 -- confirm against the consumer of this model.
last_start = bufferSize - 2
(0..last_start).each do |from|
  (from..bufferSize - 1).each do |to|
    skipped = to - from - 1   # -1 when staying put, 0 for a single step
    move_prob =
      if skipped < 0
        ins_prob
      elsif skipped == 0
        from == last_start ? align_prob + del_prob : align_prob
      else
        del_prob * del_dist[skipped].to_f / sum[skipped].to_f
      end
    printf "BM %d : %d = %f\n", from, to, move_prob
  end
end


