#!/usr/bin/env python

import csv
import os
import string
import sys

from collections import defaultdict

import nltk

def usage():
    """Write the command-line usage message to standard error."""
    # Usage text is diagnostic output, so it goes to stderr rather than
    # stdout; sys.stderr.write also runs unchanged on Python 2 and Python 3.
    sys.stderr.write('Usage: compute_agreement_and_bins.py <inputfile>\n')

# The input csv path is the first command-line argument. Without it, show
# the usage message and exit with status 1.
if len(sys.argv) < 2:
    usage()
    sys.exit(1)
inputfile = sys.argv[1]
    
# Create a dictionary to hold the bins and a list to hold the agreement values.
# bins maps a bin index (0, 1 or 2) to the list of rows placed in that bin.
bins = defaultdict(list)
agreements = []

# Read in the input csv file. The 'with' block guarantees the file handle is
# closed once every row has been consumed.
with open(inputfile) as infile:
    r = csv.reader(infile)

    # Get the header (the first row of the file)
    header = next(r)

    # Iterate over the remaining rows in the csv file
    for row in r:

        # Get the various fields. The unpacking requires exactly eight
        # columns per row; only the crowd ratings are used below.
        unitid, prep, sentence, preplocation, sys1pred, sys2pred, internal, crowd = row

        # Ratings are '|'-separated. Ignore any 'Ungram' (denoted by '2')
        # ratings by Turkers.
        crowd = [x for x in crowd.split('|') if x != '2']

        # Use an NLTK FreqDist to compute the majority Turker rating.
        # NOTE(review): if every rating in a row is '2' the list is empty and
        # fd.max() presumably fails on the empty distribution -- confirm the
        # data never contains such rows.
        fd = nltk.FreqDist()
        fd.update(crowd)
        majority = fd.max()

        # Compute the agreement on the majority rating as a percentage,
        # rounded to two decimal places
        agreement = round(fd.freq(majority) * 100, 2)
        agreements.append(agreement)

        # Put the instance into one of three bins (based on the agreement
        # value): below 75, from 75 up to (not including) 90, and 90 or above
        if agreement < 75.0:
            bins[0].append(row)
        elif agreement < 90.0:
            bins[1].append(row)
        else:
            bins[2].append(row)

# Create one output csv file per bin. The position of a file name in this
# list is the bin index used as the key in 'bins'.
bin_filenames = ['bin1-50-75.csv', 'bin2-75-90.csv', 'bin3-90-100.csv']

for key, filename in enumerate(bin_filenames):
    # The 'with' block closes (and therefore flushes) each file as soon as it
    # has been written, instead of leaving the handle open until exit.
    with open(filename, 'w') as binfile:
        w = csv.writer(binfile)

        # Write the header, then the rows of this bin. Every file gets the
        # header even when its bin is empty.
        w.writerow(header)
        for row in bins.get(key, []):
            w.writerow(row)

# Write out the agreement values, one per line, in input row order
with open('agreements', 'w') as agreementfile:
    for a in agreements:
        agreementfile.write(str(a) + '\n')
