import numpy as np
import re
import itertools, csv
from collections import Counter
from nltk.tokenize import TweetTokenizer
import json
import simplejson
import gzip


def parse(filename):
    """Stream records from a gzip-compressed ``key: value`` text file.

    Each non-blank line is split at its first colon; the text before the
    colon becomes the key and the text after the colon plus one separator
    character becomes the value. A line without any colon (e.g. a blank
    line) terminates the current record.

    Args:
        filename: Path of the gzip file to read.

    Yields:
        dict: One mapping per record. Keys and values are ``bytes`` because
        the file is read in binary mode. The entry accumulated after the
        last separator is always yielded, so it may be empty when the file
        ends with a separator line.
    """
    entry = {}
    # The context manager closes the gzip handle when the file is exhausted
    # or when the generator is closed/garbage-collected early.
    with gzip.open(filename, 'rb') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            key, colon, remainder = line.partition(b':')
            if not colon:
                # No colon on this line: the current record is complete.
                yield entry
                entry = {}
                continue
            # Drop the single separator character that follows the colon.
            entry[key] = remainder[1:]
    yield entry


def clean_str(string):
    """Tokenize and normalise a piece of text.

    The text is tokenized with NLTK's ``TweetTokenizer`` and re-joined with
    single spaces. The characters ``- . # " /`` are replaced by spaces,
    the contraction suffixes ``'s 'm 've n't 're 'd 'll`` are split off
    with a leading space, every other apostrophe is replaced by a space,
    and runs of whitespace are collapsed.

    Args:
        string: The raw text to clean.

    Returns:
        str: The cleaned text with leading/trailing whitespace removed.
    """
    tokenizer = TweetTokenizer()
    string = ' '.join(tokenizer.tokenize(string))
    string = re.sub(r"[-.#\"/]", " ", string)
    # Detach contraction suffixes so they become separate tokens. The
    # substitutions are applied one after another, in this order.
    contraction_rules = (
        (r"\'s", " \'s"),
        (r"\'m", " \'m"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
    )
    for pattern, replacement in contraction_rules:
        string = re.sub(pattern, replacement, string)
    # Remove every apostrophe that does not introduce one of the suffixes
    # split off above. "m" must be listed here, otherwise the "'m" token
    # produced by the loop would immediately lose its apostrophe.
    string = re.sub(r"\'(?!(s|m|ve|t|re|d|ll))", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def load_data_and_labels(data_file):
    """Load cleaned review texts and their helpfulness ratios.

    Every entry produced by ``parse(data_file)`` is round-tripped through
    ``simplejson.dumps`` / ``json.loads`` (presumably to turn the bytes
    keys and values into ``str`` -- confirm against simplejson's bytes
    handling). Entries lacking ``'review/helpfulness'`` are skipped, as are
    entries whose helpfulness denominator is 5 or less.

    Args:
        data_file: Path handed to ``parse``.

    Returns:
        list: ``[x_text, y]`` where ``x_text`` holds the ``clean_str``-ed
        ``'review/text'`` values and ``y`` holds one-element lists with the
        helpful/total ratio of the matching review.
    """
    texts = []
    labels = []
    for raw_entry in parse(data_file):
        record = json.loads(simplejson.dumps(raw_entry))
        if 'review/helpfulness' not in record:
            continue
        # The helpfulness field looks like "<helpful>/<total>".
        fraction = record['review/helpfulness'].split('/')
        total_votes = int(fraction[1])
        if total_votes <= 5:
            continue
        ratio = int(fraction[0]) / total_votes
        texts.append(clean_str(record['review/text']))
        labels.append([ratio])
    return [texts, labels]

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield ``(batch, indices)`` mini-batches over ``data`` for several epochs.

    Args:
        data: Sequence of samples; it is converted with ``np.array``.
        batch_size: Maximum number of samples per batch. The last batch of
            an epoch may be smaller.
        num_epochs: Number of passes over the data.
        shuffle: When true, each epoch iterates over a permutation of the
            data; otherwise the original order is used.

    Yields:
        tuple: ``(batch, indices)`` where ``batch`` is a slice of the
        (possibly permuted) data array and ``indices`` holds the positions
        of those samples in the original ``data``. Without shuffling the
        indices are simply the consecutive positions. Empty input yields
        no batches.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division; evaluates to 0 for empty input.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    identity_indices = np.arange(data_size)
    for _ in range(num_epochs):
        if shuffle:
            # NOTE(review): reseeding the global NumPy RNG with 0 on every
            # epoch means each epoch uses the same permutation. Kept as-is
            # because callers may rely on this fixed order -- confirm
            # whether a different shuffle per epoch was intended.
            np.random.seed(0)
            epoch_indices = np.random.permutation(identity_indices)
            epoch_data = data[epoch_indices]
        else:
            # Always bind the indices so the yield below works without
            # shuffling as well.
            epoch_indices = identity_indices
            epoch_data = data

        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield (epoch_data[start_index:end_index],
                   epoch_indices[start_index:end_index])
