

from __future__ import print_function
import six.moves.cPickle as pickle

from collections import OrderedDict
import sys
import time

import numpy
import theano
from theano import config
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import data



# Seed numpy's global random number generator so that runs are repeatable.
# SEED is also handed to the Theano random stream created in build_model.
SEED = 4123
numpy.random.seed(SEED)

def numpy_floatX(data):
    """Return ``data`` as a numpy array whose dtype is Theano's ``floatX``."""
    target_dtype = config.floatX
    return numpy.asarray(data, dtype=target_dtype)


def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """
    Split the sample indices ``0 .. n-1`` into consecutive minibatches.

    :param n: number of samples.
    :param minibatch_size: number of indices per minibatch; the last
        minibatch is shorter when ``n`` is not a multiple of it.
    :param shuffle: when True, the indices are permuted with numpy's global
        random generator before being cut into minibatches.
    :return: a list of ``(minibatch_number, index_array)`` pairs.  A real
        list is returned (not a lazy ``zip`` object) so that callers can
        take ``len()`` of it and iterate over it more than once on
        Python 3 as well as on Python 2.
    """

    idx_list = numpy.arange(n, dtype="int32")

    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size

    if minibatch_start != n:
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])

    return list(enumerate(minibatches))


def zipp(params, tparams):
    """
    Push every value of ``params`` into the same-named entry of ``tparams``
    through its ``set_value`` method (used when a saved model is reloaded).
    """
    for name in params:
        tparams[name].set_value(params[name])


def unzip(zipped):
    """
    Read the current value of every entry of ``zipped`` via ``get_value``
    and return them in an OrderedDict with the same keys and order (used
    before the model is pickled/saved).
    """
    return OrderedDict((name, shared.get_value())
                       for name, shared in zipped.items())


def dropout_layer(state_before, _p, trng):
    """
    Multiply ``state_before`` by a random mask drawn from ``trng``.

    The mask is ``trng.binomial(n=1, p=_p)`` with one entry per column
    (``state_before.shape[1]``) and the dtype of ``state_before``; for a
    2-D input the same mask is therefore broadcast over every row.

    :param state_before: input whose second dimension sets the mask size.
    :param _p: probability passed to the binomial draw.
    :param trng: random stream providing ``binomial``.
    :return: the element-wise product of the input and the mask.
    """
    n_columns = state_before.shape[1]
    keep_mask = trng.binomial(size=(n_columns,), p=_p, n=1,
                              dtype=state_before.dtype)
    return state_before * keep_mask


def _p(pp, name):
    """Build a parameter key by joining prefix ``pp`` and ``name`` with '_'."""
    parts = (pp, name)
    return '%s_%s' % parts


def init_params(options, Wemb_value=None):
    """
    Create the initial numpy values of every model parameter.

    The embeddings ('Lemb', 'Wemb') and the softmax classifier ('W3', 'b3')
    are built here; the CNN parameters are added by ``param_init_cnn``.

    :param options: dict of model options; reads 'dim_locDiff',
        'n_locDiffs', 'dim_token', 'n_words', 'n2' and 'ydim'.
    :param Wemb_value: optional token embedding table.  When given it is
        used (cast to floatX) instead of a random initialisation.
    :return: OrderedDict mapping parameter names to numpy arrays.
    """
    float_type = theano.config.floatX
    rng = numpy.random.RandomState(23455)

    def uniform_init(bound, shape):
        # Uniform draw between -bound and bound, cast to floatX.
        drawn = rng.uniform(low=-bound, high=bound, size=shape)
        return numpy.asarray(drawn).astype(float_type)

    params = OrderedDict()

    # Embeddings of location differences.
    params['Lemb'] = uniform_init(
        1. / (options['dim_locDiff']),
        (options['n_locDiffs'], options['dim_locDiff']))

    # Embeddings of sentence tokens: supplied table if any, else random.
    if Wemb_value is not None:
        params['Wemb'] = numpy.asarray(Wemb_value).astype(float_type)
    else:
        params['Wemb'] = uniform_init(
            1. / (options['dim_token']),
            (options['n_words'], options['dim_token']))

    # CNN parameters (drawn from their own random generator).
    params = param_init_cnn(options, params, prefix='cnn')

    # Softmax classifier: its input is n2 + dim_locDiff wide, while the
    # sampling bound is derived from n2 + ydim.
    params['W3'] = uniform_init(
        1. / (options['n2'] + options['ydim']),
        (options['n2'] + options['dim_locDiff'], options['ydim']))
    params['b3'] = numpy.zeros((options['ydim'],)).astype(float_type)

    return params


def load_params(path, params):
    """
    Overwrite the entries of ``params`` with the arrays of a numpy archive.

    :param path: path of an ``.npz`` archive (e.g. written by
        ``numpy.savez``).
    :param params: dict whose keys name the arrays to load; it is updated
        in place.
    :return: the same ``params`` dict.
    :raises Warning: if a key of ``params`` is missing from the archive.
    """
    # The loaded archive keeps a file handle open; the ``with`` block
    # closes it on every exit path, including the missing-key error.
    with numpy.load(path) as archive:
        for kk in params:
            if kk not in archive:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = archive[kk]

    return params


def init_tparams(params):
    """
    Wrap every value of ``params`` in a Theano shared variable named after
    its key and return them in an OrderedDict with the same order.
    """
    return OrderedDict((name, theano.shared(value, name=name))
                       for name, value in params.items())



def param_init_cnn(options, params, prefix='cnn'):
    """
    Add the CNN parameters to ``params``.

    Creates ``<prefix>_W1``/``<prefix>_b1`` for the convolutional layer and
    ``<prefix>_W2``/``<prefix>_b2`` for the hidden layer.  Weights are drawn
    uniformly within +/- sqrt(1 / (rows + columns)); biases start at zero.

    :param options: dict of model options; reads 'dim_token', 'win_size',
        'n1' and 'n2'.
    :param params: dict that receives the new entries (updated in place).
    :param prefix: string prepended to each parameter name.
    :return: the same ``params`` dict.
    :see: init_params
    """
    rng = numpy.random.RandomState(45645)
    float_type = theano.config.floatX

    def scaled_uniform(n_in, n_out):
        # Uniform weights whose range shrinks with the layer's size.
        bound = numpy.sqrt(1. / (n_in + n_out))
        drawn = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
        return numpy.asarray(drawn).astype(float_type)

    # Convolutional layer: one window of win_size token embeddings -> n1.
    conv_in = options['dim_token'] * options['win_size']
    conv_out = options['n1']
    params[_p(prefix, 'W1')] = scaled_uniform(conv_in, conv_out)
    params[_p(prefix, 'b1')] = numpy.zeros((conv_out,)).astype(float_type)

    # Hidden layer: its input is three times n1 wide.
    hidden_in = options['n1'] * 3
    hidden_out = options['n2']
    params[_p(prefix, 'W2')] = scaled_uniform(hidden_in, hidden_out)
    params[_p(prefix, 'b2')] = numpy.zeros((hidden_out,)).astype(float_type)

    return params


def cnn_layer(tparams, state_below, options, trng, prefix, mask):
    """
    Build the symbolic CNN feature extractor.

    The input is scaled by ``mask``, mapped through W1/b1, pooled over
    axis 1 in three ways (a sum weighted by normalised absolute
    activations, the maximum and the minimum), concatenated, passed
    through a tanh hidden layer (W2/b2) and finally divided by its L2 norm
    along axis 1.

    :param tparams: dict of shared variables holding ``<prefix>_W1``,
        ``<prefix>_b1``, ``<prefix>_W2`` and ``<prefix>_b2``.
    :param state_below: 3-D input; presumably (batch, windows, features),
        as reshaped in build_model.
    :param options: not used in this function.
    :param trng: not used in this function.
    :param prefix: prefix of the parameter names.
    :param mask: 2-D mask matching the first two axes of ``state_below``.
    :return: symbolic matrix of row-normalised hidden activations.
    """
    conv_weights = tparams[_p(prefix, 'W1')]
    conv_bias = tparams[_p(prefix, 'b1')]
    hidden_weights = tparams[_p(prefix, 'W2')]
    hidden_bias = tparams[_p(prefix, 'b2')]

    # Convolutional layer
    masked_input = state_below * mask[:, :, None]
    conv_y1 = tensor.dot(masked_input, conv_weights) + conv_bias

    # Pool over axis 1: weighted sum, max and min.
    abs_y1 = tensor.abs_(conv_y1)
    pool_weights = abs_y1 / (tensor.sum(abs_y1, axis=1))[:, None, :]
    pooled_weighted = tensor.sum(conv_y1 * pool_weights, axis=1)
    pooled_max = tensor.max(conv_y1, axis=1)
    pooled_min = tensor.min(conv_y1, axis=1)
    conv_out = tensor.concatenate(
        (pooled_weighted, pooled_max, pooled_min), axis=1)

    # Hidden layer followed by L2 normalisation of each row.
    hidden_y2 = tensor.tanh(tensor.dot(conv_out, hidden_weights) + hidden_bias)
    row_norm = (tensor.sqrt(tensor.sum(hidden_y2 ** 2, axis=1)))[:, None]
    return hidden_y2 / row_norm



def sgd(lr, tparams, grads, x, mask, y, cost):
    """
    Stochastic Gradient Descent.

    :param lr: symbolic learning rate, the input of ``f_update``.
    :param tparams: dict of shared parameters to optimise.
    :param grads: symbolic gradients, in the order of ``tparams``.
    :param x, mask, y: symbolic inputs of the cost.
    :param cost: symbolic cost.
    :return: ``(f_grad_shared, f_update)``.  ``f_grad_shared(x, mask, y)``
        returns the cost and stores the minibatch gradients in shared
        variables; ``f_update(lr)`` then applies ``p <- p - lr * g``.
    """
    # One shared buffer per parameter to hold the gradient of a minibatch.
    gshared = []
    for k, p in tparams.items():
        gshared.append(theano.shared(p.get_value() * 0., name='%s_grad' % k))
    gsup = list(zip(gshared, grads))

    # Computes the cost and stores the gradients; the weights are untouched.
    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
                                    name='sgd_f_grad_shared')

    # Applies the stored gradients to the weights.
    pup = []
    for p, g in zip(tparams.values(), gshared):
        pup.append((p, p - lr * g))
    f_update = theano.function([lr], [], updates=pup,
                               name='sgd_f_update')

    return f_grad_shared, f_update


def adadelta(lr, tparams, grads, x, mask, y, cost):
    """
    Adadelta optimizer.

    :param lr: symbolic scalar accepted by ``f_update`` but not used by
        the update rule (hence ``on_unused_input='ignore'``).
    :param tparams: dict of shared parameters to optimise.
    :param grads: symbolic gradients, in the order of ``tparams``.
    :param x, mask, y: symbolic inputs of the cost.
    :param cost: symbolic cost.
    :return: ``(f_grad_shared, f_update)``.  ``f_grad_shared(x, mask, y)``
        returns the cost, stores the gradients and updates the running
        average of squared gradients; ``f_update(lr)`` applies the step
        and updates the running average of squared steps.
    """
    def zero_state(name_template):
        # One zero-filled shared variable per parameter.
        return [theano.shared(p.get_value() * numpy_floatX(0.),
                              name=name_template % k)
                for k, p in tparams.items()]

    zipped_grads = zero_state('%s_grad')
    running_up2 = zero_state('%s_rup2')
    running_grads2 = zero_state('%s_rgrad2')

    zgup = list(zip(zipped_grads, grads))
    rg2up = []
    for rg2, g in zip(running_grads2, grads):
        rg2up.append((rg2, 0.95 * rg2 + 0.05 * (g ** 2)))

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    # Step direction: gradient rescaled by the ratio of the two RMS values.
    updir = []
    for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2):
        updir.append(-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg)

    ru2up = []
    param_up = []
    for p, ru2, ud in zip(tparams.values(), running_up2, updir):
        ru2up.append((ru2, 0.95 * ru2 + 0.05 * (ud ** 2)))
        param_up.append((p, p + ud))

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update


def rmsprop(lr, tparams, grads, x, mask, y, cost):
    """
    RMSprop-style optimizer with a momentum term.

    :param lr: symbolic scalar accepted by ``f_update`` but not used by
        the update rule (the step factor 1e-4 is hard-coded).
    :param tparams: dict of shared parameters to optimise.
    :param grads: symbolic gradients, in the order of ``tparams``.
    :param x, mask, y: symbolic inputs of the cost.
    :param cost: symbolic cost.
    :return: ``(f_grad_shared, f_update)``.  ``f_grad_shared(x, mask, y)``
        returns the cost, stores the gradients and updates the running
        averages of the gradients and of their squares; ``f_update(lr)``
        updates the step buffers and adds the new step to the parameters.
    """
    def zero_state(name_template):
        # One zero-filled shared variable per parameter.
        return [theano.shared(p.get_value() * numpy_floatX(0.),
                              name=name_template % k)
                for k, p in tparams.items()]

    zipped_grads = zero_state('%s_grad')
    running_grads = zero_state('%s_rgrad')
    running_grads2 = zero_state('%s_rgrad2')

    zgup = list(zip(zipped_grads, grads))
    rgup = []
    rg2up = []
    for rg, rg2, g in zip(running_grads, running_grads2, grads):
        rgup.append((rg, 0.95 * rg + 0.05 * g))
        rg2up.append((rg2, 0.95 * rg2 + 0.05 * (g ** 2)))

    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = zero_state('%s_updir')

    # New step: 0.9 * previous step minus the gradient divided by the
    # square root of (mean of squares - squared mean + 1e-4).
    updir_new = []
    for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                               running_grads2):
        step = 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4)
        updir_new.append((ud, step))

    param_up = [(p, p + step)
                for p, (_, step) in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update


def build_model(tparams, options):
    """
    Build the symbolic model and compile its prediction functions.

    :param tparams: dict of shared variables; reads 'Lemb', 'Wemb', 'W3',
        'b3' and the CNN parameters used by ``cnn_layer``.
    :param options: dict of model options; reads 'dim_token', 'win_size'
        and 'use_dropout'.
    :return: the tuple ``(use_noise, x, mask, y, f_pred_prob, f_pred, cost,
        f_pred_prob_test, f_pred_test)``.  The ``*_test`` functions are
        built from the projection that bypasses ``dropout_layer``; the
        other two (and ``cost``) use the projection that goes through it
        when ``options['use_dropout']`` is true.
    """
    trng = RandomStreams(SEED)


    # Shared flag intended for dropout.  NOTE(review): it is returned to the
    # caller but no expression built in this function references it.
    use_noise = theano.shared(numpy_floatX(0.))

    # Symbolic inputs: index matrix, float mask and label vector.
    x = tensor.matrix('x', dtype='int32')
    mask = tensor.matrix('mask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int32')

    # Column 0 of x indexes 'Lemb'; the remaining columns index 'Wemb'.
    idx_LocDiff = x[:,0]
    idx_sentInfo_1 = x[: , 1 : ]#1 + options['win_size'] * options['maxTokens1']]

    locDiff_emb = tparams['Lemb'][idx_LocDiff]
    sentInfo_emb_1 = tparams['Wemb'][idx_sentInfo_1]

    # Group the token embeddings into windows of win_size tokens each:
    # (rows of x, number of windows, dim_token * win_size).
    sentInfo_emb_1 = sentInfo_emb_1.reshape((x.shape[0], -1, 
                        options['dim_token'] * options['win_size']))

    #CNN Layer
    conv_feat = cnn_layer(tparams, sentInfo_emb_1, options, trng,
        prefix='cnn', mask=mask)


    # Classifier input: location-difference embedding followed by the CNN
    # features.  proj_test keeps the version that never goes through
    # dropout_layer.
    proj = tensor.concatenate((locDiff_emb, conv_feat),axis=1)
    proj_test = proj

    if options['use_dropout']:
        # 0.8 is the probability passed to the binomial draw of the mask.
        proj = dropout_layer(proj, 0.8, trng)

    # softmax
    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['W3']) + tparams['b3'])
    pred_test = tensor.nnet.softmax(tensor.dot(proj_test, tparams['W3']) + tparams['b3'])

    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    f_pred_prob_test = theano.function([x, mask], pred_test, name='f_pred_prob_test')
    f_pred_test = theano.function([x, mask], pred_test.argmax(axis=1), name='f_pred_test')

    # Small offset added inside the log so that log(0) is never evaluated.
    off = 1e-8
    if pred.dtype == 'float16':
        off = 1e-6

    # L2 penalty: sum of the squares of every parameter in tparams, weighted
    # by 0.5 * lamda_value in the cost below.
    lamda_value = theano.shared(1e-4)
    theta_sum = theano.shared(0.)
    for kk, pp in tparams.items():
        theta_sum += tensor.sum(tparams[kk]**2)

    # Mean negative log-probability of the true labels plus the L2 penalty.
    cost = -tensor.mean(tensor.log(pred[tensor.arange(y.shape[0]), y] + off)) + \
                0.5 * lamda_value * theta_sum


    return use_noise, x, mask, y, f_pred_prob, f_pred, cost, f_pred_prob_test, f_pred_test


def pred_probs(f_pred_prob, prepare_data, data, iterator, options, verbose=False):
    """
    Compute the class probabilities of new examples with a trained model.

    :param f_pred_prob: function mapping ``(x, mask)`` to a matrix of
        probabilities for one minibatch.
    :param prepare_data: function turning raw samples and labels into
        ``(x, mask, y, x_maxlen)``.
    :param data: pair-like object; ``data[0]`` holds the samples and
        ``data[1]`` the labels.
    :param iterator: iterable of ``(minibatch_number, index_array)`` pairs.
    :param options: dict of model options; reads 'ydim'.
    :param verbose: when True, print the progress after every minibatch.
    :return: array of shape ``(len(data[0]), options['ydim'])`` in floatX.
    """
    total = len(data[0])
    probs = numpy.zeros((total, options['ydim'])).astype(config.floatX)

    done = 0
    for _, batch_index in iterator:
        batch_samples = [data[0][t] for t in batch_index]
        batch_labels = numpy.array(data[1])[batch_index]
        x, mask, _labels, _maxlen = prepare_data(batch_samples,
                                                 batch_labels,
                                                 maxlen=None)
        probs[batch_index, :] = f_pred_prob(x, mask)

        done += len(batch_index)
        if verbose:
            print('%d/%d samples classified' % (done, total))

    return probs



def output_pred_labels(f_pred, prepare_data, data, iterator, verbose, path):
    """
    Write the predicted label of every example to a text file, one per line.

    :param f_pred: function mapping ``(x, mask)`` to the predicted labels
        of one minibatch.
    :param prepare_data: function turning raw samples and labels into
        ``(x, mask, y, x_maxlen)``.
    :param data: pair-like object; ``data[0]`` holds the samples and
        ``data[1]`` the labels.
    :param iterator: iterable of ``(minibatch_number, index_array)`` pairs;
        predictions are written in this order.
    :param verbose: not used in this function.
    :param path: output file; it is overwritten.
    """
    # ``with`` closes the file even when prepare_data or f_pred raises.
    with open(path, 'w') as f:
        for _, valid_index in iterator:
            x, mask, _y, _x_maxlen = prepare_data(
                [data[0][t] for t in valid_index],
                numpy.array(data[1])[valid_index],
                maxlen=None)
            preds = f_pred(x, mask)

            f.writelines(str(label) + '\n' for label in preds)


def train_cnn(

    # Hyper-Parameters

    dim_token=100,  # word embedding dimension
    dim_locDiff=10,  # location difference embedding dimension
    n1=200,  # output width of the convolutional layer
    n2=500,  # output width of the hidden layer
    ydim=3,  # number of classes
    win_size=3,  # number of token embeddings per convolution window

    n_words=1125,  # Vocabulary size
    n_locDiffs=79,  # Location difference size

    patience=10,  # Number of "bad" epochs tolerated before early stop
    max_epochs=300,  # The maximum number of epoch to run
    dispFreq=10,  # Not used in this function
    decay_c=0.,  # Not used in this function
    lrate=0.001,  # Value passed to f_update (ignored by adadelta/rmsprop)

    optimizer=sgd,  # sgd, adadelta or rmsprop

    maxlen=1000,  # Passed to data.load_data
    batch_size=16,  # The batch size during training.

    # Parameter for extra option
    noise_std=0.,  # Not used in this function
    use_dropout=True,  # Whether build_model applies dropout_layer
    reload_model=None,  # If truthy, load initial values from 'model.npz'
    saveto=None,  # e.g. 'model.npz'; when set, save after every epoch
    test_size=-1  # Not used in this function
):
    """
    Train the CNN classifier and write test-set predictions every epoch.

    Steps: load the data set and the pre-trained token embeddings, build
    the Theano model and the optimizer functions, then loop over epochs.
    After each epoch the summed training cost is printed, the model is
    optionally saved to ``saveto`` (with the options pickled to
    ``<saveto>.pkl``), and the test predictions are written to
    'test_predicted_labels.txt'.

    Early stopping: an epoch counts as "bad" when its summed cost is not
    below 0.9995 times the previous epoch's; training stops after
    ``patience`` bad epochs (the counter is never reset).  Training also
    stops when the summed cost becomes NaN or infinite, and on Ctrl-C.
    """
    # Model options.  This must stay the first statement so that locals()
    # contains exactly the keyword arguments above.
    model_options = locals().copy()
    print('----------------------------------------------')
    print("model options", model_options)
    print('----------------------------------------------')

    print('Loading data ... ... ...')
    train, valid, test = data.load_data(path='../mydata.pkl.gz',
                                        n_words=n_words, valid_portion=0.,
                                        maxlen=maxlen, sort_by_len=False)

    print('Building model ... ... ...')
    params = init_params(model_options,
                         Wemb_value=data.read_gz_file("../emb.pkl.gz"))

    if reload_model:
        # NOTE(review): reload_model only acts as a flag; the archive name
        # is hard-coded.
        load_params('model.npz', params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost,
     f_pred_prob_test, f_pred_test) = build_model(tparams, model_options)

    grads = tensor.grad(cost, wrt=list(tparams.values()))

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print('training ... ... ...')

    # Materialised as a list because it is iterated once per epoch.
    kf_test = list(get_minibatches_idx(len(test[0]), batch_size))

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    bad_counter = 0
    last_training_sum_costs = numpy.inf

    try:
        for eidx in range(max_epochs):
            # Get new shuffled index for the training set.  A list is
            # needed because its length is used for the progress output.
            kf = list(get_minibatches_idx(len(train[0]), batch_size,
                                          shuffle=True))
            n_batches = len(kf)

            training_sum_costs = 0

            for train_batch_idx, train_index in kf:
                use_noise.set_value(1.)

                batch_y = [train[1][t] for t in train_index]
                batch_x = [train[0][t] for t in train_index]

                batch_x, batch_mask, batch_y, _ = data.prepare_data(batch_x,
                                                                    batch_y)

                if train_batch_idx % 100 == 0 or train_batch_idx == n_batches - 1:
                    print("%d/%d training bacthes @ epoch = %d" % (train_batch_idx, n_batches, eidx))

                batch_cost = f_grad_shared(batch_x, batch_mask, batch_y)
                f_update(lrate)

                training_sum_costs += batch_cost

            print("sum of costs of all the training samples = ",training_sum_costs,"@ epoch = ", eidx)

            if numpy.isnan(training_sum_costs) or numpy.isinf(training_sum_costs):
                print('bad cost detected: ', training_sum_costs)
                print('End of Program')
                break

            if saveto:
                history_errs.append(training_sum_costs)
                print('Saving ... ... ...')
                params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                with open('%s.pkl' % saveto, 'wb') as options_file:
                    pickle.dump(model_options, options_file, -1)
                print('Savings Done.')

            print('outputing predicted labels of test set ... ... ...')

            output_pred_labels(f_pred_test, data.prepare_data, test, kf_test,
                               verbose=False, path="test_predicted_labels.txt")

            if training_sum_costs >= last_training_sum_costs * 0.9995:
                bad_counter += 1

            last_training_sum_costs = training_sum_costs

            print('bad counter for early stopping : %d/%d' % (bad_counter, patience))

            print('--------------------------------------------------')

            if bad_counter >= patience:
                print('Early Stop!')
                break

    except KeyboardInterrupt:
        print("Training interupted")


if __name__ == '__main__':
    # Script entry point: train with every other option left at its default.
    train_cnn(max_epochs=300, test_size=-1)
