import numpy as np
import json, string
import xgboost as xgb
import sys, os
from sklearn.metrics import roc_auc_score, roc_curve, auc
import pickle
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score
import random


# Mode switches for this script: `train` enables the training/evaluation
# branch, `test` enables the inference branch that writes confidence scores.
# Enable only one at a time: the training branch reads sys.argv[2] as the
# labels .npy file, while the inference branch opens sys.argv[2] for writing.
train = False
test = True
if train:
    # Training mode: fit an XGBoost binary classifier on a random split of the
    # data (first 49/50 of the shuffled rows for training, the rest held out),
    # report held-out accuracy / ROC-AUC, and pickle the fitted model under a
    # file name that encodes the scores and hyper-parameters.
    #
    # Command line:
    #   argv[1]  .npy file with the feature matrix (indexed as rows x columns)
    #   argv[2]  .npy file with the labels (presumably 0/1 -- confirm)
    #   argv[3]  column subsample ratio, applied bylevel, bynode and bytree
    #   argv[4]  learning rate
    #   argv[5]  number of boosting rounds (n_estimators)
    if len(sys.argv) < 6:
        sys.exit('usage: %s <features.npy> <labels.npy> <colsample> <lr> <iterations>'
                 % sys.argv[0])

    x = np.load(sys.argv[1])
    label = np.load(sys.argv[2])

    colsample = float(sys.argv[3])  # e.g. 0.3
    lr = float(sys.argv[4])  # e.g. 0.3
    iteration = int(sys.argv[5])  # e.g. 50

    print(x.shape, label.shape)
    print(np.sum(label))

    # The shuffle below indexes x with a permutation sized from label, so a
    # length mismatch would either raise IndexError or silently drop rows.
    if x.shape[0] != label.shape[0]:
        raise ValueError('feature rows (%d) and labels (%d) differ in length'
                         % (x.shape[0], label.shape[0]))

    # Number of rows used for training: 49 out of every 50.
    interval = x.shape[0] // 50 * 49
    depth = 25
    k_times = 1

    # Running sums over the k_times repetitions. Deliberately not named `auc`,
    # which would shadow the function imported from sklearn.metrics.
    acc_total = 0.
    auc_total = 0.
    for k in range(k_times):
        # Shuffle features and labels with the same permutation.
        index = np.random.permutation(label.shape[0])
        shuffled_x = x[index]
        shuffled_label = label[index]

        train_data = shuffled_x[:interval, :]
        train_label = shuffled_label[:interval]
        test_data = shuffled_x[interval:, :]
        test_label = shuffled_label[interval:]

        xg_cls = xgb.XGBClassifier(
            objective='binary:logistic',
            colsample_bylevel=colsample,
            colsample_bynode=colsample,
            colsample_bytree=colsample,
            max_depth=depth,
            n_estimators=iteration,
            learning_rate=lr,
        )
        # Metrics are logged on the first 1000 training rows and on the
        # held-out rows after each boosting round.
        # NOTE(review): newer xgboost releases expect `eval_metric` on the
        # constructor rather than on fit() -- confirm against the installed
        # version before upgrading.
        xg_cls.fit(
            train_data,
            train_label,
            eval_metric=['auc', 'logloss'],
            eval_set=[(train_data[:1000], train_label[:1000]), (test_data, test_label)],
            verbose=True,
        )

        # AUC uses the class-1 probability column; accuracy uses hard labels.
        probs = xg_cls.predict_proba(test_data)
        auc_total += roc_auc_score(test_label, probs[:, 1])
        preds = xg_cls.predict(test_data)
        print('k: %d'%k)
        print(sum(preds))
        acc_total += accuracy_score(test_label, preds)

        mean_acc = acc_total / (k + 1)
        mean_auc = auc_total / (k + 1)
        print(mean_acc, mean_auc)

        with open('xgboost_acc@%4.4f_auc@%4.4f_depth@%d_col@%f_lr@%4.4f_iter@%d'%(mean_acc, mean_auc, depth, colsample, lr, iteration), 'wb') as f:
            pickle.dump(xg_cls, f)
    print(acc_total / k_times, auc_total / k_times)

if test:
    # Inference mode: load a pickled classifier, score the input matrix and
    # write one confidence score per line to a text file. No labels are loaded
    # here, so no accuracy / AUC is computed.
    #
    # Command line:
    #   argv[1]  .npy matrix; every column except the last is fed to the model
    #   argv[2]  output text file for the confidence scores
    #   argv[3]  (optional) path of the pickled model; defaults to the
    #            checkpoint named below
    default_model = 'xgboost_acc@0.7620_auc@0.8465_depth@25_col@0.500000_lr@0.5000_iter@100'
    if len(sys.argv) < 3:
        sys.exit('usage: %s <features.npy> <scores.txt> [model.pkl]' % sys.argv[0])
    model_path = sys.argv[3] if len(sys.argv) > 3 else default_model

    # The last column is dropped before prediction -- presumably it holds the
    # label; confirm against the script that produced the .npy file.
    test_data = np.load(sys.argv[1])[:, :-1]

    # NOTE: unpickling can execute arbitrary code; only load model files that
    # come from a trusted source.
    with open(model_path, 'rb') as model_file:
        xg_cls = pickle.load(model_file)

    preds = xg_cls.predict_proba(test_data)

    # Confidence score = column 1 of predict_proba (the score for class 1).
    conf = preds[:, 1].tolist()
    with open(sys.argv[2], 'w') as fw:
        print('writing to %s ...'%(sys.argv[2]))
        fw.writelines(str(score) + '\n' for score in conf)



