#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import re
import sys
import sqlite3
from collections import Counter
from tqdm import tqdm

def file_lines(file_path):
    """Read a dialogue corpus file and return its utterances as a list.

    The file is read as bytes and decoded as UTF-8; undecodable bytes are
    silently dropped. Each line is stripped and then handled by its prefix:

    * a line starting with ``E`` appends an empty string to the result
      (presumably a conversation separator -- confirm against the corpus
      format);
    * a line starting with ``M `` holds ``/``-separated tokens: trailing
      ``.`` tokens are removed, the remaining tokens are concatenated, and
      every run of whitespace in the result is replaced by a full-width
      comma;
    * any other line is ignored.

    Args:
        file_path: Path of the corpus file to read.

    Returns:
        list: Sentences and empty-string entries, in file order.
    """
    with open(file_path, 'rb') as fp:
        content = fp.read().decode('utf8', 'ignore')
    lines = []
    for line in tqdm(content.split('\n')):
        # Splitting on '\n' already removed the newlines; only surrounding
        # whitespace (e.g. a trailing '\r') is left to strip.
        line = line.strip()
        if line.startswith('E'):
            lines.append('')
        elif line.startswith('M '):
            chars = line[2:].split('/')
            # Drop trailing '.' tokens.
            while chars and chars[-1] == '.':
                chars.pop()
            if chars:
                sentence = ''.join(chars)
                # Raw string: '\s' is not a valid escape in a normal literal.
                sentence = re.sub(r'\s+', u'，', sentence)
                lines.append(sentence)
    return lines

def contain_chinese(s):
    """Return True if ``s`` contains at least one Chinese character.

    Only code points U+4E00 through U+9FA5 are considered.

    Args:
        s: Text to inspect (a unicode string).

    Returns:
        bool: True when any character of ``s`` falls in that range,
        False otherwise.
    """
    # The ``u`` prefix makes the \uXXXX escapes real characters on Python 2
    # as well; in a plain Python 2 str literal they are not escapes, so the
    # character class would not describe the intended range.
    # ``re.search`` stops at the first hit instead of collecting all matches.
    return re.search(u'[\u4e00-\u9fa5]', s) is not None

def valid(a, max_len=0):
    """Check that ``a`` is non-empty, contains Chinese and fits ``max_len``.

    Args:
        a: Text to check.
        max_len: Maximum allowed length of ``a``; zero or a negative value
            disables the length limit.

    Returns:
        bool: True when ``a`` passes every check, False otherwise.
    """
    # Guard clause: reject empty text and text without Chinese characters.
    if len(a) == 0 or not contain_chinese(a):
        return False
    unlimited = max_len <= 0
    return unlimited or len(a) <= max_len

def insert(a, b, cur):
    """Insert one ask/answer pair into the ``conversation`` table.

    The values are bound through ``?`` placeholders rather than formatted
    into the SQL text, so quotes and other special characters in ``a`` or
    ``b`` need no manual escaping and cannot alter the statement.

    Args:
        a: Text stored in the ``ask`` column.
        b: Text stored in the ``answer`` column.
        cur: Database cursor on which the statement is executed; the caller
            is responsible for committing.
    """
    cur.execute(
        "INSERT INTO conversation (ask, answer) VALUES (?, ?)",
        (a, b),
    )

def insert_if(question, answer, cur, input_len=500, output_len=500):
    """Insert a question/answer pair and report how many rows were added.

    The pair is currently inserted unconditionally; the length/content
    filter based on ``valid`` is disabled, so ``input_len`` and
    ``output_len`` are accepted but not used.

    Args:
        question: Text passed to ``insert`` as the ask value.
        answer: Text passed to ``insert`` as the answer value.
        cur: Database cursor forwarded to ``insert``.
        input_len: Unused.
        output_len: Unused.

    Returns:
        int: Always 1.
    """
    rows_added = 1
    insert(question, answer, cur)
    return rows_added

def main(file_path):
    """Rebuild the conversation database from a ``#TAB#``-separated file.

    Any existing ``bucket_dbs/bucket_50_50.db`` is deleted, a fresh
    ``conversation (ask, answer)`` table is created, and every line of
    ``file_path`` is split into an ask/answer pair and inserted. Work is
    committed every ``batch_size`` inserted rows and once more at the end,
    after which the total number of inserted rows is printed.

    Args:
        file_path: Path of a UTF-8 encoded file holding one
            ``ask#TAB#answer`` pair per line.

    Raises:
        ValueError: If a line cannot be decoded as UTF-8 or does not split
            into exactly two fields on ``#TAB#``.
    """
    db = 'bucket_dbs/bucket_50_50.db'
    if os.path.exists(db):
        os.remove(db)
    conn = sqlite3.connect(db)
    try:
        cur = conn.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS conversation
            (ask text, answer text);
            """)
        conn.commit()
        # Binary mode yields byte strings on both Python 2 and Python 3, so
        # the explicit UTF-8 decode below works on either interpreter.
        with open(file_path, 'rb') as f:
            num_lines = sum(1 for _ in f)
        inserted = 0
        batch_size = 100000
        with open(file_path, 'rb') as f:
            for line in tqdm(f, total=num_lines):
                ask, ans = line.decode("utf8").strip().split("#TAB#")
                inserted += insert_if(ask, ans, cur)

                # Commit periodically so one transaction does not hold
                # every row of the file.
                if inserted != 0 and inserted % batch_size == 0:
                    conn.commit()

        conn.commit()
    finally:
        # Release the database handle even when a line fails to parse.
        conn.close()

    print('Total inserted: %d' % inserted)

if __name__ == '__main__':
    # Use the sole command-line argument as the input path when exactly one
    # is given; otherwise fall back to the default training file.
    file_path = sys.argv[1] if len(sys.argv) == 2 else './data/train.char3.txt'
    if os.path.exists(file_path):
        main(file_path)
    else:
        print('file {} not exists'.format(file_path))
