#!/usr/bin/env python3

from __future__ import print_function

import argparse
import datetime
from pymongo import MongoClient
from bson import json_util
from collections import defaultdict
import json
import sys
import string
import random

# Size limit applied by read_span_doc to multi-token spans: a span is stored
# only when (end - start) <= MAX_LENGTH; longer ones are dropped.
MAX_LENGTH = 1000

class ID_Gen(object):
    """Callable that hands out sequentially numbered string IDs.

    Each call returns "<name>.<n>.<suffix>", where n is the number of calls
    made since construction or since the last reset(). With the default
    empty suffix the result ends in a trailing dot.
    """

    def __init__(self, suffix=""):
        self.suffix = suffix
        self.reset()

    def __call__(self, name):
        """Advance the counter and return the next ID built from `name`."""
        self.count = self.count + 1
        fields = (name, self.count, self.suffix)
        return "{}.{}.{}".format(*fields)

    def reset(self):
        """Restart numbering so the next ID issued carries the number 1."""
        self.count = 0

# Module-wide ID generator called by insert_annotation and insert_assignment.
# The script entry point rebinds it to a suffixed ID_Gen when --num-suffix is
# given.
generator = ID_Gen()

def filename_to_name(filename):
    """Return `filename` with every '/' replaced by '_'."""
    return filename.replace('/', '_')

def read_raw_doc(filename):
    """Read a plain-text file as a flat list of whitespace-separated tokens.

    Every line of the file contributes its tokens followed by one newline
    token, so line boundaries survive tokenisation (blank lines contribute
    only the newline token).

    Args:
        filename: Path of the file to read.

    Returns:
        A dict with 'text' (the token list) and 'name' (the path passed
        through filename_to_name). No 'spans' key is produced.
    """
    text = []
    token_count = defaultdict(int)
    # Context manager so the handle is closed even if reading fails part-way.
    with open(filename) as src:
        for line in src:
            for part in line.strip().split():
                text.append(part)
                token_count[part] += 1
            text.append("\n")
    # Warn when more than 10% of the entries are a bare '-'.
    # NOTE(review): presumably because span-annotated files use '-' as the
    # "no span" filler, so this suggests --has-spans was forgotten - confirm.
    if token_count['-'] > 0.1 * len(text):
        print("Note: File was read as plain text without spans.", file=sys.stderr)
    return {'text': text, 'name': filename_to_name(filename)}

def read_span_doc(filename, ignore_spans=False):
    """Read a column-formatted file of tokens with bracketed span annotations.

    Each non-blank line is split on whitespace. Column 0 is not used,
    column 1 is the integer token number, column 2 is the token text, and
    every further column holds '|'-separated span markers:

        (X)   a span covering just this token
        (X    the start of a multi-token span with id X
        X)    the end of the most recently opened span with id X in the
              same column

    For a start marker, X may be written as "id-label". For a single-token
    marker the label is the piece after the first '-'. Without a '-' the
    label is the empty string.

    Args:
        filename: Path of the file to read.
        ignore_spans: When true the file is still parsed (and validated) but
            the returned 'spans' dict is empty.

    Returns:
        A dict with 'text' (list of token strings), 'name' (the path passed
        through filename_to_name) and 'spans', mapping (start, end) token
        numbers to {'start', 'end', 'label'} dicts.

    Raises:
        AssertionError: If two spans cover the same (start, end) range.
        KeyError / IndexError: If an end marker has no matching start.
    """
    text = []

    # Read content
    spans = {}
    # Open multi-token spans, keyed by (column index, span id). Each value is
    # a stack so that nested spans sharing an id close innermost-first.
    incomplete = {}
    # Context manager so the handle is closed even if parsing fails part-way.
    with open(filename) as src:
        for line in src:
            parts = line.strip().split()
            if len(parts) == 0:
                continue
            num = int(parts[1])
            text.append(parts[2])
            for count, item in enumerate(parts[3:]):
                for span in item.split("|"):
                    if span.startswith("("):
                        if span.endswith(")"):
                            # Single token span
                            label = '' if '-' not in span else span[1:-1].split('-')[1]
                            assert (num, num) not in spans
                            spans[num, num] = {'start': num, 'end': num, 'label': label}
                        else:
                            # Start of a multi-token span
                            span_id = span[1:]
                            label = ''
                            if '-' in span:
                                span_id, label = span_id.split('-')
                            incomplete.setdefault((count, span_id), []).append({'start': num, 'end': None, 'label': label})
                    elif span.endswith(")"):
                        # End of a multi-token span
                        span_id = span[:-1]
                        span = incomplete[count, span_id].pop()
                        span['end'] = num
                        # Overly long spans are silently dropped.
                        if span['end'] - span['start'] <= MAX_LENGTH:
                            assert (span['start'], span['end']) not in spans
                            spans[span['start'], span['end']] = span
    if ignore_spans:
        spans = {}
    return {'text': text, 'name': filename_to_name(filename), 'spans': spans}

def insert_text(info, args):
    """Insert the token document for one input file unless it already exists.

    Args:
        info: Document dict providing 'name' and 'text' (list of tokens).
        args: Parsed command-line arguments; `group` and `dry_run` are read.

    Returns:
        The text document's _id ("<group>.<name>"), whether it was inserted,
        skipped because it already existed, or only printed (dry run).

    Outside of a dry run this uses the module-level `db` handle.
    """
    text_id = args.group + '.' + info['name']
    text_doc = {
        "_id": text_id,
        # Built from this document's own name. Reading a module-level `name`
        # here would pick up whatever file the caller happened to load last.
        "filename": args.group + info['name'],
        "tokens": [
            {"_id": n, "text": w} for n, w in enumerate(info['text'])
        ],
    }
    if args.dry_run:
        print("Insert text:", text_id, text_doc)
    elif db.texts.find_one({"_id": text_id}) is not None:
        print("Skipping insertion as document with same text_id is present")
    else:
        text_id = db.texts.insert_one(text_doc).inserted_id
        print("Inserted text:", text_id)
    return text_id

def insert_annotation(info, text_id, args, spans, labels, extra=None):
    """Build and insert the server-side seed annotation for a text.

    Every span becomes its own single-mention cluster and a matching entry
    in 'checks'.

    Args:
        info: Document dict; 'name' is used to build the annotation ID.
        text_id: _id of the text document this annotation refers to.
        args: Parsed command-line arguments; `group`, `interface` and
            `dry_run` are read.
        spans: Mapping of span key to span dict. Keys are normally
            (start, end) tuples; a scalar key is also accepted.
        labels: Label list stored on the annotation.
        extra: Optional mapping of additional fields; these override the
            defaults built here (e.g. 'focus_mention').

    Returns:
        The annotation's _id (generated locally in a dry run, otherwise the
        one reported by the database).

    Uses the module-level `generator`, and `db` outside of a dry run.
    """
    ann_id = generator(args.group + '.' + info['name'])
    clusters = []
    checks = []
    for key, span in spans.items():
        # A scalar key (such as an int) cannot be iterated; treat it as a
        # one-part key instead of raising TypeError.
        key_parts = key if isinstance(key, (tuple, list)) else (key,)
        clusters.append({
            "_id": ':'.join(str(v) for v in key_parts),
            "mentions": [span],
            "color": "#7570b3",
        })
        checks.append({
            "mention": span,
            "decision": args.interface,
        })
    ann = {
        "_id": ann_id,
        "text_id": text_id,
        "user": "server",
        "clusters": clusters,
        "labels": labels,
        "checks": checks,
        "focus_mention": None,
    }
    # `extra` defaults to None rather than a shared mutable dict.
    if extra:
        ann.update(extra)
    if args.dry_run:
        print("Insert annotation:", ann_id, ann)
    else:
        ann_id = db.annotations.insert_one(ann).inserted_id
        print("Inserted annotation:", ann_id)
    return ann_id

def insert_assignment(info, ann_id, text_id, args, extra=None):
    """Build and insert the assignment that points annotators at a text.

    Args:
        info: Document dict; 'name' is used to build the assignment ID, and
            'assign_id' is written back onto it.
        ann_id: _id of the seed annotation.
        text_id: _id of the text document.
        args: Parsed command-line arguments; `group`, `count`, `interface`,
            `min_time`, `tutorial` and `dry_run` are read.
        extra: Optional mapping of additional fields; these override the
            defaults built here. It is not modified.

    Returns:
        None. The resulting assignment _id is stored in info['assign_id'].

    Uses the module-level `generator`, and `db` outside of a dry run.
    """
    assign_id = generator(args.group + '.' + info['name'])
    # Work on a private copy: adding 'tutorial' must not leak into a shared
    # default or into the caller's dict.
    extra = dict(extra) if extra else {}
    if args.tutorial:
        extra['tutorial'] = args.tutorial
    assignment = {
        "_id": assign_id,
        "group": args.group,
        "count": args.count,
        "text_id": text_id,
        "ann_id": ann_id,
        "ui": args.interface,
        "createdAt": datetime.datetime.now(),
        "instructions": args.interface,
        "minTime": args.min_time,
    }
    assignment.update(extra)
    if args.dry_run:
        print("Insert assignment:", assign_id, assignment)
    else:
        assign_id = db.assignments.insert_one(assignment).inserted_id
        print("Inserted assignment:", assign_id)
    info['assign_id'] = assign_id

if __name__ == '__main__':
    # Script entry point: parse options, read every input file, then insert
    # one text document per file plus the annotation/assignment pairs that
    # the chosen interface calls for.
    parser = argparse.ArgumentParser(description='Add an assignment to MongoDB')
    parser.add_argument('-u', '--url',
            help='URL of MongoDB instance.', default="127.0.0.1:3001")
    parser.add_argument('-r', '--dry-run',
            help='Read and prepare but do not access DB.', action='store_true')

    parser.add_argument('group',
            help='Group name for this task.')
    parser.add_argument('-n', "--num-suffix",
            help='Number used to make assignments and annotations unique.')
    parser.add_argument('-d', '--data',
            help='Files with text to annotate.', nargs="+", required=True)
    parser.add_argument('-s', '--has-spans',
            help='The data has spans in it.', action='store_true')
    parser.add_argument('-c', '--count',
            help='People to do annotation.', type=int, required=True)
    parser.add_argument('-mt', '--min-time',
            help='Minimum time annotators must spend on the task.', type=float, required=True)
    parser.add_argument('-l', '--labels',
            help='Manually specify the label set. For labels with spaces, use "label here". For a long and a short version of a label, use "short version::long version".', nargs="+")
    parser.add_argument('-i', '--interface',
            help='User interface.', required=True,
            choices=[
                'sentiment',
                'full',
                'conditionals',
                'link-to-one-before', 'link-to-one-after',
                "one-entity-at-a-time",
                'mentions',
                'check-mentions-include', 'check-mentions-exclude',
                'check-mentions-include-editable', 'check-mentions-exclude-editable',
                'actions', 'conditionals-from-actions',
                'freq-ents',
            ])
    parser.add_argument('-t', '--tutorial',
            help='The name of a tutorial to require.')
    args = parser.parse_args()

    # These interfaces create one annotation per span. Without --has-spans
    # the documents carry no 'spans' entry at all, so reject the combination
    # before anything is read or written.
    per_span = (args.interface == "conditionals-from-actions"
            or args.interface.startswith('link-to-one'))
    if per_span and not args.has_spans:
        parser.error("interface '{}' needs span data; pass --has-spans".format(args.interface))

    if args.num_suffix:
        generator = ID_Gen(args.num_suffix)

    # Connect
    client, db = None, None
    if not args.dry_run:
        client = MongoClient(args.url)
        db = client.meteor

    try:
        # Read data
        data = []
        for name in args.data:
            if args.has_spans:
                # The 'mentions' interface starts from a blank slate, so any
                # spans in the file are parsed but discarded.
                if args.interface == 'mentions':
                    data.append(read_span_doc(name, True))
                else:
                    data.append(read_span_doc(name))
            else:
                data.append(read_raw_doc(name))

        # Label set: the user's list in the order given, otherwise every label
        # seen in the data, sorted so repeated runs store the same list.
        if args.labels:
            candidates = args.labels
        else:
            seen = set()
            for info in data:
                for span in info.get('spans', {}).values():
                    seen.add(span['label'])
            candidates = sorted(seen)
        labels = []
        for label in candidates:
            # Drop the empty label and duplicates.
            if label != '' and label not in labels:
                labels.append(label)

        # Write Data
        for info in data:
            # Insert Text
            text_id = insert_text(info, args)

            if args.interface == "conditionals-from-actions":
                # One annotation per span, containing only that span. The
                # span keeps its (start, end) key so the cluster ID is built
                # from its real position.
                for key, span in info.get('spans', {}).items():
                    ann_id = insert_annotation(info, text_id, args, {key: span}, labels, {"focus_mention": span})
                    insert_assignment(info, ann_id, text_id, args)
            elif args.interface.startswith('link-to-one'):
                # One annotation per span, each containing every span but
                # focused on a different one.
                for span in info.get('spans', {}).values():
                    ann_id = insert_annotation(info, text_id, args, info.get('spans', {}), labels, {"focus_mention": span})
                    insert_assignment(info, ann_id, text_id, args)
            else:
                ann_id = insert_annotation(info, text_id, args, info.get('spans', {}), labels)
                insert_assignment(info, ann_id, text_id, args)
    finally:
        # Release the connection even if reading or inserting failed.
        if client is not None:
            client.close()

