#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys
import os.path
import logging

# Directory that contains this file, with symlinks resolved by os.path.realpath.
_BASE_PATH = os.path.realpath(os.path.dirname(__file__))
# Put the parent directory on the import path; presumably this is what lets the
# `lib.*` and `common.*` imports further down resolve -- confirm before moving it.
sys.path.append(os.path.join(_BASE_PATH, '..'))


import re
import csv
import joblib
import soft404
import numpy as np
from collections import defaultdict
from typing import Any


from lib.dl.linear.src.topic_tagger import TopicTagger
from common.topic import Topic
from common.onion import Onion


# Column headers looked up in each row of the title-keyword CSV that
# Prefilter.init() reads with csv.DictReader.
CATEGORY = 'CATEGORY'
KEYWORDS = 'KEYWORDS'
CO_KEYWORDS = 'CO-KEYWORDS'
# Prefilter.make_regex() skips the word-boundary anchor on a side whose
# column holds 'Y'.
ALLOW_LEFT_CHAR = 'ALLOW-LEFT-CHAR'
ALLOW_RIGHT_CHAR = 'ALLOW-RIGHT-CHAR'


class Prefilter:
    """Rule-based filters applied to an ``Onion`` page.

    All state lives on the class: the title rules and the blacklist are
    loaded lazily from resource files the first time they are needed.
    """

    # Tuple of blacklisted domains; ``None`` until ``init_blacklist`` has run.
    blacklist   = None
    # category -> list of {'keywords': compiled regex, 'co-keywords': tuple of str},
    # filled by ``init``.
    title_rules = defaultdict(list)
    # Pages whose text is shorter than this are reported by ``filter_by_txtlength``.
    THRESHOLD_LENGTH_TEXT  = 100
    # ``soft404.probability`` scores above this are reported by ``filter_by_html``.
    THRESHOLD_PROB_SOFT404 = 0.75

    @classmethod
    def prefilter(cls, onion: Onion, app) -> dict:
        """Run the filters in a fixed order and report the first one that fires.

        Args:
            onion: page to inspect.
            app: object exposing ``logger.debug``; used only for debug logging.

        Returns:
            A one-entry dict keyed by the filter that fired. ``'textlen'`` and
            ``'soft404'`` map to ``True``; ``'listed'``, ``'subdomain'``,
            ``'title'`` and ``'signature'`` map to the value returned by the
            corresponding filter. When no filter fires, every one of those
            keys maps to ``False``.
        """
        # (result key, label used in the debug message, filter) in evaluation order.
        checks = (
            ('textlen', 'NoContents', cls.filter_by_txtlength),
            ('soft404', 'Soft404', cls.filter_by_html),
            ('listed', 'Listed', cls.filter_by_list),
            ('subdomain', 'Subdomain', cls.filter_by_subdomain),
            ('title', 'Title', cls.filter_by_title),
            ('signature', 'Signature', cls.filter_by_signature),
        )
        for key, label, check in checks:
            verdict = check(onion)
            if verdict:
                app.logger.debug('In prefilter - ' + label)
                return {key: verdict}
        # Deriving the keys from ``checks`` keeps the "nothing fired" answer in
        # step with the filters above (it used to omit 'listed').
        return {key: False for key, _, _ in checks}

    @classmethod
    def init(cls):
        """Load the title rules from ``res/signature/title_keyword_v2.csv``.

        Each CSV row contributes one rule to the category named in its
        ``CATEGORY`` column. Rows that cannot be turned into a rule are logged
        and skipped. Calling this again replaces the previously loaded rules
        instead of appending duplicates.
        """
        path = os.path.join(_BASE_PATH, '../res/signature/title_keyword_v2.csv')
        rules = defaultdict(list)
        # newline='' is what the csv module asks for when it is handed a file object.
        with open(path, newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                try:
                    category = row[CATEGORY]
                    pattern = cls.make_regex(row[ALLOW_LEFT_CHAR], row[ALLOW_RIGHT_CHAR], row[KEYWORDS])
                    raw_co = row[CO_KEYWORDS]
                except (KeyError, TypeError, re.error) as exc:
                    # KeyError: a column is missing; TypeError: a short row left
                    # the keyword cell as None; re.error: the pattern did not compile.
                    logging.getLogger(__name__).warning(
                        'Skipping title rule near line %d of %s: %r (%r)',
                        reader.line_num, path, row, exc)
                    continue
                # An empty co-keyword cell becomes ('',): the empty string is
                # contained in every title, so the rule needs no co-keyword.
                co_keywords = tuple(word.lower() for word in raw_co.split(';')) if raw_co else ('', )
                rules[category].append({'keywords': pattern, 'co-keywords': co_keywords})
        # Update in place so every holder of the class-level mapping sees the result.
        cls.title_rules.clear()
        cls.title_rules.update(rules)

    @staticmethod
    def make_regex(left, right, phrase):
        """Compile a case-insensitive pattern that matches ``phrase`` literally.

        Args:
            left: ``'Y'`` lets any character precede the phrase; any other
                value requires a word boundary (``\\b``) before it.
            right: same as ``left`` for the end of the phrase.
            phrase: text to match; it is escaped, so regex metacharacters in
                it have no special meaning.

        Returns:
            The compiled pattern. A compile failure now propagates as
            ``re.error`` rather than yielding ``None``, which callers would
            later trip over when calling ``.search`` on it.
        """
        prefix = '' if left == 'Y' else r'\b'
        suffix = '' if right == 'Y' else r'\b'
        return re.compile(prefix + re.escape(phrase) + suffix, re.I)

    @classmethod
    def filter_by_title(cls, onion: Onion) -> Any:
        """Return the single category whose title rules match, else ``None``.

        A rule matches when its keyword pattern is found in the title and at
        least one of its co-keywords occurs in the title. Both checks ignore
        case. If no category or more than one category matches, the title is
        treated as inconclusive and ``None`` is returned.
        """
        if not cls.title_rules:
            cls.init()
        title = onion.title or ''  # a missing title simply matches nothing
        # Co-keywords are stored lower-cased, so compare against a lower-cased
        # title; the keyword regex already carries re.I.
        lowered = title.lower()
        matched = [
            category
            for category, rules in cls.title_rules.items()
            if any(
                rule['keywords'].search(title) is not None
                and any(co in lowered for co in rule['co-keywords'])
                for rule in rules
            )
        ]
        return matched[0] if len(matched) == 1 else None

    @classmethod
    def filter_by_txtlength(cls, onion: Onion) -> bool:
        """Return True when the page text is shorter than ``THRESHOLD_LENGTH_TEXT``.

        A missing (``None``) text counts as empty.
        """
        return len(onion.text or '') < cls.THRESHOLD_LENGTH_TEXT

    @staticmethod
    def filter_by_subdomain(onion: Onion) -> Any:
        """Return the title-cased PORN topic name if ``Topic.PORN`` matches the subdomain, else ``None``."""
        if Topic.PORN.match_in_subdomain(onion.subdomain):
            return Topic.PORN.name.title()
        return None

    @classmethod
    def filter_by_html(cls, onion: Onion) -> bool:
        """Return True when ``soft404.probability`` of the HTML exceeds ``THRESHOLD_PROB_SOFT404``."""
        # bool() guarantees a plain Python bool whatever type the comparison yields.
        return bool(soft404.probability(onion.html) > cls.THRESHOLD_PROB_SOFT404)

    @classmethod
    def filter_by_signature(cls, onion: Onion) -> Any:
        """Return the title-cased name of the first topic with a signature that accepts ``onion``, else ``None``."""
        for topic in Topic:
            for signature in topic.signatures:
                if signature(onion):
                    return topic.name.title()
        return None

    @classmethod
    def filter_by_list(cls, onion: Onion) -> Any:
        """Return the title-cased name of the first topic whose own list contains the domain, else ``None``."""
        # NOTE: the class-level blacklist (``check_domain_in_list``) is not
        # consulted here; only the per-topic lists are.
        for topic in Topic:
            if topic.check_domain_in_list(onion.domain):
                return topic.name.title()
        return None

    @classmethod
    def check_domain_in_list(cls, onion: Onion) -> bool:
        """Return True when ``onion.domain`` is in the blacklist, loading the list on first use."""
        if cls.blacklist is None:
            cls.init_blacklist()
        return onion.domain in cls.blacklist

    @classmethod
    def init_blacklist(cls):
        """Load ``res/list/blacklist.txt`` into ``cls.blacklist``.

        Entries are stripped of surrounding whitespace; blank lines and lines
        whose first non-blank character is ``#`` are ignored.
        """
        with open(os.path.join(_BASE_PATH, '../res/list/blacklist.txt')) as f:
            entries = (line.strip() for line in f)
            # Test the stripped entry so an indented '#' comment is skipped too.
            cls.blacklist = tuple(entry for entry in entries if entry and not entry.startswith('#'))


class Classifier:
    """Topic classification that combines the FCDL tagger with an SVM pipeline.

    Both models are created while the class body executes, i.e. when this
    module is imported.
    """

    tagger = TopicTagger()
    # joblib.load unpickles the file, so the model file must come from a trusted source.
    pipeline = joblib.load(os.path.join(_BASE_PATH, '../lib/ml/model/svm/drg.prn.fin.oth.including_vb.pipeline'))
    # Lower-cased FCDL labels that ``classify`` reports as the OTHERS topic.
    deprecated = frozenset(('political', 'credential_leaks', 'community', 'personal', 'wiki', 'service_provider', 'paste', 'news_portal', 'search'))

    @staticmethod
    def prob_to_topic(prob):
        """Turn a probability array into a topic number; pass anything else through.

        For a ``numpy.ndarray`` this returns ``np.argmax(prob) + 1``
        (presumably because topic numbers start at 1 -- confirm against
        ``Topic``). Declared as a static method so it behaves the same when
        reached through the class or through an instance.
        """
        if isinstance(prob, np.ndarray):
            return np.argmax(prob) + 1
        return prob

    @classmethod
    def classify(cls, onion: Onion, app) -> dict:
        """Classify ``onion`` and return ``{'topic': <title-cased topic name>}``.

        The FCDL prediction is taken first. A deprecated label is reported as
        the OTHERS topic. A DRUGS or FINANCIAL prediction is replaced by
        whatever the SVM pipeline predicts; any other prediction is returned
        as is.

        Args:
            onion: page to classify.
            app: object exposing ``logger.debug``; used only for debug logging.
        """
        topic, _ = cls.classify_by_fcdl(onion)
        app.logger.debug(f'Classify gets {topic} by FCDL.')
        lowered = topic.lower()
        if lowered in cls.deprecated:
            app.logger.debug(f'Classify returns {Topic.OTHERS.name.title()} because one of the deprecated topics is predicted.')
            return {'topic': Topic.OTHERS.name.title()}

        accepted_as_is = (Topic.PORN.name.lower(), Topic.OTHERS.name.lower())
        rechecked_by_svm = (Topic.DRUGS.name.lower(), Topic.FINANCIAL.name.lower())
        if lowered in accepted_as_is or lowered not in rechecked_by_svm:
            app.logger.debug(f'Classify returns {topic} by FCDL.')
            return {'topic': topic.title()}

        # Only DRUGS / FINANCIAL predictions reach this point.
        topic, _ = cls.classify_by_mcsvm(onion)
        app.logger.debug(f'Classify gets {topic} by MCSVM.')
        app.logger.debug(f'Classify returns {topic} by MCSVM.')
        return {'topic': topic.title()}

    @classmethod
    def classify_by_mcsvm(cls, onion: Onion) -> tuple:
        """Predict with the SVM pipeline.

        Returns:
            ``(topic_name, distribution)``: ``topic_name`` comes from
            ``Topic.get_name`` applied to the arg-max class, and
            ``distribution`` pairs each title-cased ``Topic`` name, in
            iteration order, with the matching entry of the first row of
            ``predict_proba``.
        """
        proba = cls.pipeline.predict_proba([onion.text])
        topic = Topic.get_name(cls.prob_to_topic(proba))
        distribution = dict(zip((t.name.title() for t in Topic), proba.tolist()[0]))
        return topic, distribution

    @classmethod
    def classify_by_fcdl(cls, onion: Onion) -> tuple:
        """Return the result of ``tagger.predict`` for the page text (unpacked by ``classify`` as ``(topic, dist)``)."""
        return cls.tagger.predict([onion.text])


def main():
    """Print what a few prefilter rules return for three hand-made onions."""
    first = Onion('1111111111111111.onion', '', 'tit tits', '')
    second = Onion('porn-cp.1111111111111111.onion', '', 'Pic Dump : 1st Studio Veronika Babko : Fresh Onions', '')
    third = Onion('1111111111111111.onion', '', 'horizon store', '')

    # (label, onion, attribute echoed next to the label, rule to run)
    demos = (
        ('Title: ', first, 'title', Prefilter.filter_by_title),
        ('Title: ', second, 'title', Prefilter.filter_by_title),
        ('Domain:', second, 'domain', Prefilter.filter_by_subdomain),
        ('Domain:', third, 'domain', Prefilter.filter_by_signature),
    )
    for label, onion, attribute, rule in demos:
        shown = getattr(onion, attribute)
        print(label, shown, 'Cat:', rule(onion))


if __name__ == '__main__':
    main()
