import os
import random
from typing import List, Tuple, Dict, NoReturn, Optional, Any
import pandas as pd
import pathlib
import logging
import IPython
import omegaconf
import hydra

from tqdm import tqdm, tqdm_pandas
# Register tqdm's pandas integration. `TextLookupGenerator.generate` calls
# `DataFrame.progress_apply`, which is only available after this call.
tqdm.pandas()

from Lexicalization.Utils import OMCSBase
from Utils import CoreQuisiteBase

logger = logging.getLogger(__name__)


class TextLookupGenerator(CoreQuisiteBase, OMCSBase):
    """Turn sentences found for each predicate into enabling/disabling prompts.

    For every predicate in ``USEFUL_PREDICATE`` the generator looks the
    predicate's base form up in the cached files named by ``cn_web_paths`` and
    converts each hit into an ``enabling`` (``"<sentence>, only if"``) and/or
    ``disabling`` (``"<sentence>, unless"``) prompt, depending on the
    predicate's entry in ``COND_TYPE``.

    NOTE(review): ``USEFUL_PREDICATE``, ``COND_TYPE``, ``cn_web_paths``,
    ``cache_path``, ``_lexicalize_predicate``, ``_get_predicate_base`` and
    ``_lookup_in_file`` are not defined in this class; they are presumably
    inherited from ``CoreQuisiteBase`` / ``OMCSBase`` -- confirm there.
    """

    # Columns emitted by `_lexicalize`, in the order it emits them. Used to
    # build a correctly shaped empty result when nothing was matched.
    _OUTPUT_COLUMNS = ('weight', 'predicate', 'sentence', 'enabling', 'disabling')
    # Upper bound on the number of rows written to the preview file.
    _PREVIEW_SIZE = 20
    # Preview file, relative to the current working directory.
    _PREVIEW_PATH = 'Predicates.txt'

    def __init__(self, config: omegaconf.DictConfig) -> None:
        """Store and validate ``config``; optionally narrow the predicate list.

        Args:
            config: Must contain ``graph_path`` (``.graphml``) and
                ``output_path`` (``.csv``). If it has a non-null
                ``predicates`` list, that list replaces ``USEFUL_PREDICATE``
                on this instance (lower-cased).

        Raises:
            AssertionError: If the config is invalid, or ``predicates`` is not
                a ``ListConfig`` of strings already present in
                ``USEFUL_PREDICATE``.
        """
        super().__init__()
        self.config = config
        self._check_config()
        if 'predicates' in self.config and self.config.predicates is not None:
            assert isinstance(self.config.predicates, omegaconf.listconfig.ListConfig), f'{type(self.config.predicates)}'
            # Membership is tested on the lower-cased name, so the configured
            # predicates may differ in case from the known ones.
            assert all(
                isinstance(s, str) and (s.lower() in self.USEFUL_PREDICATE)
                for s in self.config.predicates
            ), f'{self.config.predicates}'
            logger.warning(f'Updating USEFUL_PREDICATE: {self.config.predicates}')
            # Assigning through `self` only affects this instance.
            self.USEFUL_PREDICATE = [p.lower() for p in self.config.predicates]

    def _check_config(self) -> None:
        """Assert that ``self.config`` has the required keys and file suffixes.

        Raises:
            AssertionError: If ``graph_path`` or ``output_path`` is missing,
                ``graph_path`` does not end in ``.graphml``, or
                ``output_path`` does not end in ``.csv``.
        """
        assert {'graph_path', 'output_path'}.issubset(self.config.keys()), \
            f'Invalid config {self.config}'
        assert pathlib.Path(self.config.graph_path).suffix == '.graphml', 'Only supporting graphml for KG format'
        assert pathlib.Path(self.config.output_path).suffix == '.csv', 'Only supporting csv for output format'

    def generate(self, materialize: bool = True) -> pd.DataFrame:
        """Look up every useful predicate and lexicalize the matches.

        Args:
            materialize: When true, also write the result to
                ``config.output_path`` and a small random preview to
                ``Predicates.txt``.

        Returns:
            One row per matched sentence with the columns ``weight``,
            ``predicate``, ``sentence``, ``enabling`` and ``disabling``.
            The frame is empty (but has those columns) when nothing matched.

        Raises:
            AssertionError: If one of the expected cache files is missing.
        """
        all_matches = []
        for p in tqdm(self.USEFUL_PREDICATE, desc='predicate'):
            logger.debug('predicate: %s', p)
            p_lex = self._lexicalize_predicate(p)
            logger.debug('p_lex: %s', p_lex)
            p_base = self._get_predicate_base(p_lex)
            logger.debug('p_base: %s', p_base)

            matches = []
            for wp in self.cn_web_paths:
                # Only the last path component of each entry names the cached file.
                file_path = self.cache_path / wp.split('/')[-1]
                assert file_path.exists(), f'{file_path} does not exist.'
                matches += self._lookup_in_file(words=[p_base], path=file_path, keep_score=True)

            # Skip predicates without hits: they contribute no rows, and
            # leaving them out keeps empty frames away from `pd.concat`.
            if matches:
                partial_df = pd.DataFrame(matches, columns=['sentence', 'score'])
                partial_df['predicate'] = p
                all_matches.append(partial_df)

        if all_matches:
            df = pd.concat(all_matches, axis=0).progress_apply(self._lexicalize, axis=1)
        else:
            # `pd.concat` rejects an empty list, and a row-wise apply on an
            # empty frame would not create the lexicalized columns, so the
            # empty result is built explicitly.
            logger.warning('No sentence matched any predicate; the result is empty.')
            df = pd.DataFrame(columns=list(self._OUTPUT_COLUMNS))

        if materialize:
            self._materialize(df)

        return df

    def _materialize(self, df: pd.DataFrame) -> None:
        """Write ``df`` to ``config.output_path`` plus a preview file.

        The preview holds at most ``_PREVIEW_SIZE`` randomly sampled prompts,
        followed by the predicate list and the total number of prompts.
        """
        # Stacking moves the prompt columns into the index, one row per prompt.
        stacked: pd.Series = df.set_index(['predicate', 'weight', 'sentence']).stack()
        stacked.to_csv(self.config.output_path, header=True)

        # Sampling more rows than exist raises, so cap the sample size.
        preview_size = min(self._PREVIEW_SIZE, len(stacked))
        preview = stacked.sample(n=preview_size, axis=0) if preview_size else stacked
        preview.to_csv(self._PREVIEW_PATH, index=False, header=False)
        with open(self._PREVIEW_PATH, 'a', encoding='utf-8') as fp:
            fp.write('Predicates:' + str(self.USEFUL_PREDICATE) + '\n')
            fp.write(f'Total #: {len(stacked)}')

    def _lexicalize(self, lex_edge: pd.Series) -> pd.Series:
        """Convert one matched row into its enabling/disabling prompts.

        Args:
            lex_edge: Row with the entries ``predicate``, ``sentence`` and
                ``score``.

        Returns:
            Series with ``weight`` (the score), ``predicate``, ``sentence``
            (all ``'.'`` removed), and ``enabling`` / ``disabling``, each
            ``None`` when the predicate's condition type excludes it.

        Raises:
            KeyError: If the lower-cased predicate is not in ``COND_TYPE``.
        """
        pred, sent, score = lex_edge[['predicate', 'sentence', 'score']]

        # Strip every period so the suffix can be appended to a bare sentence.
        sent = sent.replace('.', '')
        cond_type = self.COND_TYPE[pred.lower()]

        return pd.Series({
            'weight': score,
            'predicate': pred,
            'sentence': sent,
            'enabling': f'{sent}, only if' if cond_type in ('both', 'enabling') else None,
            'disabling': f'{sent}, unless' if cond_type in ('both', 'disabling') else None,
        })


@hydra.main("../Configs/ClozeGeneratorConfig.yaml")
def main(config: omegaconf.DictConfig) -> Optional[Any]:
    """Build a ``TextLookupGenerator`` from ``config``, run it, and return ``0``."""
    TextLookupGenerator(config).generate()
    return 0


# Script entry point. `main` is called without arguments here;
# NOTE(review): the `hydra.main` decorator presumably supplies `config` -- confirm.
if __name__ == '__main__':
    main()
