import argparse
import contextlib
import pathlib
import shutil
import sys

import skemb.dataset
import skemb.tokenizer
import skemb.sgpattern_miner
import skemb.sgpattern


# Command-line interface. Each option's destination name matches a parameter
# of run(), so the parsed namespace can be forwarded to it as keyword
# arguments.
parser = argparse.ArgumentParser(
    description=(
        'Mine patterns from a dataset with skemb.sgpattern_miner.SGPatternMiner '
        'and save them in an output directory.'
    ),
)

parser.add_argument(
    '--dataset',
    type=pathlib.Path,
    default='dataset/train',
    help='dataset location passed to skemb.dataset.read_dataset '
         '(default: %(default)s)',
)
parser.add_argument(
    '--output-dir', '-o',
    type=pathlib.Path,
    default='sgpatterns',
    help='directory that receives patterns.pickle and log.txt '
         '(default: %(default)s)',
)
parser.add_argument(
    '--no-logging',
    action='store_true',
    help='do not create log.txt in the output directory',
)
parser.add_argument(
    '--force', '-f',
    action='store_true',
    help='remove the output directory first if it already exists',
)

def run(dataset, output_dir, no_logging, force):
    """Mine patterns from a dataset and save them under ``output_dir``.

    The dataset is read with ``skemb.dataset.read_dataset`` as
    ``(label, text)`` pairs, every text is tokenized with
    ``skemb.tokenizer.tokenize``, and the result is handed to
    ``skemb.sgpattern_miner.SGPatternMiner``. The object returned by the
    miner's ``run()`` is saved to ``output_dir / 'patterns.pickle'``.

    Args:
        dataset: Dataset location, forwarded to ``skemb.dataset.read_dataset``.
        output_dir: ``pathlib.Path`` of the directory to create for the
            results. Missing parent directories are created as well.
        no_logging: When true, no ``log.txt`` is created and the miner's
            ``logfile`` attribute is set to ``None``.
        force: When true, an existing ``output_dir`` is removed and
            recreated instead of raising.

    Raises:
        FileExistsError: If ``output_dir`` already exists and ``force`` is
            false.
    """
    if output_dir.exists() and not force:
        raise FileExistsError(
            f'directory {output_dir} exists. Use -f to overwrite it'
        )

    # Read and tokenize the input before touching the output location, so an
    # unreadable dataset cannot wipe the results of a previous run.
    ds = list(skemb.dataset.read_dataset(dataset))
    labels = [label for label, _ in ds]
    texts = [text for _, text in ds]
    sequences = [skemb.tokenizer.tokenize(text) for text in texts]

    if output_dir.exists():
        print(f'Removing existing directory {output_dir}', file=sys.stderr)
        if output_dir.is_dir() and not output_dir.is_symlink():
            shutil.rmtree(output_dir)
        else:
            # shutil.rmtree refuses regular files and symbolic links.
            output_dir.unlink()
    output_dir.mkdir(parents=True)

    with contextlib.ExitStack() as exit_stack:
        log_handle = None
        if not no_logging:
            # The stack closes the log even when mining raises. UTF-8 keeps
            # the log writable regardless of the platform's locale encoding.
            log_handle = exit_stack.enter_context(
                open(output_dir / 'log.txt', 'w', encoding='utf-8')
            )

        miner = skemb.sgpattern_miner.SGPatternMiner(
            sequences=sequences,
            labels=labels,
            str_representations=texts,
            skipgram_k=6,
            min_support=10,
            max_iterations=6,
            pattern_buffer_size=6,
        )
        miner.logfile = log_handle
        patterns = miner.run()
        patterns.save(output_dir / 'patterns.pickle')

if __name__ == '__main__':
    # Script entry point: parse the command line and forward each option to
    # run() under the parameter name it corresponds to.
    cli_options = parser.parse_args()
    run(
        dataset=cli_options.dataset,
        output_dir=cli_options.output_dir,
        no_logging=cli_options.no_logging,
        force=cli_options.force,
    )
