from typing import Any, Dict, Tuple, List, Union
import numpy as np
import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Any, Dict
from transformers import GPT2LMHeadModel, AutoTokenizer
from datasets import load_metric

from torchfly.training import FlyModel
from torchfly.metrics import CategoricalAccuracy, Average, MovingAverage, Speed


class GPTGenerationFlyModel(FlyModel):
    """GPT-2 language model wrapped as a torchfly ``FlyModel``.

    Training reports the language-modeling loss returned by
    ``GPT2LMHeadModel``. Evaluation additionally beam-searches a continuation
    for every prefix in the batch, decodes it, and scores the accumulated
    predictions against the reference texts with the ``squad`` metric.
    """

    # Decoding settings forwarded to ``generate`` in ``predict_step``. They are
    # class attributes so a subclass can override them without copying the
    # whole method; the defaults match the previously hard-coded values.
    GENERATION_MAX_LENGTH = 1024
    GENERATION_NUM_BEAMS = 4

    def __init__(self, config):
        """Load the pretrained model/tokenizer and set up metric state.

        Args:
            config: Configuration object. ``config.task.pretrained_model`` is
                passed to ``from_pretrained`` for both the model and the
                tokenizer.
        """
        super().__init__(config)
        # All three GPT-2 dropout probabilities are overridden to 0.0.
        self.gpt2 = GPT2LMHeadModel.from_pretrained(
            config.task.pretrained_model, resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0
        )
        self.tokenizer = AutoTokenizer.from_pretrained(config.task.pretrained_model)
        # configure metrics here
        self.configure_metrics()
        self.metric = load_metric("squad")
        # Decoded reference / predicted texts accumulated across predict_step
        # calls; cleared in reset_evaluation_metrics.
        self.gts: List[str] = []
        self.preds: List[str] = []
        # Generation stops on the first token id the tokenizer produces for
        # "\n\n". NOTE(review): if the tokenizer splits "\n\n" into several
        # tokens, only the first one is used here - confirm for new tokenizers.
        self.eos_token_id = self.tokenizer.encode("\n\n", add_special_tokens=False)[0]

    def configure_metrics(self):
        """Create the metric trackers used for training and evaluation."""
        self.training_metrics = {"loss": MovingAverage()}
        # The "f1" tracker is never updated inside this class: the F1 that is
        # reported comes from the squad metric in get_evaluation_metrics. The
        # entry is kept so the set of keys stays the same for outside readers.
        self.evaluation_metrics = {"loss": Average(), "f1": Average()}

    def forward(self, batch: Dict[str, Any]) -> Dict[str, Any]:
        """Run a training forward pass and record the loss.

        Args:
            batch: Mapping providing ``input_ids``, ``attention_mask`` and
                ``labels`` for the GPT-2 model.

        Returns:
            The model output (requested with ``return_dict=True``); its
            ``loss`` attribute is the value recorded in the training metrics.
        """
        output = self.gpt2(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
            return_dict=True,
        )
        self.training_metrics["loss"](output.loss.item())
        return output

    def predict_step(self, batch: Dict[str, Any]) -> None:
        """Evaluate one batch: record the loss and collect generated texts.

        Args:
            batch: Mapping providing ``input_ids``, ``attention_mask`` and
                ``labels`` for the loss, ``prefix_input_ids`` and
                ``prefix_attention_mask`` for generation, ``gt_target`` (the
                reference token ids per example) and ``if_empty`` (a per-example
                flag; flagged examples are excluded from text scoring).

        Returns:
            None. Results are accumulated in ``self.evaluation_metrics``,
            ``self.gts`` and ``self.preds``.
        """
        # Nothing here is back-propagated, so make sure no autograd state is
        # kept even if the caller did not already disable gradients.
        with torch.no_grad():
            output = self.gpt2(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
                return_dict=True,
            )
            self.evaluation_metrics["loss"](output.loss.item())

            generated = self.gpt2.generate(
                input_ids=batch["prefix_input_ids"],
                attention_mask=batch["prefix_attention_mask"],
                eos_token_id=self.eos_token_id,
                forced_eos_token_id=self.eos_token_id,
                early_stopping=True,
                max_length=self.GENERATION_MAX_LENGTH,
                num_beams=self.GENERATION_NUM_BEAMS,
                do_sample=False,
                return_dict_in_generate=True,
            )

        # The returned sequences start with the prefix; keep only what follows.
        prefix_length = batch["prefix_input_ids"].shape[1]
        continuations = generated.sequences[:, prefix_length:].tolist()

        for is_empty, gt_tokens, gen_tokens in zip(batch["if_empty"], batch["gt_target"], continuations):
            if is_empty:
                continue
            self.gts.append(self.tokenizer.decode(gt_tokens, skip_special_tokens=True).strip())
            self.preds.append(self.tokenizer.decode(gen_tokens, skip_special_tokens=True).strip())

        return None

    def get_training_metrics(self) -> Dict[str, str]:
        """Return the current training loss formatted for display."""
        loss = self.training_metrics["loss"].get_metric()
        return {"loss": f"{loss:.4f}"}

    def get_evaluation_metrics(self) -> Dict[str, str]:
        """Summarize the evaluation results collected so far.

        Returns:
            A dict of formatted strings with the keys ``loss``, ``ppl``
            (``exp(loss)``), ``f1`` (from the squad metric over the accumulated
            texts, ``0.0`` when none were collected) and ``score`` (``-ppl``).
        """
        loss = self.evaluation_metrics["loss"].get_metric()

        # Reshape the accumulated texts into the record layout handed to the
        # squad metric; prediction and reference are paired through "id".
        predictions = [{"prediction_text": text, "id": str(idx)} for idx, text in enumerate(self.preds)]
        # NOTE(review): answer_start=[100] looks like a placeholder that only
        # satisfies the reference schema - confirm it does not affect scoring.
        references = [
            {"answers": {"answer_start": [100], "text": [text]}, "id": str(idx)} for idx, text in enumerate(self.gts)
        ]
        if predictions:
            results = self.metric.compute(predictions=predictions, references=references)
        else:
            # Nothing to score (e.g. every example was flagged if_empty); use
            # the same keys the squad metric reports.
            results = {"exact_match": 0.0, "f1": 0.0}

        ppl = np.exp(loss)
        # Negated so that a larger score means a lower perplexity.
        # NOTE(review): presumably the trainer maximizes "score" - confirm.
        score = -ppl

        return {
            "loss": f"{loss:8.4f}",
            "ppl": f"{ppl:8.4f}",
            "f1": f"{results['f1']:8.4f}",
            "score": f"{score:8.4f}",
        }

    def validation_loop(self, dataloader):
        """Run the parent validation loop, except before any training step.

        Validation is skipped while ``self.trainer.global_step_count`` is 0.
        """
        if self.trainer.global_step_count > 0:
            super().validation_loop(dataloader)

    def reset_evaluation_metrics(self):
        """Reset the parent's evaluation metrics and drop accumulated texts."""
        super().reset_evaluation_metrics()
        self.gts = []
        self.preds = []
