#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Wav2letter decoders.
"""

import gc
import itertools as it
import os.path as osp
import warnings
from collections import deque, namedtuple

import numpy as np
import torch
from examples.speech_recognition.data.replabels import unpack_replabels
from fairseq import tasks, utils
from fairseq.utils import apply_to_sample
from fairseq.data.data_utils import post_process
from fairseq.data import data_utils


# The wav2letter bindings are optional at import time: when they are missing we
# only warn and provide placeholder base classes so that the class statements
# further down (which subclass ``LM`` / annotate with ``LMState``) still work.
try:
    from wav2letter.common import create_word_dict, load_words
    from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes
    from wav2letter.decoder import (
        CriterionType,
        DecoderOptions,
        KenLM,
        LM,
        LMState,
        SmearingMode,
        Trie,
        LexiconDecoder,
        LexiconFreeDecoder,
    )
except ImportError:
    # Only a failed import is expected here; anything else (KeyboardInterrupt,
    # SystemExit, unrelated bugs) must propagate instead of being swallowed.
    warnings.warn(
        "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings"
    )
    LM = object
    LMState = object


class W2lDecoder(object):
    """Base class for the decoders in this file.

    Stores the target dictionary and the criterion-specific configuration
    (blank index, ASG transitions) and provides the shared helpers that run
    the model (``get_emissions``) and clean up decoded index sequences
    (``get_tokens``).  Subclasses are expected to provide ``decode``.
    """

    def __init__(self, args, tgt_dict):
        """Configure the decoder from ``args`` and ``tgt_dict``.

        Reads ``args.nbest`` and ``args.criterion``; for ``"asg_loss"`` it also
        reads ``args.asg_transitions`` and ``args.max_replabel``.

        Raises:
            RuntimeError: if ``args.criterion`` is not one of ``"ctc"``,
                ``"ctc_mlm"``, ``"ctc2_mlm"`` or ``"asg_loss"``.
        """
        self.tgt_dict = tgt_dict
        self.blank_idx = tgt_dict.bos()
        self.vocab_size = len(tgt_dict)
        self.nbest = args.nbest
        # When True, emissions come from ``get_normalized_ctc_probs``.
        self.encoder_ctc = False
        # criterion-specific init
        if args.criterion == "ctc" or args.criterion == 'ctc_mlm':
            self.criterion_type = CriterionType.CTC
            self.blank = self._ctc_blank_index(tgt_dict)
            self.asg_transitions = None
        elif args.criterion == "asg_loss":
            self.criterion_type = CriterionType.ASG
            self.blank = -1
            self.asg_transitions = args.asg_transitions
            self.max_replabel = args.max_replabel
            assert len(self.asg_transitions) == self.vocab_size ** 2
        elif args.criterion == 'ctc2_mlm':
            self.encoder_ctc = True
            # This branch used to leave ``criterion_type`` unset, so
            # ``get_tokens`` (and subclasses reading the attribute) failed
            # with AttributeError.  The branch otherwise configures a CTC
            # blank, so it is treated as CTC.  The lookup is guarded because
            # this criterion previously worked without the wav2letter
            # bindings being importable.
            try:
                self.criterion_type = CriterionType.CTC
            except NameError:
                self.criterion_type = None
            self.blank = self._ctc_blank_index(tgt_dict)
            self.asg_transitions = None
        else:
            raise RuntimeError(f"unknown criterion: {args.criterion}")

    @staticmethod
    def _ctc_blank_index(tgt_dict):
        """Return the index of ``<ctc_blank>`` if the dictionary has it, else ``bos()``."""
        if "<ctc_blank>" in tgt_dict.indices:
            return tgt_dict.index("<ctc_blank>")
        return tgt_dict.bos()

    def generate(self, models, sample, need_prev_output=False, **unused):
        """Generate a batch of inferences.

        Args:
            models: sequence of models; only ``models[0]`` is used.
            sample: dict with a ``"net_input"`` dict that is forwarded to the
                model as keyword arguments.
            need_prev_output: when true, ``prev_output_tokens`` is kept in the
                model input and the per-utterance lengths derived from the
                padding mask are passed to ``decode``; otherwise
                ``prev_output_tokens`` is dropped and ``decode`` receives the
                emissions only.

        Returns:
            Whatever the subclass ``decode`` returns.
        """
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        net_input = sample["net_input"]
        if need_prev_output:
            emissions, padding_mask = self.get_emissions(models, dict(net_input))
            # Number of non-padded positions per batch row.
            input_lengths = (~padding_mask).long().sum(-1)
            return self.decode(emissions, input_lengths)

        encoder_input = {
            k: v for k, v in net_input.items() if k != "prev_output_tokens"
        }
        emissions, _ = self.get_emissions(models, encoder_input)
        return self.decode(emissions)

    def get_emissions(self, models, encoder_input, log_probs=True):
        """Run ``models[0]`` and normalize its emissions.

        Returns:
            A tuple ``(emissions, padding_mask)``: the emissions with their
            first two dimensions swapped, converted to a contiguous float CPU
            tensor, and ``encoder_out["padding_mask"]`` as returned by the
            model.  (The original comments describe the model output as
            ``T x B x C``, i.e. the result would be ``B x T x C`` -- confirm
            against the model.)
        """
        encoder_out = models[0](**encoder_input)
        if self.encoder_ctc:
            emissions = models[0].get_normalized_ctc_probs(encoder_out, log_probs=log_probs)
        elif self.criterion_type == CriterionType.CTC:
            emissions = models[0].get_normalized_probs(encoder_out, log_probs=log_probs)
        elif self.criterion_type == CriterionType.ASG:
            # ASG consumes the raw encoder output without normalization.
            emissions = encoder_out["encoder_out"]
        return emissions.transpose(0, 1).float().cpu().contiguous(), encoder_out["padding_mask"]

    def get_tokens(self, idxs):
        """Normalize tokens by handling CTC blank, ASG replabels, etc.

        Consecutive duplicates are collapsed first; then blanks are removed
        (CTC) or negative indices are removed and replabels unpacked (ASG).

        Returns:
            A ``torch.LongTensor`` with the remaining indices.
        """
        idxs = (g[0] for g in it.groupby(idxs))
        if self.criterion_type == CriterionType.CTC:
            idxs = filter(lambda x: x != self.blank, idxs)
        elif self.criterion_type == CriterionType.ASG:
            idxs = filter(lambda x: x >= 0, idxs)
            idxs = unpack_replabels(list(idxs), self.tgt_dict, self.max_replabel)
        return torch.LongTensor(list(idxs))


class W2lViterbiDecoder(W2lDecoder):
    """Decoder that returns the single Viterbi path computed by ``CpuViterbiPath``."""

    def __init__(self, args, tgt_dict):
        super().__init__(args, tgt_dict)

    def decode(self, emissions, input_lengths=None):
        """Compute the Viterbi path for every utterance in the batch.

        Args:
            emissions: 3-D tensor unpacked as ``(B, T, N)``.
            input_lengths: optional per-utterance number of valid frames.  The
                inherited ``generate(need_prev_output=True)`` passes this
                argument, which previously raised ``TypeError`` because
                ``decode`` did not accept it.  When given, each path is cut to
                its valid length before token clean-up so padded frames do not
                contribute tokens.

        Returns:
            A list with one single-element list per utterance, each holding a
            dict with ``"tokens"`` (see ``get_tokens``) and ``"score"`` (always 0).
        """
        B, T, N = emissions.size()
        if self.asg_transitions is None:
            # No learned transitions (CTC case): use an all-zero matrix.
            transitions = torch.FloatTensor(N, N).zero_()
        else:
            transitions = torch.FloatTensor(self.asg_transitions).view(N, N)
        # Output buffer filled in place by CpuViterbiPath.compute.
        viterbi_path = torch.IntTensor(B, T)
        workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N))
        CpuViterbiPath.compute(
            B,
            T,
            N,
            get_data_ptr_as_bytes(emissions),
            get_data_ptr_as_bytes(transitions),
            get_data_ptr_as_bytes(viterbi_path),
            get_data_ptr_as_bytes(workspace),
        )

        hypotheses = []
        for b in range(B):
            path = viterbi_path[b].tolist()
            if input_lengths is not None:
                path = path[: int(input_lengths[b])]
            hypotheses.append([{"tokens": self.get_tokens(path), "score": 0}])
        return hypotheses

from examples.speech_recognition.ctc_beam_search import ctcBeamSearch
import copy

import sys
sys.path.append("/wav2bert/fairseq/bert")

from tokenization import BasicTokenizer, BertTokenizer
from modeling import BertModel, BertForPreTraining, BertModelWithAdapter, BertForMaskedLM
from multiprocessing import Process, Manager
from fairseq import checkpoint_utils

class W2lDecoderBertRescore(W2lDecoder):
    """CTC beam search followed by masked-LM rescoring of the best hypotheses.

    ``generate`` runs the acoustic model, ``beamDecode`` produces beam-search
    hypotheses with ``ctcBeamSearch``, keeps the first ``top_k_size`` of them,
    scores each with the language model by masking one sub-word at a time and
    returns the hypothesis with the best interpolated score.
    """

    def __init__(self, args, tgt_dict):
        """Load the language model, its tokenizer and the index list.

        Reads from ``args``: ``index_list_path``, ``top_k_size``,
        ``score_lamda``, ``process_num``, ``kenlm_model`` (optional),
        ``decoder_bert_model_name``, ``beam`` and ``lm_batch_size``.
        """
        super().__init__(args, tgt_dict)

        self.index_list_path = args.index_list_path
        self.top_k_size = args.top_k_size
        # NOTE(review): hard-coded mask token id (an earlier version used
        # ``tgt_dict.mask()``); presumably the [MASK] id of the BERT
        # vocabulary in use -- confirm.
        self.mask_idx = 103
        # Interpolation weight of the acoustic score; the LM gets ``1 - lamda``.
        self.lamda = args.score_lamda

        self.process_num = args.process_num

        if getattr(args, 'kenlm_model', None) is not None:
            # Despite the option name, a fairseq checkpoint is loaded here.
            self.lm = checkpoint_utils.load_model_ensemble([args.kenlm_model])
            self.lm = self.lm[0][0]
            self.train_lm = True
            print(self.lm)
        else:
            self.lm = BertForMaskedLM.from_pretrained(args.decoder_bert_model_name)
            self.train_lm = False
        self.lm.to("cuda:0" if torch.cuda.is_available() else "cpu")
        self.bert_tokenizer = BertTokenizer.from_pretrained(args.decoder_bert_model_name)

        # One integer per line; the file handle is closed deterministically.
        with open(self.index_list_path, 'r') as index_file:
            self.lang_indexs = [int(line.strip()) for line in index_file]

        self.beam_width = args.beam
        self.lm_batch_size = args.lm_batch_size

    @staticmethod
    def _make_hypo(decoded_id):
        """Wrap decoded token ids in the hypothesis dict returned to callers."""
        return {
            'tokens': decoded_id,
            'score': 0.0,
            'attention': None,  # src_len x tgt_len
            'alignment': None,
            'positional_scores': torch.Tensor([0.0]),
        }

    def generate(self, models, sample, need_prev_output=False, **unused):
        """Generate a batch of inferences.

        Emissions are requested with ``log_probs=False`` and handed to
        ``beamDecode`` together with the per-utterance lengths derived from
        the padding mask.
        """
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        if need_prev_output:
            encoder_input = {
                k: v for k, v in sample["net_input"].items()
            }
        else:
            encoder_input = {
                k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
            }
        # CTC output probabilities; the top hypotheses derived from them are
        # rescored afterwards.
        emissions, padding_mask = self.get_emissions(models, encoder_input, log_probs=False)

        non_padding_mask = ~padding_mask
        input_lengths = non_padding_mask.long().sum(-1)
        return self.beamDecode(emissions, input_lengths)

    def beamDecodeOne(self, process_count, emissions, input_lengths):
        """Run and time the beam search for each utterance.

        Only the timing is reported: the search results are discarded and the
        returned list is always empty.  ``process_count`` is unused.
        """
        import time

        tokens_list = []
        for lp, inp_l in zip(emissions, input_lengths):
            lp = lp[:inp_l]

            beam_search_start = time.time()
            ctcBeamSearch(lp, self.blank_idx, self.lang_indexs, None, beamWidth=self.beam_width)
            # The original passed the elapsed time as a second print argument,
            # leaving the "{}" placeholder unformatted.
            print("beam search time: {}".format(time.time() - beam_search_start))

        return tokens_list

    def _masked_lm_score(self, seq, pad_idx, device):
        """Sum the LM log-probabilities of the sub-words of ``seq``.

        Every position except the first and the last two is replaced by
        ``self.mask_idx`` in turn; the LM log-probability of the original
        sub-word at that position is accumulated.

        NOTE(review): ``seq`` is a row of the padded hypothesis batch, so for
        hypotheses shorter than the longest one the skipped trailing positions
        are presumably padding rather than the final '.' and separator --
        confirm this is intended.

        Returns:
            The summed log-probability as a Python float (0.0 when ``seq`` has
            no maskable position).
        """
        masked_inputs = []
        targets = []
        for pos in range(1, len(seq) - 2):
            masked = seq.clone()
            masked[pos] = self.mask_idx
            masked_inputs.append(masked)
            targets.append(seq[pos])

        lm_score = 0.0
        # Inference only: no autograd graph is needed for scoring.
        with torch.no_grad():
            for start in range(0, len(masked_inputs), self.lm_batch_size):
                chunk = masked_inputs[start:start + self.lm_batch_size]
                one_batch = data_utils.collate_tokens(chunk, pad_idx=pad_idx, left_pad=False)
                one_batch = utils.move_to_cuda(one_batch, device=device)

                if self.train_lm:
                    output = self.lm(prev_output_tokens=one_batch)['decoder_out']
                else:
                    output = self.lm(one_batch)

                output = torch.nn.functional.log_softmax(output, dim=-1)
                for row in range(output.shape[0]):
                    # Masked input ``k`` has its mask at position ``k + 1``
                    # because position 0 was skipped.
                    k = start + row
                    lm_score += output[row][k + 1][targets[k]]
        return float(lm_score)

    def beamDecode(self, emissions, input_lengths):
        """Beam-search every utterance and return the best rescored hypothesis.

        Args:
            emissions: 3-D tensor whose first dimension indexes the utterances.
            input_lengths: per-utterance number of valid frames.

        Returns:
            A list with one single-element list per utterance, each holding
            the hypothesis dict built by ``_make_hypo``.
        """
        bsz, _, _ = emissions.size()
        finalized = [[] for _ in range(bsz)]
        pad_idx = self.bert_tokenizer.pad()

        for i, (lp, inp_l) in enumerate(zip(emissions, input_lengths)):
            lp = lp[:inp_l].cpu().numpy()
            # Each entry is a (sequence, probability) pair.
            result = ctcBeamSearch(lp, self.blank_idx, self.lang_indexs, None, beamWidth=self.beam_width, tokenizer=self.tgt_dict)

            # Keep only the first ``top_k_size`` hypotheses for rescoring.
            result = result[:self.top_k_size]

            # Re-tokenize every hypothesis with the LM tokenizer.
            lm_inputs = []
            for seq, _ in result:
                targ_units = self.tgt_dict.string(seq)
                targ_words = post_process(targ_units, "letter")
                targ_words += '.'
                # NOTE(review): the keyword is spelled ``post_proces``;
                # presumably it matches the tokenizer's signature -- confirm.
                word_tgt_tokens = self.bert_tokenizer.encode_line(targ_words, post_proces='bert_bpe_piece')
                word_tgt_tokens.insert(0, self.bert_tokenizer.cls())
                word_tgt_tokens.append(self.bert_tokenizer.sep())
                lm_inputs.append(torch.LongTensor(list(word_tgt_tokens)))
            lm_inputs = data_utils.collate_tokens(lm_inputs, pad_idx=pad_idx, left_pad=False)
            lm_inputs = utils.move_to_cuda(lm_inputs, device=inp_l.device)

            rescored = []
            for (origin_seq, prob), seq in zip(result, lm_inputs):
                am_score = np.log(prob)
                if self.lamda < 1:
                    lm_score = self._masked_lm_score(seq, pad_idx, inp_l.device)
                    # Weighted combination of acoustic and LM log-scores.
                    total_score = self.lamda * am_score + (1 - self.lamda) * lm_score
                else:
                    # The LM weight is zero, so the LM is not evaluated at all.
                    total_score = am_score
                rescored.append((origin_seq, total_score))

            # Only the best-scoring hypothesis is returned.
            toks, _ = max(rescored, key=lambda item: item[1])
            finalized[i].append(self._make_hypo(torch.LongTensor(list(toks))))
        return finalized

    def decode(self, emissions, input_lengths):
        """Greedy decoding: frame-wise argmax, collapse repeats, drop ``blank_idx``.

        Returns:
            A list with one single-element list per utterance, each holding
            the hypothesis dict built by ``_make_hypo``.
        """
        bsz, _, _ = emissions.size()
        finalized = [[] for _ in range(bsz)]
        for i, (lp, inp_l) in enumerate(zip(emissions, input_lengths)):
            lp = lp[:inp_l].unsqueeze(0)
            toks = lp.argmax(dim=-1).unique_consecutive()
            pred_units_arr = toks[toks != self.blank_idx].tolist()
            finalized[i].append(self._make_hypo(torch.LongTensor(list(pred_units_arr))))
        return finalized

class W2lGreedyDecoder(W2lDecoder):
    """Greedy decoder: frame-wise argmax, repeats collapsed, ``blank_idx`` removed."""

    def __init__(self, args, tgt_dict):
        super().__init__(args, tgt_dict)

    def generate(self, models, sample, need_prev_output=False, **unused):
        """Generate a batch of inferences.

        ``prev_output_tokens`` is forwarded to the model only when
        ``need_prev_output`` is true.  The per-utterance lengths passed to
        ``decode`` are the counts of non-padded positions in the padding mask
        returned by ``get_emissions``.
        """
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        print("In W2lGreedyDecoder func generate...")
        encoder_input = {
            name: value
            for name, value in sample["net_input"].items()
            if need_prev_output or name != "prev_output_tokens"
        }
        emissions, padding_mask = self.get_emissions(models, encoder_input)
        input_lengths = (~padding_mask).long().sum(-1)
        return self.decode(emissions, input_lengths)

    def decode(self, emissions, input_lengths):
        """Return one greedy hypothesis per utterance.

        Each utterance is cut to its valid length, reduced to its frame-wise
        argmax, consecutive duplicates are merged and entries equal to
        ``self.blank_idx`` are dropped.

        Returns:
            A list with one single-element list per utterance; each element is
            a dict with the keys ``tokens``, ``score``, ``attention``,
            ``alignment`` and ``positional_scores``.
        """
        batch_size, _, _ = emissions.size()
        hypotheses = [[] for _ in range(batch_size)]
        for row, (frame_scores, length) in enumerate(zip(emissions, input_lengths)):
            valid_frames = frame_scores[:length].unsqueeze(0)
            collapsed = valid_frames.argmax(dim=-1).unique_consecutive()
            kept = collapsed[collapsed != self.blank_idx].tolist()
            hypotheses[row].append(
                {
                    'tokens': torch.LongTensor(list(kept)),
                    'score': 0.0,
                    'attention': None,  # src_len x tgt_len
                    'alignment': None,
                    'positional_scores': torch.Tensor([0.0]),
                }
            )
        return hypotheses


class W2lKenLMDecoder(W2lDecoder):
    """Lexicon-constrained beam-search decoder backed by a KenLM language model."""

    def __init__(self, args, tgt_dict):
        """Build the lexicon trie, the KenLM wrapper and the ``LexiconDecoder``.

        Reads from ``args``: ``lexicon``, ``kenlm_model``, ``beam``,
        ``beam_size_token`` (optional, defaults to ``len(tgt_dict)``),
        ``beam_threshold``, ``lm_weight``, ``word_score``, ``unk_weight`` and
        ``sil_weight``.
        """
        super().__init__(args, tgt_dict)

        self.silence = (
            tgt_dict.index("<ctc_blank>")
            if "<ctc_blank>" in tgt_dict.indices
            else tgt_dict.bos()
        )
        self.lexicon = load_words(args.lexicon)
        self.word_dict = create_word_dict(self.lexicon)
        self.unk_word = self.word_dict.get_index("<unk>")

        self.lm = KenLM(args.kenlm_model, self.word_dict)
        self.trie = Trie(self.vocab_size, self.silence)

        # Insert every spelling of every lexicon word, scored by the LM from
        # its start state.
        start_state = self.lm.start(False)
        for word, spellings in self.lexicon.items():
            word_idx = self.word_dict.get_index(word)
            _, score = self.lm.score(start_state, word_idx)
            for spelling in spellings:
                spelling_idxs = [tgt_dict.index(token) for token in spelling]
                assert (
                    tgt_dict.unk() not in spelling_idxs
                ), f"{spelling} {spelling_idxs}"
                self.trie.insert(spelling_idxs, word_idx, score)
        self.trie.smear(SmearingMode.MAX)

        self.decoder_opts = DecoderOptions(
            args.beam,
            int(getattr(args, "beam_size_token", len(tgt_dict))),
            args.beam_threshold,
            args.lm_weight,
            args.word_score,
            args.unk_weight,
            args.sil_weight,
            0,
            False,
            self.criterion_type,
        )

        if self.asg_transitions is None:
            # No transitions were configured: the decoder gets an empty list.
            self.asg_transitions = []

        self.decoder = LexiconDecoder(
            self.decoder_opts,
            self.trie,
            self.lm,
            self.silence,
            self.blank,
            self.unk_word,
            self.asg_transitions,
            False,
        )

    def generate(self, models, sample, need_prev_output=False, **unused):
        """Generate a batch of inferences.

        With ``need_prev_output`` the full ``net_input`` is forwarded to the
        model and the per-utterance lengths derived from the padding mask are
        passed to ``decode``; otherwise ``prev_output_tokens`` is dropped and
        the whole (padded) emission matrix is decoded.
        """
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        net_input = sample["net_input"]
        if need_prev_output:
            emissions, padding_mask = self.get_emissions(models, dict(net_input))
            input_lengths = (~padding_mask).long().sum(-1)
            return self.decode(emissions, input_lengths)

        encoder_input = {
            k: v for k, v in net_input.items() if k != "prev_output_tokens"
        }
        emissions, _ = self.get_emissions(models, encoder_input)
        return self.decode(emissions)

    def collate_tokens(
        self,
        values,
        pad_idx,
        eos_idx=None,
        left_pad=False,
    ):
        """Stack a list of 2-D tensors into one padded 3-D tensor.

        The result has shape ``(len(values), max_len, values[0].size(1))``
        where ``max_len`` is the largest first dimension among ``values``; it
        is filled with ``pad_idx`` and each tensor is copied to the left (or
        to the right when ``left_pad`` is true).  ``eos_idx`` is unused.
        """
        size = max(v.size(0) for v in values)

        res = values[0].new(len(values), size, values[0].size(1)).fill_(pad_idx)

        def copy_tensor(src, dst):
            assert dst.numel() == src.numel()
            dst.copy_(src)

        for i, v in enumerate(values):
            copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][: len(v)])
        return res

    def decode(self, emissions, input_lengths=None):
        """Run the lexicon decoder on every utterance of the batch.

        Args:
            emissions: 3-D tensor unpacked as ``(B, T, N)``; the pointer
                arithmetic below assumes 4-byte elements (``get_emissions``
                returns a contiguous float tensor).
            input_lengths: optional per-utterance number of valid frames.
                ``generate(need_prev_output=True)`` passes this argument,
                which previously raised ``TypeError`` because ``decode`` did
                not accept it.  When given, only that many leading frames of
                each utterance are decoded.

        Returns:
            For each utterance a list of at most ``self.nbest`` dicts with
            ``"tokens"``, ``"score"`` and ``"words"``.
        """
        B, T, N = emissions.size()
        hypos = []
        for b in range(B):
            # Byte address of the first frame of utterance ``b``.
            emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
            num_frames = T if input_lengths is None else min(T, int(input_lengths[b]))
            results = self.decoder.decode(emissions_ptr, num_frames, N)

            nbest_results = results[: self.nbest]
            hypos.append(
                [
                    {
                        "tokens": self.get_tokens(result.tokens),
                        "score": result.score,
                        "words": [
                            self.word_dict.get_entry(x) for x in result.words if x >= 0
                        ],
                    }
                    for result in nbest_results
                ]
            )
        return hypos


# Cached per-state record used by FairseqLM: the token prefix, the (optional)
# incremental decoding state and the next-token scores (None until computed
# or after eviction from the cache).
FairseqLMState = namedtuple(
    typename="FairseqLMState",
    field_names=["prefix", "incremental_state", "probs"],
)


class FairseqLM(LM):
    """Expose a fairseq language model through the ``LM`` interface.

    Every ``LMState`` handed out is mapped in ``self.states`` to a
    ``FairseqLMState`` record; ``self.stateq`` remembers the order in which
    scored states were cached so the oldest ones can have their scores dropped
    once more than ``max_cache`` are held.
    """

    def __init__(self, dictionary, model):
        LM.__init__(self)
        self.dictionary = dictionary
        self.model = model
        self.unk = self.dictionary.unk()

        self.states = {}
        self.stateq = deque()

        self.save_incremental = False  # this currently does not work properly
        self.max_cache = 20_000

        model.cuda()
        model.eval()
        model.make_generation_fast_()

    def _normalized_log_probs(self, prefix_tensor, incremental_state):
        """Run the model on ``prefix_tensor`` (moved to CUDA) and normalize its output."""
        net_output = self.model(
            prefix_tensor.cuda(), incremental_state=incremental_state
        )
        return self.model.get_normalized_probs(
            net_output, log_probs=True, sample=None
        )

    def start(self, start_with_nothing):
        """Create the initial state (prefix = EOS), score it and cache it."""
        root = LMState()
        eos_prefix = torch.LongTensor([[self.dictionary.eos()]])

        incr_state = None
        if self.save_incremental:
            incr_state = {}

        with torch.no_grad():
            log_probs = self._normalized_log_probs(eos_prefix, incr_state)

        if incr_state is not None:
            incr_state = apply_to_sample(lambda t: t.cpu(), incr_state)

        self.states[root] = FairseqLMState(
            eos_prefix.numpy(), incr_state, log_probs[0, -1].cpu().numpy()
        )
        self.stateq.append(root)
        return root

    def score(self, state: LMState, token_index: int, no_cache: bool = False):
        """
        Evaluate language model based on the current lm state and new word
        Parameters:
        -----------
        state: current lm state
        token_index: index of the word
                     (can be lexicon index then you should store inside LM the
                      mapping between indices of lexicon and lm, or lm index of a word)
        no_cache: when true, neither the refreshed scores of ``state`` nor the
                  child state are stored

        Returns:
        --------
        (LMState, float): pair of (new state, score for the current word)
        """
        entry = self.states[state]

        if entry.probs is None:
            # Scores were never computed (or were evicted): run the model on
            # the stored prefix again.
            incr_state = entry.incremental_state
            if incr_state is not None:
                incr_state = incr_state.copy()

            with torch.no_grad():
                if incr_state is not None:
                    incr_state = apply_to_sample(lambda t: t.cuda(), incr_state)
                elif self.save_incremental:
                    incr_state = {}

                log_probs = self._normalized_log_probs(
                    torch.from_numpy(entry.prefix), incr_state
                )

                if incr_state is not None:
                    incr_state = apply_to_sample(lambda t: t.cpu(), incr_state)

                entry = FairseqLMState(
                    entry.prefix, incr_state, log_probs[0, -1].cpu().numpy()
                )

            if not no_cache:
                self.states[state] = entry
                self.stateq.append(state)

        token_score = entry.probs[token_index].item()

        # Keep at most ``max_cache`` scored states: the oldest ones keep their
        # prefix but lose incremental state and scores.
        while len(self.stateq) > self.max_cache:
            evicted_key = self.stateq.popleft()
            evicted = self.states[evicted_key]
            self.states[evicted_key] = FairseqLMState(evicted.prefix, None, None)

        child = state.child(token_index)
        if not no_cache and child not in self.states:
            child_prefix = np.concatenate(
                [entry.prefix, torch.LongTensor([[token_index]])], -1
            )
            self.states[child] = FairseqLMState(
                child_prefix, entry.incremental_state, None
            )

        if token_index == self.unk:
            token_score = float("-inf")

        return child, token_score

    def finish(self, state: LMState):
        """
        Evaluate eos for language model based on the current lm state

        Returns:
        --------
        (LMState, float): pair of (new state, score for the current word)
        """
        eos_index = self.dictionary.eos()
        return self.score(state, eos_index)

    def empty_cache(self):
        """Forget every cached state and trigger a garbage collection."""
        self.states = {}
        self.stateq = deque()
        gc.collect()

class W2lFairseqLMDecoder(W2lDecoder):
    """Beam-search decoder whose language model is a fairseq checkpoint.

    The checkpoint path is taken from ``args.kenlm_model`` (despite the
    option name, the file is loaded with ``torch.load``). When
    ``args.lexicon`` is set, a lexicon trie is built and decoding goes
    through ``LexiconDecoder``; otherwise ``LexiconFreeDecoder`` is used.
    """

    def __init__(self, args, tgt_dict):
        """
        Load the LM checkpoint, wrap it in ``FairseqLM`` and build the decoder.

        Parameters:
        -----------
        args: options object; this constructor reads ``unit_lm`` (optional),
              ``lexicon``, ``kenlm_model``, ``beam``, ``beam_size_token``
              (optional), ``beam_threshold``, ``lm_weight``, ``word_score``,
              ``unk_weight`` and ``sil_weight`` from it
        tgt_dict: dictionary of acoustic-model output tokens
        """
        super().__init__(args, tgt_dict)

        self.silence = tgt_dict.bos()
        self.unit_lm = getattr(args, "unit_lm", False)

        if args.lexicon:
            self.lexicon = load_words(args.lexicon)
        else:
            self.lexicon = None
        self.idx_to_wrd = {}

        # Rebuild the LM task/model from the arguments stored in the
        # checkpoint; the task data directory is pointed at the folder that
        # contains the checkpoint file.
        lm_path = args.kenlm_model
        checkpoint = torch.load(lm_path, map_location="cpu")
        lm_args = checkpoint["args"]
        lm_args.data = osp.dirname(lm_path)
        print(lm_args)
        task = tasks.setup_task(lm_args)
        model = task.build_model(lm_args)
        model.load_state_dict(checkpoint["model"], strict=False)

        self.trie = Trie(self.vocab_size, self.silence)

        self.word_dict = task.dictionary
        self.unk_word = self.word_dict.unk()
        self.lm = FairseqLM(self.word_dict, model)

        # DecoderOptions is called positionally, so the order below matters.
        option_values = (
            args.beam,
            int(getattr(args, "beam_size_token", len(tgt_dict))),
            args.beam_threshold,
            args.lm_weight,
            args.word_score,
            args.unk_weight,
            args.sil_weight,
            0,  # NOTE(review): presumably the eos score — confirm against bindings
            False,  # NOTE(review): presumably the log-add flag — confirm
            self.criterion_type,
        )
        self.decoder_opts = DecoderOptions(*option_values)

        if not self.lexicon:
            self.decoder = LexiconFreeDecoder(
                self.decoder_opts, self.lm, self.silence, self.blank, []
            )
            return

        self._fill_trie(tgt_dict)
        self.decoder = LexiconDecoder(
            self.decoder_opts,
            self.trie,
            self.lm,
            self.silence,
            self.blank,
            self.unk_word,
            [],
            self.unit_lm,
        )

    def _fill_trie(self, tgt_dict):
        """Insert every spelling of every lexicon word into the trie, then smear."""
        root_state = self.lm.start(False)
        for position, (word, spellings) in enumerate(self.lexicon.items()):
            if not self.unit_lm:
                # Word-level LM: look the word up in the LM dictionary and
                # score it from the start state without caching.
                word_idx = self.word_dict.index(word)
                _next_state, score = self.lm.score(
                    root_state, word_idx, no_cache=True
                )
            else:
                # Unit LM: words are identified by their position in the
                # lexicon and inserted with a zero score.
                word_idx = position
                self.idx_to_wrd[position] = word
                score = 0

            for spelling in spellings:
                token_ids = [tgt_dict.index(token) for token in spelling]
                assert (
                    tgt_dict.unk() not in token_ids
                ), f"{spelling} {token_ids}"
                self.trie.insert(token_ids, word_idx, score)
        self.trie.smear(SmearingMode.MAX)

    def _make_hypo(self, result):
        """Turn one decoder result into a hypothesis dict."""
        hypo = {"tokens": self.get_tokens(result.tokens), "score": result.score}
        if self.lexicon:
            # Negative word ids are dropped; the remaining ids are mapped
            # through the lexicon-position table (unit LM) or the LM dictionary.
            lookup = self.idx_to_wrd if self.unit_lm else self.word_dict
            hypo["words"] = [lookup[x] for x in result.words if x >= 0]
        return hypo

    def decode(self, emissions):
        """
        Decode a batch of emissions, one batch item at a time.

        Parameters:
        -----------
        emissions: tensor whose ``size()`` unpacks into three dimensions
                   (B, T, N); B is iterated over as the batch dimension

        Returns:
        --------
        list with one entry per batch item; each entry is a list of at most
        ``self.nbest`` dicts with keys "tokens" and "score", plus "words"
        when a lexicon is in use
        """
        B, T, N = emissions.size()
        # NOTE(review): the factor 4 is presumably the byte size of one
        # float32 element — confirm the emissions dtype at the call site.
        row_bytes = 4 * emissions.stride(0)

        hypos = []
        for b in range(B):
            emissions_ptr = emissions.data_ptr() + b * row_bytes
            results = self.decoder.decode(emissions_ptr, T, N)
            hypos.append(
                [self._make_hypo(result) for result in results[: self.nbest]]
            )
            # The LM state cache is dropped after every batch item.
            self.lm.empty_cache()

        return hypos



# #!/usr/bin/env python3

# # Copyright (c) Facebook, Inc. and its affiliates.
# #
# # This source code is licensed under the MIT license found in the
# # LICENSE file in the root directory of this source tree.

# """
# Flashlight decoders.
# """

# import gc
# import itertools as it
# import os.path as osp
# import warnings
# from collections import deque, namedtuple

# import numpy as np
# import torch
# from examples.speech_recognition.data.replabels import unpack_replabels
# from fairseq import tasks
# from fairseq.utils import apply_to_sample
# from omegaconf import open_dict
# from fairseq.dataclass.utils import convert_namespace_to_omegaconf


# try:
#     from flashlight.lib.text.dictionary import create_word_dict, load_words
#     from flashlight.lib.sequence.criterion import CpuViterbiPath, get_data_ptr_as_bytes
#     from flashlight.lib.text.decoder import (
#         CriterionType,
#         LexiconDecoderOptions,
#         KenLM,
#         LM,
#         LMState,
#         SmearingMode,
#         Trie,
#         LexiconDecoder,
#     )
# except:
#     warnings.warn(
#         "flashlight python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/flashlight/tree/master/bindings/python"
#     )
#     LM = object
#     LMState = object


# class W2lDecoder(object):
#     def __init__(self, args, tgt_dict):
#         self.tgt_dict = tgt_dict
#         self.vocab_size = len(tgt_dict)
#         self.nbest = args.nbest

#         # criterion-specific init
#         if args.criterion == "ctc":
#             self.criterion_type = CriterionType.CTC
#             self.blank = (
#                 tgt_dict.index("<ctc_blank>")
#                 if "<ctc_blank>" in tgt_dict.indices
#                 else tgt_dict.bos()
#             )
#             if "<sep>" in tgt_dict.indices:
#                 self.silence = tgt_dict.index("<sep>")
#             elif "|" in tgt_dict.indices:
#                 self.silence = tgt_dict.index("|")
#             else:
#                 self.silence = tgt_dict.eos()
#             self.asg_transitions = None
#         elif args.criterion == "asg_loss":
#             self.criterion_type = CriterionType.ASG
#             self.blank = -1
#             self.silence = -1
#             self.asg_transitions = args.asg_transitions
#             self.max_replabel = args.max_replabel
#             assert len(self.asg_transitions) == self.vocab_size ** 2
#         else:
#             raise RuntimeError(f"unknown criterion: {args.criterion}")

#     def generate(self, models, sample, **unused):
#         """Generate a batch of inferences."""
#         # model.forward normally channels prev_output_tokens into the decoder
#         # separately, but SequenceGenerator directly calls model.encoder
#         encoder_input = {
#             k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
#         }
#         emissions = self.get_emissions(models, encoder_input)
#         return self.decode(emissions)

#     def get_emissions(self, models, encoder_input):
#         """Run encoder and normalize emissions"""
#         model = models[0]
#         encoder_out = model(**encoder_input)
#         if self.criterion_type == CriterionType.CTC:
#             if hasattr(model, "get_logits"):
#                 emissions = model.get_logits(encoder_out) # no need to normalize emissions
#             else:
#                 emissions = model.get_normalized_probs(encoder_out, log_probs=True)
#         elif self.criterion_type == CriterionType.ASG:
#             emissions = encoder_out["encoder_out"]
#         return emissions.transpose(0, 1).float().cpu().contiguous()

#     def get_tokens(self, idxs):
#         """Normalize tokens by handling CTC blank, ASG replabels, etc."""
#         idxs = (g[0] for g in it.groupby(idxs))
#         if self.criterion_type == CriterionType.CTC:
#             idxs = filter(lambda x: x != self.blank, idxs)
#         elif self.criterion_type == CriterionType.ASG:
#             idxs = filter(lambda x: x >= 0, idxs)
#             idxs = unpack_replabels(list(idxs), self.tgt_dict, self.max_replabel)
#         return torch.LongTensor(list(idxs))


# class W2lViterbiDecoder(W2lDecoder):
#     def __init__(self, args, tgt_dict):
#         super().__init__(args, tgt_dict)

#     def decode(self, emissions):
#         B, T, N = emissions.size()
#         hypos = []
#         if self.asg_transitions is None:
#             transitions = torch.FloatTensor(N, N).zero_()
#         else:
#             transitions = torch.FloatTensor(self.asg_transitions).view(N, N)
#         viterbi_path = torch.IntTensor(B, T)
#         workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N))
#         CpuViterbiPath.compute(
#             B,
#             T,
#             N,
#             get_data_ptr_as_bytes(emissions),
#             get_data_ptr_as_bytes(transitions),
#             get_data_ptr_as_bytes(viterbi_path),
#             get_data_ptr_as_bytes(workspace),
#         )
#         return [
#             [{"tokens": self.get_tokens(viterbi_path[b].tolist()), "score": 0}]
#             for b in range(B)
#         ]


# class W2lKenLMDecoder(W2lDecoder):
#     def __init__(self, args, tgt_dict):
#         super().__init__(args, tgt_dict)

#         self.unit_lm = getattr(args, "unit_lm", False)

#         if args.lexicon:
#             self.lexicon = load_words(args.lexicon)
#             self.word_dict = create_word_dict(self.lexicon)
#             self.unk_word = self.word_dict.get_index("<unk>")

#             self.lm = KenLM(args.kenlm_model, self.word_dict)
#             self.trie = Trie(self.vocab_size, self.silence)

#             start_state = self.lm.start(False)
#             for i, (word, spellings) in enumerate(self.lexicon.items()):
#                 word_idx = self.word_dict.get_index(word)
#                 _, score = self.lm.score(start_state, word_idx)
#                 for spelling in spellings:
#                     spelling_idxs = [tgt_dict.index(token) for token in spelling]
#                     assert (
#                         tgt_dict.unk() not in spelling_idxs
#                     ), f"{spelling} {spelling_idxs}"
#                     self.trie.insert(spelling_idxs, word_idx, score)
#             self.trie.smear(SmearingMode.MAX)
#             print("*"*50)
#             print(args.beam, args.beam_threshold, int(getattr(args, "beam_size_token", len(tgt_dict))))
#             self.decoder_opts = LexiconDecoderOptions(
#                 beam_size=args.beam,
#                 beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
#                 beam_threshold=args.beam_threshold,
#                 lm_weight=args.lm_weight,
#                 word_score=args.word_score,
#                 unk_score=args.unk_weight,
#                 sil_score=args.sil_weight,
#                 log_add=False,
#                 criterion_type=self.criterion_type,
#             )

#             if self.asg_transitions is None:
#                 N = 768
#                 # self.asg_transitions = torch.FloatTensor(N, N).zero_()
#                 self.asg_transitions = []

#             self.decoder = LexiconDecoder(
#                 self.decoder_opts,
#                 self.trie,
#                 self.lm,
#                 self.silence,
#                 self.blank,
#                 self.unk_word,
#                 self.asg_transitions,
#                 self.unit_lm,
#             )
#         else:
#             assert args.unit_lm, "lexicon free decoding can only be done with a unit language model"
#             from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions

#             d = {w: [[w]] for w in tgt_dict.symbols}
#             self.word_dict = create_word_dict(d)
#             self.lm = KenLM(args.kenlm_model, self.word_dict)
#             self.decoder_opts = LexiconFreeDecoderOptions(
#                 beam_size=args.beam,
#                 beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
#                 beam_threshold=args.beam_threshold,
#                 lm_weight=args.lm_weight,
#                 sil_score=args.sil_weight,
#                 log_add=False,
#                 criterion_type=self.criterion_type,
#             )
#             self.decoder = LexiconFreeDecoder(
#                 self.decoder_opts, self.lm, self.silence, self.blank, []
#             )


#     def decode(self, emissions):
#         B, T, N = emissions.size()
#         hypos = []
#         for b in range(B):
#             emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
#             results = self.decoder.decode(emissions_ptr, T, N)

#             nbest_results = results[: self.nbest]
#             hypos.append(
#                 [
#                     {
#                         "tokens": self.get_tokens(result.tokens),
#                         "score": result.score,
#                         "words": [
#                             self.word_dict.get_entry(x) for x in result.words if x >= 0
#                         ],
#                     }
#                     for result in nbest_results
#                 ]
#             )
#         return hypos


# FairseqLMState = namedtuple("FairseqLMState", ["prefix", "incremental_state", "probs"])


# class FairseqLM(LM):
#     def __init__(self, dictionary, model):
#         LM.__init__(self)
#         self.dictionary = dictionary
#         self.model = model
#         self.unk = self.dictionary.unk()

#         self.save_incremental = False  # this currently does not work properly
#         self.max_cache = 20_000

#         model.cuda()
#         model.eval()
#         model.make_generation_fast_()

#         self.states = {}
#         self.stateq = deque()

#     def start(self, start_with_nothing):
#         state = LMState()
#         prefix = torch.LongTensor([[self.dictionary.eos()]])
#         incremental_state = {} if self.save_incremental else None
#         with torch.no_grad():
#             res = self.model(prefix.cuda(), incremental_state=incremental_state)
#             probs = self.model.get_normalized_probs(res, log_probs=True, sample=None)

#         if incremental_state is not None:
#             incremental_state = apply_to_sample(lambda x: x.cpu(), incremental_state)
#         self.states[state] = FairseqLMState(
#             prefix.numpy(), incremental_state, probs[0, -1].cpu().numpy()
#         )
#         self.stateq.append(state)

#         return state

#     def score(self, state: LMState, token_index: int, no_cache: bool = False):
#         """
#         Evaluate language model based on the current lm state and new word
#         Parameters:
#         -----------
#         state: current lm state
#         token_index: index of the word
#                      (can be lexicon index then you should store inside LM the
#                       mapping between indices of lexicon and lm, or lm index of a word)
#         Returns:
#         --------
#         (LMState, float): pair of (new state, score for the current word)
#         """
#         curr_state = self.states[state]

#         def trim_cache(targ_size):
#             while len(self.stateq) > targ_size:
#                 rem_k = self.stateq.popleft()
#                 rem_st = self.states[rem_k]
#                 rem_st = FairseqLMState(rem_st.prefix, None, None)
#                 self.states[rem_k] = rem_st

#         if curr_state.probs is None:
#             new_incremental_state = (
#                 curr_state.incremental_state.copy()
#                 if curr_state.incremental_state is not None
#                 else None
#             )
#             with torch.no_grad():
#                 if new_incremental_state is not None:
#                     new_incremental_state = apply_to_sample(
#                         lambda x: x.cuda(), new_incremental_state
#                     )
#                 elif self.save_incremental:
#                     new_incremental_state = {}

#                 res = self.model(
#                     torch.from_numpy(curr_state.prefix).cuda(),
#                     incremental_state=new_incremental_state,
#                 )
#                 probs = self.model.get_normalized_probs(
#                     res, log_probs=True, sample=None
#                 )

#                 if new_incremental_state is not None:
#                     new_incremental_state = apply_to_sample(
#                         lambda x: x.cpu(), new_incremental_state
#                     )

#                 curr_state = FairseqLMState(
#                     curr_state.prefix, new_incremental_state, probs[0, -1].cpu().numpy()
#                 )

#             if not no_cache:
#                 self.states[state] = curr_state
#                 self.stateq.append(state)

#         score = curr_state.probs[token_index].item()

#         trim_cache(self.max_cache)

#         outstate = state.child(token_index)
#         if outstate not in self.states and not no_cache:
#             prefix = np.concatenate(
#                 [curr_state.prefix, torch.LongTensor([[token_index]])], -1
#             )
#             incr_state = curr_state.incremental_state

#             self.states[outstate] = FairseqLMState(prefix, incr_state, None)

#         if token_index == self.unk:
#             score = float("-inf")

#         return outstate, score

#     def finish(self, state: LMState):
#         """
#         Evaluate eos for language model based on the current lm state
#         Returns:
#         --------
#         (LMState, float): pair of (new state, score for the current word)
#         """
#         return self.score(state, self.dictionary.eos())

#     def empty_cache(self):
#         self.states = {}
#         self.stateq = deque()
#         gc.collect()


# class W2lFairseqLMDecoder(W2lDecoder):
#     def __init__(self, args, tgt_dict):
#         super().__init__(args, tgt_dict)

#         self.unit_lm = getattr(args, "unit_lm", False)

#         self.lexicon = load_words(args.lexicon) if args.lexicon else None
#         self.idx_to_wrd = {}

#         checkpoint = torch.load(args.kenlm_model, map_location="cpu")

#         if "cfg" in checkpoint and checkpoint["cfg"] is not None:
#             lm_args = checkpoint["cfg"]
#         else:
#             lm_args = convert_namespace_to_omegaconf(checkpoint["args"])

#         with open_dict(lm_args.task):
#             lm_args.task.data = osp.dirname(args.kenlm_model)

#         task = tasks.setup_task(lm_args.task)
#         model = task.build_model(lm_args.model)
#         model.load_state_dict(checkpoint["model"], strict=False)

#         self.trie = Trie(self.vocab_size, self.silence)

#         self.word_dict = task.dictionary
#         self.unk_word = self.word_dict.unk()
#         self.lm = FairseqLM(self.word_dict, model)

#         if self.lexicon:
#             start_state = self.lm.start(False)
#             for i, (word, spellings) in enumerate(self.lexicon.items()):
#                 if self.unit_lm:
#                     word_idx = i
#                     self.idx_to_wrd[i] = word
#                     score = 0
#                 else:
#                     word_idx = self.word_dict.index(word)
#                     _, score = self.lm.score(start_state, word_idx, no_cache=True)

#                 for spelling in spellings:
#                     spelling_idxs = [tgt_dict.index(token) for token in spelling]
#                     assert (
#                         tgt_dict.unk() not in spelling_idxs
#                     ), f"{spelling} {spelling_idxs}"
#                     self.trie.insert(spelling_idxs, word_idx, score)
#             self.trie.smear(SmearingMode.MAX)

#             self.decoder_opts = LexiconDecoderOptions(
#                 beam_size=args.beam,
#                 beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
#                 beam_threshold=args.beam_threshold,
#                 lm_weight=args.lm_weight,
#                 word_score=args.word_score,
#                 unk_score=args.unk_weight,
#                 sil_score=args.sil_weight,
#                 log_add=False,
#                 criterion_type=self.criterion_type,
#             )

#             self.decoder = LexiconDecoder(
#                 self.decoder_opts,
#                 self.trie,
#                 self.lm,
#                 self.silence,
#                 self.blank,
#                 self.unk_word,
#                 [],
#                 self.unit_lm,
#             )
#         else:
#             assert args.unit_lm, "lexicon free decoding can only be done with a unit language model"
#             from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions

#             d = {w: [[w]] for w in tgt_dict.symbols}
#             self.word_dict = create_word_dict(d)
#             self.lm = KenLM(args.kenlm_model, self.word_dict)
#             self.decoder_opts = LexiconFreeDecoderOptions(
#                 beam_size=args.beam,
#                 beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
#                 beam_threshold=args.beam_threshold,
#                 lm_weight=args.lm_weight,
#                 sil_score=args.sil_weight,
#                 log_add=False,
#                 criterion_type=self.criterion_type,
#             )
#             self.decoder = LexiconFreeDecoder(
#                 self.decoder_opts, self.lm, self.silence, self.blank, []
#             )

#     def decode(self, emissions):
#         B, T, N = emissions.size()
#         hypos = []

#         def idx_to_word(idx):
#             if self.unit_lm:
#                 return self.idx_to_wrd[idx]
#             else:
#                 return self.word_dict[idx]

#         def make_hypo(result):
#             hypo = {"tokens": self.get_tokens(result.tokens), "score": result.score}
#             if self.lexicon:
#                 hypo["words"] = [idx_to_word(x) for x in result.words if x >= 0]
#             return hypo

#         for b in range(B):
#             emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
#             results = self.decoder.decode(emissions_ptr, T, N)

#             nbest_results = results[: self.nbest]
#             hypos.append([make_hypo(result) for result in nbest_results])
#             self.lm.empty_cache()

#         return hypos