import logging
import re
import subprocess
import os
import numpy as np
import torch
from torch.utils.data import TensorDataset

from nsp.util import write_list_to_file, write_json_to_file, read_lines_in_list, truncate_sequences
from nsp.evals.f1_word_match import evaluate as evaluate_f1_word_match

# Absolute directory of this module; used to locate the evaluation script relative to it.
DIR_PATH = os.path.dirname(os.path.realpath(__file__))

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)


class GenExample():
    """
    Minimal container for one data instance of a VariableHeadsNSP model.

    All fields start out with neutral defaults; subclasses should overwrite
    them as appropriate:

    - classify_id_cls: the gold target id(s) for the prediction on [CLS], kept in a list
      in case there is more than one CLS head (initialised to ``[-1]``)
    - part_a: content of Part A (initialised to ``None``)
    - part_b: content of Part B (initialised to ``None``)
    """
    def __init__(self):
        """
        Create an example whose fields all hold their default values.
        """
        self.classify_id_cls = [-1]
        self.part_a = None
        self.part_b = None

    def __repr__(self):
        """
        Return a short description listing ``part_a`` and ``part_b``.
        """
        described_a = "part_a: %s" % self.part_a
        described_b = ", part_b: %s" % self.part_b
        return described_a + described_b

    def __str__(self):
        """
        Return the same text as ``__repr__``.
        """
        return self.__repr__()


class BitextHandler(object):
    """
    Base class for bitext data sets.
    Other classes should inherit from this.

    A subclass should have the following variables:

    - examples: a list of examples, example should be dataset specific, for a very generic version see :py:class:`~nsp.dataset_handlers.dataset_bitext.GenExample`
    - features: a list of features, where a feature at index i maps to the example at index i in examples list, See :py:class:`~nsp.masking.GenInputFeatures` for an example.
    - write_predictions: How to write predictions, either write_list_to_file or write_json_to_file
    - write_eval: How to write evaluations, either write_list_to_file or write_json_to_file

    **For sequence classification**:

    - num_labels_cls: Number of labels for the [CLS] token. Should be given as a list, where each element corresponds to the number of classification classes for one [CLS] classification head. (In most cases there is either no such head, or exactly one, in which case the list has a single entry.)
    - num_labels_tok: Number of labels for token classification. Should be given as a list. Concept is the same as for num_labels_cls above.
    - _text2id: dictionary that maps text to ids for CLS prediction

    **For token classification**:

    - num_labels_tok: Number of labels for token classification
    - _text2id_tok: dictionary that maps text to ids for token prediction

    For a class implementing all aspects (generation, classify tokens and classify on [CLS], see dataset_sharc.py)
    """
    # pylint: disable=too-many-instance-attributes
    def __init__(self, nsp_args):
        """
        Create an empty handler and copy the dataset-related settings from ``nsp_args``.

        - examples: a list of examples (this class creates instances of ``BitextHandler.BitextExample``)
        - features: a list of features of type :py:class:`~nsp.masking.GenInputFeatures`

        :param nsp_args: object providing the attributes ``truncation_strategy``,
                         ``max_seq_length``, ``train_file``, ``predict_file``, ``valid_gold``,
                         ``plus_classify_sequence``, ``plus_classify_tokens`` and ``plus_generation``
        """
        # ---- data containers ----
        self.examples, self.features = [], []

        # ---- output writers ----
        # Generation predictions: written as a list, as json or something else
        self.write_predictions = write_list_to_file
        # Token / sequence classification predictions default to the generation writer
        self.write_tok_predictions = self.write_predictions
        self.write_cls_predictions = self.write_predictions
        # Evaluation metrics: written as a list, as json or something else
        self.write_eval = write_json_to_file

        # ---- label vocabularies (inherently defined by the dataset) ----
        # One dict per CLS / sequence classification head that maps text to ids
        # (e.g. {'bad_review': 0, 'good_review': 1}).
        self._text2id = []  # list of dict
        # One dict per token classification head that maps text to ids
        # (e.g. {'person': 0, 'location': 1}).
        self._text2id_tok = []  # list of dict
        # Which head is currently addressed if multiple cls / tok / gen heads exist
        self.map_to_cls_index = self.map_to_tok_index = self.map_to_gen_index = 0

        # ---- truncation ----
        # if True, convert_examples_to_features in masking.py will truncate the end
        # if it exceeds max_part_a
        self.truncate_end = True
        # How to truncate instances that are too long,
        # see arg in transformers.tokenizer.truncate_sequences (v2.1.1 or greater):
        # 'longest_first', 'only_first': part_a only, 'only_second': part_b only,
        # 'do_not_truncate': raise an error if too long
        self.truncation_strategy = nsp_args.truncation_strategy
        # The maximum sequence length for Part A [SEP] Part B [SEP]
        self.max_seq_length = nsp_args.max_seq_length

        # ---- file locations ----
        # Training, prediction and gold prediction file respectively
        # (prediction and gold prediction file can be the same for some datasets)
        self.train_file = nsp_args.train_file
        self.predict_file = nsp_args.predict_file
        self.valid_gold = nsp_args.valid_gold

        # ---- statistics collected while reading this dataset's data ----
        self.trunc_part_a = self.trunc_part_b = 0
        self.num_invalid = 0
        self.max_a = self.max_b = 0

        # Which tokenizer this dataset should use, set via @tokenizer.setter
        self._tokenizer = None
        # When using multiple dataset handlers, which dataset this is;
        # set via @datahandler_index.setter
        self._datahandler_index = 0

        self.train_dataloader = None  #TODO: needed?
        self.eval_dataloader = None  #TODO: needed?

        # Generation-time switch. Original note: "set True to only return what is produced
        # after first sep; should be True e.g. for dialogue generation tasks".
        # NOTE(review): the attribute name suggests the opposite meaning - confirm against
        # the code that reads it.
        self.return_all = False

        # if MetaHandler is called, these numbers should be merged.
        self.plus_classify_sequence = nsp_args.plus_classify_sequence
        self.plus_classify_tokens = nsp_args.plus_classify_tokens
        self.plus_generation = nsp_args.plus_generation

        # How to treat the [SEP] token during generation,
        # possible values {'break', 'ignore', 'continue'}:
        # - 'break' for generation tasks
        # - 'ignore' for LM handling tasks where SEP shouldn't be produced
        # - 'continue' for LM handling tasks where SEP can be produced
        self.on_sep = 'break'

    def __str__(self):
        """
        Return the same text as ``__repr__``.
        """
        return self.__repr__()

    def __repr__(self):
        return self.__class__.__name__

    @property
    def tokenizer(self):
        """
        The tokenizer used by this dataset handler.

        :getter: Returns the object stored in ``_tokenizer``
        :setter: Stores the given tokenizer in ``_tokenizer``
        :type: object
        """
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer):
        # Plain assignment, no validation of the tokenizer object is performed.
        self._tokenizer = tokenizer

    @property
    def datahandler_index(self):
        """
        The index of this data handler (relevant when several handlers are used together).

        :getter: Returns the value stored in ``_datahandler_index``
        :setter: Stores the given index in ``_datahandler_index``
        :type: int
        """
        return self._datahandler_index

    @datahandler_index.setter
    def datahandler_index(self, i):
        # Plain assignment, the value is not checked.
        self._datahandler_index = i

    @property
    def text2id(self):
        """
        The text-to-id mapping of the CLS / sequence classification head that is
        currently selected via ``map_to_cls_index``.

        :getter: Returns ``_text2id[map_to_cls_index]``
        :type: dict
        """
        return self._text2id[self.map_to_cls_index]

    @property
    def text2id_tok(self):
        """
        The text-to-id mapping of the token classification head that is
        currently selected via ``map_to_tok_index``.

        :getter: Returns ``_text2id_tok[map_to_tok_index]``
        :type: dict
        """
        return self._text2id_tok[self.map_to_tok_index]

    @property
    def id2text(self):
        """
        For each CLS / sequence classification head, a mapping of ids to text.
        
        :getter: Returns id2text mapping
        :type: dict
        """
        return {v: k for k, v in self.text2id.items()}

    @property
    def id2text_tok(self):
        """
        For each token classification head, a mapping of ids to text.
        
        :getter: Returns id2text mapping
        :type: dict
        """
        return {v: k for k, v in self.text2id_tok.items()}

    @property
    def num_labels_cls(self):
        """
        Number of labels for the [CLS] token.
        
        :getter: Returns no. of labels
        :type: list
        """
        return [len(t) for t in self._text2id]

    @property
    def num_labels_tok(self):
        """
        Number of labels for token classification.
        
        :getter: Returns no. of labels
        :type: list
        """
        return [len(t) for t in self._text2id_tok]

    def initialize(self):
        """
        Initialize lists, variables expected to be called before read_examples()
        """
        self.examples = []
        self.features = []
        self.max_a = 0
        self.max_b = 0
        self.trunc_part_a = 0
        self.trunc_part_b = 0
        self.num_invalid = 0
        if self.tokenizer is None:
            raise ValueError('Tokenizer not found. Please set a tokenizer before calling read_examples().')

    def read_examples(self, is_training=False):
        """
        Reads a bitext whose two parts are separated by a tab, e.g. ``word1 word2<TAB>word3 word4``,
        and stores one tokenized example per line in ``self.examples``.
        Everything before the tab becomes Part A, the rest Part B.

        Lines for which tokenization/validation fails are skipped and counted
        in ``self.num_invalid``.

        :param is_training: if exactly ``True``, ``self.train_file`` is read and every line must
                            contain exactly one tab; otherwise ``self.predict_file`` is read and
                            the whole line becomes Part A (Part B is empty).
        :return: 0 on success
        :raises AssertionError: if a training line does not consist of exactly two
                                tab-separated fields
        """
        training = is_training is True
        input_file = self.train_file if training else self.predict_file
        self.initialize()  # reset previous lot
        LOGGER.info("Part a: prior to tab")
        LOGGER.info("Part b: post tab")
        input_data = read_lines_in_list(input_file)

        example_counter = 0
        for line_number, entry in enumerate(input_data, start=1):
            part_a, part_b = entry, ""
            if training:
                fields = entry.split("\t")
                if len(fields) != 2:
                    # Explicit raise instead of ``assert`` so that the check survives
                    # ``python -O``; AssertionError is kept so callers see the same type.
                    raise AssertionError(
                        "{}: line {} has {} tab-separated field(s), expected exactly 2.".format(
                            input_file, line_number, len(fields)))
                part_a, part_b = fields

            example = BitextHandler.BitextExample(
                example_index=example_counter,
                part_a=part_a,
                part_b=part_b,
                classify_id_cls=[-1] * self.plus_classify_sequence,
                dataset_index=self.datahandler_index
            )
            try:
                example = self.apply_tokenization(example, is_training)
            except Exception as error:
                # best effort: an example that cannot be encoded is skipped, not fatal
                LOGGER.debug(error)
                LOGGER.warning("Invalid example at index {}. Skip.".format(example.example_index))
                self.num_invalid += 1
            else:
                self.examples.append(example)
                example_counter += 1
                assert len(self.examples) == example_counter

        # data stats
        LOGGER.info("Maximum Part A is: %s", self.max_a)
        LOGGER.info("Maximum Part B is: %s", self.max_b)
        LOGGER.warning("Couldn't encode example %s times.", self.num_invalid)
        LOGGER.warning("Truncated part a %s times.", self.trunc_part_a)
        LOGGER.warning("Truncated part b %s times.", self.trunc_part_b)
        LOGGER.info("Number of valid examples: %s", example_counter)

        return 0

    def process_inputs(self, inputs):
        '''
        Hook for converting raw input data into example objects.
        This base implementation does nothing.

        :param inputs:  list of input sentences
        :return: None in this base class. NOTE(review): the original documentation promised
                 "0 on success" and mentioned ``OpiecExample``; presumably subclasses
                 override this method - confirm.
        '''
        pass

    def apply_tokenization(self, example, is_training=False):
        """
        Applies tokenization to an example.

        :param example (BitextHandler.BitextExample): The example to apply tokenization to.
        :param is_training:
        :return: the tokenized example
        """
        # tokenizer
        if len(example.part_a) > 0:
            example.part_a_tokens = self.tokenizer.tokenize(example.part_a)
            if len(example.part_a_tokens) < 1:
                raise ValueError('Tokenization Error | tokenization of part_a at index {} failed.: "{}"'.format(
                    example.example_index, example.part_a))
        else:
            example.part_a_tokens = []

        if len(example.part_b) > 0:
            example.part_b_tokens = self.tokenizer.tokenize(example.part_b)
            if len(example.part_b_tokens) < 1:
                raise ValueError('Tokenization Error | tokenization of part_b at index {} failed.: "{}"'.format(
                    example.example_index, example.part_b))
        else:
            example.part_b_tokens = []

        self.max_a = max(self.max_a, len(example.part_a_tokens))
        self.max_b = max(self.max_b, len(example.part_b_tokens))
        try:
            # truncate total seq length
            example = self.validate_example(example, is_training=is_training)
        except Exception as e:
            # skip this example
            raise ValueError(e)
        else:
            return example

    def validate_example(self, example, is_training=False):
        """
        post-tokenization validation

        - here, truncate token length
        - each subclass can implement its own validation on tokenized tokens

        :param example (BitextHandler.BitextExample): The example to validate. Note that:

               - example.part_a: list of tokens (before [SEP] token added at the end of seq), can be empty
               - example.part_b: list of tokens (before [SEP] token added at the end of seq), can be empty
        :param is_training: In case the information whether it is training time or not is needed in a subclass.
        :return: example (BitextHandler.BitextExample), raise an error if invalid
        """

        a_length = len(example.part_a_tokens)
        b_length = len(example.part_b_tokens)

        max_seq_length = min(self.tokenizer.max_len, self.max_seq_length)
        max_seq_length -= 3 if b_length > 0 else 2 # take into account special tokens
        total_length = a_length + b_length

        if total_length > max_seq_length:
            try:
                part_a, part_b = truncate_sequences(
                    part_a=example.part_a_tokens, part_b=example.part_b_tokens,
                    num_tokens_to_remove=(total_length - max_seq_length),
                    truncation_strategy=self.truncation_strategy,
                    truncate_end=self.truncate_end
                )
            except Exception as e:
                raise ValueError(e)
            example.part_a_tokens = part_a
            example.part_b_tokens = part_b
        assert len(example.part_a_tokens) + len(example.part_b_tokens) <= max_seq_length

        """ truncate part-wise
            # Part A
            if self.max_part_a is not None and a_length > self.max_part_a:
                if self.truncate_end:
                    tokens = part_a[:self.max_part_a]
                else: # truncate beginning
                    tokens = part_a[-self.max_part_a:]
            # Part B
            if self.max_part_b is not None and b_length > self.max_part_b:
                if self.truncate_end:
                    tokens = part_b[:self.max_part_b]
                else: # truncate beginning
                    tokens = part_b[-self.max_part_b:]

            assert len(part_a) <= self.max_part_a
            assert len(part_b) <= self.max_part_b
        """

        if a_length > len(example.part_a_tokens):
            self.trunc_part_a += 1
        if b_length > len(example.part_b_tokens):
            self.trunc_part_b += 1

        return example

    # pylint: disable=no-self-use
    def get_token_classification_ids(self, current_example, input_ids):
        """
        This function sets the gold IDs sequence for token classification.
        We cannot set this prior to calling masking, because the dataset handlers do not know about
        tokenization. So we call this function within masker with he input_ids of the entire
        sequence and the tokenizer. Then we let each dataset handler decide on which tokens
        which label should be classified.

        On the Sharc dataset this equals to setting the classification label
        (Yes, No, More, Irrelevant),
        for every token in the rule text

        This class implement the basic case where none of the tokens should be classified.
        (Hence a list of just -1 equal to the length of the input_ids)

        :param current_example: an instance of :py:class:`~nsp.dataset_handlers.dataset_bitext.BitextHandler.BitextExample`
        :param input_ids: the masker prepared input ids for this example
        :return: a list of lists, where each inner list is of length max_seq_length with
                 the classification label set to -1.
                 The outer list respects the fact that there could be several heads.
        """
        #TODO: Mayumi re-wrote the code so that each dataset handler has a tokenizer now, so maybe this could be refactored.
        del current_example
        #the outer list respects that there could be more than 1 token classification head.
        classify_id_tokens = []
        for i in range(len(self.num_labels_tok)):
            #self.map_to_tok_index = i
            classify_id_tokens.append([-1] * len(input_ids))
        return classify_id_tokens

    # pylint: disable=no-self-use
    def arrange_generated_output(self, current_example, generated_ids, input_ids):
        """
        Simply returns generated_text, other data sets can arrange the output
        ready for evaluation here.

        For example Sharc expects a specific format.

        :param current_example: The current example
        :param generated_ids: (np.ndarray, dtype=np.int64) list of the token ids generated by the model
        :param input_ids: the input_ids as they where given to the BERT model
        :return: generated_text
        """
        # how to convert to string depends on tokenizer!!!
        generated_tokens = self.tokenizer.convert_ids_to_tokens(map(int, generated_ids))
        # each pretrained model can have different subwords delimiter symbol "##"
        #generated_text = " ".join(generated_tokens).replace(" ##", "")
        generated_text = self.tokenizer.convert_tokens_to_string(generated_tokens)
        return generated_text

    # pylint: disable=no-self-use
    def arrange_classify_output(self, current_example, max_classify_index):
        """
        Simply returns the classification index unchanged, other data sets can arrange
        the output ready for evaluation here.

        :param current_example: The current example (unused here)
        :param max_classify_index: (np.ndarray, dtype=np.int64) the index of the most likely class,
                                   can be converted back into a string via id2text[max_classify_index]
        :return: the most likely class (``max_classify_index`` itself)
        """
        return max_classify_index

    # pylint: disable=no-self-use
    def arrange_token_classify_output(self, current_example, classification_tokens, input_ids):
        """
        Simply returns all classification elements unchanged, other data sets can arrange
        the output as needed here.

        :param current_example: The current example (unused here)
        :param classification_tokens: (np.ndarray, dtype=np.int64) the classification labels for all tokens
        :param input_ids: the input_ids as they were given to the BERT model (unused here)
        :return: classification_tokens
        """
        return classification_tokens

    # pylint: disable=no-self-use
    def combine_classification_tokens(self, classification_tokens):
        """
        Given a list of classified tokens, combine them as required by a specific data set.
        Not implemented in this base class: the call always raises.
        (The original documentation points to datasets_meta.py for the implementation.)

        :param classification_tokens: a list of classified tokens
        :raises NotImplementedError: always
        """
        raise NotImplementedError('combine_classification_tokens() is not implemented.')

    def evaluate(self, output_prediction_file, mode):
        """
        Given the location of the prediction file, scores it against ``self.valid_gold``.

        Runs the ``multi-bleu.perl`` script (called with ``-lc``) with the prediction file on
        stdin and adds the F1 word match scores from :py:meth:`get_f1`.
        NOTE(review): the original documentation called this "case-sensitive BLEU", while
        ``-lc`` presumably is the script's lowercasing switch - confirm.

        If the script output cannot be parsed (e.g. the script failed or all translations
        are empty), all BLEU scores are reported as 0.0 and a warning with the script's
        stderr is logged instead of raising.

        :param output_prediction_file: the file location of the predictions
        :param mode: possible values: generation, cls, token as called from predict.py;
                     bitext handles all the same. Used as prefix ``<mode>_`` of every key.
        :return: a dictionary with the keys ``<mode>_moses_bleu``, ``<mode>_moses_bleu_1`` ...
                 ``<mode>_moses_bleu_4`` plus one ``<mode>_<key>`` entry per F1 statistic
        """
        def _convert_to_float(convert):
            # A number that cannot be parsed (or overflows) counts as 0.0
            # instead of aborting the whole evaluation.
            try:
                return float(convert)
            except (ValueError, OverflowError):
                return 0.0

        prefix = mode + "_"
        bleu_keys = ('moses_bleu', 'moses_bleu_1', 'moses_bleu_2', 'moses_bleu_3', 'moses_bleu_4')
        # start from the fallback; overwritten below when the script output can be parsed
        results = {prefix + key: 0.0 for key in bleu_keys}

        # BLEU
        # Only the file descriptor is handed to the child process, hence binary mode.
        with open(output_prediction_file, "rb") as file:
            eval_process = \
                subprocess.Popen([DIR_PATH+"/../evals/multi-bleu.perl", "-lc", self.valid_gold],
                                 stdin=file, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = eval_process.communicate()
        bleu_all = stdout.decode("utf-8", errors="replace")
        #format example:
        # BLEU = 26.27, 59.8/38.8/32.5/27.1 (BP=0.695, ratio=0.733, hyp_len=4933, ref_len=6729)
        parsed = re.search(r"BLEU = ([^,\s]+), ([^/\s]+)/([^/\s]+)/([^/\s]+)/([^/\s(]+)", bleu_all)
        if parsed is None:
            # same logger as the module-level LOGGER
            logging.getLogger(__name__).warning(
                "Could not parse BLEU script output %r (exit code %s, stderr: %s); "
                "reporting BLEU scores as 0.0.",
                bleu_all, eval_process.returncode,
                stderr.decode("utf-8", errors="replace").strip())
        else:
            # groups: overall BLEU followed by the 1- to 4-gram values
            for key, value in zip(bleu_keys, parsed.groups()):
                results[prefix + key] = _convert_to_float(value)

        # F1 word match
        f1_results = self.get_f1(output_prediction_file)
        for key in f1_results:
            results[prefix + key] = f1_results[key]
        return results

    def get_f1(self, output_prediction_file):
        """
        Calculates F1 word overlap scores between an output prediction file
        and the gold file ``self.valid_gold``.

        :param output_prediction_file: the output prediction file
        :return: the F1 scores (dictionary with several different values)
        """
        # !!! evaluate_f1_word_match() will compute scores on seq split by white space !!!
        # !!! not appropriate for Japanese. !!!
        predicted_lines = read_lines_in_list(output_prediction_file)
        gold_lines = read_lines_in_list(self.valid_gold)
        return evaluate_f1_word_match(gold_lines, predicted_lines)

    def select_deciding_score(self, results):
        """
        Returns the score that should be used to decide whether or not
        a model is best compared to a previous score.
        Here we return BLEU-4 of the generation mode.

        :param results: what is returned by the method evaluate,
                        a dictionary that must contain the key 'generation_moses_bleu_4'
        :return: BLEU-4 value
        """
        return results['generation_moses_bleu_4']

    def select_scores_to_plot(self, results):
        """
        Could select here which metrics to plot. Here we plot all.

        :param results: dictionary with various metrics (returned from self.evaluate())
        :return: ``results`` itself, unchanged
        """
        return results  #plot all scores

    def possible_mask_locations(self, part_a, part_b, is_training=False, example=None):
        """
        Given an example's part_a and part_b, decide which positions may be masked.
        By default, this class allows any position to be masked.
        Subclasses may overwrite this functionality.

        :param part_a: taken from part_a from a subclass instance of :py:class:`~nsp.dataset_handlers.dataset_bitext.GenExample`,
                       but it is potentially tokenised
        :param part_b: taken from part_b from a subclass instance of :py:class:`~nsp.dataset_handlers.dataset_bitext.GenExample`,
                       but it is potentially tokenised
        :param is_training: True if training, else False
        :return: a tuple of two lists, one for part_a and one for part_b. Each list is
                 of length part_a/part_b, where value of 1.0 indicates that this
                 position may be masked and 0.0 indicates that it should not be masked.
        """
        possible_mask_locations_a = [True] * len(part_a)
        possible_mask_locations_b = [True] * len(part_b)
        return possible_mask_locations_a, possible_mask_locations_b

    def create_tensor_dataset(self):
        """
        Using a data_handler, whose features have been filled via the function
        convert_examples_to_features from a subclass instance of :py:class:`~nsp.masking.Masking`,
        convert the features into a TensorDataset

        :return: the features represented as a TensorDataset should always consist of the following:
                 (subclasses may overwrite how it is collected but should not change the available info)
                 This should only be overwritten by dataset_meta.py

                 1. input ids
                 2. input mask
                 3. segment id
                 4. gen label id
                 5. cls label id
                 6. token label id
                 7. example index
        """
        # pylint: disable=not-callable
        # pylint: disable=no-member
        all_input_ids = torch.tensor([f.input_ids for f in self.features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in self.features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in self.features], dtype=torch.long)
        all_gen_label_ids = torch.tensor([f.gen_label_ids for f in self.features], dtype=torch.long)
        all_classify_ids_cls = torch.tensor([f.classify_id_cls for f in self.features])
        all_classify_ids_tokens = torch.tensor([f.classify_id_tokens for f in self.features])
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        data_set = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_gen_label_ids,
                                 all_classify_ids_cls, all_classify_ids_tokens, all_example_index)
        return data_set

    def token_classification_prediction(self, example, logits, current_input_ids):
        """
        For token classification, we return the most likely classification.
        Other datasets might handle this differently, e.g. dataset_mmkg.py

        :param example:
        :param logits:
        :param current_input_ids:
        :return:
        """
        def predict_token_classification(logits):
            """
            Given a data set, index for current example and logits for classification on several tokens,
            output the most likely classification label

            :param data_handler: an instance of a subclass of :py:class:`DatasetHandler`
            :param logits: logits for each token to be classified, each with dimension
                           data_handler.num_labels_tok
            :param example_index: index of the current example, example can be accessed via
                                  data_handler.examples[example_index]
            :return: a list of most likely class label
            """
            classified_tokens = []
            for i, _ in enumerate(logits):
                max_vocab_index = np.argmax(logits[i])
                classified_tokens.append(max_vocab_index)
            return classified_tokens

        classified_tokens_indices = predict_token_classification(logits)
        return self.arrange_token_classify_output(example, classified_tokens_indices,
                        current_input_ids)

    class BitextExample(GenExample):
        """
        A single training/test example from BitextHandler: the raw text of both parts,
        their tokenized versions, the gold [CLS] class ids and bookkeeping indices.
        """
        # pylint: disable=too-few-public-methods
        def __init__(self, example_index, part_a, part_b, classify_id_cls=None, dataset_index=None):
            """
            :param example_index: index of this example
            :param part_a: raw sequence surface in part_a
            :param part_b: raw sequence surface in part_b
            :param classify_id_cls: list of gold class ids or None (then stored as an empty list)
            :param dataset_index: datahandler index assigned by the meta datahandler
            :raises ValueError: if classify_id_cls is neither None nor a list
            """
            super().__init__()
            self._example_index = example_index
            self._dataset_index = dataset_index
            # raw sequence surfaces
            self._part_a, self._part_b = part_a, part_b
            # tokenized sequences, filled in later
            self._part_a_tokens, self._part_b_tokens = [], []
            # goes through the validating property setter
            self.classify_id_cls = classify_id_cls

        @property
        def example_index(self):
            """
            The example index.

            :getter: Returns example index
            :setter: Sets example index
            :type: int
            """
            return self._example_index

        @example_index.setter
        def example_index(self, example_index):
            self._example_index = example_index

        @property
        def dataset_index(self):
            """
            The datahandler index (assigned by the meta datahandler).

            :getter: Returns datahandler index
            :setter: Sets datahandler index
            :type: int
            """
            return self._dataset_index

        @dataset_index.setter
        def dataset_index(self, dataset_index):
            self._dataset_index = dataset_index

        @property
        def part_a(self):
            """
            The raw sequence surface in part_a.

            :getter: Returns part_a
            :setter: Sets part_a
            :type: string
            """
            return self._part_a

        @part_a.setter
        def part_a(self, surface):
            self._part_a = surface

        @property
        def part_b(self):
            """
            The raw sequence surface in part_b.

            :getter: Returns part_b
            :setter: Sets part_b
            :type: string
            """
            return self._part_b

        @part_b.setter
        def part_b(self, surface):
            self._part_b = surface

        @property
        def part_a_tokens(self):
            """
            The tokenized sequence in part_a.

            :getter: Returns tokenized part_a
            :setter: Sets tokenized part_a
            :type: list
            """
            return self._part_a_tokens

        @part_a_tokens.setter
        def part_a_tokens(self, tokens):
            self._part_a_tokens = tokens

        @property
        def part_b_tokens(self):
            """
            The tokenized sequence in part_b.

            :getter: Returns tokenized part_b
            :setter: Sets tokenized part_b
            :type: list
            """
            return self._part_b_tokens

        @part_b_tokens.setter
        def part_b_tokens(self, tokens):
            self._part_b_tokens = tokens

        @property
        def classify_id_cls(self):
            """
            The classification ids.

            :getter: Returns classification ids
            :setter: Sets classification ids; ``None`` is stored as an empty list,
                     anything that is neither ``None`` nor a list raises ValueError
            :type: list
            """
            return self._classify_id_cls

        @classify_id_cls.setter
        def classify_id_cls(self, classify_id_cls):
            if classify_id_cls is None:
                # should be [-1] * self.plus_classify_sequence
                self._classify_id_cls = []
                return
            if not isinstance(classify_id_cls, list):
                raise ValueError('BitextExample: Invalid classify_id_cls format.'
                                 ' need to be a list of length = plus_classify_sequence.')
            self._classify_id_cls = classify_id_cls

