import argparse
import glob
import logging as log
import os
import random
import time
import torch.nn.functional as F
import json
import numpy as np
import torch
from eval_utils import f1_score, precision_score, recall_score, classification_report, macro_score
from utils import gen_knn_mix_batch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
import pickle
from transformers import *
from read_data import *

from tensorboardX import SummaryWriter

from bert_models import BertModel4Mix

logger = log.getLogger(__name__)

# NOTE: CUDA is deliberately NOT probed here. torch.cuda.is_available()
# initialises the CUDA runtime, and CUDA_VISIBLE_DEVICES is only honoured when
# it is set before that first initialisation. The --gpu value is exported to
# the environment after argument parsing further down, and `use_cuda` is
# assigned right after that export.

# Classes registered per model family; judging by the class names the tuple is
# (config, token-classification model, tokenizer). Presumably looked up with
# the --model-type value -- confirm against the caller.
MODEL_CLASSES = {"bert": (BertConfig, BertForTokenClassification, BertTokenizer)}

parser = argparse.ArgumentParser(description="PyTorch BaseNER")

# --- Data, model and hardware selection ---------------------------------
# NOTE: --data-dir is declared required, so its default is never used.
parser.add_argument("--data-dir", type=str, default="./data", required=True)
parser.add_argument("--model-type", type=str, default="bert")
parser.add_argument("--model-name", type=str, default="bert-base-multilingual-cased")
parser.add_argument("--output-dir", type=str, default="./german_eval")
parser.add_argument(
    "--gpu",
    type=str,
    default="0,1,2,3",
    help="id(s) for CUDA_VISIBLE_DEVICES",
)
parser.add_argument("--train-examples", type=int, default=-1)

parser.add_argument("--labels", type=str, default="")
parser.add_argument("--config-name", type=str, default="")
parser.add_argument("--tokenizer-name", type=str, default="")
parser.add_argument("--max-seq-length", type=int, default=128)

# --- Run-mode switches (all default to False) ---------------------------
parser.add_argument(
    "--do-train",
    action="store_true",
    help="Whether to run training.",
)
parser.add_argument(
    "--do-eval",
    action="store_true",
    help="Whether to run eval on the dev set.",
)
parser.add_argument(
    "--do-predict",
    action="store_true",
    help="Whether to run predictions on the test set.",
)
parser.add_argument(
    "--evaluate-during-training",
    action="store_true",
    help="Whether to run evaluation during training at each logging step.",
)
parser.add_argument(
    "--do-lower-case",
    action="store_true",
    help="Set this flag if you are using an uncased model.",
)

# --- Batch sizes ---------------------------------------------------------
parser.add_argument("--batch-size", type=int, default=16)
parser.add_argument("--eval-batch-size", type=int, default=128)

parser.add_argument(
    "--gradient-accumulation-steps",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)

# --- Optimisation hyper-parameters --------------------------------------
parser.add_argument(
    "--learning-rate",
    type=float,
    default=5e-5,
    help="The initial learning rate for Adam.",
)
parser.add_argument(
    "--weight-decay",
    type=float,
    default=0.0,
    help="Weight decay if we apply some.",
)
parser.add_argument(
    "--adam-epsilon",
    type=float,
    default=1e-8,
    help="Epsilon for Adam optimizer.",
)
parser.add_argument(
    "--max-grad-norm",
    type=float,
    default=1.0,
    help="Max gradient norm.",
)

# --- Training schedule ---------------------------------------------------
parser.add_argument(
    "--num-train-epochs",
    type=float,
    default=20,
    help="Total number of training epochs to perform.",
)
parser.add_argument(
    "--max-steps",
    type=int,
    default=-1,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument(
    "--warmup-steps",
    type=int,
    default=0,
    help="Linear warmup over warmup_steps.",
)

# --- Logging and checkpointing ------------------------------------------
parser.add_argument(
    "--logging-steps",
    type=int,
    default=150,
    help="Log every X updates steps.",
)
parser.add_argument(
    "--save-steps",
    type=int,
    default=0,
    help="Save checkpoint every X updates steps.",
)
parser.add_argument(
    "--eval-all-checkpoints",
    action="store_true",
    help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
)
parser.add_argument(
    "--overwrite-output-dir",
    action="store_true",
    help="Overwrite the content of the output directory",
)

parser.add_argument(
    "--seed",
    type=int,
    default=42,
    help="random seed for initialization",
)

# --- Sub-token / special-token labelling --------------------------------
parser.add_argument(
    "--pad-subtoken-with-real-label",
    action="store_true",
    help="give real label to the padded token instead of `-100` ",
)
parser.add_argument(
    "--subtoken-label-type",
    type=str,
    default="real",
    help="[real|repeat|O] three ways to do pad subtoken with real label. [real] give the subtoken a real label e.g., B -> B I. [repeat] simply repeat the label e.g., B -> B B. [O] give it a O label. B -> B O",
)

parser.add_argument(
    "--eval-pad-subtoken-with-first-subtoken-only",
    action="store_true",
    help="only works when --pad-subtoken-with-real-label is true, in this mode, we only test the prediction of the first subtoken of each word (if the word could be tokenized into multiple subtoken)",
)
parser.add_argument(
    "--label-sep-cls",
    action="store_true",
    help="label [SEP] [CLS] with special labels, but not [PAD]",
)

# --- Result file, optimizer choice and special-label weighting ----------
parser.add_argument(
    "--log-file",
    type=str,
    default="results.csv",
    help="the file to store resutls",
)

parser.add_argument(
    "--optimizer",
    type=str,
    default="adam",
    help="optimizer",
)
parser.add_argument(
    "--special-label-weight",
    type=float,
    default=0,
    help="the special_label_weight in training . default 0",
)

# intra-mix (no options are currently registered under this heading)

# Parsed once at import time; the rest of the module reads the global `args`.
args = parser.parse_args()

# Restrict the GPUs visible to CUDA to the ids given on the command line.
# This only takes effect if it happens before CUDA is first initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

# Pick the compute device once and publish it, together with the visible GPU
# count, on the shared argument namespace.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
args.device = device
args.n_gpu = torch.cuda.device_count()
print("gpu num: ", args.n_gpu)

# Module-level F1 tracker, initialised to zero.
best_f1 = 0


class TextFlintExample:
    """Plain container pairing a word sequence with its label sequence.

    The two sequences are exposed as the attributes ``x`` (words) and
    ``y`` (labels).
    """

    def __init__(self, words, labels):
        """Store both sequences by reference (no copy is made).

        Args:
            words: list of the words of the sequence; stored as ``x``.
            labels: list of labels for the sequence; stored as ``y``.
        """
        self.x, self.y = words, labels


def set_seed(args):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        args: namespace providing ``seed`` (the seed value) and ``gpu``
            (a sized value; when it is non-empty the CUDA generators are
            seeded as well).
    """
    seed = args.seed
    logger.info("random seed %s", seed)

    # CPU-side generators, seeded in the same order every time.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do when no GPU ids were supplied.
    if len(args.gpu) == 0:
        return
    torch.cuda.manual_seed_all(seed)

def count_rule_based_aug(args, tokenizer, labels, pad_token_label_id, mode,  
              omit_sep_cls_token=False,
              pad_subtoken_with_real_label=False):

    examples = read_examples_from_file_excel("data/conll2003",mode)  #examples = read_examples_from_file_excel("data/conll2003",mode)
    augexamples=[]
    eligibleexamples=[]
    totalexamplelist=copy.deepcopy(examples)
    examples_for_rule_based_aug_training=copy.deepcopy(examples)
    
    if mode =='train':

    
        
        
        import pickle 
        

       
        
        #Count all eligible examples
        num_of_aug_examples_to_generate_for_aug_type_1=10000000000000000000000000000
        num_of_aug_examples_to_generate_for_aug_type_3=10000000000000000000000000000
        num_of_aug_examples_to_generate_for_aug_type_4=10000000000000000000000000000
        augruniter=0
       
        num_of_aug_examples_generated=0
        

         
        #Pattern 1
    
        data4 = pd.read_excel ('data/conll2003/Sports.xlsx') 
        sportnames=list(data4['Names'])
        data = pd.read_excel ('data/conll2003/ORG Entity Phrases for Hold Out Experiments.xlsx') 
        df=data['Pattern1']
        placement=list(data['Placement'])
        phrasedomains=list(data['Domain'])
        compatibility=list(data['Compatibility'])
        labelsdf=list(data['Labels'])
        phrases=[]
        labels2=[]
        dfindex=0
        exampleindex=0
        while dfindex< len(df.index):
            phrases.append(df[dfindex].split(","))
            dfindex=dfindex+1

        for label in labelsdf:
            labels2.append(label.replace(" ", "").split("|"))


        examplesguids=[]
        selectedwords=[]
        exampleguidsalreadyaugmented=[]
        iccands=[]
        iccandbool=False

        i=0
        selectedword=''
        pattern1orgtransitioncounter=0
    

        for ex in examples:
        
            e=random.choice(examples)
            for word,label in zip(e.words,e.labels):
                if 'B-PER' in label or 'B-LOC' in label and 'B-MISC' not in label:
                    iccandbool=True
                    selectedword=word            
                
                elif iccandbool==True and label=='O' and i==1:
                    
                    if e.guid not in examplesguids:

                        examplesguids.append(e.guid)
                        selectedwords.append(selectedword)
                

                    break 
                
                if iccandbool==True:                
                    i=i+1        
            pattern1orgtransitioncounter=pattern1orgtransitioncounter+1
            iccandbool=False
            i=0



        index=0
        
        for ex in examples:
            
            if ex.guid in examplesguids and ex.guid not in exampleguidsalreadyaugmented:
            
                for guid2,selword in zip(examplesguids,selectedwords):
                    if ex.guid==guid2:
                        
                    

                        randomindex=random.randint(0,len(phrases)-1)
                    
                        eligibleexamples.append(examples[index])
                        if placement[randomindex]=='before':
                            currentexample=copy.deepcopy(examples[index])
                            if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                    if currentexample.words.index(selword)-1==0:
                                        currentexample.words[currentexample.words.index(selword)-1]="At"
                                    else:
                                        currentexample.words[currentexample.words.index(selword)-1]="at"
                                        o=0
                                    o=0
                            if currentexample.words.index(selword)==0:
                                currentexample.words.insert(currentexample.words.index(selword),"The")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                            else:
                                currentexample.words.insert(currentexample.words.index(selword),"the")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                            
                        
                            #Sport phrase code
                            if  "<x>" in phrases[randomindex]:
                                exwords = [word.lower() for word in currentexample.words]
                                insports=False
                                
                                for sport in sportnames:                               
                                    if sport.lower() in exwords :
                                        insports=True
                                        sportphrase1=phrases[randomindex]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=sportphrase.index("<x>")
                                        
                                        sportphrase[pindex]=sport
                                        
                                    
                                        break
                                if insports==False:
                                    randomsportindex=random.randint(0,len(sportnames)-1) 
                                    sportphrase1=phrases[randomindex]
                                    sportphrase=copy.deepcopy(sportphrase1)
                                    pindex=phrases[randomindex].index("<x>")
                                    sportphrase[pindex]=sportnames[randomsportindex]

                                for word,label in zip(sportphrase,labels2[randomindex]):
                                

                                    currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                    currentexample.words.insert(currentexample.words.index(selword),word)
                                    currentexample.labels.insert(currentexample.words.index(selword),label)
                                firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex])
                                currentexample.labels[firstwordindex]='B-ORG'
                                r=0
                            #Regular insertion 
                            else: 
                            
                                for word,label in zip(phrases[randomindex],labels2[randomindex]):
                                    

                                    currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                    currentexample.words.insert(currentexample.words.index(selword),word)
                                    currentexample.labels.insert(currentexample.words.index(selword),label)
                                firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex])
                                currentexample.labels[firstwordindex]='B-ORG'
                            
                                

                        elif placement[randomindex]=='after':
                            currentexample=copy.deepcopy(examples[index])
                            if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                    if currentexample.words.index(selword)-1==0:
                                        currentexample.words[currentexample.words.index(selword)-1]="At"
                                    else:
                                        currentexample.words[currentexample.words.index(selword)-1]="at"
                            if currentexample.words.index(selword)==0:
                                currentexample.words.insert(currentexample.words.index(selword),"The")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                            else:
                                currentexample.words.insert(currentexample.words.index(selword),"the")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")


                            #Sport phrase code
                            if  "<x>" in phrases[randomindex]:
                                exwords = [word.lower() for word in currentexample.words]
                                insports=False
                                
                                for sport in sportnames:                               
                                    if sport.lower() in exwords :
                                        insports=True
                                        sportphrase1=phrases[randomindex]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=sportphrase.index("<x>")
                                        
                                        sportphrase[pindex]=sport
                            
                                    
                                        break
                                if insports==False:
                                    randomsportindex=random.randint(0,len(sportnames)-1) 
                                    sportphrase1=phrases[randomindex]
                                    sportphrase=copy.deepcopy(sportphrase1)
                                    pindex=phrases[randomindex].index("<x>")
                                    sportphrase[pindex]=sportnames[randomsportindex]

                                for word,label in zip(sportphrase,labels2[randomindex]):
                                

                                    currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                    currentexample.words.insert(currentexample.words.index(selword),word)
                                    currentexample.labels.insert(currentexample.words.index(selword),label)
                                firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex])
                                currentexample.labels[firstwordindex]='B-ORG'
                                

                            #Regular insertion 
                            else:                        
                    
                            
                                #Start inserting words
                                k=1
                                for word,label in zip(phrases[randomindex],labels2[randomindex]):
                                    currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                    currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                    currentexample.labels[currentexample.words.index(selword)]='B-ORG'
                                    k=k+1
                        augexamples.append(currentexample)
                        
                        exampleguidsalreadyaugmented.append(ex.guid)
                        num_of_aug_examples_generated=num_of_aug_examples_generated+1
                        break
                        
            if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_1:
                num_of_aug_examples_generated=0
                break
                        

            index=index+1
            
           
    
   
        
        dataloc = pd.read_excel ('data/conll2003/Transition To LOC.xlsx') 
        dfloc=list(dataloc['Pattern1'])
        dataloc2 = pd.read_excel ('data/conll2003/LOC Context Phrases.xlsx') 
        phrasesdfloc2=list(dataloc2['Pattern2'])
        labelsloc2=list(dataloc2['EntityType'])
        
        loclabels=[]
        phrasesloc=[]
        phrasesloc2=[]
        for locword in phrasesdfloc2:
            phrasesloc2.append(locword.split("|"))

        for label in labelsloc2:
            loclabels.append(label.replace(" ", "").split("|"))     
        
        for loc in dfloc:
            phrasesloc.append(loc.split(","))

        placementloc=list(dataloc['Placement'])

        labels3=[]
        labelset=[]
        for phrase, place in zip(phrasesloc, placementloc):
            if place=='before':
                first=True
                for word in phrase:
                    if first==True:
                        labelset.append('B-ORG')
                    else:
                        labelset.append('I-ORG')
            elif place=='after':
                
                for word in phrase:
                    labelset.append('I-ORG')
            labels3.append(labelset)
            labelset=[]
        index=0
        toloccands=[]
        iccandbool=False
        toloccounter=0
        selectedword=''
        pattern1loctransitioncounter=0
        #while  pattern1loctransitioncounter <30:
            
        for e in examples:
            if toloccounter==70:
                break

            if e.guid not in exampleguidsalreadyaugmented:
                for phrase,placementlociter in zip(phrasesloc,placementloc):
                    
                    if(set(phrase).issubset(set( e.words))):
                        
                        if placementlociter=='after':
                            
                            entityindex=e.words.index(phrase[-1])+1

                            if e.labels[entityindex]=='I-ORG':
                                #match e guid with example guid
                                index=0
                                
                                
                                for ex in examples:
                                    if ex.guid==e.guid:
                                        eligibleexamples.append(examples[index])
                                        currentexample=copy.deepcopy(examples[index])
                                        
                                        currentexample.labels[entityindex]='B-LOC'
                                        k=1
                                        
                                        for word in phrase:
                                            
                                            currentexample.words.pop(entityindex-k)
                                            currentexample.labels.pop(entityindex-k)
                                            
                                            k=k+1
                                        
                                        if currentexample.labels[currentexample.labels.index('B-LOC')+1]=='I-ORG':
                                            currentexample.labels[currentexample.labels.index('B-LOC')+1]='I-LOC'
                                        
                                        #pattern 2
                                        randomindex=random.randint(0,len(phrasesdfloc2)-1)
                                        selword=currentexample.labels.index('B-LOC')

                                        k=1
                                        
                                        for word,label in zip(phrasesloc2[randomindex],loclabels[randomindex]):
                                            currentexample.words.insert(selword+k,word)
                                            currentexample.labels.insert(selword+k,label)
                                            
                                            k=k+1

                                        augexamples.append(currentexample)
                                        #num_of_aug_examples_generated=num_of_aug_examples_generated+1
                                        exampleguidsalreadyaugmented.append(ex.guid)
                                        toloccounter=toloccounter+1
                                        break
                                        
                                        
                                    index=index+1
                            else:
                                break

                        elif placementlociter=='before':
                            entityindex=e.words.index(phrase[0])-1
                            if e.labels[entityindex]=='B-ORG':
                                #match e guid with example guid
                                index=0
                                
                                
                                for ex in examples:
                                    if ex.guid==e.guid:
                                        eligibleexamples.append(examples[index])
                                        currentexample=copy.deepcopy(examples[index])
                                        
                                        currentexample.labels[entityindex]='B-LOC'
                                        
                                        for word in phrase:
                                            
                                            currentexample.words.pop(entityindex+1)
                                            currentexample.labels.pop(entityindex+1)
                                        
                                        if currentexample.labels[currentexample.labels.index('B-LOC')+1]=='I-ORG':
                                            currentexample.labels[currentexample.labels.index('B-LOC')+1]='I-LOC'
                                        
                                        #pattern 2
                                        randomindex=random.randint(0,len(phrasesdfloc2)-1)
                                        selword=currentexample.labels.index('B-LOC')

                                        k=1
                                        
                                        for word,label in zip(phrasesloc2[randomindex],loclabels[randomindex]):
                                            currentexample.words.insert(selword+k,word)
                                            currentexample.labels.insert(selword+k,label)
                                            
                                            k=k+1

                                        augexamples.append(currentexample)
                                        
                                        exampleguidsalreadyaugmented.append(ex.guid)
                                        #num_of_aug_examples_generated=num_of_aug_examples_generated+1
                                        toloccounter=toloccounter+1
                                        break
                                        
                                        
                                    index=index+1

                            else:
                                break


          

                    if e.guid in exampleguidsalreadyaugmented:
                        break
                        
     

                            
      
        dataPER = pd.read_excel ('data/conll2003/PER Context Phrases for Hold Out Experiments.xlsx') 
        dataPER2 = pd.read_excel ('data/conll2003/PER Headline Phrases for Hold Out Experiments.xlsx') 
        dataORG = pd.read_excel ('data/conll2003/ORG Context Phrases for Hold Out Experiments.xlsx') 
        dfPER2=list(dataPER2['Pattern2'])
        labelsPER2=list(dataPER2['EntityType'])
        compatibility2=list(dataORG['Compatibility'])

        dataNames = pd.read_excel ('data/conll2003/Names.xlsx') 
        dfNames=list(dataNames['Names'])
        dfPER=list(dataPER['Pattern2'])
        perphrasedomains=list(dataPER['Domain'])
        orgphrasedomains=list(dataORG['Domain'])
        dfORG=list(dataORG['Pattern2'])
        labelsPER=list(dataPER['EntityType'])
        labelsORG=list(dataORG['EntityType'])
        perphrases=[] 
        orgphrases=[]
        perlabels=[]  
        orglabels=[]
        perphrases2=[] 
        perlabels2=[] 


        for per2 in dfPER2:
            perphrases2.append(per2.split("|"))

        for perlabel2 in labelsPER2:
            perlabels2.append(perlabel2.replace(" ", "").split("|"))

        for per in dfPER:
            perphrases.append(per.split("|"))
        
        for org in dfORG:
            orgphrases.append(org.split("|"))

        for label in labelsPER:
            perlabels.append(label.replace(" ", "").split("|"))
        for label in labelsORG:
            orglabels.append(label.replace(" ", "").split("|"))

        percands=[]
        orgcands=[]
        iccandbool=False
        i=0
        selectedword=''
        numofexamples=0
        pertransitioncounter=0
        orgtransitioncounter=0
        originallabel=''
        examplesguids=[]
        selectedwords=[]
        #while  pertransitioncounter <30:
        for ex in examples:
            #if len(examplesguids)==400:
            #    break
            e=random.choice(examples)
            if  e.guid not in exampleguidsalreadyaugmented:

                for word,label in zip(e.words,e.labels):
                                
                    nextwordinddex=e.words.index(word)+1
                    if nextwordinddex<len(e.words):
                        w=e.words[nextwordinddex]
                    else:
                        w=''

                    if 'B-ORG' in label or 'B-LOC' in label  and 'B-MISC' not in label and ')' not in w and '.' not in word:
                        iccandbool=True
                        selectedword=word
                        originallabel=label
                    elif iccandbool==True and label=='O' and i==1:
                        if e.guid not in examplesguids:
                        
                            examplesguids.append(e.guid)
                            selectedwords.append(selectedword)

                        break 
                    
                    if iccandbool==True:                
                        i=i+1        
                
                iccandbool=False
                i=0
                pertransitioncounter=pertransitioncounter+1
        index=0
        for ex in examples:
                
                if ex.guid in examplesguids and ex.guid not in exampleguidsalreadyaugmented:
                
                    for guid2,selword in zip(examplesguids,selectedwords):
                        if ex.guid==guid2:
                                            
                            eligibleexamples.append(examples[index])
                            currentexample=copy.deepcopy(examples[index])
                            numbercount = sum(entry.isdigit() for entry in currentexample.words)
                            if len(currentexample.words)<4 and 'AT' not in currentexample.words or numbercount>3:
                                randomindex=random.randint(0,len(dfPER2)-1)
                                k=0
                                for word,label in zip(perphrases2[randomindex],perlabels2[randomindex]):
                                    currentexample.words.insert(0+k,word)
                                    currentexample.labels.insert(0+k,label)
                                    currentexample.labels[currentexample.words.index(selword)]='B-PER'
                                    k=k+1
                            else:
                                randomindex=random.randint(0,len(dfPER)-1)        
                                k=1
                                for word,label in zip(perphrases[randomindex],perlabels[randomindex]):
                                    currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                    currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                    currentexample.labels[currentexample.words.index(selword)]='B-PER'
                                    k=k+1

                            #Pattern 1
                            randomindex3=random.randint(0,len(dfNames)-1)
                            currentexample.words.insert(currentexample.words.index(selword)+1,dfNames[randomindex3])
                            currentexample.labels.insert(currentexample.words.index(selword)+1,'I-PER')
                            augexamples.append(currentexample)
                            exampleguidsalreadyaugmented.append(currentexample.guid)
                            num_of_aug_examples_generated=num_of_aug_examples_generated+1
                            
                            break

                index=index+1
                if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_3:
                    num_of_aug_examples_generated=0
                    break
        examplesguids=[]    
        selectedwords=[]

        #if num_of_aug_examples_generated<num_of_aug_examples_to_generate:
            
        
        for ex in examples:
            #if len(examplesguids)==400:
            #     break
            e=random.choice(examples)
            if  e.guid not in exampleguidsalreadyaugmented:
                for word,label in zip(e.words,e.labels):
                    if 'B-PER' in label or 'B-LOC' in label  and 'B-MISC' not in label:
                        iccandbool=True
                        selectedword=word
                        originallabel=label
                    elif iccandbool==True and label=='O' and i==1:
                    
                        if e.guid not in examplesguids:
                            examplesguids.append(e.guid)
                            selectedwords.append(selectedword)

                        break 
                    
                    if iccandbool==True:                
                        i=i+1        
                
                iccandbool=False
                i=0
                orgtransitioncounter=orgtransitioncounter+1

        j=0
        index=0
        for ex in examples:
                
                if ex.guid in examplesguids and ex.guid not in exampleguidsalreadyaugmented:
                    
                    for guid2,selword in zip(examplesguids,selectedwords):
                        if ex.guid==guid2:
                        

                            eligibleexamples.append(examples[index])
                            currentexample=copy.deepcopy(examples[index])
                                                        
                            randomindex2=random.randint(0,len(phrases)-1)
                            randomindex=random.randint(0,len(orgphrases)-1)

                                    
                            k=1
                            for word,label in zip(orgphrases[randomindex],orglabels[randomindex]):
                                currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                currentexample.labels[currentexample.words.index(selword)]='B-ORG'
                                k=k+1
                      
                            
                            if placement[randomindex2]=='before':

                                if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                    if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                        if currentexample.words.index(selword)-1==0:
                                            currentexample.words[currentexample.words.index(selword)-1]="At"
                                        else:
                                            currentexample.words[currentexample.words.index(selword)-1]="at"
                                if currentexample.words.index(selword)==0:
                                    currentexample.words.insert(currentexample.words.index(selword),"The")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                                else:
                                    currentexample.words.insert(currentexample.words.index(selword),"the")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")



                                
                                #Sport phrase code
                                if  "<x>" in phrases[randomindex2]:
                                    exwords = [word.lower() for word in currentexample.words]
                                    insports=False
                                    
                                    for sport in sportnames:                               
                                        if sport.lower() in exwords :
                                            insports=True
                                            sportphrase1=phrases[randomindex2]
                                            sportphrase=copy.deepcopy(sportphrase1)
                                            pindex=sportphrase.index("<x>")
                                            
                                            sportphrase[pindex]=sport
                                    
                                        
                                            break
                                    if insports==False:
                                        randomsportindex=random.randint(0,len(sportnames)-1) 
                                        sportphrase1=phrases[randomindex2]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=phrases[randomindex2].index("<x>")
                                        sportphrase[pindex]=sportnames[randomsportindex]

                                    for word,label in zip(sportphrase,labels2[randomindex2]):
                                    

                                        currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                        currentexample.words.insert(currentexample.words.index(selword),word)
                                        currentexample.labels.insert(currentexample.words.index(selword),label)
                                    firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex2])
                                    currentexample.labels[firstwordindex]='B-ORG'
                                    

                                #Regular insertion 
                                else:
                                        
                                
                                    for word,label in zip(phrases[randomindex2],labels2[randomindex2]):
                                        currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                        currentexample.words.insert(currentexample.words.index(selword),word)
                                        currentexample.labels.insert(currentexample.words.index(selword),label)
                                    firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex2])
                                    currentexample.labels[firstwordindex]='B-ORG'


                            elif placement[randomindex2]=='after':
                                if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                    if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                        if currentexample.words.index(selword)-1==0:
                                            currentexample.words[currentexample.words.index(selword)-1]="At"
                                        else:
                                            currentexample.words[currentexample.words.index(selword)-1]="at"
                                if currentexample.words.index(selword)==0:
                                    currentexample.words.insert(currentexample.words.index(selword),"The")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                                else:
                                    currentexample.words.insert(currentexample.words.index(selword),"the")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")


                                
            
                                #Sport phrase code
                                if  "<x>" in phrases[randomindex2]:
                                    exwords = [word.lower() for word in currentexample.words]
                                    insports=False
                            
                                    for sport in sportnames:                               
                                        if sport.lower() in exwords :
                                            insports=True
                                            sportphrase1=phrases[randomindex2]
                                            sportphrase=copy.deepcopy(sportphrase1)
                                            pindex=sportphrase.index("<x>")
                                            
                                            sportphrase[pindex]=sport
                                        
                                        
                                            break
                                    if insports==False:
                                        randomsportindex=random.randint(0,len(sportnames)-1) 
                                        sportphrase1=phrases[randomindex2]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=phrases[randomindex2].index("<x>")
                                        sportphrase[pindex]=sportnames[randomsportindex]

                                    for word,label in zip(sportphrase,labels2[randomindex2]):
                                    

                                        currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                        currentexample.words.insert(currentexample.words.index(selword),word)
                                        currentexample.labels.insert(currentexample.words.index(selword),label)
                                    firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex2])
                                    currentexample.labels[firstwordindex]='B-ORG'
                                    

                                #Regular insertion 
                                else:
                        
                                    k=1
                                    for word,label in zip(phrases[randomindex2],labels2[randomindex2]):
                                        currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                        currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                        currentexample.labels[currentexample.words.index(selword)]='B-ORG'
                                        k=k+1
                                        
                            exampleguidsalreadyaugmented.append(currentexample.guid)
                            augexamples.append(currentexample)
                            num_of_aug_examples_generated=num_of_aug_examples_generated+1
                            

                            break

                index=index+1
                
                if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_4:
                    num_of_aug_examples_generated=0
                    break
        logger.info("Number of eligible examples: %s", len(eligibleexamples))
        print("Number of augmented examples",len(augexamples))


        from itertools import islice
  
        # Input list initialization
        Input = 5523   #eligibleexamples 3x1381+1,380
        examplespertrans=len(augexamples)/2
        examplespertrans=round(examplespertrans)
        # list of length in which we have to split
        length_to_split = [examplespertrans, examplespertrans]
        
        # Using islice
        Inputt = iter(Input)
        Output = [list(islice(Inputt, elem))
                for elem in length_to_split]
        
        # Printing Output
        print("Initial list is:", Input)
        print("Split length list: ", length_to_split)
        print("List after splitting", Output)





       
        for eligibleexample in eligibleexamples:
            eligibleTF={}
            eligibleTF['x']=eligibleexample.words
            eligibleTF['y']=eligibleexample.labels
            with open("textflint3.json", "a") as p:
                json.dump(eligibleTF, p)
                p.write("\n")
            #eligibleTF=TextFlintExample(eligibleexample.words,eligibleexample.labels)

            #examplesforTextFlintAugmentation.append(eligibleTF)
        #jsonStr = json.dumps(examplesforTextFlintAugmentation)
        


        #file_name="eligibleexamples100percent.pkl"
        #file_path = os.path.join(args.data_dir, file_name)
        
        #open_file = open(file_path, "wb")
        #pickle.dump(augexamples, open_file)
        #open_file.close()
        return len(augexamples)

def read_data_rule_based_aug(args,amount, percentagename,tokenizer, labels, pad_token_label_id, mode,  
              omit_sep_cls_token=False,
              pad_subtoken_with_real_label=False):

    examples = read_examples_from_file_excel("data/conll2003",mode)  #examples = read_examples_from_file_excel("data/conll2003",mode)
    augexamples=[]
    totalexamplelist=copy.deepcopy(examples)
    examples_for_rule_based_aug_training=copy.deepcopy(examples)
    
    if mode =='train':

    
        
        
        import pickle 
        

        
       

        num_of_aug_examples_to_generate_minus_loc_transition=amount-70
        num_of_aug_examples_to_generate=num_of_aug_examples_to_generate_minus_loc_transition/3
        num_of_aug_examples_to_generate=round(num_of_aug_examples_to_generate)

        num_of_aug_examples_to_generate_for_aug_type_1=num_of_aug_examples_to_generate
        num_of_aug_examples_to_generate_for_aug_type_3=num_of_aug_examples_to_generate
        num_of_aug_examples_to_generate_for_aug_type_4=num_of_aug_examples_to_generate

        augruniter=0
       
        num_of_aug_examples_generated=0
        

         
        #Pattern 1
    
        data4 = pd.read_excel ('data/conll2003/Sports.xlsx') 
        sportnames=list(data4['Names'])
        data = pd.read_excel ('data/conll2003/ORG Entity Phrases for Hold Out Experiments.xlsx') 
        df=data['Pattern1']
        placement=list(data['Placement'])
        phrasedomains=list(data['Domain'])
        compatibility=list(data['Compatibility'])
        labelsdf=list(data['Labels'])
        phrases=[]
        labels2=[]
        dfindex=0
        exampleindex=0
        while dfindex< len(df.index):
            phrases.append(df[dfindex].split(","))
            dfindex=dfindex+1

        for label in labelsdf:
            labels2.append(label.replace(" ", "").split("|"))


        examplesguids=[]
        selectedwords=[]
        exampleguidsalreadyaugmented=[]
        iccands=[]
        iccandbool=False

        i=0
        selectedword=''
        pattern1orgtransitioncounter=0
    

        for ex in examples:
      
            e=random.choice(examples)
            for word,label in zip(e.words,e.labels):
                if 'B-PER' in label or 'B-LOC' in label and 'B-MISC' not in label:
                    iccandbool=True
                    selectedword=word            
                
                elif iccandbool==True and label=='O' and i==1:
                    
                    if e.guid not in examplesguids:

                        examplesguids.append(e.guid)
                        selectedwords.append(selectedword)
                

                    break 
                
                if iccandbool==True:                
                    i=i+1        
            pattern1orgtransitioncounter=pattern1orgtransitioncounter+1
            iccandbool=False
            i=0


        #Code for putting data into examples instead of cand list of tuples
        index=0
        
        for ex in examples:
            
            if ex.guid in examplesguids and ex.guid not in exampleguidsalreadyaugmented:
            
                for guid2,selword in zip(examplesguids,selectedwords):
                    if ex.guid==guid2:
                        
                    

                        randomindex=random.randint(0,len(phrases)-1)
                    
                        
                        if placement[randomindex]=='before':
                            currentexample=copy.deepcopy(examples[index])
                            if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                    if currentexample.words.index(selword)-1==0:
                                        currentexample.words[currentexample.words.index(selword)-1]="At"
                                    else:
                                        currentexample.words[currentexample.words.index(selword)-1]="at"
                                        o=0
                                    o=0
                            if currentexample.words.index(selword)==0:
                                currentexample.words.insert(currentexample.words.index(selword),"The")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                            else:
                                currentexample.words.insert(currentexample.words.index(selword),"the")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                            
                        
                            #Sport phrase code
                            if  "<x>" in phrases[randomindex]:
                                exwords = [word.lower() for word in currentexample.words]
                                insports=False
                                
                                for sport in sportnames:                               
                                    if sport.lower() in exwords :
                                        insports=True
                                        sportphrase1=phrases[randomindex]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=sportphrase.index("<x>")
                                        
                                        sportphrase[pindex]=sport
                                        
                                    
                                        break
                                if insports==False:
                                    randomsportindex=random.randint(0,len(sportnames)-1) 
                                    sportphrase1=phrases[randomindex]
                                    sportphrase=copy.deepcopy(sportphrase1)
                                    pindex=phrases[randomindex].index("<x>")
                                    sportphrase[pindex]=sportnames[randomsportindex]

                                for word,label in zip(sportphrase,labels2[randomindex]):
                                

                                    currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                    currentexample.words.insert(currentexample.words.index(selword),word)
                                    currentexample.labels.insert(currentexample.words.index(selword),label)
                                firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex])
                                currentexample.labels[firstwordindex]='B-ORG'
                                r=0
                            #Regular insertion 
                            else: 
                            
                                for word,label in zip(phrases[randomindex],labels2[randomindex]):
                                    

                                    currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                    currentexample.words.insert(currentexample.words.index(selword),word)
                                    currentexample.labels.insert(currentexample.words.index(selword),label)
                                firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex])
                                currentexample.labels[firstwordindex]='B-ORG'
                            
                                

                        elif placement[randomindex]=='after':
                            currentexample=copy.deepcopy(examples[index])
                            if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                    if currentexample.words.index(selword)-1==0:
                                        currentexample.words[currentexample.words.index(selword)-1]="At"
                                    else:
                                        currentexample.words[currentexample.words.index(selword)-1]="at"
                            if currentexample.words.index(selword)==0:
                                currentexample.words.insert(currentexample.words.index(selword),"The")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                            else:
                                currentexample.words.insert(currentexample.words.index(selword),"the")
                                currentexample.labels.insert(currentexample.words.index(selword)-1,"O")


                            #Sport phrase code
                            if  "<x>" in phrases[randomindex]:
                                exwords = [word.lower() for word in currentexample.words]
                                insports=False
                                
                                for sport in sportnames:                               
                                    if sport.lower() in exwords :
                                        insports=True
                                        sportphrase1=phrases[randomindex]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=sportphrase.index("<x>")
                                        
                                        sportphrase[pindex]=sport
                            
                                    
                                        break
                                if insports==False:
                                    randomsportindex=random.randint(0,len(sportnames)-1) 
                                    sportphrase1=phrases[randomindex]
                                    sportphrase=copy.deepcopy(sportphrase1)
                                    pindex=phrases[randomindex].index("<x>")
                                    sportphrase[pindex]=sportnames[randomsportindex]

                                for word,label in zip(sportphrase,labels2[randomindex]):
                                

                                    currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                    currentexample.words.insert(currentexample.words.index(selword),word)
                                    currentexample.labels.insert(currentexample.words.index(selword),label)
                                firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex])
                                currentexample.labels[firstwordindex]='B-ORG'
                                

                            #Regular insertion 
                            else:                        
                    
                            
                                #Start inserting words
                                k=1
                                for word,label in zip(phrases[randomindex],labels2[randomindex]):
                                    currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                    currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                    currentexample.labels[currentexample.words.index(selword)]='B-ORG'
                                    k=k+1
                        augexamples.append(currentexample)
                        exampleguidsalreadyaugmented.append(ex.guid)
                        if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_1:
                            num_of_aug_examples_generated=0
                            break
                        num_of_aug_examples_generated=num_of_aug_examples_generated+1
                        break
                        
            if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_1:
                num_of_aug_examples_generated=0
                break
                        

            index=index+1
            

        
        dataloc = pd.read_excel ('data/conll2003/Transition To LOC.xlsx') 
        dfloc=list(dataloc['Pattern1'])
        dataloc2 = pd.read_excel ('data/conll2003/LOC Context Phrases.xlsx') 
        phrasesdfloc2=list(dataloc2['Pattern2'])
        labelsloc2=list(dataloc2['EntityType'])
        
        loclabels=[]
        phrasesloc=[]
        phrasesloc2=[]
        for locword in phrasesdfloc2:
            phrasesloc2.append(locword.split("|"))

        for label in labelsloc2:
            loclabels.append(label.replace(" ", "").split("|"))     
        
        for loc in dfloc:
            phrasesloc.append(loc.split(","))

        placementloc=list(dataloc['Placement'])

        labels3=[]
        labelset=[]
        for phrase, place in zip(phrasesloc, placementloc):
            if place=='before':
                first=True
                for word in phrase:
                    if first==True:
                        labelset.append('B-ORG')
                    else:
                        labelset.append('I-ORG')
            elif place=='after':
                
                for word in phrase:
                    labelset.append('I-ORG')
            labels3.append(labelset)
            labelset=[]
        index=0
        toloccands=[]
        iccandbool=False
        toloccounter=0
        selectedword=''
        pattern1loctransitioncounter=0
        #while  pattern1loctransitioncounter <30:
            
        for e in examples:
            if toloccounter==70:
                break

            if e.guid not in exampleguidsalreadyaugmented:
                for phrase,placementlociter in zip(phrasesloc,placementloc):
                    
                    if(set(phrase).issubset(set( e.words))):
                        
                        if placementlociter=='after':
                            
                            entityindex=e.words.index(phrase[-1])+1

                            if e.labels[entityindex]=='I-ORG':
                                #match e guid with example guid
                                index=0
                                
                                
                                for ex in examples:
                                    if ex.guid==e.guid:
                                        currentexample=copy.deepcopy(examples[index])
                                        
                                        currentexample.labels[entityindex]='B-LOC'
                                        k=1
                                        
                                        for word in phrase:
                                            
                                            currentexample.words.pop(entityindex-k)
                                            currentexample.labels.pop(entityindex-k)
                                            
                                            k=k+1
                                        
                                        if currentexample.labels[currentexample.labels.index('B-LOC')+1]=='I-ORG':
                                            currentexample.labels[currentexample.labels.index('B-LOC')+1]='I-LOC'
                                        
                                        #pattern 2
                                        randomindex=random.randint(0,len(phrasesdfloc2)-1)
                                        selword=currentexample.labels.index('B-LOC')

                                        k=1
                                        
                                        for word,label in zip(phrasesloc2[randomindex],loclabels[randomindex]):
                                            currentexample.words.insert(selword+k,word)
                                            currentexample.labels.insert(selword+k,label)
                                            
                                            k=k+1

                                        augexamples.append(currentexample)
                                        #num_of_aug_examples_generated=num_of_aug_examples_generated+1
                                        exampleguidsalreadyaugmented.append(ex.guid)
                                        toloccounter=toloccounter+1
                                        break
                                        
                                        
                                    index=index+1
                            else:
                                break

                        elif placementlociter=='before':
                            entityindex=e.words.index(phrase[0])-1
                            if e.labels[entityindex]=='B-ORG':
                                #match e guid with example guid
                                index=0
                                
                                
                                for ex in examples:
                                    if ex.guid==e.guid:
                                        currentexample=copy.deepcopy(examples[index])
                                        
                                        currentexample.labels[entityindex]='B-LOC'
                                        
                                        for word in phrase:
                                            
                                            currentexample.words.pop(entityindex+1)
                                            currentexample.labels.pop(entityindex+1)
                                        
                                        if currentexample.labels[currentexample.labels.index('B-LOC')+1]=='I-ORG':
                                            currentexample.labels[currentexample.labels.index('B-LOC')+1]='I-LOC'
                                        
                                        #pattern 2
                                        randomindex=random.randint(0,len(phrasesdfloc2)-1)
                                        selword=currentexample.labels.index('B-LOC')

                                        k=1
                                        
                                        for word,label in zip(phrasesloc2[randomindex],loclabels[randomindex]):
                                            currentexample.words.insert(selword+k,word)
                                            currentexample.labels.insert(selword+k,label)
                                            
                                            k=k+1

                                        augexamples.append(currentexample)
                                        
                                        exampleguidsalreadyaugmented.append(ex.guid)
                                        #num_of_aug_examples_generated=num_of_aug_examples_generated+1
                                        toloccounter=toloccounter+1
                                        break
                                        
                                        
                                    index=index+1

                            else:
                                break


                        #if num_of_aug_examples_generated>=num_of_aug_examples_to_generate:
                        #    break

                    if e.guid in exampleguidsalreadyaugmented:
                        break
                        
                #if num_of_aug_examples_generated>=num_of_aug_examples_to_generate:
                #    break   

                            
        
   
      
   
        dataPER = pd.read_excel ('data/conll2003/PER Context Phrases for Hold Out Experiments.xlsx') 
        dataPER2 = pd.read_excel ('data/conll2003/PER Headline Phrases for Hold Out Experiments.xlsx') 
        dataORG = pd.read_excel ('data/conll2003/ORG Context Phrases for Hold Out Experiments.xlsx') 
        dfPER2=list(dataPER2['Pattern2'])
        labelsPER2=list(dataPER2['EntityType'])
        compatibility2=list(dataORG['Compatibility'])

        dataNames = pd.read_excel ('data/conll2003/Names.xlsx') 
        dfNames=list(dataNames['Names'])
        dfPER=list(dataPER['Pattern2'])
        perphrasedomains=list(dataPER['Domain'])
        orgphrasedomains=list(dataORG['Domain'])
        dfORG=list(dataORG['Pattern2'])
        labelsPER=list(dataPER['EntityType'])
        labelsORG=list(dataORG['EntityType'])
        perphrases=[] 
        orgphrases=[]
        perlabels=[]  
        orglabels=[]
        perphrases2=[] 
        perlabels2=[] 


        for per2 in dfPER2:
            perphrases2.append(per2.split("|"))

        for perlabel2 in labelsPER2:
            perlabels2.append(perlabel2.replace(" ", "").split("|"))

        for per in dfPER:
            perphrases.append(per.split("|"))
        
        for org in dfORG:
            orgphrases.append(org.split("|"))

        for label in labelsPER:
            perlabels.append(label.replace(" ", "").split("|"))
        for label in labelsORG:
            orglabels.append(label.replace(" ", "").split("|"))

        percands=[]
        orgcands=[]
        iccandbool=False
        i=0
        selectedword=''
        numofexamples=0
        pertransitioncounter=0
        orgtransitioncounter=0
        originallabel=''
        examplesguids=[]
        selectedwords=[]

        for ex in examples:
        
            e=random.choice(examples)
            if  e.guid not in exampleguidsalreadyaugmented:

                for word,label in zip(e.words,e.labels):
                                
                    nextwordinddex=e.words.index(word)+1
                    if nextwordinddex<len(e.words):
                        w=e.words[nextwordinddex]
                    else:
                        w=''

                    if 'B-ORG' in label or 'B-LOC' in label  and 'B-MISC' not in label and ')' not in w and '.' not in word:
                        iccandbool=True
                        selectedword=word
                        originallabel=label
                    elif iccandbool==True and label=='O' and i==1:
                        if e.guid not in examplesguids:
                        
                            examplesguids.append(e.guid)
                            selectedwords.append(selectedword)

                        break 
                    
                    if iccandbool==True:                
                        i=i+1        
                
                iccandbool=False
                i=0
                pertransitioncounter=pertransitioncounter+1
        index=0
        for ex in examples:
                
                if ex.guid in examplesguids and ex.guid not in exampleguidsalreadyaugmented:
                
                    for guid2,selword in zip(examplesguids,selectedwords):
                        if ex.guid==guid2:
                                            
                            
                            currentexample=copy.deepcopy(examples[index])
                            numbercount = sum(entry.isdigit() for entry in currentexample.words)
                            if len(currentexample.words)<4 and 'AT' not in currentexample.words or numbercount>3:
                                randomindex=random.randint(0,len(dfPER2)-1)
                                k=0
                                for word,label in zip(perphrases2[randomindex],perlabels2[randomindex]):
                                    currentexample.words.insert(0+k,word)
                                    currentexample.labels.insert(0+k,label)
                                    currentexample.labels[currentexample.words.index(selword)]='B-PER'
                                    k=k+1
                            else:
                                randomindex=random.randint(0,len(dfPER)-1)        
                                k=1
                                for word,label in zip(perphrases[randomindex],perlabels[randomindex]):
                                    currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                    currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                    currentexample.labels[currentexample.words.index(selword)]='B-PER'
                                    k=k+1

                            #Pattern 1
                            randomindex3=random.randint(0,len(dfNames)-1)
                            currentexample.words.insert(currentexample.words.index(selword)+1,dfNames[randomindex3])
                            currentexample.labels.insert(currentexample.words.index(selword)+1,'I-PER')
                            augexamples.append(currentexample)
                            exampleguidsalreadyaugmented.append(currentexample.guid)
                            if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_3:
                                num_of_aug_examples_generated=0
                                break
                            num_of_aug_examples_generated=num_of_aug_examples_generated+1
                            
                            break

                index=index+1
                if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_3:
                    num_of_aug_examples_generated=0
                    break
        examplesguids=[]    
        selectedwords=[]

 
            
        
        for ex in examples:
           
            e=random.choice(examples)
            if  e.guid not in exampleguidsalreadyaugmented:
                for word,label in zip(e.words,e.labels):
                    if 'B-PER' in label or 'B-LOC' in label  and 'B-MISC' not in label:
                        iccandbool=True
                        selectedword=word
                        originallabel=label
                    elif iccandbool==True and label=='O' and i==1:
                    
                        if e.guid not in examplesguids:
                            examplesguids.append(e.guid)
                            selectedwords.append(selectedword)

                        break 
                    
                    if iccandbool==True:                
                        i=i+1        
                
                iccandbool=False
                i=0
                orgtransitioncounter=orgtransitioncounter+1

        j=0
        index=0
        for ex in examples:
                
                if ex.guid in examplesguids and ex.guid not in exampleguidsalreadyaugmented:
                    
                    for guid2,selword in zip(examplesguids,selectedwords):
                        if ex.guid==guid2:
                        

                            
                            currentexample=copy.deepcopy(examples[index])
                                                        
                            randomindex2=random.randint(0,len(phrases)-1)
                            randomindex=random.randint(0,len(orgphrases)-1)

                                    
                            k=1
                            for word,label in zip(orgphrases[randomindex],orglabels[randomindex]):
                                currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                currentexample.labels[currentexample.words.index(selword)]='B-ORG'
                                k=k+1
                            #Pattern 1 do the fix
                            
                            if placement[randomindex2]=='before':

                                if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                    if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                        if currentexample.words.index(selword)-1==0:
                                            currentexample.words[currentexample.words.index(selword)-1]="At"
                                        else:
                                            currentexample.words[currentexample.words.index(selword)-1]="at"
                                if currentexample.words.index(selword)==0:
                                    currentexample.words.insert(currentexample.words.index(selword),"The")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                                else:
                                    currentexample.words.insert(currentexample.words.index(selword),"the")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")



                                
                                #Sport phrase code
                                if  "<x>" in phrases[randomindex2]:
                                    exwords = [word.lower() for word in currentexample.words]
                                    insports=False
                                    
                                    for sport in sportnames:                               
                                        if sport.lower() in exwords :
                                            insports=True
                                            sportphrase1=phrases[randomindex2]
                                            sportphrase=copy.deepcopy(sportphrase1)
                                            pindex=sportphrase.index("<x>")
                                            
                                            sportphrase[pindex]=sport
                                    
                                        
                                            break
                                    if insports==False:
                                        randomsportindex=random.randint(0,len(sportnames)-1) 
                                        sportphrase1=phrases[randomindex2]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=phrases[randomindex2].index("<x>")
                                        sportphrase[pindex]=sportnames[randomsportindex]

                                    for word,label in zip(sportphrase,labels2[randomindex2]):
                                    

                                        currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                        currentexample.words.insert(currentexample.words.index(selword),word)
                                        currentexample.labels.insert(currentexample.words.index(selword),label)
                                    firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex2])
                                    currentexample.labels[firstwordindex]='B-ORG'
                                    

                                #Regular insertion 
                                else:
                                        
                                
                                    for word,label in zip(phrases[randomindex2],labels2[randomindex2]):
                                        currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                        currentexample.words.insert(currentexample.words.index(selword),word)
                                        currentexample.labels.insert(currentexample.words.index(selword),label)
                                    firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex2])
                                    currentexample.labels[firstwordindex]='B-ORG'


                            elif placement[randomindex2]=='after':
                                if "B-LOC" == currentexample.labels[currentexample.words.index(selword)]:
                                    if currentexample.words[currentexample.words.index(selword)-1].lower()=="in":
                                        if currentexample.words.index(selword)-1==0:
                                            currentexample.words[currentexample.words.index(selword)-1]="At"
                                        else:
                                            currentexample.words[currentexample.words.index(selword)-1]="at"
                                if currentexample.words.index(selword)==0:
                                    currentexample.words.insert(currentexample.words.index(selword),"The")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")
                                else:
                                    currentexample.words.insert(currentexample.words.index(selword),"the")
                                    currentexample.labels.insert(currentexample.words.index(selword)-1,"O")


                                
            
                                #Sport phrase code
                                if  "<x>" in phrases[randomindex2]:
                                    exwords = [word.lower() for word in currentexample.words]
                                    insports=False
                            
                                    for sport in sportnames:                               
                                        if sport.lower() in exwords :
                                            insports=True
                                            sportphrase1=phrases[randomindex2]
                                            sportphrase=copy.deepcopy(sportphrase1)
                                            pindex=sportphrase.index("<x>")
                                            
                                            sportphrase[pindex]=sport
                                        
                                        
                                            break
                                    if insports==False:
                                        randomsportindex=random.randint(0,len(sportnames)-1) 
                                        sportphrase1=phrases[randomindex2]
                                        sportphrase=copy.deepcopy(sportphrase1)
                                        pindex=phrases[randomindex2].index("<x>")
                                        sportphrase[pindex]=sportnames[randomsportindex]

                                    for word,label in zip(sportphrase,labels2[randomindex2]):
                                    

                                        currentexample.labels[currentexample.words.index(selword)]='I-ORG'
                                        currentexample.words.insert(currentexample.words.index(selword),word)
                                        currentexample.labels.insert(currentexample.words.index(selword),label)
                                    firstwordindex=currentexample.words.index(selword)-len(phrases[randomindex2])
                                    currentexample.labels[firstwordindex]='B-ORG'
                                    

                                #Regular insertion 
                                else:
                        
                                    k=1
                                    for word,label in zip(phrases[randomindex2],labels2[randomindex2]):
                                        currentexample.words.insert(currentexample.words.index(selword)+k,word)
                                        currentexample.labels.insert(currentexample.words.index(selword)+k,label)
                                        currentexample.labels[currentexample.words.index(selword)]='B-ORG'
                                        k=k+1
                                        
                            exampleguidsalreadyaugmented.append(currentexample.guid)
                            augexamples.append(currentexample)
                            if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_4:
                                num_of_aug_examples_generated=0
                                break
                            num_of_aug_examples_generated=num_of_aug_examples_generated+1
                            

                            break

                index=index+1
                
                if num_of_aug_examples_generated>=num_of_aug_examples_to_generate_for_aug_type_4:
                    num_of_aug_examples_generated=0
                    break


        logger.info("Number of augmented examples: %s", len(augexamples))
        file_name=percentagename+".pkl"
        file_path = os.path.join(args.data_dir, file_name)
        open_file = open(file_path, "wb")
        pickle.dump(augexamples, open_file)
        open_file.close()
     


def _append_examples_as_jsonl(file_name, examples):
    """Append each example to *file_name* as one ``{"x": words, "y": labels}`` JSON line."""
    if not examples:
        # Nothing to write: do not create an empty file.
        return
    with open(file_name, "a") as out:
        for example in examples:
            json.dump({'x': example.words, 'y': example.labels}, out)
            out.write("\n")


def generate_aug_percentage_categories(args, tokenizer, labels, pad_token_label_id, mode,
              omit_sep_cls_token=False,
              pad_subtoken_with_real_label=False):
    """Write the TextFlint input and augmentation files for every percentage setting.

    The examples read from ``data/conll2003`` are processed in five rounds
    (100, 50, 30, 10 and 5 percent).  In each round the current pool is cut
    into three consecutive slices:

    1. appended to ``textflint<tag>forConcatSent.json``,
    2. appended to ``textflint<tag>forEntityTypos.json``,
    3. passed through ``text_flint_CrossCategory_transformation`` and then
       ``text_flint_SwapLonger_transformation``; the list returned by the
       latter becomes the pool of the next round.

    The JSON files are opened in append mode, so repeated runs accumulate lines.

    Args:
        args: Parsed arguments; ``args.pad_subtoken_with_real_label`` is read
            here and ``args`` itself is forwarded to the transformations.
        tokenizer: Forwarded unchanged to the transformations.
        labels: Forwarded unchanged to the transformations.
        pad_token_label_id: Forwarded unchanged to the transformations.
        mode: Passed to ``read_examples_from_file_excel``.  The
            transformations are always invoked with ``mode='train'``.
        omit_sep_cls_token: Unused.
        pad_subtoken_with_real_label: Unused; the value stored on ``args`` is
            forwarded instead.
    """
    from itertools import islice

    examples = read_examples_from_file_excel("data/conll2003", mode)
    # The size of the third slice is always derived from the full data set,
    # even in later rounds where the pool is smaller (islice simply stops early).
    total = len(examples)

    # (tag, trans1, trans2, trans3, trans4, size of the EntityTypos slice)
    # NOTE(review): the ConcatSent slice takes trans2 + 1 examples, which equals
    # trans1 in every round except the first (1382 vs. 1381) -- confirm intended.
    rounds = [
        ("100percent", 1381, 1381, 1381, 1380, 2000),
        ("50percent", 691, 690, 690, 690, 1200),
        ("30percent", 415, 414, 414, 414, 900),
        ("10percent", 139, 138, 138, 138, 500),
        ("5percent", 70, 69, 69, 69, 300),
    ]

    pool = examples
    for tag, trans1, trans2, trans3, trans4, typos_size in rounds:
        stream = iter(pool)
        slice_sizes = (trans2 + 1, typos_size, total - (trans1 + trans2))
        concat_part, typos_part, transform_part = [
            list(islice(stream, size)) for size in slice_sizes
        ]

        _append_examples_as_jsonl("textflint" + tag + "forConcatSent.json", concat_part)
        _append_examples_as_jsonl("textflint" + tag + "forEntityTypos.json", typos_part)

        remaining = text_flint_CrossCategory_transformation(
            args, transform_part, trans3, "textflintCrossCategoryexamples" + tag,
            tokenizer, labels, pad_token_label_id, mode='train',
            pad_subtoken_with_real_label=args.pad_subtoken_with_real_label)
        pool = text_flint_SwapLonger_transformation(
            args, remaining, trans4, "textflintSwapLongerexamples" + tag,
            tokenizer, labels, pad_token_label_id, mode='train',
            pad_subtoken_with_real_label=args.pad_subtoken_with_real_label)

    


def _first_single_token_entity(words, labels):
    """Locate the sentence's first PER/LOC/ORG entity if it is a lone token followed by ``O``.

    Returns ``(position, word, label)`` for that token, or ``None`` when the
    sentence has no such entity start or the token after it is not tagged ``O``.
    """
    pairs = list(zip(words, labels))
    for position, (word, label) in enumerate(pairs):
        if 'B-PER' in label or 'B-LOC' in label or 'B-ORG' in label:
            follower = position + 1
            if follower < len(pairs) and pairs[follower][1] == 'O':
                return position, word, label
            # Only the first entity of a sentence is ever considered.
            return None
    return None


def text_flint_CrossCategory_transformation(args,examples,amount, percentagename,tokenizer, labels, pad_token_label_id, mode,
              omit_sep_cls_token=False,
              pad_subtoken_with_real_label=False):
    """Swap single-token entities with an entity of a different category.

    ``len(examples)`` examples are sampled with replacement.  Every sampled
    sentence whose first PER/LOC/ORG entity is a single token followed by an
    ``O`` tag becomes a candidate, and its entity joins the replacement pool of
    its category.  Walking ``examples`` in order, each candidate is deep-copied
    and its entity token/label is replaced by a random entity of another
    category (PER -> LOC/ORG, LOC -> PER/ORG, ORG -> LOC/PER) until ``amount``
    augmented examples exist.  The augmented copies are pickled to
    ``<args.data_dir>/<percentagename>.pkl``.

    Args:
        args: Parsed arguments; only ``args.data_dir`` is read.
        examples: List of examples exposing ``guid``, ``words`` and ``labels``.
            It is modified in place: the source example of every augmented
            copy is removed.
        amount: Maximum number of augmented examples to create.
        percentagename: Stem of the pickle file name.
        tokenizer, labels, pad_token_label_id: Unused.
        mode: Nothing happens unless this equals ``'train'``.
        omit_sep_cls_token, pad_subtoken_with_real_label: Unused.

    Returns:
        The same ``examples`` list (without the consumed examples) when
        ``mode == 'train'``, otherwise ``None``.
    """
    if mode != 'train':
        return None

    import pickle

    # guid -> (position, word, label) of the swappable entity token.
    candidates = {}
    entities_by_tag = {'B-PER': [], 'B-LOC': [], 'B-ORG': []}
    for _ in range(len(examples)):
        sampled = random.choice(examples)
        if sampled.guid in candidates:
            continue
        found = _first_single_token_entity(sampled.words, sampled.labels)
        if found is None:
            continue
        candidates[sampled.guid] = found
        _, word, label = found
        for tag, bucket in entities_by_tag.items():
            if tag in label:
                bucket.append((word, label))
                break

    persons = entities_by_tag['B-PER']
    locations = entities_by_tag['B-LOC']
    organizations = entities_by_tag['B-ORG']
    # Entities that may replace an entity of the keyed category.
    replacement_pools = {
        'B-PER': locations + organizations,
        'B-LOC': persons + organizations,
        'B-ORG': locations + persons,
    }

    augexamples = []
    augmented_guids = set()
    consumed = set()
    # The list is not modified while it is being walked; consumed positions are
    # removed afterwards so that no example is skipped.
    for index, ex in enumerate(examples):
        if len(augexamples) >= amount:
            break
        candidate = candidates.get(ex.guid)
        if candidate is None or ex.guid in augmented_guids:
            continue
        position, word, label = candidate
        # Guard against a different example that happens to share the guid.
        if position >= len(ex.words) or position >= len(ex.labels):
            continue
        if ex.words[position] != word or ex.labels[position] != label:
            continue
        source_tag = next(tag for tag in replacement_pools if tag in label)
        pool = replacement_pools[source_tag]
        if not pool:
            # No entity of another category was collected; nothing to swap in.
            continue
        new_word, new_label = random.choice(pool)

        currentexample = copy.deepcopy(ex)
        currentexample.words[position] = new_word
        currentexample.labels[position] = new_label
        augexamples.append(currentexample)
        augmented_guids.add(ex.guid)
        consumed.add(index)

    # In-place update so callers holding a reference to the list see the removal.
    examples[:] = [ex for index, ex in enumerate(examples) if index not in consumed]

    logger.info("Number of TextFlint examples: %s", len(augexamples))
    file_name=percentagename+".pkl"
    file_path = os.path.join(args.data_dir, file_name)
    with open(file_path, "wb") as open_file:
        pickle.dump(augexamples, open_file)

    return examples

def _entity_before_first_outside_tag(words, labels):
    """Extract the entity run that ends at the first ``O`` after a PER/LOC/ORG start.

    Scanning left to right, every ``B-PER``/``B-LOC``/``B-ORG`` tag restarts the
    run; the first ``O`` tag seen after a start closes it.

    Returns ``(start, tokens, tags)`` for the closed run, or ``None`` when the
    sentence contains no entity start that is later followed by ``O``.
    """
    pairs = list(zip(words, labels))
    start = None
    for position, (_, label) in enumerate(pairs):
        if 'B-PER' in label or 'B-LOC' in label or 'B-ORG' in label:
            start = position
        elif start is not None and label == 'O':
            run = pairs[start:position]
            return start, [word for word, _ in run], [tag for _, tag in run]
    return None


def text_flint_SwapLonger_transformation(args,examples,amount, percentagename,tokenizer, labels, pad_token_label_id, mode,
              omit_sep_cls_token=False,
              pad_subtoken_with_real_label=False):
    """Replace an entity with a randomly chosen longer entity.

    ``len(examples)`` examples are sampled with replacement and, for each
    sampled sentence, the entity run closed by the first ``O`` tag is recorded.
    Walking ``examples`` in order, every recorded sentence whose entity has at
    most five tokens is deep-copied and the entity is replaced by a random
    recorded entity with strictly more tokens, until ``amount`` augmented
    examples exist.  Sentences for which no longer entity is available are left
    alone.  The augmented copies are pickled to
    ``<args.data_dir>/<percentagename>.pkl``.

    Args:
        args: Parsed arguments; only ``args.data_dir`` is read.
        examples: List of examples exposing ``guid``, ``words`` and ``labels``.
            It is modified in place: the source example of every augmented
            copy is removed.
        amount: Maximum number of augmented examples to create.
        percentagename: Stem of the pickle file name.
        tokenizer, labels, pad_token_label_id: Unused.
        mode: Nothing happens unless this equals ``'train'``.
        omit_sep_cls_token, pad_subtoken_with_real_label: Unused.

    Returns:
        The same ``examples`` list (without the consumed examples) when
        ``mode == 'train'``, otherwise ``None``.
    """
    if mode != 'train':
        return None

    import pickle

    max_entity_length = 5  # longer entities are never replaced

    # guid -> (start, tokens, tags) of the entity that may be replaced.
    candidates = {}
    for _ in range(len(examples)):
        sampled = random.choice(examples)
        if sampled.guid in candidates:
            continue
        found = _entity_before_first_outside_tag(sampled.words, sampled.labels)
        if found is not None:
            candidates[sampled.guid] = found

    entity_pool = [(tokens, tags) for _, tokens, tags in candidates.values()]
    # For each replaceable length, the entities that are strictly longer.
    # Precomputing this avoids retrying random draws until one fits, which
    # would never terminate when no longer entity exists.
    longer_than = {
        length: [entity for entity in entity_pool if len(entity[0]) > length]
        for length in range(1, max_entity_length + 1)
    }

    augexamples = []
    augmented_guids = set()
    consumed = set()
    # The list is not modified while it is being walked; consumed positions are
    # removed afterwards so that no example is skipped.
    for index, ex in enumerate(examples):
        if len(augexamples) >= amount:
            break
        candidate = candidates.get(ex.guid)
        if candidate is None or ex.guid in augmented_guids:
            continue
        start, tokens, tags = candidate
        if len(tokens) > max_entity_length:
            continue
        end = start + len(tokens)
        # Guard against a different example that happens to share the guid.
        if ex.words[start:end] != tokens or ex.labels[start:end] != tags:
            continue
        replacements = longer_than[len(tokens)]
        if not replacements:
            continue
        new_tokens, new_tags = random.choice(replacements)

        currentexample = copy.deepcopy(ex)
        currentexample.words[start:end] = new_tokens
        currentexample.labels[start:end] = new_tags
        augexamples.append(currentexample)
        augmented_guids.add(ex.guid)
        consumed.add(index)

    # In-place update so callers holding a reference to the list see the removal.
    examples[:] = [ex for index, ex in enumerate(examples) if index not in consumed]

    logger.info("Number of TextFlint examples: %s", len(augexamples))
    file_name=percentagename+".pkl"
    file_path = os.path.join(args.data_dir, file_name)
    with open(file_path, "wb") as open_file:
        pickle.dump(augexamples, open_file)

    return examples
    
    
def linear_rampup(current, rampup_length=args.num_train_epochs):
    """Return the linear ramp-up factor for ``current``, clamped to [0, 1].

    Args:
        current: Current position on the ramp (same unit as ``rampup_length``).
        rampup_length: Length of the ramp. Defaults to ``args.num_train_epochs``
            as evaluated when this function is defined.

    Returns:
        ``1.0`` when ``rampup_length`` is zero, otherwise
        ``current / rampup_length`` clipped into ``[0.0, 1.0]`` as a Python float.
    """
    # A zero-length ramp means the factor is at full strength from the start
    # (and avoids dividing by zero below).
    if rampup_length == 0:
        return 1.0

    progress = current / rampup_length
    return float(np.clip(progress, 0.0, 1.0))



def main():
    """Set up logging, load the config/tokenizer and build the augmentation sets.

    All settings are read from the module-level ``args`` namespace. The function
    attaches a console handler and a file handler
    (``<output_dir>/<train_examples>-log.txt``) to the module logger, seeds the
    RNGs via ``set_seed``, resolves the label set, loads the pretrained config
    and tokenizer, and finally calls ``generate_aug_percentage_categories`` in
    ``'train'`` mode.

    Raises:
        ValueError: If ``args.output_dir`` exists and is non-empty while
            ``args.do_train`` is set and ``args.overwrite_output_dir`` is not.
    """
    output_dir_in_use = os.path.exists(args.output_dir) and os.listdir(args.output_dir)
    if output_dir_in_use and args.do_train and not args.overwrite_output_dir:
        raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    logger.setLevel(log.INFO)
    formatter = log.Formatter("%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S")

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(args.output_dir, exist_ok=True)

    log_path = os.path.join(args.output_dir, str(args.train_examples) + '-' + 'log.txt')
    fh = log.FileHandler(log_path)
    fh.setLevel(log.INFO)
    fh.setFormatter(formatter)

    ch = log.StreamHandler()
    ch.setLevel(log.INFO)
    ch.setFormatter(formatter)

    logger.addHandler(ch)
    logger.addHandler(fh)

    logger.info("------NEW RUN-----")

    logger.info("device: %s, n_gpu: %s", args.device, args.n_gpu)

    set_seed(args)

    labels = get_labels(args.labels)
    num_labels = len(labels)
    args.num_labels = num_labels

    # Reuse the loss function's ignore_index as the label id for padded positions.
    pad_token_label_id = CrossEntropyLoss().ignore_index

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # The config is not consumed below; it is still loaded as in the original
    # flow so that an invalid config/model name surfaces here.
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name,
        num_labels=num_labels,
    )

    # Load the tokenizer exactly once, honouring --tokenizer-name when given.
    # (Previously it was first loaded from args.model_name and then immediately
    # overwritten by this call.)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name,
        do_lower_case=args.do_lower_case,
    )

    logger.info("Training/evaluation parameters %s", args)

    generate_aug_percentage_categories(
        args,
        tokenizer,
        labels,
        pad_token_label_id,
        mode='train',
        pad_subtoken_with_real_label=args.pad_subtoken_with_real_label,
    )

    




# Script entry point: main() is invoked unconditionally when this module is
# executed (there is no ``if __name__ == "__main__"`` guard, so importing the
# module also runs it).
main()

    