# This module determines the corpus, input fields, field embeddings, and position-embedding settings.

########################################################### Parameters to Modify
# Everything in this section is meant to be edited by hand: activate exactly
# one preset line per setting and leave the alternatives commented out.

#### input fields
# Chinese: select, in order, from ['token', 'subcomp', 'pinyin', 'pos', 'medpos']
# input_fields = ['token', 'subcomp', 'pinyin', 'medpos']

# English: select, in order, from ['token', 'char', 'phoneme', 'pos_en']
input_fields = ['token', 'char', 'phoneme']


#### pretrained field embeddings
# `pretrain_embeddings` selects which table of pretrained field-embedding
# directories is used further down; any value without a table (e.g. False)
# leaves FldEmbed_Dir as None.
# SIZE is presumably the embedding dimension -- TODO confirm with the consumer.
# pretrain_embeddings, SIZE = 'LuohuCorpus/char', 100
# pretrain_embeddings, SIZE = 'WikiChinese/char', 200
pretrain_embeddings, SIZE = 'WikiEnglish/word', 200
# pretrain_embeddings, SIZE = False, 200


############## Open Domain Corpus
# Language Model
# Data_Dir, min_token_freq = 'data/WikiChinese/char/', 5  # don't touch this
# Sequence Labeling
# Data_Dir, min_token_freq = 'data/ResumeCN/char/', 1  # don't touch this
# Data_Dir, min_token_freq = 'data/boson/char/', 1  # don't touch this
# Data_Dir, min_token_freq = 'data/MSRA/char/', 1  # don't touch this

Data_Dir, min_token_freq = 'data/CoNLL-2003/word/', 3

############## Medical Corpus
# Language Model
# Data_Dir, min_token_freq = 'data/LuohuCorpus/char/', 1  # don't touch this
# Sequence Labeling
# Data_Dir, min_token_freq = 'data/MedPos/char/', 1  # don't touch this
# Data_Dir, min_token_freq = 'data/LuohuNER750Neat/char/', 1  # don't touch this
# Data_Dir, min_token_freq = 'data/CCKS2017/char/', 1  # don't touch this
# Sequence Classification
# Data_Dir, min_token_freq = 'data/CHIP2019_pkl/char/', 1  # don't touch this

# Task type. Language models: 'masklm', 'bilm', 'fwdlm'.
# Downstream tasks: 'seqlabel', 'seqcls'.
model_type = 'seqlabel'

use_pretrained_seqrepr = False

# Sequence-representation configuration name. Options:
#   'BaseStruct', 'EmbedOnly',
#   ['MLM_AsSent_LSTM',  'MLM_AsToken_LSTM',  'MLM_AsToken_TFM']
#   ['BiLM_AsSent_LSTM', 'BiLM_AsToken_LSTM', 'BiLM_AsToken_LSTM_cnn']
#   ['FLM_AsSent_LSTM',  'FLM_AsToken_LSTM',  'FLM_AsToken_TFM']
SeqRepr_Config_Name = 'BaseStruct'
use_residual_structure = False
# Keep it
pretrained_seqrepr_path = False

###########################################################


# Any model_type containing 'lm' must not enable use_pretrained_seqrepr.
if 'lm' in model_type:
    assert not use_pretrained_seqrepr, (
        'use_pretrained_seqrepr must be False when model_type is a language model')

##############

############## the max sentence length
# Author's alternatives for maxSentLeng: 360, 400.
# (Author's note: using start/end as special tokens did not perform well
# compared with the baseline -- see MISC['useStartEnd'] below.)
#
############## use token position or grain position embeddings or not.
# useTkPsn: means using token position embeddings or not.
# useGrPsn: means using grain position embeddings or not.
MISC = {
    'maxSentLeng': 300,
    'maxGrainLeng': 70,
    'useTkPsn': False,
    'useGrPsn': False,
    'maskProportion': 0.2,  # the proportion of masked words in a sentence.
}


############## additional config for masked language model
# Configurations whose name contains 'TFM' always use both position embeddings.
if 'TFM' in SeqRepr_Config_Name:
    MISC['useTkPsn'] = True
    MISC['useGrPsn'] = True

# Masking is enabled exactly when model_type contains 'masklm'.
MISC['useMask'] = 'masklm' in model_type

# NOTE(review): the original trailing comment here was copy-pasted from
# 'maskProportion'; the name suggests a token-type input switch -- confirm.
MISC['useTokenType'] = False

##############


############## target field
tagScheme = 'BIOES'  # the tagScheme to use. ['BIO', 'BIOE', 'BIOES']

# Pick the prediction target for the task and the corpora that provide it.
if 'lm' in model_type:
    target_field = 'token'
    _allowed_data_dirs = ('data/LuohuCorpus/char/', 'data/WikiChinese/char/')
elif model_type == 'seqcls':
    target_field = 'category'
    _allowed_data_dirs = ('data/CHIP2019_pkl/char/', 'data/WikiChinese/char/')
else:
    target_field = 'annoE'
    _allowed_data_dirs = (
        'data/ResumeCN/char/', 'data/boson/char/', 'data/MSRA/char/',
        'data/MedPos/char/', 'data/LuohuNER750Neat/char/', 'data/CCKS2017/char/',
        'data/CoNLL-2003/word/',
    )
assert Data_Dir in _allowed_data_dirs, (
    'Data_Dir %r is not a valid corpus for model_type %r' % (Data_Dir, model_type))


############## field embeddings
# Paths to the pretrained field embeddings: for each supported value of
# `pretrain_embeddings`, a table mapping an integer index to a directory.
_FLDEMBED_DIR_TABLES = {
    'LuohuCorpus/char': {
        1: 'embeddings/fieldembed/LuohuCorpus/char/token/cb-it50-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF1-SmpGrT/',
        2: 'embeddings/fieldembed/LuohuCorpus/char/token_subcomp/cb-it50-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/',
        3: 'embeddings/fieldembed/LuohuCorpus/char/token_pinyin_subcomp/cb-it50-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/',
        4: 'embeddings/fieldembed/LuohuCorpus/char/token_pinyin_subcomp_pos/cb-it50-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/',
        5: 'embeddings/fieldembed/LuohuCorpus/char/token_subcomp_pinyin_medpos/cb-it100-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/',
    },
    # Older single-path setting for WikiChinese (same directory as entry 4):
    # FldEmbed_Dir = 'embeddings/fieldembed/WikiChinese/char/token_subcomp_pinyin_pos/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/'
    'WikiChinese/char': {
        1: 'embeddings/fieldembed/WikiChinese/char/token/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF1-SmpGrT/',
        2: 'embeddings/fieldembed/WikiChinese/char/token_subcomp/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/',
        3: 'embeddings/fieldembed/WikiChinese/char/token_subcomp_pinyin/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/',
        4: 'embeddings/fieldembed/WikiChinese/char/token_subcomp_pinyin_pos/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th4/LF3-SmpGrT/',
    },
    'WikiEnglish/word': {
        1: 'embeddings/fieldembed/WikiEnglish/word/token/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF1-SmpGrT/',
        2: 'embeddings/fieldembed/WikiEnglish/word/token_char/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/',
        3: 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/',
        4: 'embeddings/fieldembed/WikiEnglish/word/token_char_phoneme_pos_en/cb-it5-w5-ng10-lr0.025-smp0.001-nsexp0.75-th8/LF3-SmpGrT/',
    },
}

if pretrain_embeddings in _FLDEMBED_DIR_TABLES:
    FldEmbed_Dir_Dict = _FLDEMBED_DIR_TABLES[pretrain_embeddings]
    # The table is indexed by the number of selected input fields ...
    idx = len(input_fields)
    # ... except that, for LuohuCorpus, any selection containing 'medpos'
    # always uses entry 5 regardless of how many fields are selected.
    if pretrain_embeddings == 'LuohuCorpus/char' and 'medpos' in input_fields:
        idx = 5
    if idx not in FldEmbed_Dir_Dict:
        # Still a KeyError, as a plain failed lookup would be, but with a
        # message that names the offending configuration.
        raise KeyError(
            'no pretrained %r field embeddings for %d input field(s): %r'
            % (pretrain_embeddings, idx, input_fields))
    FldEmbed_Dir = FldEmbed_Dir_Dict[idx]
else:
    # No pretrained field embeddings selected.
    FldEmbed_Dir = None


############## special tokens
special_tokens = ['</pad>', '</unk>', '</start>', '</end>', '</mask>', '</CLS>', '</SEP>']
# don't touch: index -> token is the list itself; token -> index is its inverse.
MISC['idx2specialtokens'] = special_tokens
MISC['specialtokens2idx'] = {token: i for i, token in enumerate(special_tokens)}
##############

############## save model_type information here
MISC['lm_mode'] = model_type  # notice the use model      # don't touch
# when adding them, the performance is not good compared with the baseline.
MISC['useStartEnd'] = False
# only have a </start> token in the generated sentences; only valid in text generations.
MISC['useStartOnly'] = False
##############

# Each language-model type requires a matching SeqRepr configuration family,
# identified by a tag that must appear in SeqRepr_Config_Name.
_REQUIRED_SEQREPR_TAG = {'fwdlm': 'FLM', 'masklm': 'MLM', 'bilm': 'BiLM'}
_required_tag = _REQUIRED_SEQREPR_TAG.get(model_type)
if _required_tag is not None:
    assert _required_tag in SeqRepr_Config_Name, (
        'model_type %r requires a SeqRepr_Config_Name containing %r, got %r'
        % (model_type, _required_tag, SeqRepr_Config_Name))

