#################################################################
# TOPOLOGY ANALYSIS
#################################################################

# NOTE(review): in_file, adapter, original_transformer, pretrained_transformer,
# add_special_tokens, max_length, max_word_len and pad_value are assigned again
# further down in this file. If the file is loaded as one Python module, the
# last assignment wins -- confirm how the topology scripts read these values.

# Test split of the language-specific treebank; its sentences are the ones
# used for the topological analysis.
in_file = "data/tr_imst-ud-test.conllu"

# True  -> representations are generated with the adapter-transformer.
# False -> representations are generated with the regular transformer.
adapter = False

# One of: bert-base-cased, roberta-base, bert-base-multilingual-cased,
# xlm-roberta-base.
original_transformer = "xlm-roberta-base"

# Either one of the original transformers, or a transformer obtained after
# intermediate training or downstream fine-tuning.
# Example relative path: XLM-R after intermediate parsing training on the
# Turkish treebank.
pretrained_transformer = "models/biaffine_parser/xlmr/tr"

# Whether sequence-start and sequence-end special tokens are added when
# sentences are encoded (recommended: keep True).
add_special_tokens = True

# Sentence length limits (recommended: keep the defaults).
max_length = 510    # in subwords, for the transformer
max_word_len = 100  # in word-level tokens

# Default value used for padding sequences.
pad_value = -2

# Output directory for the matrices holding the encoded representations of the
# sentences from "in_file" -- one matrix per transformer layer.
# Example relative path: layer-wise sentence representations after IPT (P).
outpath = "topology/new/xlmr/tr/parse"

# False -> generate and serialize layer-wise representations with a transformer.
# True  -> compute the actual l-CKA scores between the layer-wise
#          representations of two transformers.
topology_eval = False

# Directories with the serialized layer-wise sentence representations of the
# two transformer variants being compared.
topology_first_variant_path = "topology/new/xlmr/tr/parse"
topology_second_variant_path = "topology/new/xlmr/tr/nli"

#################################################################
# DATA PREPROCESSING 
#################################################################

# Selects which tokenizer to load: bert-base-cased, roberta-base,
# bert-base-multilingual-cased, or xlm-roberta-base.
pretrained_transformer = "xlm-roberta-base"

# Maximal sentence length in number of subwords for the transformers.
max_length = 200

# Maximal sentence length in number of word-level tokens (used only for
# treebank preprocessing; sentences longer than this value are ignored).
max_word_len = 100

# Whether to add the special sequence-start and sequence-end tokens when
# encoding sentences (recommended: keep True).
add_special_tokens = True

# Whether to lowercase the text before tokenization.
# All experiments use transformers pre-trained on non-lowercased text, so
# False is recommended for result replication.
preprocessing_lowercase = False

### additional MLM configuration parameters
# Set to True if preprocessing text for masked language modeling training.
mlm = True

# Masking probability (which portion of subword tokens will be masked).
mask_probability = 0.15

# Maximal number of subword tokens allowed to be masked in a single sentence.
max_masked_per_sent = 20

# Base directory path of the dataset to be preprocessed.
# Example relative path: the German GSD treebank.
base_path = "ud-treebanks-v2.5/UD_German-GSD"

# The actual file containing the text to be preprocessed.
in_file = "de_gsd-ud-train.conllu"

# Indicates whether the dataset is in Chinese, which entails special
# preprocessing for XLM-R. The dataset configured above is German, so this
# must be False; set it to True only when base_path/in_file point to Chinese data.
is_zh = False

# Path to which the preprocessed dataset (PyTorch's TensorDataset) is
# serialized. Named after in_file so the treebank and split stay identifiable.
out_file = "serialized/de_gsd-ud-train.xlmr.td"

# Path to which the dictionary of dependency relations is serialized (only
# relevant for the treebank preprocessing for IPT).
deps_dict_path = "serialized/deps_dict.pkl"

# Hidden representation size of the transformer (all transformers in the
# experiments have H = 768).
hidden_size = 768

# Padding indices.
pad_value = -2
def_value = -1

#################################################################
# MODELING, TRAINING, OPTIMIZATION, and EVALUATION
#################################################################

# Task family being trained/evaluated. Supported values:
#   "parsing"   - dependency parsing
#   "mlm"       - masked language modeling
#   "seq_class" - sequence classification
#   "mcqa"      - multiple-choice question answering
task_type = "parsing"

# Concrete task being trained/evaluated. Supported values:
#   "parsing", "mlm",
#   "nli", "paws"   (for task_type = "seq_class"),
#   "siqa", "copa"  (for task_type = "mcqa")
task = "parsing"

# False -> train/evaluate a regular transformer.
# True  -> train/evaluate the transformer augmented with bottleneck adapters.
adapter = False

# Original LM-pretrained transformer: bert-base-cased, roberta-base,
# bert-base-multilingual-cased, or xlm-roberta-base.
original_transformer = "xlm-roberta-base"

# Model from which the current training/evaluation setup starts: one of the
# original transformers, or a path to an intermediately trained / fine-tuned
# transformer-based model.
# Example relative path: the XLM-R biaffine parser model trained on the TR
# treebank (e.g., to fine-tune TR IPT XLM-R for downstream language transfer
# for, e.g., NLI).
pretrained_transformer = "models/biaffine_parser/xlmr/tr"

# Directory path of the training/validation/test data.
base_path = "ud-treebanks-v2.5/UD_Turkish-IMST"

# "train": the model is trained, using the train and validation datasets.
# Otherwise the model is evaluated, using the test dataset.
train_or_test = "train"

# Serialized datasets for the three splits.
train_set = "serialized/tr_imst-ud-train.xlmr.td"
val_set = "serialized/tr_imst-ud-dev.xlmr.td"
test_set = "serialized/tr_imst-ud-test.xlmr.td"

###
# optimization
### 

# Device on which to run the training/evaluation. By default set to the first
# available GPU.
device = "cuda:0"

# Maximal number of training epochs.
num_train_epochs = 30

# Early stopping criterion: number of consecutive evaluations on the
# development set with no improvement in the relevant metric.
num_evals_early_stop = 10

# Number of gradient accumulation steps (larger than 1 only if we're feeding
# very small batches).
gradient_accumulation_steps = 1

# Metric for evaluation on the development set and for early stopping:
#   task_type = "parsing"             -> "UAS"
#   task_type = "mlm"                 -> "Loss"
#   task_type = "seq_class" or "mcqa" -> "Accuracy"
# task_type is set to "parsing" earlier in this file, hence "UAS".
eval_stop_metric = "UAS"

# Whether a better model means a higher (True) or lower (False) value of the
# metric on the development set:
#   eval_stop_metric = "Loss"              -> False
#   eval_stop_metric = "UAS" or "Accuracy" -> True
# Must be kept in sync with eval_stop_metric above.
eval_metric_increasing = True

# Batch size in evaluation.
eval_batch_size = 8

# Run a grid search over the training hyperparameters (default: True).
grid_search = True

# Output directory for the models trained during grid search (one model per
# hyperparameter configuration).
# NOTE(review): the directory is named "mlm" while task_type earlier in this
# file is "parsing" -- confirm this is the intended location.
grid_search_base_output_directory = "models/mlm/xlmr/tr"

# Number of update steps between logging (i.e., evaluation on the development
# set).
grid_base_logging_steps = 250

# Base batch size for the grid. If a configuration uses a batch size different
# from this value, the number of logging steps between evaluations is adjusted
# accordingly, so that all configurations see the same number of training
# instances between development set evaluations.
grid_base_batch_size = 8

# Specification of the grid: each hyperparameter with its set of candidate
# values.
grid = {
    "lr": [1e-5],
    "drpt": [0.1],
    "batch": [6],
    "seed": [42],
}

# Optimization values that are held fixed (not part of the grid).
# NOTE(review): the per-key remarks below are inferred from the key names only;
# confirm them against the training code.
weight_decay = 0.0    # presumably the optimizer's weight decay
adam_epsilon = 1e-8   # presumably the epsilon term of the Adam optimizer
max_grad_norm = 1.0   # presumably the gradient clipping threshold
max_steps = -1        # presumably a cap on update steps, with -1 meaning "no cap"
warmup_steps = 0      # presumably the number of learning-rate warmup steps
