import torch
from transformers import RobertaModel, RobertaTokenizer
from transformers import BeitImageProcessor, BeitForMaskedImageModeling

# --- Segment and feature-size hyperparameters --------------------------------
gen_step_size = 1
chars_len = 40   # maximum character length allowed in each segment
tokens_len = 3   # maximum number of tokens allowed in each segment
pos_size = 6     # size of positional features

# --- Embedding widths --------------------------------------------------------
# NOTE(review): 768 / 8192 presumably mirror the output widths of the text and
# visual models used with this config -- confirm against those models.
text_embed_size = 768
visual_embed_size = 8192
use_visual_embeds = False  # when True, in_dim also includes visual_embed_size

# Input feature width: the text embedding alone, or text + visual embeddings
# when visual embeddings are switched on.
# NOTE(review): the original trailing comment mentioned
# "tokens_len+chars_len+pos_size+1" -- presumably an earlier formula; confirm.
if use_visual_embeds:
    in_dim = text_embed_size + visual_embed_size
else:
    in_dim = text_embed_size

# --- Label vocabularies ------------------------------------------------------
# Segment tags mapped to class indices.
# NOTE(review): "B"/"I" look like begin/inside tags -- confirm with the caller.
segment_label_to_idx = {
    "B": 1,
    "I": 0,
}

# Edge labels mapped to class indices; "beta" is the only label mapped to a
# negative index (-1). Insertion order is kept identical to the index order.
edge_label_to_idx = {
    "l": 0,
    "c": 1,
    "r": 2,
    "t": 3,
    "m": 4,
    "b": 5,
    "l2": 6,
    "c2": 7,
    "r2": 8,
    "t2": 9,
    "m2": 10,
    "b2": 11,
    "beta": -1,
}

# NOTE(review): 6 equals the number of un-suffixed edge labels above
# (l, c, r, t, m, b); whether that is the intended relationship is not
# visible here -- confirm.
edge_dim = 6
# Compute device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Text side: tokenizer and encoder, both loaded from the "roberta-base"
# pretrained checkpoint at import time.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta = RobertaModel.from_pretrained("roberta-base")

# Visual side: image processor and masked-image-modeling model, both loaded
# from the "microsoft/dit-base" pretrained checkpoint at import time.
fpn_processor = BeitImageProcessor.from_pretrained("microsoft/dit-base")
fpn_model = BeitForMaskedImageModeling.from_pretrained("microsoft/dit-base")

# Patch count per image, from the model config: (image_size // patch_size)
# squared.
num_patches = (
    fpn_model.config.image_size // fpn_model.config.patch_size
) ** 2

# Freeze the visual model: turn off gradient tracking on every parameter.
# (`roberta` is left untouched here.)
for param in fpn_model.parameters():
    param.requires_grad = False

train_from_scratch = False

# Dataset root directories, one per dataset.
# The values below are "/path/to/..." placeholders; point each one at the
# local copy of the corresponding dataset before use.
funsd_home = "/path/to/funsd"
idl_home = "/path/to/idl"
docvqa_home = "/path/to/docvqa"
rvl_home = "/path/to/rvl-cdip"
sroie_home = "/path/to/sroie"
cord_home = "/path/to/cord"
buddie_home = "/path/to/buddie_v1"
