import torch, numpy as np

def tensorize(doc_tokenizer, passages, labels, bsize):
    """Tokenize passages, pad their token-level labels, and split into batches.

    Args:
        doc_tokenizer: Object whose ``tensorize(passages)`` returns a pair
            ``(ids, mask)`` of tensors with one row per passage.
        passages: List[str], the passages to encode.
        labels: List[ndarray(float32)], the token-level labels of each passage.
            ``labels[i]`` is written to the first ``len(labels[i])`` positions
            of row ``i``; the remaining positions stay 0.
            NOTE(review): presumably each label array is aligned with the
            tokenizer's output positions for that passage -- confirm.
        bsize: Number of passages per batch; must divide ``len(passages)``.
            ``None`` means "do not split": a single batch holding every
            passage is returned.

    Returns:
        List of batches; each batch is a tuple ``(ids, mask, labels)`` of
        row-slices of the full tensors.
    """
    assert len(passages) == len(labels), \
        "passages and labels must have the same length"
    assert bsize is None or len(passages) % bsize == 0, \
        "bsize must evenly divide the number of passages"

    D_ids, D_mask = doc_tokenizer.tensorize(passages)

    # Zero-initialised label tensor with the mask's shape and device, so every
    # position not covered by a passage's labels is left at 0.
    D_label = torch.zeros_like(D_mask, dtype=torch.float)
    for i, x in enumerate(labels):
        # as_tensor accepts ndarrays, lists and tensors alike (no copy warning
        # when `x` is already a tensor).
        x_label = torch.as_tensor(x, dtype=torch.float, device=D_label.device)
        D_label[i, :len(x)] = x_label

    # `bsize=None` is allowed by the check above, but range() needs an integer
    # step; fall back to one batch containing everything (step of at least 1).
    effective_bsize = max(len(passages), 1) if bsize is None else bsize

    return _split_into_batches(D_ids, D_mask, D_label, bsize=effective_bsize)

def _sort_by_length(ids, mask, bsize):
    if ids.size(0) <= bsize:
        return ids, mask, torch.arange(ids.size(0))

    indices = mask.sum(-1).sort().indices
    reverse_indices = indices.sort().indices

    return ids[indices], mask[indices], reverse_indices

def _split_into_batches(ids, mask, label, bsize):
    batches = []
    for offset in range(0, ids.size(0), bsize):
        batches.append((ids[offset:offset+bsize], mask[offset:offset+bsize], label[offset:offset+bsize]))
    return batches
