import os
import json

from transformers import AutoTokenizer


def count_test_length(
    base_model: str = "",
    data_path: str = "",
) -> int:
    """Count the tokens obtained by tokenizing every record of a JSON file.

    The file at ``data_path`` is parsed as JSON and iterated; each item must
    provide a ``"segments"`` entry holding strings. The segments of an item
    are joined with a single space, tokenized with the tokenizer loaded from
    ``base_model``, and the lengths of the resulting ``input_ids`` are summed.

    Args:
        base_model: Identifier or path handed to
            ``AutoTokenizer.from_pretrained``.
        data_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The total number of token ids over all records. The same number is
        also printed to stdout.
    """
    # NOTE(review): trust_remote_code=True lets the loaded repository supply
    # its own tokenizer code -- only point this at models you trust.
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    # Parse the file up front so the handle is closed before the (potentially
    # slow) tokenization starts.
    with open(data_path, "r", encoding="utf-8") as f_r:
        records = json.load(f_r)

    # The tokenizer is called with its default settings, so whatever it adds
    # by default is part of each count.
    total_tokens = sum(
        len(tokenizer(" ".join(record["segments"]))["input_ids"])
        for record in records
    )
    print(total_tokens)
    return total_tokens
        

if __name__ == "__main__":
    import argparse

    # count_test_length's empty-string defaults cannot work (an empty path
    # cannot be opened), so both values are required on the command line.
    parser = argparse.ArgumentParser(
        description="Print the total token count of the 'segments' in a JSON file."
    )
    parser.add_argument(
        "--base_model",
        required=True,
        help="Model name or path passed to AutoTokenizer.from_pretrained.",
    )
    parser.add_argument(
        "--data_path",
        required=True,
        help="Path to the JSON file whose records are tokenized.",
    )
    cli_args = parser.parse_args()
    count_test_length(base_model=cli_args.base_model, data_path=cli_args.data_path)

