@inproceedings{baek-etal-2025-harnessing,
title = "Harnessing {PDF} Data for Improving {J}apanese Large Multimodal Models",
author = "Baek, Jeonghun and
Aizawa, Akiko and
Aizawa, Kiyoharu",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.108/",
pages = "2108--2123",
ISBN = "979-8-89176-256-5",
abstract = "Large Multimodal Models (LMMs) have demonstrated strong performance in English, but their effectiveness in Japanese remains limited due to the lack of high-quality training data. Current Japanese LMMs often rely on translated English datasets, restricting their ability to capture Japan-specific cultural knowledge. To address this, we explore the potential of Japanese PDF data as a training resource, an area that remains largely underutilized. We introduce a fully automated pipeline that leverages pretrained models to extract image-text pairs from PDFs through layout analysis, OCR, and vision-language pairing, removing the need for manual annotation. Additionally, we construct instruction data from extracted image-text pairs to enrich the training data. To evaluate the effectiveness of PDF-derived data, we train Japanese LMMs and assess their performance on the Japanese LMM Benchmark. Our results demonstrate substantial improvements, with performance gains ranging from 2.1{\%} to 13.8{\%} on Heron-Bench. Further analysis highlights the impact of PDF-derived data on various factors, such as model size and language models, reinforcing its value as a multimodal resource for Japanese LMMs."
}