% FigEx paper, Findings of EMNLP 2025 (ACL Anthology ID 2025.findings-emnlp.899).
% The url field uses the permanent Anthology address for that ID.
@inproceedings{song-etal-2025-figex,
  title     = {{FigEx}: Aligned Extraction of Scientific Figures and Captions},
  author    = {Song, Jifeng and
               Das, Arun and
               Cui, Ge and
               Huang, Yufei},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.899/},
  doi       = {10.18653/v1/2025.findings-emnlp.899},
  pages     = {16558--16571},
  isbn      = {979-8-89176-335-7},
  abstract  = {Automatic understanding of figures in scientific papers is challenging since they often contain subfigures and subcaptions in complex layouts. In this paper, we propose FigEx, a vision-language model to extract aligned pairs of subfigures and subcaptions from scientific papers. We also release BioSci-Fig, a curated dataset of 7,174 compound figures with annotated subfigure bounding boxes and aligned subcaptions. On BioSci-Fig, FigEx improves subfigure detection $AP^b$ over Grounding DINO by 0.023 and boosts caption separation BLEU over Llama-2-13B by 0.465. The source code is available at: https://github.com/Huang-AI4Medicine-Lab/FigEx.},
}
Markdown (Informal)
[FigEx: Aligned Extraction of Scientific Figures and Captions](https://aclanthology.org/2025.findings-emnlp.899/) (Song et al., Findings 2025)
ACL