% Conference-paper record (Findings of ACL 2026) for the paper introducing the
% CheST dataset and the RULER framework.
% NOTE(review): the url field points at the Anthology preview ("ingest-acl") host;
% confirm the permanent URL (or add a DOI) once the volume is published.
@inproceedings{zhong-etal-2026-multimodal,
  title     = {Multimodal Chemical Structure-Text Coreference in Intellectual Property via Rule-guided Reinforcement Learning},
  author    = {Zhong, Hanmeng and
               Wu, Wentao and
               Chen, Linqing and
               Zhou, Peng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1489/},
  pages     = {29784--29796},
  isbn      = {979-8-89176-395-1},
  abstract  = {Navigating biopharmaceutical intellectual property necessitates precisely associating visual chemical structures with their textual referents across lengthy documents. Despite its critical role in drug discovery, this multimodal coreference task remains underexplored. It presents unique challenges, including handling Markush structures and distinguishing the atom-level differences between adjacent structures. To bridge this gap, we define the multimodal Chemical Structure-Text coreference and introduce CheST, the first dataset explicitly designed for the task. Furthermore, to satisfy the strict logical consistency in the task, we propose RULER, a RULE-guided multimodal Reinforcement learning framework built upon an SFT cold start. RULER utilizes rule-driven reward functions operationalizing multidimensional consistencies, acting as a domain-specific ``verifier'' to obtain the correct domain knowledge. Experimental results demonstrate that RULER achieves a 40{\%} improvement over the strongest baseline{--}Gemini-2.5-Pro, demonstrating the superior efficacy.},
}
Markdown (Informal)
[Multimodal Chemical Structure-Text Coreference in Intellectual Property via Rule-guided Reinforcement Learning](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1489/) (Zhong et al., Findings 2026)
ACL