@inproceedings{le-etal-2025-docie,
title = "{D}oc{IE}@{XLLM}25: {Z}ero{S}emble - Robust and Efficient Zero-Shot Document Information Extraction with Heterogeneous Large Language Model Ensembles",
author = "Le, Nguyen and
Thien, An and
Luu, Son and
Nguyen, Kiet",
editor = "Fei, Hao and
Tu, Kewei and
Zhang, Yuhui and
Hu, Xiang and
Han, Wenjuan and
Jia, Zixia and
Zheng, Zilong and
Cao, Yixin and
Zhang, Meishan and
Lu, Wei and
Siddharth, N. and
{\O}vrelid, Lilja and
Xue, Nianwen and
Zhang, Yue",
booktitle = "Proceedings of the 1st Joint Workshop on Large Language Models and Structure Modeling (XLLM 2025)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.xllm-1.25/",
pages = "288--297",
ISBN = "979-8-89176-286-2",
abstract = "The schematization of knowledge, including the extraction of entities and relations from documents, poses significant challenges to traditional approaches because of the document{'}s ambiguity, heterogeneity, and high cost domain-specific training. Although Large Language Models (LLMs) allow for extraction without prior training on the dataset, the requirement of fine-tuning along with low precision, especially in relation extraction, serves as an obstacle. In absence of domain-specific training, we present a new zero-shot ensemble approach using DeepSeek-R1-Distill-Llama-70B, Llama-3.3-70B, and Qwen-2.5-32B. Our key innovation is a two-stage pipeline that first consolidates high-confidence entities through ensemble techniques, then leverages Qwen-2.5-32B with engineered prompts to generate precise semantic triples. This approach effectively resolves the low precision problem typically encountered in relation extraction. Experiments demonstrate significant gains in both accuracy and efficiency across diverse domains, with our method ranking in the top 2 on the official leaderboard in Shared Task-IV of The 1st Joint Workshop on Large Language Models and Structure Modeling. This competitive performance validates our approach as a compelling solution for practitioners seeking robust document-level information extraction without the burden of task-specific fine-tuning. Our code can be found at https://github.com/dinhthienan33/ZeroSemble."
}