@inproceedings{xu-etal-2026-ds2,
  title     = {{DS}2-Instruct: Domain-Specific Data Synthesis for Large Language Models Instruction Tuning},
  author    = {Xu, Ruiyao and
               Samia, Noelle I. and
               Liu, Han},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-eacl.176/},
  pages     = {3368--3384},
  isbn      = {979-8-89176-386-9},
  abstract  = {Adapting Large Language Models (LLMs) to specialized domains requires high-quality instruction tuning datasets, which are expensive to create through human annotation. Existing data synthesis methods focus on general-purpose tasks and fail to capture domain-specific terminology and reasoning patterns. To address this, we introduce DS$^2$-Instruct, a zero-shot framework that generates domain-specific instruction datasets without human supervision. Our approach first generates task-informed keywords to ensure comprehensive domain coverage. It then creates diverse instructions by pairing these keywords with different cognitive levels from Bloom{'}s Taxonomy. Finally, it uses self-consistency validation to ensure data quality. We apply this framework to generate datasets across seven challenging domains, such as mathematics, finance, and logical reasoning. Comprehensive evaluation demonstrates that models fine-tuned on our generated data achieve substantial improvements over existing data generation methods.},
}
[DS2-Instruct: Domain-Specific Data Synthesis for Large Language Models Instruction Tuning](https://aclanthology.org/2026.findings-eacl.176/) (Xu et al., Findings of EACL 2026)
ACL