% Li et al., paper in the Findings of EACL 2026 volume (ACL Anthology record).
% The url field uses the canonical aclanthology.org address built from the
% Anthology ID 2026.findings-eacl.307. The previous value pointed at a
% preview.aclanthology.org ingestion branch, which is a temporary staging
% location and not a stable link.
% NOTE(review): address appears to hold the conference venue rather than the
% publisher's city -- confirm this matches what the target style expects.
@inproceedings{li-etal-2026-beyond,
  author    = {Li, Jeffrey
               and Gardner, Joshua P
               and Kang, Doug
               and Shi, Fangping
               and Singh, Karanjeet
               and Li, Chun-Liang
               and Shandilya, Herumb
               and Hall, David Leo Wright
               and Tuzel, Oncel
               and Liang, Percy
               and Schmidt, Ludwig
               and Pouransari, Hadi
               and Faghri, Fartash},
  title     = {Beyond a Single Extractor: Re-thinking {HTML}-to-Text Extraction for {LLM} Pre-training},
  editor    = {Demberg, Vera
               and Inui, Kentaro
               and Marquez, Llu{\'\i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {5836--5861},
  isbn      = {979-8-89176-386-9},
  url       = {https://aclanthology.org/2026.findings-eacl.307/},
}