@inproceedings{oh-kim-2026-seam,
    title     = "{SEAM}: Bridging the Temporal-Semantic Granularity Gap for {LLM}-based Speech Recognition",
    author    = "Oh, Junseok and
                 Kim, Ji-Hwan",
    editor    = "Demberg, Vera and
                 Inui, Kentaro and
                 Marquez, Llu{\'i}s",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
    month     = mar,
    year      = "2026",
    address   = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url       = "https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.112/",
    pages     = "2135--2144",
    isbn      = "979-8-89176-386-9",
    abstract  = "Speech-LLM integration faces a temporal-semantic granularity gap: speech representations scale with temporal duration while text tokens scale with semantic content. Existing duration-based methods generate embeddings at fixed rates, creating distributional mismatch with LLM pre-training. We propose SEAM (Speech Encoder-Decoder Alignment Module), an encoder-decoder architecture employing variable-rate generation through cross-attention between speech features and text embeddings. SEAM produces embeddings at adaptive rates that closely match natural text distributions while preserving pre-trained knowledge by freezing both speech encoder and LLM. We introduce a multi-stage training strategy and First Token Guidance to improve initial token prediction. SEAM achieves competitive performance on LibriSpeech (2.6{\%}/5.2{\%} WER). More significantly, trained only on LibriSpeech (960h), SEAM achieves 4.7{\%} WER on cross-domain TED-LIUM-v2, demonstrating that integrating LLM{'}s linguistic knowledge enables effective generalization beyond limited speech training data."
}

Markdown (Informal)
[SEAM: Bridging the Temporal-Semantic Granularity Gap for LLM-based Speech Recognition](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.112/) (Oh & Kim, Findings 2026)
ACL