% Workshop paper by Kemal Kirtac in the EvalEval 2026 proceedings.
% NOTE(review): the url field below points at a preview.aclanthology.org
% ingest path, which looks like a staging location -- confirm the permanent
% URL (and add a DOI if one exists) before relying on this entry.
@inproceedings{kirtac-2026-evaluating,
    title = "Evaluating Large Language Model News Sentiment in Finance under Liquidity and Market Frictions",
    author = "Kirtac, Kemal",
    editor = "Akhtar, Mubashara and
      Batzner, Jan and
      Choshen, Leshem and
      Ghosh, Avijit and
      Gohar, Usman and
      Mickel, Jennifer and
      Pant, Ichhya and
      Talat, Zeerak and
      Lin, Michelle",
    booktitle = "Proceedings of the Workshop on Evaluating Evaluations ({E}val{E}val)",
    month = jul,
    year = "2026",
    address = "San Diego, CA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.4/",
    pages = "26--35",
    ISBN = "979-8-89176-429-3",
    abstract = "This paper studies whether large language models can extract useful sentiment signals from firm-specific financial news when evaluation accounts for realistic market frictions. Many financial NLP studies report strong offline prediction results, but these do not always show whether model outputs remain useful once trading constraints are imposed. I address this gap by evaluating sentiment models through classification performance, return predictability, and implementable portfolio performance. The analysis links Refinitiv News Analytics to CRSP and begins with 3,129,924 U.S. news items published between January 1, 2010 and January 30, 2026. Filtering retains single-firm stories, removes redundant coverage using a five-day cosine-similarity novelty screen, and restricts the sample to tradable stocks with positive bid and ask quotes, minimum share and dollar volume thresholds, quoted spreads below 20{\%}, and available Amihud illiquidity ratios and Kyle{'}s lambda estimates. The final sample contains 973,481 tradable news items linked to 3,452 firms. I compare six sentiment approaches: LLaMA{--}3, OPT, RoBERTa, BERT, FinBERT, and the Loughran{--}McDonald dictionary. LLaMA{--}3 achieves the strongest classification performance with 78.2{\%} accuracy and produces the largest predictive coefficients in panel regressions. Daily rebalanced long{--}short portfolios with a 5 bps trading cost show that the LLaMA{--}3 strategy earns a cumulative return of approximately 180{\%} from June 2024 to January 2026, followed by OPT with 155{\%} and RoBERTa with 120{\%}, while the dictionary-based strategy loses 9{\%}. The results show that evaluation becomes more informative when financial NLP models are assessed beyond offline accuracy and under realistic deployment constraints. High-capacity language models retain economically meaningful predictive content under market frictions, whereas simpler lexicon-based methods do not."
}