﻿\documentclass[11pt]{article}

% Official ACL style (camera-ready / final version)
\usepackage[final]{acl}

\usepackage{times}
\usepackage{latexsym}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{float}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{fancyvrb}

\title{One and Only at SemEval-2026 Task 2: Evaluating Zero-Shot Autonomous LLM Agents and Heuristic Proxies in Ecological Affect Forecasting}

\author{
  Nam Dinh Phuong \\
  University of Information Technology, VNU-HCM \\
  \texttt{namdp.20@grad.uit.edu.vn} \\
}

\begin{document}
\maketitle

\begin{abstract}
This paper presents team \emph{One and Only}'s system for SemEval-2026 Task 2: Predicting Variation in Emotional Valence and Arousal over Time~\citep{soni-etal-2026-semeval}. We investigate whether zero-shot large language model (LLM) reasoning can replace fine-tuning for ecological affect forecasting by combining deterministic statistical priors with frozen LLMs (Gemini 3 Pro, Claude Opus 4.6, GPT-5.2). For short-term state changes (Subtask~2A), an ordinary least squares (OLS) mean-reversion anchor is paired with LLM-generated impulses; for long-term disposition changes (Subtask~2B), a Chain-of-Thought (CoT) prompt drives direct numeric prediction. Our system underperforms fine-tuned approaches on both subtasks. However, post-submission ablation across three LLMs reveals a task-dependent pattern: CoT reasoning substantially improves disposition forecasting over a trend-continuation heuristic ($r_V$: $-0.185 \to +0.129$; MAE$_V$: $0.899 \to 0.422$), while uncalibrated LLM impulses degrade state-change prediction due to variance collapse ($\sigma_{\text{pred}} = 0.41$ vs.\ $\sigma_{\text{gold}} = 1.73$). We provide a detailed diagnostic analysis of these failure modes and release all prompts and outputs for reproducibility.
\end{abstract}

\section{Introduction}
\label{sec:intro}

\textbf{SemEval-2026 Task 2} \citep{soni-etal-2026-semeval} introduces longitudinal ecological data---daily narrative essays with quantitative Valence and Arousal annotations \citep{russell1980circumplex}---for temporal affect forecasting. Existing affect forecasting based on ecological momentary assessment (EMA)~\citep{stone1994ecological, shiffman2008ecological} typically fine-tunes Transformer models \citep{vaswani2017attention} on per-user histories, risking overfitting on sparse data. While chain-of-thought (CoT) prompting \citep{wei2022chain} enables structured multi-step reasoning and sentiment analysis can extract continuous affective dimensions from text \citep{mohammad2021sentiment}, applying LLMs to \emph{numeric} continuous-valued regression remains challenging. \citet{gruver2023large} showed that frozen LLMs achieve competitive zero-shot time-series forecasting, though safety-tuned models exhibit predictive variance collapse when used as direct numeric regressors~\citep{ouyang2022training}.

We hypothesized that a training-free baseline combining statistical priors with zero-shot LLM reasoning could be competitive. Our system pairs an OLS regression anchor with LLM semantic adjustment for Subtask 2A, and uses a CoT prompt for Subtask 2B. Both subtasks underperformed fine-tuned systems; this paper serves as a diagnostic evaluation with two key findings:
\begin{itemize}[noitemsep,topsep=2pt,parsep=0pt,partopsep=0pt,leftmargin=*]
    \item \textbf{Variance Collapse:} LLM predictions compress to $\sigma_V = 0.41$ vs.\ gold $\sigma_V = 1.73$, destroying Pearson correlation in state-change prediction.
    \item \textbf{Task-Dependent Utility:} CoT reasoning helps disposition forecasting ($\Delta r_V = +0.314$) but harms state-change prediction without calibration.
\end{itemize}

\section{Methodology}
\label{sec:meth}

\subsection{Subtask 2A: OLS Anchor + LLM Adjustment}

\paragraph{Mean-Reversion Anchor.}
Affective states exhibit regression toward a physiological mean~\citep{kuppens2010inertia}. We model the baseline predicted shift $\hat{\Delta} X_{\text{base}}$ for $X \in \{V, A\}$ via OLS regression against the prior state:
\begin{equation}
    \hat{\Delta} X_{\text{base}} = \alpha_X \cdot X_{t-1} + \beta_X
\end{equation}
where $\alpha_X = \rho_{(X_{t-1}, \Delta X)} (\sigma_{\Delta X} / \sigma_{X_{t-1}})$ and $\beta_X = \mu_{\Delta X} - \alpha_X \mu_{X_{t-1}}$, computed from training data. The resulting deterministic anchors are:
\begin{align}
    \hat{\Delta} V_{\text{base}} &= -0.6626 \cdot V_{t-1} + 0.1573 \\
    \hat{\Delta} A_{\text{base}} &= -0.7624 \cdot A_{t-1} + 0.5749
\end{align}

\paragraph{LLM Semantic Adjustment.}
A zero-shot Gemini 3 Pro agent reads the recent essays and generates a numeric emotional-shift estimate for each dimension via CoT reasoning (prompt in Appendix~\ref{sec:appendix}). The raw LLM output is added directly to the OLS anchor without calibration, and the sum is clamped to $[-3, 3]$:
\begin{equation}
    \Delta X = \text{clamp}\bigl(\hat{\Delta} X_{\text{base}} + \text{LLM}_X(T_t),\;[-3, 3]\bigr), \quad X \in \{V, A\},
\end{equation}
where $T_t$ denotes the textual input supplied to the LLM at step $t$ (the recent essay history and the current essay).
This deliberately uncalibrated design stress-tests the raw LLM signal: if it adds value without calibration, the approach is promising; if it degrades the anchor, the failure mode is informative.

\subsection{Subtask 2B: LLM Disposition Forecasting}
Subtask 2B targets long-horizon disposition change. The same LLM receives per-user statistics (essay count, historical means, historical disposition change) and five recent essays, applying a 3-step CoT to output $\hat{D}^{(U)}_{\text{future}} \in [-3, 3]$ (prompt in Appendix~\ref{sec:appendix2b}). We also report a \textbf{trend-continuation heuristic}: $\hat{D}^{(U)}_{\text{heuristic}} = \text{clamp}(D^{(U)}_{\text{historical}},\;[-3,3])$.
\section{Results}
\label{sec:results}

Evaluation uses Pearson $r$ and MAE per SemEval-2026 protocols. The LLM component uses \textbf{Gemini 3 Pro} via GitHub Copilot Chat, zero-shot with default decoding parameters, CoT reasoning, and JSON output formatting. The OLS component is fully deterministic; the LLM is subject to closed-source inference nondeterminism. All prompts are in Appendices~\ref{sec:appendix} and~\ref{sec:appendix2b}. Tables~\ref{tab:subtask2a} and~\ref{tab:subtask2b} show official submissions; Tables~\ref{tab:ablation} and~\ref{tab:subtask2b_ablation} are post-submission ablations on released labels.

\subsection{Subtask 2A: Short-Term State Change}

\begin{table}[h]
\centering
\small
\caption{Ablation Study on Subtask 2A (Test Set, N=46)}
\label{tab:ablation}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{l c c c c}
\toprule
\multirow{2}{*}{\textbf{Model Config.}} & \multicolumn{2}{c}{\textbf{Valence}} & \multicolumn{2}{c}{\textbf{Arousal}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
& $\boldsymbol{r}$ & \textbf{MAE} & $\boldsymbol{r}$ & \textbf{MAE} \\
\midrule
Linear (Prev Baseline) & $0.615$ & $1.168$ & $0.670$ & $0.638$ \\
OLS Mean Anchor Only & $0.357$ & $\mathbf{0.907}$ & $0.410$ & $0.752$ \\
Hybrid (LLM CoT + Anchor) & $-0.009$ & $1.377$ & $-0.114$ & $1.054$ \\
LLM Direct (GPT-5.2) & $-0.064$ & $1.326$ & $-0.186$ & $0.770$ \\
\bottomrule
\end{tabular}%
}
\end{table}
 

\begin{table}[h]
  \centering
  \resizebox{\columnwidth}{!}{%
  \begin{tabular}{lccccc}
  \toprule
  \textbf{Team / Model} & \textbf{V $r$} & \textbf{V MAE} & \textbf{A $r$} & \textbf{A MAE} & \textbf{Avg $r$} \\
  \midrule
  HITSZ-CyberS & \textbf{0.698} & -- & 0.568 & -- & 0.633 \\
  YNU & 0.692 & -- & \textbf{0.647} & -- & \textbf{0.669} \\
  Momentum & 0.553 & -- & 0.589 & -- & 0.571 \\
  linear(prev) & 0.615 & 1.168 & 0.670 & 0.638 & 0.642 \\
  \midrule
  \textbf{One and Only*} & \textit{-0.194} & \textit{1.398} & \textit{-0.423} & \textit{0.818} & \textit{-0.308} \\
  \bottomrule
  \end{tabular}%
  }
  \caption{Official Subtask 2A leaderboard metrics. (*MAE independently evaluated on released labels)}
  \label{tab:subtask2a}
  \end{table}

\subsection{Subtask 2B: Long-Term Disposition}
Table~\ref{tab:subtask2b_ablation} shows an ablation comparing the trend-continuation heuristic against the LLM CoT approach evaluated on the released test labels. Table~\ref{tab:subtask2b} places our submitted result (heuristic) in the official leaderboard context.

\begin{table}[h]
\centering
\small
\caption{Subtask 2B post-submission ablation (released labels, N=46). All LLM rows use v1 prompt except where noted. Post-hoc scaling matches $\sigma_{\text{pred}}$ to $\sigma_{\text{train}}$.}
\label{tab:subtask2b_ablation}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{l c c c c}
\toprule
\multirow{2}{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{Valence}} & \multicolumn{2}{c}{\textbf{Arousal}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
& $\boldsymbol{r}$ & \textbf{MAE} & $\boldsymbol{r}$ & \textbf{MAE} \\
\midrule
Trend-Continuation Heuristic & $-0.185$ & $0.899$ & $+0.016$ & $0.483$ \\
LLM CoT v1 (Gemini 3 Pro) & $\mathbf{+0.129}$ & $\mathbf{0.422}$ & $\mathbf{+0.094}$ & $\mathbf{0.306}$ \\
LLM CoT v1 (Claude Opus 4.6) & $-0.190$ & $0.522$ & $+0.065$ & $0.333$ \\
LLM CoT v1 + post-hoc scaling & $+0.129$ & $0.606$ & $+0.094$ & $0.419$ \\
LLM CoT v2 (enriched prompt) & $+0.021$ & $0.687$ & $-0.040$ & $0.420$ \\
\bottomrule
\end{tabular}%
}
\end{table}

\begin{table}[h]
  \centering
  \resizebox{\columnwidth}{!}{%
  \begin{tabular}{lccccc}
  \toprule
  \textbf{Team / Model} & \textbf{V $r$} & \textbf{V MAE} & \textbf{A $r$} & \textbf{A MAE} & \textbf{Avg $r$} \\
  \midrule
  HITSZ-CyberS & \textbf{0.580} & -- & 0.200 & -- & 0.390 \\
  linear(prev) & 0.434 & 0.406 & 0.584 & 0.286 & 0.509 \\
  UAlberta & 0.405 & -- & \textbf{0.602} & -- & \textbf{0.503} \\
  \midrule
  \textbf{One and Only*} & \textit{-0.185} & \textit{0.899} & \textit{0.016} & \textit{0.483} & \textit{-0.084} \\ 
  \bottomrule
  \end{tabular}%
  }
  \caption{Official Subtask 2B leaderboard (submitted heuristic result). (*MAE independently evaluated on released labels)}
  \label{tab:subtask2b}
  \end{table}

\section{Error Analysis}
\label{sec:error_analysis}

\subsection{Subtask 2A: Failure Modes}

The hybrid pipeline yielded negative correlations ($r_V = -0.009$), worse than the pure OLS anchor ($r_V = 0.357$). A separate GPT-5.2 zero-shot run ($r_V = -0.064$) confirms this is not model-specific. Three failure modes explain this:

\textbf{(1) Variance Collapse:} Ground-truth Valence changes have $\sigma = 1.73$; LLM predictions compress to $\sigma = 0.41$ (Figure~\ref{fig:variance}). Safety-tuned LLMs anchor outputs to conservative near-zero values~\citep{ouyang2022training}, destroying correlation.

\textbf{(2) CoT Neutralization Bias:} Step-by-step reasoning forces the model to weigh both positive and negative triggers, averaging them into neutralized near-zero outputs that fail to capture impulsive, non-linear affective shifts.

\textbf{(3) Uncalibrated Noise Injection:} Without a calibration layer, raw LLM scalars act as adversarial noise on the stable OLS anchor. Post-hoc linear variance scaling cannot help: Pearson $r$ is invariant to positive affine rescaling ($r(aX+b,Y) = r(X,Y)$ for $a > 0$), so rescaling leaves the correlation unchanged while amplifying misdirected predictions (MAE: $1.377 \to 1.851$). The binding constraint is direction accuracy, not variance. Representative failure cases are shown in Appendix~\ref{sec:appendix_failures}.

\begin{figure}[t]
  \centering
  \includegraphics[width=\columnwidth]{fig_variance.pdf}
  \caption{Predicted $\sigma$ across systems (Valence). The 2A hybrid $\sigma$ is 4$\times$ smaller than gold, confirming variance collapse. The 2B v2 prompt achieves near-target $\sigma$ but degrades $r$ by amplifying directional errors.}
  \label{fig:variance}
\end{figure}

\subsection{Subtask 2B: Disposition Analysis}
\label{sec:2b_analysis}

The LLM CoT achieves $r_V = +0.129$, MAE$_V = 0.422$ vs.\ the heuristic's $r_V = -0.185$, MAE$_V = 0.899$---demonstrating genuine predictive signal. The heuristic fails because copying historical disposition change cannot capture trajectory reversals that the LLM identifies by reasoning over recent essays relative to the historical mean. Full scatter plots are in Appendix~\ref{sec:appendix_scatter}.

\paragraph{Post-hoc Calibration.} Variance scaling (matching $\sigma_{\text{pred}}$ to $\sigma_{\text{train}}$) preserves $r_V = +0.129$ by construction ($r$ is scale-invariant) but increases MAE from $0.422$ to $0.606$, confirming that the binding constraint is directional accuracy.

\paragraph{Prompt Sensitivity.} An enriched v2 prompt supplying pre-computed trend and volatility features improved distributional alignment ($\sigma_V$: $0.162 \to 0.598$, approaching the target $0.620$) but degraded $r_V$ from $+0.129$ to $+0.021$. When direction accuracy is near chance ($\approx 55\%$), amplifying magnitude worsens Pearson $r$.

\paragraph{Cross-Model Comparison.} Running the identical v1 prompt on Claude Opus 4.6 yields $r_V = -0.190$ (vs.\ Gemini's $+0.129$) despite comparable MAE ($0.522$ vs.\ $0.422$), demonstrating that zero-shot disposition forecasting is model-sensitive: the same prompt produces qualitatively different correlation structures, suggesting observed correlations reflect model-specific inductive biases rather than robust prompt-driven reasoning.

\paragraph{Takeaway.} Zero-shot LLM reasoning provides genuine signal when the task is semantically tractable. Disposition change (slow-moving) benefits from CoT ($\Delta r_V = +0.314$); momentary state change (volatile) requires parametric calibration. LLMs should serve as upstream semantic encoders feeding calibrated regression layers.

\section{Conclusion}
\label{sec:conclusion}

We evaluated zero-shot LLM agents for longitudinal affect forecasting. LLM CoT reasoning captures disposition-level trends ($r_V = +0.129$ vs.\ $-0.185$ for the heuristic) but disrupts state-change prediction via variance collapse. The utility of zero-shot reasoning is gated by signal timescale: slow-moving targets respond to narrative reasoning; volatile targets require parametric calibration. Future work should route LLM semantic signals through calibrated regression layers and evaluate on held-out development partitions~\citep{gruver2023large}.

\section*{Limitations}
The system was evaluated with three LLMs (Gemini 3 Pro, Claude Opus 4.6, GPT-5.2) on a 46-instance test set; Pearson $r$ estimates are noisy without confidence intervals. Cross-model variability is substantial: 2B $r_V$ ranges from $-0.190$ (Claude) to $+0.129$ (Gemini) under the same prompt, and both Gemini and GPT-5.2 yield negative 2A correlations ($-0.009$, $-0.064$), confirming model sensitivity rather than stable prompt-driven reasoning. Post-submission ablation used released labels and was not part of official scoring. Exact replication is constrained by closed-source models, though all prompts and coefficients are disclosed in the appendices.

\section*{Acknowledgments}
We thank the SemEval-2026 Task 2 organizers for designing this challenging longitudinal affect benchmark.

\bibliography{custom}

\onecolumn
\appendix

\section{Supplementary Figures and Tables}
\label{sec:appendix_scatter}
\label{sec:appendix_failures}

\begin{figure}[H]
  \centering
  \includegraphics[width=\textwidth]{fig_scatter.pdf}
  \caption{Predicted vs.\ gold scatter plots (Valence dimension). \textbf{Top row:} Subtask 2A hybrid predictions cluster near zero ($\sigma_{\text{pred}} = 0.41$ vs.\ $\sigma_{\text{gold}} = 1.73$), illustrating severe variance collapse. \textbf{Bottom row:} Subtask 2B heuristic (left) shows high spread but negative correlation ($r = -0.185$); LLM v1 (right) achieves positive correlation ($r = +0.129$) despite conservative magnitude.}
  \label{fig:scatter}
\end{figure}

\begin{table}[H]
\centering
\small
\caption{Representative Subtask 2A failure cases (Valence). Context descriptions are derived from LLM reasoning traces.}
\label{tab:failcases}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{c p{5cm} r r r}
\toprule
\textbf{User} & \textbf{Context summary} & \textbf{Pred} & \textbf{Gold} & \textbf{$|$Err$|$} \\
\midrule
8  & Bereavement; LLM flags large negative shift & $-1.17$ & $-4.0$ & $2.83$ \\
6  & Anxious state; LLM adjustment inverts anchor & $+0.69$ & $-1.0$ & $1.69$ \\
50 & Surface-positive text; large drop in gold & $-1.04$ & $-3.0$ & $1.96$ \\
29 & Negative keywords; strong positive rebound & $+1.36$ & $+3.0$ & $1.64$ \\
\bottomrule
\end{tabular}%
}
\end{table}

\section{System Prompt Template (Subtask 2A)}
\label{sec:appendix}

The following prompt was provided to the Gemini 3 Pro agent for all
Subtask 2A instances. Placeholders in \texttt{\{braces\}} are
substituted at inference time with per-instance data.

\begin{Verbatim}[fontsize=\footnotesize,frame=single,framesep=2mm]
ROLE: You are an expert in quantitative psychology
specialising in longitudinal affect forecasting.
Assess short-term emotional change using the
Russell Circumplex Model:
  Valence: negative <-> positive
  Arousal: calm <-> excited
INPUTS:
  Recent essay history: {ESSAY_HISTORY}
  Current essay:        {CURRENT_ESSAY}
  Prior state: Valence = {PREV_VALENCE}, Arousal = {PREV_AROUSAL}
INPUT TYPE DETECTION:
  Narrative : analyse causal event logic.
  Keywords  : analyse adjective density.
  Noisy/Spam: predict mild arousal decrease (low-motivation signal).
3-STEP CHAIN-OF-THOUGHT REASONING:
Step 1 (Anchor): Identify current V_t and A_t as baseline.
  All predicted changes are relative to this.
Step 2 (Intensity Mapping):
  Large  (+-2.0 to +-3.0): major life events
  Medium (+-0.5 to +-1.5): work, sleep, daily interactions
  Small  (+-0.1 to +-0.4): natural mood fluctuations
Step 3 (Delta Computation): Compute predicted change.
  Regression to the Mean: if V_t near +3, expect negative delta;
  if near -3, expect positive delta.
CONSTRAINTS: Output ONLY valid JSON. Values in [-3, 3].
{"reasoning": "<trace>",
 "pred_state_change_valence": <float>,
 "pred_state_change_arousal": <float>}
\end{Verbatim}

\section{System Prompt Templates (Subtask 2B)}
\label{sec:appendix2b}

We ran two Subtask 2B inference experiments. \textbf{Prompt v1} (primary post-submission experiment, Table~\ref{tab:subtask2b_ablation} row~2) supplies raw per-user statistics and 5 recent essays. \textbf{Prompt v2} (prompt-sensitivity ablation, Table~\ref{tab:subtask2b_ablation} row~5) adds pre-computed \texttt{recent\_trend} and \texttt{recent\_volatility} signals per dimension and instructs bolder magnitude outputs.

\paragraph{Prompt v1 --- Primary Experiment.}
Input: \texttt{subtask2b\_llm\_inputs.json} (raw statistics + last 5 essays per user).

\begin{Verbatim}[fontsize=\footnotesize,frame=single,framesep=2mm]
ROLE: Expert in longitudinal affect forecasting (Russell Circumplex).
  Valence: negative <-> positive  (scale: -3 to +3)
  Arousal: calm <-> excited       (scale: -3 to +3)
TASK: Predict the SHIFT in the user's dispositional BASELINE
  (not momentary state). historical_disposition_change is anchor.
INPUT: n_historical_essays, mean_valence, mean_arousal,
  historical_disposition_change_valence/arousal, recent_essays (last 5).
3-STEP CHAIN-OF-THOUGHT:
Step 1 (Baseline Assessment): Positive/negative/neutral? Trending?
Step 2 (Trajectory Projection): Compare recent vs. historical mean.
  Large  (+/-0.3 to +/-0.8): sustained multi-week trend reversals
  Medium (+/-0.1 to +/-0.3): gradual drift
  Small  (+/-0.0 to +/-0.1): stable baseline
Step 3 (Delta): Output FUTURE disposition change.
  Apply regression-to-mean for extremes.
OUTPUT: Valid JSON array only. All values in [-3, 3].
\end{Verbatim}

\paragraph{Prompt v2 --- Sensitivity Ablation.}
Input: \texttt{subtask2b\_llm\_inputs\_v2.json} (adds \texttt{recent\_trend} and \texttt{recent\_volatility} per dimension).

\begin{Verbatim}[fontsize=\footnotesize,frame=single,framesep=2mm]
ROLE: Expert in longitudinal affect forecasting (Russell Circumplex).
  Valence: negative <-> positive  (scale: -3 to +3)
  Arousal: calm <-> excited       (scale: -3 to +3)
TASK: Same as Prompt v1.
ADDITIONAL INPUT (pre-computed):
  recent_trend_valence/arousal: mean(last 5) - historical mean.
    POSITIVE = recent above baseline (primary directional signal).
  recent_volatility_valence/arousal: std of last 5 essays.
    High volatility (>0.8) = user in active flux.
CALIBRATION: std ~0.6 (valence), ~0.4 (arousal).
  Range: -1.5 to +1.5. Near-0.0 predictions should be RARE.
3-STEP CHAIN-OF-THOUGHT:
Step 1 (Signal Assessment):
  |recent_trend| > 0.5  -> bold   (+/-0.6 to +/-1.5)
  0.2--0.5              -> medium (+/-0.3 to +/-0.6)
  < 0.2                 -> small  (+/-0.1 to +/-0.3)
  Near-zero ONLY if flat recent AND flat hist. change.
Step 2 (Direction & Magnitude):
  Direction = sign(recent_trend).
  Apply 30% regression-to-mean if |hist_dispo_change| > 1.0.
Step 3 (Final Check): Verify direction; clamp to [-3, 3].
OUTPUT: Valid JSON array only. All values in [-3, 3].
\end{Verbatim}

\end{document}
