% Conference paper in the *SEM 2026 proceedings.
% The honorific "Dr." is intentionally not part of the second author's name:
% BibTeX treats everything after the comma as given names, so a style that
% abbreviates given names would otherwise print a spurious "D." initial.
% NOTE(review): the url below points at a preview/ingest host; confirm the
% canonical address before this entry is used in a camera-ready bibliography.
@inproceedings{r-r-etal-2026-surface,
  title     = {Under the Surface: Probing {Tamil} Paraphrase Intelligence},
  author    = {R R, Viswadarshan and
               Lilian, J. Felicia and
               S, Mahalakshmi},
  editor    = {Mohammad, Saif M. and
               Ousidhoum, Nedjma},
  booktitle = {Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*{SEM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.starsem-conference.15/},
  pages     = {244--254},
  isbn      = {979-8-89176-413-2},
  abstract  = {We present a systematic study on paraphrase detection in Tamil by constructing a unified dataset through translation and semantic verification of three English benchmarks QQP, PAWS, and MRPC. Unlike prior efforts that focus on individual sources or limited scales, our dataset combines multiple paraphrase detection paradigms and is evaluated using semantic similarity metrics, round-trip translation checks, and classifier agreement analysis. We fine-tune five multilingual transformer models (mBERT, XLM-R, IndicBERT, MuRIL, and DistilmBERT) and a Tamil-specific compact model, TLMR (Tamil Language Model - DeBERTa), pretrained on 525M Tamil tokens. Furthermore, we assess the representational quality of the sentence embeddings that are taken from these models using lightweight classifiers (SVM, XGBoost, and Logistic Regression). We formulate an efficiency-oriented metric that incorporates top-5 accuracy, vocabulary usage, and script fidelity in relation to perplexity in order to facilitate resource-aware evaluation. The experimental findings lay the groundwork for future Tamil semantic understanding tasks by highlighting differences in generalization and efficiency across models.},
}
Markdown (Informal)
[Under the Surface: Probing Tamil Paraphrase Intelligence](https://preview.aclanthology.org/ingest-acl-workshops/2026.starsem-conference.15/) (R R et al., *SEM 2026)
ACL
- Viswadarshan R R, Dr. J. Felicia Lilian, and Mahalakshmi S. 2026. Under the Surface: Probing Tamil Paraphrase Intelligence. In Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026), pages 244–254, San Diego, California, United States. Association for Computational Linguistics.