@inproceedings{takahashi-ishihara-2025-quantifying,
title = "Quantifying Memorization in Continual Pre-training with {J}apanese General or Industry-Specific Corpora",
author = "Takahashi, Hiromu and
Ishihara, Shotaro",
editor = "Jia, Robin and
Wallace, Eric and
Huang, Yangsibo and
Pimentel, Tiago and
Maini, Pratyush and
Dankers, Verna and
Wei, Johnny and
Lesci, Pietro",
booktitle = "Proceedings of the First Workshop on Large Language Model Memorization (L2M2)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.l2m2-1.8/",
pages = "95--105",
ISBN = "979-8-89176-278-7",
abstract = "Despite the growing concern about memorization of training data using large language models (LLMs), there has been insufficient analysis under conditions using non-English or industry-specific corpora.This study focuses on continual pre-training, a common approach in building non-English LLMs, and quantifies memorization of training data.Specifically, we trained two models based on Llama 3 using Japanese Wikipedia (general) and Japanese financial news articles (industry-specific).Experiments showed a tendency for the amount of memorization to increase as training progressed, similar to the empirical findings for English.This trend was clear in the industry-specific corpus, suggesting potential risks when using valuable, non-general industry corpora.We also identified issues specific to Japanese, and emphasized the importance of analysis other than in English."
}