% Workshop paper introducing the Compact Math Corpus (CMC), NALOMA 2025.
% Braces in title/booktitle protect the resource name and the workshop acronym
% from sentence-casing bibliography styles.
% NOTE(review): the url field points at a preview.aclanthology.org branch host;
% confirm the canonical Anthology URL (or a DOI) before relying on it.
@inproceedings{ferreira-2025-building,
    title     = {Building a {Compact Math Corpus}},
    author    = {Ferreira, Andrea},
    editor    = {Abzianidze, Lasha and de Paiva, Valeria},
    booktitle = {Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning ({NALOMA})},
    month     = aug,
    year      = {2025},
    address   = {Bochum, Germany},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/gwc-25-ingestion/2025.naloma-1.5/},
    pages     = {48--55},
    isbn      = {979-8-89176-287-9},
    abstract  = {This paper introduces the Compact Math Corpus (CMC), a preliminary resource for natural language processing in the mathematics domain. We process three open-access undergraduate textbooks from distinct mathematical areas and annotate them in the CoNLL-U format using a lightweight pipeline based on the spaCy Small model. The structured output enables the extraction of syntactic bigrams and TF-IDF scores, supporting a syntactic-semantic analysis of mathematical sentences. From the annotated data, we construct a classification dataset comprising bigrams potentially representing mathematical concepts, along with representative example sentences. We combine CMC with the conversational corpus UD English EWT and train a logistic regression model with K-fold cross-validation, achieving a minimum macro-F1 score of 0.989. These results indicate the feasibility of automatic concept identification in mathematical texts. The study is designed for easy replication in low-resource settings and to promote sustainable research practices. Our approach offers a viable path to tasks such as parser adaptation, terminology extraction, multiword expression modeling, and improved analysis of mathematical language structures.},
}
Markdown (Informal)
[Building a Compact Math Corpus](https://preview.aclanthology.org/gwc-25-ingestion/2025.naloma-1.5/) (Ferreira, NALOMA 2025)
ACL
- Andrea Ferreira. 2025. Building a Compact Math Corpus. In Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA), pages 48–55, Bochum, Germany. Association for Computational Linguistics.