@comment{
  Bibliographic record for the PseudoSeer paper, published in
  Findings of the Association for Computational Linguistics: ACL 2026.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  path, which looks like a staging link -- confirm the canonical URL (or add
  a doi) before relying on it.
  NOTE(review): address appears to hold the conference location rather than
  a publisher city; it is kept unchanged here.
}
@inproceedings{toksoz-etal-2026-pseudoseer,
  title     = {{PseudoSeer}: a Search Engine for Pseudocode},
  author    = {Toksoz, Levent and
               Srinath, Mukund and
               Tan, Gang and
               Giles, C. Lee},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1978/},
  pages     = {39706--39716},
  isbn      = {979-8-89176-395-1},
  abstract  = {PseudoSeer is a novel search engine for academic pseudocode, enabling retrieval over 320,000 algorithm implementations extracted from the arXiv. Using the system's caption-reference pairs, we study asymmetric retrieval, matching short queries with a median length of five words against long documents of roughly 300 words composed primarily of natural language with limited LaTeX notation. Our evaluation reveals scaling limitations in embedding models: a 149M parameter encoder outperforms 1.5B parameter alternatives, while BM25 remains competitive with pretrained models. Analyzing attention patterns over 33,000 caption document pairs, we identify two factors driving these results: attention efficiency and attention concentration. Models that significantly attend to sinks or non-discriminative tokens leave less attention for discriminative content, while models with overly diffuse attention fail to form discriminative representations. Guided by these findings, PseudoSeer's embedding model, trained via contrastive learning with efficient attention patterns, outperforms the best pretrained model by 8.7 points. A hybrid approach combining learned embeddings with BM25 reaches 66.5\% R@10. PseudoSeer is deployed at pseudoseer.ist.psu.edu as both a practical search system and a benchmark for retrieval evaluation.},
}
Markdown (Informal)
[PseudoSeer: a Search Engine for Pseudocode](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1978/) (Toksoz et al., Findings 2026)
ACL
- Levent Toksoz, Mukund Srinath, Gang Tan, and C. Lee Giles. 2026. PseudoSeer: a Search Engine for Pseudocode. In Findings of the Association for Computational Linguistics: ACL 2026, pages 39706–39716, San Diego, California, United States. Association for Computational Linguistics.