% Utpala, Gu, and Chen: "Language Agnostic Code Embeddings", NAACL 2024 (Volume 1: Long Papers).
% The url field uses the canonical ACL Anthology address instead of a preview-branch host,
% so the link keeps resolving after the preview deployment is removed.
@inproceedings{utpala-etal-2024-language,
  title     = {Language Agnostic Code Embeddings},
  author    = {Utpala, Saiteja and
               Gu, Alex and
               Chen, Pin-Yu},
  editor    = {Duh, Kevin and
               Gomez, Helena and
               Bethard, Steven},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.naacl-long.38/},
  doi       = {10.18653/v1/2024.naacl-long.38},
  pages     = {678--691},
  abstract  = {Recently, code language models have achieved notable advancements in addressing a diverse array of essential code comprehension and generation tasks. Yet, the field lacks a comprehensive deep dive and understanding of the code embeddings of multilingual code models. In this paper, we present a comprehensive study on multilingual code embeddings, focusing on the cross-lingual capabilities of these embeddings across different programming languages. Through probing experiments, we demonstrate that code embeddings comprise two distinct components: one deeply tied to the nuances and syntax of a specific language, and the other remaining agnostic to these details, primarily focusing on semantics. Further, we show that when we isolate and eliminate this language-specific component, we witness significant improvements in downstream code retrieval tasks, leading to an absolute increase of up to +17 in the Mean Reciprocal Rank (MRR).},
}
Markdown (Informal)
[Language Agnostic Code Embeddings](https://aclanthology.org/2024.naacl-long.38/) (Utpala et al., NAACL 2024)
ACL
- Saiteja Utpala, Alex Gu, and Pin-Yu Chen. 2024. Language Agnostic Code Embeddings. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 678–691, Mexico City, Mexico. Association for Computational Linguistics.