% Volume-level record for the ALVR 2024 workshop proceedings (Anthology ID 2024.alvr-1.0).
% The url field uses the canonical Anthology host rather than a temporary preview build.
@proceedings{alvr-2024-advances,
    title = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.0/}
}
% ALVR 2024 paper 2024.alvr-1.1 (pp. 1--6).
% The dataset name WISMIR3 is braced as a whole word so bibliography styles cannot recase it.
@inproceedings{schneider-biemann-2024-wismir3,
    title = {{WISMIR3}: A Multi-Modal Dataset to Challenge Text-Image Retrieval Approaches},
    author = {Schneider, Florian and
      Biemann, Chris},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.1/},
    doi = {10.18653/v1/2024.alvr-1.1},
    pages = {1--6}
}
% ALVR 2024 paper 2024.alvr-1.2 (pp. 7--25).
% The mixed-case name mBLIP and the acronym LLMs are braced as whole words to survive style recasing.
@inproceedings{geigle-etal-2024-mblip,
    title = {{mBLIP}: Efficient Bootstrapping of Multilingual Vision-{LLMs}},
    author = {Geigle, Gregor and
      Jain, Abhay and
      Timofte, Radu and
      Glava{\v{s}}, Goran},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.2/},
    doi = {10.18653/v1/2024.alvr-1.2},
    pages = {7--25}
}
% ALVR 2024 paper 2024.alvr-1.3 (pp. 26--36).
% The acronym LMPT is braced so bibliography styles cannot lowercase it.
@inproceedings{xia-etal-2024-lmpt,
    title = {{LMPT}: Prompt Tuning with Class-Specific Embedding Loss for Long-Tailed Multi-Label Visual Recognition},
    author = {Xia, Peng and
      Xu, Di and
      Hu, Ming and
      Ju, Lie and
      Ge, Zongyuan},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.3/},
    doi = {10.18653/v1/2024.alvr-1.3},
    pages = {26--36}
}
% ALVR 2024 paper 2024.alvr-1.4 (pp. 37--58).
% The acronym NOPE is braced so bibliography styles cannot lowercase it.
@inproceedings{lovenia-etal-2024-negative,
    title = {Negative Object Presence Evaluation ({NOPE}) to Measure Object Hallucination in Vision-Language Models},
    author = {Lovenia, Holy and
      Dai, Wenliang and
      Cahyawijaya, Samuel and
      Ji, Ziwei and
      Fung, Pascale},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.4/},
    doi = {10.18653/v1/2024.alvr-1.4},
    pages = {37--58}
}
% ALVR 2024 paper 2024.alvr-1.5 (pp. 59--72).
% The model name CLIP is braced so bibliography styles cannot lowercase it.
@inproceedings{quantmeyer-etal-2024-clip,
    title = {How and where does {CLIP} process negation?},
    author = {Quantmeyer, Vincent and
      Mosteiro, Pablo and
      Gatt, Albert},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.5/},
    doi = {10.18653/v1/2024.alvr-1.5},
    pages = {59--72}
}
% ALVR 2024 paper 2024.alvr-1.6 (pp. 73--85).
% The url field uses the canonical Anthology host rather than a temporary preview build.
@inproceedings{nikandrou-etal-2024-enhancing,
    title = {Enhancing Continual Learning in Visual Question Answering with Modality-Aware Feature Distillation},
    author = {Nikandrou, Malvina and
      Pantazopoulos, Georgios and
      Konstas, Ioannis and
      Suglia, Alessandro},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.6/},
    doi = {10.18653/v1/2024.alvr-1.6},
    pages = {73--85}
}
% ALVR 2024 paper 2024.alvr-1.7 (pp. 86--91).
% The language names are braced as whole words so bibliography styles cannot lowercase them.
@inproceedings{teramen-etal-2024-english,
    title = {{English}-to-{Japanese} Multimodal Machine Translation Based on Image-Text Matching of Lecture Videos},
    author = {Teramen, Ayu and
      Ohtsuka, Takumi and
      Kondo, Risa and
      Kajiwara, Tomoyuki and
      Ninomiya, Takashi},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.7/},
    doi = {10.18653/v1/2024.alvr-1.7},
    pages = {86--91}
}
% ALVR 2024 paper 2024.alvr-1.8 (pp. 92--101).
% The camel-case name VideoCoT is braced as one word so bibliography styles cannot recase it.
@inproceedings{wang-etal-2024-videocot,
    title = {{VideoCoT}: A Video Chain-of-Thought Dataset with Active Annotation Tool},
    author = {Wang, Yan and
      Zeng, Yawen and
      Zheng, Jingsheng and
      Xing, Xiaofen and
      Xu, Jin and
      Xu, Xiangmin},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.8/},
    doi = {10.18653/v1/2024.alvr-1.8},
    pages = {92--101}
}
% ALVR 2024 paper 2024.alvr-1.9 (pp. 102--115).
% Accented letters in author names are brace-wrapped LaTeX escapes, which classic BibTeX treats as single letters.
@inproceedings{rosch-etal-2024-enhancing,
    title = {Enhancing Conceptual Understanding in Multimodal Contrastive Learning through Hard Negative Samples},
    author = {R{\"o}sch, Philipp J. and
      Oswald, Norbert and
      Geierhos, Michaela and
      Libovick{\'y}, Jind{\v{r}}ich},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.9/},
    doi = {10.18653/v1/2024.alvr-1.9},
    pages = {102--115}
}
% ALVR 2024 paper 2024.alvr-1.10 (pp. 116--128).
% The url field uses the canonical Anthology host rather than a temporary preview build.
@inproceedings{xia-etal-2024-vision,
    title = {Vision Language Models for Spreadsheet Understanding: Challenges and Opportunities},
    author = {Xia, Shiyu and
      Xiong, Junyu and
      Dong, Haoyu and
      Zhao, Jianbo and
      Tian, Yuzhang and
      Zhou, Mengyu and
      He, Yeye and
      Han, Shi and
      Zhang, Dongmei},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.10/},
    doi = {10.18653/v1/2024.alvr-1.10},
    pages = {116--128}
}
% ALVR 2024 paper 2024.alvr-1.11 (pp. 129--137).
% The mixed-case name SlideAVSR is braced as one word so bibliography styles cannot recase it.
@inproceedings{wang-etal-2024-slideavsr,
    title = {{SlideAVSR}: A Dataset of Paper Explanation Videos for Audio-Visual Speech Recognition},
    author = {Wang, Hao and
      Kurita, Shuhei and
      Shimizu, Shuichiro and
      Kawahara, Daisuke},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.11/},
    doi = {10.18653/v1/2024.alvr-1.11},
    pages = {129--137}
}
% ALVR 2024 paper 2024.alvr-1.12 (pp. 138--154).
% The url field uses the canonical Anthology host rather than a temporary preview build.
@inproceedings{hu-keller-2024-causal,
    title = {Causal and Temporal Inference in Visual Question Generation by Utilizing Pre-trained Models},
    author = {Hu, Zhanghao and
      Keller, Frank},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.12/},
    doi = {10.18653/v1/2024.alvr-1.12},
    pages = {138--154}
}
% ALVR 2024 paper 2024.alvr-1.13 (pp. 155--166).
% The accented letter in the last author's surname is a brace-wrapped LaTeX escape, which classic BibTeX treats as a single letter.
@inproceedings{reinhardt-etal-2024-improving,
    title = {Improving Vision-Language Cross-Lingual Transfer with Scheduled Unfreezing},
    author = {Reinhardt, Max and
      Geigle, Gregor and
      Timofte, Radu and
      Glava{\v{s}}, Goran},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.13/},
    doi = {10.18653/v1/2024.alvr-1.13},
    pages = {155--166}
}
% ALVR 2024 paper 2024.alvr-1.14 (pp. 167--172).
% The url field uses the canonical Anthology host rather than a temporary preview build.
@inproceedings{zhu-etal-2024-automatic,
    title = {Automatic Layout Planning for Visually-Rich Documents with Instruction-Following Models},
    author = {Zhu, Wanrong and
      Zhang, Ruiyi and
      Healey, Jennifer and
      Wang, William Yang and
      Sun, Tong},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.14/},
    doi = {10.18653/v1/2024.alvr-1.14},
    pages = {167--172}
}
% ALVR 2024 paper 2024.alvr-1.15 (pp. 173--185).
% The acronyms SEA and VQA and the proper nouns Southeast and Asian are braced as whole words against style recasing.
@inproceedings{urailertprasert-etal-2024-sea,
    title = {{SEA}-{VQA}: {Southeast} {Asian} Cultural Context Dataset For Visual Question Answering},
    author = {Urailertprasert, Norawit and
      Limkonchotiwat, Peerat and
      Suwajanakorn, Supasorn and
      Nutanong, Sarana},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.15/},
    doi = {10.18653/v1/2024.alvr-1.15},
    pages = {173--185}
}
% ALVR 2024 paper 2024.alvr-1.16 (pp. 186--194).
% The name Wiki-VEL is braced as one unit against style recasing; accented letters in author names are brace-wrapped LaTeX escapes.
@inproceedings{bielefeld-etal-2024-wiki,
    title = {{Wiki-VEL}: Visual Entity Linking for Structured Data on Wikimedia Commons},
    author = {Bielefeld, Philipp and
      Geppert, Jasmin and
      G{\"u}ven, Necdet and
      John, Melna and
      Ziupka, Adrian and
      Kaffee, Lucie-Aim{\'e}e and
      Biswas, Russa and
      De Melo, Gerard},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.16/},
    doi = {10.18653/v1/2024.alvr-1.16},
    pages = {186--194}
}
% ALVR 2024 paper 2024.alvr-1.17 (pp. 195--201).
% The mixed-case name VerbCLIP is braced as one word so bibliography styles cannot recase it.
@inproceedings{wazni-etal-2024-verbclip,
    title = {{VerbCLIP}: Improving Verb Understanding in Vision-Language Models with Compositional Structures},
    author = {Wazni, Hadi and
      Lo, Kin Ian and
      Sadrzadeh, Mehrnoosh},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.17/},
    doi = {10.18653/v1/2024.alvr-1.17},
    pages = {195--201}
}
% ALVR 2024 paper 2024.alvr-1.18 (pp. 202--208).
% The url field uses the canonical Anthology host rather than a temporary preview build.
@inproceedings{narin-2024-evolutionary,
    title = {Evolutionary Reward Design and Optimization with Multimodal Large Language Models},
    author = {Narin, Ali},
    editor = {Gu, Jing and
      Fu, Tsu-Jui (Ray) and
      Hudson, Drew and
      Celikyilmaz, Asli and
      Wang, William},
    booktitle = {Proceedings of the 3rd Workshop on Advances in Language and Vision Research (ALVR)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.alvr-1.18/},
    doi = {10.18653/v1/2024.alvr-1.18},
    pages = {202--208}
}