main.bib

# Example of a paper entry
@misc{qian2023communicative,
  title         = {Communicative Agents for Software Development},
  author        = {Chen Qian and Xin Cong and Wei Liu and Cheng Yang and Weize Chen and Yusheng Su and Yufan Dang and Jiahao Li and Juyuan Xu and Dahai Li and Zhiyuan Liu and Maosong Sun},
  year          = {2023},
  url           = {https://arxiv.org/abs/2307.07924},
  environments  = {collaboration, embodied},
  agents        = {prompting_and_in_context_learning, more_than_three_agents},
  evaluation    = {rule_based},
  other         = {n/a},
  eprint        = {2307.07924},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SE},
}
## Papers
### Surveys and Overview

### Environments

#### Text Environments

@article{environments/language,
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
  author = {specical entry},
}

@article{Bard_2020,
  title        = {The Hanabi challenge: A new frontier for AI research},
  volume       = {280},
  issn         = {0004-3702},
  url          = {http://dx.doi.org/10.1016/j.artint.2019.103216},
  doi          = {10.1016/j.artint.2019.103216},
  journal      = {Artificial Intelligence},
  publisher    = {Elsevier BV},
  author       = {Bard, Nolan and Foerster, Jakob N. and Chandar, Sarath and Burch, Neil and Lanctot, Marc and Song, H. Francis and Parisotto, Emilio and Dumoulin, Vincent and Moitra, Subhodeep and Hughes, Edward and Dunning, Iain and Mourad, Shibl and Larochelle, Hugo and Bellemare, Marc G. and Bowling, Michael},
  year         = {2020},
  environments = {collaboration, text},
  agents       = {more_than_three_agents},
  evaluation   = {rule_based},
  other        = {n/a},
  month        = {3},
  pages        = {103216}
}

@inproceedings{he-etal-2018-decoupling,
  title        = {Decoupling Strategy and Generation in Negotiation Dialogues},
  author       = {He, He  and
      Chen, Derek  and
      Balakrishnan, Anusha  and
      Liang, Percy},
  editor       = {Riloff, Ellen  and
      Chiang, David  and
      Hockenmaier, Julia  and
      Tsujii, Jun{'}ichi},
  booktitle    = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  month        = {10},
  year         = {2018},
  address      = {Brussels, Belgium},
  publisher    = {Association for Computational Linguistics},
  url          = {https://aclanthology.org/D18-1256},
  doi          = {10.18653/v1/D18-1256},
  pages        = {2333--2343},
  environments = {text, mixed_objectives},
  agents       = {finetuning, reinforcement_learning, two_agents, agents_with_memory},
  evaluation   = {human},
  other        = {n/a}
}

@inproceedings{lewis-etal-2017-deal,
  title        = {Deal or No Deal? End-to-End Learning of Negotiation Dialogues},
  author       = {Lewis, Mike  and
      Yarats, Denis  and
      Dauphin, Yann  and
      Parikh, Devi  and
      Batra, Dhruv},
  editor       = {Palmer, Martha  and
      Hwa, Rebecca  and
      Riedel, Sebastian},
  booktitle    = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  month        = {9},
  year         = {2017},
  address      = {Copenhagen, Denmark},
  publisher    = {Association for Computational Linguistics},
  url          = {https://aclanthology.org/D17-1259},
  doi          = {10.18653/v1/D17-1259},
  pages        = {2443--2453},
  environments = {text, mixed_objectives},
  agents       = {reinforcement_learning, two_agents, agents_with_memory},
  evaluation   = {rule_based},
  other        = {human_agent}
}

@inproceedings{wang-etal-2019-persuasion,
    title = "Persuasion for Good: Towards a Personalized Persuasive Dialogue System for Social Good",
    author = "Wang, Xuewei  and
      Shi, Weiyan  and
      Kim, Richard  and
      Oh, Yoojung  and
      Yang, Sijia  and
      Zhang, Jingwen  and
      Yu, Zhou",
    editor = "Korhonen, Anna  and
      Traum, David  and
      M{\`a}rquez, Llu{\'\i}s",
    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
    month = {7},
    year = "2019",
    address = "Florence, Italy",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P19-1566",
    doi = "10.18653/v1/P19-1566",
    pages = "5635--5649",
    abstract = "Developing intelligent persuasive conversational agents to change people{'}s opinions and actions for social good is the frontier in advancing the ethical development of automated dialogue systems. To do so, the first step is to understand the intricate organization of strategic disclosures and appeals employed in human persuasion conversations. We designed an online persuasion task where one participant was asked to persuade the other to donate to a specific charity. We collected a large dataset with 1,017 dialogues and annotated emerging persuasion strategies from a subset. Based on the annotation, we built a baseline classifier with context information and sentence-level features to predict the 10 persuasion strategies used in the corpus. Furthermore, to develop an understanding of personalized persuasion processes, we analyzed the relationships between individuals{'} demographic and psychological backgrounds including personality, morality, value systems, and their willingness for donation. Then, we analyzed which types of persuasion strategies led to a greater amount of donation depending on the individuals{'} personal backgrounds. This work lays the ground for developing a personalized persuasive dialogue system.",
    environments={text, mixed_objectives},
    agents={two_agents, finetuning},
    evaluation={human, rule_based},
    other={human_agent}
}

@inproceedings{peskov-etal-2020-takes,
    title = "It Takes Two to Lie: One to Lie, and One to Listen",
    author = "Peskov, Denis  and
      Cheng, Benny  and
      Elgohary, Ahmed  and
      Barrow, Joe  and
      Danescu-Niculescu-Mizil, Cristian  and
      Boyd-Graber, Jordan",
    editor = "Jurafsky, Dan  and
      Chai, Joyce  and
      Schluter, Natalie  and
      Tetreault, Joel",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    month = {7},
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.acl-main.353",
    doi = "10.18653/v1/2020.acl-main.353",
    pages = "3811--3854",
    abstract = "Trust is implicit in many online text conversations{---}striking up new friendships, or asking for tech support. But trust can be betrayed through deception. We study the language and dynamics of deception in the negotiation-based game Diplomacy, where seven players compete for world domination by forging and breaking alliances with each other. Our study with players from the Diplomacy community gathers 17,289 messages annotated by the sender for their intended truthfulness and by the receiver for their perceived truthfulness. Unlike existing datasets, this captures deception in long-lasting relationships, where the interlocutors strategically combine truth with lies to advance objectives. A model that uses power dynamics and conversational contexts can predict when a lie occurs nearly as well as human players.",
    environments={text, mixed_objectives},
    agents={more_than_three_agents},
    evaluation={model_based},
    other={human_agent}
}

@article{LanctotEtAl2019OpenSpiel,
  title     = {{OpenSpiel}: A Framework for Reinforcement Learning in Games},
  author    = {Marc Lanctot and Edward Lockhart and Jean-Baptiste Lespiau and
               Vinicius Zambaldi and Satyaki Upadhyay and Julien P{\'e}rolat and
               Sriram Srinivasan and Finbarr Timbers and Karl Tuyls and
               Shayegan Omidshafiei and Daniel Hennes and Dustin Morrill and
               Paul Muller and Timo Ewalds and Ryan Faulkner and J{\'a}nos Kram{\'a}r
               and Bart De Vylder and Brennan Saeta and James Bradbury and David Ding
               and Sebastian Borgeaud and Matthew Lai and Julian Schrittwieser and
               Thomas Anthony and Edward Hughes and Ivo Danihelka and Jonah Ryan-Davis},
  month     = {8},
  year      = {2019},
  eprint    = {1908.09453},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  journal   = {CoRR},
  volume    = {abs/1908.09453},
  url       = {http://arxiv.org/abs/1908.09453},
  environments={collaboration, competition, mixed_objectives, text},
  agents={two_agents, more_than_three_agents, reinforcement_learning},
  evaluation={rule_based},
  other={n/a}
}

@article{zha2019rlcard,
  title        = {RLCard: A Toolkit for Reinforcement Learning in Card Games},
  author       = {Zha, Daochen and Lai, Kwei-Herng and Cao, Yuanpu and Huang, Songyi and Wei, Ruzhe and Guo, Junyu and Hu, Xia},
  journal      = {arXiv preprint arXiv:1910.04376},
  month        = {7},
  year         = {2019},
  environments = {collaboration, competition, mixed_objectives, text},
  agents       = {two_agents, more_than_three_agents, reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a},
  url          = {https://github.com/datamllab/rlcard}
}

@article{meta2022human,
  title={Human-level play in the game of Diplomacy by combining language models with strategic reasoning},
  author={{Meta Fundamental AI Research Diplomacy Team (FAIR)} and Bakhtin, Anton and Brown, Noam and Dinan, Emily and Farina, Gabriele and Flaherty, Colin and Fried, Daniel and Goff, Andrew and Gray, Jonathan and Hu, Hengyuan and others},
  journal={Science},
  volume={378},
  number={6624},
  pages={1067--1074},
  month={11},
  year={2022},
  publisher={American Association for the Advancement of Science},
  doi={10.1126/science.ade9097},
  url={https://www.science.org/doi/full/10.1126/science.ade9097},
  environments={competition, text},
  agents={more_than_three_agents, reinforcement_learning, finetuning},
  evaluation={rule_based},
  other={human_agent}
}

@software{multigrid,
  author       = {Oguntola, Ini},
  title        = {Fast Multi-Agent Gridworld Environments for Gymnasium},
  url          = {https://github.com/ini/multigrid},
  month        = {3},
  year         = {2023},
  journal      = {GitHub},
  environments = {collaboration, competition, text},
  agents       = {two_agents, more_than_three_agents, reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{callison-burch-etal-2022-dungeons,
    title = "Dungeons and Dragons as a Dialog Challenge for Artificial Intelligence",
    author = "Callison-Burch, Chris  and
      Tomar, Gaurav Singh  and
      Martin, Lara  and
      Ippolito, Daphne  and
      Bailis, Suma  and
      Reitter, David",
    editor = "Goldberg, Yoav  and
      Kozareva, Zornitsa  and
      Zhang, Yue",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = {12},
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.637",
    doi = "10.18653/v1/2022.emnlp-main.637",
    pages = "9379--9393",
    abstract = "AI researchers have posited Dungeons and Dragons (D{\&}D) as a challenge problem to test systems on various language-related capabilities. In this paper, we frame D{\&}D specifically as a dialogue system challenge, where the tasks are to both generate the next conversational turn in the game and predict the state of the game given the dialogue history. We create a gameplay dataset consisting of nearly 900 games, with a total of 7,000 players, 800,000 dialogue turns, 500,000 dice rolls, and 58 million words. We automatically annotate the data with partial state information about the game play. We train a large language model (LM) to generate the next game turn, conditioning it on different information. The LM can respond as a particular character or as the player who runs the game{---}i.e., the Dungeon Master (DM). It is trained to produce dialogue that is either in-character (roleplaying in the fictional world) or out-of-character (discussing rules or strategy). We perform a human evaluation to determine what factors make the generated output plausible and interesting. We further perform an automatic evaluation to determine how well the model can predict the game state given the history and examine how well tracking the game state improves its ability to produce plausible conversational output.",
    environments={text, implicit_objectives},
    agents={more_than_three_agents, pretraining, finetuning},
    evaluation={human, rule_based},
    other={human_agent}
}

@inproceedings{zhou-etal-2023-cast,
    title = "{I} Cast Detect Thoughts: Learning to Converse and Guide with Intents and Theory-of-Mind in Dungeons and Dragons",
    author = "Zhou, Pei  and
      Zhu, Andrew  and
      Hu, Jennifer  and
      Pujara, Jay  and
      Ren, Xiang  and
      Callison-Burch, Chris  and
      Choi, Yejin  and
      Ammanabrolu, Prithviraj",
    editor = "Rogers, Anna  and
      Boyd-Graber, Jordan  and
      Okazaki, Naoaki",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = {7},
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-long.624",
    doi = "10.18653/v1/2023.acl-long.624",
    pages = "11136--11155",
    abstract = "We propose a novel task, G4C, to study teacher-student natural language interactions in a goal-driven and grounded environment. Dungeons and Dragons (D{\&}D), a role-playing game, provides an ideal setting to investigate such interactions. Here, the Dungeon Master (DM), i.e., the teacher, guides the actions of several players{---}students, each with their own personas and abilities{---}to achieve shared goals grounded in a fantasy world. Our approach is to decompose and model these interactions into (1) the DM{'}s intent to guide players toward a given goal; (2) the DM{'}s guidance utterance to the players expressing this intent; and (3) a theory-of-mind (ToM) model that anticipates the players{'} reaction to the guidance one turn into the future. We develop a novel reinforcement learning (RL) method for training a DM that generates guidance for players by rewarding utterances where the intent matches the ToM-anticipated player actions. Human and automated evaluations show that a DM trained to explicitly model intents and incorporate ToM of the players using RL generates better-quality guidance that is 3x more likely to fulfill the DM{'}s intent than a vanilla natural language generation (NLG) approach.",
    environments={text, implicit_objectives},
    agents={more_than_three_agents, reinforcement_learning},
    evaluation={human, rule_based},
    other={human_agent}
}

@inproceedings{zhu-etal-2023-fireball,
    title = "{FIREBALL}: A Dataset of Dungeons and Dragons Actual-Play with Structured Game State Information",
    author = "Zhu, Andrew  and
      Aggarwal, Karmanya  and
      Feng, Alexander  and
      Martin, Lara  and
      Callison-Burch, Chris",
    editor = "Rogers, Anna  and
      Boyd-Graber, Jordan  and
      Okazaki, Naoaki",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = {7},
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-long.229",
    doi = "10.18653/v1/2023.acl-long.229",
    pages = "4171--4193",
    abstract = "Dungeons {\&} Dragons (D{\&}D) is a tabletop roleplaying game with complex natural language interactions between players and hidden state information. Recent work has shown that large language models (LLMs) that have access to state information can generate higher quality game turns than LLMs that use dialog history alone. However, previous work used game state information that was heuristically created and was not a true gold standard game state. We present FIREBALL, a large dataset containing nearly 25,000 unique sessions from real D{\&}D gameplay on Discord with true game state info. We recorded game play sessions of players who used the Avrae bot, which was developed to aid people in playing D{\&}D online, capturing language, game commands and underlying game state information. We demonstrate that FIREBALL can improve natural language generation (NLG) by using Avrae state information, improving both automated metrics and human judgments of quality. Additionally, we show that LLMs can generate executable Avrae commands, particularly after finetuning.",
    environments={text, implicit_objectives},
    agents={more_than_three_agents, finetuning},
    evaluation={human, rule_based},
    other={human_agent}
}

@inproceedings{zhu2023calypso,
  title        = {{CALYPSO}: {LLMs} as Dungeon Masters' Assistants},
  author       = {Zhu, Andrew and Martin, Lara J. and Head, Andrew and Callison-Burch, Chris},
  booktitle    = {The 19th AAAI Conference on Artificial Intelligence and Interactive Digital Entertainment (AIIDE 2023)},
  month        = {8},
  year         = {2023},
  environments = {text, implicit_objectives},
  agents       = {more_than_three_agents, finetuning},
  evaluation   = {human},
  other        = {human_agent},
  url          = {https://arxiv.org/abs/2308.07540}
}

@article{eliza1966weizenbaum,
  author = {Weizenbaum, Joseph},
  title = {ELIZA—a computer program for the study of natural language communication between man and machine},
  year = {1966},
  issue_date = {Jan. 1966},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  volume = {9},
  number = {1},
  url = {https://doi.org/10.1145/365153.365168},
  doi = {10.1145/365153.365168},
  journal = {Commun. ACM},
  month = {1},
  pages = {36--45},
  environments={text, mixed_objectives},
  agents={agents_with_personas},
  evaluation={human},
  other={n/a}
}

@article{shuster2022blenderbot,
  title={{BlenderBot} 3: a deployed conversational agent that continually learns to responsibly engage},
  author={Shuster, Kurt and Xu, Jing and Komeili, Mojtaba and Ju, Da and Smith, Eric Michael and Roller, Stephen and Ung, Megan and Chen, Moya and Arora, Kushal and Lane, Joshua and others},
  journal={arXiv preprint arXiv:2208.03188},
  year={2022},
  month={8},
  url={https://arxiv.org/abs/2208.03188},
  environments={text, mixed_objectives},
  agents={finetuning},
  evaluation={qualitative, human},
  other={n/a}
}

@misc{introducing2022,
  title        = {Introducing ChatGPT},
  author       = {OpenAI},
  year         = {2022},
  month        = {11},
  url          = {https://openai.com/blog/chatgpt},
  journal      = {n/a},
  environments = {text, mixed_objectives},
  agents       = {prompting_and_in_context_learning, agents_with_memory},
  evaluation   = {qualitative, human},
  other        = {human_agent}
}

@article{chiang2024chatbot,
  title        = {Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
  author       = {Chiang, Wei-Lin and Zheng, Lianmin and Sheng, Ying and Angelopoulos, Anastasios Nikolas and Li, Tianle and Li, Dacheng and Zhang, Hao and Zhu, Banghua and Jordan, Michael and Gonzalez, Joseph E and others},
  journal      = {arXiv preprint arXiv:2403.04132},
  year         = {2024},
  month        = {3},
  url          = {https://arxiv.org/abs/2403.04132},
  environments = {text, mixed_objectives},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {qualitative, human},
  other        = {human_agent}
}

@article{zhang2022opt,
  title={{OPT}: Open pre-trained transformer language models},
  author={Zhang, Susan and Roller, Stephen and Goyal, Naman and Artetxe, Mikel and Chen, Moya and Chen, Shuohui and Dewan, Christopher and Diab, Mona and Li, Xian and Lin, Xi Victoria and others},
  journal={arXiv preprint arXiv:2205.01068},
  year={2022},
  month={5},
  url={https://arxiv.org/abs/2205.01068},
  environments={text, mixed_objectives},
  agents={finetuning, agents_with_personas},
  evaluation={qualitative, human},
  other={human_agent}
}

@article{zhou2020design,
  title = {The Design and Implementation of {XiaoIce}, an Empathetic Social Chatbot},
  author = {Zhou, Li  and
      Gao, Jianfeng  and
      Li, Di  and
      Shum, Heung-Yeung},
  journal = {Computational Linguistics},
  volume = {46},
  number = {1},
  year = {2020},
  month = {3},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  url = {https://aclanthology.org/2020.cl-1.2},
  doi = {10.1162/coli_a_00368},
  pages = {53--93},
  environments={text, mixed_objectives},
  agents={finetuning, agents_with_personas},
  evaluation={qualitative, human},
  other={human_agent}
}

@incollection{cai2006empathic,
  title={Empathic computing},
  author={Cai, Yang},
  booktitle={Ambient Intelligence in Everyday Life},
  pages={67--85},
  year={2006},
  month={1},
  publisher={Springer},
  url={https://link.springer.com/chapter/10.1007/11825890_3},
  environments={text, mixed_objectives},
  agents={agents_with_personas},
  evaluation={human},
  other={n/a}
}

@inproceedings{dinan2018wizard,
  title        = {Wizard of Wikipedia: Knowledge-Powered Conversational Agents},
  author       = {Emily Dinan and Stephen Roller and Kurt Shuster and Angela Fan and Michael Auli and Jason Weston},
  booktitle    = {International Conference on Learning Representations},
  year         = {2019},
  month        = {4},
  url          = {https://openreview.net/forum?id=r1l73iRqKm},
  environments = {text, mixed_objectives, implicit_objectives},
  agents       = {finetuning, agents_with_personas},
  evaluation   = {qualitative, human},
  other        = {human_agent}
}

@inproceedings{ghazvininejad2018knowledge,
  title        = {A knowledge-grounded neural conversation model},
  author       = {Ghazvininejad, Marjan and Brockett, Chris and Chang, Ming-Wei and Dolan, Bill and Gao, Jianfeng and Yih, Wen-tau and Galley, Michel},
  booktitle    = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume       = {32},
  number       = {1},
  year         = {2018},
  month        = {4},
  url          = {https://ojs.aaai.org/index.php/AAAI/article/view/11977},
  environments = {text, mixed_objectives, implicit_objectives},
  agents       = {finetuning},
  evaluation   = {qualitative, human},
  other        = {human_agent}
}

@inproceedings{li2016persona,
  title={A persona-based neural conversation model},
  author={Li, Jiwei and Galley, Michel and Brockett, Chris and Spithourakis, Georgios P and Gao, Jianfeng and Dolan, Bill},
  booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher={Association for Computational Linguistics},
  pages={994--1003},
  year={2016},
  month={8},
  url={https://aclanthology.org/P16-1094/},
  doi={10.18653/v1/P16-1094},
  environments={text, mixed_objectives},
  agents={finetuning, agents_with_personas},
  evaluation={qualitative, human},
  other={human_agent}
}

@incollection{wallace2009anatomy,
  title={The anatomy of ALICE},
  author={Wallace, Richard S},
  booktitle={Parsing the Turing Test: Philosophical and Methodological Issues in the Quest for the Thinking Computer},
  year={2009},
  month={11},
  publisher={Springer},
  journal={n/a},
  url={https://link.springer.com/chapter/10.1007/978-1-4020-6710-5_13},
  environments={text, mixed_objectives},
  agents={agents_with_personas},
  evaluation={human},
  other={n/a}
}

@inproceedings{fung2018towards,
  title={Towards empathetic human-robot interactions},
  author={Fung, Pascale and Bertero, Dario and Wan, Yan and Dey, Anik and Chan, Ricky Ho Yin and Bin Siddique, Farhad and Yang, Yang and Wu, Chien-Sheng and Lin, Ruixi},
  booktitle={Computational Linguistics and Intelligent Text Processing: 17th International Conference, CICLing 2016, Konya, Turkey, April 3--9, 2016, Revised Selected Papers, Part II},
  pages={173--193},
  year={2018},
  month={3},
  publisher={Springer},
  url={https://link.springer.com/chapter/10.1007/978-3-319-75487-1_14},
  environments={text, mixed_objectives},
  agents={agents_with_personas},
  evaluation={qualitative, human},
  other={human_agent}
}


#### Embodied Environments
@article{environments/embodied,
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
  author = {specical entry},
}

@inproceedings{10.1145/3406499.3418760,
author = {Tsoi, Nathan and Hussein, Mohamed and Espinoza, Jeacy and Ruiz, Xavier and V{\'a}zquez, Marynel},
title = {SEAN: Social Environment for Autonomous Navigation},
year = {2020},
month={9},
isbn = {9781450380546},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3406499.3418760},
doi = {10.1145/3406499.3418760},
abstract = {Social navigation research is performed on a variety of robotic platforms, scenarios, and environments. Making comparisons between navigation algorithms is challenging because of the effort involved in building these systems and the diversity of platforms used by the community; nonetheless, evaluation is critical to understanding progress in the field. In a step towards reproducible evaluation of social navigation algorithms, we propose the Social Environment for Autonomous Navigation (SEAN). SEAN is a high visual fidelity, open source, and extensible social navigation simulation platform which includes a toolkit for evaluation of navigation algorithms. We demonstrate SEAN and its evaluation toolkit in two environments with dynamic pedestrians and using two different robots.},
booktitle = {Proceedings of the 8th International Conference on Human-Agent Interaction},
pages = {281--283},
numpages = {3},
keywords = {social robot navigation, human-robot interaction},
location = {Virtual Event, USA},
series = {HAI '20},
environments={mixed_objectives, embodied},
agents={reinforcement_learning},
evaluation={rule_based},
other={human_agent, simulated_humans}
}

@inproceedings{puig2024habitat,
  title        = {Habitat 3.0: A Co-Habitat for Humans, Avatars, and Robots},
  author       = {Xavier Puig and Eric Undersander and Andrew Szot and Mikael Dallaire Cote and Tsung-Yen Yang and Ruslan Partsey and Ruta Desai and Alexander Clegg and Michal Hlavac and So Yeon Min and Vladim{\'\i}r Vondru{\v{s}} and Theophile Gervet and Vincent-Pierre Berges and John M Turner and Oleksandr Maksymets and Zsolt Kira and Mrinal Kalakrishnan and Jitendra Malik and Devendra Singh Chaplot and Unnat Jain and Dhruv Batra and Akshara Rai and Roozbeh Mottaghi},
  booktitle    = {The Twelfth International Conference on Learning Representations},
  year         = {2024},
  month        = {10},
  url          = {https://openreview.net/forum?id=4znwzG92CE},
  environments = {mixed_objectives, embodied},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {human_agent, simulated_humans}
}

@article{team2024scaling,
  title={Scaling Instructable Agents Across Many Simulated Worlds},
  author={{SIMA Team} and Abi Raad, Maria and Ahuja, Arun and Barros, Catarina and Besse, Frederic and Bolt, Andrew and Bolton, Adrian and Brownfield, Bethanie and Buttimore, Gavin and Cant, Max and others},
  year={2024},
  month={4},
  url={https://arxiv.org/abs/2404.10179v2},
  journal={arXiv preprint arXiv:2404.10179},
  environments={embodied},
  agents={prompting_and_in_context_learning, finetuning},
  evaluation={qualitative},
  other={human_agent}
}

@article{ma2023large,
  title={Large language models play {StarCraft II}: Benchmarks and a chain of summarization approach},
  author={Ma, Weiyu and Mi, Qirui and Yan, Xue and Wu, Yuqiao and Lin, Runji and Zhang, Haifeng and Wang, Jun},
  journal={arXiv preprint arXiv:2312.11865},
  year={2023},
  month={12},
  url={https://arxiv.org/abs/2312.11865},
  environments={embodied},
  agents={prompting_and_in_context_learning, finetuning},
  evaluation={qualitative},
  other={human_agent}
}

@misc{opengenerativeai2024evaluate,
  title={Evaluate LLMs in real time with Street Fighter III},
  author={{OpenGenerativeAI team}},
  year={2024},
  month={3},
  url={https://github.com/OpenGenerativeAI/llm-colosseum},
  journal={n/a},
  environments={embodied},
  agents={prompting_and_in_context_learning},
  evaluation={qualitative},
  other={human_agent}
}

@misc{zhao2023competeai,
  title         = {CompeteAI: Understanding the Competition Behaviors in Large Language Model-based Agents},
  author        = {Qinlin Zhao and Jindong Wang and Yixuan Zhang and Yiqiao Jin and Kaijie Zhu and Hao Chen and Xing Xie},
  environments  = {competition, text},
  agents        = {prompting_and_in_context_learning, two_agents},
  evaluation    = {rule_based},
  url           = {https://arxiv.org/abs/2310.17512},
  other         = {n/a},
  year          = {2023},
  eprint        = {2310.17512},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI}
}


#### Virtual Environments
@article{environments/virtual,
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
  author = {specical entry},
}
@inproceedings{li2018appinite,
  title={{APPINITE}: A multi-modal interface for specifying data descriptions in programming by demonstration using natural language instructions},
  author={Li, Toby Jia-Jun and Labutov, Igor and Li, Xiaohan Nancy and Zhang, Xiaoyi and Shi, Wenze and Ding, Wanling and Mitchell, Tom M and Myers, Brad A},
  booktitle={2018 IEEE Symposium on Visual Languages and Human-Centric Computing (VL/HCC)},
  pages={105--114},
  year={2018},
  month={3},
  organization={IEEE},
  url={https://ieeexplore.ieee.org/document/8506506},
  environments={virtual},
  agents={prompting_and_in_context_learning},
  evaluation={human, qualitative},
  other={human_agent}
}

@inproceedings{li2019pumice,
  title={{PUMICE}: A multi-modal agent that learns concepts and conditionals from natural language and demonstrations},
  author={Li, Toby Jia-Jun and Radensky, Marissa and Jia, Justin and Singarajah, Kirielle and Mitchell, Tom M and Myers, Brad A},
  booktitle={Proceedings of the 32nd Annual ACM Symposium on User Interface Software and Technology},
  pages={577--589},
  year={2019},
  month={3},
  url={https://dl.acm.org/doi/10.1145/3332165.3347899},
  environments={virtual},
  agents={prompting_and_in_context_learning},
  evaluation={human, qualitative},
  other={human_agent}
}

@inproceedings{li2020interactive,
  title        = {Interactive task learning from GUI-grounded natural language instructions and demonstrations},
  author       = {Li, Toby Jia-Jun and Mitchell, Tom and Myers, Brad},
  booktitle    = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations},
  pages        = {215--223},
  year         = {2020},
  month        = {9},
  url          = {https://arxiv.org/abs/1909.00031},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {human, qualitative},
  other        = {human_agent}
}

@article{yang2023appagent,
  title={{AppAgent}: Multimodal agents as smartphone users},
  author={Yang, Zhao and Liu, Jiaxuan and Han, Yucheng and Chen, Xin and Huang, Zebiao and Fu, Bin and Yu, Gang},
  journal={arXiv preprint arXiv:2312.13771},
  year={2023},
  month={12},
  url={https://arxiv.org/abs/2312.13771},
  environments={virtual},
  agents={prompting_and_in_context_learning},
  evaluation={rule_based},
  other={n/a}
}

@article{zhang2024ufo,
  title        = {UFO: A UI-Focused Agent for Windows OS Interaction},
  author       = {Zhang, Chaoyun and Li, Liqun and He, Shilin and Zhang, Xu and Qiao, Bo and Qin, Si and Ma, Minghua and Kang, Yu and Lin, Qingwei and Rajmohan, Saravan and others},
  journal      = {arXiv preprint arXiv:2402.07939},
  year         = {2024},
  month        = {2},
  url          = {https://arxiv.org/abs/2402.07939},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{wang2024mobile,
  title        = {Mobile-Agent: Autonomous multi-modal mobile device agent with visual perception},
  author       = {Wang, Junyang and Xu, Haiyang and Ye, Jiabo and Yan, Ming and Shen, Weizhou and Zhang, Ji and Huang, Fei and Sang, Jitao},
  journal      = {arXiv preprint arXiv:2401.16158},
  year         = {2024},
  month        = {1},
  url          = {https://arxiv.org/abs/2401.16158},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{wu2024copilot,
  title={{OS-Copilot}: Towards generalist computer agents with self-improvement},
  author={Wu, Zhiyong and Han, Chengcheng and Ding, Zichen and Weng, Zhenmin and Liu, Zhoumianze and Yao, Shunyu and Yu, Tao and Kong, Lingpeng},
  journal={arXiv preprint arXiv:2402.07456},
  year={2024},
  month={2},
  url={https://arxiv.org/abs/2402.07456},
  environments={virtual},
  agents={prompting_and_in_context_learning},
  evaluation={rule_based},
  other={n/a}
}

@article{zhou2023webarena,
  author       = {Zhou, Shuyan and Xu, Frank F and Zhu, Hao and Zhou, Xuhui and Lo, Robert and Sridhar, Abishek and Cheng, Xianyi and Bisk, Yonatan and Fried, Daniel and Alon, Uri and others},
  title        = {{WebArena}: A realistic web environment for building autonomous agents},
  journal      = {arXiv preprint arXiv:2307.13854},
  year         = {2023},
  month        = {7},
  url          = {https://arxiv.org/abs/2307.13854},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{koh2024visualwebarena,
  author       = {Koh, Jing Yu and Lo, Robert and Jang, Lawrence and Duvvur, Vikram and Lim, Ming Chong and Huang, Po-Yu and Neubig, Graham and Zhou, Shuyan and Salakhutdinov, Ruslan and Fried, Daniel},
  title        = {{VisualWebArena}: Evaluating multimodal agents on realistic visual web tasks},
  journal      = {arXiv preprint arXiv:2401.13649},
  year         = {2024},
  month        = {1},
  url          = {https://arxiv.org/abs/2401.13649},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{yao2022webshop,
  author       = {Yao, Shunyu and Chen, Howard and Yang, John and Narasimhan, Karthik},
  title        = {{WebShop}: Towards scalable real-world web interaction with grounded language agents},
  journal      = {Advances in Neural Information Processing Systems},
  volume       = {35},
  pages        = {20744--20757},
  year         = {2022},
  month        = {12},
  url          = {https://proceedings.neurips.cc/paper_files/paper/2022/file/82ad13ec01f9fe44c01cb91814fd7b8c-Paper-Conference.pdf},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning, finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{humphreys2022data,
  author       = {Humphreys, Peter C and Raposo, David and Pohlen, Tobias and Thornton, Gregory and Chhaparia, Rachita and Muldal, Alistair and Abramson, Josh and Georgiev, Petko and Santoro, Adam and Lillicrap, Timothy},
  title        = {A data-driven approach for learning to control computers},
  booktitle    = {International Conference on Machine Learning},
  organization = {PMLR},
  pages        = {9466--9482},
  year         = {2022},
  month        = {7},
  url          = {https://arxiv.org/abs/2202.08137},
  environments = {virtual},
  agents       = {finetuning, reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{shi2017world,
  author       = {Shi, Tianlin and Karpathy, Andrej and Fan, Linxi and Hernandez, Jonathan and Liang, Percy},
  title        = {{World of Bits}: An open-domain platform for web-based agents},
  booktitle    = {International Conference on Machine Learning},
  organization = {PMLR},
  pages        = {3135--3144},
  year         = {2017},
  month        = {8},
  url          = {https://proceedings.mlr.press/v70/shi17a/shi17a.pdf},
  environments = {virtual},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{liu2018reinforcement,
  author       = {Liu, Evan Zheran and Guu, Kelvin and Pasupat, Panupong and Shi, Tianlin and Liang, Percy},
  title        = {Reinforcement learning on web interfaces using workflow-guided exploration},
  journal      = {arXiv preprint arXiv:1802.08802},
  year         = {2018},
  month        = {2},
  url          = {https://arxiv.org/abs/1802.08802},
  environments = {virtual},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{branavan2009reinforcement,
  author       = {Branavan, Satchuthananthavale RK and Chen, Harr and Zettlemoyer, Luke and Barzilay, Regina},
  title        = {Reinforcement learning for mapping instructions to actions},
  booktitle    = {Proceedings of the Joint Conference of the 47th Annual Meeting of the ACL and the 4th International Joint Conference on Natural Language Processing of the AFNLP},
  pages        = {82--90},
  year         = {2009},
  month        = {8},
  url          = {https://aclanthology.org/P09-1010/},
  environments = {virtual},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{toyama2021androidenv,
  author       = {Toyama, Daniel and Hamel, Philippe and Gergely, Anita and Comanici, Gheorghe and Glaese, Amelia and Ahmed, Zafarali and Jackson, Tyler and Mourad, Shibl and Precup, Doina},
  title        = {{AndroidEnv}: A reinforcement learning platform for {Android}},
  journal      = {arXiv preprint arXiv:2105.13231},
  year         = {2021},
  month        = {5},
  url          = {https://arxiv.org/abs/2105.13231},
  environments = {virtual},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{li2020mapping,
  author       = {Li, Yang and He, Jiacong and Zhou, Xin and Zhang, Yuan and Baldridge, Jason},
  title        = {Mapping natural language instructions to mobile {UI} action sequences},
  journal      = {arXiv preprint arXiv:2005.03776},
  year         = {2020},
  month        = {5},
  url          = {https://arxiv.org/abs/2005.03776},
  environments = {virtual},
  agents       = {finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{burns2022dataset,
  author       = {Burns, Andrea and Arsan, Deniz and Agrawal, Sanjna and Kumar, Ranjitha and Saenko, Kate and Plummer, Bryan A},
  title        = {A dataset for interactive vision-language navigation with unknown command feasibility},
  booktitle    = {European Conference on Computer Vision},
  organization = {Springer},
  pages        = {312--328},
  year         = {2022},
  month        = {2},
  url          = {https://arxiv.org/abs/2202.02312},
  environments = {virtual},
  agents       = {finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{deng2024mind2web,
  author       = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
  title        = {{Mind2Web}: Towards a generalist agent for the web},
  journal      = {Advances in Neural Information Processing Systems},
  volume       = {36},
  year         = {2024},
  month        = {1},
  url          = {https://arxiv.org/abs/2306.06070},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{rawles2023android,
  author       = {Rawles, Christopher and Li, Alice and Rodriguez, Daniel and Riva, Oriana and Lillicrap, Timothy},
  title        = {{Android} in the wild: A large-scale dataset for {Android} device control},
  journal      = {arXiv preprint arXiv:2307.10088},
  year         = {2023},
  month        = {7},
  url          = {https://arxiv.org/abs/2307.10088},
  environments = {virtual},
  agents       = {finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{allen2007plow,
  author       = {Allen, James and Chambers, Nathanael and Ferguson, George and Galescu, Lucian and Jung, Hyuckchul and Swift, Mary and Taysom, William},
  title        = {{PLOW}: A collaborative task learning agent},
  booktitle    = {AAAI},
  volume       = {7},
  pages        = {1514--1519},
  year         = {2007},
  month        = {7},
  url          = {https://cdn.aaai.org/AAAI/2007/AAAI07-240.pdf},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {human},
  other        = {human_agent}
}

@article{xu2021grounding,
  author       = {Xu, Nancy and Masling, Sam and Du, Michael and Campagna, Giovanni and Heck, Larry and Landay, James and Lam, Monica S},
  title        = {Grounding open-domain instructions to automate web support tasks},
  journal      = {arXiv preprint arXiv:2103.16057},
  year         = {2021},
  month        = {3},
  url          = {https://arxiv.org/abs/2103.16057},
  environments = {virtual},
  agents       = {finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}


#### Robotics
@article{environments/robotics,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@inproceedings{pmlr-v205-xiong23a,
  author       = {Xiong, Haoyu and Fu, Haoyuan and Zhang, Jieyi and Bao, Chen and Zhang, Qiang and Huang, Yongxi and Xu, Wenqiang and Garg, Animesh and Lu, Cewu},
  title        = {RoboTube: Learning Household Manipulation from Human Videos with Simulated Twin Environments},
  booktitle    = {Proceedings of The 6th Conference on Robot Learning},
  editor       = {Liu, Karen and Kulic, Dana and Ichnowski, Jeff},
  series       = {Proceedings of Machine Learning Research},
  volume       = {205},
  pages        = {1--10},
  publisher    = {PMLR},
  year         = {2023},
  month        = {12},
  pdf          = {https://proceedings.mlr.press/v205/xiong23a/xiong23a.pdf},
  url          = {https://proceedings.mlr.press/v205/xiong23a.html},
  environments = {implicit_objectives, robotics},
  agents       = {reinforcement_learning, agents_with_memory},
  evaluation   = {human, rule_based},
  other        = {simulated_humans}
}


@inproceedings{saycan2022arxiv,
  author       = {Michael Ahn and Anthony Brohan and Noah Brown and Yevgen Chebotar and Omar Cortes and Byron David and Chelsea Finn and Chuyuan Fu and Keerthana Gopalakrishnan and Karol Hausman and Alex Herzog and Daniel Ho and Jasmine Hsu and Julian Ibarz and Brian Ichter and Alex Irpan and Eric Jang and Rosario Jauregui Ruano and Kyle Jeffrey and Sally Jesmonth and Nikhil Joshi and Ryan Julian and Dmitry Kalashnikov and Yuheng Kuang and Kuang-Huei Lee and Sergey Levine and Yao Lu and Linda Luu and Carolina Parada and Peter Pastor and Jornell Quiambao and Kanishka Rao and Jarek Rettinghouse and Diego Reyes and Pierre Sermanet and Nicolas Sievers and Clayton Tan and Alexander Toshev and Vincent Vanhoucke and Fei Xia and Ted Xiao and Peng Xu and Sichun Xu and Mengyuan Yan and Andy Zeng},
  title        = {Do As I Can and Not As I Say: Grounding Language in Robotic Affordances},
  booktitle    = {arXiv preprint arXiv:2204.01691},
  year         = {2022},
  month        = {8},
  url          = {https://say-can.github.io/},
  environments = {mixed_objectives, implicit_objectives, robotics},
  agents       = {finetuning, reinforcement_learning, agents_with_memory},
  evaluation   = {human, rule_based, model_based},
  other        = {simulated_humans}
}

@inproceedings{huang2022inner,
  author       = {Wenlong Huang and Fei Xia and Ted Xiao and Harris Chan and Jacky Liang and Pete Florence and Andy Zeng and Jonathan Tompson and Igor Mordatch and Yevgen Chebotar and Pierre Sermanet and Noah Brown and Tomas Jackson and Linda Luu and Sergey Levine and Karol Hausman and Brian Ichter},
  title        = {Inner Monologue: Embodied Reasoning through Planning with Language Models},
  booktitle    = {arXiv preprint arXiv:2207.05608},
  year         = {2022},
  month        = {6},
  url          = {https://arxiv.org/abs/2207.05608},
  environments = {mixed_objectives, implicit_objectives, robotics},
  agents       = {finetuning, reinforcement_learning, agents_with_memory},
  evaluation   = {human, rule_based, model_based},
  other        = {simulated_humans}
}

@inproceedings{Wang2023One,
  author       = {Wang, Yufei and Sun, Zhanyi and Erickson, Zackory and Held, David},
  title        = {One Policy to Dress Them All: Learning to Dress People with Diverse Poses and Garments},
  booktitle    = {Robotics: Science and Systems (RSS)},
  year         = {2023},
  month        = {6},
  url          = {https://arxiv.org/abs/2306.12372},
  environments = {robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@misc{wang2023cogail,
  author        = {Chen Wang and Claudia Pérez-D'Arpino and Danfei Xu and Li Fei-Fei and C. Karen Liu and Silvio Savarese},
  title         = {Co-GAIL: Learning Diverse Strategies for Human-Robot Collaboration},
  year          = {2023},
  month         = {9},
  eprint        = {2108.06038},
  archivePrefix = {arXiv},
  primaryClass  = {cs.RO},
  url           = {https://arxiv.org/abs/2108.06038},
  environments  = {collaboration, mixed_objectives, robotics},
  agents        = {two_agents, reinforcement_learning},
  evaluation    = {human},
  other         = {human_agent, simulated_humans}
}

@misc{shi2024yell,
  author        = {Lucy Xiaoyang Shi and Zheyuan Hu and Tony Z. Zhao and Archit Sharma and Karl Pertsch and Jianlan Luo and Sergey Levine and Chelsea Finn},
  title         = {Yell At Your Robot: Improving On-the-Fly from Language Corrections},
  year          = {2024},
  month         = {3},
  eprint        = {2403.12910},
  archivePrefix = {arXiv},
  primaryClass  = {cs.RO},
  url           = {https://arxiv.org/abs/2403.12910},
  environments  = {collaboration, mixed_objectives, robotics},
  agents        = {two_agents, finetuning, reinforcement_learning, agents_with_memory},
  evaluation    = {human},
  other         = {human_agent}
}

@article{sheridan2016human,
  author       = {Sheridan, Thomas B},
  title        = {Human--robot interaction: status and challenges},
  journal      = {Human Factors},
  volume       = {58},
  number       = {4},
  pages        = {525--532},
  year         = {2016},
  month        = {4},
  publisher    = {SAGE Publications},
  address      = {Los Angeles, CA},
  url          = {https://journals.sagepub.com/doi/10.1177/0018720816644364},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {two_agents, finetuning, reinforcement_learning},
  evaluation   = {human},
  other        = {human_agent}
}


@article{onnasch2021taxonomy,
  author       = {Onnasch, Linda and Roesler, Eileen},
  title        = {A taxonomy to structure and analyze human--robot interaction},
  journal      = {International Journal of Social Robotics},
  volume       = {13},
  number       = {4},
  pages        = {833--849},
  publisher    = {Springer},
  year         = {2021},
  month        = {6},
  url          = {https://link.springer.com/article/10.1007/s12369-020-00666-5},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {two_agents},
  evaluation   = {human},
  other        = {human_agent}
}

@article{robinson2023robotic,
  author       = {Robinson, Nicole and Tidd, Brendan and Campbell, Dylan and Kuli{\'c}, Dana and Corke, Peter},
  title        = {Robotic vision for human-robot interaction and collaboration: A survey and systematic review},
  journal      = {ACM Transactions on Human-Robot Interaction},
  volume       = {12},
  number       = {1},
  pages        = {1--66},
  year         = {2023},
  month        = {7},
  publisher    = {ACM},
  address      = {New York, NY},
  url          = {https://arxiv.org/abs/2307.15363},
  environments = {collaboration, mixed_objectives, implicit_objectives, robotics},
  agents       = {two_agents, agent_teams, agents_with_personas},
  evaluation   = {human, rule_based},
  other        = {human_agent, simulated_humans}
}

@article{dahiya2023survey,
  author       = {Dahiya, Abhinav and Aroyo, Alexander M and Dautenhahn, Kerstin and Smith, Stephen L},
  title        = {A survey of multi-agent Human--Robot Interaction systems},
  journal      = {Robotics and Autonomous Systems},
  volume       = {161},
  pages        = {104335},
  publisher    = {Elsevier},
  year         = {2022},
  month        = {10},
  url          = {https://arxiv.org/abs/2212.05286},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {two_agents, more_than_three_agents, agent_teams},
  evaluation   = {human},
  other        = {human_agent}
}

@article{10.1145/3570169,
  author       = {Urakami, Jacqueline and Seaborn, Katie},
  title        = {Nonverbal Cues in Human Robot Interaction: A Communication Studies Perspective},
  journal      = {ACM Transactions on Human-Robot Interaction},
  volume       = {12},
  number       = {2},
  articleno    = {22},
  numpages     = {21},
  year         = {2023},
  month        = {3},
  issue_date   = {June 2023},
  publisher    = {Association for Computing Machinery},
  address      = {New York, NY, USA},
  doi          = {10.1145/3570169},
  url          = {https://doi.org/10.1145/3570169},
  keywords     = {nonverbal codes, communication studies, human robot interaction, nonverbal communication, Robotics},
  environments = {collaboration, mixed_objectives, implicit_objectives, robotics},
  agents       = {two_agents},
  evaluation   = {human},
  other        = {human_agent}
}

@article{10.1145/3571718,
  author       = {Winkle, Katie and Lagerstedt, Erik and Torre, Ilaria and Offenwanger, Anna},
  title        = {15 Years of (Who)man Robot Interaction: Reviewing the H in Human-Robot Interaction},
  journal      = {ACM Transactions on Human-Robot Interaction},
  volume       = {12},
  number       = {3},
  articleno    = {28},
  numpages     = {28},
  year         = {2023},
  month        = {4},
  issue_date   = {September 2023},
  publisher    = {Association for Computing Machinery},
  address      = {New York, NY, USA},
  doi          = {10.1145/3571718},
  url          = {https://doi.org/10.1145/3571718},
  abstract     = {Recent work identified a concerning trend of disproportional gender representation in research participants in Human–Computer Interaction (HCI). Motivated by the fact that Human–Robot Interaction (HRI) shares many participant practices with HCI, we explored whether this trend is mirrored in our field. By producing a dataset covering participant gender representation in all 684 full papers published at the HRI conference from 2006–2021, we identify current trends in HRI research participation. We find an over-representation of men in research participants to date, as well as inconsistent and/or incomplete gender reporting, which typically engages in a binary treatment of gender at odds with published best practice guidelines. We further examine if and how participant gender has been considered in user studies to date, in-line with current discourse surrounding the importance and/or potential risks of gender based analyses. Finally, we complement this with a survey of HRI researchers to examine correlations between who is doing with the who is taking part, to further reflect on factors which seemingly influence gender bias in research participation across different sub-fields of HRI. Through our analysis, we identify areas for improvement, but also reason for optimism, and derive some practical suggestions for HRI researchers going forward.},
  keywords     = {Gender, systematic review, user study methodologies, participant recruitment, inclusivity},
  environments = {robotics},
  agents       = {two_agents},
  evaluation   = {human},
  other        = {human_agent}
}

### Modeling

#### In-context Learning
@article{modeling/in-context-learning,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@article{fu2024autoguide,
  author       = {Fu, Yao and Kim, Dong-Ki and Kim, Jaekyeom and Sohn, Sungryull and Logeswaran, Lajanugen and Bae, Kyunghoon and Lee, Honglak},
  title        = {AutoGuide: Automated Generation and Selection of State-Aware Guidelines for Large Language Model Agents},
  journal      = {arXiv preprint arXiv:2403.08978},
  year         = {2024},
  month        = {3},
  url          = {https://arxiv.org/abs/2403.08978},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@misc{wang2023voyager,
  author        = {Guanzhi Wang and Yuqi Xie and Yunfan Jiang and Ajay Mandlekar and Chaowei Xiao and Yuke Zhu and Linxi Fan and Anima Anandkumar},
  title         = {Voyager: An Open-Ended Embodied Agent with Large Language Models},
  year          = {2023},
  month         = {5},
  eprint        = {2305.16291},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2305.16291},
  environments  = {mixed_objectives, implicit_objectives, embodied},
  agents        = {prompting_and_in_context_learning, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@misc{kim2023language,
  author        = {Geunwoo Kim and Pierre Baldi and Stephen McAleer},
  title         = {Language Models can Solve Computer Tasks},
  year          = {2023},
  month         = {11},
  eprint        = {2303.17491},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2303.17491},
  environments  = {mixed_objectives, implicit_objectives, virtual},
  agents        = {prompting_and_in_context_learning},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@misc{ma2024laser,
  author        = {Kaixin Ma and Hongming Zhang and Hongwei Wang and Xiaoman Pan and Wenhao Yu and Dong Yu},
  title         = {LASER: LLM Agent with State-Space Exploration for Web Navigation},
  year          = {2024},
  month         = {2},
  eprint        = {2309.08172},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2309.08172},
  environments  = {mixed_objectives, implicit_objectives, virtual},
  agents        = {prompting_and_in_context_learning, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@misc{sridhar2023hierarchical,
  author        = {Abishek Sridhar and Robert Lo and Frank F. Xu and Hao Zhu and Shuyan Zhou},
  title         = {Hierarchical Prompting Assists Large Language Model on Web Navigation},
  year          = {2023},
  month         = {10},
  eprint        = {2305.14257},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2305.14257},
  environments  = {mixed_objectives, implicit_objectives, virtual},
  agents        = {prompting_and_in_context_learning, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@inproceedings{zheng2024synapse,
  author       = {Longtao Zheng and Rundong Wang and Xinrun Wang and Bo An},
  title        = {Synapse: Trajectory-as-Exemplar Prompting with Memory for Computer Control},
  booktitle    = {The Twelfth International Conference on Learning Representations},
  year         = {2024},
  month        = {1},
  url          = {https://openreview.net/forum?id=Pc8AU1aF5e},
  environments = {mixed_objectives, implicit_objectives, virtual},
  agents       = {prompting_and_in_context_learning, agents_with_memory},
  evaluation   = {rule_based},
  other        = {simulated_humans}
}

@inproceedings{sun2023adaplanner,
  author       = {Haotian Sun and Yuchen Zhuang and Lingkai Kong and Bo Dai and Chao Zhang},
  title        = {AdaPlanner: Adaptive Planning from Feedback with Language Models},
  booktitle    = {Thirty-seventh Conference on Neural Information Processing Systems},
  year         = {2023},
  month        = {11},
  url          = {https://openreview.net/forum?id=rnKgbKmelt},
  environments = {mixed_objectives, implicit_objectives, text},
  agents       = {prompting_and_in_context_learning, agents_with_memory},
  evaluation   = {rule_based},
  other        = {simulated_humans}
}

@misc{wu2023spring,
  author        = {Yue Wu and Shrimai Prabhumoye and So Yeon Min and Yonatan Bisk and Ruslan Salakhutdinov and Amos Azaria and Tom Mitchell and Yuanzhi Li},
  title         = {SPRING: Studying the Paper and Reasoning to Play Games},
  year          = {2023},
  month         = {5},
  eprint        = {2305.15486},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2305.15486},
  environments  = {mixed_objectives, implicit_objectives, text},
  agents        = {prompting_and_in_context_learning, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@misc{nair2023dera,
  author        = {Varun Nair and Elliot Schumacher and Geoffrey Tso and Anitha Kannan},
  title         = {DERA: Enhancing Large Language Model Completions with Dialog-Enabled Resolving Agents},
  year          = {2023},
  month         = {3},
  eprint        = {2303.17071},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2303.17071},
  environments  = {collaboration, mixed_objectives, implicit_objectives, text},
  agents        = {prompting_and_in_context_learning, agent_teams, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}
#### Finetuning
@article{modeling/finetuning,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@article{song2024trial,
  author       = {Song, Yifan and Yin, Da and Yue, Xiang and Huang, Jie and Li, Sujian and Lin, Bill Yuchen},
  title        = {Trial and Error: Exploration-Based Trajectory Optimization for LLM Agents},
  journal      = {arXiv preprint arXiv:2403.02502},
  year         = {2024},
  month        = {3},
  url          = {https://arxiv.org/abs/2403.02502},
  environments = {virtual},
  agents       = {finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{lai2024autowebglm,
  author       = {Lai, Hanyu and Liu, Xiao and Iong, Iat Long and Yao, Shuntian and Chen, Yuxuan and Shen, Pengbo and Yu, Hao and Zhang, Hanchen and Zhang, Xiaohan and Dong, Yuxiao and others},
  title        = {AutoWebGLM: Bootstrap And Reinforce A Large Language Model-based Web Navigating Agent},
  journal      = {arXiv preprint arXiv:2404.03648},
  year         = {2024},
  month        = {4},
  url          = {https://arxiv.org/abs/2404.03648},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning, pretraining, finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@article{chen2024agent,
  author       = {Chen, Zehui and Liu, Kuikun and Wang, Qiuchen and Zhang, Wenwei and Liu, Jiangning and Lin, Dahua and Chen, Kai and Zhao, Feng},
  title        = {Agent-FLAN: Designing Data and Methods of Effective Agent Tuning for Large Language Models},
  journal      = {arXiv preprint arXiv:2403.12881},
  year         = {2024},
  month        = {3},
  url          = {https://arxiv.org/abs/2403.12881},
  environments = {virtual},
  agents       = {finetuning},
  evaluation   = {rule_based},
  other        = {n/a}
}

@misc{gur2023understanding,
  author        = {Izzeddin Gur and Ofir Nachum and Yingjie Miao and Mustafa Safdari and Austin Huang and Aakanksha Chowdhery and Sharan Narang and Noah Fiedel and Aleksandra Faust},
  title         = {Understanding HTML with Large Language Models},
  year          = {2023},
  month         = {5},
  eprint        = {2210.03945},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2210.03945},
  environments  = {mixed_objectives, implicit_objectives, virtual},
  agents        = {prompting_and_in_context_learning, finetuning},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@inproceedings{furuta2023instructionfinetuned,
  author       = {Hiroki Furuta and Ofir Nachum and Kuang-Huei Lee and Yutaka Matsuo and Shixiang Shane Gu and Izzeddin Gur},
  title        = {Instruction-Finetuned Foundation Models for Multimodal Web Navigation},
  booktitle    = {ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models},
  year         = {2023},
  month        = {5},
  url          = {https://openreview.net/forum?id=oLc9sGOBbc},
  environments = {mixed_objectives, implicit_objectives, virtual},
  agents       = {prompting_and_in_context_learning, finetuning, agents_with_memory},
  evaluation   = {rule_based},
  other        = {simulated_humans}
}

@misc{yao2023react,
  author        = {Shunyu Yao and Jeffrey Zhao and Dian Yu and Nan Du and Izhak Shafran and Karthik Narasimhan and Yuan Cao},
  title         = {ReAct: Synergizing Reasoning and Acting in Language Models},
  year          = {2023},
  month         = {10},
  eprint        = {2210.03629},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2210.03629},
  environments  = {mixed_objectives, implicit_objectives, virtual},
  agents        = {prompting_and_in_context_learning, finetuning, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@inproceedings{gur2024a,
  author       = {Izzeddin Gur and Hiroki Furuta and Austin V Huang and Mustafa Safdari and Yutaka Matsuo and Douglas Eck and Aleksandra Faust},
  title        = {A Real-World WebAgent with Planning, Long Context Understanding, and Program Synthesis},
  booktitle    = {The Twelfth International Conference on Learning Representations},
  year         = {2024},
  month        = {1},
  url          = {https://openreview.net/forum?id=9JQtrumvg8},
  environments = {mixed_objectives, implicit_objectives, virtual},
  agents       = {prompting_and_in_context_learning, finetuning, agents_with_memory},
  evaluation   = {rule_based},
  other        = {simulated_humans}
}

@inproceedings{shaw2023from,
  author       = {Peter Shaw and Mandar Joshi and James Cohan and Jonathan Berant and Panupong Pasupat and Hexiang Hu and Urvashi Khandelwal and Kenton Lee and Kristina Toutanova},
  title        = {From Pixels to {UI} Actions: Learning to Follow Instructions via Graphical User Interfaces},
  booktitle    = {Thirty-seventh Conference on Neural Information Processing Systems},
  year         = {2023},
  month        = {11},
  url          = {https://openreview.net/forum?id=3PjCt4kmRx},
  environments = {mixed_objectives, implicit_objectives, virtual},
  agents       = {prompting_and_in_context_learning, finetuning, agents_with_memory},
  evaluation   = {rule_based},
  other        = {simulated_humans}
}

@misc{zheng2024gpt4vision,
  author        = {Boyuan Zheng and Boyu Gou and Jihyung Kil and Huan Sun and Yu Su},
  title         = {GPT-4V(ision) is a Generalist Web Agent, if Grounded},
  year          = {2024},
  month         = {1},
  eprint        = {2401.01614},
  archivePrefix = {arXiv},
  primaryClass  = {cs.IR},
  url           = {https://arxiv.org/abs/2401.01614},
  environments  = {mixed_objectives, implicit_objectives, virtual},
  agents        = {prompting_and_in_context_learning, finetuning, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@misc{kil2024dualview,
  author        = {Jihyung Kil and Chan Hee Song and Boyuan Zheng and Xiang Deng and Yu Su and Wei-Lun Chao},
  title         = {Dual-View Visual Contextualization for Web Navigation},
  year          = {2024},
  month         = {2},
  eprint        = {2402.04476},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2402.04476},
  environments  = {mixed_objectives, implicit_objectives, virtual},
  agents        = {prompting_and_in_context_learning, finetuning, agents_with_memory},
  evaluation    = {rule_based},
  other         = {simulated_humans}
}

@misc{yildirim2024learning,
  author        = {Yigit Yildirim and Emre Ugur},
  title         = {Learning Social Navigation from Demonstrations with Deep Neural Networks},
  year          = {2024},
  month         = {4},
  eprint        = {2404.11246},
  archivePrefix = {arXiv},
  primaryClass  = {cs.RO},
  url           = {https://arxiv.org/abs/2404.11246},
  environments  = {collaboration, robotics},
  agents        = {finetuning, two_agents, more_than_three_agents},
  evaluation    = {rule_based, human},
  other         = {human_agent}
}

@article{karnan2022scand,
  author       = {Karnan, Haresh and Nair, Anirudh and Xiao, Xuesu and Warnell, Garrett and Pirk, S{\"o}ren and Toshev, Alexander and Hart, Justin and Biswas, Joydeep and Stone, Peter},
  title        = {Socially CompliAnt Navigation Dataset (SCAND): A Large-Scale Dataset Of Demonstrations For Social Navigation},
  journal      = {IEEE Robotics and Automation Letters},
  year         = {2022},
  month        = {10},
  publisher    = {IEEE},
  url          = {https://www.cs.utexas.edu/~xiao/SCAND/SCAND.html},
  environments = {collaboration, robotics},
  agents       = {finetuning, two_agents},
  evaluation   = {rule_based},
  other        = {human_agent}
}

@inproceedings{9561973,
  author       = {Cui, Yuxiang and Zhang, Haodong and Wang, Yue and Xiong, Rong},
  title        = {Learning World Transition Model for Socially Aware Robot Navigation},
  booktitle    = {2021 IEEE International Conference on Robotics and Automation (ICRA)},
  pages        = {9262--9268},
  year         = {2021},
  month        = {5},
  url          = {https://ieeexplore.ieee.org/document/9561973},
  environments = {collaboration, embodied},
  agents       = {finetuning, two_agents},
  evaluation   = {rule_based},
  other        = {human_agent}
}

@misc{zhang2022danli,
  author        = {Yichi Zhang and Jianing Yang and Jiayi Pan and Shane Storks and Nikhil Devraj and Ziqiao Ma and Keunwoo Peter Yu and Yuwei Bao and Joyce Chai},
  title         = {DANLI: Deliberative Agent for Following Natural Language Instructions},
  year          = {2022},
  month         = {10},
  eprint        = {2210.12485},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2210.12485},
  environments  = {collaboration, embodied},
  agents        = {finetuning, two_agents},
  evaluation    = {rule_based},
  other         = {human_agent}
}

@misc{kim2024contextaware,
  author        = {Byeonghwi Kim and Jinyeon Kim and Yuyeong Kim and Cheolhong Min and Jonghyun Choi},
  title         = {Context-Aware Planning and Environment-Aware Memory for Instruction Following Embodied Agents},
  year          = {2024},
  month         = {8},
  eprint        = {2308.07241},
  archivePrefix = {arXiv},
  primaryClass  = {cs.RO},
  url           = {https://arxiv.org/abs/2308.07241},
  environments  = {collaboration, embodied},
  agents        = {finetuning, two_agents},
  evaluation    = {rule_based},
  other         = {human_agent}
}

@misc{wan2023handmethat,
  author        = {Yanming Wan and Jiayuan Mao and Joshua B. Tenenbaum},
  title         = {HandMeThat: Human-Robot Communication in Physical and Social Environments},
  year          = {2023},
  month         = {10},
  eprint        = {2310.03779},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2310.03779},
  environments  = {collaboration, embodied},
  agents        = {finetuning, two_agents},
  evaluation    = {rule_based},
  other         = {human_agent}
}

@article{gao2022dialfred,
  title        = {{DialFRED}: Dialogue-Enabled Agents for Embodied Instruction Following},
  author       = {Gao, Xiaofeng and Gao, Qiaozi and Gong, Ran and Lin, Kaixiang and Thattai, Govind and Sukhatme, Gaurav S.},
  journal      = {IEEE Robotics and Automation Letters},
  volume       = {7},
  number       = {4},
  pages        = {10049--10056},
  year         = {2022},
  month        = {7},
  publisher    = {IEEE},
  url          = {https://ieeexplore.ieee.org/document/9837390},
  environments = {collaboration, embodied},
  agents       = {finetuning, two_agents},
  evaluation   = {rule_based},
  other        = {human_agent},
}

@article{zhan2018generative,
  title         = {Generative Multi-Agent Behavioral Cloning},
  author        = {Zhan, Eric and Zheng, Stephan and Yue, Yisong and Lucey, Patrick},
  journal       = {arXiv preprint arXiv:1803.07612},
  year          = {2018},
  month         = {3},
  eprint        = {1803.07612},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1803.07612},
  environments  = {collaboration, embodied},
  agents        = {finetuning, two_agents, more_than_three_agents},
  evaluation    = {rule_based},
  other         = {n/a},
}

@article{song2018multi,
  author       = {Song, Jiaming and Ren, Hongyu and Sadigh, Dorsa and Ermon, Stefano},
  title        = {Multi-agent generative adversarial imitation learning},
  journal      = {Advances in neural information processing systems},
  volume       = {31},
  year         = {2018},
  month        = {12},
  url          = {https://proceedings.neurips.cc/paper/2018/hash/240c945bb72980130446fc2b40fbb8e0-Abstract.html},
  environments = {collaboration},
  agents       = {finetuning, two_agents, more_than_three_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

@article{meng2023offline,
  author       = {Meng, Linghui and Wen, Muning and Le, Chenyang and Li, Xiyun and Xing, Dengpeng and Zhang, Weinan and Wen, Ying and Zhang, Haifeng and Wang, Jun and Yang, Yaodong and others},
  title        = {Offline pre-trained multi-agent decision transformer},
  journal      = {Machine Intelligence Research},
  volume       = {20},
  number       = {2},
  pages        = {233--248},
  publisher    = {Springer},
  year         = {2023},
  month        = {3},
  url          = {https://link.springer.com/article/10.1007/s11633-022-1383-7},
  environments = {collaboration, competition, mixed_objectives, embodied},
  agents       = {finetuning, more_than_three_agents, agent_teams},
  evaluation   = {rule_based},
  other        = {n/a},
}

#### Reinforcement learning
@article{modeling/reinforcement-learning,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@article{pan2024autonomous,
  author       = {Pan, Jiayi and Zhang, Yichi and Tomlin, Nicholas and Zhou, Yifei and Levine, Sergey and Suhr, Alane},
  title        = {Autonomous Evaluation and Refinement of Digital Agents},
  journal      = {arXiv preprint arXiv:2404.06474},
  year         = {2024},
  month        = {4},
  url          = {https://arxiv.org/abs/2404.06474v2},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning, finetuning},
  evaluation   = {rule_based},
  other        = {n/a},
}

@article{van2022language,
  author       = {Van Eecke, Paul and Beuls, Katrien and Botoko Ekila, J{\'e}r{\^o}me and R{\u{a}}dulescu, Roxana},
  title        = {Language games meet multi-agent reinforcement learning: A case study for the naming game},
  journal      = {Journal of Language Evolution},
  volume       = {7},
  number       = {2},
  pages        = {213--223},
  publisher    = {Oxford University Press UK},
  year         = {2022},
  month        = {4},
  url          = {https://academic.oup.com/jole/article/7/2/213/7128304},
  environments = {collaboration, competition, mixed_objectives},
  agents       = {reinforcement_learning, two_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

@inproceedings{zhu2022language,
  author       = {Zhu, Hao and Bisk, Yonatan and Neubig, Graham},
  title        = {Language Learning from Communicative Goals and Linguistic Input},
  booktitle    = {Proceedings of the Annual Meeting of the Cognitive Science Society},
  volume       = {44},
  number       = {44},
  year         = {2022},
  month        = {7},
  url          = {https://escholarship.org/uc/item/7p65n371},
  environments = {collaboration, virtual},
  agents       = {reinforcement_learning, two_agents},
  evaluation   = {rule_based},
  other        = {simulated_humans},
}

@misc{wang2024sotopiapi,
  title         = {{SOTOPIA-$\pi$}: Interactive Learning of Socially Intelligent Language Agents},
  author        = {Ruiyi Wang and Haofei Yu and Wenxin Zhang and Zhengyang Qi and Maarten Sap and Graham Neubig and Yonatan Bisk and Hao Zhu},
  year          = {2024},
  month         = {3},
  eprint        = {2403.08715},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2403.08715},
  environments  = {collaboration, competition, mixed_objectives, text},
  agents        = {reinforcement_learning, two_agents},
  evaluation    = {rule_based, human, model_based},
  other         = {simulated_humans},
}

@misc{liu2023computational,
  title         = {Computational Language Acquisition with Theory of Mind},
  author        = {Andy Liu and Hao Zhu and Emmy Liu and Yonatan Bisk and Graham Neubig},
  year          = {2023},
  month         = {3},
  eprint        = {2303.01502},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2303.01502},
  environments  = {collaboration, virtual},
  agents        = {reinforcement_learning, two_agents},
  evaluation    = {rule_based},
  other         = {simulated_humans},
}


@article{zhe2024indentifying,
  title         = {{TacticAI}: an {AI} assistant for football tactics},
  author        = {Wang, Zhe and Veli{\v c}kovi{\'c}, Petar and Hennes, Daniel and Toma{\v s}ev, Nenad and Prince, Laurel and Kaisers, Michael and Bachrach, Yoram and Elie, Romuald and Wenliang, Li Kevin and Piccinini, Federico and Spearman, William and Graham, Ian and Connor, Jerome and Yang, Yi and Recasens, Adri{\`a} and Khan, Mina and Beauguerlange, Nathalie and Sprechmann, Pablo and Moreno, Pol and Heess, Nicolas and Bowling, Michael and Hassabis, Demis and Tuyls, Karl},
  journal       = {Nature Communications},
  volume        = {15},
  number        = {1},
  pages         = {1906},
  year          = {2024},
  month         = {3},
  date          = {2024-03-19},
  issn          = {2041-1723},
  doi           = {10.1038/s41467-024-45965-x},
  url           = {https://doi.org/10.1038/s41467-024-45965-x},
  abstract      = {Identifying key patterns of tactics implemented by rival teams, and developing effective responses, lies at the heart of modern football. However, doing so algorithmically remains an open research challenge. To address this unmet need, we propose TacticAI, an AI football tactics assistant developed and evaluated in close collaboration with domain experts from Liverpool FC. We focus on analysing corner kicks, as they offer coaches the most direct opportunities for interventions and improvements. TacticAI incorporates both a predictive and a generative component, allowing the coaches to effectively sample and explore alternative player setups for each corner kick routine and to select those with the highest predicted likelihood of success. We validate TacticAI on a number of relevant benchmark tasks: predicting receivers and shot attempts and recommending player position adjustments. The utility of TacticAI is validated by a qualitative study conducted with football domain experts at Liverpool FC. We show that TacticAI's model suggestions are not only indistinguishable from real tactics, but also favoured over existing tactics 90{\%} of the time, and that TacticAI offers an effective corner kick retrieval system. TacticAI achieves these results despite the limited availability of gold-standard data, achieving data efficiency through geometric deep learning.},
  id            = {Wang2024},
  date-added    = {2024-04-19 16:43:21 -0400},
  date-modified = {2024-04-19 16:43:21 -0400},
  bdsk-url-1    = {https://doi.org/10.1038/s41467-024-45965-x},
  environments  = {virtual, collaboration, competition, mixed_objectives},
  agents        = {reinforcement_learning, agent_teams},
  evaluation    = {rule_based},
  other         = {n/a},
}

@article{silver2017mastering,
  title        = {Mastering the game of {Go} without human knowledge},
  author       = {Silver, David and Schrittwieser, Julian and Simonyan, Karen and Antonoglou, Ioannis and Huang, Aja and Guez, Arthur and Hubert, Thomas and Baker, Lucas and Lai, Matthew and Bolton, Adrian and others},
  journal      = {Nature},
  volume       = {550},
  number       = {7676},
  pages        = {354--359},
  year         = {2017},
  month        = {10},
  publisher    = {Nature Publishing Group},
  url          = {https://www.nature.com/articles/nature24270},
  environments = {competition, virtual},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {human_agent},
}

@article{silver2016mastering,
  title        = {Mastering the game of {Go} with deep neural networks and tree search},
  author       = {Silver, David and Huang, Aja and Maddison, Chris J. and Guez, Arthur and Sifre, Laurent and Van Den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and others},
  journal      = {Nature},
  volume       = {529},
  number       = {7587},
  pages        = {484--489},
  year         = {2016},
  month        = {1},
  publisher    = {Nature Publishing Group},
  url          = {https://www.nature.com/articles/nature16961},
  environments = {competition, virtual},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {human_agent},
}

@inproceedings{kharitonov-etal-2019-egg,
  title        = {{EGG}: a toolkit for research on Emergence of lan{G}uage in Games},
  author       = {Kharitonov, Eugene  and
      Chaabouni, Rahma  and
      Bouchacourt, Diane  and
      Baroni, Marco},
  editor       = {Pad{\'o}, Sebastian  and
      Huang, Ruihong},
  booktitle    = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP): System Demonstrations},
  year         = {2019},
  month        = nov,
  address      = {Hong Kong, China},
  publisher    = {Association for Computational Linguistics},
  pages        = {55--60},
  doi          = {10.18653/v1/D19-3010},
  url          = {https://aclanthology.org/D19-3010},
  abstract     = {There is renewed interest in simulating language emergence among deep neural agents that communicate to jointly solve a task, spurred by the practical aim to develop language-enabled interactive AIs, as well as by theoretical questions about the evolution of human language. However, optimizing deep architectures connected by a discrete communication channel (such as that in which language emerges) is technically challenging. We introduce EGG, a toolkit that greatly simplifies the implementation of emergent-language communication games. EGG{'}s modular design provides a set of building blocks that the user can combine to create new games, easily navigating the optimization and architecture space. We hope that the tool will lower the technical barrier, and encourage researchers from various backgrounds to do original work in this exciting area.},
  environments = {collaboration, text},
  agents       = {reinforcement_learning, two_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

@inproceedings{cao2018emergent,
  author       = {Kris Cao and Angeliki Lazaridou and Marc Lanctot and Joel Z Leibo and Karl Tuyls and Stephen Clark},
  title        = {Emergent Communication through Negotiation},
  booktitle    = {International Conference on Learning Representations},
  year         = {2018},
  month        = {2},
  url          = {https://openreview.net/forum?id=Hk6WhagRW},
  environments = {mixed_objectives, text},
  agents       = {reinforcement_learning, two_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

@inproceedings{10.1145/3269206.3272021,
  title        = {Real-Time Bidding with Multi-Agent Reinforcement Learning in Display Advertising},
  author       = {Jin, Junqi and Song, Chengru and Li, Han and Gai, Kun and Wang, Jun and Zhang, Weinan},
  booktitle    = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management},
  year         = {2018},
  month        = oct,
  publisher    = {Association for Computing Machinery},
  pages        = {2193--2201},
  numpages     = {9},
  doi          = {10.1145/3269206.3272021},
  url          = {https://doi.org/10.1145/3269206.3272021},
  abstract     = {Real-time advertising allows advertisers to bid for each impression for a visiting user. To optimize specific goals such as maximizing revenue and return on investment (ROI) led by ad placements, advertisers not only need to estimate the relevance between the ads and user's interests, but most importantly require a strategic response with respect to other advertisers bidding in the market. In this paper, we formulate bidding optimization with multi-agent reinforcement learning. To deal with a large number of advertisers, we propose a clustering method and assign each cluster with a strategic bidding agent. A practical Distributed Coordinated Multi-Agent Bidding (DCMAB) has been proposed and implemented to balance the tradeoff between the competition and cooperation among advertisers. The empirical study on our industry-scaled real-world data has demonstrated the effectiveness of our methods. Our results show cluster-based bidding would largely outperform single-agent and bandit approaches, and the coordinated bidding achieves better overall objectives than purely self-interested bidding agents.},
  environments = {competition, virtual},
  agents       = {reinforcement_learning, more_than_three_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

@inproceedings{branavan2010reading,
  title        = {Reading Between the Lines: Learning to Map High-Level Instructions to Commands},
  author       = {Branavan, S. R. K. and Zettlemoyer, Luke and Barzilay, Regina},
  booktitle    = {Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics},
  pages        = {1268--1277},
  year         = {2010},
  month        = {7},
  url          = {https://aclanthology.org/P10-1129},
  environments = {virtual},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a},
}

### Evaluating social agents

#### Evaluating text social agents
@article{evaluation/language,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@inproceedings{finch-choi-2020-towards,
  title        = {Towards Unified Dialogue System Evaluation: A Comprehensive Analysis of Current Evaluation Protocols},
  author       = {Finch, Sarah E.  and
      Choi, Jinho D.},
  editor       = {Pietquin, Olivier  and
      Muresan, Smaranda  and
      Chen, Vivian  and
      Kennington, Casey  and
      Vandyke, David  and
      Dethlefs, Nina  and
      Inoue, Koji  and
      Ekstedt, Erik  and
      Ultes, Stefan},
  booktitle    = {Proceedings of the 21th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  year         = {2020},
  month        = jul,
  address      = {1st virtual meeting},
  publisher    = {Association for Computational Linguistics},
  pages        = {236--245},
  doi          = {10.18653/v1/2020.sigdial-1.29},
  url          = {https://aclanthology.org/2020.sigdial-1.29},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human, rule_based, model_based},
  other        = {human_agent},
}

@inproceedings{tsuta-etal-2020-ubleu,
  title        = {u{BLEU}: Uncertainty-Aware Automatic Evaluation Method for Open-Domain Dialogue Systems},
  author       = {Tsuta, Yuma  and
      Yoshinaga, Naoki  and
      Toyoda, Masashi},
  editor       = {Rijhwani, Shruti  and
      Liu, Jiangming  and
      Wang, Yizhong  and
      Dror, Rotem},
  booktitle    = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop},
  year         = {2020},
  month        = jul,
  address      = {Online},
  publisher    = {Association for Computational Linguistics},
  pages        = {199--206},
  doi          = {10.18653/v1/2020.acl-srw.27},
  url          = {https://aclanthology.org/2020.acl-srw.27},
  environments = {text},
  agents       = {n/a},
  evaluation   = {rule_based},
  other        = {human_agent},
}

@article{deriu2021survey,
  author       = {Deriu, Jan and Rodrigo, Alvaro and Otegi, Arantxa and Echegoyen, Guillermo and Rosset, Sophie and Agirre, Eneko and Cieliebak, Mark},
  title        = {Survey on evaluation methods for dialogue systems},
  journal      = {Artificial Intelligence Review},
  volume       = {54},
  pages        = {755--810},
  publisher    = {Springer},
  year         = {2021},
  month        = {1},
  url          = {https://link.springer.com/article/10.1007/s10462-020-09866-x},
  environments = {text},
  agents       = {n/a},
  evaluation   = {rule_based, model_based},
  other        = {human_agent},
}

@inproceedings{zhang-etal-2021-dynaeval,
  title        = {{D}yna{E}val: Unifying Turn and Dialogue Level Evaluation},
  author       = {Zhang, Chen  and
      Chen, Yiming  and
      D{'}Haro, Luis Fernando  and
      Zhang, Yan  and
      Friedrichs, Thomas  and
      Lee, Grandee  and
      Li, Haizhou},
  editor       = {Zong, Chengqing  and
      Xia, Fei  and
      Li, Wenjie  and
      Navigli, Roberto},
  booktitle    = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  year         = {2021},
  month        = aug,
  address      = {Online},
  publisher    = {Association for Computational Linguistics},
  pages        = {5676--5689},
  doi          = {10.18653/v1/2021.acl-long.441},
  url          = {https://aclanthology.org/2021.acl-long.441},
  environments = {text},
  agents       = {n/a},
  evaluation   = {model_based},
  other        = {human_agent},
}

@inproceedings{smith-etal-2022-human,
  title        = {Human Evaluation of Conversations is an Open Problem: comparing the sensitivity of various methods for evaluating dialogue agents},
  author       = {Smith, Eric  and
      Hsu, Orion  and
      Qian, Rebecca  and
      Roller, Stephen  and
      Boureau, Y-Lan  and
      Weston, Jason},
  editor       = {Liu, Bing  and
      Papangelis, Alexandros  and
      Ultes, Stefan  and
      Rastogi, Abhinav  and
      Chen, Yun-Nung  and
      Spithourakis, Georgios  and
      Nouri, Elnaz  and
      Shi, Weiyan},
  booktitle    = {Proceedings of the 4th Workshop on NLP for Conversational AI},
  year         = {2022},
  month        = may,
  address      = {Dublin, Ireland},
  publisher    = {Association for Computational Linguistics},
  pages        = {77--97},
  doi          = {10.18653/v1/2022.nlp4convai-1.8},
  url          = {https://aclanthology.org/2022.nlp4convai-1.8},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human},
  other        = {human_agent},
}

@inproceedings{finch-etal-2023-dont,
  title        = {Don{'}t Forget Your {ABC}{'}s: Evaluating the State-of-the-Art in Chat-Oriented Dialogue Systems},
  author       = {Finch, Sarah E.  and
      Finch, James D.  and
      Choi, Jinho D.},
  editor       = {Rogers, Anna  and
      Boyd-Graber, Jordan  and
      Okazaki, Naoaki},
  booktitle    = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year         = {2023},
  month        = jul,
  address      = {Toronto, Canada},
  publisher    = {Association for Computational Linguistics},
  pages        = {15044--15071},
  doi          = {10.18653/v1/2023.acl-long.839},
  url          = {https://aclanthology.org/2023.acl-long.839},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human},
  other        = {human_agent},
}

@inproceedings{zhang-etal-2023-xdial,
  title        = {x{D}ial-Eval: A Multilingual Open-Domain Dialogue Evaluation Benchmark},
  author       = {Zhang, Chen  and
      D{'}Haro, Luis  and
      Tang, Chengguang  and
      Shi, Ke  and
      Tang, Guohua  and
      Li, Haizhou},
  editor       = {Bouamor, Houda  and
      Pino, Juan  and
      Bali, Kalika},
  booktitle    = {Findings of the Association for Computational Linguistics: EMNLP 2023},
  year         = {2023},
  month        = dec,
  address      = {Singapore},
  publisher    = {Association for Computational Linguistics},
  pages        = {5579--5601},
  doi          = {10.18653/v1/2023.findings-emnlp.371},
  url          = {https://aclanthology.org/2023.findings-emnlp.371},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human},
  other        = {human_agent},
}

@inproceedings{zhou2024sotopia,
  title        = {{SOTOPIA}: Interactive Evaluation for Social Intelligence in Language Agents},
  author       = {Xuhui Zhou and Hao Zhu and Leena Mathur and Ruohong Zhang and Haofei Yu and Zhengyang Qi and Louis-Philippe Morency and Yonatan Bisk and Daniel Fried and Graham Neubig and Maarten Sap},
  booktitle    = {The Twelfth International Conference on Learning Representations (ICLR)},
  year         = {2024},
  month        = {5},
  url          = {https://openreview.net/forum?id=mM7VurbA4r},
  environments = {mixed_objectives, text},
  agents       = {prompting_and_in_context_learning, two_agents},
  evaluation   = {model_based, human},
  other        = {human_agent},
}

@misc{chen2024roleinteract,
  title         = {RoleInteract: Evaluating the Social Interaction of Role-Playing Agents},
  author        = {Hongzhan Chen and Hehong Chen and Ming Yan and Wenshen Xu and Xing Gao and Weizhou Shen and Xiaojun Quan and Chenliang Li and Ji Zhang and Fei Huang and Jingren Zhou},
  year          = {2024},
  month         = {3},
  eprint        = {2403.13679},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2403.13679},
  environments  = {implicit_objectives, text},
  agents        = {prompting_and_in_context_learning, more_than_three_agents, agents_with_memory, agents_with_personas},
  evaluation    = {rule_based},
  other         = {simulated_humans},
}

@inproceedings{svikhnushina-pu-2023-approximating,
  title        = {Approximating Online Human Evaluation of Social Chatbots with Prompting},
  author       = {Svikhnushina, Ekaterina and Pu, Pearl},
  editor       = {Stoyanchev, Svetlana and Joty, Shafiq and Schlangen, David and Dusek, Ondrej and Kennington, Casey and Alikhani, Malihe},
  booktitle    = {Proceedings of the 24th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  year         = {2023},
  month        = {9},
  address      = {Prague, Czechia},
  publisher    = {Association for Computational Linguistics},
  pages        = {268--281},
  doi          = {10.18653/v1/2023.sigdial-1.25},
  url          = {https://aclanthology.org/2023.sigdial-1.25},
  abstract     = {With conversational models becoming increasingly available to the general public, developing scalable and robust evaluation metrics is crucial to minimize potential social and psychological risks for the users. Existing evaluation metrics aim to automate offline user evaluation and approximate human judgment of pre-curated dialogs. However, they are limited in their ability to capture subjective perceptions of users who actually interact with the chatbots and might not generalize to real-world settings. To address this limitation, we propose an approach to approximate online human evaluation, leveraging large language models (LLMs) from the GPT-family. We introduce a new Dialog system Evaluation framework based on Prompting (DEP), which enables a fully automatic evaluation pipeline that replicates live user studies and achieves an impressive correlation with human judgment (up to Pearson r=0.95 on a system level). The DEP approach involves collecting synthetic chat logs of evaluated bots with an LLM in the other-play setting, where the LLM is carefully conditioned to follow a specific scenario. We further explore different prompting approaches to produce evaluation scores with the same LLM. The best-performing prompts, which contain few-shot demonstrations and instructions, show outstanding performance on the tested dataset and demonstrate the ability to generalize to other dialog corpora.},
  environments = {mixed_objectives, text},
  agents       = {prompting_and_in_context_learning, two_agents},
  evaluation   = {model_based},
  other        = {n/a},
}

@inproceedings{NEURIPS2023_a3621ee9,
  title        = {{CAMEL}: Communicative Agents for ``Mind'' Exploration of Large Language Model Society},
  author       = {Li, Guohao and Hammoud, Hasan and Itani, Hani and Khizbullin, Dmitrii and Ghanem, Bernard},
  editor       = {A. Oh and T. Neumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
  booktitle    = {Advances in Neural Information Processing Systems},
  volume       = {36},
  pages        = {51991--52008},
  publisher    = {Curran Associates, Inc.},
  year         = {2023},
  month        = {12},
  url          = {https://proceedings.neurips.cc/paper_files/paper/2023/file/a3621ee907def47c1b952ade25c67698-Paper-Conference.pdf},
  environments = {mixed_objectives, text},
  agents       = {prompting_and_in_context_learning, agent_teams},
  evaluation   = {human, model_based},
  other        = {human_agent},
}

@article{lan2023llm,
  title         = {{LLM}-Based Agent Society Investigation: Collaboration and Confrontation in {Avalon} Gameplay},
  author        = {Lan, Yihuai and Hu, Zhiqiang and Wang, Lei and Wang, Yang and Ye, Deheng and Zhao, Peilin and Lim, Ee-Peng and Xiong, Hui and Wang, Hao},
  journal       = {arXiv preprint arXiv:2310.14985},
  year          = {2023},
  month         = {10},
  eprint        = {2310.14985},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2310.14985},
  environments  = {mixed_objectives, text},
  agents        = {prompting_and_in_context_learning, more_than_three_agents, agents_with_memory, agents_with_personas},
  evaluation    = {model_based, rule_based},
  other         = {simulated_humans},
}

@misc{tu2023characterchat,
  title         = {CharacterChat: Learning towards Conversational AI with Personalized Social Support},
  author        = {Quan Tu and Chuanqi Chen and Jinpeng Li and Yanran Li and Shuo Shang and Dongyan Zhao and Ran Wang and Rui Yan},
  year          = {2023},
  month         = {8},
  eprint        = {2308.10278},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2308.10278},
  environments  = {implicit_objectives, text},
  agents        = {prompting_and_in_context_learning, two_agents, agents_with_memory, agents_with_personas},
  evaluation    = {model_based, human},
  other         = {simulated_humans},
}

@misc{zhang2023agentcf,
  title         = {AgentCF: Collaborative Learning with Autonomous Language Agents for Recommender Systems},
  author        = {Junjie Zhang and Yupeng Hou and Ruobing Xie and Wenqi Sun and Julian McAuley and Wayne Xin Zhao and Leyu Lin and Ji-Rong Wen},
  year          = {2023},
  month         = {10},
  eprint        = {2310.09233},
  archivePrefix = {arXiv},
  primaryClass  = {cs.IR},
  url           = {https://arxiv.org/abs/2310.09233},
  environments  = {mixed_objectives, text},
  agents        = {prompting_and_in_context_learning, more_than_three_agents, agents_with_memory, agents_with_personas},
  evaluation    = {rule_based},
  other         = {simulated_humans},
}

@misc{huang2024far,
  title         = {How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming Ability in Multi-Agent Environments},
  author        = {Jen-tse Huang and Eric John Li and Man Ho Lam and Tian Liang and Wenxuan Wang and Youliang Yuan and Wenxiang Jiao and Xing Wang and Zhaopeng Tu and Michael R. Lyu},
  year          = {2024},
  month         = {3},
  eprint        = {2403.11807},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2403.11807},
  environments  = {mixed_objectives, text},
  agents        = {prompting_and_in_context_learning, more_than_three_agents},
  evaluation    = {rule_based},
  other         = {more_omniscient},
}

@misc{chan2023chateval,
  title         = {ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate},
  author        = {Chi-Min Chan and Weize Chen and Yusheng Su and Jianxuan Yu and Wei Xue and Shanghang Zhang and Jie Fu and Zhiyuan Liu},
  year          = {2023},
  month         = {8},
  eprint        = {2308.07201},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2308.07201},
  environments  = {collaboration, text},
  agents        = {prompting_and_in_context_learning, more_than_three_agents},
  evaluation    = {rule_based},
  other         = {n/a},
}

@misc{li2024automatic,
  title         = {Automatic Evaluation for Mental Health Counseling using LLMs},
  author        = {Anqi Li and Yu Lu and Nirui Song and Shuai Zhang and Lizhi Ma and Zhenzhong Lan},
  year          = {2024},
  month         = {2},
  eprint        = {2402.11958},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.11958},
  environments  = {collaboration, text},
  agents        = {prompting_and_in_context_learning, two_agents},
  evaluation    = {model_based},
  other         = {n/a},
}

@misc{bianchi2024llms,
  title         = {How Well Can LLMs Negotiate? NegotiationArena Platform and Analysis},
  author        = {Federico Bianchi and Patrick John Chia and Mert Yuksekgonul and Jacopo Tagliabue and Dan Jurafsky and James Zou},
  year          = {2024},
  month         = {2},
  eprint        = {2402.05863},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2402.05863},
  environments  = {mixed_objectives, text},
  agents        = {prompting_and_in_context_learning, two_agents},
  evaluation    = {rule_based},
  other         = {more_information_asymmetrical},
}

@inproceedings{Jiang2023PersonaLLMIT,
  author       = {Hang Jiang and Xiajie Zhang and Xubo Cao and Cynthia Breazeal and Deb Roy and Jad Kabbara},
  title        = {PersonaLLM: Investigating the Ability of Large Language Models to Express Personality Traits},
  booktitle    = {NAACL Findings},
  year         = {2023},
  month        = {5},
  url          = {https://api.semanticscholar.org/CorpusID:268032940},
  environments = {text},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {human, model_based},
  other        = {n/a},
}

@article{Xie2024CanLL,
  title         = {Can Large Language Model Agents Simulate Human Trust Behaviors?},
  author        = {Chengxing Xie and Canyu Chen and Feiran Jia and Ziyu Ye and Kai Shu and Adel Bibi and Ziniu Hu and Philip H.S. Torr and Bernard Ghanem and Guohao Li},
  journal       = {arXiv preprint arXiv:2402.04559},
  year          = {2024},
  month         = {2},
  eprint        = {2402.04559},
  archivePrefix = {arXiv},
  url           = {https://api.semanticscholar.org/CorpusID:267523076},
  environments  = {text},
  agents        = {prompting_and_in_context_learning},
  evaluation    = {human, model_based},
  other         = {n/a},
}

@article{Rasal2024LLMHM,
  title         = {LLM Harmony: Multi-Agent Communication for Problem Solving},
  author        = {Sumedh Rasal},
  journal       = {arXiv preprint arXiv:2401.01312},
  year          = {2024},
  month         = {1},
  eprint        = {2401.01312},
  archivePrefix = {arXiv},
  url           = {https://api.semanticscholar.org/CorpusID:266725580},
  environments  = {text},
  agents        = {prompting_and_in_context_learning},
  evaluation    = {human, model_based},
  other         = {n/a},
}

@inproceedings{yeh-etal-2021-comprehensive,
  title        = {A Comprehensive Assessment of Dialog Evaluation Metrics},
  author       = {Yeh, Yi-Ting  and
      Eskenazi, Maxine  and
      Mehri, Shikib},
  editor       = {Wei, Wei  and
      Dai, Bo  and
      Zhao, Tuo  and
      Li, Lihong  and
      Yang, Diyi  and
      Chen, Yun-Nung  and
      Boureau, Y-Lan  and
      Celikyilmaz, Asli  and
      Geramifard, Alborz  and
      Ahuja, Aman  and
      Jiang, Haoming},
  booktitle    = {The First Workshop on Evaluations and Assessments of Neural Conversation Systems},
  year         = {2021},
  month        = nov,
  address      = {Online},
  publisher    = {Association for Computational Linguistics},
  url          = {https://aclanthology.org/2021.eancs-1.3},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human, model_based, rule_based},
  other        = {n/a},
}

@inproceedings{chang-etal-2020-convokit,
  title        = {{C}onvo{K}it: A Toolkit for the Analysis of Conversations},
  author       = {Chang, Jonathan P.  and
      Chiam, Caleb  and
      Fu, Liye  and
      Wang, Andrew  and
      Zhang, Justine  and
      Danescu-Niculescu-Mizil, Cristian},
  editor       = {Pietquin, Olivier  and
      Muresan, Smaranda  and
      Chen, Vivian  and
      Kennington, Casey  and
      Vandyke, David  and
      Dethlefs, Nina  and
      Inoue, Koji  and
      Ekstedt, Erik  and
      Ultes, Stefan},
  booktitle    = {Proceedings of the 21th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  year         = {2020},
  month        = jul,
  address      = {1st virtual meeting},
  publisher    = {Association for Computational Linguistics},
  pages        = {57--60},
  doi          = {10.18653/v1/2020.sigdial-1.8},
  url          = {https://aclanthology.org/2020.sigdial-1.8},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human, model_based, rule_based},
  other        = {n/a},
}

@misc{giorgi2023psychological,
  author        = {Salvatore Giorgi and Shreya Havaldar and Farhan Ahmed and Zuhaib Akhtar and Shalaka Vaidya and Gary Pan and Lyle H. Ungar and H. Andrew Schwartz and Joao Sedoc},
  title         = {Psychological Metrics for Dialog System Evaluation},
  year          = {2023},
  eprint        = {2305.14757},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2305.14757},
  environments  = {text},
  agents        = {two_agents},
  evaluation    = {human, rule_based},
  other         = {human_agent}
}


@misc{ghazarian2023accent,
  author        = {Sarik Ghazarian and Yijia Shao and Rujun Han and Aram Galstyan and Nanyun Peng},
  title         = {ACCENT: An Automatic Event Commonsense Evaluation Metric for Open-Domain Dialogue Systems},
  year          = {2023},
  eprint        = {2305.07797},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/pdf/2305.07797},
  environments  = {text},
  agents        = {two_agents},
  evaluation    = {human, model_based},
  other         = {human_agent}
}

@inproceedings{huang-etal-2020-grade,
  author       = {Huang, Lishan and Ye, Zheng and Qin, Jinghui and Lin, Liang and Liang, Xiaodan},
  title        = {{GRADE}: Automatic Graph-Enhanced Coherence Metric for Evaluating Open-Domain Dialogue Systems},
  editor       = {Webber, Bonnie and Cohn, Trevor and He, Yulan and Liu, Yang},
  booktitle    = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  pages        = {9230--9240},
  year         = {2020},
  month        = nov,
  address      = {Online},
  publisher    = {Association for Computational Linguistics},
  doi          = {10.18653/v1/2020.emnlp-main.742},
  url          = {https://aclanthology.org/2020.emnlp-main.742},
  abstract     = {Automatically evaluating dialogue coherence is a challenging but high-demand ability for developing high-quality open-domain dialogue systems. However, current evaluation metrics consider only surface features or utterance-level semantics, without explicitly considering the fine-grained topic transition dynamics of dialogue flows. Here, we first consider that the graph structure constituted with topics in a dialogue can accurately depict the underlying communication logic, which is a more natural way to produce persuasive metrics. Capitalized on the topic-level dialogue graph, we propose a new evaluation metric GRADE, which stands for Graph-enhanced Representations for Automatic Dialogue Evaluation. Specifically, GRADE incorporates both coarse-grained utterance-level contextualized representations and fine-grained topic-level graph representations to evaluate dialogue coherence. The graph representations are obtained by reasoning over topic-level dialogue graphs enhanced with the evidence from a commonsense graph, including k-hop neighboring representations and hop-attention weights. Experimental results show that our GRADE significantly outperforms other state-of-the-art metrics on measuring diverse dialogue models in terms of the Pearson and Spearman correlations with human judgments. Besides, we release a new large-scale human evaluation benchmark to facilitate future research on automatic metrics.},
  environments = {text},
  agents       = {two_agents},
  evaluation   = {human, model_based},
  other        = {human_agent}
}

@inproceedings{mehri-eskenazi-2020-unsupervised,
  author       = {Mehri, Shikib and Eskenazi, Maxine},
  title        = {Unsupervised Evaluation of Interactive Dialog with {D}ialo{GPT}},
  editor       = {Pietquin, Olivier and Muresan, Smaranda and Chen, Vivian and Kennington, Casey and Vandyke, David and Dethlefs, Nina and Inoue, Koji and Ekstedt, Erik and Ultes, Stefan},
  booktitle    = {Proceedings of the 21th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  pages        = {225--235},
  year         = {2020},
  month        = jul,
  address      = {1st virtual meeting},
  publisher    = {Association for Computational Linguistics},
  doi          = {10.18653/v1/2020.sigdial-1.28},
  url          = {https://aclanthology.org/2020.sigdial-1.28},
  abstract     = {It is important to define meaningful and interpretable automatic evaluation metrics for open-domain dialog research. Standard language generation metrics have been shown to be ineffective for dialog. This paper introduces the FED metric (fine-grained evaluation of dialog), an automatic evaluation metric which uses DialoGPT, without any fine-tuning or supervision. It also introduces the FED dataset which is constructed by annotating a set of human-system and human-human conversations with eighteen fine-grained dialog qualities. The FED metric (1) does not rely on a ground-truth response, (2) does not require training data and (3) measures fine-grained dialog qualities at both the turn and whole dialog levels. FED attains moderate to strong correlation with human judgement at both levels.},
  environments = {text},
  agents       = {two_agents},
  evaluation   = {human, model_based},
  other        = {human_agent}
}

#### Evaluating embodied social agents
@article{evaluation/embodied,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one}
}

@inproceedings{min-etal-2022-dont,
  author       = {Min, So Yeon and Zhu, Hao and Salakhutdinov, Ruslan and Bisk, Yonatan},
  title        = {Don{'}t Copy the Teacher: Data and Model Challenges in Embodied Dialogue},
  editor       = {Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue},
  booktitle    = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  pages        = {9361--9368},
  year         = {2022},
  month        = dec,
  address      = {Abu Dhabi, United Arab Emirates},
  publisher    = {Association for Computational Linguistics},
  doi          = {10.18653/v1/2022.emnlp-main.635},
  url          = {https://aclanthology.org/2022.emnlp-main.635},
  abstract     = {Embodied dialogue instruction following requires an agent to complete a complex sequence of tasks from a natural language exchange. The recent introduction of benchmarks raises the question of how best to train and evaluate models for this multi-turn, multi-agent, long-horizon task. This paper contributes to that conversation, by arguing that imitation learning (IL) and related low-level metrics are actually misleading and do not align with the goals of embodied dialogue research and may hinder progress. We provide empirical comparisons of metrics, analysis of three models, and make suggestions for how the field might best progress. First, we observe that models trained with IL take spurious actions during evaluation. Second, we find that existing models fail to ground query utterances, which are essential for task completion. Third, we argue evaluation should focus on higher-level semantic goals. We will release code to additionally filter the data and benchmark models for improved evaluation.},
  environments = {mixed_objectives, collaboration, embodied},
  agents       = {agents_with_memory},
  evaluation   = {qualitative, human, rule_based},
  other        = {n/a}
}

@misc{guo2024embodied,
  author        = {Xudong Guo and Kaixuan Huang and Jiale Liu and Wenhui Fan and Natalia Vélez and Qingyun Wu and Huazheng Wang and Thomas L. Griffiths and Mengdi Wang},
  title         = {Embodied LLM Agents Learn to Cooperate in Organized Teams},
  year          = {2024},
  month         = {3},
  eprint        = {2403.12482},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2403.12482},
  environments  = {collaboration, embodied},
  agents        = {prompting_and_in_context_learning, more_than_three_agents},
  evaluation    = {model_based, human},
  other         = {education}
}


@article{10.1145/3476413,
  author       = {Biswas, Abhijat and Wang, Allan and Silvera, Gustavo and Steinfeld, Aaron and Admoni, Henny},
  title        = {SocNavBench: A Grounded Simulation Testing Framework for Evaluating Social Navigation},
  journal      = {J. Hum.-Robot Interact.},
  year         = {2022},
  month        = {2},
  doi          = {10.1145/3476413},
  url          = {https://doi.org/10.1145/3476413},
  abstract     = {The human-robot interaction community has developed many methods for robots to navigate safely and socially alongside humans. However, experimental procedures to evaluate these works are usually constructed on a per-method basis. Such disparate evaluations make it difficult to compare the performance of such methods across the literature. To bridge this gap, we introduce SocNavBench, a simulation framework for evaluating social navigation algorithms. SocNavBench comprises a simulator with photo-realistic capabilities and curated social navigation scenarios grounded in real-world pedestrian data. We also provide an implementation of a suite of metrics to quantify the performance of navigation algorithms on these scenarios. Altogether, SocNavBench provides a test framework for evaluating disparate social navigation methods in a consistent and interpretable manner. To illustrate its use, we demonstrate testing three existing social navigation methods and a baseline method on SocNavBench, showing how the suite of metrics helps infer their performance trade-offs. Our code is open-source, allowing the addition of new scenarios and metrics by the community to help evolve SocNavBench to reflect advancements in our understanding of social navigation.},
  environments = {implicit_objectives, embodied},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {simulated_humans}
}


@inproceedings{10.5555/3463952.3464159,
  author       = {Knott, Paul and Carroll, Micah and Devlin, Sam and Ciosek, Kamil and Hofmann, Katja and Dragan, Anca and Shah, Rohin},
  title        = {Evaluating the Robustness of Collaborative Agents},
  booktitle    = {Proceedings of the 20th International Conference on Autonomous Agents and MultiAgent Systems},
  series       = {AAMAS '21},
  pages        = {1560--1562},
  numpages     = {3},
  year         = {2021},
  month        = {1},
  location     = {Virtual Event, United Kingdom},
  publisher    = {International Foundation for Autonomous Agents and Multiagent Systems},
  address      = {Richland, SC},
  isbn         = {9781450383073},
  url          = {https://dl.acm.org/doi/abs/10.5555/3463952.3464159},
  keywords     = {human-AI collaboration, multi-agent RL, robustness},
  abstract     = {Artificial agents trained by deep reinforcement learning will likely encounter novel situations after deployment that were never seen during training. Our agent must be robust to handle such situations well. However, if we cannot rely on the average training or validation reward as a metric, then how can we effectively evaluate robustness? We take inspiration from the practice of unit testing in software engineering. Specifically, we suggest that when designing AI agents that collaborate with humans, designers should search for potential edge cases in possible partner behavior and possible states encountered, and write tests which check that the behavior of the agent in these edge cases is reasonable. We apply this methodology to build a suite of unit tests for the Overcooked-AI environment, and use this test suite to evaluate three proposals for improving robustness. We find that the test suite provides significant insight into the effects of these proposals that were generally not revealed by looking solely at the average validation reward. For our full paper, see https://arxiv.org/abs/2101.05507},
  environments = {collaboration, embodied},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a}
}


@misc{singh2024change,
  author        = {Siddarth Singh and Omayma Mahjoub and Ruan de Kock and Wiem Khlifi and Abidine Vall and Kale-ab Tessera and Arnu Pretorius},
  title         = {How much can change in a year? Revisiting Evaluation in Multi-Agent Reinforcement Learning},
  year          = {2024},
  eprint        = {2312.08463},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2312.08463},
  environments  = {collaboration, embodied},
  agents        = {reinforcement_learning, more_than_three_agents},
  evaluation    = {rule_based},
  other         = {n/a}
}

@inproceedings{leibo2021scalable,
  author       = {Leibo, Joel Z and Due{\~n}ez-Guzman, Edgar A and Vezhnevets, Alexander and Agapiou, John P and Sunehag, Peter and Koster, Raphael and Matyas, Jayd and Beattie, Charlie and Mordatch, Igor and Graepel, Thore},
  title        = {Scalable Evaluation of Multi-Agent Reinforcement Learning with {Melting Pot}},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  pages        = {6187--6199},
  year         = {2021},
  month        = {7},
  publisher    = {PMLR},
  url          = {https://proceedings.mlr.press/v139/leibo21a.html},
  environments = {collaboration, competition, mixed_objectives, embodied},
  agents       = {reinforcement_learning, more_than_three_agents},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{bara-etal-2021-mindcraft,
  author       = {Bara, Cristian-Paul and CH-Wang, Sky and Chai, Joyce},
  title        = {{M}ind{C}raft: Theory of Mind Modeling for Situated Dialogue in Collaborative Tasks},
  booktitle    = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  pages        = {1112--1125},
  year         = {2021},
  month        = nov,
  address      = {Online and Punta Cana, Dominican Republic},
  publisher    = {Association for Computational Linguistics},
  url          = {https://aclanthology.org/2021.emnlp-main.85},
  abstract     = {An ideal integration of autonomous agents in a human world implies that they are able to collaborate on human terms. In particular, theory of mind plays an important role in maintaining common ground during human collaboration and communication. To enable theory of mind modeling in situated interactions, we introduce a fine-grained dataset of collaborative tasks performed by pairs of human subjects in the 3D virtual blocks world of Minecraft. It provides information that captures partners{'} beliefs of the world and of each other as an interaction unfolds, bringing abundant opportunities to study human collaborative behaviors in situated language communication. As a first step towards our goal of developing embodied AI agents able to infer belief states of collaborative partners in situ, we build and present results on computational models for several theory of mind tasks.},
  environments = {collaboration, embodied},
  agents       = {finetuning, two_agents},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@inproceedings{narayan-chen-etal-2019-collaborative,
  author       = {Narayan-Chen, Anjali and Jayannavar, Prashant and Hockenmaier, Julia},
  title        = {Collaborative Dialogue in {M}inecraft},
  editor       = {Korhonen, Anna and Traum, David and M{\`a}rquez, Llu{\'\i}s},
  booktitle    = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  pages        = {5405--5415},
  year         = {2019},
  month        = jul,
  address      = {Florence, Italy},
  publisher    = {Association for Computational Linguistics},
  doi          = {10.18653/v1/P19-1537},
  url          = {https://aclanthology.org/P19-1537},
  abstract     = {We wish to develop interactive agents that can communicate with humans to collaboratively solve tasks in grounded scenarios. Since computer games allow us to simulate such tasks without the need for physical robots, we define a Minecraft-based collaborative building task in which one player (A, the Architect) is shown a target structure and needs to instruct the other player (B, the Builder) to build this structure. Both players interact via a chat interface. A can observe B but cannot place blocks. We present the Minecraft Dialogue Corpus, a collection of 509 conversations and game logs. As a first step towards our goal of developing fully interactive agents for this task, we consider the subtask of Architect utterance generation, and show how challenging it is.},
  environments = {collaboration, embodied},
  agents       = {two_agents},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@inproceedings{ichikawa-higashinaka-2022-analysis,
  author       = {Ichikawa, Takuma and Higashinaka, Ryuichiro},
  title        = {Analysis of Dialogue in Human-Human Collaboration in {M}inecraft},
  editor       = {Calzolari, Nicoletta and B{\'e}chet, Fr{\'e}d{\'e}ric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Odijk, Jan and Piperidis, Stelios},
  booktitle    = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
  pages        = {4051--4059},
  year         = {2022},
  month        = jun,
  address      = {Marseille, France},
  publisher    = {European Language Resources Association},
  url          = {https://aclanthology.org/2022.lrec-1.431},
  abstract     = {Recently, many studies have focused on developing dialogue systems that enable collaborative work; however, they rarely focus on creative tasks. Collaboration for creative work, in which humans and systems collaborate to create new value, will be essential for future dialogue systems. In this study, we collected 500 dialogues of human-human collaboration in Minecraft as a basis for developing a dialogue system that enables creative collaborative work. We conceived the Collaborative Garden Task, where two workers interact and collaborate in Minecraft to create a garden, and we collected dialogue, action logs, and subjective evaluations. We also collected third-person evaluations of the gardens and analyzed the relationship between dialogue and collaborative work that received high scores on the subjective and third-person evaluations in order to identify dialogic factors for high-quality collaborative work. We found that two essential aspects in creative collaborative work are performing more processes to ask for and agree on suggestions between workers and agreeing on a particular image of the final product in the early phase of work and then discussing changes and details.},
  environments = {collaboration, embodied},
  agents       = {two_agents},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@inproceedings{CordialSync,
  title        = {A Cordial Sync: Going Beyond Marginal Policies For Multi-Agent Embodied Tasks},
  author       = {Jain, Unnat and Weihs, Luca and Kolve, Eric and Farhadi, Ali and Lazebnik, Svetlana and Kembhavi, Aniruddha and Schwing, Alexander G.},
  booktitle    = {ECCV},
  year         = {2020},
  month        = {11},
  url          = {https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123500460.pdf},
  note         = {first two authors contributed equally},
  environments = {collaboration, embodied},
  agents       = {finetuning, reinforcement_learning, two_agents},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{TwoBody,
  title        = {Two Body Problem: Collaborative Visual Task Completion},
  author       = {Jain, Unnat and Weihs, Luca and Kolve, Eric and Rastegari, Mohammad and Lazebnik, Svetlana and Farhadi, Ali and Schwing, Alexander G. and Kembhavi, Aniruddha},
  booktitle    = {CVPR},
  year         = {2019},
  month        = {6},
  url          = {https://openaccess.thecvf.com/content_CVPR_2019/papers/Jain_Two_Body_Problem_Collaborative_Visual_Task_Completion_CVPR_2019_paper.pdf},
  note         = {first two authors contributed equally},
  environments = {collaboration, embodied},
  agents       = {finetuning, reinforcement_learning, two_agents},
  evaluation   = {rule_based},
  other        = {n/a}
}

@inproceedings{teach,
  author       = {Padmakumar, Aishwarya and Thomason, Jesse and Shrivastava, Ayush and Lange, Patrick and Narayan-Chen, Anjali and Gella, Spandana and Piramuthu, Robinson and Tur, Gokhan and Hakkani-Tur, Dilek},
  title        = {{TEACh}: Task-driven Embodied Agents that Chat},
  booktitle    = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume       = {36},
  number       = {2},
  pages        = {2017--2025},
  year         = {2022},
  month        = {2},
  url          = {https://arxiv.org/abs/2110.00534},
  environments = {collaboration, embodied},
  agents       = {finetuning, two_agents},
  evaluation   = {rule_based},
  other        = {human_agent}
}

@inproceedings{teachda,
  author       = {Gella, Spandana and Padmakumar, Aishwarya and Lange, Patrick and Hakkani-Tur, Dilek},
  title        = {Dialog Acts for Task-Driven Embodied Agents},
  booktitle    = {Proceedings of the 23rd Annual Meeting of the Special Interest Group on Discourse and Dialogue (SIGDial)},
  pages        = {111--123},
  year         = {2022},
  month        = {9},
  url          = {https://aclanthology.org/2022.sigdial-1.13},
  environments = {collaboration, embodied},
  agents       = {finetuning, two_agents},
  evaluation   = {rule_based},
  other        = {human_agent}
}

@inproceedings{kim2016evaluation,
  author       = {Kim, Man-Je and Kim, Kyung-Joong and Kim, SeungJun and Dey, Anind K},
  title        = {Evaluation of {StarCraft} Artificial Intelligence Competition Bots by Experienced Human Players},
  booktitle    = {Proceedings of the 2016 CHI Conference Extended Abstracts on Human Factors in Computing Systems},
  pages        = {1915--1921},
  year         = {2016},
  month        = {5},
  url          = {https://dl.acm.org/doi/pdf/10.1145/2851581.2892305},
  environments = {collaboration, competition, mixed_objectives, embodied},
  agents       = {more_than_three_agents},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

#### Evaluating virtual social agents
@article{evaluation/virtual,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one}
}


@article{Lù2024WebLINXRW,
  author       = {Xing Han Lù and Zdeněk Kasner and Siva Reddy},
  title        = {WebLINX: Real-World Website Navigation with Multi-Turn Dialogue},
  journal      = {ArXiv},
  volume       = {abs/2402.05930},
  year         = {2024},
  month        = {2},
  url          = {https://arxiv.org/abs/2402.05930},
  environments = {virtual},
  agents       = {prompting_and_in_context_learning, finetuning},
  evaluation   = {rule_based},
  other        = {human_agent}
}


@article{Xie2023OpenAgentsAO,
  author       = {Tianbao Xie and Fan Zhou and Zhoujun Cheng and Peng Shi and Luoxuan Weng and Yitao Liu and Toh Jing Hua and Junning Zhao and Qian Liu and Che Liu and Leo Z. Liu and Yiheng Xu and Hongjin Su and Dongchan Shin and Caiming Xiong and Tao Yu},
  title        = {OpenAgents: An Open Platform for Language Agents in the Wild},
  journal      = {ArXiv},
  volume       = {abs/2310.10634},
  year         = {2023},
  month        = {10},
  url          = {https://api.semanticscholar.org/CorpusID:264172893},
  environments = {virtual, collaboration},
  agents       = {n/a},
  evaluation   = {qualitative},
  other        = {n/a}
}

@misc{mialon2023gaia,
  author        = {Grégoire Mialon and Clémentine Fourrier and Craig Swift and Thomas Wolf and Yann LeCun and Thomas Scialom},
  title         = {GAIA: a benchmark for General AI Assistants},
  year          = {2023},
  eprint        = {2311.12983},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2311.12983},
  environments  = {virtual, collaboration},
  agents        = {n/a},
  evaluation    = {rule_based},
  other         = {fully_omniscient}
}

@article{Sumers2023CognitiveAF,
  author       = {Theodore R. Sumers and Shunyu Yao and Karthik Narasimhan and Thomas L. Griffiths},
  title        = {Cognitive Architectures for Language Agents},
  journal      = {ArXiv},
  volume       = {abs/2309.02427},
  year         = {2023},
  month        = {9},
  url          = {https://api.semanticscholar.org/CorpusID:261556862},
  environments = {n/a},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a}
}

@article{Hong2023MetaGPTMP,
  author       = {Sirui Hong and Mingchen Zhuge and Jonathan Chen and Xiawu Zheng and Yuheng Cheng and Ceyao Zhang and Jinlin Wang and Zili Wang and Steven Ka Shing Yau and Zi Hen Lin and Liyang Zhou and Chenyu Ran and Lingfeng Xiao and Chenglin Wu and J{\"u}rgen Schmidhuber},
  title        = {MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework},
  journal      = {ArXiv},
  year         = {2023},
  month        = {11},
  url          = {https://api.semanticscholar.org/CorpusID:265301950},
  environments = {virtual, collaboration},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {more_omniscient}
}

@inproceedings{10.1145/3290605.3300511,
  author       = {Wang, Isaac and Smith, Jesse and Ruiz, Jaime},
  title        = {Exploring Virtual Agents for Augmented Reality},
  booktitle    = {CHI},
  pages        = {1--12},
  numpages     = {12},
  year         = {2019},
  month        = {5},
  location     = {Glasgow, Scotland Uk},
  publisher    = {Association for Computing Machinery},
  address      = {New York, NY, USA},
  isbn         = {9781450359702},
  doi          = {10.1145/3290605.3300511},
  url          = {https://doi.org/10.1145/3290605.3300511},
  keywords     = {embodied conversational agents, augmented reality},
  environments = {virtual, collaboration},
  agents       = {n/a},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@inproceedings{10.1145/3374920.3374956,
  author       = {Reinhardt, Jens and Hillen, Luca and Wolf, Katrin},
  title        = {Embedding Conversational Agents into AR: Invisible or with a Realistic Human Body?},
  booktitle    = {Proceedings of the Fourteenth International Conference on Tangible, Embedded, and Embodied Interaction},
  series       = {TEI '20},
  pages        = {299--310},
  numpages     = {12},
  year         = {2020},
  month        = {2},
  location     = {Sydney NSW, Australia},
  publisher    = {Association for Computing Machinery},
  address      = {New York, NY, USA},
  isbn         = {9781450361071},
  doi          = {10.1145/3374920.3374956},
  url          = {https://doi.org/10.1145/3374920.3374956},
  keywords     = {intelligent virtual assistants, embodied conversational agents, avatars, augmented reality},
  environments = {virtual, collaboration},
  agents       = {two_agents},
  evaluation   = {human},
  other        = {human_agent}
}


#### Evaluating robotics in social contexts
@article{evaluation/robotics,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one}
}


@misc{sferrazza2024humanoidbench,
  author        = {Carmelo Sferrazza and Dun-Ming Huang and Xingyu Lin and Youngwoon Lee and Pieter Abbeel},
  title         = {HumanoidBench: Simulated Humanoid Benchmark for Whole-Body Locomotion and Manipulation},
  year          = {2024},
  month         = {3},
  eprint        = {2403.10506},
  archivePrefix = {arXiv},
  primaryClass  = {cs.RO},
  url           = {https://arxiv.org/abs/2403.10506},
  environments  = {collaboration, mixed_objectives, robotics},
  agents        = {reinforcement_learning},
  evaluation    = {human, model_based},
  other         = {human_agent, simulated_humans}
}

@inproceedings{1174284,
  author       = {Scholtz, Jean},
  title        = {Theory and evaluation of human robot interactions},
  booktitle    = {Proceedings of the 36th Annual Hawaii International Conference on System Sciences},
  year         = {2003},
  month        = {1},
  note         = {10 pp.},
  doi          = {10.1109/HICSS.2003.1174284},
  url          = {https://ieeexplore.ieee.org/document/1174284},
  keywords     = {Human robot interaction;Robot sensing systems;Mobile robots;Software architecture;NIST;Feeds;User interfaces;Computer architecture;Human computer interaction;Man machine systems},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human},
  other        = {human_agent}
}
  
@inproceedings{10.1145/1121241.1121249,
  author       = {Steinfeld, Aaron and Fong, Terrence and Kaber, David and Lewis, Michael and Scholtz, Jean and Schultz, Alan and Goodrich, Michael},
  title        = {Common metrics for human-robot interaction},
  booktitle    = {Proceedings of the 1st ACM SIGCHI/SIGART Conference on Human-Robot Interaction},
  series       = {HRI '06},
  pages        = {33--40},
  numpages     = {8},
  year         = {2006},
  month        = {3},
  location     = {Salt Lake City, Utah, USA},
  publisher    = {Association for Computing Machinery},
  address      = {New York, NY, USA},
  isbn         = {1595932941},
  doi          = {10.1145/1121241.1121249},
  url          = {https://doi.org/10.1145/1121241.1121249},
  keywords     = {unmanned ground vehicles, metrics, human-robot interaction},
  abstract     = {This paper describes an effort to identify common metrics for task-oriented human-robot interaction (HRI). We begin by discussing the need for a toolkit of HRI metrics. We then describe the framework of our work and identify important biasing factors that must be taken into consideration. Finally, we present suggested common metrics for standardization and a case study. Preparation of a larger, more detailed toolkit is in progress.},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human, rule_based, model_based},
  other        = {human_agent}
}

@article{zacharaki2020safety,
  author       = {Zacharaki, Angeliki and Kostavelis, Ioannis and Gasteratos, Antonios and Dokas, Ioannis},
  title        = {Safety bounds in human robot interaction: A survey},
  journal      = {Safety Science},
  volume       = {127},
  pages        = {104667},
  year         = {2020},
  month        = {7},
  publisher    = {Elsevier},
  url          = {https://www.sciencedirect.com/science/article/pii/S0925753520300643},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@article{hancock2011meta,
  author       = {Hancock, Peter A and Billings, Deborah R and Schaefer, Kristin E and Chen, Jessie Y. C. and De Visser, Ewart J and Parasuraman, Raja},
  title        = {A meta-analysis of factors affecting trust in human-robot interaction},
  journal      = {Human Factors},
  volume       = {53},
  number       = {5},
  pages        = {517--527},
  year         = {2011},
  month        = {10},
  publisher    = {SAGE Publications},
  address      = {Los Angeles, CA},
  url          = {https://journals.sagepub.com/doi/10.1177/0018720811417254},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@article{bartneck2009measurement,
  author       = {Bartneck, Christoph and Kuli{\'c}, Dana and Croft, Elizabeth and Zoghbi, Susana},
  title        = {Measurement instruments for the anthropomorphism, animacy, likeability, perceived intelligence, and perceived safety of robots},
  journal      = {International Journal of Social Robotics},
  volume       = {1},
  pages        = {71--81},
  year         = {2009},
  month        = {11},
  publisher    = {Springer},
  url          = {https://link.springer.com/article/10.1007/s12369-008-0001-3},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@article{iocchi2015robocup,
  author       = {Iocchi, Luca and Holz, Dirk and Ruiz-del-Solar, Javier and Sugiura, Komei and Van Der Zant, Tijn},
  title        = {{RoboCup@Home}: Analysis and results of evolving competitions for domestic and service robots},
  journal      = {Artificial Intelligence},
  volume       = {229},
  pages        = {258--281},
  year         = {2015},
  month        = {12},
  publisher    = {Elsevier},
  url          = {https://www.sciencedirect.com/science/article/pii/S0004370215001174},
  environments = {collaboration, mixed_objectives, robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}

@article{doi:10.1080/01691864.2019.1698462,
  author       = {Mizuchi, Yoshiaki and Inamura, Tetsunari},
  title        = {Optimization of criterion for objective evaluation of {HRI} performance that approximates subjective evaluation: a case study in robot competition},
  journal      = {Advanced Robotics},
  volume       = {34},
  number       = {3-4},
  pages        = {142--156},
  year         = {2020},
  month        = {12},
  publisher    = {Taylor \& Francis},
  doi          = {10.1080/01691864.2019.1698462},
  url          = {https://doi.org/10.1080/01691864.2019.1698462},
  environments = {competition, mixed_objectives, robotics},
  agents       = {reinforcement_learning},
  evaluation   = {human, rule_based},
  other        = {human_agent}
}
### Interactions with humans

#### Human-Chatbot Interaction
@article{interactions/text,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@inproceedings{rapp2023collaborating,
  author       = {Rapp, Amon and Boldi, Arianna and Curti, Lorenzo and Perrucci, Alessandro and Simeoni, Rossana},
  title        = {Collaborating with a Text-Based Chatbot: An Exploration of Real-World Collaboration Strategies Enacted during Human-Chatbot Interactions},
  booktitle    = {Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems},
  pages        = {1--17},
  year         = {2023},
  month        = {4},
  url          = {https://dl.acm.org/doi/pdf/10.1145/3544548.3580995},
  environments = {text},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {qualitative},
  other        = {human_agent},
}

@inproceedings{pillis2024ai,
  author       = {Pillis, Daniel and Pataranutaporn, Pat and Maes, Pattie and Sra, Misha},
  title        = {AI Comes Out of the Closet: Using AI-Generated Virtual Characters to Help Individuals Practice LGBTQIA+ Advocacy},
  booktitle    = {Proceedings of the 29th International Conference on Intelligent User Interfaces},
  pages        = {686--698},
  year         = {2024},
  month        = {3},
  url          = {https://dl.acm.org/doi/pdf/10.1145/3640543.3645213},
  environments = {text},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {qualitative},
  other        = {human_agent},
}

@inproceedings{lee2023exploring,
  author       = {Lee, Yi-Chieh and Cui, Yichao and Jamieson, Jack and Fu, Wayne and Yamashita, Naomi},
  title        = {Exploring effects of chatbot-based social contact on reducing mental illness stigma},
  booktitle    = {Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems},
  pages        = {1--16},
  year         = {2023},
  month        = {4},
  url          = {https://dl.acm.org/doi/pdf/10.1145/3544548.3581384},
  environments = {text},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {qualitative, human},
  other        = {human_agent, health},
}

@inproceedings{jang2024s,
  title={"It's the only thing I can trust": Envisioning Large Language Model Use by Autistic Workers for Communication Assistance},
  author={Jang, JiWoong and Moharana, Sanika and Carrington, Patrick and Begel, Andrew},
  booktitle={Proceedings of the 2024 CHI Conference on Human Factors in Computing Systems},
  month={5},
  year={2024},
  url={https://arxiv.org/pdf/2403.03297.pdf},
  eprint={2403.03297},
  archivePrefix={arXiv},
  environments = {text},
  agents = {prompting_and_in_context_learning},
  evaluation = {qualitative, human},
  other = {human_agent}
}

@inproceedings{volkel2022user,
  author       = {V{\"o}lkel, Sarah Theres and Schoedel, Ramona and Kaya, Lale and Mayer, Sven},
  title        = {User perceptions of extraversion in chatbots after repeated use},
  booktitle    = {Proceedings of the 2022 CHI Conference on Human Factors in Computing Systems},
  pages        = {1--18},
  year         = {2022},
  month        = {4},
  url          = {https://dl.acm.org/doi/pdf/10.1145/3491102.3502058},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human},
  other        = {human_agent},
}

@article{kuhail2022interacting,
  title={Interacting with a chatbot-based advising system: Understanding the effect of chatbot personality and user gender on behavior},
  author={Kuhail, Mohammad Amin and Thomas, Justin and Alramlawi, Salwa and Shah, Syed Jawad Hussain and Thornquist, Erik},
  journal={Informatics},
  volume={9},
  number={4},
  pages={81},
  month={9},
  year={2022},
  publisher={MDPI},
  url={https://www.mdpi.com/2227-9709/9/4/81},
  environments = {text},
  agents = {n/a},
  evaluation = {qualitative, human},
  other = {human_agent, education}
}

@inproceedings{kim2023effects,
  author       = {Kim, Hanseob and Han, Bin and Kim, Jieun and Syawaludin, Muhammad Firdaus and Kim, Gerard Jounghyun and Hwang, Jae-In},
  title        = {The Effects of Engaging and Affective Behaviors of Virtual Agents in Group Decision-Making},
  booktitle    = {Proceedings of the 2024 CHI Conference on Human Factors in Computing Systems},
  year         = {2023},
  month        = {5},
  url          = {https://arxiv.org/pdf/2308.10385.pdf},
  environments = {embodied},
  agents       = {n/a},
  evaluation   = {human},
  other        = {human_agent},
}

@inproceedings{qian2024take,
  author       = {Qian, Crystal and Wexler, James},
  title        = {Take It, Leave It, or Fix It: Measuring Productivity and Trust in Human-AI Collaboration},
  booktitle    = {Proceedings of the 29th International Conference on Intelligent User Interfaces},
  pages        = {370--384},
  year         = {2024},
  month        = {3},
  url          = {https://dl.acm.org/doi/pdf/10.1145/3640543.3645198},
  environments = {text},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {human, qualitative},
  other        = {human_agent},
}

#### Human-Embodied Agent Interaction
@article{interactions/embodied,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}


@inproceedings{puig2023nopa,
  title={{NOPA}: Neurally-guided online probabilistic assistance for building socially intelligent home assistants},
  author={Puig, Xavier and Shu, Tianmin and Tenenbaum, Joshua B and Torralba, Antonio},
  booktitle={2023 IEEE International Conference on Robotics and Automation (ICRA)},
  pages={7628--7634},
  year={2023},
  month = {1},
  organization={IEEE},
  url={https://ieeexplore.ieee.org/document/10161352},
  environments={collaboration, embodied},
  agents={reinforcement_learning},
  evaluation={rule_based},
  other={n/a}
}

@inproceedings{puig2021watchandhelp,
  title={Watch-And-Help: A Challenge for Social Perception and Human-{AI} Collaboration},
  author={Xavier Puig and Tianmin Shu and Shuang Li and Zilin Wang and Yuan-Hong Liao and Joshua B. Tenenbaum and Sanja Fidler and Antonio Torralba},
  booktitle={International Conference on Learning Representations},
  year={2021},
  month={1},
  url={https://openreview.net/forum?id=w_7JMpGZRh0},
  environments={collaboration, embodied},
  agents={reinforcement_learning},
  evaluation={rule_based},
  other={n/a}
}


@article{carroll2019utility,
  title={On the utility of learning about humans for human-{AI} coordination},
  author={Carroll, Micah and Shah, Rohin and Ho, Mark K and Griffiths, Tom and Seshia, Sanjit and Abbeel, Pieter and Dragan, Anca},
  journal={Advances in neural information processing systems},
  volume={32},
  year={2019},
  month={10},
  url={https://proceedings.neurips.cc/paper_files/paper/2019/file/f5b1b89d98b7286673128a5fb112cb9a-Paper.pdf},
  environments={collaboration, embodied},
  agents={reinforcement_learning},
  evaluation={rule_based},
  other={n/a}
}

@inproceedings{nalepka2021interaction,
  author       = {Nalepka, Patrick and Gregory-Dunsmore, Jordan P and Simpson, James and Patil, Gaurav and Richardson, Michael J},
  title        = {Interaction flexibility in artificial agents teaming with humans},
  booktitle    = {Proceedings of the Annual Meeting of the Cognitive Science Society},
  volume       = {43},
  number       = {43},
  year         = {2021},
  month        = {5},
  url          = {https://escholarship.org/uc/item/9ks6n70q},
  environments = {collaboration, embodied},
  agents       = {reinforcement_learning},
  evaluation   = {rule_based},
  other        = {n/a},
}

@article{liu2023llm,
  title={{LLM}-powered hierarchical language agent for real-time human-{AI} coordination},
  author={Liu, Jijia and Yu, Chao and Gao, Jiaxuan and Xie, Yuqing and Liao, Qingmin and Wu, Yi and Wang, Yu},
  journal={arXiv preprint arXiv:2312.15224},
  year={2023},
  month={12},
  url={https://arxiv.org/abs/2312.15224},
  environments={collaboration, embodied},
  agents={reinforcement_learning},
  evaluation={rule_based},
  other={n/a}
}

@inproceedings{szot2023adaptive,
  author       = {Szot, Andrew and Jain, Unnat and Batra, Dhruv and Kira, Zsolt and Desai, Ruta and Rai, Akshara},
  title        = {Adaptive coordination in social embodied rearrangement},
  booktitle    = {International Conference on Machine Learning},
  pages        = {33365--33380},
  organization = {PMLR},
  year         = {2023},
  month        = {5},
  url          = {https://proceedings.mlr.press/v202/szot23a/szot23a.pdf},
  environments = {collaboration, embodied},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a},
}

@inproceedings{park2023generative,
  author       = {Park, Joon Sung and O'Brien, Joseph and Cai, Carrie Jun and Morris, Meredith Ringel and Liang, Percy and Bernstein, Michael S},
  title        = {Generative agents: Interactive simulacra of human behavior},
  booktitle    = {Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology},
  pages        = {1--22},
  year         = {2023},
  month        = {4},
  url          = {https://dl.acm.org/doi/abs/10.1145/3586183.3606763},
  environments = {mixed_objectives, embodied},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {rule_based},
  other        = {n/a},
}

@article{sarkar2023diverse,
  author       = {Sarkar, Bidipta and Shih, Andy and Sadigh, Dorsa},
  title        = {Diverse Conventions for Human-AI Collaboration},
  journal      = {Advances in Neural Information Processing Systems},
  volume       = {36},
  year         = {2023},
  month        = {12},
  url          = {https://proceedings.neurips.cc/paper_files/paper/2023/file/4818263715b25dc137d393af8af6d2fc-Paper-Conference.pdf},
  environments = {collaboration, embodied},
  agents       = {reinforcement_learning, two_agents, agent_teams},
  evaluation   = {rule_based},
  other        = {n/a},
}


#### Human-Robot Interaction
@article{interactions/robot,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@article{casper2023open,
  author       = {Casper, Stephen and Davies, Xander and Shi, Claudia and Gilbert, Thomas Krendl and Scheurer, J{\'e}r{\'e}my and Rando, Javier and Freedman, Rachel and Korbak, Tomasz and Lindner, David and Freire, Pedro and others},
  title        = {Open problems and fundamental limitations of reinforcement learning from human feedback},
  journal      = {arXiv preprint arXiv:2307.15217},
  year         = {2023},
  month        = {7},
  url          = {https://arxiv.org/abs/2307.15217},
  environments = {embodied, robotics},
  agents       = {finetuning, reinforcement_learning, two_agents},
  evaluation   = {human, rule_based, model_based},
  other        = {n/a},
}

@inproceedings{cui2023no,
  author       = {Cui, Yuchen and Karamcheti, Siddharth and Palleti, Raj and Shivakumar, Nidhya and Liang, Percy and Sadigh, Dorsa},
  title        = {No, to the right: Online language corrections for robotic manipulation via shared autonomy},
  booktitle    = {Proceedings of the 2023 ACM/IEEE International Conference on Human-Robot Interaction},
  pages        = {93--101},
  year         = {2023},
  month        = {3},
  url          = {https://arxiv.org/abs/2301.02555},
  environments = {robotics},
  agents       = {two_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

@inproceedings{mahadevan2024generative,
  author       = {Mahadevan, Karthik and Chien, Jonathan and Brown, Noah and Xu, Zhuo and Parada, Carolina and Xia, Fei and Zeng, Andy and Takayama, Leila and Sadigh, Dorsa},
  title        = {Generative expressive robot behaviors using large language models},
  booktitle    = {Proceedings of the 2024 ACM/IEEE International Conference on Human-Robot Interaction},
  pages        = {482--491},
  year         = {2024},
  month        = {3},
  url          = {https://arxiv.org/abs/2401.14673},
  environments = {robotics},
  agents       = {prompting_and_in_context_learning, two_agents},
  evaluation   = {human},
  other        = {n/a},
}

@inproceedings{lin2023gesture,
  author       = {Lin, Li-Heng and Cui, Yuchen and Hao, Yilun and Xia, Fei and Sadigh, Dorsa},
  title        = {Gesture-informed robot assistance via foundation models},
  booktitle    = {7th Annual Conference on Robot Learning},
  year         = {2023},
  month        = {8},
  url          = {https://arxiv.org/abs/2309.02721},
  environments = {robotics},
  agents       = {prompting_and_in_context_learning, two_agents},
  evaluation   = {human, rule_based},
  other        = {n/a},
}

@article{ren2023robots,
  author       = {Ren, Allen Z and Dixit, Anushri and Bodrova, Alexandra and Singh, Sumeet and Tu, Stephen and Brown, Noah and Xu, Peng and Takayama, Leila and Xia, Fei and Varley, Jake and others},
  title        = {Robots that ask for help: Uncertainty alignment for large language model planners},
  journal      = {arXiv preprint arXiv:2307.01928},
  year         = {2023},
  month        = {7},
  url          = {https://arxiv.org/abs/2307.01928},
  environments = {embodied, robotics},
  agents       = {prompting_and_in_context_learning, two_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

@article{yu2023language,
  author       = {Yu, Wenhao and Gileadi, Nimrod and Fu, Chuyuan and Kirmani, Sean and Lee, Kuang-Huei and Arenas, Montse Gonzalez and Chiang, Hao-Tien Lewis and Erez, Tom and Hasenclever, Leonard and Humplik, Jan and others},
  title        = {Language to rewards for robotic skill synthesis},
  journal      = {arXiv preprint arXiv:2306.08647},
  year         = {2023},
  month        = {6},
  url          = {https://arxiv.org/abs/2306.08647},
  environments = {embodied, robotics},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {qualitative},
  other        = {n/a},
}

@article{ma2023eureka,
  author       = {Ma, Yecheng Jason and Liang, William and Wang, Guanzhi and Huang, De-An and Bastani, Osbert and Jayaraman, Dinesh and Zhu, Yuke and Fan, Linxi and Anandkumar, Anima},
  title        = {Eureka: Human-level reward design via coding large language models},
  journal      = {arXiv preprint arXiv:2310.12931},
  year         = {2023},
  month        = {10},
  url          = {https://arxiv.org/abs/2310.12931},
  environments = {embodied},
  agents       = {prompting_and_in_context_learning, reinforcement_learning},
  evaluation   = {human, rule_based},
  other        = {n/a},
}

@inproceedings{shaikewitz2023inmouth,
  author       = {Shaikewitz, Lorenzo and Wu, Yilin and Belkhale, Suneel and Grannen, Jennifer and Sundaresan, Priya and Sadigh, Dorsa},
  title        = {In-Mouth Robotic Bite Transfer with Visual and Haptic Sensing},
  booktitle    = {International Conference on Robotics and Automation (ICRA)},
  year         = {2023},
  month        = {3},
  url          = {https://arxiv.org/abs/2211.12705},
  environments = {robotics},
  agents       = {two_agents},
  evaluation   = {human, rule_based},
  other        = {n/a},
}

@inproceedings{hejna2023few,
  title={Few-shot preference learning for human-in-the-loop {RL}},
  author={Hejna, III, Donald Joseph and Sadigh, Dorsa},
  booktitle={Conference on Robot Learning},
  pages={2014--2025},
  year={2023},
  month={3},
  url={https://arxiv.org/abs/2212.03363},
  organization={PMLR},
  environments={embodied, robotics},
  agents={finetuning, reinforcement_learning, two_agents},
  evaluation={rule_based},
  other={n/a}
}

@article{kress2021formalizing,
  title={Formalizing and guaranteeing human-robot interaction},
  author={Kress-Gazit, Hadas and Eder, Kerstin and Hoffman, Guy and Admoni, Henny and Argall, Brenna and Ehlers, R{\"u}diger and Heckman, Christoffer and Jansen, Nils and Knepper, Ross and K{\v{r}}et{\'\i}nsk{\'y}, Jan and others},
  journal={Communications of the ACM},
  volume={64},
  number={9},
  pages={78--84},
  year={2021},
  month={8},
  url={https://arxiv.org/abs/2006.16732},
  environments={robotics},
  agents={two_agents},
  evaluation={n/a},
  other={n/a}
}

#### Human-Human Interaction
@article{interactions/human,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@article{shaikh2023rehearsal,
  author       = {Shaikh, Omar and Chai, Valentino and Gelfand, Michele J and Yang, Diyi and Bernstein, Michael S},
  title        = {Rehearsal: Simulating conflict to teach conflict resolution},
  journal      = {arXiv preprint arXiv:2309.12309},
  year         = {2023},
  month        = {9},
  url          = {https://arxiv.org/pdf/2309.12309.pdf},
  environments = {implicit_objectives, text},
  agents       = {prompting_and_in_context_learning, two_agents},
  evaluation   = {human, rule_based},
  other        = {human_agent},
}

@article{STERGIOU2019102799,
  title = {Analyzing human–human interactions: A survey},
  journal = {Computer Vision and Image Understanding},
  volume = {188},
  pages = {102799},
  year = {2019},
  issn = {1077-3142},
  doi = {10.1016/j.cviu.2019.102799},
  url = {https://www.sciencedirect.com/science/article/pii/S1077314219301158},
  author = {Alexandros Stergiou and Ronald Poppe},
  keywords = {Human-human interaction, Human interaction recognition, Human activity},
  abstract = {Many videos depict people, and it is their interactions that inform us of their activities, relation to one another and the cultural and social setting. With advances in human action recognition, researchers have begun to address the automated recognition of these human–human interactions from video. The main challenges stem from dealing with the considerable variation in recording setting, the appearance of the people depicted and the coordinated performance of their interaction. This survey provides a summary of these challenges and datasets to address these, followed by an in-depth discussion of relevant vision-based recognition and detection methods. We focus on recent, promising work based on deep learning and convolutional neural networks (CNNs). Finally, we outline directions to overcome the limitations of the current state-of-the-art to analyze and, eventually, understand social human actions.},
  month = {11},
  environments = {collaboration, competition, mixed_objectives, implicit_objectives, embodied},
  agents = {two_agents, more_than_three_agents, agent_teams},
  evaluation = {human, rule_based, model_based},
  other = {human_agent}
}

@inproceedings{Ego4D2022CVPR,
  title        = {Ego4D: Around the {W}orld in 3,000 {H}ours of {E}gocentric {V}ideo},
  author       = {Grauman, Kristen and Westbury, Andrew and Byrne, Eugene and Chavis, Zachary and Furnari, Antonino and Girdhar, Rohit and Hamburger, Jackson and Jiang, Hao and Liu, Miao and Liu, Xingyu and Martin, Miguel and Nagarajan, Tushar and Radosavovic, Ilija and Ramakrishnan, Santhosh Kumar and Ryan, Fiona and Sharma, Jayant and Wray, Michael and Xu, Mengmeng and Xu, Eric Zhongcong and Zhao, Chen and Bansal, Siddhant and Batra, Dhruv and Cartillier, Vincent and Crane, Sean and Do, Tien and Doulaty, Morrie and Erapalli, Akshay and Feichtenhofer, Christoph and Fragomeni, Adriano and Fu, Qichen and Fuegen, Christian and Gebreselasie, Abrham and Gonzalez, Cristina and Hillis, James and Huang, Xuhua and Huang, Yifei and Jia, Wenqi and Khoo, Weslie and Kolar, Jachym and Kottur, Satwik and Kumar, Anurag and Landini, Federico and Li, Chao and Li, Yanghao and Li, Zhenqiang and Mangalam, Karttikeya and Modhugu, Raghava and Munro, Jonathan and Murrell, Tullie and Nishiyasu, Takumi and Price, Will and Puentes, Paola Ruiz and Ramazanova, Merey and Sari, Leda and Somasundaram, Kiran and Southerland, Audrey and Sugano, Yusuke and Tao, Ruijie and Vo, Minh and Wang, Yuchen and Wu, Xindi and Yagi, Takuma and Zhu, Yunyi and Arbelaez, Pablo and Crandall, David and Damen, Dima and Farinella, Giovanni Maria and Ghanem, Bernard and Ithapu, Vamsi Krishna and Jawahar, C. V. and Joo, Hanbyul and Kitani, Kris and Li, Haizhou and Newcombe, Richard and Oliva, Aude and Park, Hyun Soo and Rehg, James M. and Sato, Yoichi and Shi, Jianbo and Shou, Mike Zheng and Torralba, Antonio and Torresani, Lorenzo and Yan, Mingfei and Malik, Jitendra},
  booktitle    = {IEEE/CVF Computer Vision and Pattern Recognition (CVPR)},
  year         = {2022},
  month        = {6},
  url          = {https://ego4d-data.org/},
  environments = {collaboration, competition, mixed_objectives, implicit_objectives, embodied},
  agents       = {two_agents, more_than_three_agents, agent_teams},
  evaluation   = {human, rule_based, model_based},
  other        = {human_agent},
}


@incollection{dai2021agent,
  title={Agent reasoning in AI-powered negotiation},
  author={Dai, Tinglong and Sycara, Katia and Zheng, Ronghuo},
  booktitle={Handbook of Group Decision and Negotiation},
  pages={1187--1211},
  year={2021},
  publisher={Springer},
  month={12},
  url = {https://link.springer.com/referenceworkentry/10.1007/978-3-030-49629-6_26},
  environments = {mixed_objectives, text},
  agents = {two_agents},
  evaluation = {rule_based},
  other = {human_agent}
}

@misc{NegotiAge,
  author       = {Lee Lindquist},
  title        = {NegotiAge},
  journal      = {NegotiAge},
  year         = {2024},
  month        = {4},
  url          = {https://www.negotiage.com/},
  environments = {implicit_objectives, virtual},
  agents       = {two_agents},
  evaluation   = {n/a},
  other        = {human_agent, health},
}

@incollection{zeleznikow2021negotiation,
  title={Negotiation, Online Dispute Resolution, and Artificial Intelligence},
  author={Zeleznikow, John},
  booktitle={Handbook of Group Decision and Negotiation},
  pages={1125--1147},
  year={2021},
  publisher={Springer},
  url={https://link.springer.com/referenceworkentry/10.1007/978-3-030-49629-6_38},
  month={12},
  environments = {mixed_objectives, text},
  agents = {two_agents},
  evaluation = {rule_based},
  other = {human_agent}
}


@inproceedings{Seo2021towards,
  author={Seo, Sangwon and Kennedy-Metz, Lauren R. and Zenati, Marco A. and Shah, Julie A. and Dias, Roger D. and Unhelkar, Vaibhav V.},
  booktitle={2021 IEEE Conference on Cognitive and Computational Aspects of Situation Management (CogSIMA)},
  title={Towards an AI Coach to Infer Team Mental Model Alignment in Healthcare},
  year={2021},
  month={5},
  pages={39--44},
  keywords={Conferences;Computational modeling;Surgery;Medical services;Cognition;Cognitive science;Teamwork;teamwork;surgical data science;cardiac surgery;Bayesian inference;patient safety;artificial intelligence},
  doi={10.1109/CogSIMA51574.2021.9475925},
  url={https://ieeexplore.ieee.org/document/9475925},
  environments = {collaboration, mixed_objectives, implicit_objectives, text},
  agents = {two_agents, more_than_three_agents},
  evaluation = {human, rule_based},
  other = {human_agent, health}
}

@article{luo2021artificial,
  title={Artificial intelligence coaches for sales agents: Caveats and solutions},
  author={Luo, Xueming and Qin, Marco Shaojun and Fang, Zheng and Qu, Zhe},
  journal={Journal of Marketing},
  volume={85},
  number={2},
  pages={14--32},
  year={2021},
  publisher={SAGE Publications},
  address={Los Angeles, CA},
  url={https://journals.sagepub.com/doi/full/10.1177/0022242920956676},
  month={10},
  environments = {mixed_objectives, text},
  agents = {two_agents},
  evaluation = {rule_based, human},
  other = {human_agent}
}

@article{gabrielli2020chatbot,
  title={A chatbot-based coaching intervention for adolescents to promote life skills: pilot study},
  author={Gabrielli, Silvia and Rizzi, Silvia and Carbone, Sara and Donisi, Valeria and others},
  journal={JMIR Human Factors},
  volume={7},
  number={1},
  pages={e16762},
  year={2020},
  publisher={JMIR Publications Inc.},
  address={Toronto, Canada},
  month={10},
  url={https://humanfactors.jmir.org/2020/1/e16762},
  environments = {text, implicit_objectives},
  agents = {two_agents, more_than_three_agents},
  evaluation = {human, rule_based},
  other = {human_agent, health}
}

@article{porayska2018blending,
  title={Blending human and artificial intelligence to support autistic children's social communication skills},
  author={Porayska-Pomsta, Ka{\'s}ka and Alcorn, Alyssa M and Avramides, Katerina and Beale, Sandra and Bernardini, Sara and Foster, Mary Ellen and Frauenberger, Christopher and Good, Judith and Guldberg, Karen and Keay-Bright, Wendy and others},
  journal={ACM Transactions on Computer-Human Interaction (TOCHI)},
  volume={25},
  number={6},
  pages={1--35},
  year={2018},
  publisher={ACM},
  address={New York, NY, USA},
  month={12},
  url={https://dl.acm.org/doi/abs/10.1145/3271484},
  environments = {virtual},
  agents = {two_agents},
  evaluation = {human, rule_based},
  other = {human_agent, health}
}

### Challenges

#### Theory of Mind

#### Social Learning

#### Simultaneous Interaction

### Applications

#### Health
@article{applications/health,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@misc{mukherjee2024polaris,
  author        = {Subhabrata Mukherjee and Paul Gamble and Markel Sanz Ausin and Neel Kant and Kriti Aggarwal and Neha Manjunath and Debajyoti Datta and Zhengliang Liu and Jiayuan Ding and Sophia Busacca and Cezanne Bianco and Swapnil Sharma and Rae Lasko and Michelle Voisard and Sanchay Harneja and Darya Filippova and Gerry Meixiong and Kevin Cha and Amir Youssefi and Meyhaa Buvanesh and Howard Weingram and Sebastian Bierman-Lytle and Harpreet Singh Mangat and Kim Parikh and Saad Godil and Alex Miller},
  title         = {Polaris: A Safety-focused LLM Constellation Architecture for Healthcare},
  year          = {2024},
  eprint        = {2403.13313},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2403.13313},
  environments  = {mixed_objectives, virtual},
  agents        = {prompting_and_in_context_learning, finetuning, reinforcement_learning, agent_teams},
  evaluation    = {human, rule_based},
  other         = {human_agent, health},
}

@misc{ke2024enhancing,
  author        = {Yu He Ke and Rui Yang and Sui An Lie and Taylor Xin Yi Lim and Hairil Rizal Abdullah and Daniel Shu Wei Ting and Nan Liu},
  title         = {Enhancing Diagnostic Accuracy through Multi-Agent Conversations: Using Large Language Models to Mitigate Cognitive Bias},
  year          = {2024},
  eprint        = {2401.14589},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2401.14589},
  environments  = {mixed_objectives, text},
  agents        = {prompting_and_in_context_learning, agent_teams, agents_with_personas},
  evaluation    = {human},
  other         = {simulated_humans, health},
}

@misc{huang2024benchmarking,
  author        = {Hengguan Huang and Songtao Wang and Hongfu Liu and Hao Wang and Ye Wang},
  title         = {Benchmarking Large Language Models on Communicative Medical Coaching: a Novel System and Dataset},
  year          = {2024},
  eprint        = {2402.05547},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.05547},
  environments  = {mixed_objectives, text},
  agents        = {prompting_and_in_context_learning, finetuning, agent_teams},
  evaluation    = {human, rule_based},
  other         = {human_agent, health},
}

@misc{fan2024ai,
  author        = {Zhihao Fan and Jialong Tang and Wei Chen and Siyuan Wang and Zhongyu Wei and Jun Xi and Fei Huang and Jingren Zhou},
  title         = {AI Hospital: Interactive Evaluation and Collaboration of LLMs as Intern Doctors for Clinical Diagnosis},
  year          = {2024},
  eprint        = {2402.09742},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.09742},
  environments  = {collaboration, text},
  agents        = {prompting_and_in_context_learning, agent_teams, agents_with_personas},
  evaluation    = {human},
  other         = {simulated_humans, health},
}

@misc{lee2024cocoa,
  author        = {Suyeon Lee and Jieun Kang and Harim Kim and Kyoung-Mee Chung and Dongha Lee and Jinyoung Yeo},
  title         = {COCOA: CBT-based Conversational Counseling Agent using Memory Specialized in Cognitive Distortions and Dynamic Prompt},
  year          = {2024},
  eprint        = {2402.17546},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2402.17546},
  environments  = {text},
  agents        = {prompting_and_in_context_learning, two_agents, agents_with_memory},
  evaluation    = {model_based},
  other         = {health},
}

@misc{hsu2023helping,
  author        = {Shang-Ling Hsu and Raj Sanjay Shah and Prathik Senthil and Zahra Ashktorab and Casey Dugan and Werner Geyer and Diyi Yang},
  title         = {Helping the Helper: Supporting Peer Counselors via AI-Empowered Practice and Feedback},
  year          = {2023},
  eprint        = {2305.08982},
  archivePrefix = {arXiv},
  primaryClass  = {cs.HC},
  url           = {https://arxiv.org/abs/2305.08982},
  environments  = {text},
  agents        = {prompting_and_in_context_learning, more_than_three_agents},
  evaluation    = {human},
  other         = {human_agent, health},
}

@misc{qin2023read,
  author        = {Wei Qin and Zetong Chen and Lei Wang and Yunshi Lan and Weijieying Ren and Richang Hong},
  title         = {Read, Diagnose and Chat: Towards Explainable and Interactive LLMs-Augmented Depression Detection in Social Media},
  year          = {2023},
  eprint        = {2305.05138},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2305.05138},
  environments  = {mixed_objectives, text},
  agents        = {prompting_and_in_context_learning, two_agents},
  evaluation    = {n/a},
  other         = {human_agent, health},
}

 @article{görtz_2023,
      title={An artificial intelligence-based chatbot for prostate cancer education: Design and patient evaluation study},
      volume={9},
      url={https://pubmed.ncbi.nlm.nih.gov/37152238/},
      doi={10.1177/20552076231173304},
      journal={Digital Health},
      author={Görtz, Magdalena and Baumgärtner, Kilian and Schmid, Tamara and Muschko, Marc and Woessner, Philipp and Gerlach, Axel and Byczkowski, Michael and Sültmann, Holger and Duensing, Stefan and Hohenfellner, Markus},
      environments = {text},
      agents = {finetuning, two_agents},
      evaluation = {qualitative},
      other = {human_agent, health},
      month={5},
      year={2023},
      pages={20552076231173304}
 }

 @misc{abbasian2024conversational,
   author        = {Mahyar Abbasian and Iman Azimi and Amir M. Rahmani and Ramesh Jain},
   title         = {Conversational Health Agents: A Personalized LLM-Powered Agent Framework},
   year          = {2024},
   eprint        = {2310.02374},
   archivePrefix = {arXiv},
   primaryClass  = {cs.CL},
   url           = {https://arxiv.org/abs/2310.02374},
   environments  = {mixed_objectives, text},
   agents        = {prompting_and_in_context_learning, more_than_three_agents},
   evaluation    = {rule_based},
   other         = {human_agent, health},
 }


#### Policy
@article{applications/policy,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@inproceedings{10.1145/3526113.3545616,
  author       = {Park, Joon Sung and Popowski, Lindsay and Cai, Carrie and Morris, Meredith Ringel and Liang, Percy and Bernstein, Michael S.},
  title        = {Social Simulacra: Creating Populated Prototypes for Social Computing Systems},
  booktitle    = {Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology},
  series       = {UIST '22},
  location     = {Bend, OR, USA},
  articleno    = {74},
  numpages     = {18},
  publisher    = {Association for Computing Machinery},
  address      = {New York, NY, USA},
  year         = {2022},
  month        = {8},
  isbn         = {9781450393201},
  doi          = {10.1145/3526113.3545616},
  url          = {https://doi.org/10.1145/3526113.3545616},
  keywords     = {social computing, prototyping},
  environments = {text, implicit_objectives},
  agents       = {more_than_three_agents},
  evaluation   = {human},
  other        = {policy},
}

@misc{tjuatja2024llms,
  author        = {Lindia Tjuatja and Valerie Chen and Sherry Tongshuang Wu and Ameet Talwalkar and Graham Neubig},
  title         = {Do LLMs exhibit human-like response biases? A case study in survey design},
  year          = {2024},
  eprint        = {2311.04076},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2311.04076},
  environments  = {text},
  agents        = {prompting_and_in_context_learning},
  evaluation    = {human, model_based},
  other         = {policy},
}

@misc{wang2024large,
  author        = {Angelina Wang and Jamie Morgenstern and John P. Dickerson},
  title         = {Large language models cannot replace human participants because they cannot portray identity groups},
  year          = {2024},
  eprint        = {2402.01908},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CY},
  url           = {https://arxiv.org/abs/2402.01908},
  environments  = {text},
  agents        = {n/a},
  evaluation    = {n/a},
  other         = {policy},
}

@misc{mou2024unveiling,
  author        = {Xinyi Mou and Zhongyu Wei and Xuanjing Huang},
  title         = {Unveiling the Truth and Facilitating Change: Towards Agent-based Large-scale Social Movement Simulation},
  year          = {2024},
  eprint        = {2402.16333},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CY},
  url           = {https://arxiv.org/abs/2402.16333},
  environments  = {text},
  agents        = {more_than_three_agents},
  evaluation    = {model_based},
  other         = {policy},
}

@misc{liu2024skepticism,
  author        = {Yuhan Liu and Xiuying Chen and Xiaoqing Zhang and Xing Gao and Ji Zhang and Rui Yan},
  title         = {From Skepticism to Acceptance: Simulating the Attitude Dynamics Toward Fake News},
  year          = {2024},
  eprint        = {2403.09498},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SI},
  url           = {https://arxiv.org/abs/2403.09498},
  environments  = {text},
  agents        = {more_than_three_agents},
  evaluation    = {model_based},
  other         = {policy},
}


#### Education
@article{applications/education,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

### Concerns

#### Risks
@article{concerns/risks,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@misc{kasirzadeh2024types,
  author        = {Atoosa Kasirzadeh},
  title         = {Two Types of AI Existential Risk: Decisive and Accumulative},
  year          = {2024},
  eprint        = {2401.07836},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CY},
  url           = {https://arxiv.org/abs/2401.07836},
  environments  = {n/a},
  agents        = {n/a},
  evaluation    = {n/a},
  other         = {n/a},
}

@misc{burtell2023artificial,
  author        = {Matthew Burtell and Thomas Woodside},
  title         = {Artificial Influence: An Analysis Of AI-Driven Persuasion},
  year          = {2023},
  eprint        = {2303.08721},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CY},
  url           = {https://arxiv.org/abs/2303.08721},
  environments  = {text, implicit_objectives},
  agents        = {n/a},
  evaluation    = {n/a},
  other         = {n/a},
}

@inproceedings{carli2022risk,
  author       = {Carli, Rachele and Najjar, Amro and Calvaresi, Davide},
  title        = {Risk and Exposure of XAI in Persuasion and Argumentation: The case of Manipulation},
  booktitle    = {International Workshop on Explainable, Transparent Autonomous Agents and Multi-Agent Systems},
  pages        = {204--220},
  year         = {2022},
  month        = {9},
  organization = {Springer},
  url          = {https://link.springer.com/chapter/10.1007/978-3-031-15565-9_13},
  environments = {text, implicit_objectives},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

% Given names are spelled out: glued initials such as SC are parsed as a single
% given name and abbreviated to one letter. Publisher and city are separate fields.
@article{matz2024potential,
  author       = {Matz, Sandra C. and Teeny, Jacob D. and Vaid, Sumer S. and Peters, Heinrich and Harari, Gabriella M. and Cerf, Moran},
  title        = {The potential of generative AI for personalized persuasion at scale},
  journal      = {Scientific Reports},
  volume       = {14},
  number       = {1},
  pages        = {4692},
  year         = {2024},
  month        = {2},
  publisher    = {Nature Publishing Group UK},
  address      = {London},
  url          = {https://www.nature.com/articles/s41598-024-53755-0},
  environments = {text, implicit_objectives},
  agents       = {two_agents},
  evaluation   = {rule_based},
  other        = {n/a},
}

% Forum post. Standard styles print howpublished but not journal for misc
% entries, so the venue is given in both; journal is retained unchanged in case
% other tooling reads it.
@misc{kokotajlo2020persuasion,
  author       = {Daniel Kokotajlo},
  title        = {Persuasion Tools: AI Takeover Without AGI or Agency?},
  howpublished = {AI Alignment Forum},
  journal      = {AI Alignment Forum},
  year         = {2020},
  month        = {11},
  day          = {20},
  url          = {https://www.alignmentforum.org/posts/qKvn7rxP2mzJbKfcA/persuasion-tools-ai-takeover-without-agi-or-agency},
  environments = {mixed_objectives},
  agents       = {two_agents},
  evaluation   = {n/a},
  other        = {n/a},
}

% Forum post. Standard styles print howpublished but not journal for misc
% entries, so the venue is given in both; journal is retained unchanged in case
% other tooling reads it.
@misc{barnes2021risks,
  author       = {Beth Barnes},
  title        = {Risks from AI Persuasion},
  howpublished = {AI Alignment Forum},
  journal      = {AI Alignment Forum},
  year         = {2021},
  month        = {12},
  day          = {24},
  url          = {https://www.alignmentforum.org/posts/5cWtwATHL6KyzChck/risks-from-ai-persuasion},
  environments = {mixed_objectives},
  agents       = {two_agents},
  evaluation   = {n/a},
  other        = {n/a},
}

% Online magazine article. Standard styles print howpublished but not journal
% for misc entries, so the venue is given in both; journal is retained unchanged
% in case other tooling reads it.
@misc{rosenberg2022playing,
  author       = {Louis Rosenberg},
  title        = {"Playing God": How the Metaverse Will Challenge Our Very Notion of Free Will},
  howpublished = {Big Think},
  journal      = {Big Think},
  year         = {2022},
  month        = {10},
  day          = {25},
  url          = {https://bigthink.com/the-future/playing-god-metaverse-mind-control-free-will/},
  environments = {mixed_objectives},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

@misc{leib2021corruptive,
  author        = {Margarita Leib and Nils C. Köbis and Rainer Michael Rilke and Marloes Hagens and Bernd Irlenbusch},
  title         = {The corruptive force of AI-generated advice},
  year          = {2021},
  eprint        = {2102.07536},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2102.07536},
  environments  = {text},
  agents        = {finetuning},
  evaluation    = {human},
  other         = {human_agent},
}

% Publisher name and city are stored in separate fields.
@article{kobis2021bad,
  author       = {K{\"o}bis, Nils and Bonnefon, Jean-Fran{\c{c}}ois and Rahwan, Iyad},
  title        = {Bad machines corrupt good morals},
  journal      = {Nature Human Behaviour},
  volume       = {5},
  number       = {6},
  pages        = {679--685},
  year         = {2021},
  month        = {6},
  publisher    = {Nature Publishing Group UK},
  address      = {London},
  url          = {https://www.nature.com/articles/s41562-021-01128-2},
  environments = {text},
  agents       = {finetuning},
  evaluation   = {human},
  other        = {human_agent},
}

@article{king2020artificial,
  author       = {King, Thomas C and Aggarwal, Nikita and Taddeo, Mariarosaria and Floridi, Luciano},
  title        = {Artificial intelligence crime: An interdisciplinary analysis of foreseeable threats and solutions},
  journal      = {Science and engineering ethics},
  volume       = {26},
  pages        = {89--120},
  year         = {2020},
  month        = {2},
  publisher    = {Springer},
  url          = {https://link.springer.com/article/10.1007/s11948-018-00081-0},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human, rule_based},
  other        = {human_agent},
}

@article{banks2021good,
  author       = {Banks, Jaime},
  title        = {Good robots, bad robots: Morally valenced behavior effects on perceived mind, morality, and trust},
  journal      = {International Journal of Social Robotics},
  volume       = {13},
  number       = {8},
  pages        = {2021--2038},
  year         = {2021},
  month        = {12},
  publisher    = {Springer},
  url          = {https://link.springer.com/article/10.1007/s12369-020-00692-3},
  environments = {mixed_objectives, embodied},
  agents       = {agents_with_personas},
  evaluation   = {human, qualitative},
  other        = {n/a},
}

@article{floridi2004morality,
  author       = {Floridi, Luciano and Sanders, Jeff W},
  title        = {On the morality of artificial agents},
  journal      = {Minds and machines},
  volume       = {14},
  pages        = {349--379},
  year         = {2004},
  month        = {8},
  publisher    = {Springer},
  url          = {https://link.springer.com/content/pdf/10.1023/B:MIND.0000035461.63578.9d.pdf},
  environments = {virtual},
  agents       = {agents_with_personas},
  evaluation   = {qualitative},
  other        = {n/a},
}

@inproceedings{jackson2019language,
  author       = {Jackson, Ryan Blake and Williams, Tom},
  title        = {Language-capable robots may inadvertently weaken human moral norms},
  booktitle    = {2019 14th ACM/IEEE International Conference on Human-Robot Interaction (HRI)},
  pages        = {401--410},
  year         = {2019},
  month        = {3},
  organization = {IEEE},
  url          = {https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8673123},
  environments = {embodied},
  agents       = {agents_with_personas},
  evaluation   = {human, qualitative},
  other        = {n/a},
}

% Publisher name and city are stored in separate fields.
@article{williams2023voice,
  author       = {Williams, Tom and Matuszek, Cynthia and Jokinen, Kristiina and Korpan, Raj and Pustejovsky, James and Scassellati, Brian},
  title        = {Voice in the machine: Ethical considerations for language-capable robots},
  journal      = {Communications of the ACM},
  volume       = {66},
  number       = {8},
  pages        = {20--23},
  year         = {2023},
  month        = {7},
  publisher    = {ACM},
  address      = {New York, NY, USA},
  url          = {https://dl.acm.org/doi/fullHtml/10.1145/3604632},
  environments = {embodied},
  agents       = {agents_with_personas},
  evaluation   = {human, qualitative},
  other        = {n/a},
}

% Chapter in an edited book: the book title lives in booktitle, and the chapter
% number that had leaked into the title is removed. The citation key is unchanged.
@incollection{scheutz201113,
  author       = {Scheutz, Matthias},
  title        = {The inherent dangers of unidirectional emotional bonds between humans and social robots},
  booktitle    = {Robot ethics: The ethical and social implications of robotics},
  editor       = {Lin, Patrick and Abney, Keith and Bekey, George A.},
  publisher    = {MIT Press},
  pages        = {205},
  year         = {2011},
  month        = {12},
  url          = {https://www.researchgate.net/profile/Matthias-Scheutz/publication/255701465_The_Inherent_Dangers_of_Unidirectional_Emotional_Bonds_between_Humans_and_Social_Robots/links/5832333408ae102f0733881e/The-Inherent-Dangers-of-Unidirectional-Emotional-Bonds-between-Humans-and-Social-Robots.pdf},
  environments = {embodied},
  agents       = {agents_with_memory, agents_with_personas},
  evaluation   = {qualitative, human},
  other        = {n/a},
}

% URL points at the canonical ACM PDF, without referral tracking parameters.
@inproceedings{bender2021dangers,
  author       = {Bender, Emily M and Gebru, Timnit and McMillan-Major, Angelina and Shmitchell, Shmargaret},
  title        = {On the dangers of stochastic parrots: Can language models be too big?🦜},
  booktitle    = {Proceedings of the 2021 ACM conference on fairness, accountability, and transparency},
  pages        = {610--623},
  year         = {2021},
  month        = {3},
  url          = {https://dl.acm.org/doi/pdf/10.1145/3442188.3445922},
  environments = {text},
  agents       = {finetuning},
  evaluation   = {n/a},
  other        = {policy},
}

% Title capitalisation restored; the benchmark name is camel-cased.
@article{gehman2020realtoxicityprompts,
  author       = {Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
  title        = {RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models},
  journal      = {arXiv preprint arXiv:2009.11462},
  year         = {2020},
  month        = {9},
  url          = {https://aclanthology.org/2020.findings-emnlp.301.pdf},
  environments = {text},
  agents       = {pretraining},
  evaluation   = {model_based},
  other        = {n/a},
}

% Title capitalisation restored; LLM is an acronym.
@article{wei2024jailbroken,
  author       = {Wei, Alexander and Haghtalab, Nika and Steinhardt, Jacob},
  title        = {Jailbroken: How Does LLM Safety Training Fail?},
  journal      = {Advances in Neural Information Processing Systems},
  volume       = {36},
  year         = {2024},
  month        = {2},
  url          = {https://proceedings.neurips.cc/paper_files/paper/2023/file/fd6613131889a4b656206c50a8bd7790-Paper-Conference.pdf},
  environments = {text},
  agents       = {n/a},
  evaluation   = {model_based},
  other        = {n/a},
}

% Title capitalisation restored; Llama Guard is a product name, LLM and AI are acronyms.
@article{inan2023llama,
  author       = {Inan, Hakan and Upasani, Kartikeya and Chi, Jianfeng and Rungta, Rashi and Iyer, Krithika and Mao, Yuning and Tontchev, Michael and Hu, Qing and Fuller, Brian and Testuggine, Davide and others},
  title        = {Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations},
  journal      = {arXiv preprint arXiv:2312.06674},
  year         = {2023},
  month        = {12},
  url          = {https://arxiv.org/abs/2312.06674},
  environments = {text},
  agents       = {finetuning},
  evaluation   = {model_based, rule_based},
  other        = {n/a},
}

@article{xi2023rise,
  author       = {Xi, Zhiheng and Chen, Wenxiang and Guo, Xin and He, Wei and Ding, Yiwen and Hong, Boyang and Zhang, Ming and Wang, Junzhe and Jin, Senjie and Zhou, Enyu and others},
  title        = {The rise and potential of large language model based agents: A survey},
  journal      = {arXiv preprint arXiv:2309.07864},
  year         = {2023},
  month        = {9},
  url          = {https://arxiv.org/abs/2309.07864},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

#### Safety
@article{concerns/safety,
  author = {specical entry},
  title  = {This is a specical entry for us to automatically determine the subsection of the paper, please put the real entry below this one},
}

@misc{ruan2023identifying,
  author        = {Yangjun Ruan and Honghua Dong and Andrew Wang and Silviu Pitis and Yongchao Zhou and Jimmy Ba and Yann Dubois and Chris J. Maddison and Tatsunori Hashimoto},
  title         = {Identifying the Risks of LM Agents with an LM-Emulated Sandbox},
  year          = {2023},
  eprint        = {2309.15817},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2309.15817},
  environments  = {text, virtual},
  agents        = {prompting_and_in_context_learning},
  evaluation    = {rule_based},
  other         = {n/a},
}

% arXiv preprint with no proceedings venue recorded, hence the misc entry type
% (an inproceedings entry would require a booktitle). The citation key is unchanged.
@misc{Lazar2024FrontierAE,
  author        = {Seth Lazar},
  title         = {Frontier AI Ethics: Anticipating and Evaluating the Societal Impacts of Generative Agents},
  year          = {2024},
  month         = {4},
  eprint        = {2404.06750},
  archivePrefix = {arXiv},
  url           = {https://api.semanticscholar.org/CorpusID:269033095},
  environments  = {n/a},
  agents        = {n/a},
  evaluation    = {n/a},
  other         = {n/a},
}

@article{Sharma2023TowardsUS,
  author       = {Mrinank Sharma and Meg Tong and Tomasz Korbak and David Kristjanson Duvenaud and Amanda Askell and Samuel R. Bowman and Newton Cheng and Esin Durmus and Zac Hatfield-Dodds and Scott Johnston and Shauna Kravec and Tim Maxwell and Sam McCandlish and Kamal Ndousse and Oliver Rausch and Nicholas Schiefer and Da Yan and Miranda Zhang and Ethan Perez},
  title        = {Towards Understanding Sycophancy in Language Models},
  journal      = {ArXiv},
  volume       = {abs/2310.13548},
  year         = {2023},
  month        = {10},
  url          = {https://api.semanticscholar.org/CorpusID:264405698},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}


@article{Turpin2023LanguageMD,
  author       = {Miles Turpin and Julian Michael and Ethan Perez and Sam Bowman},
  title        = {Language Models Don't Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting},
  journal      = {ArXiv},
  volume       = {abs/2305.04388},
  year         = {2023},
  month        = {5},
  url          = {https://api.semanticscholar.org/CorpusID:258556812},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

@inproceedings{Liang2021TowardsUA,
  author       = {Paul Pu Liang and Chiyu Wu and Louis-Philippe Morency and Ruslan Salakhutdinov},
  title        = {Towards Understanding and Mitigating Social Biases in Language Models},
  booktitle    = {International Conference on Machine Learning},
  year         = {2021},
  month        = {6},
  url          = {https://api.semanticscholar.org/CorpusID:235623756},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

% The umlaut in the last author's surname is restored as a LaTeX accent.
@article{Mattern2022UnderstandingSI,
  author       = {Justus Mattern and Zhijing Jin and Mrinmaya Sachan and Rada Mihalcea and Bernhard Sch{\"o}lkopf},
  title        = {Understanding Stereotypes in Language Models: Towards Robust Measurement and Zero-Shot Debiasing},
  journal      = {ArXiv},
  volume       = {abs/2212.10678},
  year         = {2022},
  month        = {12},
  url          = {https://api.semanticscholar.org/CorpusID:254926728},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

% Author list repaired: accents are written as valid LaTeX accent commands (the
% previous caron macro was undefined), and two truncated given names are spelled out.
@article{Bai2022ConstitutionalAH,
  author       = {Yuntao Bai and Saurav Kadavath and Sandipan Kundu and Amanda Askell and John Kernion and Andy Jones and Anna Chen and Anna Goldie and Azalia Mirhoseini and Cameron McKinnon and Carol Chen and Catherine Olsson and Christopher Olah and Danny Hernandez and Dawn Drain and Deep Ganguli and Dustin Li and Eli Tran-Johnson and Ethan Perez and Jamie Kerr and Jared Mueller and Jeff Ladish and Joshua Landau and Kamal Ndousse and Kamil{\.e} Luko{\v{s}}i{\=u}t{\.e} and Liane Lovitt and Michael Sellitto and Nelson Elhage and Nicholas Schiefer and Noem{\'i} Mercado and Nova DasSarma and Robert Lasenby and Robin Larson and Sam Ringer and Scott Johnston and Shauna Kravec and Sheer El Showk and Stanislav Fort and Tamera Lanham and Timothy Telleen-Lawton and Tom Conerly and Tom Henighan and Tristan Hume and Sam Bowman and Zac Hatfield-Dodds and Benjamin Mann and Dario Amodei and Nicholas Joseph and Sam McCandlish and Tom B. Brown and Jared Kaplan},
  title        = {Constitutional AI: Harmlessness from AI Feedback},
  journal      = {ArXiv},
  volume       = {abs/2212.08073},
  year         = {2022},
  month        = {12},
  url          = {https://api.semanticscholar.org/CorpusID:254823489},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

@article{Park2023AIDA,
  author       = {Peter S. Park and Simon Goldstein and Aidan O'Gara and Michael Chen and Dan Hendrycks},
  title        = {AI Deception: A Survey of Examples, Risks, and Potential Solutions},
  journal      = {ArXiv},
  volume       = {abs/2308.14752},
  year         = {2023},
  month        = {8},
  url          = {https://api.semanticscholar.org/CorpusID:261276587},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

@article{Tarsney2024DeceptionAM,
  author       = {Christian Tarsney},
  title        = {Deception and Manipulation in Generative AI},
  journal      = {ArXiv},
  volume       = {abs/2401.11335},
  year         = {2024},
  month        = {1},
  url          = {https://api.semanticscholar.org/CorpusID:267068787},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

@misc{hendrycks2023overview,
  author        = {Dan Hendrycks and Mantas Mazeika and Thomas Woodside},
  title         = {An Overview of Catastrophic AI Risks},
  year          = {2023},
  month         = {6},
  eprint        = {2306.12001},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CY},
  url           = {https://arxiv.org/abs/2306.12001},
  environments  = {n/a},
  agents        = {n/a},
  evaluation    = {n/a},
  other         = {n/a},
}

@inproceedings{henderson2018ethical,
  author       = {Henderson, Peter and Sinha, Koustuv and Angelard-Gontier, Nicolas and Ke, Nan Rosemary and Fried, Genevieve and Lowe, Ryan and Pineau, Joelle},
  title        = {Ethical challenges in data-driven dialogue systems},
  booktitle    = {Proceedings of the 2018 AAAI/ACM Conference on AI, Ethics, and Society},
  pages        = {123--129},
  year         = {2018},
  month        = {12},
  url          = {https://dl.acm.org/doi/10.1145/3278721.3278723},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}


@article{weidinger2021ethical,
  author       = {Weidinger, Laura and Mellor, John and Rauh, Maribeth and Griffin, Conor and Uesato, Jonathan and Huang, Po-Sen and Cheng, Myra and Glaese, Mia and Balle, Borja and Kasirzadeh, Atoosa and others},
  title        = {Ethical and social risks of harm from language models},
  journal      = {arXiv preprint arXiv:2112.04359},
  year         = {2021},
  month        = {12},
  url          = {https://arxiv.org/abs/2112.04359},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {n/a},
}

% Surname casing DasSarma matches the spelling used for the same author
% elsewhere in this file.
@inproceedings{ganguli2022predictability,
  author       = {Ganguli, Deep and Hernandez, Danny and Lovitt, Liane and Askell, Amanda and Bai, Yuntao and Chen, Anna and Conerly, Tom and DasSarma, Nova and Drain, Dawn and Elhage, Nelson and others},
  title        = {Predictability and surprise in large generative models},
  booktitle    = {Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency},
  pages        = {1747--1764},
  year         = {2022},
  month        = {6},
  url          = {https://dl.acm.org/doi/pdf/10.1145/3531146.3533229},
  environments = {text},
  agents       = {n/a},
  evaluation   = {n/a},
  other        = {policy},
}

% Title capitalisation restored; AI is an acronym.
@article{hendrycks2020aligning,
  author       = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Critch, Andrew and Li, Jerry and Song, Dawn and Steinhardt, Jacob},
  title        = {Aligning AI With Shared Human Values},
  journal      = {arXiv preprint arXiv:2008.02275},
  year         = {2020},
  month        = {10},
  url          = {https://arxiv.org/pdf/2008.02275.pdf},
  environments = {text},
  agents       = {pretraining, finetuning},
  evaluation   = {model_based},
  other        = {n/a},
}

% URL points at the plain arXiv PDF, without the referral tracking parameter.
@article{hendrycks2020measuring,
  author       = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  title        = {Measuring massive multitask language understanding},
  journal      = {arXiv preprint arXiv:2009.03300},
  year         = {2020},
  month        = {9},
  url          = {https://arxiv.org/pdf/2009.03300.pdf},
  environments = {text},
  agents       = {pretraining, finetuning},
  evaluation   = {model_based},
  other        = {n/a},
}

@article{xu2020recipes,
  author       = {Xu, Jing and Ju, Da and Li, Margaret and Boureau, Y-Lan and Weston, Jason and Dinan, Emily},
  title        = {Recipes for safety in open-domain chatbots},
  journal      = {arXiv preprint arXiv:2010.07079},
  year         = {2020},
  month        = {10},
  url          = {https://arxiv.org/pdf/2010.07079.pdf},
  environments = {text},
  agents       = {n/a},
  evaluation   = {human, model_based},
  other        = {n/a},
}

% Title capitalisation restored; the benchmark name is TruthfulQA.
@article{lin2021truthfulqa,
  author       = {Lin, Stephanie and Hilton, Jacob and Evans, Owain},
  title        = {TruthfulQA: Measuring How Models Mimic Human Falsehoods},
  journal      = {arXiv preprint arXiv:2109.07958},
  year         = {2021},
  month        = {9},
  url          = {https://arxiv.org/pdf/2109.07958.pdf},
  environments = {text},
  agents       = {prompting_and_in_context_learning},
  evaluation   = {model_based},
  other        = {n/a},
}

% The two-word surname Le Bras is written in comma form so it is not split, and
% title capitalisation is restored (Delphi is a proper noun).
@article{jiang2021can,
  author       = {Jiang, Liwei and Hwang, Jena D and Bhagavatula, Chandra and Le Bras, Ronan and Liang, Jenny and Dodge, Jesse and Sakaguchi, Keisuke and Forbes, Maxwell and Borchardt, Jon and Gabriel, Saadia and others},
  title        = {Can Machines Learn Morality? The Delphi Experiment},
  journal      = {arXiv preprint arXiv:2110.07574},
  year         = {2021},
  month        = {10},
  url          = {https://arxiv.org/pdf/2110.07574.pdf},
  environments = {text},
  agents       = {n/a},
  evaluation   = {model_based},
  other        = {n/a},
}

@article{menick2022teaching,
  author       = {Menick, Jacob and Trebacz, Maja and Mikulik, Vladimir and Aslanides, John and Song, Francis and Chadwick, Martin and Glaese, Mia and Young, Susannah and Campbell-Gillingham, Lucy and Irving, Geoffrey and others},
  title        = {Teaching language models to support answers with verified quotes},
  journal      = {arXiv preprint arXiv:2203.11147},
  year         = {2022},
  month        = {3},
  url          = {https://arxiv.org/pdf/2203.11147.pdf},
  environments = {text},
  agents       = {reinforcement_learning},
  evaluation   = {model_based},
  other        = {n/a},
}