Skip to content

Commit

Permalink
Add more challenges + cleanup (Significant-Gravitas#5368)
Browse files Browse the repository at this point in the history
Signed-off-by: Merwane Hamadi <[email protected]>
  • Loading branch information
waynehamadi authored Sep 28, 2023
1 parent 5acb5ad commit 37fbb52
Show file tree
Hide file tree
Showing 47 changed files with 1,210 additions and 705 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/benchmark-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,8 @@ jobs:
echo "Running the following command: poetry run agbenchmark --mock"
poetry run agbenchmark --mock
echo "Running the following command: poetry run agbenchmark --mock --category=retrieval"
poetry run agbenchmark --mock --category=retrieval
echo "Running the following command: poetry run agbenchmark --mock --category=interface"
poetry run agbenchmark --mock --category=interface
echo "Running the following command: poetry run agbenchmark --mock --category=data"
poetry run agbenchmark --mock --category=data
echo "Running the following command: poetry run agbenchmark --mock --category=coding"
poetry run agbenchmark --mock --category=coding
Expand Down
142 changes: 66 additions & 76 deletions autogpts/forge/poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions benchmark/agbenchmark/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@

with open(json_file, "r") as file:
data = json.load(file)

if "eval_id" not in data:
data["eval_id"] = str(uuid.uuid4())
# this will sort all the keys of the JSON systematically so that the order is always the same
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
"category": [
"interface"
"general",
"coding",
"scrape_synthesize",
"data"
],
"cutoff": 60,
"dependencies": [
Expand All @@ -20,7 +23,7 @@
]
},
"info": {
"description": "tests the ability for an agent to read a file.",
"description": "Tests if the agent can read a file.",
"difficulty": "interface",
"side_effects": [
""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
"category": [
"interface"
"general",
"coding",
"scrape_synthesize",
"data"
],
"cutoff": 60,
"dependencies": [],
Expand All @@ -19,7 +22,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests the agents ability to write to a file",
"description": "Tests if the agent can write a file",
"difficulty": "interface",
"side_effects": [
""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"should_not_contain": []
},
"info": {
"description": "This test checks how well the agent can remember the goal.",
"description": "Tests if the agent can remember the goal.",
"difficulty": "intermediate",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"category": [
"adaptability"
"adaptability",
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"category": [
"coding",
"iterate"
"coding"
],
"cutoff": 60,
"dependencies": [
Expand All @@ -24,7 +23,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create the three_sum function.",
"description": "Tests if the agent can create the three_sum function.",
"difficulty": "basic",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create a random password generator.",
"description": "Tests if the agent can create a random password generator.",
"difficulty": "basic",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create a random password generator.",
"description": "Tests if the agent can create a file organizer.",
"difficulty": "basic",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create a URL shortener.",
"description": "Tests if the agent can create a URL shortener.",
"difficulty": "basic",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create Tic-Tac-Toe game",
"description": "Tests if the agent can create a Tic-Tac-Toe game",
"difficulty": "basic",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create a Battleship.",
"description": "Tests if the agent can create a Battleship game.",
"difficulty": "expert",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
]
},
"info": {
"description": "Tests if the agent can sort a csv",
"description": "Tests if the agent can label data in a csv",
"difficulty": "basic",
"side_effects": [
""
]
},
"name": "LabelData",
"name": "LabelCsv",
"task": "The csv 'input.csv' has many items. create a 'Color' column for these items and classify them as either 'blue', 'green', or 'yellow' depending on what the most likely color is. Preserve the order of the rows. The color column should be the second column. Write the output in output.csv"
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
],
"cutoff": 60,
"dependencies": [
"TestLabelData"
"TestLabelCsv"
],
"eval_id": "d5f04342-983f-45a4-b84a-fe8d96863375",
"ground": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id,name,timestamp
3,Alice,2023-09-25 14:10:00
1,Bob,2023-09-24 12:05:00
2,Charlie,2023-09-24 12:10:00
4,David,2023-09-26 16:20:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id,name,timestamp
1,Bob,2023-09-24 12:05:00
2,Charlie,2023-09-24 12:10:00
3,Alice,2023-09-25 14:10:00
4,David,2023-09-26 16:20:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"category": [
"data"
],
"cutoff": 60,
"dependencies": [
"TestReadFile"
],
"eval_id": "db4654d7-fc97-4290-ab27-a710c2b5ce15",
"ground": {
"answer": "The csv sorted by date",
"eval": {
"type": "file"
},
"files": [
"output.csv"
],
"should_contain": [
"id,name,timestamp\n1,Bob,2023-09-24 12:05:00\n2,Charlie,2023-09-24 12:10:00\n3,Alice,2023-09-25 14:10:00\n4,David,2023-09-26 16:20:00"
]
},
"info": {
"description": "Tests if the agent can sort a csv",
"difficulty": "basic",
"side_effects": [
""
]
},
"name": "SortCsv",
"task": "Sort the input.csv by the 'timestamp' column and write the new csv in the output.csv file. The order of the columns should be preserved."
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"category": [
"interface"
"general",
"scrape_synthesize"
],
"cutoff": 120,
"dependencies": [
Expand All @@ -25,7 +26,7 @@
]
},
"info": {
"description": "Tests if an llm can search",
"description": "Tests if the agent can search.",
"difficulty": "interface",
"side_effects": [
""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval",
"scrape_synthesize",
"general"
],
"cutoff": 60,
Expand All @@ -22,7 +22,7 @@
"should_not_contain": []
},
"info": {
"description": "Specifies specific website to retrieve website from.",
"description": "Tests if the agent can retrieve specific information from a website.",
"difficulty": "basic",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand All @@ -21,7 +21,7 @@
"should_not_contain": []
},
"info": {
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
"description": "Tests if the agent can retrieve Tesla's revenue in 2022.",
"difficulty": "intermediate",
"side_effects": []
},
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"category": [
"retrieval"
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
Expand Down Expand Up @@ -35,7 +35,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability to retrieve information.",
"description": "Tests if the agent can retrieve all the revenues of Tesla since its creation.",
"difficulty": "intermediate",
"side_effects": [
"tests if there is in fact an LLM attached"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
swyx
FanaHOVA
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"category": [
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
"TestRevenueRetrieval2"
],
"eval_id": "18b14805-ff33-4076-9fb8-1e4218136f05",
"ground": {
"answer": "The twitter handles of the two hosts of Latent Space.",
"eval": {
"type": "file"
},
"files": [
"output.txt"
],
"should_contain": [
"swyx",
"FanaHOVA"
],
"should_not_contain": []
},
"info": {
"description": "Tests if the agent can retrieve twitter handles given a vague description.",
"difficulty": "intermediate",
"side_effects": [
""
]
},
"name": "TestGetInformation",
"task": "Write the twitter handles of the two hosts of Latent Space to a file called output.txt"
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"should_not_contain": []
},
"info": {
"description": "Tests ability to generate content based on the content of 2 files.",
"description": "Tests if the agent can generate content based on the content of 2 files.",
"difficulty": "basic",
"side_effects": []
},
Expand Down
Loading

0 comments on commit 37fbb52

Please sign in to comment.