Skip to content

Commit

Permalink
Add basic code generation challenge (Significant-Gravitas#98)
Browse files Browse the repository at this point in the history
  • Loading branch information
waynehamadi authored Jul 14, 2023
1 parent 3a9dfa4 commit a9702e4
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 3 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ jobs:
agbenchmark start --improve --mock
agbenchmark start --mock
agbenchmark start --mock --category=retrieval
agbenchmark start --mock --category=interface
agbenchmark start --mock --category=code
agbenchmark start --mock --category=memory
agbenchmark start --mock --category=iterate
else
curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start
agbenchmark start --maintain
Expand Down
7 changes: 7 additions & 0 deletions agbenchmark/challenge.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ def setup_challenge(self, config: Dict[str, Any]) -> None:

run_agent(self.task, config, self.CHALLENGE_LOCATION)

# hidden files are added after the agent runs. Hidden files can be python test files.
# We copy them in the workspace to make it easy to import the code produced by the agent

copy_artifacts_into_workspace(
config["workspace"], "hidden_files", self.CHALLENGE_LOCATION
)

def test_method(self, config: Dict[str, Any]) -> None:
    """Placeholder for the challenge-specific check; always raises here.

    NOTE(review): presumably meant to be overridden by concrete challenges;
    the enclosing class and its subclasses are not visible in this excerpt,
    so confirm before relying on that.

    Raises:
        NotImplementedError: unconditionally, in this implementation.
    """
    raise NotImplementedError

Expand Down
16 changes: 16 additions & 0 deletions agbenchmark/challenges/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,19 @@ Example:
Current Output:

- **score** (float): scores range from 0 to 1 (inclusive)

## Add files to challenges:

### artifacts_in

This folder contains all the files you want the agent to have in its workspace BEFORE the challenge starts.

### artifacts_out
This folder contains all the files you would like the agent to generate. It is also used to mock the agent.
This allows us to run `agbenchmark start --test=TestExample --mock` and make sure our challenge actually works.

### hidden_files
This folder contains files that are hidden from the agent but are useful for assessing whether a challenge was successful.
For example, it can contain a test.py, which is copied into the workspace at the end of the challenge (after the agent has run).
This lets us run test.py and easily import the code generated by the agent.
For an example, see the TestBasicCodeGeneration challenge.
Empty file.
12 changes: 12 additions & 0 deletions agbenchmark/challenges/code/d4/artifacts_out/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# mypy: ignore-errors
from typing import List, Optional


def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of two entries of *nums* that add up to *target*.

    The list is scanned once, left to right, while remembering the most
    recent index at which each value was seen. As soon as the value needed
    to complete the current entry has already been seen, the pair
    ``[earlier_index, current_index]`` is returned.

    Args:
        nums: The numbers to search.
        target: The sum the two selected entries must produce.

    Returns:
        A two-element list of indices in ascending order, or ``None`` when
        no two distinct positions sum to *target*.
    """
    index_by_value = {}
    for position, value in enumerate(nums):
        # Stored indices are ints, so None reliably means "not seen yet".
        partner = index_by_value.get(target - value)
        if partner is not None:
            return [partner, position]
        # Record only after the lookup so an entry never pairs with itself.
        index_by_value[value] = position
    return None
18 changes: 18 additions & 0 deletions agbenchmark/challenges/code/d4/data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "TestBasicCodeGeneration",
"category": ["code", "iterate"],
"task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
"dependencies": ["TestWriteFile"],
"ground": {
"answer": "The two_sum function coded properly.",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"info": {
"difficulty": "novice",
"description": "Tests ability for the agent to create the two_sum function.",
"side_effects": []
}
}
31 changes: 31 additions & 0 deletions agbenchmark/challenges/code/d4/hidden_files/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List


def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print its answer, and check it.

    Args:
        nums: The input list handed to ``two_sum``.
        target: The target sum handed to ``two_sum``.
        expected_result: The index pair ``two_sum`` is expected to return.

    Raises:
        AssertionError: If the returned value differs from *expected_result*.
    """
    actual = two_sum(nums, target)
    # The answer is written to stdout before it is checked.
    # NOTE(review): presumably the benchmark matches this output against the
    # challenge's expected strings; confirm against the harness.
    print(actual)
    is_match = actual == expected_result
    assert is_match, f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # Each entry is (nums, target, expected_result); cases run in order and
    # the script stops at the first failing assertion.
    cases = [
        # trivial case: the answer is the first two numbers
        ([2, 7, 11, 15], 9, [0, 1]),
        # zero as the target, with the same number appearing twice
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        # first and last index, with negative numbers
        ([-6, 7, 11, 4], -2, [0, 3]),
    ]
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)
2 changes: 1 addition & 1 deletion agent/gpt-engineer
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ testpaths = [
]
markers = [
"retrieval",
"regression",
"interface",
"code",
"memory"
"memory",
"iterate"
]

[tool.poetry.scripts]
Expand Down

0 comments on commit a9702e4

Please sign in to comment.