prompts and eval

smith-nathanh · Nov 27, 2024 · a7f8d75 · a7f8d75
1 parent 5099520
commit a7f8d75
Show file tree

Hide file tree

Showing 8 changed files with 127 additions and 24 deletions.
diff --git a/system/eval.py b/system/eval.py
@@ -15,3 +15,23 @@
 # execute the generated unit tests on the reference repository
 # compute the coverage that the generated unit tests achieve on the reference repository
 
+
+# for each directory in "system/output/v2"
+# cd into the directory
+# create a new conda environment (or virtualenv if easier?) from the requirements.txt
+# execute the repository's example code in each `examples` directory
+# 
+import os
+
+for directory in os.listdir("system/output/v2"):
+    if os.path.isdir(directory):
+        os.chdir(directory)
+        os.system("conda create --name test_env --file requirements.txt")
+        os.system("conda activate test_env")
+        os.system("python examples/demo.py")
+        os.system("pytest tests/unit/test_module.py")
+        os.system("pytest tests/acceptance/test_features.py")
+        os.system("coverage run -m pytest tests/unit/test_module.py")
+        os.system("coverage report")
+        os.system("conda deactivate")
+        os.chdir("..")
diff --git a/system/eval_config.yml b/system/eval_config.yml
@@ -1,5 +1,5 @@
 prd_paths:
-  ArXiv_Digest: "system/benchmark_data/python/ArXiv_digest/docs/PRD.md"
+  ArXiv_digest: "system/benchmark_data/python/ArXiv_digest/docs/PRD.md"
   chakin: "system/benchmark_data/python/chakin/PRD.md"
   geotext: "system/benchmark_data/python/geotext/PRD.md"
   hone: "system/benchmark_data/python/hone/docs/PRD.md"
@@ -20,4 +20,16 @@ tracker_v2:
   particle_swarm_optimization: 1
   readtime: 2 # changed the approve implementation prompt
   stocktrends: 1
-  TextCNN: 1
+  TextCNN: 1
+
+tracker_v3:
+  ArXiv_Digest: 1
+  chakin: 2
+  geotext: 1
+  hone: 1
+  Hybrid_Images: 4 # change the approve implementation prompt
+  lice: 
+  particle_swarm_optimization: 
+  readtime:
+  stocktrends:
+  TextCNN:
diff --git a/system/graph.py b/system/graph.py
@@ -228,21 +228,29 @@ def unit_tests(state: GraphState):
     Generate unit tests for the software.
     """
     logging.info("---UNIT TESTS---")
+    state['approvals']['unit_tests_iter'] = state['approvals'].get('unit_tests_iter', 0) + 1
+
     code = '\n\n'.join(f"# ---{filename}---\n{content}" 
                        for filename, content in state['documents']['code'].items())
-    prompt = UNIT_TEST_PROMPT.format(PRD=state["documents"]['PRD'],
+    prompt = [HumanMessage(content=UNIT_TEST_PROMPT.format(PRD=state["documents"]['PRD'],
                                            architecture_design=state["documents"]['architecture_design'],
-                                           code=code)
+                                           code=code))]
+    if 'unit_tests' in state['approvals']:
+        if not state['approvals']['unit_tests_coverage']:
+            prompt.append(state['messages'][-1])
     structured_llm = llm.with_structured_output(UnitTests)
-    test = structured_llm.invoke([HumanMessage(content=prompt)])
+    test = structured_llm.invoke(prompt)
     state["documents"].update(test.dict())
     return state
 
 def approve_unit_tests(state: GraphState):
     logging.info("---APPROVE UNIT TESTS---")
-
+
+    # Get first directory name, handling paths with leading slash
+    root_dir = next(name for name in next(iter(state['documents']['code'])).split('/') if name)
+    cmd = f"cd temp/{root_dir} && {state['documents']['unit_tests']['command'].replace('python ', 'coverage run ')}"
+
     try:
-        cmd = f"cd temp && {state['documents']['unit_tests']['command']}"
         # Run command in shell, capture output
         process = subprocess.run(
             cmd,
@@ -253,27 +261,68 @@ def approve_unit_tests(state: GraphState):
         )
 
         # Add command to messages
-        state['messages'].append(state['documents']['unit_tests']['command'])
+        state['messages'].append(cmd)
 
         # Check return code
         if process.returncode == 0:
             state['approvals'].update({"unit_tests": True})
-            state['messages'].append("Unit tests passed")
+            state['messages'].append(f"Unit tests passed: {process.stdout}")
         else:
             state['approvals'].update({"unit_tests": False})
-            state['messages'].append(f"Acceptance tests failed: {process.stderr}")
+            state['messages'].append(f"Unit tests failed: {process.stderr}")
+
+        # Run coverage report
+        coverage_cmd = f"cd temp/{root_dir} && coverage report"
+        coverage_process = subprocess.run(
+            coverage_cmd,
+            shell=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True
+        )
+
+        # Add coverage command to messages
+        state['messages'].append(coverage_cmd)
+
+        if coverage_process.returncode == 0:
+            # Parse coverage report output
+            coverage_output = coverage_process.stdout
+            #state['messages'].append(coverage_output)
+
+            # Extract total coverage percentage
+            for line in coverage_output.splitlines():
+                if line.startswith("TOTAL"):
+                    total_coverage = int(line.split()[3].replace('%', ''))
+                    if total_coverage < 60:
+                        state['approvals'].update({"unit_tests_coverage": False})
+                        msg = f"Coverage report failed to cover at least 60% please revise unit tests: \n{coverage_output}"
+                        state['messages'].append(msg)
+                        logging.info(msg)
+                    else:
+                        state['approvals'].update({"unit_tests_coverage": True})
+                        msg = f"Coverage report successful: \n{coverage_output}"
+                        state['messages'].append(msg)
+                        logging.info(msg)
+                    break
+        else:
+            msg = f"Coverage report failed execution: {coverage_process.stdout}"
+            state['messages'].append(msg)
+            logging.info(msg)
+            state['approvals'].update({"unit_tests_coverage": False})
 
     except Exception as e:
-        state['approvals'].update({"unit_tests": False})
-        state['messages'].append(f"Error running unit tests: {str(e)}")
+        state['approvals'].update({"unit_tests": False, "unit_tests_coverage": False})
+        msg = f"Error running unit tests: {str(e)}"
+        state['messages'].append(msg)
+        logging.info(msg)
 
     return state
 
-def route_unit_tests(state: GraphState) -> Literal["__end__", 'assistant']:
-    if all(state["approvals"].values()):
+def route_unit_tests(state: GraphState) -> Literal["__end__", 'unit_tests']:
+    if state['approvals']['unit_tests_coverage'] or state['approvals']['unit_tests_iter'] > 2:
         return END
     else:
-        return "assistant" # go back to implementation with a message from the controller
+        return "unit_tests" # go back and regenerate unit tests
 
 def environment_setup(state: GraphState):
     """
@@ -349,6 +398,6 @@ def build_graph():
     graph.add_edge('unit_tests', "environment_setup")
     graph.add_edge("environment_setup", "approve_acceptance_tests")
     graph.add_edge("approve_acceptance_tests", "approve_unit_tests")
-    graph.add_edge("approve_unit_tests", END)
+    graph.add_conditional_edges("approve_unit_tests", route_unit_tests)
 
     return graph.compile()
diff --git a/system/images/swegraph.png b/system/images/swegraph.png
diff --git a/system/main.py b/system/main.py
@@ -27,7 +27,7 @@ def main():
             prd_content = prd_file.read()
 
     graph = build_graph()
-    #graph.get_graph().draw_mermaid_png(output_file_path="system/images/swegraph.png")
+    graph.get_graph().draw_mermaid_png(output_file_path="system/images/swegraph.png")
     state = {'documents': {'PRD': prd_content},
              'messages': [HumanMessage(content=DESIGN_PROMPT.format(PRD=prd_content))]}
     final_state = graph.invoke(state)

diff --git a/system/prompts.py b/system/prompts.py
@@ -97,7 +97,7 @@
 2. UML_sequence: A Mermaid 11 sequence diagram showing the key interactions and flow between components as specified in the PRD
 3. architecture_design: A detailed text based representation of the file tree that is true to the PRD and includes but is not limited to:
   - A root-level README.md file documenting the system overview
-  - An 'examples' directory containing:
+  - An 'examples' directory (inside the root directory) containing:
     - example_usage.sh demonstrating core functionality along with any additional example files that align with use cases mentioned in the PRD
 """
 
@@ -255,21 +255,25 @@
    - README.md in the root directory with complete documentation
    - example_usage.sh in the "examples" directory with working examples that are consistent with the PRD
    - if the PRD or architecture design call for other files to be in the "examples" directory then include them as well
-5. Implement all necessary code files with full, working implementations
+5. Implement all necessary code files with full, working implementations, don't specify empty directories
 6. Include any CSV/JSON files mentioned in the PRD or architecture design
-7. Ensure the code is production-ready and follows best practices
+7. Ensure the code is production-ready and follows best practices 
+8. Tests will be run from the root directory of the repository so keep that in mind for import statements
 """
 
 
 APPROVE_IMPLEMENTATION_PROMPT = """
 
-Please verify if the architectural design described in the text representation below:
+Below is the architectural design:
 
 ```
 {architecture_design}
 ```
 
-is accurately mirrored in the documents below. There should be one name for each file in the architecture design.
+Please verify that the architecture design is accurately mirrored in the document list below. Each file path in the architecture design should have a corresponding file in the documents list, with the following exceptions:
+- Directories that only contain image files (e.g., .jpg, .png, etc.)
+- Empty directories that are intended as storage locations
+- Non-text based files since the LLM cannot generate them
 
 ```
 {code}
@@ -328,7 +332,7 @@
 -----Instructions--------
 Your task is to generate unit tests to ensure the software adheres to the requirements in the PRD. 
 Pay close attention to the code and the PRD to ensure the tests are comprehensive and accurate.
-The unit tests will be written using the unittest module and ultimately written to a file at: tests/unit/test_module.py. Keep this in mind.
+The unit tests will be written using the unittest module and ultimately written to a file at: tests/unit/test_module.py. Keep this in mind for relative imports and file paths.
 Write the content of the unit tests to a dictionary where the key is "test_module" and the value is the content of the unit test.
 Make another key in this dictionary called "command" and write the command to run the unit tests as the value for the "command" key.
 Nest this dictionary in another dictionary with the key "unit_tests" and return this nested dictionary.

diff --git a/system/relocate.py b/system/relocate.py
@@ -18,6 +18,8 @@ def relocate_directory(temp_dir: str, dest_dir: str, new_name: str) -> None:
             for file in os.listdir(os.path.join(temp_dir, directory)):
                 shutil.move(os.path.join(temp_dir, directory, file), os.path.join(new_path, file))
 
+    os.rmdir(os.path.join(temp_dir, directory))
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Relocate and rename a directory")

diff --git a/system/utils.py b/system/utils.py
@@ -1,6 +1,7 @@
 import subprocess
 import sys
 import os
+import shutil
 import copy
 from typing import Dict, List, Union
 
@@ -58,14 +59,27 @@ def write_files(base_path: str, files_content: Dict[str, str]) -> None:
 
 def create_repository(base_path: str, documents: Dict) -> None:
     """Create repository structure and write files including test files"""
-    # Create base directory
+    # Create base directory if it doesn't exist
     os.makedirs(base_path, exist_ok=True)
 
+    # Delete any existing contents of base_path (files, symlinks, and subdirectories)
+    for filename in os.listdir(base_path):
+        file_path = os.path.join(base_path, filename)
+        try:
+            if os.path.isfile(file_path) or os.path.islink(file_path):
+                os.unlink(file_path)
+            elif os.path.isdir(file_path):
+                shutil.rmtree(file_path)
+        except Exception as e:
+            print(f'Failed to delete {file_path}. Reason: {e}')
+
     files = copy.deepcopy(documents['code'])
 
     # Get first directory name, handling paths with leading slash
     root_dir = next(name for name in next(iter(files)).split('/') if name)
 
+    coverage = "[run]\nomit =\n    */__init__.py\n    tests/*\n"
+
     files.update({
         f'{root_dir}/tests/unit/test_module.py': documents['unit_tests']['test_module'],
         f'{root_dir}/tests/unit/__init__.py': '',
@@ -75,6 +89,8 @@ def create_repository(base_path: str, documents: Dict) -> None:
         f'{root_dir}/docs/UML_class.md': documents['UML_class'],
         f'{root_dir}/docs/UML_sequence.md': documents['UML_sequence'],
         f'{root_dir}/docs/architecture_design.md': documents['architecture_design'],
+        f'{root_dir}/requirements.txt': documents['requirements'],
+        f'{root_dir}/.coveragerc': coverage,
     })
 
     # Write files