Commit

update all (microsoft#530)

peteryang1 authored Jan 22, 2025
1 parent cf2ff92 commit f3ed911
Showing 10 changed files with 162 additions and 28 deletions.
12 changes: 11 additions & 1 deletion rdagent/components/coder/data_science/ensemble/eval.py
@@ -64,12 +64,22 @@ def evaluate(
         implementation.inject_files(**{fname: test_code})
         stdout = implementation.execute(env=de, entry=f"python {fname}")

+        if "main.py" in implementation.file_dict:
+            workflow_stdout = implementation.execute(env=de, entry="python main.py")
+        else:
+            workflow_stdout = None
+
         system_prompt = T(".prompts:ensemble_eval.system").r(
             task_desc=target_task_information,
             test_code=test_code,
             code=implementation.file_dict["ensemble.py"],
+            workflow_stdout=workflow_stdout,
+            workflow_code=implementation.all_codes,
         )
-        user_prompt = T(".prompts:ensemble_eval.user").r(stdout=stdout)
+        user_prompt = T(".prompts:ensemble_eval.user").r(
+            stdout=stdout,
+            workflow_stdout=workflow_stdout,
+        )

         resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
         return EnsembleEvalFeedback(**json.loads(resp))
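Taken together, the evaluators now follow one pattern: run the component's own test script, additionally run main.py whenever the workspace contains one, and hand both stdouts to the LLM judge. A minimal sketch of that control flow — the Workspace class and file contents here are illustrative stand-ins, not the repository's actual API:

```python
# Sketch of the run-test-then-maybe-run-workflow pattern above.
# `Workspace` and its `execute` are illustrative stand-ins, not RD-Agent's API.
from dataclasses import dataclass, field

@dataclass
class Workspace:
    file_dict: dict = field(default_factory=dict)

    def execute(self, entry: str) -> str:
        # A real workspace would run `entry` in a sandboxed environment.
        return f"ran: {entry}"

def collect_stdouts(ws: Workspace, test_file: str):
    stdout = ws.execute(entry=f"python {test_file}")
    # Exercise the full pipeline only when an entry point exists.
    if "main.py" in ws.file_dict:
        workflow_stdout = ws.execute(entry="python main.py")
    else:
        workflow_stdout = None
    return stdout, workflow_stdout

ws = Workspace(file_dict={"ensemble.py": "...", "main.py": "..."})
print(collect_stdouts(ws, "test_ensemble.py"))
```

Downstream, a workflow_stdout of None is what lets the prompt templates drop their workflow sections entirely.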
12 changes: 12 additions & 0 deletions rdagent/components/coder/data_science/ensemble/prompts.yaml
@@ -67,6 +67,13 @@ ensemble_eval:
     {{test_code}}
     ```
+    {% if workflow_stdout is not none %}
+    Your ensemble code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the ensemble test and the whole workflow test pass.
+    {% endif %}
     You'll be given the stdout of your testing scripts.
     Please respond with your feedback in the following JSON format:
     {

@@ -76,4 +83,9 @@ ensemble_eval:
         "final_decision": <true/false>
     }
   user: |-
+    Ensemble test stdout:
     {{stdout}}
+    {% if workflow_stdout is not none %}
+    Whole workflow test stdout:
+    {{workflow_stdout}}
+    {% endif %}
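The {% if workflow_stdout is not none %} guard makes the workflow section vanish from the rendered prompt when no workflow was run. A small sketch with plain jinja2, assuming the repository's T(".prompts:...").r(...) helper wraps comparable rendering:

```python
# Demonstrates the conditional prompt section using plain jinja2.
from jinja2 import Template

template = Template(
    "Ensemble test stdout:\n"
    "{{stdout}}\n"
    "{% if workflow_stdout is not none %}"
    "Whole workflow test stdout:\n"
    "{{workflow_stdout}}\n"
    "{% endif %}"
)

# With no workflow run, the extra section is omitted entirely.
print(template.render(stdout="ok", workflow_stdout=None))
# With one, the evaluator sees both stdouts.
print(template.render(stdout="ok", workflow_stdout="workflow ok"))
```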
12 changes: 11 additions & 1 deletion rdagent/components/coder/data_science/feature/eval.py
@@ -57,12 +57,22 @@ def evaluate(

         stdout = implementation.execute(env=de, entry=f"python {fname}")

+        if "main.py" in implementation.file_dict:
+            workflow_stdout = implementation.execute(env=de, entry="python main.py")
+        else:
+            workflow_stdout = None
+
         system_prompt = T(".prompts:feature_eval.system").r(
             task_desc=target_task.get_task_information(),
             test_code=test_code,
             code=implementation.file_dict["feature.py"],
+            workflow_stdout=workflow_stdout,
+            workflow_code=implementation.all_codes,
         )
-        user_prompt = T(".prompts:feature_eval.user").r(stdout=shrink_text(stdout))
+        user_prompt = T(".prompts:feature_eval.user").r(
+            stdout=shrink_text(stdout),
+            workflow_stdout=workflow_stdout,
+        )

         resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
         return FeatureEvalFeedback(**json.loads(resp))
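Unlike the other evaluators, this one routes its stdout through shrink_text before prompting. The repository's implementation isn't shown here; a hypothetical sketch of what such a truncation helper might do:

```python
# Hypothetical shrink_text-style helper: keep the head (setup logs) and the
# tail (errors, final prints) of long output within a character budget.
# This is an assumption for illustration, not RD-Agent's actual function.
def shrink_text(text: str, max_chars: int = 4000) -> str:
    if len(text) <= max_chars:
        return text
    half = max_chars // 2
    omitted = len(text) - 2 * half
    return f"{text[:half]}\n... [{omitted} characters omitted] ...\n{text[-half:]}"

print(shrink_text("x" * 10_000)[:60])
```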
Feature-engineering test script:
@@ -8,6 +8,7 @@ Please make sure the stdout is rich enough to support informative feedback
"""

import pickle
from copy import deepcopy

import numpy as np
import pandas as pd
@@ -19,17 +20,44 @@ print(f"X.shape: {X.shape}")
print(f"y.shape: {y.shape}" if not isinstance(y, list) else f"y(list)'s length: {len(y)}")
print(f"X_test.shape: {X_test.shape}")
print(f"test_ids length: {len(test_ids)}")
X_loaded = deepcopy(X)
y_loaded = deepcopy(y)
X_test_loaded = deepcopy(X_test)
X, y, X_test = feat_eng(X, y, X_test)


def get_length(data):
return len(data) if isinstance(data, list) else data.shape[0]

assert get_length(X_test) == get_length(test_ids), (
f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
)
assert get_length(X) == get_length(y), (
f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"
)

print("Feature Engineering test passed successfully. Length of test images matches length of test IDs.")
def get_width(data):
return 1 if isinstance(data, list) else data.shape[1:]


def get_column_list(data):
return data.columns.tolist() if isinstance(data, pd.DataFrame) else None


assert get_length(X_test) == get_length(
test_ids
), f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
assert get_length(X) == get_length(
y
), f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"

assert get_length(X) != 0, f"Training data is empty."
assert get_length(y) != 0, f"Training labels are empty."
assert get_length(X_test) != 0, f"Test data is empty."

assert get_width(X) == get_width(
X_test
), "Mismatch in width of training and test data. Width means the number of features."

if isinstance(X, pd.DataFrame) and isinstance(X_test, pd.DataFrame):
assert get_column_list(X) == get_column_list(X_test), "Mismatch in column names of training and test data."

assert sorted(X.dtypes.unique().tolist()) == sorted(
X_loaded.dtypes.unique().tolist()
), f"feature engineering has produced new data types which is not allowed, data loader data types are {X_loaded.dtypes.unique().tolist()} and feature engineering data types are {X.dtypes.unique().tolist()}"

print("Feature Engineering test passed successfully. All checks including length, width, and data types have been validated.")
18 changes: 14 additions & 4 deletions rdagent/components/coder/data_science/feature/prompts.yaml
@@ -74,7 +74,14 @@ feature_eval:
     ```python
     {{test_code}}
     ```
+    {% if workflow_stdout is not none %}
+    Your feature engineering code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the feature engineering test and the whole workflow test pass.
+    {% endif %}
     You'll be given the stdout of your testing scripts.
     Please respond with your feedback in the following JSON format and order
     ```json
@@ -86,6 +93,9 @@ feature_eval:
     }
     ```
   user: |-
-    ```
-    {{stdout}}
-    ```
+    Feature engineering test stdout:
+    {{stdout}}
+    {% if workflow_stdout is not none %}
+    Whole workflow test stdout:
+    {{workflow_stdout}}
+    {% endif %}
8 changes: 8 additions & 0 deletions rdagent/components/coder/data_science/model/eval.py
@@ -73,15 +73,23 @@ def evaluate(
"The execution output contains too many progress bars and results in the LLM's token size exceeding the limit."
)

if "main.py" in implementation.file_dict:
workflow_stdout = implementation.execute(env=de, entry="python main.py")
else:
workflow_stdout = None

system_prompt = T(".prompts:model_eval.system").r(
task_desc=target_task.get_task_information(),
test_code=test_code,
scenario=self.scen.get_scenario_all_desc(),
spec=implementation.file_dict["spec/model.md"],
workflow_stdout=workflow_stdout,
workflow_code=implementation.all_codes,
)
user_prompt = T(".prompts:model_eval.user").r(
stdout=stdout,
code=implementation.file_dict[f"{target_task.name}.py"],
workflow_stdout=workflow_stdout,
)
resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
return ModelSingleFeedback(**json.loads(resp))
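Every evaluator finishes the same way: a json_mode=True chat completion parsed directly into a feedback object. A minimal sketch of that round trip — query_llm is a stand-in for APIBackend().build_messages_and_create_chat_completion(...), and the feedback fields are inferred from the JSON schema in the prompts, not taken from the actual class definitions:

```python
import json
from dataclasses import dataclass

@dataclass
class ModelFeedback:
    # Field names follow the JSON keys requested in the prompt templates;
    # treat them as illustrative of ModelSingleFeedback, not its definition.
    execution: str
    code: str
    final_decision: bool

def query_llm(user_prompt: str, system_prompt: str, json_mode: bool = True) -> str:
    # Stand-in: a real backend would call the chat-completion API here.
    return json.dumps({
        "execution": "Model code test passed successfully.",
        "code": "No issues found.",
        "final_decision": True,
    })

resp = query_llm("...user prompt...", "...system prompt...")
feedback = ModelFeedback(**json.loads(resp))
print(feedback.final_decision)
```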
19 changes: 14 additions & 5 deletions rdagent/components/coder/data_science/model/prompts.yaml
@@ -116,6 +116,13 @@ model_eval:
     Only if "Model code test passed successfully." appears in the stdout is the model considered successful; otherwise there must be some issue with the model.
     If no stdout is provided, the model is considered to have failed due to a timeout. Please check whether there are ways to improve the model's execution speed.
+    {% if workflow_stdout is not none %}
+    Your model code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the model test and the whole workflow test pass.
+    {% endif %}

     Please respond with your feedback in the following JSON format and order:
     ```json

@@ -128,9 +135,11 @@ model_eval:
     ```
   user: |-
-    --------------Code generated by user:---------------
+    ---------------Code generated by user:---------------
     {{ code }}
-    --------------stdoutput:---------------
-    '''
-    {{ stdout }}
-    '''
+    ---------------Model test stdout:---------------
+    {{stdout}}
+    {% if workflow_stdout is not none %}
+    ---------------Whole workflow test stdout:---------------
+    {{workflow_stdout}}
+    {% endif %}
12 changes: 11 additions & 1 deletion rdagent/components/coder/data_science/raw_data_loader/eval.py
@@ -58,12 +58,22 @@ def evaluate(
         implementation.inject_files(**{fname: test_code})
         stdout = implementation.execute(env=de, entry=f"python {fname}")

+        if "main.py" in implementation.file_dict:
+            workflow_stdout = implementation.execute(env=de, entry="python main.py")
+        else:
+            workflow_stdout = None
+
         system_prompt = T(".prompts:data_loader_eval.system").r(
             task_desc=target_task.get_task_information(),
             test_code=test_code,
             code=implementation.file_dict["load_data.py"],
+            workflow_stdout=workflow_stdout,
+            workflow_code=implementation.all_codes,
         )
-        user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout)
+        user_prompt = T(".prompts:data_loader_eval.user").r(
+            stdout=stdout,
+            workflow_stdout=workflow_stdout,
+        )

         resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
         return DataLoaderEvalFeedback(**json.loads(resp))
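The inject_files(**{fname: test_code}) / execute(entry=...) pair is how each evaluator materializes its generated test inside the workspace before running it. A runnable sketch under assumed names (FileWorkspace is hypothetical, not the repository's workspace class):

```python
# Write generated files into a scratch directory, then run one as a subprocess.
import subprocess
import sys
import tempfile
from pathlib import Path

class FileWorkspace:
    def __init__(self):
        self.path = Path(tempfile.mkdtemp())
        self.file_dict = {}

    def inject_files(self, **files):
        # Record and materialize each named file inside the workspace.
        for fname, content in files.items():
            self.file_dict[fname] = content
            (self.path / fname).write_text(content)

    def execute(self, entry: str) -> str:
        cmd = entry.split()
        if cmd[0] == "python":
            cmd[0] = sys.executable  # use the current interpreter
        result = subprocess.run(cmd, cwd=self.path, capture_output=True, text=True)
        return result.stdout + result.stderr

ws = FileWorkspace()
ws.inject_files(**{"test_data_loader.py": "print('Data loader test passed successfully.')"})
print(ws.execute(entry="python test_data_loader.py"), end="")
```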
Data-loader test script:
@@ -8,18 +8,45 @@ Please make sure the stdout is rich enough to support informative feedback
"""

import pickle

import pandas as pd
from load_data import load_data

X, y, X_test, test_ids = load_data()


def get_length(data):
return len(data) if isinstance(data, list) else data.shape[0]

assert get_length(X_test) == get_length(test_ids), (
f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
)
assert get_length(X) == get_length(y), (
f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"
)

def get_width(data):
return 1 if isinstance(data, list) else data.shape[1:]


def get_column_list(data):
return data.columns.tolist() if isinstance(data, pd.DataFrame) else None


assert get_length(X_test) == get_length(
test_ids
), f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
assert get_length(X) == get_length(
y
), f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"

assert get_length(X) != 0, f"Training data is empty."
assert get_length(y) != 0, f"Training labels are empty."
assert get_length(X_test) != 0, f"Test data is empty."

assert get_width(X) == get_width(
X_test
), "Mismatch in width of training and test data. Width means the number of features."

if isinstance(X, pd.DataFrame) and isinstance(X_test, pd.DataFrame):
assert get_column_list(X) == get_column_list(X_test), "Mismatch in column names of training and test data."

assert get_width(X) == get_width(
X_test
), "Mismatch in width of training and test data. Width means the number of features."

print("Data loader test passed successfully. Length of test images matches length of test IDs.")
16 changes: 13 additions & 3 deletions rdagent/components/coder/data_science/raw_data_loader/prompts.yaml
@@ -373,6 +373,13 @@ data_loader_eval:
     {{test_code}}
     ```
+    {% if workflow_stdout is not none %}
+    Your data loading code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the data loader test and the whole workflow test pass.
+    {% endif %}
     You'll be given the stdout of your testing scripts.
     Please respond with your feedback in the following JSON format and order
     ```json

@@ -384,6 +391,9 @@ data_loader_eval:
     }
     ```
   user: |-
-    ```
-    {{stdout}}
-    ```
+    Data loader test stdout:
+    {{stdout}}
+    {% if workflow_stdout is not none %}
+    Whole workflow test stdout:
+    {{workflow_stdout}}
+    {% endif %}
