Commit

update all (microsoft#530)

peteryang1 authored Jan 22, 2025
1 parent cf2ff92 commit f3ed911
Showing 10 changed files with 162 additions and 28 deletions.
12 changes: 11 additions & 1 deletion rdagent/components/coder/data_science/ensemble/eval.py
@@ -64,12 +64,22 @@ def evaluate(
         implementation.inject_files(**{fname: test_code})
         stdout = implementation.execute(env=de, entry=f"python {fname}")

+        if "main.py" in implementation.file_dict:
+            workflow_stdout = implementation.execute(env=de, entry="python main.py")
+        else:
+            workflow_stdout = None
+
         system_prompt = T(".prompts:ensemble_eval.system").r(
             task_desc=target_task_information,
             test_code=test_code,
             code=implementation.file_dict["ensemble.py"],
+            workflow_stdout=workflow_stdout,
+            workflow_code=implementation.all_codes,
         )
-        user_prompt = T(".prompts:ensemble_eval.user").r(stdout=stdout)
+        user_prompt = T(".prompts:ensemble_eval.user").r(
+            stdout=stdout,
+            workflow_stdout=workflow_stdout,
+        )

         resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
         return EnsembleEvalFeedback(**json.loads(resp))
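Taken together, the evaluators now follow one pattern: run the component's own test script, additionally run main.py whenever the workspace contains one, and hand both stdouts to the LLM judge. A minimal sketch of that control flow — the Workspace class and file contents here are illustrative stand-ins, not the repository's actual API:

```python
# Sketch of the run-test-then-maybe-run-workflow pattern above.
# `Workspace` and its `execute` are illustrative stand-ins, not RD-Agent's API.
from dataclasses import dataclass, field

@dataclass
class Workspace:
    file_dict: dict = field(default_factory=dict)

    def execute(self, entry: str) -> str:
        # A real workspace would run `entry` in a sandboxed environment.
        return f"ran: {entry}"

def collect_stdouts(ws: Workspace, test_file: str):
    stdout = ws.execute(entry=f"python {test_file}")
    # Exercise the full pipeline only when an entry point exists.
    if "main.py" in ws.file_dict:
        workflow_stdout = ws.execute(entry="python main.py")
    else:
        workflow_stdout = None
    return stdout, workflow_stdout

ws = Workspace(file_dict={"ensemble.py": "...", "main.py": "..."})
print(collect_stdouts(ws, "test_ensemble.py"))
```

Downstream, a workflow_stdout of None is what lets the prompt templates drop their workflow sections entirely.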
12 changes: 12 additions & 0 deletions rdagent/components/coder/data_science/ensemble/prompts.yaml
@@ -67,6 +67,13 @@ ensemble_eval:
     {{test_code}}
     ```
+    {% if workflow_stdout is not none %}
+    Your ensemble code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the ensemble test and the whole workflow test pass.
+    {% endif %}
     You'll be given the stdout of your testing scripts.
     Please respond with your feedback in the following JSON format:
     {

@@ -76,4 +83,9 @@ ensemble_eval:
         "final_decision": <true/false>
     }
   user: |-
+    Ensemble test stdout:
     {{stdout}}
+    {% if workflow_stdout is not none %}
+    Whole workflow test stdout:
+    {{workflow_stdout}}
+    {% endif %}
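The {% if workflow_stdout is not none %} guard makes the workflow section vanish from the rendered prompt when no workflow was run. A small sketch with plain jinja2, assuming the repository's T(".prompts:...").r(...) helper wraps comparable rendering:

```python
# Demonstrates the conditional prompt section using plain jinja2.
from jinja2 import Template

template = Template(
    "Ensemble test stdout:\n"
    "{{stdout}}\n"
    "{% if workflow_stdout is not none %}"
    "Whole workflow test stdout:\n"
    "{{workflow_stdout}}\n"
    "{% endif %}"
)

# With no workflow run, the extra section is omitted entirely.
print(template.render(stdout="ok", workflow_stdout=None))
# With one, the evaluator sees both stdouts.
print(template.render(stdout="ok", workflow_stdout="workflow ok"))
```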
12 changes: 11 additions & 1 deletion rdagent/components/coder/data_science/feature/eval.py
@@ -57,12 +57,22 @@ def evaluate(

         stdout = implementation.execute(env=de, entry=f"python {fname}")

+        if "main.py" in implementation.file_dict:
+            workflow_stdout = implementation.execute(env=de, entry="python main.py")
+        else:
+            workflow_stdout = None
+
         system_prompt = T(".prompts:feature_eval.system").r(
             task_desc=target_task.get_task_information(),
             test_code=test_code,
             code=implementation.file_dict["feature.py"],
+            workflow_stdout=workflow_stdout,
+            workflow_code=implementation.all_codes,
         )
-        user_prompt = T(".prompts:feature_eval.user").r(stdout=shrink_text(stdout))
+        user_prompt = T(".prompts:feature_eval.user").r(
+            stdout=shrink_text(stdout),
+            workflow_stdout=workflow_stdout,
+        )

         resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
         return FeatureEvalFeedback(**json.loads(resp))
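Unlike the other evaluators, this one routes its stdout through shrink_text before prompting. The repository's implementation isn't shown here; a hypothetical sketch of what such a truncation helper might do:

```python
# Hypothetical shrink_text-style helper: keep the head (setup logs) and the
# tail (errors, final prints) of long output within a character budget.
# This is an assumption for illustration, not RD-Agent's actual function.
def shrink_text(text: str, max_chars: int = 4000) -> str:
    if len(text) <= max_chars:
        return text
    half = max_chars // 2
    omitted = len(text) - 2 * half
    return f"{text[:half]}\n... [{omitted} characters omitted] ...\n{text[-half:]}"

print(shrink_text("x" * 10_000)[:60])
```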
Feature-engineering test script:
@@ -8,6 +8,7 @@ Please make sure the stdout is rich enough to support informative feedback
"""

import pickle
from copy import deepcopy

import numpy as np
import pandas as pd
@@ -19,17 +20,44 @@ print(f"X.shape: {X.shape}")
print(f"y.shape: {y.shape}" if not isinstance(y, list) else f"y(list)'s length: {len(y)}")
print(f"X_test.shape: {X_test.shape}")
print(f"test_ids length: {len(test_ids)}")
X_loaded = deepcopy(X)
y_loaded = deepcopy(y)
X_test_loaded = deepcopy(X_test)
X, y, X_test = feat_eng(X, y, X_test)


def get_length(data):
return len(data) if isinstance(data, list) else data.shape[0]

assert get_length(X_test) == get_length(test_ids), (
f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
)
assert get_length(X) == get_length(y), (
f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"
)

print("Feature Engineering test passed successfully. Length of test images matches length of test IDs.")
def get_width(data):
return 1 if isinstance(data, list) else data.shape[1:]


def get_column_list(data):
return data.columns.tolist() if isinstance(data, pd.DataFrame) else None


assert get_length(X_test) == get_length(
test_ids
), f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
assert get_length(X) == get_length(
y
), f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"

assert get_length(X) != 0, f"Training data is empty."
assert get_length(y) != 0, f"Training labels are empty."
assert get_length(X_test) != 0, f"Test data is empty."

assert get_width(X) == get_width(
X_test
), "Mismatch in width of training and test data. Width means the number of features."

if isinstance(X, pd.DataFrame) and isinstance(X_test, pd.DataFrame):
assert get_column_list(X) == get_column_list(X_test), "Mismatch in column names of training and test data."

assert sorted(X.dtypes.unique().tolist()) == sorted(
X_loaded.dtypes.unique().tolist()
), f"feature engineering has produced new data types which is not allowed, data loader data types are {X_loaded.dtypes.unique().tolist()} and feature engineering data types are {X.dtypes.unique().tolist()}"

print("Feature Engineering test passed successfully. All checks including length, width, and data types have been validated.")
18 changes: 14 additions & 4 deletions rdagent/components/coder/data_science/feature/prompts.yaml
@@ -74,7 +74,14 @@ feature_eval:
     ```python
     {{test_code}}
     ```
+    {% if workflow_stdout is not none %}
+    Your feature engineering code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the feature engineering test and the whole workflow test pass.
+    {% endif %}
     You'll be given the stdout of your testing scripts.
     Please respond with your feedback in the following JSON format and order
     ```json
@@ -86,6 +93,9 @@ feature_eval:
     }
     ```
   user: |-
-    ```
-    {{stdout}}
-    ```
+    Feature engineering test stdout:
+    {{stdout}}
+    {% if workflow_stdout is not none %}
+    Whole workflow test stdout:
+    {{workflow_stdout}}
+    {% endif %}
8 changes: 8 additions & 0 deletions rdagent/components/coder/data_science/model/eval.py
@@ -73,15 +73,23 @@ def evaluate(
"The execution output contains too many progress bars and results in the LLM's token size exceeding the limit."
)

if "main.py" in implementation.file_dict:
workflow_stdout = implementation.execute(env=de, entry="python main.py")
else:
workflow_stdout = None

system_prompt = T(".prompts:model_eval.system").r(
task_desc=target_task.get_task_information(),
test_code=test_code,
scenario=self.scen.get_scenario_all_desc(),
spec=implementation.file_dict["spec/model.md"],
workflow_stdout=workflow_stdout,
workflow_code=implementation.all_codes,
)
user_prompt = T(".prompts:model_eval.user").r(
stdout=stdout,
code=implementation.file_dict[f"{target_task.name}.py"],
workflow_stdout=workflow_stdout,
)
resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
return ModelSingleFeedback(**json.loads(resp))
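Every evaluator finishes the same way: a json_mode=True chat completion parsed directly into a feedback object. A minimal sketch of that round trip — query_llm is a stand-in for APIBackend().build_messages_and_create_chat_completion(...), and the feedback fields are inferred from the JSON schema in the prompts, not taken from the actual class definitions:

```python
import json
from dataclasses import dataclass

@dataclass
class ModelFeedback:
    # Field names follow the JSON keys requested in the prompt templates;
    # treat them as illustrative of ModelSingleFeedback, not its definition.
    execution: str
    code: str
    final_decision: bool

def query_llm(user_prompt: str, system_prompt: str, json_mode: bool = True) -> str:
    # Stand-in: a real backend would call the chat-completion API here.
    return json.dumps({
        "execution": "Model code test passed successfully.",
        "code": "No issues found.",
        "final_decision": True,
    })

resp = query_llm("...user prompt...", "...system prompt...")
feedback = ModelFeedback(**json.loads(resp))
print(feedback.final_decision)
```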
19 changes: 14 additions & 5 deletions rdagent/components/coder/data_science/model/prompts.yaml
@@ -116,6 +116,13 @@ model_eval:
     Only if "Model code test passed successfully." appears in the stdout is the model considered successful; otherwise there must be some issue with the model.
     If no stdout is provided, the model is considered to have failed due to a timeout. Please check whether there are ways to improve the model's execution speed.
+    {% if workflow_stdout is not none %}
+    Your model code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the model test and the whole workflow test pass.
+    {% endif %}

     Please respond with your feedback in the following JSON format and order:
     ```json

@@ -128,9 +135,11 @@ model_eval:
     ```
   user: |-
-    --------------Code generated by user:---------------
+    ---------------Code generated by user:---------------
     {{ code }}
-    --------------stdoutput:---------------
-    '''
-    {{ stdout }}
-    '''
+    ---------------Model test stdout:---------------
+    {{stdout}}
+    {% if workflow_stdout is not none %}
+    ---------------Whole workflow test stdout:---------------
+    {{workflow_stdout}}
+    {% endif %}
12 changes: 11 additions & 1 deletion rdagent/components/coder/data_science/raw_data_loader/eval.py
@@ -58,12 +58,22 @@ def evaluate(
         implementation.inject_files(**{fname: test_code})
         stdout = implementation.execute(env=de, entry=f"python {fname}")

+        if "main.py" in implementation.file_dict:
+            workflow_stdout = implementation.execute(env=de, entry="python main.py")
+        else:
+            workflow_stdout = None
+
         system_prompt = T(".prompts:data_loader_eval.system").r(
             task_desc=target_task.get_task_information(),
             test_code=test_code,
             code=implementation.file_dict["load_data.py"],
+            workflow_stdout=workflow_stdout,
+            workflow_code=implementation.all_codes,
         )
-        user_prompt = T(".prompts:data_loader_eval.user").r(stdout=stdout)
+        user_prompt = T(".prompts:data_loader_eval.user").r(
+            stdout=stdout,
+            workflow_stdout=workflow_stdout,
+        )

         resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=True)
         return DataLoaderEvalFeedback(**json.loads(resp))
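The inject_files(**{fname: test_code}) / execute(entry=...) pair is how each evaluator materializes its generated test inside the workspace before running it. A runnable sketch under assumed names (FileWorkspace is hypothetical, not the repository's workspace class):

```python
# Write generated files into a scratch directory, then run one as a subprocess.
import subprocess
import sys
import tempfile
from pathlib import Path

class FileWorkspace:
    def __init__(self):
        self.path = Path(tempfile.mkdtemp())
        self.file_dict = {}

    def inject_files(self, **files):
        # Record and materialize each named file inside the workspace.
        for fname, content in files.items():
            self.file_dict[fname] = content
            (self.path / fname).write_text(content)

    def execute(self, entry: str) -> str:
        cmd = entry.split()
        if cmd[0] == "python":
            cmd[0] = sys.executable  # use the current interpreter
        result = subprocess.run(cmd, cwd=self.path, capture_output=True, text=True)
        return result.stdout + result.stderr

ws = FileWorkspace()
ws.inject_files(**{"test_data_loader.py": "print('Data loader test passed successfully.')"})
print(ws.execute(entry="python test_data_loader.py"), end="")
```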
Data-loader test script:
@@ -8,18 +8,45 @@ Please make sure the stdout is rich enough to support informative feedback
"""

import pickle

import pandas as pd
from load_data import load_data

X, y, X_test, test_ids = load_data()


def get_length(data):
return len(data) if isinstance(data, list) else data.shape[0]

assert get_length(X_test) == get_length(test_ids), (
f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
)
assert get_length(X) == get_length(y), (
f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"
)

def get_width(data):
return 1 if isinstance(data, list) else data.shape[1:]


def get_column_list(data):
return data.columns.tolist() if isinstance(data, pd.DataFrame) else None


assert get_length(X_test) == get_length(
test_ids
), f"Mismatch in length of test images and test IDs: X_test ({get_length(X_test)}) and test_ids ({get_length(test_ids)})"
assert get_length(X) == get_length(
y
), f"Mismatch in length of training images and labels: X ({get_length(X)}) and y ({get_length(y)})"

assert get_length(X) != 0, f"Training data is empty."
assert get_length(y) != 0, f"Training labels are empty."
assert get_length(X_test) != 0, f"Test data is empty."

assert get_width(X) == get_width(
X_test
), "Mismatch in width of training and test data. Width means the number of features."

if isinstance(X, pd.DataFrame) and isinstance(X_test, pd.DataFrame):
assert get_column_list(X) == get_column_list(X_test), "Mismatch in column names of training and test data."

assert get_width(X) == get_width(
X_test
), "Mismatch in width of training and test data. Width means the number of features."

print("Data loader test passed successfully. Length of test images matches length of test IDs.")
16 changes: 13 additions & 3 deletions rdagent/components/coder/data_science/raw_data_loader/prompts.yaml
@@ -373,6 +373,13 @@ data_loader_eval:
     {{test_code}}
     ```
+    {% if workflow_stdout is not none %}
+    Your data loading code is also part of a larger workflow. The user has also run the whole workflow and provided you its stdout.
+    The whole workflow code is:
+    {{workflow_code}}
+    Please consider both stdouts, and approve the code only when both the data loader test and the whole workflow test pass.
+    {% endif %}
     You'll be given the stdout of your testing scripts.
     Please respond with your feedback in the following JSON format and order
     ```json

@@ -384,6 +391,9 @@ data_loader_eval:
     }
     ```
   user: |-
-    ```
-    {{stdout}}
-    ```
+    Data loader test stdout:
+    {{stdout}}
+    {% if workflow_stdout is not none %}
+    Whole workflow test stdout:
+    {{workflow_stdout}}
+    {% endif %}
