interpreter.py
import base64
import io

from dotenv import load_dotenv

load_dotenv()

from e2b_code_interpreter import CodeInterpreter


def code_interpret(code):
    """Run the given Python code in an E2B sandbox notebook cell and return the execution result."""
    with CodeInterpreter() as sandbox:
        execution = sandbox.notebook.exec_cell(code)
        print(execution)
        return execution
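# A minimal usage sketch (not part of the original flow; kept commented out so importing
# this module stays side-effect free). `snippet` is a hypothetical placeholder string, and
# the `.error` / `.logs` attributes assume the e2b_code_interpreter Execution object.
#
# snippet = "print(1 + 1)"
# result = code_interpret(snippet)
# if result.error:
#     print("sandbox error:", result.error)
# else:
#     print("stdout:", result.logs.stdout)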
code = """
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
df = pd.read_csv('netflix.csv')
# Prepare the data
X = df.drop('type', axis=1)
y = df['type']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models = {
'Random Forest': RandomForestClassifier(),
'Logistic Regression': LogisticRegression(),
'SVM': SVC(),
'KNN': KNeighborsClassifier()
}
accuracies = {}
for name, model in models.items():
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracies[name] = accuracy_score(y_test, y_pred)
# Find top 5 accuracy models
top_5_models = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)[:5]
# Choose the best model from the top 5
best_model_name, best_model_accuracy = top_5_models[0]
best_model = models[best_model_name]
# Model evaluation for the best model
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)
best_model_accuracy = accuracy_score(y_test, y_pred_best)
# Store the best model
import joblib
joblib.dump(best_model, 'best_model.pkl')
# Store the model building code
with open('ML_model.py', 'w') as file:
file.write("# Put the code to build the best model here")
print("Top 5 models and their accuracies:")
for model_name, accuracy in top_5_models:
print(f"{model_name}: {accuracy}")
print(f"\nBest model: {best_model_name}, Accuracy: {best_model_accuracy}")
"""
SYSTEM_PROMPT = """
## your job & context
you are a python data scientist. you are given tasks to complete and you run python code to solve them.
- the python code runs in jupyter notebook.
- every time you call `execute_python` tool, the python code is executed in a separate cell. it's okay to multiple calls to `execute_python`.
- display visualizations using matplotlib or any other visualization library directly in the notebook. don't worry about saving the visualizations to a file.
- you have access to the internet and can make api requests.
- you also have access to the filesystem and can read/write files.
- you can install any pip package (if it exists) if you need to but the usual packages for data analysis are already preinstalled.
- you can run any python code you want, everything is running in a secure sandbox environment.
## style guide
tool response values that have text inside "[]" mean that a visual element got rended in the notebook. for example:
- "[chart]" means that a chart was generated in the notebook.
"""
# Example of rendering a chart returned by the sandbox (requires matplotlib):
#
# import matplotlib.image as mpimg
# import matplotlib.pyplot as plt
#
# image = execution.results[0].png
# i = base64.b64decode(image)
# i = io.BytesIO(i)
# i = mpimg.imread(i, format='PNG')
#
# plt.imshow(i, interpolation='nearest')
# plt.show()
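# A minimal end-to-end sketch (an addition, not in the original flow) that wires the
# pieces above together: it runs the example `code` in the sandbox and renders the
# first PNG result, if any, using the same decoding steps as the commented block.
# It assumes matplotlib is installed locally.
if __name__ == "__main__":
    import matplotlib.image as mpimg
    import matplotlib.pyplot as plt

    execution = code_interpret(code)

    if execution.results and execution.results[0].png:
        png_bytes = base64.b64decode(execution.results[0].png)
        image = mpimg.imread(io.BytesIO(png_bytes), format='PNG')
        plt.imshow(image, interpolation='nearest')
        plt.show()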