Skip to content

Commit

Permalink
feat(demo-project): update code
Browse files Browse the repository at this point in the history
  • Loading branch information
Caceresenzo committed May 12, 2023
1 parent 919c7c6 commit 3238ff3
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 57 deletions.
2 changes: 1 addition & 1 deletion crunch/__version__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
__title__ = 'crunch-cli'
__description__ = 'crunch-cli - CLI of the CrunchDAO Platform'
__version__ = '1.2.1'
__version__ = '1.2.2'
__author__ = 'Enzo CACERES'
__author_email__ = '[email protected]'
__url__ = 'https://github.com/crunchdao/crunch-cli'
102 changes: 48 additions & 54 deletions crunch/demo-project/main.py
Original file line number Diff line number Diff line change
@@ -1,90 +1,84 @@
"""
This is a basic example of what you need to do to participate in the tournament.
The code will not have access to the internet (or any socket related operation) so don't try to get access to external resources.
The code will not have access to the internet (or any socket related operation).
"""

import os
# Imports
import xgboost as xgb
import pandas as pd
import typing

import joblib
import pandas as pd
import sklearn

import xgboost as xgb
from scipy import stats
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split


def scorer(y_test: pd.DataFrame, y_pred: pd.DataFrame) -> None:
score = (stats.spearmanr(y_test, y_pred)*100)[0]
print(f"In sample spearman correlation {score}")
from pathlib import Path


def train(x_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str) -> None:
def train(
X_train: pd.DataFrame,
y_train: pd.DataFrame,
model_directory_path: str = "resources"
) -> None:
"""
Do your model training here.
At each retrain this function will save an updated version of the model under the model_directory_path.
Make sure to use the correct operator to read and/or write your model.
At each retrain this function will have to save an updated version of
the model under the model_directory_path, as in the example below.
Note: You can use other serialization methods than joblib.dump(), as
long as it matches what reads the model in infer().
Args:
x_train, y_train: the data post user processing and user feature engineering done in the data_process function.
model_directory_path: the path to the directory in which we will be saving your updated model
X_train, y_train: the data to train the model.
model_directory_path: the path to save your updated model
Returns:
None
"""
X_train = X_train[X_train.columns[:10]]

# splitting training and test set
print("spliting...")
X_train, X_test, y_train, y_test = train_test_split(
x_train,
y_train,
test_size=0.2,
shuffle=False
)

# very shallow xgboost regressor
# basic xgboost regressor
model = xgb.XGBRegressor(
objective='reg:squarederror',
max_depth=4,
learning_rate=0.01,
learning_rate=0.1,
n_estimators=2,
n_jobs=-1,
colsample_bytree=0.5
colsample_bytree=0.05
)

# training the model
print("fiting...")
model.fit(X_train.iloc[:, 2:], y_train.iloc[:, 2:])
print("training...")
model.fit(X_train.iloc[:,2:], y_train.iloc[:,2:])

# testing model's Spearman score
pred = model.predict(X_test.iloc[:, 2:])
scorer(y_test.iloc[:, 2:], pred)
# make sure that the train function correctly saves the trained model
# in the model_directory_path
model_pathname = Path(model_directory_path) / "model.joblib"
print(f"Saving model in {model_pathname}")
joblib.dump(model, model_pathname)

# make sure that the train function correctly saves the trained model in the model_directory_path
joblib.dump(model, os.path.join(model_directory_path, "model.joblib"))


def infer(model_directory_path: str, x_test: pd.DataFrame) -> pd.DataFrame:
def infer(
X_test: pd.DataFrame,
model_directory_path: str = "resources"
) -> pd.DataFrame:
"""
Do your inference here.
This function will load the model saved at the previous iteration and use it to produce your inference on the current date.
It is mandatory to send your inferences with the ids so the system can match it correctly.
This function will load the model saved at the previous iteration and use
it to produce your inference on the current date.
It is mandatory to send your inferences with the ids so the system
can match it correctly.
Args:
model_directory_path: the path to the directory in which we will be saving your updated model.
x_test: the independent variables of the current date passed to your model.
X_test: the independent variables of the current date passed to your model.
Returns:
A dataframe with the inferences of your model for the current date, including the ids columns.
A dataframe (date, id, value) with the inferences of your model for the current date.
"""

X_test = X_test[X_test.columns[:10]]

# loading the model saved by the train function at previous iteration
model = joblib.load(os.path.join(model_directory_path, "model.joblib"))

# creating the predicted label dataframe, making sure to keep the ids and dates
predicted = x_test[["date", "id"]].copy()
predicted["value"] = model.predict(x_test.iloc[:, 2:])
model = joblib.load(Path(model_directory_path) / "model.joblib")
# creating the predicted label dataframe with correct dates and ids
y_test_predicted = X_test[["date", "id"]].copy()
y_test_predicted["value"] = model.predict(X_test.iloc[:, 2:])

return predicted
return y_test_predicted
2 changes: 0 additions & 2 deletions crunch/demo-project/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
pandas
numpy
scikit-learn
xgboost
joblib

0 comments on commit 3238ff3

Please sign in to comment.