forked from crunchdao/crunch-cli
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
919c7c6
commit 3238ff3
Showing
3 changed files
with
49 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
__title__ = 'crunch-cli' | ||
__description__ = 'crunch-cli - CLI of the CrunchDAO Platform' | ||
__version__ = '1.2.1' | ||
__version__ = '1.2.2' | ||
__author__ = 'Enzo CACERES' | ||
__author_email__ = '[email protected]' | ||
__url__ = 'https://github.com/crunchdao/crunch-cli' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,90 +1,84 @@ | ||
""" | ||
This is a basic example of what you need to do to participate in the tournament. | ||
The code will not have access to the internet (or any socket related operation) so don't try to get access to external resources. | ||
The code will not have access to the internet (or any socket related operation). | ||
""" | ||
|
||
import os | ||
# Imports | ||
import xgboost as xgb | ||
import pandas as pd | ||
import typing | ||
|
||
import joblib | ||
import pandas as pd | ||
import sklearn | ||
|
||
import xgboost as xgb | ||
from scipy import stats | ||
from sklearn.feature_selection import VarianceThreshold | ||
from sklearn.model_selection import train_test_split | ||
|
||
|
||
def scorer(y_test: pd.DataFrame, y_pred: pd.DataFrame) -> None: | ||
score = (stats.spearmanr(y_test, y_pred)*100)[0] | ||
print(f"In sample spearman correlation {score}") | ||
from pathlib import Path | ||
|
||
|
||
def train(x_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str) -> None: | ||
def train( | ||
X_train: pd.DataFrame, | ||
y_train: pd.DataFrame, | ||
model_directory_path: str = "resources" | ||
) -> None: | ||
""" | ||
Do your model training here. | ||
At each retrain this function will save an updated version of the model under the model_directory_path. | ||
Make sure to use the correct operator to read and/or write your model. | ||
At each retrain this function will have to save an updated version of | ||
the model under the model_directory_path, as in the example below. | ||
Note: You can use other serialization methods than joblib.dump(), as | ||
long as it matches what reads the model in infer(). | ||
Args: | ||
x_train, y_train: the data after user processing and user feature engineering done in the data_process function. | ||
model_directory_path: the path to the directory in which we will be saving your updated model | ||
X_train, y_train: the data to train the model. | ||
model_directory_path: the path to save your updated model | ||
Returns: | ||
None | ||
""" | ||
X_train = X_train[X_train.columns[:10]] | ||
|
||
# splitting training and test set | ||
print("spliting...") | ||
X_train, X_test, y_train, y_test = train_test_split( | ||
x_train, | ||
y_train, | ||
test_size=0.2, | ||
shuffle=False | ||
) | ||
|
||
# very shallow xgboost regressor | ||
# basic xgboost regressor | ||
model = xgb.XGBRegressor( | ||
objective='reg:squarederror', | ||
max_depth=4, | ||
learning_rate=0.01, | ||
learning_rate=0.1, | ||
n_estimators=2, | ||
n_jobs=-1, | ||
colsample_bytree=0.5 | ||
colsample_bytree=0.05 | ||
) | ||
|
||
# training the model | ||
print("fiting...") | ||
model.fit(X_train.iloc[:, 2:], y_train.iloc[:, 2:]) | ||
print("training...") | ||
model.fit(X_train.iloc[:,2:], y_train.iloc[:,2:]) | ||
|
||
# testing model's Spearman score | ||
pred = model.predict(X_test.iloc[:, 2:]) | ||
scorer(y_test.iloc[:, 2:], pred) | ||
# make sure that the train function correctly saves the trained model | ||
# in the model_directory_path | ||
model_pathname = Path(model_directory_path) / "model.joblib" | ||
print(f"Saving model in {model_pathname}") | ||
joblib.dump(model, model_pathname) | ||
|
||
# make sure that the train function correctly saves the trained model in the model_directory_path | ||
joblib.dump(model, os.path.join(model_directory_path, "model.joblib")) | ||
|
||
|
||
def infer(model_directory_path: str, x_test: pd.DataFrame) -> pd.DataFrame: | ||
def infer( | ||
X_test: pd.DataFrame, | ||
model_directory_path: str = "resources" | ||
) -> pd.DataFrame: | ||
""" | ||
Do your inference here. | ||
This function will load the model saved at the previous iteration and use it to produce your inference on the current date. | ||
It is mandatory to send your inferences with the ids so the system can match it correctly. | ||
This function will load the model saved at the previous iteration and use | ||
it to produce your inference on the current date. | ||
It is mandatory to send your inferences with the ids so the system | ||
can match it correctly. | ||
Args: | ||
model_directory_path: the path to the directory in which your updated model is saved. | ||
x_test: the independent variables of the current date passed to your model. | ||
X_test: the independent variables of the current date passed to your model. | ||
Returns: | ||
A dataframe with the inferences of your model for the current date, including the ids columns. | ||
A dataframe (date, id, value) with the inferences of your model for the current date. | ||
""" | ||
|
||
X_test = X_test[X_test.columns[:10]] | ||
|
||
# loading the model saved by the train function at previous iteration | ||
model = joblib.load(os.path.join(model_directory_path, "model.joblib")) | ||
|
||
# creating the predicted label dataframe while keeping the ids and dates | ||
predicted = x_test[["date", "id"]].copy() | ||
predicted["value"] = model.predict(x_test.iloc[:, 2:]) | ||
model = joblib.load(Path(model_directory_path) / "model.joblib") | ||
# creating the predicted label dataframe with correct dates and ids | ||
y_test_predicted = X_test[["date", "id"]].copy() | ||
y_test_predicted["value"] = model.predict(X_test.iloc[:, 2:]) | ||
|
||
return predicted | ||
return y_test_predicted |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,3 @@ | ||
pandas | ||
numpy | ||
scikit-learn | ||
xgboost | ||
joblib |