feat(demo-project): update code

zoe-mav · May 12, 2023 · 3238ff3 · 3238ff3
1 parent 919c7c6
commit 3238ff3
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 57 deletions.
diff --git a/crunch/__version__.py b/crunch/__version__.py
@@ -1,6 +1,6 @@
 __title__ = 'crunch-cli'
 __description__ = 'crunch-cli - CLI of the CrunchDAO Platform'
-__version__ = '1.2.1'
+__version__ = '1.2.2'
 __author__ = 'Enzo CACERES'
 __author_email__ = '[email protected]'
 __url__ = 'https://github.com/crunchdao/crunch-cli'
diff --git a/crunch/demo-project/main.py b/crunch/demo-project/main.py
@@ -1,90 +1,84 @@
 """
 This is a basic example of what you need to do to participate to the tournament.
-The code will not have access to the internet (or any socket related operation) so don't try to get access to external resources.
+The code will not have access to the internet (or any socket related operation).
 """
 
-import os
+# Imports
+import xgboost as xgb
+import pandas as pd
 import typing
-
 import joblib
-import pandas as pd
-import sklearn
-
-import xgboost as xgb
-from scipy import stats
-from sklearn.feature_selection import VarianceThreshold
-from sklearn.model_selection import train_test_split
-
-
-def scorer(y_test: pd.DataFrame, y_pred: pd.DataFrame) -> None:
-    score = (stats.spearmanr(y_test, y_pred)*100)[0]
-    print(f"In sample spearman correlation {score}")
+from pathlib import Path
 
 
-def train(x_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str) -> None:
+def train(
+    X_train: pd.DataFrame,
+    y_train: pd.DataFrame,
+    model_directory_path: str = "resources"
+) -> None:
     """
     Do your model training here.
-    At each retrain this function will save an updated version of the model under the model_directiory_path.
-    Make sure to use the correct operator to read and/or write your model.
-
+    At each retrain this function will have to save an updated version of
+    the model under the model_directiory_path, as in the example below.
+    Note: You can use other serialization methods than joblib.dump(), as
+    long as it matches what reads the model in infer().
+    
     Args:
-        x_train, y_train: the data post user processing and user feature engeneering done in the data_process function.
-        model_directory_path: the path to the directory to the directory in wich we will saving your updated model
-
+        X_train, y_train: the data to train the model.
+        model_directory_path: the path to save your updated model
+    
     Returns:
         None
     """
+    X_train = X_train[X_train.columns[:10]]
 
-    # spliting training and test set
-    print("spliting...")
-    X_train, X_test, y_train, y_test = train_test_split(
-        x_train,
-        y_train,
-        test_size=0.2,
-        shuffle=False
-    )
-
-    # very shallow xgboost regressor
+    # basic xgboost regressor
     model = xgb.XGBRegressor(
         objective='reg:squarederror',
         max_depth=4,
-        learning_rate=0.01,
+        learning_rate=0.1,
         n_estimators=2,
         n_jobs=-1,
-        colsample_bytree=0.5
+        colsample_bytree=0.05
     )
 
     # training the model
-    print("fiting...")
-    model.fit(X_train.iloc[:, 2:], y_train.iloc[:, 2:])
+    print("training...")
+    model.fit(X_train.iloc[:,2:], y_train.iloc[:,2:])
 
-    # testing model's Spearman score
-    pred = model.predict(X_test.iloc[:, 2:])
-    scorer(y_test.iloc[:, 2:], pred)
+    # make sure that the train function correctly save the trained model
+    # in the model_directory_path
+    model_pathname = Path(model_directory_path) / "model.joblib"
+    print(f"Saving model in {model_pathname}")
+    joblib.dump(model, model_pathname)
 
-    # make sure that the train function correctly save the trained model in the model_directory_path
-    joblib.dump(model, os.path.join(model_directory_path, "model.joblib"))
 
-
-def infer(model_directory_path: str, x_test: pd.DataFrame) -> pd.DataFrame:
+def infer(
+    X_test: pd.DataFrame,
+    model_directory_path: str = "resources"
+) -> pd.DataFrame:
     """
     Do your inference here.
-    This function will load the model saved at the previous iteration and use it to produce your inference on the current date.
-    It is mandatory to send your inferences with the ids so the system can match it correctly.
-
+    This function will load the model saved at the previous iteration and use
+    it to produce your inference on the current date.
+    It is mandatory to send your inferences with the ids so the system
+    can match it correctly.
+    
     Args:
         model_directory_path: the path to the directory to the directory in wich we will be saving your updated model.
-        x_test: the independant  variables of the current date passed to your model.
+        X_test: the independant  variables of the current date passed to your model.
 
     Returns:
-        A dataframe with the inferences of your model for the current date, including the ids columns.
+        A dataframe (date, id, value) with the inferences of your model for the current date.
     """
+
+    X_test = X_test[X_test.columns[:10]]
 
     # loading the model saved by the train function at previous iteration
-    model = joblib.load(os.path.join(model_directory_path, "model.joblib"))
-
-    # creating the predicted label dataframe without omiting to keep the ids and data
-    predicted = x_test[["date", "id"]].copy()
-    predicted["value"] = model.predict(x_test.iloc[:, 2:])
+    model = joblib.load(Path(model_directory_path) / "model.joblib")
+    
+    # creating the predicted label dataframe with correct dates and ids
+    y_test_predicted = X_test[["date", "id"]].copy()
+    y_test_predicted["value"] = model.predict(X_test.iloc[:, 2:])
 
-    return predicted
+    return y_test_predicted
diff --git a/crunch/demo-project/requirements.txt b/crunch/demo-project/requirements.txt
@@ -1,5 +1,3 @@
 pandas
-numpy
-scikit-learn
 xgboost
 joblib