Skip to content

Commit

Permalink
Fix 2020_06_04 catboost tutorial
Browse files Browse the repository at this point in the history
Note: mandatory check (NEED_CHECK) was skipped
ref:fdfe4258918442af8f0eec5e5b60c005eeaee7a9
  • Loading branch information
nikitxskv committed Jun 4, 2020
1 parent 2514389 commit 25c7f81
Showing 1 changed file with 31 additions and 54 deletions.
85 changes: 31 additions & 54 deletions events/2020_06_04_catboost_tutorial/catboost_features.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -191,27 +191,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset Quantization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Features quantization. It is effective to quantize features single time before several trainings."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pool1.quantize(\n",
" border_count=254,\n",
" feature_border_type='GreedyLogSum',\n",
" per_float_feature_quantization=['0:border_count=1024']\n",
")"
"## Split your data into train and validation"
]
},
{
Expand All @@ -220,31 +200,27 @@
"metadata": {},
"outputs": [],
"source": [
"from catboost.utils import quantize\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"pool2 = quantize(\n",
" data_path=os.path.join(dataset_dir, 'train.csv'),\n",
" delimiter=',',\n",
" column_description=os.path.join(dataset_dir, 'train.cd'),\n",
" has_header=True,\n",
")"
"data = train_test_split(X, y, train_size=0.8, random_state=0)\n",
"X_train, X_validation, y_train, y_validation = data\n",
"\n",
"train_pool = Pool(data=X_train, label=y_train)\n",
"validation_pool = Pool(data=X_validation, label=y_validation)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can dump borders and use it for another dataset."
"## Dataset Quantization"
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"pool2.save_quantization_borders('borders.tsv')\n",
"!head -20 borders.tsv"
"Features quantization. It is effective to quantize features single time before several trainings."
]
},
{
Expand All @@ -253,15 +229,14 @@
"metadata": {},
"outputs": [],
"source": [
"pool1 = Pool(data=X, label=y)\n",
"pool1.quantize(input_borders='borders.tsv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Split your data into train and validation"
"train_pool.quantize(\n",
" border_count=254,\n",
" # per_float_feature_quantization=['0:border_count=1024']\n",
")\n",
"\n",
"train_pool.save_quantization_borders('borders.tsv')\n",
"\n",
"validation_pool.quantize(input_borders='borders.tsv')"
]
},
{
Expand All @@ -270,13 +245,14 @@
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"data = train_test_split(X, y, train_size=0.8, random_state=0)\n",
"X_train, X_validation, y_train, y_validation = data\n",
"from catboost.utils import quantize\n",
"\n",
"train_pool = Pool(data=X_train, label=y_train)\n",
"validation_pool = Pool(data=X_validation, label=y_validation)"
"pool2 = quantize(\n",
" data_path=os.path.join(dataset_dir, 'train.csv'),\n",
" delimiter=',',\n",
" column_description=os.path.join(dataset_dir, 'train.cd'),\n",
" has_header=True,\n",
")"
]
},
{
Expand Down Expand Up @@ -402,9 +378,10 @@
"metadata": {},
"outputs": [],
"source": [
"pool = Pool(data=X_train, label=y_train)\n",
"model = CatBoostRegressor(iterations=10, eval_metric='MAE')\n",
"grid = {'learning_rate': [0.001, 0.01, 0.1, 1.0, 10.0], 'depth': [4, 5, 6]}\n",
"result = model.grid_search(grid, train_pool)"
"grid = {'learning_rate': [0.001, 0.01, 0.1], 'depth': [4, 5, 6]}\n",
"result = model.grid_search(grid, pool)"
]
},
{
Expand Down Expand Up @@ -446,7 +423,7 @@
"outputs": [],
"source": [
"model = CatBoostRegressor(iterations=100, eval_metric='MAE')\n",
"model.grid_search(grid, train_pool, plot=True, verbose=False);"
"model.grid_search(grid, pool, plot=True, verbose=False);"
]
},
{
Expand Down Expand Up @@ -628,7 +605,7 @@
"source": [
"x = [0, 7, 2]\n",
"\n",
"raw_pred = model.predict([x], prediction_type='RawFormulaVal')\n",
"raw_pred = model.predict([x])\n",
"print(raw_pred)"
]
},
Expand Down Expand Up @@ -658,7 +635,7 @@
},
"outputs": [],
"source": [
"model = CatBoostRegressor(iterations=2, depth=4, grow_policy='Lossguide')\n",
"model = CatBoostRegressor(iterations=2, max_leaves=16, grow_policy='Lossguide')\n",
"model.fit(features, labels, verbose=False);\n",
"display(model.plot_tree(0))"
]
Expand Down

0 comments on commit 25c7f81

Please sign in to comment.