From a273e92c369a10b859eb19600f67053056bd1a8d Mon Sep 17 00:00:00 2001 From: Stas Kirillov Date: Fri, 30 Oct 2020 19:22:24 +0300 Subject: [PATCH] proper odsc notebook --- events/2020_odsc_west/text_features.ipynb | 2047 +++++++++++++++++++++ 1 file changed, 2047 insertions(+) create mode 100644 events/2020_odsc_west/text_features.ipynb diff --git a/events/2020_odsc_west/text_features.ipynb b/events/2020_odsc_west/text_features.ipynb new file mode 100644 index 0000000..5bf97e3 --- /dev/null +++ b/events/2020_odsc_west/text_features.ipynb @@ -0,0 +1,2047 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "DnLV1HUefFtW" + }, + "source": [ + "# ODSC WEST 2020: Text Features In CatBoost\n", + "## Part 0 - CatBoost installation and notebook run" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0UAHpnD8fFtZ" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/events/2020_odsc_west/text_features.ipynb)\n", + "\n", + "**Set GPU as hardware accelerator**\n", + "\n", + "First of all, you need to select GPU as hardware accelerator. There are two simple steps to do so:\n", + "Step 1. Navigate to **Runtime** menu and select **Change runtime type**\n", + "Step 2. Choose **GPU** as hardware accelerator.\n", + "That's all!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9FM0IRyi8NOw" + }, + "source": [ + "Let's install CatBoost." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 361 + }, + "colab_type": "code", + "id": "TpJdgt63fSOv", + "outputId": "d62a776e-f741-4192-b919-91903ea0441b" + }, + "outputs": [], + "source": [ + "# Install essentials:\n", + "!pip install -U catboost matplotlib sklearn shap\n", + "\n", + "# Install widgets\n", + "!pip install -U ipywidgets\n", + "!jupyter nbextension enable --py widgetsnbextension" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import catboost\n", + "print(catboost.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part 1: CatBoost base tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "np.set_printoptions(precision=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from catboost.datasets import msrank_10k\n", + "\n", + "# If you have \"URLError: SSL: CERTIFICATE_VERIFY_FAILED\" uncomment next two lines:\n", + "# import ssl\n", + "# ssl._create_default_https_context = ssl._create_unverified_context\n", + "\n", + "(train_df, test_df) = msrank_10k()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Label values extraction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y = train_df[0]\n", + "X = 
train_df.drop([0, 1], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ways to create Pool class. If you have a big dataset it is effective (in terms of time) to load data from file, instead of pandas Dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset_dir = './msrank_10k'\n", + "if not os.path.exists(dataset_dir):\n", + " os.makedirs(dataset_dir)\n", + "\n", + "train_df.to_csv(\n", + " os.path.join(dataset_dir, 'train.csv'),\n", + " index=False, sep=',', header=True\n", + ")\n", + "test_df.to_csv(\n", + " os.path.join(dataset_dir, 'test.csv'),\n", + " index=False, sep=',', header=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!head -2 msrank_10k/train.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from catboost.utils import create_cd\n", + "feature_names = dict(map(lambda i: (i, 'Feature ' + str(i)), range(train_df.shape[1] - 2)))\n", + " \n", + "create_cd(\n", + " label=0,\n", + " feature_names=feature_names,\n", + " auxiliary_columns=[1],\n", + " output_path=os.path.join(dataset_dir, 'train.cd')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from catboost import Pool\n", + "\n", + "pool1 = Pool(data=X, label=y)\n", + "\n", + "pool2 = Pool(\n", + " data=os.path.join(dataset_dir, 'train.csv'), \n", + " delimiter=',', \n", + " column_description=os.path.join(dataset_dir, 'train.cd'),\n", + " has_header=True,\n", + ")\n", + "\n", + "print('Dataset shape: {}\\n'.format(pool1.shape))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split your data into train and validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "data = train_test_split(X, y, train_size=0.8, random_state=0)\n", + "X_train, X_validation, y_train, y_validation = data\n", + "\n", + "train_pool = Pool(data=X_train, label=y_train)\n", + "validation_pool = Pool(data=X_validation, label=y_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from catboost import CatBoostRegressor\n", + "\n", + "model = CatBoostRegressor(\n", + " iterations=5,\n", + " learning_rate=0.1,\n", + ")\n", + "model.fit(train_pool, eval_set=validation_pool, verbose=False)\n", + "\n", + "print('Model is fitted: {}'.format(model.is_fitted()))\n", + "print('Model params:\\n{}'.format(model.get_params()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Stdout of the training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = CatBoostRegressor(\n", + " iterations=15,\n", + "# verbose=5,\n", + ")\n", + "model.fit(train_pool, eval_set=validation_pool);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Metrics calculation and graph plotting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = CatBoostRegressor(\n", + " iterations=200,\n", + " learning_rate=0.2,\n", + " custom_metric=['MAE', 
'R2']\n", + ")\n", + "\n", + "model.fit(\n", + " train_pool,\n", + " eval_set=validation_pool,\n", + " verbose=False,\n", + " plot=True\n", + ");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Best iteration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = CatBoostRegressor(\n", + " iterations=100,\n", + " eval_metric='MAE',\n", + " learning_rate=0.5,\n", + "# use_best_model=False\n", + ")\n", + "model.fit(\n", + " train_pool,\n", + " eval_set=validation_pool,\n", + " verbose=False,\n", + " plot=True\n", + ");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('Tree count: ' + str(model.tree_count_))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Grid Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pool = Pool(data=X_train, label=y_train)\n", + "model = CatBoostRegressor(iterations=10, eval_metric='MAE')\n", + "grid = {'learning_rate': [0.001, 0.01, 0.1], 'depth': [4, 5, 6]}\n", + "result = model.grid_search(grid, pool)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('Best parameters: {}\\n'.format(result['params']))\n", + "\n", + "msg = 'Mean MAE value on validation set per each iteration:\\n{}'\n", + "print(msg.format(np.round(result['cv_results']['test-MAE-mean'], 4)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.get_params()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.predict(validation_pool)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = CatBoostRegressor(iterations=100, eval_metric='MAE')\n", + "model.grid_search(grid, pool, plot=True, verbose=False);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "More about parameter tuning you can find in [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/hyperparameters_tuning/hyperparameters_tuning.ipynb)." 
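, + "\n", + "If the full grid is too slow to enumerate, `randomized_search` samples parameter combinations instead. A minimal sketch (reusing `grid` and `pool` from above; `n_iter=5` is a hypothetical choice for illustration, not from the original notebook):\n", + "\n", + "```python\n", + "model = CatBoostRegressor(iterations=10, eval_metric='MAE')\n", + "result = model.randomized_search(grid, pool, n_iter=5)  # samples 5 combinations from grid\n", + "print(result['params'])\n", + "```"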
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature importances" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prediction values change" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.get_feature_importance()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.get_feature_importance(prettified=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loss function change" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.get_feature_importance(\n", + " data=validation_pool, \n", + " type='LossFunctionChange',\n", + " prettified=True\n", + ").head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Shap values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = CatBoostRegressor(iterations=1000, learning_rate=0.1)\n", + "model.fit(\n", + " train_pool,\n", + " eval_set=validation_pool,\n", + " verbose=False,\n", + " plot=True\n", + ");\n", + "\n", + "\n", + "shap_values = model.get_feature_importance(\n", + " data=train_pool, \n", + " type='ShapValues'\n", + ")\n", + "\n", + "expected_value = shap_values[0,-1]\n", + "shap_values = shap_values[:,:-1]\n", + "\n", + "print(shap_values.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shap\n", + "\n", + "shap.initjs()\n", + "shap.force_plot(\n", + " expected_value,\n", + " shap_values[1,:],\n", + " feature_names=train_pool.get_feature_names()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap.force_plot(\n", + " expected_value,\n", + " shap_values[7,:],\n", + " feature_names=train_pool.get_feature_names()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shap.summary_plot(shap_values, X_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "More information about shap value usage you can find in [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/model_analysis/shap_values_tutorial.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tree Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import make_regression\n", + "\n", + "model = CatBoostRegressor(\n", + " iterations=2,\n", + " depth=2,\n", + " learning_rate=0.5,\n", + " boost_from_average=False)\n", + "\n", + "features, labels = make_regression(n_samples=50, n_features=4, n_informative=2)\n", + "\n", + "model.fit(features, labels, verbose=False);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This cell doesn't work without graphviz package\n", + "# You can install it by link https://graphviz.gitlab.io/download/\n", + "# Installation can take a lot of time. 
You can do it at home.\n", + "\n", + "from IPython.display import display\n", + "\n", + "display(model.plot_tree(0))\n", + "display(model.plot_tree(1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = [0, 0,0,0]\n", + "\n", + "raw_pred = model.predict([x])\n", + "print(raw_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Grow Policies: SymmetricTree, Depthwise, Lossguide" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = CatBoostRegressor(iterations=2, depth=3, grow_policy='SymmetricTree')\n", + "model.fit(features, labels, verbose=False);\n", + "display(model.plot_tree(0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "model = CatBoostRegressor(iterations=2, depth=3, grow_policy='Depthwise')\n", + "model.fit(features, labels, verbose=False);\n", + "display(model.plot_tree(0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = CatBoostRegressor(iterations=2, max_leaves=8, grow_policy='Lossguide')\n", + "model.fit(features, labels, verbose=False);\n", + "display(model.plot_tree(0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data uncertainty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = np.arange(-10,10, 0.0005, dtype=np.float32)\n", + "Y = np.arctan(X)\n", + "for i in range(len(X)):\n", + " Y[i] = np.random.normal(Y[i], np.sin(X[i]/1.5) ** 2 /2)\n", + "\n", + "model = CatBoostRegressor(iterations=1000, learning_rate=0.1, loss_function='RMSEWithUncertainty',\n", + " verbose=100, random_seed=0, border_count=1024, posterior_sampling=True)\n", + "Xs = X.reshape((len(X), 1))\n", + "\n", + "model.fit(Xs, Y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "vapred = model.virtual_ensembles_predict(Xs, prediction_type='TotalUncertainty', virtual_ensembles_count=5)\n", + "\n", + "plt.figure(figsize = (16,9))\n", + "\n", + "plt.plot(X, Y, '.', c='gray', markersize=2)\n", + "plt.plot(X, vapred[:, 0],c='magenta')\n", + "plt.plot(X, vapred[:, 0] + np.sqrt(vapred[:, 1] + vapred[:, 2]), c='blue')\n", + "plt.plot(X, vapred[:, 0] - np.sqrt(vapred[:, 1] + vapred[:, 2]), c='blue')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Snapshotting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm 'catboost_info/snapshot.bkp'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "model = CatBoostRegressor(\n", + " iterations=3000,\n", + " save_snapshot=True,\n", + " snapshot_file='snapshot.bkp',\n", + " snapshot_interval=1\n", + ")\n", + "\n", + "model.fit(train_pool, eval_set=validation_pool, verbose=100);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "model = CatBoostRegressor(iterations=10)\n", + "model.fit(train_pool, eval_set=validation_pool, verbose=False)\n", + 
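"# Save the trained model in CatBoost's binary format and as human-readable JSON\n", +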
"model.save_model('catboost_model.bin')\n", + "model.save_model('catboost_model.json', format='json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.load_model('catboost_model.bin')\n", + "print(model.get_params())\n", + "print(model.learning_rate_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Questions and answers?**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part 2: CatBoost Text features support" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "viF18QJqfFtd" + }, + "source": [ + "In this tutorial we will use dataset **Rotten Tomatoes Movie Reviews** from [Kaggle](https://www.kaggle.com) competition for our experiments. Data can be downloaded [here](https://www.kaggle.com/rpnuser8182/rotten-tomatoes/data)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "MNC1tP0UfFtd", + "outputId": "2c0abe55-df9c-4a0f-daa4-dc8c8d858f63" + }, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "np.set_printoptions(precision=4)\n", + "\n", + "import catboost\n", + "print(catboost.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "OkexL1k7fFti" + }, + "source": [ + "## Reading the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 284 + }, + "colab_type": "code", + "id": "m11CtnPEfFtj", + "outputId": "715d43f8-ab44-44e0-ebd5-5b4327be07b7" + }, + "outputs": [], + "source": [ + "from catboost.datasets import rotten_tomatoes\n", + "\n", + "train_df, test_df = rotten_tomatoes()\n", + "\n", + "train_df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8IeOEa1gfFtm" + }, + "source": [ + "### Features description \n", + "\n", + "|Id | Feature name | Description |\n", + "|---|-------------------|----------------------------------------------------------------------------------------------|\n", + "| 1 | ``id`` | unique movie id |\n", + "| 2 | ``synopsis`` | brief summary of the major points of a movie |\n", + "| 3 | ``rating_MPAA`` | film rating by MPAA rating system |\n", + "| 4 | ``genre`` | list of genres that are suitable for this film (e.g. Action, Adventure, Comedy,... 
|\n", + "| 5 | ``director`` | list of persons who direct the making of a film |\n", + "| 6 | ``writer`` | list of persons who write a screenplay |\n", + "| 7 | ``theater_date`` | the date when film was first shown to the public in cinema (string) |\n", + "| 8 | ``dvd_date`` | the date when film was released on DVD (string) |\n", + "| 9 | ``box_office`` | the amount of money raised by ticket sales (revenue) |\n", + "| 10 | ``runtime`` | film duration in minutes |\n", + "| 11 | ``studio`` | is a major entertainment company or motion picture company (20th Century Fox, Sony Pictures)|\n", + "| 12 | ``dvd_date_int`` | the date when film was released on DVD (converted to integer) |\n", + "| 13 | ``theater_date_int`` | the date when film was first shown to the public in cinema (converted to integer) |\n", + "| 14 | ``review`` | review of a movie, that was written by a critic |\n", + "| 15 | ``rating`` | float rating from 0 to 1 of the film according to the Rotten tomatoes web site |\n", + "| 16 | ``fresh`` | freshness of review - fresh or rotten |\n", + "| 17 | ``critic`` | name of reviewer |\n", + "| 18 | ``top_critic`` | binary feature, is reviewer a top critic or not |\n", + "| 19 | ``publisher`` | journal or website where the review was published |\n", + "| 20 | ``date`` | the date when critic publish review (string) |\n", + "| 21 | ``date_int`` | the date when critic publish review (converted to integer) |\n", + "| 22 | ``rating_10`` | integer rating from 0 to 10 of the film according to the critic |\n", + "\n", + "We mark as **auxiliary** columnns 'id' and 'rating', because they can be the reason of overfitting, 'theater_date','dvd_date','date' because we convert them into integers.\n", + "\n", + "We mark as **text** features 'synopsis' because it is short *text* description of a film, 'genre' because it is combination of categories (we know that strings have structure where words define categories), for example 'Action | Comedy | Adventure', 'director' and 'writer' features are included to the text features by the same reason, 'review' becuase it is a *text* summary of critic opinion.\n", + "\n", + "We mark as **categorical** features 'rating_MPAA', 'studio', 'fresh', 'critic', 'top_critic' and 'publisher' because they can not be splitted into the group of categorical features and feature values can not be compared.\n", + "\n", + "The other columns considered as **numeric**." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wJRY9YyVfFtl" + }, + "source": [ + "## Preparing the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Qy_gcs7qfFtn" + }, + "outputs": [], + "source": [ + "auxiliary_columns = ['id', 'theater_date', 'dvd_date', 'rating', 'date']\n", + "cat_features = ['rating_MPAA', 'studio', 'fresh', 'critic', 'top_critic', 'publisher']\n", + "text_features = ['synopsis', 'genre', 'director', 'writer', 'review']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WkV114UDfFtp" + }, + "outputs": [], + "source": [ + "def fill_na(df, features):\n", + " for feature in features:\n", + " df[feature].fillna('', inplace=True)\n", + "\n", + "def preprocess_data_part(data_part):\n", + " data_part = data_part.drop(auxiliary_columns, axis=1)\n", + "\n", + " fill_na(data_part, cat_features)\n", + " fill_na(data_part, text_features)\n", + "\n", + " X = data_part.drop(['rating_10'], axis=1)\n", + " y = data_part['rating_10']\n", + " return X, y\n", + "\n", + "X_train, y_train = preprocess_data_part(train_df)\n", + "X_test, y_test = preprocess_data_part(test_df)\n", + "\n", + "X_train_no_text = X_train.drop(text_features, axis=1)\n", + "X_test_no_text = X_test.drop(text_features, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 111 + }, + "colab_type": "code", + "id": "OfkxzEZXfFtr", + "outputId": "294c112f-e382-4f0c-8b53-28c3158aa721" + }, + "outputs": [], + "source": [ + "X_train_no_text.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "colab_type": "code", + "id": "CTq7w0U9fFtt", + "outputId": "c0b10680-d537-49c2-ef0c-4d5579b672f0" + }, + "outputs": [], + "source": [ + "from catboost import Pool\n", + "\n", + "train_pool_no_text = Pool(\n", + " X_train_no_text, y_train, \n", + " cat_features=cat_features, \n", + ")\n", + "\n", + "validation_pool_no_text = Pool(\n", + " X_test_no_text, y_test, \n", + " cat_features=cat_features, \n", + ")\n", + "\n", + "print('Train dataset shape: {}\\n'.format(train_pool_no_text.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "colab_type": "code", + "id": "VTi3eN58fFt6", + "outputId": "e694fed2-1341-45a3-c799-334b32fbc01e" + }, + "outputs": [], + "source": [ + "from catboost import CatBoostClassifier\n", + "\n", + "def fit_model(train_pool, validation_pool, **kwargs):\n", + " model = CatBoostClassifier(\n", + " iterations=1000,\n", + " learning_rate=0.05,\n", + " eval_metric='Accuracy',\n", + " task_type='GPU',\n", + " **kwargs\n", + " )\n", + "\n", + " return model.fit(\n", + " train_pool,\n", + " eval_set=validation_pool,\n", + " verbose=100,\n", + " )\n", + "\n", + "model_no_text = fit_model(train_pool_no_text, validation_pool_no_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QhF2RAAhfFuJ" + }, + "source": [ + "# Text Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 305 + }, + "colab_type": "code", + "id": "Aw0M5trY8Dmg", + "outputId": 
"bde6afe0-cf94-46a4-ae36-19bb5b6361e3" + }, + "outputs": [], + "source": [ + "train_pool = Pool(\n", + " X_train, y_train, \n", + " cat_features=cat_features,\n", + " text_features=text_features,\n", + ")\n", + "\n", + "validation_pool = Pool(\n", + " X_test, y_test, \n", + " cat_features=cat_features,\n", + " text_features=text_features,\n", + ")\n", + "\n", + "print('Train dataset shape: {}\\n'.format(train_pool.shape))\n", + "\n", + "model = fit_model(train_pool, validation_pool)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HsuS5qKnfFuQ" + }, + "outputs": [], + "source": [ + "def print_score_diff(first_model, second_model):\n", + " first_accuracy = first_model.best_score_['validation']['Accuracy']\n", + " second_accuracy = second_model.best_score_['validation']['Accuracy']\n", + "\n", + " gap = (second_accuracy - first_accuracy) / first_accuracy * 100\n", + "\n", + " print('{} vs {} ({:+.2f}%)'.format(first_accuracy, second_accuracy, gap))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "O-3uDpJafFuS", + "outputId": "9827b6fb-4408-4725-f267-f38a6ee642ba" + }, + "outputs": [], + "source": [ + "print_score_diff(model_no_text, model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ym-fEV-mfFuU" + }, + "source": [ + "Note!\n", + "\n", + "1. Text features also cannot contain NaN values, so we converted them into strings manually.\n", + "2. The training may be performed only with classification losses and targets." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "IiHpTGfbfFuV" + }, + "source": [ + "## How it works?\n", + "\n", + "1. **Text Tokenization**\n", + "2. **Dictionary Creation**\n", + "3. **Feature Calculation**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "MszSnbqH8NR3" + }, + "source": [ + "## Text Tokenization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mOBGuexjb8tr" + }, + "source": [ + "Usually we get our text as a sequence of Unicode symbols. So, if the task isn't a DNA classification we don't need such granularity, moreover, we need to extract more complicated entities, e.g. words. The process of extraction tokens -- words, numbers, punctuation symbols or special symbols which defines emoji from a sequence is called **tokenization**.
\n", + "\n", + "Tokenization is the first part of text preprocessing in CatBoost and performed as a simple splitting a sequence on a string pattern (e.g. space)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NAeELULufFuV" + }, + "outputs": [], + "source": [ + "text_small = [\n", + " \"Cats are so cute :)\",\n", + " \"Mouse scare...\",\n", + " \"The cat defeated the mouse\",\n", + " \"Cute: Mice gather an army!\",\n", + " \"Army of mice defeated the cat :(\",\n", + " \"Cat offers peace\",\n", + " \"Cat is scared :(\",\n", + " \"Cat and mouse live in peace :)\"\n", + "]\n", + "\n", + "target_small = [1, 0, 1, 1, 0, 1, 0, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 161 + }, + "colab_type": "code", + "id": "E21CQ8ocfFuX", + "outputId": "f78b995b-29fc-41c9-b28c-b3adee167ba7" + }, + "outputs": [], + "source": [ + "from catboost.text_processing import Tokenizer\n", + "\n", + "simple_tokenizer = Tokenizer()\n", + "\n", + "def tokenize_texts(texts):\n", + " return [simple_tokenizer.tokenize(text) for text in texts]\n", + "\n", + "simple_tokenized_text = tokenize_texts(text_small)\n", + "simple_tokenized_text" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ChZQ5cpJfFuZ" + }, + "source": [ + "### More preprocessing!\n", + "\n", + "Lets take a closer look on the tokenization result of small text example -- the tokens contains a lot of mistakes:\n", + "\n", + "1. They are glued with punctuation 'Cute:', 'army!', 'skare...'.\n", + "2. The words 'Cat' and 'cat', 'Mice' and 'mice' seems to have same meaning, perhaps they should be the same tokens.\n", + "3. The same problem with tokens 'are'/'is' -- they are inflected forms of same token 'be'.\n", + "\n", + "**Punctuation handling**, **lowercasing**, and **lemmatization** processes help to overcome these problems." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qaoTjEmR8NSM" + }, + "source": [ + "### Punctuation handling and lowercasing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 161 + }, + "colab_type": "code", + "id": "6cPpYpmtfFuZ", + "outputId": "2bc7abef-5828-43af-d588-48edb490eed9" + }, + "outputs": [], + "source": [ + "tokenizer = Tokenizer(\n", + " lowercasing=True,\n", + " separator_type='BySense',\n", + " token_types=['Word', 'Number']\n", + ")\n", + "\n", + "tokenized_text = [tokenizer.tokenize(text) for text in text_small]\n", + "tokenized_text" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "JDhBkZzJfFua" + }, + "source": [ + "### Removing stop words\n", + "\n", + "**Stop words** - the words that are considered to be uninformative in this task, e.g. function words such as *the, is, at, which, on*.\n", + "Usually stop words are removed during text preprocessing to reduce the amount of information that is considered for further algorithms.\n", + "Stop words are collected manually (in dictionary form) or automatically, for example taking the most frequent words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 161 + }, + "colab_type": "code", + "id": "d1MYzKgTfFub", + "outputId": "865f655e-0cb9-4626-9d40-e459b9487b0f" + }, + "outputs": [], + "source": [ + "stop_words = set(('be', 'is', 'are', 'the', 'an', 'of', 'and', 'in'))\n", + "\n", + "def filter_stop_words(tokens):\n", + " return list(filter(lambda x: x not in stop_words, tokens))\n", + " \n", + "tokenized_text_no_stop = [filter_stop_words(tokens) for tokens in tokenized_text]\n", + "tokenized_text_no_stop" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "vxofPVc1fFuc" + }, + "source": [ + "### Lemmatization\n", + "\n", + "A lemma (Wikipedia) is the canonical form, dictionary form, or citation form of a set of words.
\n", + "For example, the lemma \"go\" represents the inflected forms \"go\", \"goes\", \"going\", \"went\", and \"gone\".
\n", + "The process of convertation word to its lemma called **lemmatization**.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 89 + }, + "colab_type": "code", + "id": "HWrijpMGfFud", + "outputId": "1b6b8015-8cf9-47c5-89cf-5d5fc8b5f794" + }, + "outputs": [], + "source": [ + "import nltk\n", + "\n", + "nltk_data_path = os.path.join(os.path.dirname(nltk.__file__), 'nltk_data')\n", + "nltk.data.path.append(nltk_data_path)\n", + "nltk.download('wordnet', nltk_data_path)\n", + "\n", + "lemmatizer = nltk.stem.WordNetLemmatizer()\n", + "\n", + "def lemmatize_tokens_nltk(tokens):\n", + " return list(map(lambda t: lemmatizer.lemmatize(t), tokens))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 161 + }, + "colab_type": "code", + "id": "XfyhV9ONfFuf", + "outputId": "4b0568c9-3bb8-483a-8f86-dd358c6fd2c5" + }, + "outputs": [], + "source": [ + "text_small_lemmatized_nltk = [lemmatize_tokens_nltk(tokens) for tokens in tokenized_text_no_stop]\n", + "text_small_lemmatized_nltk" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "y63KVna4fFui" + }, + "source": [ + "Now words with same meaning represented by the same token, tokens are not glued with punctuation.\n", + "\n", + "Be carefull. You should verify for your own task:
\n", + "Is it realy necessary to remove punctuation, lowercasing sentences or performing a lemmatization and/or by word tokenization?
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qFWoSX-kfFui" + }, + "source": [ + "### Let's check up accuracy with new text preprocessing\n", + "\n", + "Since CatBoost doesn't perform spacing punctuation, lowercasing letters and lemmatization, we need to preprocess text manually and then pass it to learning algorithm.\n", + "\n", + "Since the natural text features is only synopsis and review, we will preprocess only them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "colab_type": "code", + "id": "ZHL3x7NwfFuj", + "outputId": "85135452-02ea-4644-882d-726fcc568605" + }, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "def preprocess_data(X):\n", + " X_preprocessed = X.copy()\n", + " for feature in ['synopsis', 'review']:\n", + " X_preprocessed[feature] = X[feature].apply(lambda x: ' '.join(lemmatize_tokens_nltk(tokenizer.tokenize(x))))\n", + " return X_preprocessed\n", + "\n", + "X_preprocessed_train = preprocess_data(X_train)\n", + "X_preprocessed_test = preprocess_data(X_test)\n", + "\n", + "train_processed_pool = Pool(\n", + " X_preprocessed_train, y_train, \n", + " cat_features=cat_features,\n", + " text_features=text_features,\n", + ")\n", + "\n", + "validation_processed_pool = Pool(\n", + " X_preprocessed_test, y_test, \n", + " cat_features=cat_features,\n", + " text_features=text_features,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "colab_type": "code", + "id": "0jJJSrFJfFuk", + "outputId": "6baeef42-d430-4793-fc33-556095416a9b" + }, + "outputs": [], + "source": [ + "model_on_processed_data = fit_model(train_processed_pool, validation_processed_pool)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "AXDdPAgyfFum", + "outputId": "61e26e81-b858-4675-ab58-aaf3384428ae" + }, + "outputs": [], + "source": [ + "print_score_diff(model, model_on_processed_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "CJr7fXN7fFun" + }, + "source": [ + "## Dictionary Creation\n", + "\n", + "After the first stage, preprocessing of text and tokenization, the second stage starts. The second stage uses the prepared text to select a set of units, which will be used for building new numerical features.\n", + "\n", + "A set of selected units is called dictionary. It might contain words, word bigramms, or character n-gramms." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "D6H1MXf9fFuo" + }, + "outputs": [], + "source": [ + "from catboost.text_processing import Dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Rn402k78fFuq" + }, + "outputs": [], + "source": [ + "dictionary = Dictionary(occurence_lower_bound=0, max_dictionary_size=10)\n", + "\n", + "dictionary.fit(text_small_lemmatized_nltk);\n", + "#dictionary.fit(text_small, tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + }, + "colab_type": "code", + "id": "KJr0UBzOfFur", + "outputId": "4ab23b42-0fb7-4ac4-c878-63da839c8635" + }, + "outputs": [], + "source": [ + "dictionary.save('dictionary.tsv')\n", + "!cat dictionary.tsv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "U1wLb5MX8NTY" + }, + "source": [ + "## Feature Calculation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KYzNqXgcfFut" + }, + "source": [ + "### Conversion into fixed-size vectors\n", + "\n", + "The majority of classic ML algorithms compute predictions on a fixed number of features $F$.
\n", + "That means that learning set $X = \\{x_i\\}$ contains vectors $x_i = (a_0, a_1, ..., a_F)$ where $F$ is constant.\n", + "\n", + "Since text object $x$ is not a fixed length vector, we need to perform preprocessing of the origin set $D$.
\n", + "One of the simplest text to vector encoding technique is **Bag of words (BoW)**.\n", + "\n", + "### Bag of words algorithm\n", + "\n", + "The algorithm takes in a dictionary and a text.
\n", + "During the algorithm text $x = (a_0, a_1, ..., a_k)$ converted into vector $\\tilde x = (b_0, b_1, ..., b_F)$,
where $b_i$ is 0 or 1, depending on whether the word with id=$i$ from the dictionary occurs in the text $x$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 161 + }, + "colab_type": "code", + "id": "7Ea944JbfFuu", + "outputId": "5f788c52-345c-4703-957a-4f57dd29c418" + }, + "outputs": [], + "source": [ + "text_small_lemmatized_nltk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "bRm5Cf5qkzlJ", + "outputId": "6226eea1-ab2b-4924-df6c-a006e71965f5" + }, + "outputs": [], + "source": [ + "dictionary.apply([text_small_lemmatized_nltk[0]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 305 + }, + "colab_type": "code", + "id": "ga0AfpT8fFuv", + "outputId": "6b6e9abb-3e2a-4a8e-eac9-dacbac3c33fd" + }, + "outputs": [], + "source": [ + "def bag_of_words(tokenized_text, dictionary):\n", + " features = np.zeros((len(tokenized_text), dictionary.size))\n", + " for i, tokenized_sentence in enumerate(tokenized_text):\n", + " indices = np.array(dictionary.apply([tokenized_sentence])[0])\n", + " features[i, indices] = 1\n", + " return features\n", + "\n", + "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n", + "bow_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vhr-EyPyfFuy" + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from scipy.sparse import csr_matrix\n", + "from sklearn.metrics import log_loss\n", + "\n", + "def fit_linear_model(X, c):\n", + " model = LogisticRegression()\n", + " model.fit(X, c)\n", + " return model\n", + "\n", + "def fit_naive_bayes(X, c):\n", + " clf = MultinomialNB()\n", + " if isinstance(X, csr_matrix):\n", + " X.eliminate_zeros()\n", + " clf.fit(X, c)\n", + " return clf\n", + "\n", + "def evaluate_model_logloss(model, X, y):\n", + " y_pred = model.predict_proba(X)[:,1]\n", + " metric = log_loss(y, y_pred)\n", + " print('Logloss: ' + str(metric))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 125 + }, + "colab_type": "code", + "id": "GekNCx5ofFuz", + "outputId": "5b218b73-c7fd-4628-f218-29d0d30686eb" + }, + "outputs": [], + "source": [ + "def evaluate_models(X, y):\n", + " linear_model = fit_linear_model(X, y)\n", + " naive_bayes = fit_naive_bayes(X, y)\n", + " \n", + " print('Linear model')\n", + " evaluate_model_logloss(linear_model, X, y)\n", + " print('Naive bayes')\n", + " evaluate_model_logloss(naive_bayes, X, y)\n", + " print('Comparing to constant prediction')\n", + " logloss_constant_prediction = log_loss(y, np.ones(shape=(len(y), 2)) * 0.5)\n", + " print('Logloss: ' + str(logloss_constant_prediction))\n", + " \n", + "evaluate_models(bow_features, target_small)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 125 + }, + "colab_type": "code", + "id": "uFsAWNE9fFu2", + "outputId": "7197acdf-71ac-4c81-b507-4f06cafdbea8" + }, + "outputs": [], + "source": [ + "dictionary = 
Dictionary(occurence_lower_bound=0)\n", + "dictionary.fit(text_small_lemmatized_nltk)\n", + "\n", + "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n", + "evaluate_models(bow_features, target_small)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "yvjUACB_fFu6" + }, + "source": [ + "### Looking at sequences of letters / words\n", + "\n", + "Let's look at an example: the texts 'The cat defeated the mouse' and 'Army of mice defeated the cat :('
\n", + "Simplifying it we have three tokens in each sentence 'cat defeat mouse' and 'mouse defeat cat'.
\n", + "After applying BoW we get two equal vectors with the opposite meaning:\n", + "\n", + "| cat | mouse | defeat |\n", + "|-----|-------|--------|\n", + "| 1 | 1 | 1 |\n", + "| 1 | 1 | 1 |\n", + "\n", + "How to distinguish them?\n", + "Lets add sequences of words as a single tokens into our dictionary:\n", + "\n", + "| cat | mouse | defeat | cat_defeat | mouse_defeat | defeat_cat | defeat_mouse |\n", + "|-----|-------|--------|------------|--------------|------------|--------------|\n", + "| 1 | 1 | 1 | 1 | 0 | 0 | 1 |\n", + "| 1 | 1 | 1 | 0 | 1 | 1 | 0 |\n", + "\n", + "**N-gram** is a continguous sequence of $n$ items from a given sample of text or speech (Wikipedia).
\n", + "In example above Bi-gram (Bigram) = 2-gram of words.\n", + "\n", + "Ngrams help to add into vectors more information about text structure, moreover there are n-grams has no meanings in separation, for example, 'Mickey Mouse company'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 379 + }, + "colab_type": "code", + "id": "WU6iWFPZClrf", + "outputId": "b666b9a2-0782-472a-a729-0fa1b15bd9f2" + }, + "outputs": [], + "source": [ + "dictionary = Dictionary(occurence_lower_bound=0, gram_order=2)\n", + "dictionary.fit(text_small_lemmatized_nltk)\n", + "\n", + "dictionary.save('dictionary.tsv')\n", + "!cat dictionary.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 125 + }, + "colab_type": "code", + "id": "ypPTi_XXfFu7", + "outputId": "59136696-c457-4f99-b884-cf1e2e68fb80" + }, + "outputs": [], + "source": [ + "bow_features = bag_of_words(text_small_lemmatized_nltk, dictionary)\n", + "evaluate_models(bow_features, target_small)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "1uLlIfJHodEL" + }, + "source": [ + "### Unigram + Bigram" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 125 + }, + "colab_type": "code", + "id": "XaRC74kNfFu8", + "outputId": "f67a5ea4-0795-4b16-db80-2bff733109e9" + }, + "outputs": [], + "source": [ + "dictionary1 = Dictionary(occurence_lower_bound=0)\n", + "dictionary1.fit(text_small_lemmatized_nltk)\n", + "\n", + "bow_features1 = bag_of_words(text_small_lemmatized_nltk, dictionary1)\n", + "\n", + "dictionary2 = Dictionary(occurence_lower_bound=0, gram_order=2)\n", + "dictionary2.fit(text_small_lemmatized_nltk)\n", + "\n", + "bow_features2 = bag_of_words(text_small_lemmatized_nltk, dictionary2)\n", + "\n", + "bow_features = np.concatenate((bow_features1, bow_features2), axis=1)\n", + "evaluate_models(bow_features, target_small)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "oFR_rMfH8NT_" + }, + "source": [ + "## CatBoost Configuration" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8xoFAOiz8NT_" + }, + "source": [ + "Parameter names:\n", + "\n", + "1. **Text Tokenization** - `tokenizers`\n", + "2. **Dictionary Creation** - `dictionaries`\n", + "3. 
**Feature Calculation** - `feature_calcers`\n", + "\n", + "\\* More complex configuration with `text_processing` parameter" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Wntt3XrYgkhf" + }, + "source": [ + "### `tokenizers`\n", + "\n", + "Tokenizers used to preprocess Text type feature columns before creating the dictionary.\n", + "\n", + "[Documentation](https://catboost.ai/docs/references/tokenizer_options.html).\n", + "\n", + "```\n", + "tokenizers = [{\n", + "\t'tokenizer_id': 'Space',\n", + "\t'delimiter': ' ',\n", + "\t'separator_type': 'ByDelimiter',\n", + "},{\n", + "\t'tokenizer_id': 'Sense',\n", + "\t'separator_type': 'BySense',\n", + "}]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "aKqHyav7fFu-" + }, + "source": [ + "### `dictionaries`\n", + "\n", + "Dictionaries used to preprocess Text type feature columns.\n", + "\n", + "[Documentation](https://catboost.ai/docs/references/dictionaries_options.html).\n", + "\n", + "```\n", + "dictionaries = [{\n", + "\t'dictionary_id': 'Unigram',\n", + "\t'max_dictionary_size': '50000',\n", + "\t'gram_count': '1',\n", + "},{\n", + "\t'dictionary_id': 'Bigram',\n", + "\t'max_dictionary_size': '50000',\n", + "\t'gram_count': '2',\n", + "},{\n", + "\t'dictionary_id': 'Trigram',\n", + "\t'token_level_type': 'Letter',\n", + "\t'max_dictionary_size': '50000',\n", + "\t'gram_count': '3',\n", + "}]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "JT6I_LN98NUC" + }, + "source": [ + "### `feature_calcers`\n", + "\n", + "Feature calcers used to calculate new features based on preprocessed Text type feature columns.\n", + "\n", + "1. **`BoW`**
\n", + "Bag of words: 0/1 features (text sample has or not token_id).
\n", + "Number of produced numeric features = dictionary size.
\n", + "Parameters: `top_tokens_count` - maximum number of tokens that will be used for vectorization in bag of words, the most frequent $n$ tokens are taken (**highly affect both on CPU ang GPU RAM usage**).\n", + "\n", + "2. **`NaiveBayes`**
\n", + "NaiveBayes: [Multinomial naive bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes) model. As many new features as classes are added. This feature is calculated by analogy with counters in CatBoost by permutation ([estimation of CTRs](https://catboost.ai/docs/concepts/algorithm-main-stages_cat-to-numberic.html)). In other words, a random permutation is made and then we go from top to bottom on the dataset and calculate the probability of its belonging to this class for each object.\n", + "\n", + "3. **`BM25`**
\n", + "[BM25](https://en.wikipedia.org/wiki/Okapi_BM25). As many new features as classes are added. The idea is the same as in Naive Bayes, but for each class we calculate not the conditional probability, but a certain relevance, which is similar to tf-idf, where the tokens instead of the words and the classes instead of the documents (or rather, the unification of all texts of this class). Only the tf multiplier in BM25 is replaced with another multiplier, which gives an advantage to classes that contain rare tokens.\n", + "\n", + "```\n", + "feature_calcers = [\n", + "\t'BoW:top_tokens_count=1000',\n", + "\t'NaiveBayes',\n", + "\t'BM25',\n", + "]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "02lH5f1PgpYM" + }, + "source": [ + "### `text_processing`\n", + "\n", + "```\n", + "text_processing = {\n", + " \"tokenizers\" : [{\n", + " \"tokenizer_id\" : \"Space\",\n", + " \"separator_type\" : \"ByDelimiter\",\n", + " \"delimiter\" : \" \"\n", + " }],\n", + "\n", + " \"dictionaries\" : [{\n", + " \"dictionary_id\" : \"BiGram\",\n", + " \"max_dictionary_size\" : \"50000\",\n", + " \"occurrence_lower_bound\" : \"3\",\n", + " \"gram_order\" : \"2\"\n", + " }, {\n", + " \"dictionary_id\" : \"Word\",\n", + " \"max_dictionary_size\" : \"50000\",\n", + " \"occurrence_lower_bound\" : \"3\",\n", + " \"gram_order\" : \"1\"\n", + " }],\n", + "\n", + " \"feature_processing\" : {\n", + " \"default\" : [{\n", + " \"dictionaries_names\" : [\"BiGram\", \"Word\"],\n", + " \"feature_calcers\" : [\"BoW\"],\n", + " \"tokenizers_names\" : [\"Space\"]\n", + " }, {\n", + " \"dictionaries_names\" : [\"Word\"],\n", + " \"feature_calcers\" : [\"NaiveBayes\"],\n", + " \"tokenizers_names\" : [\"Space\"]\n", + " }],\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "colab_type": "code", + "id": "-HOhMr-ffFu_", + "outputId": "d28394e4-fb6e-4a63-9090-8860a3a27333" + }, + "outputs": [], + "source": [ + "model_on_processed_data_2 = fit_model(\n", + " train_processed_pool,\n", + " validation_processed_pool,\n", + " text_processing = {\n", + " \"tokenizers\" : [{\n", + " \"tokenizer_id\" : \"Space\",\n", + " \"separator_type\" : \"ByDelimiter\",\n", + " \"delimiter\" : \" \"\n", + " }],\n", + " \n", + " \"dictionaries\" : [{\n", + " \"dictionary_id\" : \"BiGram\",\n", + " \"max_dictionary_size\" : \"50000\",\n", + " \"occurrence_lower_bound\" : \"3\",\n", + " \"gram_order\" : \"2\"\n", + " }, {\n", + " \"dictionary_id\" : \"Word\",\n", + " \"max_dictionary_size\" : \"50000\",\n", + " \"occurrence_lower_bound\" : \"3\",\n", + " \"gram_order\" : \"1\"\n", + " }],\n", + " \n", + " \"feature_processing\" : {\n", + " \"default\" : [{\n", + " \"dictionaries_names\" : [\"BiGram\", \"Word\"],\n", + " \"feature_calcers\" : [\"BoW\"],\n", + " \"tokenizers_names\" : [\"Space\"]\n", + " }, {\n", + " \"dictionaries_names\" : [\"Word\"],\n", + " \"feature_calcers\" : [\"NaiveBayes\"],\n", + " \"tokenizers_names\" : [\"Space\"]\n", + " }],\n", + " }\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "HFJRD9RofFvC", + "outputId": "08f27541-75fe-4c0e-dd88-3b6e9a716035" + }, + "outputs": [], + "source": [ + "print_score_diff(model_no_text, model_on_processed_data_2)" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": { + "colab_type": "text", + "id": "xlo77dzufFvE" + }, + "source": [ + "# Summary: Text features in CatBoost\n", + "\n", + "### The algorithm:\n", + "1. Input text is loaded as a usual column. ``text_column: [string]``.\n", + "2. Each text sample is tokenized via splitting by space. ``tokenized_column: [[string]]``.\n", + "3. Dictionary estimation.\n", + "4. Each string in tokenized column is converted into token_id from dictionary. ``text: [[token_id]]``.\n", + "5. Depending on the parameters CatBoost produce features basing on the resulting text column: Bag of words, Multinomial naive bayes or Bm25.\n", + "6. Computed float features are passed into the usual CatBoost learning algorithm." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}