Automatic Feature Selection with Featurewiz (cmu-db#54)
This PR modifies the training pipeline so that input features can be derived with `featurewiz` (https://github.com/AutoViML/featurewiz/). `featurewiz` is a tool for selecting high-signal features from given input/test datasets, and it also automatically explores feature combinations (e.g., `column1 * column2`).

Validating the selected features and/or selecting the best model from a set of model/hyperparameter candidates is deferred to a future PR. This PR just brings in the infrastructure to run `featurewiz` and train/infer models based on derived features.
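For context, a minimal sketch of how `featurewiz` might be invoked on a training dataframe, assuming the upstream `featurewiz` API; the target column name, file name, and parameter values are illustrative and not necessarily the pipeline's actual configuration:

```python
import pandas as pd
from featurewiz import featurewiz

# Hypothetical training data: raw OU features plus a target column.
train_df = pd.read_csv("ou_train.csv")

# featurewiz returns the selected feature names and a transformed dataframe.
# feature_engg="interactions" asks featurewiz to explore feature combinations
# such as column1 * column2; corr_limit prunes highly correlated features.
selected_features, train_transformed = featurewiz(
    dataname=train_df,
    target="elapsed_us",
    corr_limit=0.70,
    feature_engg="interactions",
    verbose=0,
)
print(selected_features)
```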
17zhangw authored Mar 11, 2022
1 parent ff5ce7c commit bca97d7
Showing 18 changed files with 401 additions and 190 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yaml
@@ -12,6 +12,8 @@ jobs:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
with:
submodules: 'recursive'
- uses: actions/setup-python@v2
with:
python-version: '3.8'
5 changes: 5 additions & 0 deletions .gitmodules
@@ -0,0 +1,5 @@
[submodule "featurewiz"]
path = behavior/modeling/featurewiz
url = https://github.com/17zhangw/featurewiz.git
branch = stable
ignore = dirty
8 changes: 5 additions & 3 deletions README.md
@@ -4,11 +4,13 @@ This repository contains the pilot components for the [NoisePage DBMS](https://n

## Quickstart

1. Install necessary packages.
1. Init all submodules `git submodule update --init --recursive`.
2. Install necessary packages.
- `cd behavior/modeling/featurewiz && pip3 install --upgrade -r requirements.txt`
- `pip3 install --upgrade -r requirements.txt`
2. List all the tasks.
3. List all the tasks.
- `doit list`
3. Select and run a doit task from the task list, e.g. `doit action_recommendation`. Task dependencies are executed automatically.
4. Select and run a doit task from the task list, e.g. `doit action_recommendation`. Task dependencies are executed automatically.

## Sample Doit Tasks

64 changes: 20 additions & 44 deletions action/selection/open_spiel/database.cc
@@ -136,8 +136,6 @@ struct ExplainMicroserviceCost {

private:
void Augment(rapidjson::Document &doc, rapidjson::Document::Object &node) {
// TODO(WAN): Robustness for checking that node is a Plan from PostgreSQL's EXPLAIN (FORMAT JSON).

// Currently, the only augmentation done is differencing.
// Differencing refers to isolating a node's contribution to a feature by subtracting away children contributions.
// For example, for this plan:
@@ -152,17 +150,18 @@ struct ExplainMicroserviceCost {
if (node.HasMember("Plans")) {
for (auto &child_node : node["Plans"].GetArray()) {
rapidjson::Document::Object &&child = child_node.GetObject();
Augment(doc, child);
// Get the cost before we call Augment() on the child.
children_startup_cost += GetValue(child, "startup_cost").GetDouble();
children_total_cost += GetValue(child, "total_cost").GetDouble();
Augment(doc, child);
}
}
// Compute the diffed features.
double diffed_startup_cost = GetValue(node, "startup_cost").GetDouble() - children_startup_cost;
double diffed_total_cost = GetValue(node, "total_cost").GetDouble() - children_total_cost;
// Add the diffed features back to the JSON object.
node.AddMember("Diffed Startup Cost", diffed_startup_cost, doc.GetAllocator());
node.AddMember("Diffed Total Cost", diffed_total_cost, doc.GetAllocator());
GetValue(node, "startup_cost") = rapidjson::Value(diffed_startup_cost);
GetValue(node, "total_cost") = rapidjson::Value(diffed_total_cost);
}

void PrepareCostRequest(rapidjson::Document &target, rapidjson::Document::Object &node) {
@@ -172,51 +171,28 @@ struct ExplainMicroserviceCost {
rapidjson::Value features;
features.SetObject();

// TODO(WAN): Robustness for checking that node is a Plan from PostgreSQL's EXPLAIN (FORMAT JSON).
features.AddMember("bias", 1, target.GetAllocator());
features.AddMember("startup_cost", GetValue(node, "startup_cost").GetDouble(), target.GetAllocator());
features.AddMember("total_cost", GetValue(node, "total_cost").GetDouble(), target.GetAllocator());
features.AddMember("plan_rows", uint64_t(GetValue(node, "plan_rows").GetDouble()), target.GetAllocator());
features.AddMember("plan_width", GetValue(node, "plan_width").GetInt64(), target.GetAllocator());
features.AddMember("startup_cost", node["Diffed Startup Cost"].GetDouble(), target.GetAllocator());
features.AddMember("total_cost", node["Diffed Total Cost"].GetDouble(), target.GetAllocator());

// Determine which behavior model this is.
// Currently, names mostly match if you strip out spaces.
// TODO (Karthik): Currently the "tag" returns the OU name.
// Make the output from Hutch play nicely with this.
std::string model_name = std::regex_replace(node["node_type"].GetString(), std::regex("\\s+"), "");
// Model-specific hacks.
if (model_name == "Aggregate") {
model_name = "Agg";
} else if (model_name == "ModifyTable") {
// See PostgreSQL nodes.h/CmdType. At time of writing: UNKNOWN, SELECT, UPDATE, INSERT, DELETE, UTILITY, NOTHING.
// C standard guarantees that if you don't specify enum value, it starts at 0 and goes sequentially.
std::string opstr = node["Operation"].GetString();
std::string opnum = "-1";
if (opstr == "Select") {
opnum = "1";
} else if (opstr == "Update") {
opnum = "2";
} else if (opstr == "Insert") {
opnum = "3";
} else if (opstr == "Delete") {
opnum = "4";
for (rapidjson::Value::MemberIterator it = node.MemberBegin(); it != node.MemberEnd(); it++) {
// Pass all features provided by Hutch directly to model inference only if
// the attribute value is a numeric scalar.
if (!it->value.IsObject() && !it->value.IsArray() && !it->value.IsString()) {
// Explicitly invoke rapidjson::Value() copy constructor so we aren't moving
// elements from one node to another.
features.AddMember(rapidjson::Value(it->name, target.GetAllocator()),
rapidjson::Value(it->value, target.GetAllocator()), target.GetAllocator());
}

rapidjson::Value opnum_val(opnum.c_str(), target.GetAllocator());
features.AddMember("ModifyTable_operation", opnum_val, target.GetAllocator());
} else if (model_name == "NestedLoop") {
model_name = "NestLoop";
}

// Determine which behavior model type to use.
node_element.AddMember("model_type", "rf", target.GetAllocator());
// Just kidding. No choice allowed, random forests only.
// TODO(WAN): More seriously, we should expose this as a parameter.

rapidjson::Value model_name_val(model_name.c_str(), target.GetAllocator());
node_element.AddMember("ou_type", model_name_val, target.GetAllocator());
// Determine which behavior model this is.
// TODO(wz2): Optionally also attach `node` so we can match predictions back to the plan tree.
auto node_member = node.FindMember("node_type");
assert(node_member != node.MemberEnd() && "Expected to find node_type member.");
node_element.AddMember("ou_type", rapidjson::Value(node_member->value, target.GetAllocator()),
target.GetAllocator());
node_element.AddMember("features", features, target.GetAllocator());
target.PushBack(node_element, target.GetAllocator());

@@ -284,8 +260,8 @@ struct ExplainMicroserviceCost {
return false;
}

const rapidjson::Value &GetValue(rapidjson::Value &node, std::string key) {
for (rapidjson::Value::ConstMemberIterator itr = node.MemberBegin(); itr != node.MemberEnd(); itr++) {
rapidjson::Value &GetValue(rapidjson::Value &node, std::string key) {
for (rapidjson::Value::MemberIterator itr = node.MemberBegin(); itr != node.MemberEnd(); itr++) {
if (EndsWith(itr->name.GetString(), key)) {
return itr->value;
}
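To make the differencing performed in `Augment()` above concrete, here is a minimal Python sketch of the same idea on a nested plan represented as a dict (the dict layout is illustrative only; the real code walks rapidjson objects produced by PostgreSQL's `EXPLAIN (FORMAT JSON)`):

```python
def augment(node: dict) -> None:
    """Replace each node's costs with the node's own contribution
    (its cost minus the summed costs of its children)."""
    children_startup = 0.0
    children_total = 0.0
    for child in node.get("Plans", []):
        # Read the child's costs before recursing, since recursion
        # overwrites them with the child's diffed values.
        children_startup += child["startup_cost"]
        children_total += child["total_cost"]
        augment(child)
    node["startup_cost"] -= children_startup
    node["total_cost"] -= children_total

# Example: the parent's total_cost (10.0) includes its child's cost (4.0),
# so after augmenting, the parent's own contribution is 6.0.
plan = {"startup_cost": 1.0, "total_cost": 10.0,
        "Plans": [{"startup_cost": 0.5, "total_cost": 4.0}]}
augment(plan)
assert plan["total_cost"] == 6.0
```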
40 changes: 40 additions & 0 deletions behavior/__init__.py
@@ -145,3 +145,43 @@
"memory_bytes",
"elapsed_us",
]

# The set of columns to standardize the input data's columns on. A column that ends with any
# of the values defined below is renamed to the defined value.
STANDARDIZE_COLUMNS: list[str] = [
"query_id",
"plan_node_id",
"left_child_plan_node_id",
"right_child_plan_node_id",
"startup_cost",
"total_cost",
"start_time",
"end_time",
"statement_timestamp",
"pid",
]


def standardize_input_data(df):
"""
Standardizes input data for either model inference or for the data diff/model
training pipeline. Function remaps non-target input columns that are suffixed
by a column in STANDARDIZE_COLUMNS
Parameters
----------
df : pandas.DataFrame
Input data dataframe that needs to be remapped.
Remapping is done in-place.
"""

# Determine which columns to remap. A non-target input column that is suffixed by a
# column in STANDARDIZE_COLUMNS is remapped to produce a common schema.
remapper: dict[str, str] = {}
for init_col in df.columns:
if init_col not in STANDARDIZE_COLUMNS and init_col not in BASE_TARGET_COLS:
for common_col in STANDARDIZE_COLUMNS:
if init_col.endswith(common_col):
remapper[init_col] = common_col
break
df.rename(columns=remapper, inplace=True)
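As a usage illustration for the new `standardize_input_data()` helper, a minimal sketch with hypothetical OU-prefixed column names (assuming `elapsed_us` is among `BASE_TARGET_COLS`, so it is left untouched):

```python
import pandas as pd
from behavior import standardize_input_data

# Hypothetical raw columns: OU-prefixed names that end with a standard suffix,
# plus a target column that should not be renamed.
df = pd.DataFrame({
    "IndexScan_startup_cost": [0.25],
    "IndexScan_total_cost": [8.5],
    "elapsed_us": [120.0],
})
standardize_input_data(df)
print(df.columns.tolist())  # ['startup_cost', 'total_cost', 'elapsed_us']
```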
21 changes: 12 additions & 9 deletions behavior/microservice/app.py
@@ -6,11 +6,12 @@

import flask
import numpy as np
import pandas as pd
import setproctitle
from flask import Flask, g, jsonify, render_template, request, send_from_directory
from plumbum import cli

from behavior import BASE_TARGET_COLS
from behavior import BASE_TARGET_COLS, standardize_input_data
from behavior.modeling.model import BehaviorModel

app = Flask(__name__, static_url_path="")
@@ -51,14 +52,16 @@ def _infer_model(model_type, ou_type, features):
except KeyError as err:
return f"Error cannot find {model_type} model: {err}"

# Check that all the features are present.
diff = set(behavior_model.features).difference(features)
if len(diff) > 0:
return f"{model_type}:{ou_type} Features missing: {diff}"

# Extract the features.
X = [features[feature] for feature in behavior_model.features]
X = np.array(X).astype(float).reshape(1, -1)
try:
# Convert the input features into a pandas dataframe.
df = pd.DataFrame.from_records([features])
# Standardize the input features (particularly, total_cost and startup_cost).
standardize_input_data(df)
# Convert the input features into input features expected by the model.
X = behavior_model.convert_raw_input(df)
X = X.to_numpy(dtype=np.float, copy=False)
except KeyError as e:
return f"Error cannot convert input feature: {e}"

# Predict the Y values. Record how long it takes to predict Y values.
start = time.time()
14 changes: 14 additions & 0 deletions behavior/modeling/__init__.py
@@ -1,3 +1,7 @@
from __future__ import annotations

from behavior.plans import DIFFERENCING_SCHEMA

METHODS = [
"lr",
"huber",
@@ -12,3 +16,13 @@
"mt_elastic",
"elastic",
]

# Any feature that ends with any keyword in BLOCKED_FEATURES is dropped
# from the input schema. These features are intentionally dropped since
# we don't want feature selection/model to try and learn from these.
BLOCKED_FEATURES: list[str] = DIFFERENCING_SCHEMA + [
"plan_type", # Plan Type
"cpu_id", # CPU ID
"relid", # Scan index into range able
"indexid", # Index OID
]
1 change: 1 addition & 0 deletions behavior/modeling/featurewiz
Submodule featurewiz added at b37c9c