IntelPython · RukhovichIV · Jul 16, 2021 · Jul 16, 2021 · Jul 16, 2021 · Jul 16, 2021
diff --git a/bench.py b/bench.py
@@ -389,14 +389,13 @@ def convert_data(data, dtype, data_order, data_format):
     # Secondly, change format of data
     if data_format == 'numpy':
         return data
-    elif data_format == 'pandas':
+    if data_format == 'pandas':
         import pandas as pd
 
         if data.ndim == 1:
             return pd.Series(data)
-        else:
-            return pd.DataFrame(data)
-    elif data_format == 'cudf':
+        return pd.DataFrame(data)
+    if data_format == 'cudf':
         import cudf
         import pandas as pd
 
@@ -512,36 +511,42 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
 def print_output(library, algorithm, stages, params, functions,
                  times, metric_type, metrics, data, alg_instance=None,
                  alg_params=None):
-    if params.output_format == 'json':
-        output = []
-        for i, stage in enumerate(stages):
-            result = gen_basic_dict(library, algorithm, stage, params,
-                                    data[i], alg_instance, alg_params)
-            result.update({'time[s]': times[i]})
-            if metric_type is not None:
-                if isinstance(metric_type, str):
-                    result.update({f'{metric_type}': metrics[i]})
-                elif isinstance(metric_type, list):
-                    for ind, val in enumerate(metric_type):
-                        if metrics[ind][i] is not None:
-                            result.update({f'{val}': metrics[ind][i]})
-            if hasattr(params, 'n_classes'):
-                result['input_data'].update({'classes': params.n_classes})
-            if hasattr(params, 'n_clusters'):
-                if algorithm == 'kmeans':
-                    result['input_data'].update(
-                        {'n_clusters': params.n_clusters})
-                elif algorithm == 'dbscan':
-                    result.update({'n_clusters': params.n_clusters})
-            # replace non-string init with string for kmeans benchmarks
-            if alg_instance is not None:
-                if 'init' in result['algorithm_parameters'].keys():
-                    if not isinstance(result['algorithm_parameters']['init'], str):
-                        result['algorithm_parameters']['init'] = 'random'
-                if 'handle' in result['algorithm_parameters'].keys():
-                    del result['algorithm_parameters']['handle']
-            output.append(result)
-        print(json.dumps(output, indent=4))
+    if params.output_format != 'json':
+        return
+
+    output = []
+    for i, stage in enumerate(stages):
+        result = gen_basic_dict(library, algorithm, stage, params,
+                                data[i], alg_instance, alg_params)
+        result.update({'time[s]': times[i]})
+
+        if metric_type is not None:
+            if isinstance(metric_type, str):
+                result.update({f'{metric_type}': metrics[i]})
+            elif isinstance(metric_type, list):
+                for ind, val in enumerate(metric_type):
+                    if metrics[ind][i] is not None:
+                        result.update({f'{val}': metrics[ind][i]})
+
+        if hasattr(params, 'n_classes'):
+            result['input_data'].update({'classes': params.n_classes})
+        if hasattr(params, 'n_clusters'):
+            if algorithm == 'kmeans':
+                result['input_data'].update(
+                    {'n_clusters': params.n_clusters})
+            elif algorithm == 'dbscan':
+                result.update({'n_clusters': params.n_clusters})
+
+        # replace non-string init with string for kmeans benchmarks
+        if alg_instance is not None:
+            if 'init' in result['algorithm_parameters'].keys():
+                if not isinstance(result['algorithm_parameters']['init'], str):
+                    result['algorithm_parameters']['init'] = 'random'
+            if 'handle' in result['algorithm_parameters'].keys():
+                del result['algorithm_parameters']['handle']
+        output.append(result)
+
+    print(json.dumps(output, indent=4))
 
 
 def run_with_context(params, function):

@@ -1,12 +1,10 @@
 {
     "common": {
         "lib": "xgboost",
-        "data-format": "cudf",
         "data-order": "F",
         "dtype": "float32",
         "algorithm": "gbt",
         "tree-method": "gpu_hist",
-        "count-dmatrix": "",
         "max-depth": 8,
         "learning-rate": 0.1,
         "reg-lambda": 1,
@@ -15,6 +13,7 @@
     "cases": [
         {
             "objective": "binary:logistic",
+            "data-format": "pandas",
             "scale-pos-weight": 2.1067817411664587,
             "dataset": [
                 {
@@ -33,6 +32,7 @@
         },
         {
             "objective": "binary:logistic",
+            "data-format": "cudf",
             "scale-pos-weight": 173.63348001466812,
             "dataset": [
                 {
@@ -51,6 +51,7 @@
         },
         {
             "objective": "multi:softmax",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",
@@ -68,6 +69,7 @@
         },
         {
             "objective": "binary:logistic",
+            "data-format": "pandas",
             "scale-pos-weight": 2.0017715678375363,
             "dataset": [
                 {
@@ -86,6 +88,7 @@
         },
         {
             "objective": "binary:logistic",
+            "data-format": "cudf",
             "scale-pos-weight": 578.2868020304569,
             "dataset": [
                 {
@@ -104,6 +107,7 @@
         },
         {
             "objective": "binary:logistic",
+            "data-format": "cudf",
             "scale-pos-weight": 1.8872389605086624,
             "dataset": [
                 {
@@ -122,6 +126,7 @@
         },
         {
             "objective": "reg:squarederror",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",

@@ -1,15 +1,15 @@
 {
     "common": {
         "lib": "xgboost",
-        "data-format": "cudf",
         "data-order": "F",
         "dtype": "float32",
         "algorithm": "gbt",
-        "tree-method": "gpu_hist",
-        "count-dmatrix": ""
+        "tree-method": "gpu_hist"
     },
     "cases": [
         {
+            "objective": "reg:squarederror",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",
@@ -26,10 +26,11 @@
             ],
             "learning-rate": 0.03,
             "max-depth": 6,
-            "n-estimators": 1000,
-            "objective": "reg:squarederror"
+            "n-estimators": 1000
         },
         {
+            "objective": "binary:logistic",
+            "data-format": "pandas",
             "dataset": [
                 {
                     "source": "npy",
@@ -53,10 +54,11 @@
             "min-child-weight": 0,
             "max-depth": 8,
             "max-leaves": 256,
-            "n-estimators": 1000,
-            "objective": "binary:logistic"
+            "n-estimators": 1000
         },
         {
+            "objective": "binary:logistic",
+            "data-format": "pandas",
             "dataset": [
                 {
                     "source": "npy",
@@ -81,10 +83,11 @@
             "max-depth": 8,
             "max-leaves": 256,
             "n-estimators": 1000,
-            "objective": "binary:logistic",
             "inplace-predict": ""
         },
         {
+            "objective": "multi:softprob",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",
@@ -101,10 +104,11 @@
             ],
             "learning-rate": 0.03,
             "max-depth": 6,
-            "n-estimators": 1000,
-            "objective": "multi:softprob"
+            "n-estimators": 1000
         },
         {
+            "objective": "multi:softprob",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",
@@ -122,10 +126,11 @@
             "min-child-weight": 1,
             "min-split-loss": 0.1,
             "max-depth": 8,
-            "n-estimators": 200,
-            "objective": "multi:softprob"
+            "n-estimators": 200
         },
         {
+            "objective": "reg:squarederror",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",
@@ -137,7 +142,6 @@
                 }
             ],
             "n-estimators": 100,
-            "objective": "reg:squarederror",
             "max-depth": 8,
             "scale-pos-weight": 2,
             "learning-rate": 0.1,
@@ -148,6 +152,8 @@
             "max-leaves": 256
         },
         {
+            "objective": "multi:softprob",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",
@@ -163,12 +169,13 @@
                 }
             ],
             "n-estimators": 60,
-            "objective": "multi:softprob",
             "max-depth": 7,
             "subsample": 0.7,
             "colsample-bytree": 0.7
         },
         {
+            "objective": "binary:logistic",
+            "data-format": "cudf",
             "dataset": [
                 {
                     "source": "npy",
@@ -184,7 +191,6 @@
                 }
             ],
             "n-estimators": 10000,
-            "objective": "binary:logistic",
             "max-depth": 1,
             "subsample": 0.5,
             "eta": 0.1,