Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model converters benchmark update #86

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
73 changes: 39 additions & 34 deletions bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,14 +389,13 @@ def convert_data(data, dtype, data_order, data_format):
# Secondly, change format of data
if data_format == 'numpy':
return data
elif data_format == 'pandas':
if data_format == 'pandas':
import pandas as pd

if data.ndim == 1:
return pd.Series(data)
else:
return pd.DataFrame(data)
elif data_format == 'cudf':
return pd.DataFrame(data)
if data_format == 'cudf':
import cudf
import pandas as pd

Expand Down Expand Up @@ -512,36 +511,42 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
def print_output(library, algorithm, stages, params, functions,
                 times, metric_type, metrics, data, alg_instance=None,
                 alg_params=None):
    """Print one result record per benchmark stage as a JSON list on stdout.

    Nothing is printed unless ``params.output_format`` equals ``'json'``.

    Parameters
    ----------
    library, algorithm : passed through to ``gen_basic_dict``; ``algorithm``
        also selects where ``n_clusters`` is stored ('kmeans' or 'dbscan').
    stages : iterable of stage names; one record is emitted per stage.
    params : namespace-like object. ``output_format`` is always read;
        ``n_classes`` and ``n_clusters`` are read when present.
    functions : accepted for interface compatibility; not used here.
    times : sequence indexed by stage position, stored under ``'time[s]'``.
    metric_type : ``None``, a single metric name, or a list of metric names.
    metrics : for a single name, a sequence indexed by stage position; for a
        list of names, ``metrics[metric_index][stage_index]``.
    data : sequence indexed by stage position, passed to ``gen_basic_dict``.
    alg_instance, alg_params : passed through to ``gen_basic_dict``.
    """
    if params.output_format != 'json':
        return

    output = []
    for i, stage in enumerate(stages):
        result = gen_basic_dict(library, algorithm, stage, params,
                                data[i], alg_instance, alg_params)
        result['time[s]'] = times[i]

        if isinstance(metric_type, str):
            result[metric_type] = metrics[i]
        elif isinstance(metric_type, list):
            for ind, val in enumerate(metric_type):
                # Metrics that were not computed for this stage are omitted.
                if metrics[ind][i] is not None:
                    result[f'{val}'] = metrics[ind][i]

        if hasattr(params, 'n_classes'):
            result['input_data']['classes'] = params.n_classes
        if hasattr(params, 'n_clusters'):
            if algorithm == 'kmeans':
                result['input_data']['n_clusters'] = params.n_clusters
            elif algorithm == 'dbscan':
                result['n_clusters'] = params.n_clusters

        # NOTE(review): the nesting below is reconstructed from a rendering
        # that lost its indentation -- confirm against bench.py.
        if alg_instance is not None:
            alg_parameters = result['algorithm_parameters']
            # replace non-string init with string for kmeans benchmarks
            if 'init' in alg_parameters:
                if not isinstance(alg_parameters['init'], str):
                    alg_parameters['init'] = 'random'
            # Drop the 'handle' entry, if any, before serializing.
            alg_parameters.pop('handle', None)
        output.append(result)

    print(json.dumps(output, indent=4))


def run_with_context(params, function):
Expand Down
9 changes: 7 additions & 2 deletions configs/xgboost/xgb_gpu_additional_config.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
{
"common": {
"lib": "xgboost",
"data-format": "cudf",
"data-order": "F",
"dtype": "float32",
"algorithm": "gbt",
"tree-method": "gpu_hist",
"count-dmatrix": "",
"max-depth": 8,
"learning-rate": 0.1,
"reg-lambda": 1,
Expand All @@ -15,6 +13,7 @@
"cases": [
{
"objective": "binary:logistic",
"data-format": "pandas",
"scale-pos-weight": 2.1067817411664587,
"dataset": [
{
Expand All @@ -33,6 +32,7 @@
},
{
"objective": "binary:logistic",
"data-format": "cudf",
"scale-pos-weight": 173.63348001466812,
"dataset": [
{
Expand All @@ -51,6 +51,7 @@
},
{
"objective": "multi:softmax",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand All @@ -68,6 +69,7 @@
},
{
"objective": "binary:logistic",
"data-format": "pandas",
"scale-pos-weight": 2.0017715678375363,
"dataset": [
{
Expand All @@ -86,6 +88,7 @@
},
{
"objective": "binary:logistic",
"data-format": "cudf",
"scale-pos-weight": 578.2868020304569,
"dataset": [
{
Expand All @@ -104,6 +107,7 @@
},
{
"objective": "binary:logistic",
"data-format": "cudf",
"scale-pos-weight": 1.8872389605086624,
"dataset": [
{
Expand All @@ -122,6 +126,7 @@
},
{
"objective": "reg:squarederror",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand Down
36 changes: 21 additions & 15 deletions configs/xgboost/xgb_gpu_main_config.json
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
{
"common": {
"lib": "xgboost",
"data-format": "cudf",
"data-order": "F",
"dtype": "float32",
"algorithm": "gbt",
"tree-method": "gpu_hist",
"count-dmatrix": ""
"tree-method": "gpu_hist"
},
"cases": [
{
"objective": "reg:squarederror",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand All @@ -26,10 +26,11 @@
],
"learning-rate": 0.03,
"max-depth": 6,
"n-estimators": 1000,
"objective": "reg:squarederror"
"n-estimators": 1000
},
{
"objective": "binary:logistic",
"data-format": "pandas",
"dataset": [
{
"source": "npy",
Expand All @@ -53,10 +54,11 @@
"min-child-weight": 0,
"max-depth": 8,
"max-leaves": 256,
"n-estimators": 1000,
"objective": "binary:logistic"
"n-estimators": 1000
},
{
"objective": "binary:logistic",
"data-format": "pandas",
"dataset": [
{
"source": "npy",
Expand All @@ -81,10 +83,11 @@
"max-depth": 8,
"max-leaves": 256,
"n-estimators": 1000,
"objective": "binary:logistic",
"inplace-predict": ""
},
{
"objective": "multi:softprob",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand All @@ -101,10 +104,11 @@
],
"learning-rate": 0.03,
"max-depth": 6,
"n-estimators": 1000,
"objective": "multi:softprob"
"n-estimators": 1000
},
{
"objective": "multi:softprob",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand All @@ -122,10 +126,11 @@
"min-child-weight": 1,
"min-split-loss": 0.1,
"max-depth": 8,
"n-estimators": 200,
"objective": "multi:softprob"
"n-estimators": 200
},
{
"objective": "reg:squarederror",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand All @@ -137,7 +142,6 @@
}
],
"n-estimators": 100,
"objective": "reg:squarederror",
"max-depth": 8,
"scale-pos-weight": 2,
"learning-rate": 0.1,
Expand All @@ -148,6 +152,8 @@
"max-leaves": 256
},
{
"objective": "multi:softprob",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand All @@ -163,12 +169,13 @@
}
],
"n-estimators": 60,
"objective": "multi:softprob",
"max-depth": 7,
"subsample": 0.7,
"colsample-bytree": 0.7
},
{
"objective": "binary:logistic",
"data-format": "cudf",
"dataset": [
{
"source": "npy",
Expand All @@ -184,7 +191,6 @@
}
],
"n-estimators": 10000,
"objective": "binary:logistic",
"max-depth": 1,
"subsample": 0.5,
"eta": 0.1,
Expand Down
Loading