Skip to content

Commit

Permalink
More Pandas dtypes and more flexible variable naming
Browse files Browse the repository at this point in the history
- A Pandas DataFrame supports more dtypes than 'int64', 'float64' and 'bool'; therefore, a number of additional integer and float dtypes are now accepted for the data variable.
- From now on the label variable can be a Pandas DataFrame with the same dtypes as the data variable.
- If label is a Pandas DataFrame, it will be converted to float.
- If feature_types is not set, the data dtypes will be mapped to 'int' or 'float'.
- The feature_names may contain any character except [, ] or <
  • Loading branch information
JohanManders committed Oct 17, 2015
1 parent f116722 commit 9bbc390
Showing 1 changed file with 47 additions and 22 deletions.
69 changes: 47 additions & 22 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,27 +138,50 @@ def c_array(ctype, values):
return (ctype * len(values))(*values)


def _maybe_from_pandas(data, feature_names, feature_types):
""" Extract internal data from pd.DataFrame """
def _maybe_from_pandas(data, label, feature_names, feature_types):
""" Extract internal data from pd.DataFrame
If data is Pandas DataFrame, feature_names passed through will be ignored and
overwritten by the column names of the Pandas DataFrame.
"""
try:
import pandas as pd
except ImportError:
return data, feature_names, feature_types
return data, label, feature_names, feature_types

if not isinstance(data, pd.DataFrame):
return data, feature_names, feature_types
return data, label, feature_names, feature_types

data_dtypes = data.dtypes
if not all(dtype.name in ('int8', 'int16', 'int32', 'int64',
'uint8', 'uint16', 'uint32', 'uint64',
'float16', 'float32', 'float64',
'bool') for dtype in data_dtypes):
raise ValueError('DataFrame.dtypes for data must be int, float or bool')

if label is not None:
if isinstance(label, pd.DataFrame):
label_dtypes = label.dtypes
if not all(dtype.name in ('int8', 'int16', 'int32', 'int64',
'uint8', 'uint16', 'uint32', 'uint64',
'float16', 'float32', 'float64',
'bool') for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
else:
label = label.values.astype('float')

dtypes = data.dtypes
if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes):
raise ValueError('DataFrame.dtypes must be int, float or bool')
feature_names = data.columns.format()

if feature_names is None:
feature_names = data.columns.format()
if feature_types is None:
mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'}
feature_types = [mapper[dtype.name] for dtype in dtypes]
mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int',
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int',
'float16': 'float', 'float32': 'float', 'float64': 'float',
'bool': 'int'}
feature_types = [mapper[dtype.name] for dtype in data_dtypes]

data = data.values.astype('float')
return data, feature_names, feature_types

return data, label, feature_names, feature_types

class DMatrix(object):
"""Data Matrix used in XGBoost.
Expand Down Expand Up @@ -192,9 +215,10 @@ def __init__(self, data, label=None, missing=0.0,
silent : boolean, optional
Whether print messages during construction
feature_names : list, optional
Labels for features.
Set names for features.
When data is a Pandas DataFrame, feature_names will be ignored.
feature_types : list, optional
Labels for features.
Set types for features.
"""
# force into void_p, mac need to pass things in as void_p
if data is None:
Expand All @@ -204,8 +228,10 @@ def __init__(self, data, label=None, missing=0.0,
klass = getattr(getattr(data, '__class__', None), '__name__', None)
if klass == 'DataFrame':
# once check class name to avoid unnecessary pandas import
data, feature_names, feature_types = _maybe_from_pandas(data, feature_names,
feature_types)
data, label, feature_names, feature_types = _maybe_from_pandas(data,
label,
feature_names,
feature_types)

if isinstance(data, STRING_TYPES):
self.handle = ctypes.c_void_p()
Expand Down Expand Up @@ -520,10 +546,10 @@ def feature_names(self, feature_names):
if len(feature_names) != self.num_col():
msg = 'feature_names must have the same length as data'
raise ValueError(msg)
# prohibit to use symbols may affect to parse. e.g. ``[]=.``
if not all(isinstance(f, STRING_TYPES) and f.isalnum()
# prohibit to use symbols may affect to parse. e.g. []<
if not all(isinstance(f, STRING_TYPES) and not any(x in f for x in {'[', ']', '<'})
for f in feature_names):
raise ValueError('all feature_names must be alphanumerics')
raise ValueError('feature_names may not contain [, ] or <')
else:
# reset feature_types also
self.feature_types = None
Expand Down Expand Up @@ -556,12 +582,11 @@ def feature_types(self, feature_types):
if len(feature_types) != self.num_col():
msg = 'feature_types must have the same length as data'
raise ValueError(msg)
# prohibit to use symbols may affect to parse. e.g. ``[]=.``

valid = ('q', 'i', 'int', 'float')
valid = ('int', 'float')
if not all(isinstance(f, STRING_TYPES) and f in valid
for f in feature_types):
raise ValueError('all feature_names must be {i, q, int, float}')
raise ValueError('All feature_names must be {int, float}')
self._feature_types = feature_types


Expand Down

0 comments on commit 9bbc390

Please sign in to comment.