Skip to content

Commit

Permalink
Add CV and Feature Importance
Browse files Browse the repository at this point in the history
  • Loading branch information
jjakimoto committed Dec 21, 2018
1 parent 2a988a0 commit f8b89ab
Show file tree
Hide file tree
Showing 6 changed files with 232 additions and 49 deletions.
2 changes: 1 addition & 1 deletion finance_ml/features/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .importance import feat_importance
from .importance import feat_importance, feat_imp_MDA, feat_imp_MDI, feat_imp_SFI
from .orth import get_e_vec, orth_feats
102 changes: 87 additions & 15 deletions finance_ml/features/importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,19 @@


def feat_imp_MDI(forest, feat_names):
"""Compute Mean Decrease Impurity
Params
------
forest: Forest Classifier instance
feat_names: list(str)
List of names of features
Returns
-------
imp: pd.DataFrame
Importance means and standard deviations
"""
imp_dict = {i: tree.feature_importances_ for i, tree in
enumerate(forest.estimators_)}
imp_df = pd.DataFrame.from_dict(imp_dict, orient='index')
Expand All @@ -22,8 +35,29 @@ def feat_imp_MDI(forest, feat_names):
return imp


def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
scoring='neg_log_loss'):
def feat_imp_MDA(clf, X, y, n_splits, t1, sample_weight=None,
pct_embargo=0, scoring='neg_log_loss'):
"""Calculate Mean Decrease Accuracy
Params
------
clf: Classifier instance
X: pd.DataFrame, Input feature
y: pd.Series, Label
n_splits: int
The number of splits for cross validation
sample_weight: array-like
Sampling weight for fit function
t1: pd.Series
Index and values correspond to the beginning and end timestamps of each point
pct_embargo: float
The ratio to get rid of from data
Returns
-------
imp: pd.DataFrame, feature importance of means and standard deviations
scores: float, scores of cross validation
"""
if scoring not in ['neg_log_loss', 'accuracy']:
raise Exception('wrong scoring method')
cv_gen = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo)
Expand All @@ -33,20 +67,26 @@ def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
for idx, (train, test) in zip(index, cv_gen.split(X=X)):
X_train = X.iloc[train]
y_train = y.iloc[train]
w_train = sample_weight.iloc[train]
if sample_weight is not None:
w_train = sample_weight.iloc[train].values
else:
w_train = None
X_test = X.iloc[test]
y_test = y.iloc[test]
w_test = sample_weight.iloc[test]
clf_fit = clf.fit(X_train, y_train, sample_weight=w_train.values)
if sample_weight is not None:
w_test = sample_weight.iloc[test].values
else:
w_test = None
clf_fit = clf.fit(X_train, y_train, sample_weight=w_train)
if scoring == 'neg_log_loss':
prob = clf_fit.predict_proba(X_test)
scores.loc[idx] = -log_loss(y_test, prob,
sample_weight=w_test.values,
sample_weight=w_test,
labels=clf_fit.classes_)
else:
pred = clf_fit.predict(X_test)
scores.loc[idx] = accuracy_score(y_test, pred,
sample_weight=w_test.values)
sample_weight=w_test)

for col in X.columns:
X_test_ = X_test.copy(deep=True)
Expand All @@ -55,12 +95,12 @@ def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
if scoring == 'neg_log_loss':
prob = clf_fit.predict_proba(X_test_)
scores_perm.loc[idx, col] = -log_loss(y_test, prob,
sample_weight=w_test.value,
sample_weight=w_test,
labels=clf_fit.classes_)
else:
pred = clf_fit.predict(X_test_)
scores_perm.loc[idx, col] = accuracy_score(y_test, pred,
sample_weight=w_test.values)
sample_weight=w_test)
# (Original score) - (premutated score)
imprv = (-scores_perm).add(scores, axis=0)
# Relative to maximum improvement
Expand All @@ -74,13 +114,45 @@ def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
return imp, scores.mean()


def aux_feat_imp_SFI(feat_names, clf, X, cont, scoring, cv_gen):
def feat_imp_SFI(clf, X, y, sample_weight=None, scoring='neg_log_loss',
                 n_splits=3, t1=None, cv_gen=None, pct_embargo=0, purging=True):
    """Calculate Single Feature Importance

    Each column of X is scored on its own with ``cv_score``; the
    cross-validation scores of that single-feature model are summarised
    by their mean and by the standard error of that mean.

    Params
    ------
    clf: Classifier instance
    X: pd.DataFrame
    y: pd.Series
    sample_weight: pd.Series, optional
        If specified, apply this to both testing and training
    scoring: str, default 'neg_log_loss'
        The name of scoring methods. 'accuracy' or 'neg_log_loss'
    n_splits: int, default 3
        The number of splits for cross validation
    t1: pd.Series
        Index and value correspond to the beginning and end of information
    cv_gen: KFold instance
        If not specified, use PurgedKFold
    pct_embargo: float, default 0
        The percentage of applying embargo
    purging: bool, default True
        If true, apply purging method

    Returns
    -------
    imp: pd.DataFrame
        One row per feature with columns 'mean' (mean CV score) and
        'std' (standard error of the mean CV score)
    """
    imp = pd.DataFrame(columns=['mean', 'std'])
    for feat_name in X.columns:
        # Double brackets keep a one-column DataFrame for the classifier
        scores = cv_score(clf, X=X[[feat_name]], y=y,
                          sample_weight=sample_weight,
                          scoring=scoring,
                          cv_gen=cv_gen,
                          n_splits=n_splits,
                          t1=t1,
                          pct_embargo=pct_embargo,
                          purging=purging)
        imp.loc[feat_name, 'mean'] = scores.mean()
        # Standard error of the mean: the dispersion shrinks with the
        # number of folds, so divide (not multiply) by sqrt(n)
        imp.loc[feat_name, 'std'] = scores.std() / np.sqrt(scores.shape[0])
    return imp
Expand Down Expand Up @@ -117,7 +189,7 @@ def feat_importance(X, cont, clf=None, n_estimators=1000, n_splits=10, max_sampl
oos = cv_score(clf, X=X, y=cont['size'], sample_weight=cont['w'],
scoring=scoring, cv_gen=cv_gen)
clf.n_jobs = 1
imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
imp = mp_pandas_obj(feat_imp_SFI, ('feat_names', X.columns),
num_threads, clf=clf, X=X, cont=cont,
scoring=scoring, cv_gen=cv_gen)
return imp, oob, oos
60 changes: 43 additions & 17 deletions finance_ml/features/orth.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,54 @@
import pandas as pd


def get_e_vec(dot, var_thres):
def get_evec(dot, var_th):
    """Calculate eigen values and vectors

    The eigen decomposition of ``dot`` is sorted by decreasing eigen
    value, non-positive eigen values are discarded, and only the leading
    components needed for the cumulative share of variance to reach
    ``var_th`` are kept.

    Params
    ------
    dot: pd.DataFrame
        Z score product dataframe
    var_th: float
        Threshold for the explanation of variance

    Returns
    -------
    e_val: pd.Series, eigen values
    e_vec: pd.DataFrame, eigen vectors
    """
    # Compute and sort eigen vectors and values for dot product matrix
    e_val, e_vec = np.linalg.eigh(dot)
    # Descending order
    idx = e_val.argsort()[::-1]
    e_val, e_vec = e_val[idx], e_vec[:, idx]
    # Labeling features
    e_val = pd.Series(e_val,
                      index=['PC_' + str(i + 1) for i in range(e_val.shape[0])])
    e_vec = pd.DataFrame(e_vec, index=dot.index, columns=e_val.index)
    # Use only positive ones
    e_val = e_val.loc[e_val > 0]
    e_vec = e_vec.loc[:, e_val.index]
    # Reduce dimension from threshold
    cum_var = e_val.cumsum() / e_val.sum()
    # Search on the raw ndarray: Series.searchsorted returns a scalar for a
    # scalar argument on current pandas, so indexing its result with [0] fails
    dim = int(cum_var.values.searchsorted(var_th))
    e_val = e_val.iloc[:dim + 1]
    e_vec = e_vec.iloc[:, :dim + 1]
    return e_val, e_vec


def orth_feats(dfX, var_thres=.95):
dfZ = dfX.sub(dfX.mean(), axis=1).div(dfX.std(), axis=1)
dot = pd.DataFrame(np.dot(dfZ.T, dfZ), index=dfX.columns, columns=dfX.columns)
e_val, e_vec = get_e_vec(dot, var_thres)
dfP = pd.DataFrame(np.dot(dfZ, e_vec), index=dfZ.index, columns=e_vec.columns)
return dfP
def ortho_feats(dfX, var_th=.95):
    """Project standardized features onto their principal components

    Params
    ------
    dfX: pd.DataFrame
        Features dataframe
    var_th: float
        Threshold for the explanation of variance, forwarded to get_evec

    Returns
    -------
    pd.DataFrame: orthogonal features, indexed like dfX, with columns
        named PC_1, PC_2, ...
    """
    # Standardize every column to zero mean and unit standard deviation
    centered = dfX.values - dfX.mean().values
    z_scores = centered / dfX.std().values
    # Product matrix of the standardized features
    gram = pd.DataFrame(np.dot(z_scores.T, z_scores),
                        index=dfX.columns, columns=dfX.columns)
    _, basis = get_evec(gram, var_th)
    pc_names = ['PC_' + str(k) for k in range(1, basis.shape[1] + 1)]
    projected = np.dot(z_scores, basis)
    return pd.DataFrame(projected, index=dfX.index, columns=pc_names)
39 changes: 32 additions & 7 deletions finance_ml/model_selection/kfold.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,59 @@


class PurgedKFold(_BaseKFold):
def __init__(self, n_splits=3, t1=None, pct_embargo=0., purging=False):
"""Cross Validation with purging and embargo
Params
------
n_splits: int
The number of splits for cross validation
t1: pd.Series
Index and value correspond to the beginning and end of information
pct_embargo: float, default 0
The percentage of applying embargo
purging: bool, default True
If true, apply purging method
"""

def __init__(self, n_splits=3, t1=None, pct_embargo=0., purging=True):
super(PurgedKFold, self).__init__(n_splits=n_splits, shuffle=False, random_state=None)
if not isinstance(t1, pd.Series):
raise ValueError('Label through dates must be a pd.Series')
super(PurgedKFold, self).__init__(n_splits=n_splits, shuffle=False,
random_state=None)
raise ValueError('t1 must be pd.Series')
self.t1 = t1
self.pct_embargo = pct_embargo
self.purging = purging

def split(self, X, y=None, groups=None):
"""Get train and test times stamps
Params
------
X: pd.DataFrame
y: pd.Series, optional
Returns
-------
train_indices, test_indices: np.array
"""
if (X.index == self.t1.index).sum() != len(self.t1):
raise ValueError('X and t1 must have the same index')
indices = np.arange(X.shape[0])
# Embargo width
embg_size = int(X.shape[0] * self.pct_embargo)
# Pandas is close set when using [t0:t1]
test_ranges = [(i[0], i[-1] + 1) for i in
np.array_split(indices, self.n_splits)]
for st, end in test_ranges:
# Test data
test_indices = indices[st:end]
# Training data prior to test data
t0 = self.t1.index[st]
# Avoid look ahead leakage here
train_indices = self.t1.index.searchsorted(
self.t1[self.t1 <= t0].index)
# Add training data after test data
# Edge point of test set in the most recent side
max_t1_idx = self.t1.index.searchsorted(
self.t1[test_indices].max())
if max_t1_idx < X.shape[0]:
# Adding indices after test set
train_indices = np.concatenate(
(train_indices, indices[max_t1_idx + embg_size:]))
# Purging
Expand Down
26 changes: 24 additions & 2 deletions finance_ml/model_selection/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,29 @@


def cv_score(clf, X, y, sample_weight=None, scoring='neg_log_loss',
t1=None, n_splits=3, cv_gen=None, pct_embargo=0., purging=False):
n_splits=3, t1=None, cv_gen=None, pct_embargo=0., purging=True):
"""Cross Validation with default purging and embargo
Params
------
X: pd.DataFrame
y: pd.Series, optional
sample_weight: pd.Series, optional
If specified, apply this to both testing and training
scoring: str, default 'neg_log_loss'
The name of scoring methods. 'accuracy' or 'neg_log_loss'
n_splits: int
The number of splits for cross validation
t1: pd.Series
Index and value correspond to the beginning and end of information
cv_gen: KFold instance
If not specified, use PurgedKfold
pct_embargo: float, default 0
The percentage of applying embargo
purging: bool, default True
If true, apply purging method
"""
if scoring not in ['neg_log_loss', 'accuracy']:
raise Exception('Wrong scoring method')
if cv_gen is None:
Expand All @@ -29,4 +51,4 @@ def cv_score(clf, X, y, sample_weight=None, scoring='neg_log_loss',
pred = clf_.predict(X.iloc[test, :])
score_ = accuracy_score(y.iloc[test], pred, **test_params)
scores.append(score_)
return np.array(scores)
return np.array(scores)
52 changes: 45 additions & 7 deletions finance_ml/model_selection/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,46 @@
def get_train_times(t1, test_times):
trn = t1.copy(deep=True)
for i, j in test_times.iteritems():
df0 = trn[(i <= trn.index) & (trn.index <= j)].index
df1 = trn[(i <= trn) & (trn <= j)].index
df2 = trn[(trn.index <= i) & (j <= trn)].index
trn = trn.drop(df0.union(df1.union(df2)))
import pandas as pd


def get_train_times(train_times, test_times):
    """Sample train points without overlapping with test period

    A training observation is dropped when its [start, end] interval
    starts inside a test interval, ends inside a test interval, or
    envelops a test interval. The input series is not modified.

    Params
    ------
    train_times: pd.Series
        Training points with index for initial and values for end time
    test_times: pd.Series
        Testing points with index for initial and values for end time

    Returns
    -------
    pd.Series
        Subset of train_times that does not overlap any test interval
    """
    trn = train_times.copy(deep=True)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for init, end in test_times.items():
        # Training interval starts within the test interval
        starts_inside = trn[(init <= trn.index) & (trn.index <= end)].index
        # Training interval ends within the test interval
        ends_inside = trn[(init <= trn) & (trn <= end)].index
        # Training interval envelops the test interval
        envelops = trn[(trn.index <= init) & (end <= trn)].index
        # Explicit set union: `|` on Index objects is no longer a set operation
        overlap = starts_inside.union(ends_inside).union(envelops)
        trn = trn.drop(overlap)
    return trn


def get_embargo_times(times, pct_embargo):
    """Get embargo time index for each timestamp

    Params
    ------
    times: Timestamps
        Entire timestamps which you want to apply embargo
    pct_embargo: float ranged at [0, 1]
        The ratio to embargo with respect to the size of timestamps

    Returns
    -------
    pd.Series
        Indexed by ``times``; each value is the timestamp ``step``
        positions later (capped at the last timestamp), i.e. the point
        up to which forward data should be taken out
    """
    step = int(times.shape[0] * pct_embargo)
    if step == 0:
        # No embargo: every timestamp maps to itself
        return pd.Series(times, index=times)
    # Shift forward by `step` positions
    embg = pd.Series(times[step:], index=times[:-step])
    # The last `step` timestamps have nothing beyond them: cap at the final one
    tail = pd.Series(times[-1], index=times[-step:])
    # pd.concat replaces Series.append, which was removed in pandas 2.0
    return pd.concat([embg, tail])

0 comments on commit f8b89ab

Please sign in to comment.