Add CV and Feature Importance

bassemfg · Dec 21, 2018 · f8b89ab · f8b89ab
1 parent 2a988a0
commit f8b89ab
Show file tree

Hide file tree

Showing 6 changed files with 232 additions and 49 deletions.
diff --git a/finance_ml/features/__init__.py b/finance_ml/features/__init__.py
@@ -1,2 +1,2 @@
-from .importance import feat_importance
-from .orth import get_e_vec, orth_feats
+from .importance import feat_importance, feat_imp_MDA, feat_imp_MDI, feat_imp_SFI
+from .orth import get_evec, ortho_feats
diff --git a/finance_ml/features/importance.py b/finance_ml/features/importance.py
@@ -9,6 +9,19 @@
 
 
 def feat_imp_MDI(forest, feat_names):
+    """Compute Mean Decrease Impurity
+    
+    Params
+    ------
+    forest: Forest Classifier instance
+    feat_names: list(str)
+        List of names of features
+
+    Returns
+    -------
+    imp: pd.DataFrame
+        Importance means and standard deviations
+    """
     imp_dict = {i: tree.feature_importances_ for i, tree in
                 enumerate(forest.estimators_)}
     imp_df = pd.DataFrame.from_dict(imp_dict, orient='index')
@@ -22,8 +35,29 @@ def feat_imp_MDI(forest, feat_names):
     return imp
 
 
-def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
-                 scoring='neg_log_loss'):
+def feat_imp_MDA(clf, X, y, n_splits, t1, sample_weight=None,
+                 pct_embargo=0, scoring='neg_log_loss'):
+    """Calculate Mean Decrease Accuracy
+    
+    Params
+    ------
+    clf: Classifier instance
+    X: pd.DataFrame, Input feature
+    y: pd.Series, Label
+    n_splits: int
+        The number of splits for cross validation
+    sample_weight: array-like
+        Sampling weight for fit function
+    t1: pd.Series
+        Index and values correspond to the beginning and end timestamps of each point
+    pct_embargo: float
+        The ratio of the data to drop as embargo
+    
+    Returns
+    -------
+    imp: pd.DataFrame, feature importance of means and standard deviations
+    scores: float, mean of the cross validation scores
+    """
     if scoring not in ['neg_log_loss', 'accuracy']:
         raise Exception('wrong scoring method')
     cv_gen = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo)
@@ -33,20 +67,26 @@ def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
     for idx, (train, test) in zip(index, cv_gen.split(X=X)):
         X_train = X.iloc[train]
         y_train = y.iloc[train]
-        w_train = sample_weight.iloc[train]
+        if sample_weight is not None:
+            w_train = sample_weight.iloc[train].values
+        else:
+            w_train = None
         X_test = X.iloc[test]
         y_test = y.iloc[test]
-        w_test = sample_weight.iloc[test]
-        clf_fit = clf.fit(X_train, y_train, sample_weight=w_train.values)
+        if sample_weight is not None:
+            w_test = sample_weight.iloc[test].values
+        else:
+            w_test = None
+        clf_fit = clf.fit(X_train, y_train, sample_weight=w_train)
         if scoring == 'neg_log_loss':
             prob = clf_fit.predict_proba(X_test)
             scores.loc[idx] = -log_loss(y_test, prob,
-                                        sample_weight=w_test.values,
+                                        sample_weight=w_test,
                                         labels=clf_fit.classes_)
         else:
             pred = clf_fit.predict(X_test)
             scores.loc[idx] = accuracy_score(y_test, pred,
-                                             sample_weight=w_test.values)
+                                             sample_weight=w_test)
 
         for col in X.columns:
             X_test_ = X_test.copy(deep=True)
@@ -55,12 +95,12 @@ def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
             if scoring == 'neg_log_loss':
                 prob = clf_fit.predict_proba(X_test_)
                 scores_perm.loc[idx, col] = -log_loss(y_test, prob,
-                                                      sample_weight=w_test.value,
+                                                      sample_weight=w_test,
                                                       labels=clf_fit.classes_)
             else:
                 pred = clf_fit.predict(X_test_)
                 scores_perm.loc[idx, col] = accuracy_score(y_test, pred,
-                                                           sample_weight=w_test.values)
+                                                           sample_weight=w_test)
     # (Original score) - (premutated score)
     imprv = (-scores_perm).add(scores, axis=0)
     # Relative to maximum improvement
@@ -74,13 +114,45 @@ def feat_imp_MDA(clf, X, y, n_splits, sample_weight, t1, pct_embargo,
     return imp, scores.mean()
 
 
-def aux_feat_imp_SFI(feat_names, clf, X, cont, scoring, cv_gen):
+def feat_imp_SFI(clf, X, y, sample_weight=None, scoring='neg_log_loss',
+                 n_splits=3, t1=None, cv_gen=None, pct_embargo=0, purging=True):
+    """Calculate Single Feature Importance
+    
+    Params
+    ------
+    clf: Classifier instance
+    X: pd.DataFrame
+    y: pd.Series
+    sample_weight: pd.Series, optional
+        If specified, apply this to both testing and training
+    scoring: str, default 'neg_log_loss'
+        The name of scoring methods. 'accuracy' or 'neg_log_loss'
+    
+    n_splits: int, default 3
+        The number of splits for cross validation
+    t1: pd.Series
+        Index and value correspond to the beginning and end of information
+    cv_gen: KFold instance
+        If not specified, use PurgedKfold
+    pct_embargo: float, default 0
+        The percentage of applying embargo
+    purging: bool, default True
+        If true, apply purging method
+        
+    Returns
+    -------
+    imp: pd.DataFrame, feature importance of means and standard deviations
+    """
     imp = pd.DataFrame(columns=['mean', 'std'])
-    for feat_name in feat_names:
-        scores = cv_score(clf, X=X[[feat_name]], y=cont['size'],
-                          sample_weight=cont['w'],
+    for feat_name in X.columns:
+        scores = cv_score(clf, X=X[[feat_name]], y=y,
+                          sample_weight=sample_weight,
                           scoring=scoring,
-                          cv_gen=cv_gen)
+                          cv_gen=cv_gen,
+                          n_splits=n_splits,
+                          t1=t1,
+                          pct_embargo=pct_embargo,
+                          purging=purging)
         imp.loc[feat_name, 'mean'] = scores.mean()
         imp.loc[feat_name, 'std'] = scores.std() * np.sqrt(scores.shape[0])
     return imp
@@ -117,7 +189,7 @@ def feat_importance(X, cont, clf=None, n_estimators=1000, n_splits=10, max_sampl
         oos = cv_score(clf, X=X, y=cont['size'], sample_weight=cont['w'],
                        scoring=scoring, cv_gen=cv_gen)
         clf.n_jobs = 1
-        imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
+        imp = mp_pandas_obj(feat_imp_SFI, ('feat_names', X.columns),
                             num_threads, clf=clf, X=X, cont=cont,
                             scoring=scoring, cv_gen=cv_gen)
     return imp, oob, oos
diff --git a/finance_ml/features/orth.py b/finance_ml/features/orth.py
@@ -2,28 +2,54 @@
 import pandas as pd
 
 
-def get_e_vec(dot, var_thres):
+def get_evec(dot, var_th):
+    """Calculate eigen values and vectors
+
+    Params
+    ------
+    dot: pd.DataFrame
+        Z score product dataframe
+    var_th: float
+        Threshold for the explanation of variance
+
+    Returns
+    -------
+    e_val: pd.Series, eigen values
+    e_vec: pd.DataFrame, eigen vectors
+    """
+    # Compute and sort eigen vectors and values for dot product matrix
     e_val, e_vec = np.linalg.eigh(dot)
-    # Descending order
     idx = e_val.argsort()[::-1]
-    e_val = e_val[idx]
-    e_vec = e_vec[:, idx]
-    # Use only positive ones
+    e_val, e_vec = e_val[idx], e_vec[:, idx]
+    # Labeling features
     e_val = pd.Series(e_val, index=['PC_' + str(i + 1) for i in range(e_val.shape[0])])
     e_vec = pd.DataFrame(e_vec, index=dot.index, columns=e_val.index)
-    e_vec = e_vec.loc[:, e_val > 0]
-    e_val = e_val.loc[e_val > 0]
-    # Reduce dimension with threashold
+    e_vec = e_vec.loc[:, e_val.index]
+    # Reduce dimension from threshold
     cum_var = e_val.cumsum() / e_val.sum()
-    dim = cum_var.values.searchsorted(var_thres)
-    e_val = e_val.iloc[:dim+1]
-    e_vec = e_vec.iloc[:, :dim+1]
+    dim = cum_var.values.searchsorted(var_th)  # ndarray lookup gives a scalar position
+    e_val = e_val.iloc[:dim + 1]
+    e_vec = e_vec.iloc[:, :dim + 1]
     return e_val, e_vec
 
 
-def orth_feats(dfX, var_thres=.95):
-    dfZ = dfX.sub(dfX.mean(), axis=1).div(dfX.std(), axis=1)
-    dot = pd.DataFrame(np.dot(dfZ.T, dfZ), index=dfX.columns, columns=dfX.columns)
-    e_val, e_vec = get_e_vec(dot, var_thres)
-    dfP = pd.DataFrame(np.dot(dfZ, e_vec), index=dfZ.index, columns=e_vec.columns)
-    return dfP
+def ortho_feats(dfX, var_th=.95):
+    """Compute orthogonal features with threshold
+    
+    Params
+    ------
+    dfX: pd.DataFrame
+        Features dataframe
+    var_th: float
+        Threshold for the explanation of variance
+        
+    Returns
+    -------
+    pd.DataFrame: orthogonal feature
+    """
+    Z = (dfX.values - dfX.mean().values) / dfX.std().values
+    dot = pd.DataFrame(np.dot(Z.T, Z), index=dfX.columns, columns=dfX.columns)
+    e_val, e_vec = get_evec(dot, var_th)
+    dfP = pd.DataFrame(np.dot(Z, e_vec), index=dfX.index,
+                       columns=['PC_' + str(i + 1) for i in range(e_vec.shape[1])])
+    return dfP
diff --git a/finance_ml/model_selection/kfold.py b/finance_ml/model_selection/kfold.py
@@ -6,34 +6,59 @@
 
 
 class PurgedKFold(_BaseKFold):
-    def __init__(self, n_splits=3, t1=None, pct_embargo=0., purging=False):
+    """Cross Validation with purging and embargo
+    
+    Params
+    ------
+    n_splits: int
+        The number of splits for cross validation
+    t1: pd.Series
+        Index and value correspond to the beginning and end of information
+    pct_embargo: float, default 0
+        The percentage of applying embargo
+    purging: bool, default True
+        If true, apply purging method
+    """
+
+    def __init__(self, n_splits=3, t1=None, pct_embargo=0., purging=True):
+        super(PurgedKFold, self).__init__(n_splits=n_splits, shuffle=False, random_state=None)
         if not isinstance(t1, pd.Series):
-            raise ValueError('Label through dates must be a pd.Series')
-        super(PurgedKFold, self).__init__(n_splits=n_splits, shuffle=False,
-                                          random_state=None)
+            raise ValueError('t1 must be pd.Series')
         self.t1 = t1
         self.pct_embargo = pct_embargo
         self.purging = purging
 
     def split(self, X, y=None, groups=None):
+        """Get train and test times stamps
+        
+        Params
+        ------
+        X: pd.DataFrame
+        y: pd.Series, optional
+        
+        Returns
+        -------
+        train_indices, test_indices: np.array
+        """
         if (X.index == self.t1.index).sum() != len(self.t1):
             raise ValueError('X and t1 must have the same index')
         indices = np.arange(X.shape[0])
         # Embargo width
         embg_size = int(X.shape[0] * self.pct_embargo)
+        # Pandas label slicing [t0:t1] includes both endpoints
         test_ranges = [(i[0], i[-1] + 1) for i in
                        np.array_split(indices, self.n_splits)]
         for st, end in test_ranges:
-            # Test data
             test_indices = indices[st:end]
-            # Training data prior to test data
             t0 = self.t1.index[st]
+            # Avoid look ahead leakage here
             train_indices = self.t1.index.searchsorted(
                 self.t1[self.t1 <= t0].index)
-            # Add training data after test data
+            # Edge point of test set in the most recent side
             max_t1_idx = self.t1.index.searchsorted(
                 self.t1[test_indices].max())
             if max_t1_idx < X.shape[0]:
+                # Adding indices after test set
                 train_indices = np.concatenate(
                     (train_indices, indices[max_t1_idx + embg_size:]))
             # Purging

diff --git a/finance_ml/model_selection/score.py b/finance_ml/model_selection/score.py
@@ -5,7 +5,29 @@
 
 
 def cv_score(clf, X, y, sample_weight=None, scoring='neg_log_loss',
-             t1=None, n_splits=3, cv_gen=None, pct_embargo=0., purging=False):
+             n_splits=3, t1=None, cv_gen=None, pct_embargo=0., purging=True):
+    """Cross Validation with default purging and embargo
+    
+    Params
+    ------
+    X: pd.DataFrame
+    y: pd.Series
+    sample_weight: pd.Series, optional
+        If specified, apply this to both testing and training
+    scoring: str, default 'neg_log_loss'
+        The name of scoring methods. 'accuracy' or 'neg_log_loss'
+    
+    n_splits: int
+        The number of splits for cross validation
+    t1: pd.Series
+        Index and value correspond to the beginning and end of information
+    cv_gen: KFold instance
+        If not specified, use PurgedKfold
+    pct_embargo: float, default 0
+        The percentage of applying embargo
+    purging: bool, default True
+        If true, apply purging method
+    """
     if scoring not in ['neg_log_loss', 'accuracy']:
         raise Exception('Wrong scoring method')
     if cv_gen is None:
@@ -29,4 +51,4 @@ def cv_score(clf, X, y, sample_weight=None, scoring='neg_log_loss',
             pred = clf_.predict(X.iloc[test, :])
             score_ = accuracy_score(y.iloc[test], pred, **test_params)
         scores.append(score_)
-    return np.array(scores)
+    return np.array(scores)
diff --git a/finance_ml/model_selection/utils.py b/finance_ml/model_selection/utils.py
@@ -1,8 +1,46 @@
-def get_train_times(t1, test_times):
-    trn = t1.copy(deep=True)
-    for i, j in test_times.iteritems():
-        df0 = trn[(i <= trn.index) & (trn.index <= j)].index
-        df1 = trn[(i <= trn) & (trn <= j)].index
-        df2 = trn[(trn.index <= i) & (j <= trn)].index
-        trn = trn.drop(df0.union(df1.union(df2)))
+import pandas as pd
+
+
+def get_train_times(train_times, test_times):
+    """Sample train points without overlapping with test period
+
+    Params
+    ------
+    train_times: pd.Series
+        Training points with index for initial and values for end time
+    test_times: pd.Series
+        Testing points with index for initial and values for end time
+
+    Returns
+    -------
+    pd.Series: train_times without the entries overlapping any test period
+    """
+    trn = train_times.copy(deep=True)
+    for init, end in test_times.items():
+        df0 = trn[(init <= trn.index) & (trn.index <= end)].index  # starts inside test
+        df1 = trn[(init <= trn) & (trn <= end)].index  # ends inside test
+        df2 = trn[(trn.index <= init) & (end <= trn)].index  # envelops test
+        trn = trn.drop(df0.union(df1).union(df2))
     return trn
+
+
+def get_embargo_times(times, pct_embargo):
+    """Map each timestamp to the end of its embargo period
+
+    Params
+    ------
+    times: index-like of timestamps to which the embargo is applied
+    pct_embargo: float in [0, 1], embargo width as a fraction of len(times)
+
+    Returns
+    -------
+    pd.Series: indexed by timestamp, valued by the timestamp `step` positions
+        later; the last `step` timestamps map to the final timestamp
+    """
+    step = int(times.shape[0] * pct_embargo)
+    if step == 0:
+        embg = pd.Series(times, index=times)
+    else:
+        embg = pd.Series(times[step:], index=times[:-step])
+        embg = pd.concat([embg, pd.Series(times[-1], index=times[-step:])])
+    return embg