added Python script version of the ABIDE fMRI classification task
nikhil153 committed Apr 17, 2023
1 parent dd7fb5e commit 2415c38
Showing 3 changed files with 176 additions and 29 deletions.
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -23,7 +24,6 @@
"2. Random Forest\n",
"\n",
"## Cross-validation\n",
"1. k-fold\n",
"2. shuffle-split\n",
"\n",
"## post-hoc analysis\n",
@@ -40,18 +40,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nikhil/projects/green_comp_neuro/green_compute/lib/python3.7/site-packages/nilearn/datasets/__init__.py:89: FutureWarning: Fetchers from the nilearn.datasets module will be updated in version 0.9 to return python strings instead of bytes and Pandas dataframes instead of Numpy arrays.\n",
" \"Numpy arrays.\", FutureWarning)\n"
]
}
],
"outputs": [],
"source": [
"## Imports\n",
"from nilearn import datasets\n",
@@ -71,18 +62,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/nikhil/projects/green_comp_neuro/green_compute/lib/python3.7/site-packages/numpy/lib/npyio.py:2407: VisibleDeprecationWarning: Reading unicode strings without specifying the encoding argument is deprecated. Set the encoding, use None for the system default.\n",
" output = genfromtxt(fname, **kwargs)\n"
]
}
],
"outputs": [],
"source": [
"n_subjects = 100\n",
"parcel = 'rois_ho' # 'rois_ho' or 'rois_aal\n",
@@ -316,7 +298,7 @@
}
],
"source": [
"pheno = pd.DataFrame(data['phenotypic']).drop(columns=['i','Unnamed_0'])\n",
"pheno = pd.DataFrame(data['phenotypic']).drop(columns=['i','Unnamed: 0'])\n",
"pheno.head()"
]
},
@@ -1100,7 +1082,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.7.9"
}
},
"nbformat": 4,
15 changes: 11 additions & 4 deletions Lectures/07-Machine_Learning_1/code/ML_Regression_Tutorial.py
@@ -78,9 +78,9 @@

# ------ Uncomment lines in this block to print results -------

print(f'\nThese are the (random) model parameters before training:')
print(f'b0 (intercept): {initial_model_intercept}\nb1-b10: {initial_model_coefs}')
print(f'\nMSE before training (i.e. using random weights): {initial_mse:.5g}')
# print(f'\nThese are the (random) model parameters before training:')
# print(f'b0 (intercept): {initial_model_intercept}\nb1-b10: {initial_model_coefs}')
# print(f'\nMSE before training (i.e. using random weights): {initial_mse:.5g}')

# -------------------------------------------------------------

@@ -93,7 +93,9 @@
# +

# model = LinearRegression()

#
# train_predictions =
# train_mse =

# TODO
# **Exercise**: fit the model with training data and get the predictions.
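# A possible solution, sketched as comments (assumes X_train, y_train, and
# sklearn's mean_squared_error are available from earlier in the tutorial):
# model = LinearRegression()
# model.fit(X_train, y_train)
# train_predictions = model.predict(X_train)
# train_mse = mean_squared_error(y_train, train_predictions)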
@@ -111,6 +113,7 @@
# y_train_hat =
# my_train_mse =
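# If the collapsed exercise above asks for a manual MSE computation, one
# sketch (assumes the fitted model and numpy as np from earlier) is:
# y_train_hat = model.predict(X_train)
# my_train_mse = np.mean((y_train - y_train_hat) ** 2)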


# ------ Uncomment lines in this block to print results -------

# print(f'\nThese are the model parameters after training:')
@@ -131,6 +134,8 @@

# TODO
# **Exercise**: Check test set performance
# test_predictions =
# test_mse =
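# A possible solution, sketched as comments (assumes X_test, y_test, and the
# fitted model from earlier in the tutorial):
# test_predictions = model.predict(X_test)
# test_mse = mean_squared_error(y_test, test_predictions)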

# ------ Uncomment lines in this block to print results -------

@@ -148,6 +153,8 @@
# smallest Mean Squared Error on the training data.
#
# **Exercise**: what constant value prediction minimizes the MSE for the training sample?
# dummy_predictions =
# dummy_mse =
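# The constant prediction that minimizes MSE is the mean of the training
# targets, so one sketch (assumes y_train and numpy as np from earlier) is:
# dummy_predictions = np.full(y_train.shape, y_train.mean())
# dummy_mse = mean_squared_error(y_train, dummy_predictions)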

# ------ Uncomment lines in this block to print results -------

158 changes: 158 additions & 0 deletions Lectures/07-Machine_Learning_1/code/ML_classification.py
@@ -0,0 +1,158 @@
## Imports
from nilearn import datasets
from nilearn.connectome import ConnectivityMeasure
import pandas as pd
import numpy as np
import argparse
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix


def extract_connectome_features(func_data, measure):
    ''' A function to calculate connectome based on timeseries data and similarity measure
    '''
    connectome_matrix = measure.fit_transform([func_data])[0]
    # k=-1 keeps entries strictly below the diagonal (no self-correlations)
    tril_idx = np.tril_indices(len(connectome_matrix), k=-1)
    flat_features = connectome_matrix[tril_idx]

    return flat_features
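# Note: for a symmetric n_rois x n_rois connectivity matrix, the strict lower
# triangle yields n_rois * (n_rois - 1) / 2 features per subject.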

def load_data(n_subjects, parcel, data_dir):
    ''' Reads data from local directory or nilearn dataset
    '''
    data = datasets.fetch_abide_pcp(n_subjects=n_subjects, derivatives=[parcel], data_dir=data_dir)
    pheno = pd.DataFrame(data['phenotypic']).drop(columns=['i', 'Unnamed: 0'])

    return data, pheno

def get_train_test_splits(X, y, test_subset_fraction=0.2):
    ''' Splits samples into a single train-test split
    '''
    stratification = y

    X_train, X_test, y_train, y_test = train_test_split(
        X,                                  # input features
        y,                                  # output labels
        test_size=test_subset_fraction,
        shuffle=True,                       # shuffle dataset before splitting
        stratify=stratification,
        random_state=123                    # same shuffle each time
    )

    # print the size of our training and test groups
    print('training:', len(X_train), 'testing:', len(X_test))

    return X_train, X_test, y_train, y_test

def run(n_subjects, parcel, data_dir, task, model):
    ''' Setup and run ML tasks
    '''
    print("-"*25)
    print("Loading data")
    print("-"*25)
    data, pheno = load_data(n_subjects, parcel, data_dir)

    # Imaging variables
    features = data[parcel]
    print(f'Number of samples: {len(features)}')
    subject_feature_shape = features[0].shape
    n_rois = subject_feature_shape[1]
    print(f'subject_feature_shape: {subject_feature_shape}')

    # preprocess fmri data (flatten connectome)
    print("-"*25)
    print("Flattening the connectome matrix")
    print("-"*25)
    correlation_measure = ConnectivityMeasure(kind='correlation')

    print(f"Extracting lower triangle values from {n_rois}x{n_rois} connectivity matrix")
    flat_features_list = []
    for func_data in features:
        flat_features = extract_connectome_features(func_data, correlation_measure)
        flat_features_list.append(flat_features)

    # setup X,y for ML model
    print("-"*25)
    print("Setting up X and y for the ML model")
    print("-"*25)
    X = np.array(flat_features_list)
    print(f'Input data (X) shape: {X.shape}')

    y = pheno[task]
    y_counts = y.value_counts()

    print(f'Unique output classes:\n{y_counts}')

    # Encode labels to integer categories
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    # Get a single train-test split (80/20)
    X_train, X_test, y_train, y_test = get_train_test_splits(X, y)

    # train model
    if model == 'RF':
        clf = RandomForestClassifier(max_depth=3, class_weight='balanced', random_state=0)
    elif model == 'LR':
        clf = LogisticRegression(penalty='l1', C=1, class_weight='balanced', solver='saga', random_state=0)
    else:
        print(f'Unknown model: {model}')

    if model in ["RF", "LR"]:
        print("-"*25)
        print(f"Training {model} model")
        print("-"*25)
        clf.fit(X_train, y_train)
        train_acc = clf.score(X_train, y_train)
        print(f'train acc: {train_acc:.3f}')

        # Evaluate on a test set
        y_pred = clf.predict(X_test)
        test_acc = clf.score(X_test, y_test)
        print(f'test acc: {test_acc:.3f}')

        print("-"*25)
        print("Other useful performance metrics:")
        print("-"*25)
        test_cm = confusion_matrix(y_test, y_pred)
        print(f"Confusion matrix:\n{test_cm}")
        p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

        print(f'precision: {p:.2f}, recall: {r:.2f}, f1: {f1:.2f}')


if __name__ == '__main__':
    # argparse
    HELPTEXT = """
    Script version of the classification tutorial (diagnosis or scan-site) using ABIDE dataset
    """
    parser = argparse.ArgumentParser(description=HELPTEXT)

    parser.add_argument('--n_subjects', type=int, default=100, help='number of subjects to download')
    parser.add_argument('--parcel', type=str, default="rois_ho", help='parcellation for connectome (rois_ho or rois_aal)')
    parser.add_argument('--data_dir', type=str, default="./", help='data dir for previously downloaded data')
    parser.add_argument('--task', type=str, default="DX_GROUP", help='ML classification task (DX_GROUP or SITE_ID)')
    parser.add_argument('--model', type=str, default="RF", help='ML model to use (RF or LR)')

    args = parser.parse_args()

    n_subjects = args.n_subjects
    parcel = args.parcel
    data_dir = args.data_dir
    task = args.task
    model = args.model

    print("-"*50)
    print(f"Performing {task} classification task using {model} model with {n_subjects} subjects and {parcel} parcellation")
    print("-"*50)

    run(n_subjects, parcel, data_dir, task, model)

    print("-"*50)
    print(f"Analysis completed for {task} classification task using {model}!")
    print("-"*50)
