Commit
Showing 25 changed files with 1,887 additions and 0 deletions.
@@ -0,0 +1,8 @@
data/
archive/
reports/
stats/
.idea
.DS_Store
__pycache__/
*.pyc
@@ -0,0 +1,26 @@
# Affect detection

## Dataset
1. Download the 'data' folder from the OSF repository of the study: https://osf.io/5m3yu/
2. Place the 'data' folder inside the 'affect_detection' folder.

## Instructions
```
1. Clone the repository: git clone git@github.com:aepinilla/affect_detection.git
2. Go to src/feature_selection/lme_models
3. Open the 3 R files located in the 'lme_models' folder.
4. Edit line 15 of each of those files so that it points to your working directory.
5. From the root folder, run: python main.py
```

## Preprocessing
The 'data' folder contains data that has already been preprocessed. To replicate the preprocessing, follow these steps:
1. Install MATLAB.
2. Install EEGLAB following these instructions: https://eeglab.org/tutorials/01_Install/Install.html
3. Clone this repository into the MATLAB folder.
4. Transform the XDF files to CSV for faster processing (a sketch of this step is shown after this list):
```
python xdf_to_csv.py
```
5. Open EEGLAB in MATLAB and run the preprocessing script located in src/preprocessing.m
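The xdf_to_csv.py script itself is not shown in this commit view. As a rough illustration only, the conversion step could look something like the sketch below. It assumes pyxdf and pandas (both pinned in requirements.txt) and hypothetical data/xdf/ and data/csv/ folders; it is not the repository's actual script.

```python
# Illustrative sketch only -- NOT the repository's xdf_to_csv.py.
# Assumes pyxdf and pandas (pinned in requirements.txt) and hypothetical
# folders data/xdf/ (input recordings) and data/csv/ (output files).
import glob
import os

import pandas as pd
import pyxdf


def xdf_to_csv(xdf_path: str, out_dir: str) -> None:
    """Write one CSV per stream found in an XDF recording."""
    streams, _header = pyxdf.load_xdf(xdf_path)
    base = os.path.splitext(os.path.basename(xdf_path))[0]
    for i, stream in enumerate(streams):
        # Stream name from the XDF metadata; fall back to an index if missing.
        name = stream["info"]["name"][0] or f"stream_{i}"
        df = pd.DataFrame(stream["time_series"])
        df.insert(0, "timestamp", stream["time_stamps"])
        df.to_csv(os.path.join(out_dir, f"{base}_{name}.csv"), index=False)


if __name__ == "__main__":
    os.makedirs("data/csv", exist_ok=True)
    for path in glob.glob("data/xdf/*.xdf"):
        xdf_to_csv(path, "data/csv")
```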
@@ -0,0 +1,34 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from src.analyse_features import analyse_features
from src.build_classifiers import build_classifiers
from src.compare_methods import compare_methods
from src.extract_features import extract_features
from src.lme_structure import lme_structure
from src.participants_age import participants_age
from src.random_indices import random_indices
from src.settings import participants_codes


def main():
    # Generate random indices, extract features, and adjust extracted features to the LME format.
    for p in participants_codes:
        random_indices(p)
        extract_features(p)
        lme_structure(p)
    # Analyse features using LME and RFE.
    analyse_features()
    # Build classification models with the selected features.
    for p in participants_codes:
        build_classifiers(p)
    # Compare the accuracy of classifiers built with features selected by each feature selection method.
    compare_methods()
    # Calculate participants' age for reporting in the manuscript.
    participants_age()


if __name__ == "__main__":
    main()
@@ -0,0 +1,86 @@
antropy==0.1.4
appnope==0.1.3
asttokens==2.0.5
backcall==0.2.0
Bottleneck==1.3.5
certifi==2022.9.14
cffi==1.15.1
charset-normalizer==2.1.1
cycler==0.11.0
debugpy==1.6.0
decorator==5.1.1
entrypoints==0.4
executing==0.8.3
flatten-dict==0.4.2
fonttools==4.32.0
idna==3.4
ipykernel==6.13.0
ipyparallel==6.3.0
ipython==8.2.0
ipython-genutils==0.2.0
jedi==0.18.1
Jinja2==3.1.2
joblib==1.1.0
jupyter-client==7.1.2
jupyter-core==4.10.0
kiwisolver==1.4.2
lazy_loader==0.1rc2
littleutils==0.2.2
llvmlite==0.39.1
MarkupSafe==2.1.1
matplotlib==3.5.1
matplotlib-inline==0.1.3
metakernel==0.29.0
mkl-fft==1.3.1
mkl-random==1.2.2
mkl-service==2.4.0
munkres==1.1.4
nest-asyncio==1.5.5
numba==0.56.2
numexpr==2.8.3
numpy==1.22.3
outdated==0.2.1
packaging==21.3
pandas==1.4.4
pandas-flavor==0.3.0
parso==0.8.3
patsy==0.5.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.1.0
pingouin==0.5.2
pip==22.1.2
portalocker==2.3.0
prompt-toolkit==3.0.29
psutil==5.9.0
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.11.2
pyparsing==3.0.8
python-dateutil==2.8.2
pytz==2022.1
pytz-deprecation-shim==0.1.0.post0
pyxdf==1.16.3
pyzmq==22.3.0
requests==2.28.1
rpy2==3.5.4
scikit-learn==1.0.2
scipy==1.8.0
seaborn==0.11.2
setuptools==59.8.0
six==1.16.0
stack-data==0.2.0
statsmodels==0.13.2
stochastic==0.7.0
tabulate==0.8.10
threadpoolctl==3.1.0
tornado==6.1
traitlets==5.1.1
tzdata==2022.4
tzlocal==4.2
unicodedata2==14.0.0
urllib3==1.26.12
wcwidth==0.2.5
wheel==0.37.1
xarray==2022.6.0
Empty file.
@@ -0,0 +1,15 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from src.feature_selection.conduct_lme import conduct_lme
from src.feature_selection.conduct_rfe import conduct_rfe


def analyse_features():
    conduct_lme()
    conduct_rfe()


if __name__ == "__main__":
    analyse_features()
@@ -0,0 +1,70 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from collections import defaultdict
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from src.feature_selection.select_features_rfe import select_features_rfe
from src.feature_selection.select_features_lme import select_features_lme
from src.settings import d, dimensions, feature_selection_approaches, participants_codes, random_states_list


def get_metrics(approach, p, rs):
    # Load the dataset with the features selected by the requested approach
    if approach == 'lme':
        participant_data = select_features_lme(p, rs)
    if approach == 'rfe':
        participant_data = select_features_rfe(p, rs)

    nested_dict = lambda: defaultdict(nested_dict)
    participant_metrics = nested_dict()
    for dim in dimensions:
        print('Building classifier for ' + dim)
        features = participant_data[dim]['features']
        labels = participant_data[dim]['labels']

        # Split the dataset into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size=0.3, random_state=rs)

        # Create a random forest classifier
        clf = RandomForestClassifier()
        # Train the model on the training set
        clf.fit(X_train, y_train)
        # Predict the response for the test set
        y_pred = clf.predict(X_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        f1_score = metrics.f1_score(y_test, y_pred)

        participant_metrics[dim]['accuracy'] = accuracy
        participant_metrics[dim]['precision'] = precision
        participant_metrics[dim]['recall'] = recall
        participant_metrics[dim]['f1_score'] = f1_score

    return participant_metrics


def build_classifiers(p):
    # Build classifiers with each feature selection approach
    print('Building classifiers for participant ' + p)
    for approach in feature_selection_approaches:
        print('Using features obtained with ' + approach + ' analysis')
        # Collect metrics for each of the 10 random states
        participant_metrics_dict = {}
        for rs in random_states_list:
            print('Using random state ' + str(rs))
            participant_metrics_dict[rs] = pd.DataFrame.from_dict(get_metrics(approach, p, rs))

        participant_metrics_df = pd.concat(participant_metrics_dict)
        participant_metrics_df.to_csv(d + '/reports/metrics/%s/' % (approach) + p + '.csv')


if __name__ == "__main__":
    # build_classifiers() requires a participant code, so run it for every participant.
    for p in participants_codes:
        build_classifiers(p)
@@ -0,0 +1,100 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import pingouin as pg
from pingouin import ttest
import seaborn as sns
from scipy import stats

from src.helper import conduct_iqr
from src.settings import d, dimensions, feature_selection_approaches, participants_codes


def compare_methods():
    all_participant_metrics = []
    for p in participants_codes:
        for fsa in feature_selection_approaches:
            file_path = (d + '/reports/metrics/%s/' % (fsa)) + p + '.csv'
            participant_metrics = pd.read_csv(file_path)
            participant_metrics = participant_metrics.reset_index(drop=True)
            participant_metrics = participant_metrics.rename(columns={'Unnamed: 0': 'random_state', 'Unnamed: 1': 'metric'})
            participant_metrics['participant'] = p
            participant_metrics['approach'] = fsa
            all_participant_metrics.append(participant_metrics)

    all_metrics_df = pd.concat(all_participant_metrics)
    # Precision, recall and F1-score
    all_means = all_metrics_df.groupby(['participant', 'approach', 'metric']).mean()
    all_means = all_means.drop(['random_state'], axis=1)
    all_means_of_means = all_means.groupby(['approach', 'metric']).mean()
    all_std_of_means = all_means.groupby(['approach', 'metric']).std()
    print(round(all_means_of_means * 100, 4))
    print(round(all_std_of_means * 100, 3))
    # Subset accuracy
    accuracy = all_metrics_df.loc[all_metrics_df['metric'] == 'accuracy']
    # Participant means
    means_pp = accuracy.groupby(['participant', 'approach']).mean()
    means_pp = means_pp.reset_index().drop(['random_state'], axis=1)
    # Reshape data
    reshaped_data = means_pp.melt(id_vars=['participant', 'approach'], var_name='dimension', value_name='mean_accuracy')
    # Remove outliers
    outliers = list(conduct_iqr(reshaped_data))
    no_outliers = reshaped_data[~reshaped_data['participant'].isin(outliers)]
    no_outliers['mean_accuracy'] = no_outliers['mean_accuracy'] * 100
    no_outliers['approach'] = no_outliers['approach'].str.upper()
    # Assumptions check
    # Shapiro-Wilk test of normal distribution
    results_shapiro = stats.shapiro(no_outliers['mean_accuracy'])
    round(results_shapiro[0], 3)
    round(results_shapiro[1], 3)
    # Sphericity
    # Mauchly's test of sphericity
    result_mauchly = pg.sphericity(no_outliers, dv='mean_accuracy', subject='participant', within=['approach', 'dimension'])
    round(result_mauchly[2], 3)
    round(result_mauchly[4], 3)
    # ANOVA
    # Perform a two-way repeated measures ANOVA
    two_way_aov = pg.rm_anova(dv='mean_accuracy', within=['approach', 'dimension'], subject='participant', data=no_outliers)
    print(two_way_aov)
    # Main effect of dimension
    main_effect_dimension = pg.anova(dv='mean_accuracy', between='dimension', data=no_outliers, detailed=True)
    print(main_effect_dimension)
    # Main effect of feature selection method
    main_effect_approach = pg.anova(dv='mean_accuracy', between='approach', data=no_outliers, detailed=True)
    print(main_effect_approach)

    # Paired samples t-tests
    nested_dict = lambda: defaultdict(nested_dict)
    ttest_dict = nested_dict()
    for dim in dimensions:
        dim_data = no_outliers.loc[no_outliers.dimension == dim]
        dim_rfe = dim_data.loc[dim_data.approach == 'RFE'][['mean_accuracy']].values.flatten()
        dim_lme = dim_data.loc[dim_data.approach == 'LME'][['mean_accuracy']].values.flatten()
        res_dim_ttest = ttest(dim_rfe, dim_lme, paired=True).round(3)
        ttest_dict[dim]['ttest_results'] = res_dim_ttest
        ttest_dict[dim]['means']['lme'] = dim_lme.mean().round(3)
        ttest_dict[dim]['means']['rfe'] = dim_rfe.mean().round(3)
        ttest_dict[dim]['std']['lme'] = dim_lme.std().round(3)
        ttest_dict[dim]['std']['rfe'] = dim_rfe.std().round(3)

    # Plot
    sns.set_palette("Paired")
    sns.set_style("whitegrid")
    g = sns.barplot(data=no_outliers, x="dimension", y="mean_accuracy", hue='approach')
    g.set(xlabel='Affective dimension', ylabel='Mean accuracy of classification models')
    g.set_xticklabels(['Negativity', 'Positivity', 'Net Predisposition'])
    g.legend(title='Feature selection method')
    sns.move_legend(g, "lower left")
    plt.savefig('../reports/figures/anova_results.png', dpi=300)
    plt.show()

    return outliers


if __name__ == "__main__":
    compare_methods()