Commit

initial

aepinilla committed Sep 29, 2022
1 parent ee2c3a1 commit 1579265
Showing 25 changed files with 1,887 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
data/
archive/
reports/
stats/
.idea
.DS_Store
__pycache__/
*.pyc
26 changes: 26 additions & 0 deletions README.md
@@ -0,0 +1,26 @@
# Affect detection

## Dataset
1. Download the 'data' folder from the OSF repository of the study: https://osf.io/5m3yu/
2. Place the 'data' folder inside the 'affect_detection' folder.
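
After these two steps, the repository root should look roughly like this (a sketch inferred from the files in this commit; the 'data' folder is git-ignored):
```
affect_detection/
    data/              <- downloaded from OSF
    src/
    main.py
    requirements.txt
```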

## Instructions
```
1. Clone the repository: git clone git@github.com:aepinilla/affect_detection.git
2. Go to src/feature_selection/lme_models.
3. Open the three R files located in the 'lme_models' folder.
4. Edit line 15 of each of those files so that it points to your working directory.
5. From the root folder, run: python main.py
```
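
This commit pins exact dependency versions in requirements.txt; installing them before step 5 is assumed (the README itself does not list an install step):
```
pip install -r requirements.txt
```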

## Preprocessing
The 'data' folder contains data that has already been preprocessed. To replicate the preprocessing, follow these steps:
1. Install MATLAB.
2. Install EEGLAB following these instructions: https://eeglab.org/tutorials/01_Install/Install.html
3. Clone this repository into the MATLAB folder.
4. Transform XDF files to CSV for faster processing:
```
python xdf_to_csv.py
```
5. Open EEGLAB in MATLAB and run the preprocessing script located at src/preprocessing.m (see the sketch below).
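
For a non-interactive run of step 5, a minimal sketch from a shell, assuming the `matlab` launcher is on the PATH and that the script does not require the EEGLAB GUI (an assumption; the step can equally be done from the EEGLAB GUI as described above):
```
matlab -nosplash -nodesktop -r "eeglab; run('affect_detection/src/preprocessing.m'); exit"
```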

34 changes: 34 additions & 0 deletions main.py
@@ -0,0 +1,34 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from src.analyse_features import analyse_features
from src.build_classifiers import build_classifiers
from src.compare_methods import compare_methods
from src.extract_features import extract_features
from src.lme_structure import lme_structure
from src.participants_age import participants_age
from src.random_indices import random_indices
from src.settings import participants_codes


def main():
    # Generate random indices, extract features, and adjust extracted features to LME format.
    for p in participants_codes:
        random_indices(p)
        extract_features(p)
        lme_structure(p)
    # Analyse features using LME and RFE
    analyse_features()
    # Build classification models with the selected features
    for p in participants_codes:
        build_classifiers(p)
    # Compare accuracy of classifiers built with features selected using each feature selection method.
    compare_methods()
    # Calculate participants' age for reporting in the manuscript.
    participants_age()


if __name__ == "__main__":
    main()
86 changes: 86 additions & 0 deletions requirements.txt
@@ -0,0 +1,86 @@
antropy==0.1.4
appnope==0.1.3
asttokens==2.0.5
backcall==0.2.0
Bottleneck==1.3.5
certifi==2022.9.14
cffi==1.15.1
charset-normalizer==2.1.1
cycler==0.11.0
debugpy==1.6.0
decorator==5.1.1
entrypoints==0.4
executing==0.8.3
flatten-dict==0.4.2
fonttools==4.32.0
idna==3.4
ipykernel==6.13.0
ipyparallel==6.3.0
ipython==8.2.0
ipython-genutils==0.2.0
jedi==0.18.1
Jinja2==3.1.2
joblib==1.1.0
jupyter-client==7.1.2
jupyter-core==4.10.0
kiwisolver==1.4.2
lazy_loader==0.1rc2
littleutils==0.2.2
llvmlite==0.39.1
MarkupSafe==2.1.1
matplotlib==3.5.1
matplotlib-inline==0.1.3
metakernel==0.29.0
mkl-fft==1.3.1
mkl-random==1.2.2
mkl-service==2.4.0
munkres==1.1.4
nest-asyncio==1.5.5
numba==0.56.2
numexpr==2.8.3
numpy==1.22.3
outdated==0.2.1
packaging==21.3
pandas==1.4.4
pandas-flavor==0.3.0
parso==0.8.3
patsy==0.5.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.1.0
pingouin==0.5.2
pip==22.1.2
portalocker==2.3.0
prompt-toolkit==3.0.29
psutil==5.9.0
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.11.2
pyparsing==3.0.8
python-dateutil==2.8.2
pytz==2022.1
pytz-deprecation-shim==0.1.0.post0
pyxdf==1.16.3
pyzmq==22.3.0
requests==2.28.1
rpy2==3.5.4
scikit-learn==1.0.2
scipy==1.8.0
seaborn==0.11.2
setuptools==59.8.0
six==1.16.0
stack-data==0.2.0
statsmodels==0.13.2
stochastic==0.7.0
tabulate==0.8.10
threadpoolctl==3.1.0
tornado==6.1
traitlets==5.1.1
tzdata==2022.4
tzlocal==4.2
unicodedata2==14.0.0
urllib3==1.26.12
wcwidth==0.2.5
wheel==0.37.1
xarray==2022.6.0
Empty file added src/__init__.py
Empty file.
15 changes: 15 additions & 0 deletions src/analyse_features.py
@@ -0,0 +1,15 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from src.feature_selection.conduct_lme import conduct_lme
from src.feature_selection.conduct_rfe import conduct_rfe

def analyse_features():
    conduct_lme()
    conduct_rfe()


if __name__ == "__main__":
    analyse_features()

70 changes: 70 additions & 0 deletions src/build_classifiers.py
@@ -0,0 +1,70 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from collections import defaultdict
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from src.feature_selection.select_features_rfe import select_features_rfe
from src.feature_selection.select_features_lme import select_features_lme
from src.settings import d, dimensions, feature_selection_approaches, participants_codes, random_states_list


def get_metrics(approach, p, rs):
    # Load dataset with the features selected by the given approach
    if approach == 'lme':
        participant_data = select_features_lme(p, rs)
    elif approach == 'rfe':
        participant_data = select_features_rfe(p, rs)

    nested_dict = lambda: defaultdict(nested_dict)
    participant_metrics = nested_dict()
    for dim in dimensions:
        print('Building classifier for ' + dim)
        features = participant_data[dim]['features']
        labels = participant_data[dim]['labels']

        # Split dataset: 30% for training, 70% for testing
        X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size=0.3, random_state=rs)

        # Create a Random Forest classifier
        clf = RandomForestClassifier()
        # Train the model using the training set
        clf.fit(X_train, y_train)
        # Predict the response for the test set
        y_pred = clf.predict(X_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        f1_score = metrics.f1_score(y_test, y_pred)

        participant_metrics[dim]['accuracy'] = accuracy
        participant_metrics[dim]['precision'] = precision
        participant_metrics[dim]['recall'] = recall
        participant_metrics[dim]['f1_score'] = f1_score

    return participant_metrics


def build_classifiers(p):
    # Build classifiers with each feature selection approach
    print('Building classifiers for participant ' + p)
    for approach in feature_selection_approaches:
        print('Using features obtained with ' + approach + ' analysis')
        # Build dict with each random state (10 different random states)
        participant_metrics_dict = {}
        for rs in random_states_list:
            print('Using random state ' + str(rs))
            participant_metrics_dict[rs] = pd.DataFrame.from_dict(get_metrics(approach, p, rs))

        participant_metrics_df = pd.concat(participant_metrics_dict)
        participant_metrics_df.to_csv(d + '/reports/metrics/%s/' % (approach) + p + '.csv')


if __name__ == "__main__":
    for p in participants_codes:
        build_classifiers(p)
100 changes: 100 additions & 0 deletions src/compare_methods.py
@@ -0,0 +1,100 @@
"""
Author: Andres Pinilla Palacios
Institution: Quality and Usability Lab, TU Berlin & UTS Games Studio, University of Technology Sydney
"""

from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import pingouin as pg
from pingouin import ttest
import seaborn as sns
from scipy import stats

from src.helper import conduct_iqr
from src.settings import d, dimensions, feature_selection_approaches, participants_codes


def compare_methods():
    all_participant_metrics = []
    for p in participants_codes:
        for fsa in feature_selection_approaches:
            file_path = (d + '/reports/metrics/%s/' % (fsa)) + p + '.csv'
            participant_metrics = pd.read_csv(file_path)
            participant_metrics = participant_metrics.reset_index(drop=True)
            participant_metrics = participant_metrics.rename(columns={'Unnamed: 0': 'random_state', 'Unnamed: 1': 'metric'})
            participant_metrics['participant'] = p
            participant_metrics['approach'] = fsa
            all_participant_metrics.append(participant_metrics)

    all_metrics_df = pd.concat(all_participant_metrics)
    # Precision, recall and F1-score
    all_means = all_metrics_df.groupby(['participant', 'approach', 'metric']).mean()
    all_means = all_means.drop(['random_state'], axis=1)
    all_means_of_means = all_means.groupby(['approach', 'metric']).mean()
    all_std_of_means = all_means.groupby(['approach', 'metric']).std()
    print(round(all_means_of_means * 100, 4))
    print(round(all_std_of_means * 100, 3))
    # Subset accuracy
    accuracy = all_metrics_df.loc[all_metrics_df['metric'] == 'accuracy']
    # Participant means
    means_pp = accuracy.groupby(['participant', 'approach']).mean()
    means_pp = means_pp.reset_index().drop(['random_state'], axis=1)
    # Reshape data
    reshaped_data = means_pp.melt(id_vars=['participant', 'approach'], var_name='dimension', value_name='mean_accuracy')
    # Remove outliers
    outliers = list(conduct_iqr(reshaped_data))
    no_outliers = reshaped_data[~reshaped_data['participant'].isin(outliers)].copy()
    no_outliers['mean_accuracy'] = no_outliers['mean_accuracy'] * 100
    no_outliers['approach'] = no_outliers['approach'].str.upper()
    # Assumptions check
    # Shapiro-Wilk test of normal distribution
    results_shapiro = stats.shapiro(no_outliers['mean_accuracy'])
    print(round(results_shapiro[0], 3))
    print(round(results_shapiro[1], 3))
    # Sphericity
    # Mauchly's test of sphericity
    result_mauchly = pg.sphericity(no_outliers, dv='mean_accuracy', subject='participant', within=['approach', 'dimension'])
    print(round(result_mauchly[2], 3))
    print(round(result_mauchly[4], 3))
    # ANOVA
    # Perform two-way repeated measures ANOVA
    two_way_aov = pg.rm_anova(dv='mean_accuracy', within=['approach', 'dimension'], subject='participant', data=no_outliers)
    print(two_way_aov)
    # Main effect for dimension
    main_effect_dimension = pg.anova(dv='mean_accuracy', between='dimension', data=no_outliers, detailed=True)
    print(main_effect_dimension)
    # Main effect for feature selection method
    main_effect_approach = pg.anova(dv='mean_accuracy', between='approach', data=no_outliers, detailed=True)
    print(main_effect_approach)

    # Paired samples t-test
    nested_dict = lambda: defaultdict(nested_dict)
    ttest_dict = nested_dict()
    for dim in dimensions:
        dim_data = no_outliers.loc[no_outliers.dimension == dim]
        dim_rfe = dim_data.loc[dim_data.approach == 'RFE'][['mean_accuracy']].values.flatten()
        dim_lme = dim_data.loc[dim_data.approach == 'LME'][['mean_accuracy']].values.flatten()
        res_dim_ttest = ttest(dim_rfe, dim_lme, paired=True).round(3)
        ttest_dict[dim]['ttest_results'] = res_dim_ttest
        ttest_dict[dim]['means']['lme'] = dim_lme.mean().round(3)
        ttest_dict[dim]['means']['rfe'] = dim_rfe.mean().round(3)
        ttest_dict[dim]['std']['lme'] = dim_lme.std().round(3)
        ttest_dict[dim]['std']['rfe'] = dim_rfe.std().round(3)

    # Plot
    sns.set_palette("Paired")
    sns.set_style("whitegrid")
    g = sns.barplot(data=no_outliers, x="dimension", y="mean_accuracy", hue='approach')
    g.set(xlabel='Affective dimension', ylabel='Mean accuracy of classification models')
    g.set_xticklabels(['Negativity', 'Positivity', 'Net Predisposition'])
    g.legend(title='Feature selection method')
    sns.move_legend(g, "lower left")
    plt.savefig('../reports/figures/anova_results.png', dpi=300)
    plt.show()

    return outliers


if __name__ == "__main__":
    compare_methods()