Skip to content

Commit

Permalink
add kNN
Browse files Browse the repository at this point in the history
  • Loading branch information
劉佳婷 authored and 劉佳婷 committed Apr 15, 2022
1 parent f506bbb commit 1cdeff5
Show file tree
Hide file tree
Showing 12 changed files with 163 additions and 10 deletions.
58 changes: 58 additions & 0 deletions Feature_Selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Sequential Backward Selection
import argparse
from statistics import mean
import numpy as np
import pandas as pd

from utils.read_data import read_data
from utils.metrics import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

# Command-line options for the Sequential Backward Selection script.
# The numeric options declare ``type=int``: argparse only applies ``default``
# as-is, while a value typed on the command line (e.g. ``--M 4``) would
# otherwise arrive as the string '4' and break code that uses it as a count.
parser = argparse.ArgumentParser()
parser.add_argument('--M', type=int, default=4, help='M-fold cross validation')
parser.add_argument('--k', type=int, default=5, help='k for kNN')
parser.add_argument('--use_SMOTE', action='store_true')
args = parser.parse_args()

def _split_fold(X, y, m, n_folds):
    """Return ``(X_train, y_train, X_val, y_val)`` for contiguous fold ``m``.

    Rows are split by position into ``n_folds`` blocks of
    ``len(X) // n_folds`` rows each. Block ``m`` is the validation set; every
    other row (including any remainder rows at the end) is used for training.
    Row order is preserved in both parts.
    """
    fold_size = len(X) // n_folds
    is_val = np.zeros(len(X), dtype=bool)
    is_val[fold_size * m:fold_size * (m + 1)] = True
    return X.iloc[~is_val], y.iloc[~is_val], X.iloc[is_val], y.iloc[is_val]


def _cv_score(X, y, model, scaler, sm, n_folds, use_smote):
    """Return mean validation F1 plus mean validation accuracy over all folds."""
    f1_scores, accuracies = [], []
    for m in range(n_folds):
        X_fit, y_fit, X_val, y_val = _split_fold(X, y, m, n_folds)
        if use_smote:
            # Oversample the training part only; validation rows stay as-is.
            X_fit, y_fit = sm.fit_resample(X_fit, y_fit)
        # Scaling statistics come from the training part of the fold only.
        X_fit = scaler.fit_transform(X_fit)
        X_val = scaler.transform(X_val)
        model.fit(X_fit, y_fit)
        y_val_pred = model.predict(X_val)
        f1, acc = metrics(y_val, y_val_pred, "kNN", work='val')
        f1_scores.append(f1)
        accuracies.append(acc)
    return mean(f1_scores) + mean(accuracies)


def main():
    """Run Sequential Backward Selection with a kNN classifier.

    Starting from all features except the first column ("Date"), repeatedly
    evaluate every candidate subset obtained by removing one feature, using
    M-fold cross-validation, and permanently remove the feature whose removal
    leaves the best score (mean F1 + mean accuracy). Features dropped first
    are therefore the least useful ones. Stops when one feature remains and
    prints each dropped column in order.
    """
    X_tr, y_tr = read_data('datasets/algerian_fires_train.csv')
    # drop first column ("Date" feature)
    X_tr = X_tr.iloc[:, 1:]
    # int() keeps this working even if --M was parsed as a string.
    n_folds = int(args.M)
    model = KNeighborsClassifier(n_neighbors=int(args.k))
    scaler = StandardScaler()
    sm = SMOTE(random_state=42)
    while X_tr.shape[1] > 1:
        # Score of the feature set that remains after removing each column.
        scores = {
            col: _cv_score(X_tr.drop(columns=col), y_tr, model, scaler, sm,
                           n_folds, args.use_SMOTE)
            for col in X_tr.columns
        }
        # SBS removes the feature whose absence hurts least, i.e. the one
        # whose removal yields the HIGHEST score. (Picking the lowest score
        # would discard the most informative feature first.) Ties are broken
        # by column name so the result is deterministic.
        drop_col, _ = max(scores.items(), key=lambda kv: (kv[1], kv[0]))
        print("dropped column:", drop_col)
        X_tr = X_tr.drop(columns=drop_col)


if __name__ == '__main__':
    main()

38 changes: 34 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
- Class 1: 115 (62.5%)
- test: 60

- Sequential Backward Selection
- Most contributing features:
- Ws > RH > FFMC > Temperature > DC > BUI > DMC > Rain > ISI

- Required reference systems
- Trivial system \
`python3 trivial.py`
Expand All @@ -20,27 +24,53 @@
- Test F1-score: 0.6286
- Test Accuracy: 0.7833

- Technique 1: Perceptron Learning \
- Technique 1: Perceptron Learning (Drop "Date")\
`python3 perceptron.py --M 4 --epoch 200 --plot_title perceptron` (M-fold cross-validation)
- Drop "Date"
- Val F1-score: 0.9113
- Val Accuracy: 0.9076
- Test F1-score: 0.8846
- Test Accuracy: 0.9

`python3 perceptron.py --M 4 --epoch 200 --normalization --plot_title p_norm`
- Drop "Date"
- Apply min-max normalization to all features
- Val F1-score: 0.9405
- Val Accuracy: 0.9457
- Test F1-score: 0.8679
- Test Accuracy: 0.8833

`python3 perceptron.py --M 4 --epoch 200 --standardization --plot_title p_std`
- Drop "Date"
- Apply standardization to all features
- Val F1-score: 0.9368
- Val Accuracy: 0.9457
- Test F1-score: 0.92
- Test Accuracy: 0.93

`python3 perceptron.py --M 4 --epoch 200 --standardization --use_SMOTE --plot_title p_std_SMOTE`
- Apply standardization to all features
- Val F1-score: 0.9667
- Val Accuracy: 0.9674
- Test F1-score: 0.9388
- Test Accuracy: 0.95

- Technique 2: KNN Classifier (Drop "Date", with Standardization)\
`python3 kNN.py --M 4 --k 5 --plot_title kNN`
- The following results are for k = (3, 4, 5, 6, 7)
- Val F1-score: (0.8532, 0.8605, 0.8745, 0.886, 0.8678)
- Val Accuracy: (0.8641, 0.8696, 0.875, 0.875, 0.8641)
- Test F1-score: (0.7179, 0.6061, 0.7568, 0.6857, 0.7368)
- Test Accuracy: (0.8167, 0.7833, 0.85, 0.8167, 0.8333)

`python3 kNN.py --M 4 --k 5 --use_SMOTE --plot_title kNN_SMOTE`
- Val F1-score: 0.8594
- Val Accuracy: 0.8696
- Test F1-score: 0.7692
- Test Accuracy: 0.85

`python3 kNN.py --M 4 --k 5 --feat_reduction --plot_title kNN_feat_reduct`
- Four least contributing features: ISI -> Rain -> DMC -> BUI
- Drop (1,2,3,4) features
- Val F1-score: (0.7121, 0.7098, 0.6595, 0.7183)
- Val Accuracy: (0.7663, 0.7663, 0.75, 0.7609)
- Test F1-score: (0.6666, 0.6154, 0.5714, 0.6222)
- Test Accuracy: (0.7666, 0.75, 0.7, 0.7167)

Binary file added cf_matrix_plots/kNN_SMOTE_k=5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added cf_matrix_plots/kNN_feat_reduct_k=5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added cf_matrix_plots/kNN_k=3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added cf_matrix_plots/kNN_k=4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added cf_matrix_plots/kNN_k=5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added cf_matrix_plots/kNN_k=6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added cf_matrix_plots/kNN_k=7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added cf_matrix_plots/p_std_SMOTE.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
62 changes: 62 additions & 0 deletions kNN.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import argparse
from statistics import mean
import numpy as np
import pandas as pd

from utils.read_data import read_data
from utils.metrics import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

# Command-line options for the kNN experiment script.
# The numeric options declare ``type=int``: argparse only applies ``default``
# as-is, while a value typed on the command line (e.g. ``--M 4``) would
# otherwise arrive as the string '4' and break code that uses it as a count.
parser = argparse.ArgumentParser()
parser.add_argument('--M', type=int, default=4, help='M-fold cross validation')
parser.add_argument('--k', type=int, default=5, help='k for kNN')
parser.add_argument('--use_SMOTE', action='store_true')
parser.add_argument('--feat_reduction', action='store_true', help='drop four least contributing features')
parser.add_argument('--plot_title', default='', help='title for cf_matrix plot')
args = parser.parse_args()

def _split_fold(X, y, m, n_folds):
    """Return ``(X_train, y_train, X_val, y_val)`` for contiguous fold ``m``.

    Rows are split by position into ``n_folds`` blocks of
    ``len(X) // n_folds`` rows each. Block ``m`` is the validation set; every
    other row (including any remainder rows at the end) is used for training.
    Row order is preserved in both parts.
    """
    fold_size = len(X) // n_folds
    is_val = np.zeros(len(X), dtype=bool)
    is_val[fold_size * m:fold_size * (m + 1)] = True
    return X.iloc[~is_val], y.iloc[~is_val], X.iloc[is_val], y.iloc[is_val]


def main():
    """Cross-validate a kNN classifier, then train on all data and test it.

    Steps:
      1. Load train/test CSVs and drop the first column ("Date").
      2. Optionally drop the 'ISI' feature (``--feat_reduction``).
      3. Run M-fold cross-validation (optional SMOTE, then standardization
         fitted on the training part of each fold) and print the mean
         validation F1 and accuracy.
      4. Refit scaler and model on the full training set, evaluate on the
         test set, and print test F1 and accuracy.
    """
    X_tr, y_tr = read_data('datasets/algerian_fires_train.csv')
    X_test, y_test = read_data('datasets/algerian_fires_test.csv')
    # drop first column ("Date" feature)
    X_tr, X_test = X_tr.iloc[:, 1:], X_test.iloc[:, 1:]
    # int() keeps this working even if --M was parsed as a string.
    n_folds = int(args.M)
    model = KNeighborsClassifier(n_neighbors=int(args.k))
    scaler = StandardScaler()
    sm = SMOTE(random_state=42)
    if args.feat_reduction:
        # NOTE(review): the option's help text mentions four features, but
        # only 'ISI' is dropped here -- confirm the intended feature set.
        X_tr = X_tr.drop(columns=['ISI'])
        X_test = X_test.drop(columns=['ISI'])

    F1_result, Acc_result = [], []
    for m in range(n_folds):
        X_tr_prime, y_tr_prime, X_val, y_val = _split_fold(X_tr, y_tr, m, n_folds)
        if args.use_SMOTE:
            # Oversample the training part only; validation rows stay as-is.
            X_tr_prime, y_tr_prime = sm.fit_resample(X_tr_prime, y_tr_prime)
        # Scaling statistics come from the training part of the fold only.
        X_tr_prime = scaler.fit_transform(X_tr_prime)
        X_val = scaler.transform(X_val)
        model.fit(X_tr_prime, y_tr_prime)
        y_val_pred = model.predict(X_val)
        f1, acc = metrics(y_val, y_val_pred, "kNN", work='val')
        F1_result.append(f1)
        Acc_result.append(acc)

    print("Val F1_score=", mean(F1_result), "Val Accuracy=", mean(Acc_result))
    print("Training with full dataset!")
    if args.use_SMOTE:
        X_tr, y_tr = sm.fit_resample(X_tr, y_tr)
    X_tr = scaler.fit_transform(X_tr)
    X_test = scaler.transform(X_test)
    # Refit on the full (rescaled) training set. Without this the test
    # predictions would come from the model left over from the last CV fold,
    # which was trained on a subset scaled with different statistics.
    model.fit(X_tr, y_tr)
    y_test_pred = model.predict(X_test)
    F1_score, Accuracy = metrics(y_test, y_test_pred, args.plot_title+'_k='+str(args.k))
    print("Test F1_score=", F1_score, "Test Accuracy=", Accuracy)


if __name__ == '__main__':
    main()

15 changes: 9 additions & 6 deletions perceptron.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from utils.read_data import read_data
from utils.metrics import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE

'''
Using Stochastic GD - variant 1
Expand All @@ -24,6 +25,7 @@
parser.add_argument('--epoch', default=200, help='# epochs trained')
parser.add_argument('--normalization', action='store_true', help='use min-max normalization')
parser.add_argument('--standardization', action='store_true', help='use standardization')
parser.add_argument('--use_SMOTE', action='store_true')
parser.add_argument('--plot_title', default='', help='title for cf_matrix plot')
args = parser.parse_args()

Expand Down Expand Up @@ -116,6 +118,7 @@ def main():
# drop first column ("Date" feature)
X_tr, X_test = X_tr.iloc[:,1:], X_test.iloc[:,1:]
F1_result, Acc_result = [0]*args.M, [0]*args.M
sm = SMOTE(random_state=42)
for m in range(args.M):
X_val, y_val = X_tr.iloc[46*m:46*(m+1)], y_tr.iloc[46*m:46*(m+1)]
if m == 0: X_tr_prime, y_tr_prime = X_tr.iloc[46:], y_tr.iloc[46:]
Expand All @@ -133,6 +136,8 @@ def main():
D = X_tr_prime.shape[1]
w, it, lr, not_linearly_separable, correctly_classified, w_vec, J_vec \
= init_train_param(D)
if args.use_SMOTE:
X_tr_prime, y_tr_prime = sm.fit_resample(X_tr_prime, y_tr_prime)
if args.normalization or args.standardization:
if args.normalization:
scaler = MinMaxScaler()
Expand All @@ -150,13 +155,11 @@ def main():
print("Training with full dataset!")
w, it, lr, not_linearly_separable, correctly_classified, w_vec, J_vec \
= init_train_param(D)
if args.use_SMOTE:
X_tr, y_tr = sm.fit_resample(X_tr, y_tr)
if args.normalization or args.standardization:
if args.normalization:
scaler_all = MinMaxScaler()
elif args.standardization:
scaler_all = StandardScaler()
X_tr = scaler_all.fit_transform(X_tr)
X_test = scaler_all.transform(X_test)
X_tr = scaler.fit_transform(X_tr)
X_test = scaler.transform(X_test)
w_hat = train(X_tr, y_tr, N, idx, w, it, lr, \
not_linearly_separable, correctly_classified, w_vec, J_vec)
y_test_pred = predict(X_test, y_test, w_hat)
Expand Down

0 comments on commit 1cdeff5

Please sign in to comment.