forked from scikit-learn/scikit-learn
Commit
- example + benchmark explanation
- make some private functions + fix public API
- IForest using BaseForest base class for trees
- debug + plot_iforest
- classic anomaly detection datasets and benchmark
- small modif
- BaseBagging inheritance
- shuffle dataset before benchmarking
- BaseBagging inheritance
- remove class label 4 from shuttle dataset
- pep8 + rm shuttle.csv
- bench_IsolationForest.png + doc decision_function
- add tests
- remove comments
- fetching kddcup99 and shuttle datasets
- fetching kddcup99 and shuttle datasets pep8
- fetching kddcup99 and shuttle datasets pep8
- new files iforest.py and test_iforest.py
- sc alternative to pandas (but very slow) in kddcup99.py
- faster parser
- sc
- pep8 + cleanup + simplification
- example outlier detection
- clean and correct
- idem
- random_state added
- percent10=True in benchmark
- mc
- remove shuttle + minor changes
- sc
- undo modif on forest.py and recompile cython on _tree.c
- fix travis
- cosmit
- change bagging to fix travis
- Revert "change bagging to fix travis" (This reverts commit 30ea500.)
- add max_samples_ in BaseBagging.fit to fix travis
- mc
- API : don't add fit param but use a private _fit + update tests + examples to avoid warning
- adapt to the new structure of _tree.pyx
- cosmit
- add performance test for iforest
- add _tree.c _utils.c _criterion.c
- TST : pass on tests
- remove test
- relax roc-auc to fix AppVeyor
- add test on toy samples
- Handle depth averaging at python level
- plot example: rm html add png
- load_kddcup99 -> fetch_kddcup99 + doc
- Take into account arjoly comments
- sh -> shuffle
- add decision_path code from scikit-learn#5487 to bench
- Take into account arjoly comments
- Revert "add decision_path code from scikit-learn#5487 to bench" (This reverts commit 46ad44a.)
- fix bug with max_samples != int
Showing 13 changed files with 1,096 additions and 12 deletions.
@@ -0,0 +1,108 @@
"""
==========================================
IsolationForest benchmark
==========================================
A test of IsolationForest on classical anomaly detection datasets.
"""
print(__doc__)

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

np.random.seed(1)

datasets = ['http']  # , 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'

for dat in datasets:
    # loading and vectorization
    print('loading data')
    if dat in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
        X = dataset.data
        y = dataset.target

    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        # shuffle returns shuffled copies, so reassign X and y
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dat == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal are those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    print('vectorizing data')

    if dat == 'SF':
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != 'normal.').astype(int)

    if dat == 'SA':
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        lb.fit(X[:, 2])
        x2 = lb.transform(X[:, 2])
        lb.fit(X[:, 3])
        x3 = lb.transform(X[:, 3])
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != 'normal.').astype(int)

    if dat == 'http' or dat == 'smtp':
        y = (y != 'normal.').astype(int)

    n_samples, n_features = np.shape(X)
    n_samples_train = n_samples // 2
    n_samples_test = n_samples - n_samples_train

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('IsolationForest processing...')
    model = IsolationForest(bootstrap=True, n_jobs=-1)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = model.predict(X_test)  # the lower, the more normal
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
    AUC = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1,
             label='ROC for %s (area = %0.3f, train-time: %0.2fs, '
                   'test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
@@ -0,0 +1,36 @@

.. _kddcup99:

Kddcup 99 dataset
=================

The KDD Cup '99 dataset was created by processing the tcpdump portions
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
created by MIT Lincoln Lab. The artificial data (described on the `dataset's
homepage <http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html>`_) was
generated using a closed network and hand-injected attacks to produce a
large number of different types of attack with normal activity in the
background. As the initial goal was to produce a large training set for
supervised learning algorithms, there is a large proportion (80.1%) of
abnormal data, which is unrealistic in the real world and inappropriate for
unsupervised anomaly detection, which aims at detecting 'abnormal' data,
i.e. data that are

1) qualitatively different from the normal data, and
2) in a large minority among the observations.

We thus transform the KDD data set into two different data sets: SA and SF.

- SA is obtained by simply selecting all the normal data, plus a small
  proportion of abnormal data, to give an anomaly proportion of 1%.

- SF is obtained as in [2] by simply picking up the data whose attribute
  logged_in is positive, thus focusing on the intrusion attacks, which gives
  an attack proportion of 0.3%.

- http and smtp are two subsets of SF corresponding to the third feature
  being equal to 'http' (resp. to 'smtp').

:func:`sklearn.datasets.fetch_kddcup99` will load the kddcup99 dataset;
it returns a dictionary-like object
with the feature matrix in the ``data`` member
and the target values in ``target``.
The dataset will be downloaded from the web if necessary.
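
As a minimal usage sketch (not part of this commit; the subset names and the
``percent10`` flag are the ones used in the benchmark above), fetching one of
the subsets described here looks like::

    from sklearn.datasets import fetch_kddcup99

    # 'SA', 'SF', 'http' and 'smtp' select the subsets described above;
    # percent10=True loads the smaller 10% version of the data.
    dataset = fetch_kddcup99(subset='SF', percent10=True, shuffle=True)
    X, y = dataset.data, dataset.target
    print(X.shape, y.shape)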
@@ -0,0 +1,69 @@
"""
==========================================
IsolationForest example
==========================================
An example using IsolationForest for anomaly detection.

IsolationForest 'isolates' observations by randomly selecting a feature
and then randomly selecting a split value between the maximum and minimum
values of the selected feature.

Since recursive partitioning can be represented by a tree structure, the
number of splits required to isolate a sample is equivalent to the path
length from the root node to a terminating node.

This path length, averaged over a forest of such random trees, is a measure
of abnormality and our decision function.

Indeed, random partitioning produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produces shorter path
lengths for particular samples, they are highly likely to be anomalies.

.. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
    Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([b1, b2, c],
           ["training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left")
plt.show()
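
The path-length-based decision function described in the docstring can be
checked directly on the data generated above; the lines below are a small
sketch (not part of the commit) that continues from the fitted ``clf``:

# Anomalies should receive noticeably lower decision_function values
# (shorter average path lengths) than regular observations.
print("mean score on training data:     %0.3f"
      % clf.decision_function(X_train).mean())
print("mean score on new regular data:  %0.3f"
      % clf.decision_function(X_test).mean())
print("mean score on new abnormal data: %0.3f"
      % clf.decision_function(X_outliers).mean())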