iforest
example + benchmark

explanation

make some private functions + fix public API

IForest using BaseForest base class for trees

debug + plot_iforest

classic anomaly detection datasets and benchmark

small modif

BaseBagging inheritance

shuffle dataset before benchmarking

BaseBagging inheritance

remove class label 4 from shuttle dataset

pep8 + rm shuttle.csv bench_IsolationForest.png + doc decision_function

add tests

remove comments

fetching kddcup99 and shuttle datasets

fetching kddcup99 and shuttle datasets

pep8

fetching kddcup99 and shuttle datasets

pep8

new files iforest.py and test_iforest.py

sc

alternative to pandas (but very slow)
in kddcup99.py

faster parser

sc

pep8 + cleanup + simplification

example outlier detection

clean and correct

idem

random_state added

percent10=True in benchmark

mc

remove shuttle + minor changes

sc

undo modif on forest.py and recompile cython on _tree.c

fix travis

cosmit

change bagging to fix travis

Revert "change bagging to fix travis"

This reverts commit 30ea500.

add max_samples_ in BaseBagging.fit to fix travis

mc

API : don't add fit param but use a private _fit + update tests + examples to avoid warning

adapt to the new structure of _tree.pyx

cosmit

add performance test for iforest

add _tree.c _utils.c _criterion.c

TST : pass on tests

remove test

relax roc-auc to fix AppVeyor

add test on toy samples

Handle depth averaging at python level

plot example: rm html add png

load_kddcup99 -> fetch_kddcup99 + doc

Take into account arjoly comments

sh -> shuffle

add decision_path code from scikit-learn#5487 to bench

Take into account arjoly comments

Revert "add decision_path code from scikit-learn#5487 to bench"

This reverts commit 46ad44a.

fix bug with max_samples != int
ngoix committed Oct 24, 2015
1 parent efb0179 commit a75eb63
Showing 13 changed files with 1,096 additions and 12 deletions.
108 changes: 108 additions & 0 deletions benchmarks/bench_isolation_forest.py
@@ -0,0 +1,108 @@
"""
==========================================
IsolationForest benchmark
==========================================
A test of IsolationForest on classical anomaly detection datasets.
"""
print(__doc__)

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

np.random.seed(1)


datasets = ['http']  # , 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'

for dat in datasets:
    # loading and vectorization
    print('loading data')
    if dat in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
        X = dataset.data
        y = dataset.target

    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        # shuffle returns shuffled copies, so the result must be assigned
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dat == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    print('vectorizing data')

    if dat == 'SF':
        # one-hot encode the categorical protocol feature (column 1)
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != 'normal.').astype(int)

    if dat == 'SA':
        # one-hot encode the three categorical features (columns 1-3)
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        lb.fit(X[:, 2])
        x2 = lb.transform(X[:, 2])
        lb.fit(X[:, 3])
        x3 = lb.transform(X[:, 3])
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != 'normal.').astype(int)

    if dat in ('http', 'smtp'):
        y = (y != 'normal.').astype(int)

    n_samples, n_features = np.shape(X)
    n_samples_train = n_samples // 2
    n_samples_test = n_samples - n_samples_train

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('IsolationForest processing...')
    model = IsolationForest(bootstrap=True, n_jobs=-1)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = model.predict(X_test)  # the lower, the more normal
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
    AUC = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1,
             label='ROC for %s (area = %0.3f, train-time: %0.2fs, '
                   'test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
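
The categorical columns of the 'SA' and 'SF' subsets are one-hot encoded
above with LabelBinarizer. A minimal standalone illustration of that step
(the protocol values below are made up for the sketch):

    import numpy as np
    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer()
    col = np.array(['tcp', 'udp', 'icmp', 'tcp'])
    # classes_ are sorted alphabetically: ['icmp', 'tcp', 'udp']
    print(lb.fit_transform(col))
    # [[0 1 0]
    #  [0 0 1]
    #  [1 0 0]
    #  [0 1 0]]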
36 changes: 36 additions & 0 deletions doc/datasets/kddcup99.rst
@@ -0,0 +1,36 @@

.. _kddcup99:

KDD Cup 99 dataset
==================

The KDD Cup '99 dataset was created by processing the tcpdump portions
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
created by MIT Lincoln Lab. The artificial data (described on the `dataset's
homepage <http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html>`_) was
generated using a closed network and hand-injected attacks to produce a
large number of different types of attack with normal activity in the
background. As the initial goal was to produce a large training set for
supervised learning algorithms, there is a large proportion (80.1%) of
abnormal data, which is unrealistic in the real world and inappropriate for
unsupervised anomaly detection, which aims at detecting 'abnormal' data,
i.e. data that are
1) qualitatively different from normal data, and
2) in large minority among the observations.
We thus transform the KDD data set into two different data sets: SA and SF.

- SA is obtained by simply selecting all the normal data, and a small
  proportion of abnormal data, to give an anomaly proportion of 1%.

- SF is obtained as in [2] by simply picking up the data whose attribute
  logged_in is positive, thus focusing on the intrusion attack, which gives
  a proportion of 0.3% of attack.

- http and smtp are two subsets of SF corresponding to the third feature
  being equal to 'http' (resp. to 'smtp').

:func:`sklearn.datasets.fetch_kddcup99` will load the kddcup99 dataset;
it returns a dictionary-like object
with the feature matrix in the ``data`` member
and the target values in ``target``.
The dataset will be downloaded from the web if necessary.
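
A minimal usage sketch (the ``subset`` and ``percent10`` arguments mirror
the benchmark script added in this commit)::

    from sklearn.datasets import fetch_kddcup99

    # downloads the data on first call, then reuses the local copy
    data = fetch_kddcup99(subset='http', percent10=True, shuffle=True)
    X, y = data.data, data.target
    print(X.shape, y.shape)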
2 changes: 2 additions & 0 deletions doc/modules/classes.rst
@@ -221,6 +221,7 @@ Loaders
   datasets.fetch_olivetti_faces
   datasets.fetch_california_housing
   datasets.fetch_covtype
   datasets.fetch_kddcup99
   datasets.fetch_rcv1
   datasets.load_mlcomp
   datasets.load_sample_image
@@ -351,6 +352,7 @@ Samples generator
   ensemble.ExtraTreesRegressor
   ensemble.GradientBoostingClassifier
   ensemble.GradientBoostingRegressor
   ensemble.IsolationForest
   ensemble.RandomForestClassifier
   ensemble.RandomTreesEmbedding
   ensemble.RandomForestRegressor
41 changes: 41 additions & 0 deletions doc/modules/outlier_detection.rst
@@ -192,4 +192,45 @@ multiple modes.
an outlier detection method) and a covariance-based outlier
detection with :class:`covariance.MinCovDet`.

Isolation Forest
----------------

One efficient way of performing outlier detection in high-dimensional datasets
is to use random forests.
:class:`ensemble.IsolationForest` 'isolates' observations by randomly
selecting a feature and then randomly selecting a split value between the
maximum and minimum values of the selected feature.

Since recursive partitioning can be represented by a tree structure, the
number of splittings required to isolate a point is equivalent to the path
length from the root node to the terminating node.

This path length, averaged over a forest of such random trees, is a
measure of abnormality and our decision function.

Random partitioning produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produces shorter path
lengths for particular points, these points are highly likely to be
anomalies.
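
A minimal usage sketch, following the example added in this commit (the toy
data and the ``max_samples`` value are arbitrary)::

    import numpy as np
    from sklearn.ensemble import IsolationForest

    rng = np.random.RandomState(42)
    X_train = 0.3 * rng.randn(100, 2)  # inliers
    X_outliers = rng.uniform(low=-4, high=4, size=(10, 2))

    clf = IsolationForest(max_samples=100, random_state=rng)
    clf.fit(X_train)
    # the lower the decision function value, the more abnormal the point
    print(clf.decision_function(X_outliers))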

This strategy is illustrated below.

.. figure:: ../auto_examples/ensemble/images/plot_isolation_forest_001.png
   :target: ../auto_examples/ensemble/plot_isolation_forest.html
   :align: center
   :scale: 75%

.. topic:: Examples:

    * See :ref:`example_ensemble_plot_isolation_forest.py` for
      an illustration of the use of IsolationForest.

    * See :ref:`example_covariance_plot_outlier_detection.py` for a
      comparison of :class:`ensemble.IsolationForest` with
      :class:`svm.OneClassSVM` (tuned to perform like an outlier detection
      method) and a covariance-based outlier detection with
      :class:`covariance.MinCovDet`.

.. topic:: References:

    .. [LTZ2008] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation
       forest." Data Mining, 2008. ICDM'08. Eighth IEEE International
       Conference on.
16 changes: 12 additions & 4 deletions examples/covariance/plot_outlier_detection.py
@@ -3,7 +3,7 @@
Outlier detection with several methods.
==========================================
-When the amount of contamination is known, this example illustrates two
+When the amount of contamination is known, this example illustrates three
different ways of performing :ref:`outlier_detection`:
- based on a robust estimator of covariance, which is assuming that the
@@ -14,6 +14,10 @@
  data set, hence performing better when the data is strongly
  non-Gaussian, i.e. with two well-separated clusters;
- using the Isolation Forest algorithm, which is based on random forests and
  hence more adapted to high-dimensional settings, even if it performs
  quite well in the examples below.
The ground truth about inliers and outliers is given by the points colors
while the orange-filled area indicates which points are reported as inliers
by each method.
@@ -32,6 +36,9 @@

from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Example settings
n_samples = 200
@@ -42,7 +49,8 @@
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
-    "robust covariance estimator": EllipticEnvelope(contamination=.1)}
+    "robust covariance estimator": EllipticEnvelope(contamination=.1),
+    "Isolation Forest": IsolationForest(max_samples=n_samples,
+                                        random_state=rng)}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
@@ -61,7 +69,7 @@
# Add outliers
X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

-# Fit the model with the One-Class SVM
+# Fit the model
plt.figure(figsize=(10, 5))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # fit the data and tag outliers
@@ -74,7 +82,7 @@
    # plot the levels lines and the points
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
-    subplot = plt.subplot(1, 2, i + 1)
+    subplot = plt.subplot(1, 3, i + 1)
    subplot.set_title("Outlier detection")
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)
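
Note: the threshold variable used above is computed earlier in the full
example, outside the lines shown in this diff. A sketch of one common
recipe, assuming scipy is available and a known outliers_fraction of the
data should be flagged:

    from scipy import stats

    scores_pred = clf.decision_function(X)
    threshold = stats.scoreatpercentile(scores_pred,
                                        100 * outliers_fraction)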
69 changes: 69 additions & 0 deletions examples/ensemble/plot_isolation_forest.py
@@ -0,0 +1,69 @@
"""
==========================================
IsolationForest example
==========================================
An example using IsolationForest for anomaly detection.

IsolationForest 'isolates' observations by randomly selecting a feature
and then randomly selecting a split value between the maximum and minimum
values of the selected feature.

Since recursive partitioning can be represented by a tree structure, the
number of splittings required to isolate a sample is equivalent to the path
length from the root node to the terminating node.

This path length, averaged over a forest of such random trees, is a measure
of abnormality and our decision function.

Random partitioning produces noticeably shorter paths for anomalies.
Hence, when a forest of random trees collectively produces shorter path
lengths for particular samples, these samples are highly likely to be
anomalies.

.. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation
   forest." Data Mining, 2008. ICDM'08. Eighth IEEE International
   Conference on.
"""
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([b1, b2, c],
           ["training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left")
plt.show()
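
For reference, the decision function is built from the anomaly score defined
in [1]. A sketch of that formula (the paper's definition, not the estimator's
internal code):

    import numpy as np

    def average_path_length(n):
        # c(n) from [1]: average path length of an unsuccessful search
        # in a binary search tree built on n samples
        if n <= 1:
            return 0.0
        if n == 2:
            return 1.0
        harmonic = np.log(n - 1) + np.euler_gamma  # harmonic number H(n - 1)
        return 2.0 * harmonic - 2.0 * (n - 1) / n

    def anomaly_score(mean_depth, n_samples):
        # s(x, n) = 2 ** (-E[h(x)] / c(n)); scores close to 1 flag
        # anomalies, scores well below 0.5 indicate normal points
        return 2.0 ** (-mean_depth / average_path_length(n_samples))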
2 changes: 2 additions & 0 deletions sklearn/datasets/__init__.py
@@ -16,6 +16,7 @@
from .base import load_sample_images
from .base import load_sample_image
from .covtype import fetch_covtype
from .kddcup99 import fetch_kddcup99
from .mlcomp import load_mlcomp
from .lfw import load_lfw_pairs
from .lfw import load_lfw_people
@@ -65,6 +66,7 @@
           'fetch_california_housing',
           'fetch_covtype',
           'fetch_rcv1',
           'fetch_kddcup99',
           'get_data_home',
           'load_boston',
           'load_diabetes',
(6 more changed files not shown.)
