Skip to content

Commit

Permalink
identical obs neighborhood edge case fix, test env change
Browse files Browse the repository at this point in the history
  • Loading branch information
vc1492a committed Dec 31, 2018
1 parent 72e5948 commit 864a773
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 17 deletions.
2 changes: 1 addition & 1 deletion .coverage
Original file line number Diff line number Diff line change
@@ -1 +1 @@
!coverage.py: This is a private format, don't read it directly!{"lines": {"/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/loop.py": [1, 2, 3, 4, 5, 7, 8, 9, 12, 19, 36, 38, 40, 43, 44, 45, 47, 48, 49, 50, 51, 52, 56, 58, 59, 60, 61, 62, 63, 64, 65, 67, 68, 69, 73, 74, 75, 76, 78, 79, 81, 83, 84, 85, 86, 87, 89, 91, 93, 94, 95, 96, 97, 99, 101, 103, 104, 105, 106, 107, 109, 111, 113, 114, 115, 116, 117, 119, 121, 122, 123, 125, 126, 127, 129, 130, 131, 133, 134, 136, 137, 139, 140, 143, 144, 145, 146, 147, 148, 151, 152, 153, 155, 156, 158, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 176, 177, 179, 181, 182, 183, 185, 187, 189, 191, 192, 194, 196, 198, 199, 200, 202, 204, 205, 206, 208, 210, 211, 213, 214, 216, 217, 218, 220, 222, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 265, 266, 267, 268, 269, 270, 272, 273, 274, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 292, 293, 294, 295, 296, 297, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 316, 317, 318, 320, 321, 322, 323, 324, 325, 327, 329, 330, 331, 332, 333, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 349, 351, 353, 354, 355, 357, 358, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 375], "/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/__init__.py": [1]}}
!coverage.py: This is a private format, don't read it directly!{"lines": {"/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/__init__.py": [1], "/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/loop.py": [1, 2, 3, 4, 5, 7, 8, 9, 12, 19, 36, 38, 40, 43, 44, 45, 47, 48, 50, 51, 52, 53, 57, 58, 63, 69, 70, 71, 73, 75, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 92, 93, 94, 95, 97, 98, 100, 102, 103, 104, 105, 106, 108, 110, 112, 113, 114, 115, 116, 118, 120, 122, 123, 124, 125, 126, 128, 130, 132, 133, 134, 135, 136, 138, 140, 141, 142, 144, 145, 146, 148, 149, 150, 152, 153, 155, 156, 158, 159, 162, 163, 164, 165, 166, 167, 170, 171, 172, 174, 175, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 194, 195, 196, 197, 198, 200, 202, 203, 204, 206, 208, 210, 212, 213, 215, 216, 217, 219, 221, 222, 223, 225, 227, 228, 229, 231, 233, 234, 236, 237, 239, 240, 241, 243, 245, 247, 248, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 291, 292, 293, 294, 295, 296, 298, 299, 300, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 318, 319, 320, 321, 322, 323, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 342, 343, 344, 346, 347, 348, 349, 350, 351, 353, 355, 356, 357, 358, 359, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 375, 377, 379, 380, 381, 383, 384, 386, 387, 388, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 402, 403, 405]}}
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ python:
- "3.4"
- "3.5"
- "3.6"
- "3.7"
# command to install dependencies
install:
- pip install -r requirements.txt
Expand Down
56 changes: 43 additions & 13 deletions PyNomaly/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import warnings

__author__ = 'Valentino Constantinou'
__version__ = '0.2.5'
__version__ = '0.2.6'
__license__ = 'Apache License, Version 2.0'


Expand Down Expand Up @@ -45,14 +45,33 @@ def data(obj):
return points_vector
else:
warnings.warn(
'Provided data must be in ndarray or DataFrame.',
'Provided data or distance martix must be in ndarray or '
'DataFrame.',
UserWarning)
if isinstance(obj, list):
points_vector = np.array(obj)
return points_vector
points_vector = np.array([obj])
return points_vector

def inputs(self, data_obj, dist_obj):
    """Check that exactly one of data or a distance matrix was supplied.

    Parameters:
        data_obj: the observations, or None when a distance matrix is used.
        dist_obj: the distance matrix, or None when observations are used.

    Returns:
        False, after emitting a UserWarning, when neither input or both
        inputs are provided. Otherwise a ``(data, distances)`` tuple in
        which the supplied input is the value returned by ``self.data``
        and the other entry is None.
    """
    provided = [v is not None for v in (data_obj, dist_obj)]
    if not any(provided):
        warnings.warn(
            'Data or a distance matrix must be provided.',
            UserWarning)
        return False
    if all(provided):
        warnings.warn(
            'Only one of the following may be provided: data or a '
            'distance matrix (not both).',
            UserWarning)
        return False
    # Return what self.data hands back rather than the raw arguments, so
    # callers that unpack the result work with the validated input instead
    # of silently discarding the validation output.
    if data_obj is not None:
        return self.data(data_obj), None
    return None, self.data(dist_obj)

@staticmethod
def cluster_size(obj):
c_labels = obj._cluster_labels()
Expand Down Expand Up @@ -131,13 +150,13 @@ def new_f(*args, **kwds):
UserWarning)
opt_types = {
'extent': {
'type': types[2]
'type': types[3]
},
'n_neighbors': {
'type': types[3]
'type': types[4]
},
'cluster_labels': {
'type': types[4]
'type': types[5]
}
}
for x in kwds:
Expand All @@ -157,9 +176,11 @@ def new_f(*args, **kwds):

return decorator

@accepts(object, np.ndarray, (int, np.integer), (int, np.integer), list)
def __init__(self, data, extent=3, n_neighbors=10, cluster_labels=None):
@accepts(object, np.ndarray, np.ndarray, (int, np.integer), (int, np.integer), list)
def __init__(self, data=None, distance_matrix=None, extent=3,
n_neighbors=10, cluster_labels=None):
self.data = data
self.distance_matrix = distance_matrix
self.extent = extent
self.n_neighbors = n_neighbors
self.cluster_labels = cluster_labels
Expand All @@ -170,7 +191,7 @@ def __init__(self, data, extent=3, n_neighbors=10, cluster_labels=None):
self.local_outlier_probabilities = None
self._objects = {}

self.Validate.data(self.data)
self.Validate().inputs(self.data, self.distance_matrix)
self.Validate.n_neighbors(self)
self.Validate.cluster_size(self)
self.Validate.extent(self)
Expand All @@ -191,7 +212,9 @@ def _prob_outlier_factor(probabilistic_distance, ev_prob_dist):
if np.all(probabilistic_distance == ev_prob_dist):
return np.zeros(probabilistic_distance.shape)
else:
return (probabilistic_distance / ev_prob_dist) - 1.
ev_prob_dist[ev_prob_dist == 0.] = 1.e-8
result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
return result

@staticmethod
def _norm_prob_outlier_factor(extent, ev_probabilistic_outlier_factor):
Expand Down Expand Up @@ -224,6 +247,9 @@ def _euclidean(vector1, vector2):
diff = vector1 - vector2
return np.dot(diff, diff) ** 0.5

# TODO: decide what to return (0 or -1?) when some or all of the points have a
# zero distance or a zero EV distance.

def _distances(self, data_store):
distances = np.full([self._n_observations(), self.n_neighbors], 9e10,
dtype=float)
Expand Down Expand Up @@ -326,7 +352,7 @@ def _local_outlier_probabilities(self, data_store):

def fit(self):

self.Validate.data(self.data)
self.Validate().inputs(self.data, self.distance_matrix)
self.Validate.n_neighbors(self, set_neighbors=True)
self.Validate.cluster_size(self)
if self.Validate.missing_values(self) is False:
Expand All @@ -348,7 +374,7 @@ def fit(self):

return self

def stream(self, x):
def stream(self, observation=None, distance=None):

if self.Validate.no_cluster_labels(self) is False:
self.cluster_labels = np.array([0] * len(self.data))
Expand All @@ -358,7 +384,9 @@ def stream(self, x):
sys.exit()

distances = np.full([1, self.n_neighbors], 9e10, dtype=float)
point_vector = self.Validate.data(x)
point_vector, dist_vector = self.Validate().inputs(
observation, distance)

for p in range(0, self.points_vector.shape[0]):
d = self._euclidean(self.points_vector[p, :], point_vector)
idx_max = np.argmax(distances[0])
Expand All @@ -368,7 +396,9 @@ def stream(self, x):
std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
prob_dist = self._prob_distance(self.extent, std_dist)
plof = self._prob_outlier_factor(prob_dist,
np.mean(self.prob_distances_ev))
np.array(
np.mean(self.prob_distances_ev))
)
loop = self._local_outlier_probability(
plof, self.norm_prob_local_outlier_factor)

Expand Down
12 changes: 12 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ All notable changes to PyNomaly will be documented in this Changelog.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [0.2.6]()
### Fixed
- [Issue #25](https://github.com/vc1492a/PyNomaly/issues/25) - Fixed an issue
that caused zero division errors when all the values in a neighborhood are
duplicate samples. Introduced a unit test that checks for the proper
behavior when duplicate samples are present.

- To do:
  - Add a test to ensure it works with duplicate values; results should be the same as, or similar to, those obtained using np.unique.


## [0.2.5](https://github.com/vc1492a/PyNomaly/commit/1ff9bdad72948053c8fddb9b6a44eb6183dd4e49)
### Fixed
- [Issue #20](https://github.com/vc1492a/PyNomaly/issues/20) - Fixed
Expand Down
8 changes: 8 additions & 0 deletions examples/kdd_cup_smtp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from PyNomaly import loop
from sklearn.datasets import fetch_kddcup99

# Fetch the 'smtp' subset of the KDD Cup 99 data set, asking for the full
# version rather than the 10 percent sample.
data = fetch_kddcup99(subset='smtp', percent10=False)
train_data = data.data
target = data.target

# Fit Local Outlier Probabilities on the first 1000 rows, cast to float.
m = loop.LocalOutlierProbability(
    train_data[0:1000].astype(float),
    extent=3,
    n_neighbors=6,
).fit()


2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ If citing PyNomaly, use the following:
1. Breunig M., Kriegel H.-P., Ng R., Sander, J. LOF: Identifying Density-based Local Outliers. ACM SIGMOD International Conference on Management of Data (2000). [PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LOF.pdf).
2. Kriegel H., Kröger P., Schubert E., Zimek A. LoOP: Local Outlier Probabilities. 18th ACM conference on Information and knowledge management, CIKM (2009). [PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf).
3. Goldstein M., Uchida S. A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data. PLoS ONE 11(4): e0152173 (2016).
4. Hamlet C., Straub J., Russell M., Kerlin S. An incremental and approximate local outlier probability algorithm for intrusion detection and its evaluation. Journal of Cyber Security Technology (2016). [DOI](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20)
4. Hamlet C., Straub J., Russell M., Kerlin S. An incremental and approximate local outlier probability algorithm for intrusion detection and its evaluation. Journal of Cyber Security Technology (2016). [DOI](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20).

## Acknowledgements
- The authors of LoOP (Local Outlier Probabilities)
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
setup(
name='PyNomaly',
packages=['PyNomaly'],
version='0.2.5',
version='0.2.6',
description='A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].',
author='Valentino Constantinou',
author_email='[email protected]',
url='https://github.com/vc1492a/PyNomaly',
download_url='https://github.com/vc1492a/PyNomaly/archive/0.2.5.tar.gz',
download_url='https://github.com/vc1492a/PyNomaly/archive/0.2.6.tar.gz',
keywords=['outlier', 'anomaly', 'detection', 'machine', 'learning', 'probability'],
classifiers=[],
license='Apache License, Version 2.0',
Expand Down
1 change: 1 addition & 0 deletions tests/.coverage
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
!coverage.py: This is a private format, don't read it directly!{"lines": {"/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/loop.py": [128, 1, 2, 3, 4, 5, 7, 8, 9, 138, 139, 12, 19, 84, 85, 98, 36, 38, 108, 175, 177, 178, 179, 118, 73, 57, 140], "/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/__init__.py": [1]}}

0 comments on commit 864a773

Please sign in to comment.