Skip to content


Browse files Browse the repository at this point in the history
  • Loading branch information
merlingo committed Jan 7, 2019
1 parent a14d637 commit dfc1ff7
Show file tree
Hide file tree
Showing 4 changed files with 371 additions and 0 deletions.
48 changes: 48 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

def EuclideanDistance(a, b):
    """Return the Euclidean distance between two vectors.

    :param a: first vector (any iterable of numbers).
    :param b: second vector; paired element-wise with ``a`` through ``zip``,
        so surplus trailing elements of the longer vector are ignored.
    :return: the distance between ``a`` and ``b``.
    """
    import math

    distance = math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    # str() is required here: "text" + <float> raises TypeError.
    print("Uzaklık: " + str(distance))
    return distance

def distanceBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    """Detect anomalies with the nested-loop distance-based method.

    A point is reported as an outlier when fewer than
    ``frac_threshold * len(dataset)`` of the *other* points lie closer to it
    than ``dist_threshold``.

    :param dataset: indexable collection of points (supports ``len`` and
        integer indexing).
    :param distfunc: callable ``distfunc(p, q)`` returning the distance
        between two points.
    :param dist_threshold: radius below which another point counts as a
        neighbour (strict ``<`` comparison).
    :param frac_threshold: fraction of the dataset size that must be reached
        by the neighbour count for a point to be considered normal.
    :return: list of indices into ``dataset`` of the detected outliers.
    """
    n = len(dataset)
    min_neighbours = frac_threshold * n
    outlier = []

    for i in range(n):
        neighbours = 0
        is_outlier = True
        for j in range(n):
            if j != i and distfunc(dataset[j], dataset[i]) < dist_threshold:
                neighbours += 1
                if neighbours >= min_neighbours:
                    # Enough close points found: dataset[i] is not an outlier,
                    # so the remaining distances need not be computed.
                    is_outlier = False
                    break
        if is_outlier:
            outlier.append(i)
    return outlier

def gridBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    """Grid-based outlier detection (not implemented).

    The signature mirrors ``distanceBasedOutlierDetection`` so the two
    detectors can be swapped by callers.

    :param dataset: collection of points to analyse.
    :param distfunc: callable returning the distance between two points.
    :param dist_threshold: distance threshold.
    :param frac_threshold: fraction threshold.
    :raises NotImplementedError: always; no algorithm body exists yet.
    """
    # NOTE(review): no implementation is present for this detector. Failing
    # loudly is preferable to silently returning None, which callers would
    # mistake for a result.
    raise NotImplementedError("gridBasedOutlierDetection is not implemented")

if __name__ == "__main__":
    # NOTE(review): the body of this entry point is not available; it is kept
    # as an explicit no-op so the module stays importable and runnable.
    pass
61 changes: 61 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import tsne
import numpy as np
import matplotlib.pylab as pylab
import Word2VectorApp as w2v
import AnomalyDet as anomaly
def getWordAndVectors():
    """Load words and their vectors from ``word2vec.txt``.

    Every data line is split on whitespace; the first field is taken as the
    word and the remaining fields as the components of its vector.

    :return: ``(dataset, words)`` where ``words[i]`` is the word read from a
        line and ``dataset[i]`` is the list of its vector components, still
        as strings (callers convert them to numbers).
    """
    dfn = "word2vec.txt"
    dataset = []
    words = []
    with open(dfn, "r") as f:
        for line_no, line in enumerate(f, start=1):
            # NOTE(review): line 1 is special-cased; presumably it is the
            # "<vocabulary size> <dimensions>" header of the word2vec text
            # format and must not become a data row - confirm against the file.
            if line_no == 1:
                continue
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            words.append(fields[0])
            dataset.append(fields[1:])
    return dataset, words
def EuclideanDistance(a, b):
    """Compute the straight-line (L2) distance between ``a`` and ``b``.

    The vectors are walked in lockstep with ``zip``; the squared coordinate
    differences are accumulated and the square root of the total is returned.
    """
    import math

    squared_total = sum([(p - q) ** 2 for p, q in zip(a, b)])
    return math.sqrt(squared_total)
def printoutliers(outlier, words):
    """Print the number of outliers followed by the outlier words.

    :param outlier: indices into ``words`` identifying the outliers.
    :param words: sequence of words, indexable by the values in ``outlier``.
    """
    print(str(len(outlier)) + " kadar outlier vardır:")
    # The looked-up words were previously built and then discarded.
    print([words[i] for i in outlier])

def __main():
    """Run distance-based outlier detection on word vectors and plot them.

    Steps:
      1. obtain the word vectors and their words,
      2. run ``anomaly.distanceBasedOutlierDetection`` for several
         (distance threshold, fraction threshold) pairs and print the
         outlier words of each run,
      3. embed the vectors in 2-D with t-SNE and show an annotated scatter
         plot.
    """
    # Create a dataset where each tuple will be word2vec vector of a word.
    # NOTE(review): w2v.app() returns (vectors, words); confirm it is the
    # intended source rather than getWordAndVectors().
    dataset, words = w2v.app()
    # The np.float alias was removed from NumPy; the builtin float is the
    # same dtype.
    X = np.array(dataset).astype(float)

    # Apply an anomaly detection technique for each configuration.
    # Each entry: (label printed for the run, dist_threshold, frac_threshold).
    runs = [
        ("1.0", 1, 0.2),
        ("1.1", 1.1, 0.3),
        ("1.2", 1.2, 0.3),
        ("1.3", 1.3, 0.3),
        ("1.5", 1.5, 0.4),
    ]
    for label, dist_threshold, frac_threshold in runs:
        outlier = anomaly.distanceBasedOutlierDetection(
            X, EuclideanDistance, dist_threshold, frac_threshold)
        print(label + " icin outliers:")
        printoutliers(outlier, words)

    # Show the string of words on the TSNE visualization
    Y = tsne.tsne(X, 2, 50, 20.0)
    pylab.scatter(Y[:, 0], Y[:, 1])
    for i, word in enumerate(words):
        pylab.annotate(word, xy=(Y[i, 0], Y[i, 1]))
    # Without show() the figure is built but never displayed.
    pylab.show()
if __name__ == "__main__":
    # Run the outlier-detection demo when executed as a script.
    __main()
72 changes: 72 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import gensim
from gensim.corpora import WikiCorpus
import os
import random
# 1   - get the corpus
# 1.2 - train a word2vec model to obtain the word vectors
# 2   - select some keywords on the same topic as the dataset
# 3   - select some anomaly words unrelated to the topic and insert them into the dataset
# 4   - return the dataset: the words and their vectors
# Keywords on the dataset's topic; checkKeywords() filters them against the
# model vocabulary.
# NOTE(review): the list literal was unterminated; further entries may be
# missing - confirm against the original file.
keywords = [
    "bilgisayar", "c++", "java", "sistematik", "insan-bilgisayar",
    "komputer", "computer", "programlama", "windows",
    "etkileşimi", "işletim", "sistemi", "teorik", "bilimi",
    "matematiksel", "kodlama", "teorisi", "veri", "yapıları",
]
def getCorpus(dump_path="trwiki-20181201-pages-articles-multistream.xml.bz2"):
    """Read every article of a Wikipedia dump into memory.

    :param dump_path: path of the compressed Wikipedia XML dump; defaults to
        the Turkish dump this project was built around.
    :return: list with one entry per article, as yielded by
        ``WikiCorpus.get_texts()``.
    """
    # An explicit (empty) dictionary is handed to WikiCorpus.
    # NOTE(review): presumably this avoids a vocabulary-building pass over
    # the dump - confirm against the gensim version in use.
    wiki = WikiCorpus(dump_path, lemmatize=False, dictionary={})
    corpus = list(wiki.get_texts())
    print("Finished Saved " + str(len(corpus)) + " articles")
    return corpus
def modeling(doc):
    """Train a Word2Vec model on ``doc`` and return it.

    ``doc`` is passed straight to ``gensim.models.Word2Vec`` together with
    the fixed hyper-parameters below.
    """
    hyperparams = {
        "size": 150,
        "window": 10,
        "min_count": 2,
        "workers": 10,
    }
    return gensim.models.Word2Vec(doc, **hyperparams)
def dataset(keyws, model):
    """Collect the model vector of every word in ``keyws``.

    :param keyws: iterable of words to look up.
    :param model: object whose ``wv`` attribute maps a word to its vector
        (``model.wv[word]``).
    :return: list of vectors in the same order as ``keyws``.
    """
    # A local previously reused the function's own name and the loop had no
    # body; build the list directly instead.
    return [model.wv[k] for k in keyws]
def app(outfile="model.txt"):
    """Build or load the Word2Vec model and return the vectors to analyse.

    When ``outfile`` does not exist the corpus is read, a model is trained
    and stored at ``outfile``; otherwise the stored model is loaded. The
    topic keywords found in the vocabulary, plus the anomaly words found in
    the vocabulary, are then looked up in the model.

    :param outfile: path of the persisted model.
    :return: ``(dset, keyws)`` - the vectors and the words they belong to,
        index-aligned.
    """
    # NOTE(review): `anomaly` is read as a module-level list of anomaly
    # words; it must be defined at module level in this file.
    if not os.path.isfile(outfile):
        print("model is not exist, it will be produced")
        corpus = getCorpus()
        model = modeling(corpus)
        # trim unneeded model memory = use (much) less RAM
        # NOTE(review): this comment matches gensim's init_sims(replace=True)
        # recipe (gensim 3.x API, which the file's `model.wv.vocab` suggests);
        # confirm it is the intended call.
        model.init_sims(replace=True)
        # Persist the model so the next run takes the load branch.
        model.save(outfile)
    else:
        print("model is exist, it will load from file")
        model = gensim.models.Word2Vec.load(outfile)

    # A set makes each membership test below constant-time.
    words = set(model.wv.vocab)
    keyws = checkKeywords(words)
    for k in anomaly:
        if k in words:
            keyws.append(k)
        else:
            print("bu anomaly yok:" + k)
    dset = dataset(keyws, model)
    return dset, keyws

def checkKeywords(words, candidates=None):
    """Return the candidate keywords that occur in ``words``.

    Candidates that are missing from ``words`` are reported on stdout.

    :param words: container supporting ``in`` (the model vocabulary).
    :param candidates: keywords to check; defaults to the module-level
        ``keywords`` list.
    :return: list of the candidates present in ``words``, in their original
        order.
    """
    if candidates is None:
        candidates = keywords
    kw = []
    for k in candidates:
        if k in words:
            kw.append(k)
        else:
            # Only absent words are reported as missing.
            print("bu kelime yok:" + k)
    return kw
if __name__ == "__main__":
    # Build (or load) the model and the dataset when run as a script.
    app()
190 changes: 190 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Implementation of t-SNE in Python. The implementation was tested on Python
# 2.7.10, and it requires a working installation of NumPy. The implementation
# comes with an example on the MNIST dataset. In order to plot the
# results of this example, a working installation of matplotlib is required.
# The example can be run by executing: `ipython tsne.py`
# Created by Laurens van der Maaten on 20-12-08.
# Copyright (c) 2008 Tilburg University. All rights reserved.

import numpy as np
import matplotlib.pylab as pylab

def Hbeta(D=np.array([]), beta=1.0):
    """
        Compute the perplexity and the P-row for a specific value of the
        precision of a Gaussian distribution.

        Returns the pair (H, P): H is the entropy term computed from the
        unnormalised kernel values, P is the kernel row normalised by its sum.
    """

    # Unnormalised Gaussian kernel values for the given precision
    negated = -D.copy()
    kernel = np.exp(negated * beta)
    total = sum(kernel)

    # Entropy of the row, then the row rescaled by its sum
    entropy = np.log(total) + beta * np.sum(D * kernel) / total
    return entropy, kernel / total

def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
    """
        Performs a binary search to get P-values in such a way that each
        conditional Gaussian has the same perplexity.

        :param X: 2-D array with one data point per row.
        :param tol: tolerance on the difference between the entropy of a row
            and log(perplexity).
        :param perplexity: target perplexity of every conditional
            distribution.
        :return: (n, n) array; row i holds the conditional probabilities of
            the other points given point i, with a zero diagonal.
    """

    # Initialize some variables
    print("Computing pairwise distances...")
    (n, _) = X.shape
    sum_X = np.sum(np.square(X), 1)
    # Squared Euclidean distances: |xi|^2 + |xj|^2 - 2 * xi.xj
    D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
    P = np.zeros((n, n))
    beta = np.ones((n, 1))
    logU = np.log(perplexity)

    # Loop over all datapoints
    for i in range(n):

        # Print progress
        if i % 500 == 0:
            print("Computing P-values for point %d of %d..." % (i, n))

        # Indices of every point except i (computed once, used twice below)
        others = np.concatenate((np.r_[0:i], np.r_[i + 1:n]))

        # Compute the Gaussian kernel and entropy for the current precision
        betamin = -np.inf
        betamax = np.inf
        Di = D[i, others]
        (H, thisP) = Hbeta(Di, beta[i])

        # Evaluate whether the perplexity is within tolerance
        Hdiff = H - logU
        tries = 0
        while np.abs(Hdiff) > tol and tries < 50:

            # If not, increase or decrease precision
            if Hdiff > 0:
                # Entropy too high: raise the precision; double it while no
                # upper bound is known, otherwise bisect.
                betamin = beta[i].copy()
                if betamax == np.inf or betamax == -np.inf:
                    beta[i] = beta[i] * 2.
                else:
                    beta[i] = (beta[i] + betamax) / 2.
            else:
                # Entropy too low: lower the precision; halve it while no
                # lower bound is known, otherwise bisect.
                betamax = beta[i].copy()
                if betamin == np.inf or betamin == -np.inf:
                    beta[i] = beta[i] / 2.
                else:
                    beta[i] = (beta[i] + betamin) / 2.

            # Recompute the values
            (H, thisP) = Hbeta(Di, beta[i])
            Hdiff = H - logU
            tries += 1

        # Set the final row of P
        P[i, others] = thisP

    # Return final P-matrix
    print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
    return P

def pca(X=np.array([]), no_dims=50):
    """
        Runs PCA on the NxD array X in order to reduce its dimensionality to
        no_dims dimensions.

        :param X: NxD array with one data point per row.
        :param no_dims: number of principal components to keep; when it
            exceeds D all D components are kept.
        :return: array with N rows holding the centred data projected on the
            kept components, ordered by decreasing eigenvalue.
    """

    print("Preprocessing the data using PCA...")
    # Centre every column on zero.
    X = X - np.mean(X, 0)
    # X.T @ X is symmetric, so eigh applies and returns real values.
    # np.linalg.eig gives no ordering guarantee, so taking its "first"
    # columns does not necessarily select the principal components.
    (l, M) = np.linalg.eigh(np.dot(X.T, X))
    top = np.argsort(l)[::-1][:no_dims]
    Y = np.dot(X, M[:, top])
    return Y

def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0,
         max_iter=1000):
    """
        Runs t-SNE on the dataset in the NxD array X to reduce its
        dimensionality to no_dims dimensions. The syntaxis of the function is
        `Y = tsne.tsne(X, no_dims, initial_dims, perplexity)`, where X is an
        NxD NumPy array.

        :param X: NxD array with one data point per row.
        :param no_dims: dimensionality of the embedding; must be an int.
        :param initial_dims: number of dimensions kept by the PCA
            preprocessing step.
        :param perplexity: perplexity used to compute the P-values.
        :param max_iter: number of gradient-descent iterations.
        :return: array of shape (N, no_dims) with the embedding, or -1 when
            no_dims is not an integer.
    """

    # Check inputs
    # A float no_dims is rejected even when it holds a whole number; the
    # message now names the argument that is actually being checked.
    if isinstance(no_dims, float) or round(no_dims) != no_dims:
        print("Error: number of dimensions should be an integer.")
        return -1

    # Initialize variables
    X = pca(X, initial_dims).real
    (n, _) = X.shape
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    Y = np.random.randn(n, no_dims)
    dY = np.zeros((n, no_dims))
    iY = np.zeros((n, no_dims))
    gains = np.ones((n, no_dims))

    # Compute P-values
    P = x2p(X, 1e-5, perplexity)
    P = P + np.transpose(P)
    P = P / np.sum(P)
    P = P * 4.									# early exaggeration
    P = np.maximum(P, 1e-12)

    # Run iterations
    for it in range(max_iter):

        # Compute pairwise affinities
        sum_Y = np.sum(np.square(Y), 1)
        num = -2. * np.dot(Y, Y.T)
        num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
        num[range(n), range(n)] = 0.
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Compute gradient
        PQ = P - Q
        for i in range(n):
            dY[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0)

        # Perform the update
        if it < 20:
            momentum = initial_momentum
        else:
            momentum = final_momentum
        # Grow the gain where the gradient sign differs from the previous
        # step's sign, shrink it where they agree.
        gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
                (gains * 0.8) * ((dY > 0.) == (iY > 0.))
        gains[gains < min_gain] = min_gain
        iY = momentum * iY - eta * (gains * dY)
        Y = Y + iY
        # Re-centre the embedding on the origin.
        Y = Y - np.tile(np.mean(Y, 0), (n, 1))

        # Compute current value of cost function
        if (it + 1) % 10 == 0:
            C = np.sum(P * np.log(P / Q))
            print("Iteration %d: error is %f" % (it + 1, C))

        # Stop lying about P-values
        if it == 100:
            P = P / 4.

    # Return solution
    return Y

if __name__ == "__main__":
    # Demo: embed the MNIST sample in 2-D and plot it coloured by label.
    # Expects mnist2500_X.txt and mnist2500_labels.txt in the working
    # directory.
    print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.")
    print("Running example on 2,500 MNIST digits...")
    X = np.loadtxt("mnist2500_X.txt")
    labels = np.loadtxt("mnist2500_labels.txt")
    Y = tsne(X, 2, 50, 20.0)
    pylab.scatter(Y[:, 0], Y[:, 1], 20, labels)
    # Without show() the scatter plot is built but never displayed.
    pylab.show()

0 comments on commit dfc1ff7

Please sign in to comment.