
Commit

first
merlingo committed Jan 7, 2019
1 parent a14d637 commit dfc1ff7
Showing 4 changed files with 371 additions and 0 deletions.
48 changes: 48 additions & 0 deletions AnomalyDet.py
@@ -0,0 +1,48 @@

import math

def EuclideanDistance(a, b):
    distance = math.sqrt(sum([(x - y) ** 2 for x, y in zip(a, b)]))
    # print("Distance: " + str(distance))
    return distance

def distanceBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    '''
    Distance-based (DB) outlier detection: a point is an outlier if fewer than
    frac_threshold * len(dataset) other points lie within dist_threshold of it.
    :param dataset: sequence of points, each a sequence of coordinates
    :param distfunc: distance function taking two points
    :param dist_threshold: neighbourhood radius
    :param frac_threshold: minimum fraction of points required in the neighbourhood
    :return: list of indices of outlier points
    '''
n=len(dataset)
outlier = []

for i in range(n):
db = False
count=0
for j in range(n):
dist = distfunc(dataset[j],dataset[i])
                if (j != i and dist < dist_threshold):  # count how many other points lie within dist_threshold of point i
                    count += 1
                    if (count >= frac_threshold * n):
                        db = True
                        break  # dataset[i] has enough close neighbours, so it is not an outlier
        if (db):
            continue  # dataset[i] is an inlier
        outlier.append(i)
return outlier

def gridBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    '''
    Grid-based outlier detection (stub, not yet implemented).
    :param dataset:
    :param distfunc:
    :param dist_threshold:
    :param frac_threshold:
    :return:
    '''
    pass  # TODO: implement

if __name__ == "__main__":
    # 1-D points given as 1-tuples so EuclideanDistance can iterate over coordinates;
    # frac_threshold is a fraction of the dataset size
    print(distanceBasedOutlierDetection([(3,), (5,), (7,), (8,), (3435,), (234,)],
                                        EuclideanDistance, 50, 0.3))
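For reference, a minimal usage sketch (not part of the committed file) of how distanceBasedOutlierDetection behaves on a toy 2-D dataset: the isolated point is flagged because fewer than frac_threshold * n points fall within dist_threshold of it.

from AnomalyDet import distanceBasedOutlierDetection, EuclideanDistance

# Illustrative data: a tight cluster plus one distant point.
points = [(0.0, 0.0), (0.1, 0.0), (0.0, 0.1), (0.1, 0.1), (5.0, 5.0)]
# Each cluster point has 3 neighbours within radius 1.0, and 3 >= 0.5 * 5,
# so only the distant point (index 4) is reported.
print(distanceBasedOutlierDetection(points, EuclideanDistance, 1.0, 0.5))  # [4]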
61 changes: 61 additions & 0 deletions Application.py
@@ -0,0 +1,61 @@
import tsne
import numpy as np
import matplotlib.pylab as pylab
import Word2VectorApp as w2v
import AnomalyDet as anomaly
def getWordAndVectors():
    # Build a dataset where each row is the word2vec vector of a word, read from a
    # plain-text word2vec file: a header line, then one "word v1 v2 ..." line per word.
    dataset = []
    dfn = "word2vec.txt"
    a = 0
    words = []
    with open(dfn, "r") as f:
        for l in f:
            a += 1
            if (a == 1):
                continue  # skip the "vocab_size dimensionality" header line
            wv = l.split(" ")
            wv[-1] = wv[-1].replace("\n", "")
            words.append(wv[0])
            dataset.append([float(x) for x in wv[1:]])
    return dataset, words
def EuclideanDistance(a,b):
import math
distance = math.sqrt(sum([(x-y)**2 for x,y in zip(a,b)]))
#print("Uzaklık: "+str(distance))
return distance
def printoutliers(outlier,words):
    print("There are " + str(len(outlier)) + " outliers:")
o = [words[i] for i in outlier]
print(o)

def __main():
#Create a dataset where each tuple will be word2vec vector of a word.
dataset, words = w2v.app()
    X = np.array(dataset).astype(float)
print(X)

    # Apply the distance-based anomaly detection with several threshold pairs
    for dist_t, frac_t in [(1, 0.2), (1.1, 0.3), (1.2, 0.3), (1.3, 0.3), (1.5, 0.4)]:
        outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, dist_t, frac_t)
        print("outliers for dist_threshold=" + str(dist_t) + ":")
        printoutliers(outlier, words)
    # Show the words on the t-SNE visualization
Y = tsne.tsne(X, 2, 50, 20.0)
pylab.scatter(Y[:, 0], Y[:, 1])
for i, word in enumerate(words):
pylab.annotate(word, xy=(Y[i, 0], Y[i, 1]))
pylab.show()
if __name__ == "__main__":
__main()
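getWordAndVectors above assumes the plain-text word2vec layout (the one gensim's save_word2vec_format produces): a "vocab_size dimensionality" header line, then one "word v1 v2 ..." line per word. A hedged sketch for generating such a file so the parser can be tested in isolation; the words and values are made up:

# Illustrative only: write a tiny word2vec.txt in the format the parser expects.
rows = {"kedi": [0.1, 0.2, 0.3], "kopek": [0.2, 0.1, 0.4]}
with open("word2vec.txt", "w") as f:
    f.write(str(len(rows)) + " 3\n")  # header: vocabulary size and dimensionality
    for word, vec in rows.items():
        f.write(word + " " + " ".join(str(v) for v in vec) + "\n")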
72 changes: 72 additions & 0 deletions Word2VectorApp.py
@@ -0,0 +1,72 @@
import gensim
from gensim.corpora import WikiCorpus
import os
import random
'''
1 - get a corpus
1.2 - train a word2vec model to obtain word vectors
2 - select some keywords on the same topic to form the dataset
3 - select some anomaly words unrelated to the topic and insert them into the dataset
4 - return the dataset: words and their vectors
'''
keywords = ["bilgisayar","c++","java","sistematik","insan-bilgisayar","komputer","computer","programlama","windows",
"işlem","hesaplama","incelemek",
"algoritmalar","program","yazılım","ağı","veritabanı","sistemleri","paralel","dağıtık",
"etkileşimi", "işletim","sistemi","teorik","bilimi","matematiksel","kodlama","teorisi","veri","yapıları",
"assembly","analizi","işlediğimiz","cihaz","dizüstü","masaüstü","bayt","ikili","sayılar","bit","rastgele",
"erişimli","bellek"]
anomaly=["tarih","türk","atsız","kubilay","cengiz"]
def getCorpus():
    # parse the bz2-compressed Turkish Wikipedia XML dump
    wiki = WikiCorpus("trwiki-20181201-pages-articles-multistream.xml.bz2", lemmatize=False, dictionary={})
    corpus = [text for text in wiki.get_texts()]
    print("Finished: saved " + str(len(corpus)) + " articles")
    return corpus
def modeling(doc):
model = gensim.models.Word2Vec(doc, size=150, window=10, min_count=2, workers=10)
return model
def dataset(keyws, model):
    dataset = []
    for k in keyws:
        dataset.append(list(model.wv[k.lower()]))
    return dataset
def app(outfile="model.txt"):
model = {}
global anomaly
if not(os.path.isfile(outfile) ):
print("model is not exist, it will be produced")
corpus = getCorpus()
model = modeling(corpus)
# trim unneeded model memory = use (much) less RAM
model.init_sims(replace=True)
model.save(outfile)
else:
print("model is exist, it will load from file")
model = gensim.models.Word2Vec.load(outfile)

words = list(model.wv.vocab)
lw =len(words)
print(lw)
#print("\n"+str(words[:100]))
keyws = checkKeywords(words)
for k in anomaly:
if (k in words):
keyws.append(k)
else:
print("bu anomaly yok:"+k)
dset = dataset(keyws,model)
return dset,keyws

def checkKeywords(words):
global keywords
kw= []
for k in keywords:
if(k in words):
kw.append(k)
else:
print("bu kelime yok:"+k)
return kw
if __name__=="__main__":
print(app())
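For a quick smoke test without the multi-gigabyte Wikipedia dump, a sketch that mirrors the modeling() and dataset() steps above, assuming the same gensim 3.x API and using a made-up toy corpus:

import gensim

# Toy corpus: each document is a list of lowercase tokens (repeated to satisfy min_count=2).
toy_corpus = [["bilgisayar", "program", "java"], ["tarih", "türk", "cengiz"]] * 50
toy_model = gensim.models.Word2Vec(toy_corpus, size=150, window=10, min_count=2, workers=2)
toy_dataset = [list(toy_model.wv[w]) for w in ["bilgisayar", "tarih"]]
print(len(toy_dataset), len(toy_dataset[0]))  # 2 rows of 150-dimensional vectors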
190 changes: 190 additions & 0 deletions tsne.py
@@ -0,0 +1,190 @@
#
# tsne.py
#
# Implementation of t-SNE in Python. The implementation was tested on Python
# 2.7.10, and it requires a working installation of NumPy. The implementation
# comes with an example on the MNIST dataset. In order to plot the
# results of this example, a working installation of matplotlib is required.
#
# The example can be run by executing: `ipython tsne.py`
#
#
# Created by Laurens van der Maaten on 20-12-08.
# Copyright (c) 2008 Tilburg University. All rights reserved.

import numpy as np
import matplotlib.pylab as pylab


def Hbeta(D=np.array([]), beta=1.0):
"""
Compute the perplexity and the P-row for a specific value of the
precision of a Gaussian distribution.
"""

# Compute P-row and corresponding perplexity
P = np.exp(-D.copy() * beta)
sumP = sum(P)
H = np.log(sumP) + beta * np.sum(D * P) / sumP
P = P / sumP
return H, P


def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
"""
Performs a binary search to get P-values in such a way that each
conditional Gaussian has the same perplexity.
"""

# Initialize some variables
print("Computing pairwise distances...")
(n, d) = X.shape
sum_X = np.sum(np.square(X), 1)
D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
P = np.zeros((n, n))
beta = np.ones((n, 1))
logU = np.log(perplexity)

# Loop over all datapoints
for i in range(n):

# Print progress
if i % 500 == 0:
print("Computing P-values for point %d of %d..." % (i, n))

# Compute the Gaussian kernel and entropy for the current precision
betamin = -np.inf
betamax = np.inf
Di = D[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))]
(H, thisP) = Hbeta(Di, beta[i])

# Evaluate whether the perplexity is within tolerance
Hdiff = H - logU
tries = 0
while np.abs(Hdiff) > tol and tries < 50:

# If not, increase or decrease precision
if Hdiff > 0:
betamin = beta[i].copy()
if betamax == np.inf or betamax == -np.inf:
beta[i] = beta[i] * 2.
else:
beta[i] = (beta[i] + betamax) / 2.
else:
betamax = beta[i].copy()
if betamin == np.inf or betamin == -np.inf:
beta[i] = beta[i] / 2.
else:
beta[i] = (beta[i] + betamin) / 2.

# Recompute the values
(H, thisP) = Hbeta(Di, beta[i])
Hdiff = H - logU
tries += 1

# Set the final row of P
P[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] = thisP

# Return final P-matrix
print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
return P


def pca(X=np.array([]), no_dims=50):
"""
Runs PCA on the NxD array X in order to reduce its dimensionality to
no_dims dimensions.
"""

print("Preprocessing the data using PCA...")
(n, d) = X.shape
X = X - np.tile(np.mean(X, 0), (n, 1))
(l, M) = np.linalg.eig(np.dot(X.T, X))
Y = np.dot(X, M[:, 0:no_dims])
return Y


def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0):
"""
Runs t-SNE on the dataset in the NxD array X to reduce its
    dimensionality to no_dims dimensions. The syntax of the function is
    `Y = tsne.tsne(X, no_dims, perplexity)`, where X is an NxD NumPy array.
"""

# Check inputs
if isinstance(no_dims, float):
print("Error: array X should have type float.")
return -1
if round(no_dims) != no_dims:
print("Error: number of dimensions should be an integer.")
return -1

# Initialize variables
X = pca(X, initial_dims).real
(n, d) = X.shape
max_iter = 1000
initial_momentum = 0.5
final_momentum = 0.8
eta = 500
min_gain = 0.01
Y = np.random.randn(n, no_dims)
dY = np.zeros((n, no_dims))
iY = np.zeros((n, no_dims))
gains = np.ones((n, no_dims))

# Compute P-values
P = x2p(X, 1e-5, perplexity)
P = P + np.transpose(P)
P = P / np.sum(P)
P = P * 4. # early exaggeration
P = np.maximum(P, 1e-12)

# Run iterations
for iter in range(max_iter):

# Compute pairwise affinities
sum_Y = np.sum(np.square(Y), 1)
num = -2. * np.dot(Y, Y.T)
num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
num[range(n), range(n)] = 0.
Q = num / np.sum(num)
Q = np.maximum(Q, 1e-12)

# Compute gradient
PQ = P - Q
for i in range(n):
dY[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0)

# Perform the update
if iter < 20:
momentum = initial_momentum
else:
momentum = final_momentum
gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
(gains * 0.8) * ((dY > 0.) == (iY > 0.))
gains[gains < min_gain] = min_gain
iY = momentum * iY - eta * (gains * dY)
Y = Y + iY
Y = Y - np.tile(np.mean(Y, 0), (n, 1))

# Compute current value of cost function
if (iter + 1) % 10 == 0:
C = np.sum(P * np.log(P / Q))
print("Iteration %d: error is %f" % (iter + 1, C))

# Stop lying about P-values
if iter == 100:
P = P / 4.

# Return solution
return Y


if __name__ == "__main__":
print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.")
print("Running example on 2,500 MNIST digits...")
X = np.loadtxt("mnist2500_X.txt")
labels = np.loadtxt("mnist2500_labels.txt")
Y = tsne(X, 2, 50, 20.0)
pylab.scatter(Y[:, 0], Y[:, 1], 20, labels)
pylab.show()
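A small sanity-check sketch on Hbeta (not part of the original file): with equal distances the conditional distribution is uniform over the n neighbours, so the entropy H is log(n) and the perplexity exp(H) equals the neighbour count.

D_test = np.ones(10)      # 10 equidistant neighbours
H, P = Hbeta(D_test, 1.0)
print(np.exp(H))          # ~10.0: perplexity equals the effective neighbour count
print(P)                  # uniform distribution, each entry ~0.1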
