Showing 4 changed files with 371 additions and 0 deletions.
AnomalyDet.py
@@ -0,0 +1,48 @@
import math

def EuclideanDistance(a, b):
    distance = math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    print("Distance: " + str(distance))
    return distance

def distanceBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    '''
    Detect outliers based on distance: a point is an outlier unless at least
    frac_threshold of the dataset lies within dist_threshold of it.
    :param dataset: sequence of points (each a sequence of coordinates)
    :param distfunc: pairwise distance function
    :param dist_threshold: neighborhood radius
    :param frac_threshold: minimum fraction of close neighbors for a non-outlier
    :return: indices of the outlier points
    '''
    n = len(dataset)
    outlier = []

    for i in range(n):
        db = False
        count = 0
        for j in range(n):
            # If j != i and dataset[j] is within dist_threshold of dataset[i],
            # count it; once at least frac_threshold of the dataset is that
            # close, dataset[i] is not an outlier.
            if j != i and distfunc(dataset[j], dataset[i]) < dist_threshold:
                count += 1
                if count >= frac_threshold * n:
                    db = True
                    break  # dataset[i] is not an outlier
        if db:
            continue
        outlier.append(i)
    return outlier

def gridBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    '''
    Grid-based variant of the distance-based detector (not yet implemented).
    :param dataset:
    :param distfunc:
    :param dist_threshold:
    :param frac_threshold:
    :return: indices of the outlier points
    '''
    raise NotImplementedError
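
# A possible cell-based implementation (an added sketch, not part of this
# commit): bucket points into grid cells of side dist_threshold. Any point
# within Euclidean distance dist_threshold of p differs from p by at most
# dist_threshold in every coordinate, so all of p's neighbors lie in p's
# cell or an adjacent cell, and only those candidates need exact checks.
def gridBasedOutlierDetectionSketch(dataset, distfunc, dist_threshold, frac_threshold):
    from collections import defaultdict
    from itertools import product
    cells = defaultdict(list)
    for idx, p in enumerate(dataset):
        cells[tuple(int(c // dist_threshold) for c in p)].append(idx)
    n = len(dataset)
    outlier = []
    for i, p in enumerate(dataset):
        key = tuple(int(c // dist_threshold) for c in p)
        count = 0
        db = False
        # Scan the 3^d neighborhood of p's cell for candidate neighbors.
        for offset in product((-1, 0, 1), repeat=len(key)):
            cell = tuple(k + o for k, o in zip(key, offset))
            for j in cells.get(cell, []):
                if j != i and distfunc(dataset[j], dataset[i]) < dist_threshold:
                    count += 1
                    if count >= frac_threshold * n:
                        db = True
                        break
            if db:
                break
        if not db:
            outlier.append(i)
    return outlier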

if __name__ == "__main__":
    # Each point is a one-element sequence so EuclideanDistance can zip pairs.
    distanceBasedOutlierDetection([[3], [5], [7], [8], [3435], [234]], EuclideanDistance, 50, 2)
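
A quick usage sketch (hypothetical data, not part of the commit): with frac_threshold given as a fraction, the detector flags only the point far from the cluster. (EuclideanDistance also prints each pairwise distance along the way.)

points = [[0, 0], [1, 1], [2, 0], [1, 2], [0, 1], [100, 100]]
print(distanceBasedOutlierDetection(points, EuclideanDistance, 10, 0.5))  # -> [5]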
@@ -0,0 +1,61 @@
import tsne
import numpy as np
import math
import matplotlib.pylab as pylab
import Word2VectorApp as w2v
import AnomalyDet as anomaly

def getWordAndVectors():
    # Build a dataset where each tuple is the word2vec vector of a word.
    # The file is assumed to be in gensim's text format: a header line,
    # then one "word v1 v2 ..." line per word.
    dataset = []
    dfn = "word2vec.txt"
    a = 0
    words = []
    with open(dfn, "r") as f:
        for l in f:
            a += 1
            if a == 1:  # skip the header line
                continue
            wv = l.split(" ")
            wv[-1] = wv[-1].replace("\n", "")
            words.append(wv[0])
            dataset.append([float(x) for x in wv[1:]])
    return dataset, words

def EuclideanDistance(a, b):
    distance = math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    # print("Distance: " + str(distance))
    return distance

def printoutliers(outlier, words):
    print("There are " + str(len(outlier)) + " outliers:")
    o = [words[i] for i in outlier]
    print(o)

def __main():
    # Build the dataset: each row is the word2vec vector of a word.
    dataset, words = w2v.app()
    X = np.array(dataset).astype(float)
    print(X)

    # Apply the anomaly detection technique over a few threshold settings.
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1, 0.2)
    print("outliers for 1.0:")
    printoutliers(outlier, words)

    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.1, 0.3)
    print("outliers for 1.1:")
    printoutliers(outlier, words)
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.2, 0.3)
    print("outliers for 1.2:")
    printoutliers(outlier, words)
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.3, 0.3)
    print("outliers for 1.3:")
    printoutliers(outlier, words)
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.5, 0.4)
    print("outliers for 1.5:")
    printoutliers(outlier, words)

    # Show the words on the t-SNE visualization.
    Y = tsne.tsne(X, 2, 50, 20.0)
    pylab.scatter(Y[:, 0], Y[:, 1])
    for i, word in enumerate(words):
        pylab.annotate(word, xy=(Y[i, 0], Y[i, 1]))
    pylab.show()

if __name__ == "__main__":
    __main()
Word2VectorApp.py
@@ -0,0 +1,72 @@
import gensim
from gensim.corpora import WikiCorpus
import os

'''
1- get a corpus
1.2- train a word2vec model to get word vectors
2- select some keywords about the same topic as the dataset
3- insert some anomaly words unrelated to the topic into the dataset
4- return the dataset: words and their vectors
'''
keywords = ["bilgisayar", "c++", "java", "sistematik", "insan-bilgisayar", "komputer", "computer", "programlama", "windows",
            "işlem", "hesaplama", "incelemek",
            "algoritmalar", "program", "yazılım", "ağı", "veritabanı", "sistemleri", "paralel", "dağıtık",
            "etkileşimi", "işletim", "sistemi", "teorik", "bilimi", "matematiksel", "kodlama", "teorisi", "veri", "yapıları",
            "assembly", "analizi", "işlediğimiz", "cihaz", "dizüstü", "masaüstü", "bayt", "ikili", "sayılar", "bit", "rastgele",
            "erişimli", "bellek"]
anomaly = ["tarih", "türk", "atsız", "kubilay", "cengiz"]
def getCorpus():
    # Parse the Turkish Wikipedia XML dump.
    wiki = WikiCorpus("trwiki-20181201-pages-articles-multistream.xml.bz2", lemmatize=False, dictionary={})
    corpus = [text for text in wiki.get_texts()]
    print("Finished: collected " + str(len(corpus)) + " articles")
    return corpus

def modeling(doc):
    model = gensim.models.Word2Vec(doc, size=150, window=10, min_count=2, workers=10)
    return model

def dataset(keyws, model):
    dataset = []
    for k in keyws:
        dataset.append(list(model.wv[k.lower()]))
    return dataset
def app(outfile="model.txt"):
    model = {}
    global anomaly
    if not os.path.isfile(outfile):
        print("model does not exist; it will be built")
        corpus = getCorpus()
        model = modeling(corpus)
        # trim unneeded model memory = use (much) less RAM
        model.init_sims(replace=True)
        model.save(outfile)
    else:
        print("model exists; loading it from file")
        model = gensim.models.Word2Vec.load(outfile)

    words = list(model.wv.vocab)
    lw = len(words)
    print(lw)
    # print("\n" + str(words[:100]))
    keyws = checkKeywords(words)
    for k in anomaly:
        if k in words:
            keyws.append(k)
        else:
            print("this anomaly word is not in the vocabulary: " + k)
    dset = dataset(keyws, model)
    return dset, keyws

def checkKeywords(words):
    global keywords
    kw = []
    for k in keywords:
        if k in words:
            kw.append(k)
        else:
            print("this keyword is not in the vocabulary: " + k)
    return kw

if __name__ == "__main__":
    print(app())
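
As a quick sanity check (a hypothetical snippet, not part of the commit), the saved model can be probed for nearest neighbors before running anomaly detection; topic keywords should neighbor each other, while the anomaly words should not.

import gensim
model = gensim.models.Word2Vec.load("model.txt")
print(model.wv.most_similar("bilgisayar", topn=5))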
tsne.py
@@ -0,0 +1,190 @@
#
# tsne.py
#
# Implementation of t-SNE in Python. The implementation was tested on Python
# 2.7.10, and it requires a working installation of NumPy. The implementation
# comes with an example on the MNIST dataset. In order to plot the
# results of this example, a working installation of matplotlib is required.
#
# The example can be run by executing: `ipython tsne.py`
#
#
# Created by Laurens van der Maaten on 20-12-08.
# Copyright (c) 2008 Tilburg University. All rights reserved.

import numpy as np
import matplotlib.pylab as pylab


def Hbeta(D=np.array([]), beta=1.0):
    """
    Compute the perplexity and the P-row for a specific value of the
    precision of a Gaussian distribution.
    """

    # Compute P-row and corresponding perplexity
    P = np.exp(-D.copy() * beta)
    sumP = sum(P)
    H = np.log(sumP) + beta * np.sum(D * P) / sumP
    P = P / sumP
    return H, P
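
# Reference note (added commentary, not in the original file): with
# P_j = exp(-beta * D_j) / sum_k exp(-beta * D_k), the H returned above is
# the Shannon entropy -sum_j P_j * log(P_j), rewritten as
# log(sum_k exp(-beta * D_k)) + beta * sum_j D_j * P_j, and the row's
# perplexity is exp(H); x2p below tunes beta until exp(H) matches the target.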


def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
    """
    Performs a binary search to get P-values in such a way that each
    conditional Gaussian has the same perplexity.
    """

    # Initialize some variables
    print("Computing pairwise distances...")
    (n, d) = X.shape
    sum_X = np.sum(np.square(X), 1)
    D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
    P = np.zeros((n, n))
    beta = np.ones((n, 1))
    logU = np.log(perplexity)

    # Loop over all datapoints
    for i in range(n):

        # Print progress
        if i % 500 == 0:
            print("Computing P-values for point %d of %d..." % (i, n))

        # Compute the Gaussian kernel and entropy for the current precision
        betamin = -np.inf
        betamax = np.inf
        Di = D[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))]
        (H, thisP) = Hbeta(Di, beta[i])

        # Evaluate whether the perplexity is within tolerance
        Hdiff = H - logU
        tries = 0
        while np.abs(Hdiff) > tol and tries < 50:

            # If not, increase or decrease precision
            if Hdiff > 0:
                betamin = beta[i].copy()
                if betamax == np.inf or betamax == -np.inf:
                    beta[i] = beta[i] * 2.
                else:
                    beta[i] = (beta[i] + betamax) / 2.
            else:
                betamax = beta[i].copy()
                if betamin == np.inf or betamin == -np.inf:
                    beta[i] = beta[i] / 2.
                else:
                    beta[i] = (beta[i] + betamin) / 2.

            # Recompute the values
            (H, thisP) = Hbeta(Di, beta[i])
            Hdiff = H - logU
            tries += 1

        # Set the final row of P
        P[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] = thisP

    # Return final P-matrix
    print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
    return P


def pca(X=np.array([]), no_dims=50):
    """
    Runs PCA on the NxD array X in order to reduce its dimensionality to
    no_dims dimensions.
    """

    print("Preprocessing the data using PCA...")
    (n, d) = X.shape
    X = X - np.tile(np.mean(X, 0), (n, 1))
    (l, M) = np.linalg.eig(np.dot(X.T, X))
    Y = np.dot(X, M[:, 0:no_dims])
    return Y


def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0):
    """
    Runs t-SNE on the dataset in the NxD array X to reduce its
    dimensionality to no_dims dimensions. The syntax of the function is
    `Y = tsne.tsne(X, no_dims, perplexity)`, where X is an NxD NumPy array.
    """

    # Check inputs
    if isinstance(no_dims, float):
        print("Error: number of dimensions should be an integer, not a float.")
        return -1
    if round(no_dims) != no_dims:
        print("Error: number of dimensions should be an integer.")
        return -1

    # Initialize variables
    X = pca(X, initial_dims).real
    (n, d) = X.shape
    max_iter = 1000
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    Y = np.random.randn(n, no_dims)
    dY = np.zeros((n, no_dims))
    iY = np.zeros((n, no_dims))
    gains = np.ones((n, no_dims))

    # Compute P-values
    P = x2p(X, 1e-5, perplexity)
    P = P + np.transpose(P)
    P = P / np.sum(P)
    P = P * 4.  # early exaggeration
    P = np.maximum(P, 1e-12)

    # Run iterations
    for iter in range(max_iter):

        # Compute pairwise affinities
        sum_Y = np.sum(np.square(Y), 1)
        num = -2. * np.dot(Y, Y.T)
        num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
        num[range(n), range(n)] = 0.
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Compute gradient
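        # (Added note) This is the KL-divergence gradient
        # dC/dy_i = 4 * sum_j (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^(-1) * (y_i - y_j);
        # num[:, i] holds the (1 + ||y_i - y_j||^2)^(-1) terms, and the
        # constant factor 4 is effectively absorbed into the learning rate eta.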
        PQ = P - Q
        for i in range(n):
            dY[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0)

        # Perform the update
        if iter < 20:
            momentum = initial_momentum
        else:
            momentum = final_momentum
        gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
                (gains * 0.8) * ((dY > 0.) == (iY > 0.))
        gains[gains < min_gain] = min_gain
        iY = momentum * iY - eta * (gains * dY)
        Y = Y + iY
        Y = Y - np.tile(np.mean(Y, 0), (n, 1))

        # Compute current value of cost function
        if (iter + 1) % 10 == 0:
            C = np.sum(P * np.log(P / Q))
            print("Iteration %d: error is %f" % (iter + 1, C))

        # Stop lying about P-values
        if iter == 100:
            P = P / 4.

    # Return solution
    return Y


if __name__ == "__main__":
    print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.")
    print("Running example on 2,500 MNIST digits...")
    X = np.loadtxt("mnist2500_X.txt")
    labels = np.loadtxt("mnist2500_labels.txt")
    Y = tsne(X, 2, 50, 20.0)
    pylab.scatter(Y[:, 0], Y[:, 1], 20, labels)
    pylab.show()