Showing 4 changed files with 371 additions and 0 deletions.
AnomalyDet.py
@@ -0,0 +1,48 @@
import math

def EuclideanDistance(a, b):
    distance = math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    print("Distance: " + str(distance))
    return distance

def distanceBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    '''
    Detect outliers based on distance: a point is an outlier unless at least
    frac_threshold of the dataset lies within dist_threshold of it.
    :param dataset: sequence of points (each a sequence of coordinates)
    :param distfunc: pairwise distance function
    :param dist_threshold: neighborhood radius
    :param frac_threshold: minimum fraction of close neighbors for a non-outlier
    :return: indices of the outlier points
    '''
    n = len(dataset)
    outlier = []

    for i in range(n):
        db = False
        count = 0
        for j in range(n):
            # If j != i and dataset[j] is within dist_threshold of dataset[i],
            # count it; once at least frac_threshold of the dataset is that
            # close, dataset[i] is not an outlier.
            if j != i and distfunc(dataset[j], dataset[i]) < dist_threshold:
                count += 1
                if count >= frac_threshold * n:
                    db = True
                    break  # dataset[i] is not an outlier
        if db:
            continue
        outlier.append(i)
    return outlier

def gridBasedOutlierDetection(dataset, distfunc, dist_threshold, frac_threshold):
    '''
    Grid-based variant of the distance-based detector (not yet implemented).
    :param dataset:
    :param distfunc:
    :param dist_threshold:
    :param frac_threshold:
    :return: indices of the outlier points
    '''
    raise NotImplementedError
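
# A possible cell-based implementation (an added sketch, not part of this
# commit): bucket points into grid cells of side dist_threshold. Any point
# within Euclidean distance dist_threshold of p differs from p by at most
# dist_threshold in every coordinate, so all of p's neighbors lie in p's
# cell or an adjacent cell, and only those candidates need exact checks.
def gridBasedOutlierDetectionSketch(dataset, distfunc, dist_threshold, frac_threshold):
    from collections import defaultdict
    from itertools import product
    cells = defaultdict(list)
    for idx, p in enumerate(dataset):
        cells[tuple(int(c // dist_threshold) for c in p)].append(idx)
    n = len(dataset)
    outlier = []
    for i, p in enumerate(dataset):
        key = tuple(int(c // dist_threshold) for c in p)
        count = 0
        db = False
        # Scan the 3^d neighborhood of p's cell for candidate neighbors.
        for offset in product((-1, 0, 1), repeat=len(key)):
            cell = tuple(k + o for k, o in zip(key, offset))
            for j in cells.get(cell, []):
                if j != i and distfunc(dataset[j], dataset[i]) < dist_threshold:
                    count += 1
                    if count >= frac_threshold * n:
                        db = True
                        break
            if db:
                break
        if not db:
            outlier.append(i)
    return outlier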

if __name__ == "__main__":
    # Each point is a one-element sequence so EuclideanDistance can zip pairs.
    distanceBasedOutlierDetection([[3], [5], [7], [8], [3435], [234]], EuclideanDistance, 50, 2)
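
A quick usage sketch (hypothetical data, not part of the commit): with frac_threshold given as a fraction, the detector flags only the point far from the cluster. (EuclideanDistance also prints each pairwise distance along the way.)

points = [[0, 0], [1, 1], [2, 0], [1, 2], [0, 1], [100, 100]]
print(distanceBasedOutlierDetection(points, EuclideanDistance, 10, 0.5))  # -> [5]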
@@ -0,0 +1,61 @@
import tsne
import numpy as np
import math
import matplotlib.pylab as pylab
import Word2VectorApp as w2v
import AnomalyDet as anomaly

def getWordAndVectors():
    # Build a dataset where each tuple is the word2vec vector of a word.
    # The file is assumed to be in gensim's text format: a header line,
    # then one "word v1 v2 ..." line per word.
    dataset = []
    dfn = "word2vec.txt"
    a = 0
    words = []
    with open(dfn, "r") as f:
        for l in f:
            a += 1
            if a == 1:  # skip the header line
                continue
            wv = l.split(" ")
            wv[-1] = wv[-1].replace("\n", "")
            words.append(wv[0])
            dataset.append([float(x) for x in wv[1:]])
    return dataset, words

def EuclideanDistance(a, b):
    distance = math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
    # print("Distance: " + str(distance))
    return distance

def printoutliers(outlier, words):
    print("There are " + str(len(outlier)) + " outliers:")
    o = [words[i] for i in outlier]
    print(o)

def __main():
    # Build the dataset: each row is the word2vec vector of a word.
    dataset, words = w2v.app()
    X = np.array(dataset).astype(float)
    print(X)

    # Apply the anomaly detection technique over a few threshold settings.
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1, 0.2)
    print("outliers for 1.0:")
    printoutliers(outlier, words)

    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.1, 0.3)
    print("outliers for 1.1:")
    printoutliers(outlier, words)
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.2, 0.3)
    print("outliers for 1.2:")
    printoutliers(outlier, words)
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.3, 0.3)
    print("outliers for 1.3:")
    printoutliers(outlier, words)
    outlier = anomaly.distanceBasedOutlierDetection(X, EuclideanDistance, 1.5, 0.4)
    print("outliers for 1.5:")
    printoutliers(outlier, words)

    # Show the words on the t-SNE visualization.
    Y = tsne.tsne(X, 2, 50, 20.0)
    pylab.scatter(Y[:, 0], Y[:, 1])
    for i, word in enumerate(words):
        pylab.annotate(word, xy=(Y[i, 0], Y[i, 1]))
    pylab.show()

if __name__ == "__main__":
    __main()
Word2VectorApp.py
@@ -0,0 +1,72 @@
import gensim
from gensim.corpora import WikiCorpus
import os

'''
1- get a corpus
1.2- train a word2vec model to get word vectors
2- select some keywords about the same topic as the dataset
3- insert some anomaly words unrelated to the topic into the dataset
4- return the dataset: words and their vectors
'''
keywords = ["bilgisayar", "c++", "java", "sistematik", "insan-bilgisayar", "komputer", "computer", "programlama", "windows",
            "işlem", "hesaplama", "incelemek",
            "algoritmalar", "program", "yazılım", "ağı", "veritabanı", "sistemleri", "paralel", "dağıtık",
            "etkileşimi", "işletim", "sistemi", "teorik", "bilimi", "matematiksel", "kodlama", "teorisi", "veri", "yapıları",
            "assembly", "analizi", "işlediğimiz", "cihaz", "dizüstü", "masaüstü", "bayt", "ikili", "sayılar", "bit", "rastgele",
            "erişimli", "bellek"]
anomaly = ["tarih", "türk", "atsız", "kubilay", "cengiz"]
def getCorpus():
    # Parse the Turkish Wikipedia XML dump.
    wiki = WikiCorpus("trwiki-20181201-pages-articles-multistream.xml.bz2", lemmatize=False, dictionary={})
    corpus = [text for text in wiki.get_texts()]
    print("Finished: collected " + str(len(corpus)) + " articles")
    return corpus

def modeling(doc):
    model = gensim.models.Word2Vec(doc, size=150, window=10, min_count=2, workers=10)
    return model

def dataset(keyws, model):
    dataset = []
    for k in keyws:
        dataset.append(list(model.wv[k.lower()]))
    return dataset
def app(outfile="model.txt"):
    model = {}
    global anomaly
    if not os.path.isfile(outfile):
        print("model does not exist; it will be built")
        corpus = getCorpus()
        model = modeling(corpus)
        # trim unneeded model memory = use (much) less RAM
        model.init_sims(replace=True)
        model.save(outfile)
    else:
        print("model exists; loading it from file")
        model = gensim.models.Word2Vec.load(outfile)

    words = list(model.wv.vocab)
    lw = len(words)
    print(lw)
    # print("\n" + str(words[:100]))
    keyws = checkKeywords(words)
    for k in anomaly:
        if k in words:
            keyws.append(k)
        else:
            print("this anomaly word is not in the vocabulary: " + k)
    dset = dataset(keyws, model)
    return dset, keyws

def checkKeywords(words):
    global keywords
    kw = []
    for k in keywords:
        if k in words:
            kw.append(k)
        else:
            print("this keyword is not in the vocabulary: " + k)
    return kw

if __name__ == "__main__":
    print(app())
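
As a quick sanity check (a hypothetical snippet, not part of the commit), the saved model can be probed for nearest neighbors before running anomaly detection; topic keywords should neighbor each other, while the anomaly words should not.

import gensim
model = gensim.models.Word2Vec.load("model.txt")
print(model.wv.most_similar("bilgisayar", topn=5))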
tsne.py
@@ -0,0 +1,190 @@
#
# tsne.py
#
# Implementation of t-SNE in Python. The implementation was tested on Python
# 2.7.10, and it requires a working installation of NumPy. The implementation
# comes with an example on the MNIST dataset. In order to plot the
# results of this example, a working installation of matplotlib is required.
#
# The example can be run by executing: `ipython tsne.py`
#
#
# Created by Laurens van der Maaten on 20-12-08.
# Copyright (c) 2008 Tilburg University. All rights reserved.

import numpy as np
import matplotlib.pylab as pylab


def Hbeta(D=np.array([]), beta=1.0):
    """
    Compute the perplexity and the P-row for a specific value of the
    precision of a Gaussian distribution.
    """

    # Compute P-row and corresponding perplexity
    P = np.exp(-D.copy() * beta)
    sumP = sum(P)
    H = np.log(sumP) + beta * np.sum(D * P) / sumP
    P = P / sumP
    return H, P
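
# Reference note (added commentary, not in the original file): with
# P_j = exp(-beta * D_j) / sum_k exp(-beta * D_k), the H returned above is
# the Shannon entropy -sum_j P_j * log(P_j), rewritten as
# log(sum_k exp(-beta * D_k)) + beta * sum_j D_j * P_j, and the row's
# perplexity is exp(H); x2p below tunes beta until exp(H) matches the target.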


def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
    """
    Performs a binary search to get P-values in such a way that each
    conditional Gaussian has the same perplexity.
    """

    # Initialize some variables
    print("Computing pairwise distances...")
    (n, d) = X.shape
    sum_X = np.sum(np.square(X), 1)
    D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
    P = np.zeros((n, n))
    beta = np.ones((n, 1))
    logU = np.log(perplexity)

    # Loop over all datapoints
    for i in range(n):

        # Print progress
        if i % 500 == 0:
            print("Computing P-values for point %d of %d..." % (i, n))

        # Compute the Gaussian kernel and entropy for the current precision
        betamin = -np.inf
        betamax = np.inf
        Di = D[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))]
        (H, thisP) = Hbeta(Di, beta[i])

        # Evaluate whether the perplexity is within tolerance
        Hdiff = H - logU
        tries = 0
        while np.abs(Hdiff) > tol and tries < 50:

            # If not, increase or decrease precision
            if Hdiff > 0:
                betamin = beta[i].copy()
                if betamax == np.inf or betamax == -np.inf:
                    beta[i] = beta[i] * 2.
                else:
                    beta[i] = (beta[i] + betamax) / 2.
            else:
                betamax = beta[i].copy()
                if betamin == np.inf or betamin == -np.inf:
                    beta[i] = beta[i] / 2.
                else:
                    beta[i] = (beta[i] + betamin) / 2.

            # Recompute the values
            (H, thisP) = Hbeta(Di, beta[i])
            Hdiff = H - logU
            tries += 1

        # Set the final row of P
        P[i, np.concatenate((np.r_[0:i], np.r_[i+1:n]))] = thisP

    # Return final P-matrix
    print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
    return P


def pca(X=np.array([]), no_dims=50):
    """
    Runs PCA on the NxD array X in order to reduce its dimensionality to
    no_dims dimensions.
    """

    print("Preprocessing the data using PCA...")
    (n, d) = X.shape
    X = X - np.tile(np.mean(X, 0), (n, 1))
    (l, M) = np.linalg.eig(np.dot(X.T, X))
    Y = np.dot(X, M[:, 0:no_dims])
    return Y


def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0):
    """
    Runs t-SNE on the dataset in the NxD array X to reduce its
    dimensionality to no_dims dimensions. The syntax of the function is
    `Y = tsne.tsne(X, no_dims, perplexity)`, where X is an NxD NumPy array.
    """

    # Check inputs
    if isinstance(no_dims, float):
        print("Error: number of dimensions should be an integer, not a float.")
        return -1
    if round(no_dims) != no_dims:
        print("Error: number of dimensions should be an integer.")
        return -1

    # Initialize variables
    X = pca(X, initial_dims).real
    (n, d) = X.shape
    max_iter = 1000
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    Y = np.random.randn(n, no_dims)
    dY = np.zeros((n, no_dims))
    iY = np.zeros((n, no_dims))
    gains = np.ones((n, no_dims))

    # Compute P-values
    P = x2p(X, 1e-5, perplexity)
    P = P + np.transpose(P)
    P = P / np.sum(P)
    P = P * 4.  # early exaggeration
    P = np.maximum(P, 1e-12)

    # Run iterations
    for iter in range(max_iter):

        # Compute pairwise affinities
        sum_Y = np.sum(np.square(Y), 1)
        num = -2. * np.dot(Y, Y.T)
        num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
        num[range(n), range(n)] = 0.
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Compute gradient
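        # (Added note) This is the KL-divergence gradient
        # dC/dy_i = 4 * sum_j (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^(-1) * (y_i - y_j);
        # num[:, i] holds the (1 + ||y_i - y_j||^2)^(-1) terms, and the
        # constant factor 4 is effectively absorbed into the learning rate eta.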
        PQ = P - Q
        for i in range(n):
            dY[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y), 0)

        # Perform the update
        if iter < 20:
            momentum = initial_momentum
        else:
            momentum = final_momentum
        gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
                (gains * 0.8) * ((dY > 0.) == (iY > 0.))
        gains[gains < min_gain] = min_gain
        iY = momentum * iY - eta * (gains * dY)
        Y = Y + iY
        Y = Y - np.tile(np.mean(Y, 0), (n, 1))

        # Compute current value of cost function
        if (iter + 1) % 10 == 0:
            C = np.sum(P * np.log(P / Q))
            print("Iteration %d: error is %f" % (iter + 1, C))

        # Stop lying about P-values
        if iter == 100:
            P = P / 4.

    # Return solution
    return Y


if __name__ == "__main__":
    print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.")
    print("Running example on 2,500 MNIST digits...")
    X = np.loadtxt("mnist2500_X.txt")
    labels = np.loadtxt("mnist2500_labels.txt")
    Y = tsne(X, 2, 50, 20.0)
    pylab.scatter(Y[:, 0], Y[:, 1], 20, labels)
    pylab.show()