Commit: Add files via upload
Blimpyway authored Jan 26, 2022
1 parent 901399b commit 9a7d09f
Showing 5 changed files with 479 additions and 0 deletions.
103 changes: 103 additions & 0 deletions README.txt
@@ -0,0 +1,103 @@

Here is an example of a naive SDR associative memory using only python & numpy.

Files:
-----
fly_hash_encoder.py - an approximate implementation of a Fly Hash encoder.
It is used to convert (relatively) low sparsity MNIST digits to 2048 bit sparse distributed representations (SDRs).
Since memory performance degrades with the square of the number of ON bits, the SDRs are kept at a low 1-2% sparsity.


load_mnist_data.py, mnist_data.npz - a quick & dirty loader for MNIST digits stored with numpy savez

sdr_mem2d.py - the actual 2d associative memory; see how it works below

fh_am_test.py - the main program, assembling all of the above together.

Requirements:
-------------
Since the MNIST loader is included, this toy demo depends only on python3 and numpy


Running:
--------

The main demo program is fh_am_test.py.

$ python3 fh_am_test.py

It reads in MNIST digits, stores 20000 train digits in the associative memory,
then queries the same memory for all 10000 test digits, using the results to "vote" on each test digit's value.
It compares only the number of responses from the associative memory and outputs results without any
distance measurements between the actual query digits and the responses.

Every other .py program here runs its own if __name__ == "__main__" block for testing,
e.g. sdr_mem2d.py runs a performance test storing 100k random SDRs.

How an SDR is encoded and indexed
---------------------------------
SDR Size: the number of all (ON or OFF) bits in any SDR
SDR Length: the number of ON (==1) bits

We prefer SDRs of equal size and length, to exploit numpy array optimisations.
We store only the list of ON bits for any SDR

The index is built by expanding the ON bits into pairs,
e.g. if an SDR has the ON bits [15, 64, 429, 900] its pair expansion is:
[(15,64), (15,429), (15,900), (64,429), (64,900), (429,900)]
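
For illustration, a minimal sketch of this expansion in python (itertools.combinations
does exactly this; the actual sdr_mem2d.py may implement it differently, e.g. vectorised):

    from itertools import combinations

    on_bits = [15, 64, 429, 900]
    pairs = list(combinations(on_bits, 2))
    print(pairs)  # [(15, 64), (15, 429), (15, 900), (64, 429), (64, 900), (429, 900)]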

If the SDR Size is N, the number of possible bit pairs is N*(N-1)/2

Each pair above projects into a memory SLOT. Each SLOT stores a number of ID positions; this is the SLOT_size.
The default slot size is 64.
An ID is just a 32bit int, used by programmers to connect it with meaningful data (e.g. the original SDR).

Memory Size is the total number of SLOTs == how many possible bit pairs there are for SDRs of size N,
e.g. for N = 1024 the Memory Size is 1024*1023/2 =~ 512k SLOTs

Its actual size in bytes is found by multiplying the
Memory_Size x SLOT_size x int32_size

e.g. for N = 1024 and SLOT_size = 64 we get :

1024*1023/2 * 64 * 4 =~ 128 MBytes

If we double the SDR size we get a 4x increase in the memory needed to store the whole index.
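
A quick sanity check of the sizing arithmetic above (the names here are illustrative only):

    N, SLOT_SIZE, INT32_BYTES = 1024, 64, 4
    slots = N * (N - 1) // 2                          # 523776 SLOTs, =~ 512k
    size_mb = slots * SLOT_SIZE * INT32_BYTES / 2**20
    print(f"{slots} slots, {size_mb:.0f} MB")         # ~128 MB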

Projecting an SDR and its corresponding ID into the memory:

- build the list of SDR slots by expanding the SDR's ON bits into pairs.
- at each SLOT in the above list, the corresponding ID is written at one of the available positions.
- the position within a slot is chosen to be reproducible - the same ID is always written at the same position in the same slot.

Example:
When writing an SDR with 32 ON bits, its ID gets recorded in 32 * 31 / 2 = 496 SLOTs.

Which means that, even after many writes, there is a certain likelihood the original ID can be recovered from a significant number of slots when queried.
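
sdr_mem2d.py itself is not shown in this diff, so the following is only a plausible
sketch of the slot addressing, assuming an upper-triangular pair-to-slot mapping and a
modulo-based reproducible position (the real implementation may differ):

    N, SLOT_SIZE = 1024, 64

    def slot_index(i, j, n=N):
        # unique slot in 0 .. n*(n-1)/2 - 1 for the ON-bit pair (i, j), i < j
        return i * n - i * (i + 1) // 2 + (j - i - 1)

    def position_in_slot(sdr_id):
        # reproducible: the same id always lands at the same position of a slot
        return sdr_id % SLOT_SIZE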


How it is used
--------------

Sparse encoding: all SDRs here are represented as lists of the positions of their 1 bits in the 0..2047 bit SDR space,
e.g. an SDR with 30 ON bits is a numpy array of 30 int32 values.
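
For example, a hypothetical sparse-encoded SDR built with numpy:

    import numpy as np
    # a 2048-bit SDR with 30 ON bits, stored as the sorted ON-bit positions
    sdr = np.sort(np.random.choice(2048, size=30, replace=False)).astype(np.int32)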

The SDR associative memory here is actually a mapper between int32 "ids" and SDRs.
To store ids and sdrs, use an instance of SDRMap:

SDRMap.store(sdr_id_list, sdr_list)

sdr_id_list has to be a 1d numpy int32 array,
sdr_list has to be a matching 2d array, one sparse encoded sdr per row for every id in sdr_id_list
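
A usage sketch with random data (assuming the SDRMap constructor as used in fh_am_test.py):

    import numpy as np
    from sdr_mem2d import SDRMap

    smap = SDRMap(slot_size=64)
    ids = np.arange(100, dtype=np.int32)       # one int32 id per SDR
    # 100 random SDRs, 25 ON bits each, in a 2048-bit space, one per row
    sdrs = np.array([np.random.choice(2048, 25, replace=False) for _ in range(100)],
                    dtype=np.uint32)
    smap.store(ids, sdrs)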


When queried, the memory does not return actual SDRs but (what it considers to be) the best matching 32bit sdr_id-s
used during store(). It is the user's responsibility to encode/decode "meaning" onto these ids.

SDRMap.query(sdr_list, max_ids = 4)

For each sdr in sdr_list, returns a list of sdr_ids and their number of index hits. The list length does not exceed max_ids.


SDRMap.queryExtended(sdr_list, min_hits = 4)

For each sdr in sdr_list, returns a list of sdr_ids and their number of index hits, dropping ids with fewer than min_hits hits.
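
Continuing the store() sketch above (note that fh_am_test.py passes this limit as first=6,
so the keyword name here follows this README and may differ in the actual code):

    idlists, idcounts = smap.query(sdrs[:5], max_ids=4)
    for found_ids, hits in zip(idlists, idcounts):
        print(found_ids, hits)   # best matching stored ids and their index hit counts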
96 changes: 96 additions & 0 deletions fh_am_test.py
@@ -0,0 +1,96 @@

"""
Testing the fly_hash encoder and associative memory for MNIST retrieval accuracy.
We get >91% accuracy using the first 20k train digits.
This is just a poor replacement for k-nearest-neighbor search.
Accuracy losses may be caused by both FHEncoder and associative memory overlaps.
"""
from sdr_mem2d import SDRMap
from fly_hash_encoder import FHEncoder
from load_mnist_data import x_train, x_test, y_train, y_test, normalize

x_train = normalize(x_train) # normalize flattens every image and scales it so each image's sum() is 1.0
x_test = normalize(x_test)

import numpy as np
from time import time


smap = SDRMap(slot_size = 112)

# Create a FlyHash encoder. The hasher converts a list of MNIST images to a list of SDRs of size 2048
# x_train here is given to adjust "biases" that equalise each SDR bit's chance of being 1
hasher = FHEncoder(x_train[10000:20000])

istart,ilen = 0,20000 # which x_train digits will be stored in memory
iend = istart + ilen

print(f"hasher initialised, we use it to convert {ilen} x_train images to SDRs")
t = time()
sdrs = hasher.compute_sdrs(x_train[istart:iend],sdr_len = 25)
t = time() - t
print(f"{ilen} sdrs computed in {int(t*1000)}ms")
print(f"Training sdrs.shape:{sdrs.shape}, dtype:{sdrs.dtype}")


# This encodes both y_train values and the train index (position in x_train)
sids = y_train[istart:iend] + np.arange(istart,iend) * 100 + 10000000
# If the above looks weird, here's an example of what it does: if y_train[15632] == 7 then the ID becomes:
# 11563207
# _NNNNN_D - where '_' are ignored, 'D' is the digit value and NNNNN is the row number in the x_train array
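# A worked example of the scheme (hypothetical row 15632 with label 7):
#   sid = 7 + 15632 * 100 + 10000000  ->  11563207
#   sid % 100                         ->  7     (recovers the digit label)
#   (sid - 10000000) // 100           ->  15632 (recovers the row number)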

print(f"Begin storing {ilen} SDRs in associative memory (a.k.a sdr map)")
t = time()
smap.store(sids,sdrs)
t = time()-t
print(f"sdrs stored in {int(1000*t)}ms")


# the two blocks below encode & store the remaining x_train digits (all 60000 total); comment them out to store only the first 20000
istart,ilen = 20000,20000
iend = istart+ilen
sdrs = hasher.compute_sdrs(x_train[istart:iend],sdr_len = 40)
sids = y_train[istart:iend] + np.arange(istart,iend) * 100 + 10000000
smap.store(sids,sdrs)

istart,ilen = 40000,20000
iend = istart+ilen
sdrs = hasher.compute_sdrs(x_train[istart:iend],sdr_len = 40)
sids = y_train[istart:iend] + np.arange(istart,iend) * 100 + 10000000
smap.store(sids,sdrs)

# "training" done

# Testing

#xtest = x_train[:-10000]
#ytest = y_train[:-10000]
xtest = x_test
ytest = y_test
print("Begin querrying memory map with x_test")
t=time()
sdrs = hasher.compute_sdrs(xtest, sdr_len=25)
idlists, idcounts = smap.query(sdrs,first=6)
t = time()-t
print(f"Associative query {len(idlists)} done in {int(t*1000)}ms")

print(f"\nNext we retrieve predicted digit numbers from the responses")
t = time()
idresults = []

for idlist, idcount in zip(idlists, idcounts):
    results = np.zeros(10)
    for i in range(len(idlist)):
        pred = idlist[i] % 100  # restore digit value from id
        results[pred] += idcount[i]
    idresults.append(np.argmax(results))
idresults = np.array(idresults)
# print(f"idresults.sum() {idresults.sum()}, dtype={idresults.dtype}")

comps = idresults == ytest
t=time()-t

print(f" result len = {len(comps)}, positive = {comps.sum()/100}%, computed in {int(t*1000)}ms")
70 changes: 70 additions & 0 deletions fly_hash_encoder.py
@@ -0,0 +1,70 @@
import numpy as np

class FHEncoder():

    def __init__(self, x = None, fname = None, random_state = None, sdr_size=2048, pixels_per_encoder=16):
        if fname is None:
            self.new_encoders(x.shape[1], sdr_size, pixels_per_encoder, random_state)
            self.init_factors(x)
        else:
            self.load(fname)

    def compute_sdrs(self, x, sdr_len = 32):
        # score every random projection, normalised by its per-encoder factor
        scores = x[:, self.encoders].sum(axis=2) / self.factors
        # the sdr_len highest scoring projections become the ON bits
        sdrs = np.flip(np.argsort(scores), axis = -1)
        return sdrs[:, :sdr_len].astype(np.uint32)

    def new_encoders(self, input_size, sdr_size, pixels_per_encoder, random_state):
        # each encoder is a random subset of pixels_per_encoder input pixels
        encoders = []
        np.random.seed(random_state)
        for _ in range(sdr_size):
            encoders.append(np.random.permutation(input_size)[:pixels_per_encoder])
        self.encoders = np.array(encoders, dtype=np.uint32)

    def init_factors(self, x):
        # factor = mean score over the top 2% of inputs, used to equalise
        # each SDR bit's chance of becoming 1
        factors = []
        tops = x.shape[0] // 50
        for encoder in self.encoders:
            scores = x[:, encoder].sum(axis=1)
            order = np.argsort(scores)
            top2pcnt = order[-tops:]
            factors.append(scores[top2pcnt].mean())
        self.factors = np.array(factors).astype(np.float32)

    def load(self, fname):
        if fname.split('.')[-1] != "npz":
            fname += ".npz"
        data = np.load(fname)
        # the npz keys match the attribute names used by compute_sdrs()
        self.encoders = data["encoders"]
        self.factors = data["factors"]

    def save(self, fname):
        np.savez(fname, encoders = self.encoders, factors = self.factors)

if __name__ == "__main__":
    from load_mnist_data import x_train, normalize
    from time import time

    X = normalize(x_train)
    SDR_SIZE = 1024

    t = time()
    fhe = FHEncoder(X[:10000], sdr_size=SDR_SIZE)
    t = time() - t
    #fhe.load_from_dump("fpackteam1.txt")

    print(f"{fhe.encoders.shape[0]} encoders generated in {int(t*1000)}ms")
    print(f"encoders shape: {fhe.encoders.shape}")

    num = 10000

    t = time()
    sdrs = fhe.compute_sdrs(X[:num], sdr_len = 24)
    t = time() - t

    print(f"{num} sdrs of shape {sdrs.shape} computed in {int(t*1000)}ms")



24 changes: 24 additions & 0 deletions load_mnist_data.py
@@ -0,0 +1,24 @@
import numpy as np
import os

dname = os.path.dirname(__file__)
if len(dname) == 0:
    dname = '.'

_d = np.load(dname + "/mnist_data.npz")
x_test, y_test = _d['x_test'], _d['y_test']
x_train, y_train = _d['x_train'], _d['y_train']

def normalize(X):
    """
    Transforms X so the sum() of each digit is 1.0.
    X is flattened first, e.g. MNIST digits of shape (28,28) become shape (784,).
    """
    X = X.reshape(X.shape[0], -1)
    sums = X.sum(axis=1)
    return (X.T / sums).T.astype(np.float32)

if __name__ == "__main__":
    # np.savez("mnist_data", x_test = x_test, x_train = x_train, y_test = y_test, y_train = y_train)
    print(f"mnist data loaded in {list(_d.keys())}")
    print(normalize(x_test).sum())  # each image sums to 1.0, so this prints ~10000.0
BIN mnist_data.npz (binary file not shown)
