Commit

First version. Shingling, min hash, lsh.
cuonghn committed Jul 5, 2019
0 parents commit 82529bd
Showing 10 changed files with 23,343 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
__pycache__/
.vscode/
.ipynb_checkpoints/
154 changes: 154 additions & 0 deletions Local sensitive Hashing.ipynb
@@ -0,0 +1,154 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import hashlib"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"MAX_VAL = 1000000007"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"NBIT = 32\n",
"\n",
"def hashing(s):\n",
" # Encode to bytes\n",
" if type(s) == str:\n",
" s = s.encode('utf-8')\n",
" \n",
" # Hashing using md5\n",
" hashobj = hashlib.md5(s)\n",
" hexv = hashobj.hexdigest()\n",
" val = int(hexv, 16) % MAX_VAL\n",
" \n",
" return val\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"149481674"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hashing('sea shells by the sea shore')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"hashobj = hashlib.md5(str(149481674).encode('utf-8'))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# hash .obj = hashlib.md5(1232343299)\n",
"hexv = hashobj.hexdigest()\n",
"val = int(hexv, 16) % MAX_VAL"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SHingling "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Min hash "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lsh"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
12,395 changes: 12,395 additions & 0 deletions TechCrunch.csv

Large diffs are not rendered by default.

10,563 changes: 10,563 additions & 0 deletions VentureBeat.csv

Large diffs are not rendered by default.

Empty file added __init__.py
Empty file.
59 changes: 59 additions & 0 deletions corpus.py
@@ -0,0 +1,59 @@


class Corpus(object):
    """Represents a corpus of documents used in LSH.
    - docs: list of documents, each a collection of shingle tokens.
    - new_docs: documents with each shingle remapped to an index in 0 .. num_shingles - 1.
    - idx_docs: shingle index -> list of doc ids that contain it.
    """
def __init__(self, docs):
self._docs = docs
self.new_docs, self.idx_docs, self.value_to_idx = self.remap_index()

    def remap_index(self):
        """Remap each shingle token to an index in 0 .. num_shingles - 1."""
        print("Remapping tokens to indices 0 .. num_shingles - 1")
value_to_idx = {}
cnt = 0
new_docs = []
idx_docs = []

for doc_id, doc in enumerate(self._docs):
new_doc = []

for token in doc:
if token in value_to_idx:
# Map id
word_id = value_to_idx[token]
else:
# New token
word_id = cnt
value_to_idx[token] = word_id
idx_docs.append([])
cnt += 1

# Add to doc
new_doc.append(word_id)
# Add doc to list contain idx
idx_docs[word_id].append(doc_id)

new_docs.append(new_doc)

print("Number of shingles = {}".format(len(value_to_idx)))
return new_docs, idx_docs, value_to_idx

def metadata(self):
return self.new_docs, self.idx_docs, self.value_to_idx

def get_idx_docs(self):
return self.idx_docs

def get_num_docs(self):
return len(self._docs)

if __name__ == "__main__":
# Read docs

# Convert to corpus
pass
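
A minimal usage sketch of Corpus (the integer tokens below are made-up stand-ins for the 32-bit shingle hashes produced by shingling.py):

from corpus import Corpus

docs = [[101, 202, 303], [202, 303, 404], [505]]
corpus = Corpus(docs)
print(corpus.new_docs)   # [[0, 1, 2], [1, 2, 3], [4]]
print(corpus.idx_docs)   # [[0], [0, 1], [0, 1], [1], [2]]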

17 changes: 17 additions & 0 deletions local_sensitive_hashing.py
@@ -0,0 +1,17 @@

class LocalSensitiveHashing(object):
    def __init__(self, signatures, k_cool_prime=1000000007):
        """Locality-sensitive hashing over MinHash signatures.
        Args:
        - signatures: MinHash signatures, a k x ndocs matrix; each value in [0, cool_prime - 1].
        - k_cool_prime: a large prime (~1e9), the number of buckets to hash each band into.
          Select here: http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
        """
        self._signatures = signatures
        self._k_cool_prime = k_cool_prime

def tune_bucket(self):
pass

def hashing(self):
pass
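
Both methods above are still stubs. As a hedged sketch of what the banding step could look like (lsh_candidate_pairs is a hypothetical standalone helper, not part of this commit): split the k signature rows into bands, hash each band's column slice into buckets, and report documents sharing a bucket as candidate pairs.

from collections import defaultdict

def lsh_candidate_pairs(signatures, bands, rows_per_band):
    # signatures: k x ndocs matrix with k == bands * rows_per_band
    ndocs = len(signatures[0])
    candidates = set()
    for b in range(bands):
        start = b * rows_per_band
        buckets = defaultdict(list)
        for doc_id in range(ndocs):
            # The band's slice of this doc's signature column is the bucket key
            key = tuple(signatures[start + r][doc_id] for r in range(rows_per_band))
            buckets[key].append(doc_id)
        for ids in buckets.values():
            for i in range(len(ids)):
                for j in range(i + 1, len(ids)):
                    candidates.add((ids[i], ids[j]))
    return candidates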

57 changes: 57 additions & 0 deletions min_hashing.py
@@ -0,0 +1,57 @@
import numpy as np
from utils import choose_random_pseudo_perm_hasher, gen_random_int_hasher
from corpus import Corpus

MAX_INT = 2**31 - 1

class MinHasher(object):
    """MinHash a list of shingle sets."""

    def __init__(self, corpus, k=100):
        """
        - corpus: documents as a Corpus object.
        - k: number of "permutation" hash functions. Default 100.
        """
        self._k = k

        self._ndocs = corpus.get_num_docs()
        self.idx_docs = corpus.get_idx_docs()
        self._hashers = self.get_hashers()

    def get_hashers(self):
        print("Generating {} random bucket hashers".format(self._k))
        return gen_random_int_hasher(self._k)

    def pseudo_perm_hasher(self):
        """Pseudo-permutation hashing (MinHash).
        Return: k x ndocs signature matrix; each value is below the random
        modulus (~1e9 to 2e9) of the corresponding hasher.
        """
        # k hash functions stand in for k row permutations
        signatures = [[MAX_INT] * self._ndocs for _ in range(self._k)]

        # For each row (shingle index)
        for r in range(len(self.idx_docs)):

            # Hash this row with each of the k "permutation" functions
            hrs = []
            for i in range(self._k):
                hasher = self._hashers[i]
                hrs.append(hasher(r))

            # For each doc containing token id r, keep the per-hasher minimum
            for doc_id in self.idx_docs[r]:
                for hidx, hr in enumerate(hrs):
                    if hr < signatures[hidx][doc_id]:
                        signatures[hidx][doc_id] = hr

        return signatures

if __name__ == "__main__":
pass
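
A hedged end-to-end sketch (toy documents with made-up tokens): the fraction of rows where two signature columns agree estimates the Jaccard similarity of the underlying shingle sets.

from corpus import Corpus
from min_hashing import MinHasher

docs = [[1, 2, 3, 4], [2, 3, 4, 5], [9, 10]]
minhasher = MinHasher(Corpus(docs), k=200)
sigs = minhasher.pseudo_perm_hasher()  # k x ndocs

agree = sum(1 for row in sigs if row[0] == row[1])
print("Estimated Jaccard(doc0, doc1):", agree / len(sigs))  # true value: 3/5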




40 changes: 40 additions & 0 deletions shingling.py
@@ -0,0 +1,40 @@
import hashlib
from utils import hashing_md5_to_32bit, preprocessing_doc

class Shingles(object):
    """Shingle documents into sets of k-character shingles,
    then compress each shingle to a 4-byte token."""

    def __init__(self, k, hash_function=None):
        # shingle length, in characters
        self._k = k
        if hash_function is not None:
            self._hash_function = hash_function
        else:
            self._hash_function = hashing_md5_to_32bit


    def shingling(self, doc):
        doc = preprocessing_doc(doc)

        tokens = []

        # First shingle
        shingle = list(doc[:self._k])
        tokens.append(self._hash_function(''.join(shingle)))

        for i in range(self._k, len(doc)):
            # Slide the window to the next shingle
            shingle.pop(0)
            shingle.append(doc[i])

            # Hash the shingle to a 32-bit token
            tokens.append(self._hash_function(''.join(shingle)))

        return set(tokens)

if __name__ == "__main__":
shingler = Shingles(k=4)
print(shingler.shingling('Love the way you lie'))
print(shingler.shingling('Love the way I lie'))
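
Since shingling returns plain Python sets, the Jaccard similarity of the two demo sentences falls out directly; a small sketch reusing the Shingles class above:

shingler = Shingles(k=4)
a = shingler.shingling('Love the way you lie')
b = shingler.shingling('Love the way I lie')
print("Jaccard similarity:", len(a & b) / len(a | b))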

55 changes: 55 additions & 0 deletions utils.py
@@ -0,0 +1,55 @@
import hashlib
import numpy as np

NBIT = 32
MAX_VAL = 2 ** 32

def hashing_md5_to_32bit(s):
# Encode to bytes
    if isinstance(s, str):
s = s.encode('utf-8')

# Hashing using md5
hashobj = hashlib.md5(s)
hexv = hashobj.hexdigest()
val = int(hexv, 16) % MAX_VAL

return val

def preprocessing_doc(doc):
return doc.lower()

def choose_random_pseudo_perm_hasher(cool_prime, k):
    """Draw (a, b) coefficients for k pseudo-permutation hash functions
    of the form h(x) = (a*x + b) % cool_prime.
    cool_prime: a prime slightly larger than the total number of unique shingles.
    Select here: http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
    k: number of hash functions to draw.
    """
coefs = np.random.randint(0, cool_prime, size=(k, 2))
return coefs

def md5_hashing(s):
# Encode to bytes
    if isinstance(s, str):
s = s.encode('utf-8')

# Hashing using md5
hashobj = hashlib.md5(s)
hexv = hashobj.hexdigest()
val = int(hexv, 16)

return val

def random_hasher_integer(big_num):
    """Return a hasher mapping an integer n to md5(str(n)) % big_num."""
    def hashing(n):
n = str(n)
val = md5_hashing(n)
val = val % big_num
return val
return hashing

def gen_random_int_hasher(k):
"""k : number of hasher."""
big_nums = np.random.randint(int(1e9), int(2e9), k)
return [random_hasher_integer(big_num) for big_num in big_nums]
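
Note that choose_random_pseudo_perm_hasher only draws the (a, b) coefficients; a hedged sketch of how they would be applied as the classic universal hash h(x) = (a*x + b) mod cool_prime (make_perm_hashers is a hypothetical helper, not defined in this commit):

def make_perm_hashers(cool_prime, k):
    # Hypothetical helper: wrap each (a, b) pair from
    # choose_random_pseudo_perm_hasher into a callable h(x) = (a*x + b) % p
    coefs = choose_random_pseudo_perm_hasher(cool_prime, k)
    return [lambda x, a=int(a), b=int(b): (a * x + b) % cool_prime
            for a, b in coefs]

hashers = make_perm_hashers(1000000007, k=3)
print([h(42) for h in hashers])  # three pseudo-permuted row indices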
