Commit

First version. Shingling, min hash, lsh.
cuonghn committed Jul 5, 2019
0 parents commit 82529bd
Showing 10 changed files with 23,343 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
__pycache__/
.vscode/
.ipynb_checkpoints/
154 changes: 154 additions & 0 deletions Local sensitive Hashing.ipynb
@@ -0,0 +1,154 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import hashlib"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"MAX_VAL = 1000000007"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"NBIT = 32\n",
"\n",
"def hashing(s):\n",
" # Encode to bytes\n",
" if type(s) == str:\n",
" s = s.encode('utf-8')\n",
" \n",
" # Hashing using md5\n",
" hashobj = hashlib.md5(s)\n",
" hexv = hashobj.hexdigest()\n",
" val = int(hexv, 16) % MAX_VAL\n",
" \n",
" return val\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"149481674"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hashing('sea shells by the sea shore')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"hashobj = hashlib.md5(str(149481674).encode('utf-8'))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# hash .obj = hashlib.md5(1232343299)\n",
"hexv = hashobj.hexdigest()\n",
"val = int(hexv, 16) % MAX_VAL"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SHingling "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Min hash "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lsh"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
12,395 changes: 12,395 additions & 0 deletions TechCrunch.csv

Large diffs are not rendered by default.

10,563 changes: 10,563 additions & 0 deletions VentureBeat.csv

Large diffs are not rendered by default.

Empty file added __init__.py
Empty file.
59 changes: 59 additions & 0 deletions corpus.py
@@ -0,0 +1,59 @@


class Corpus(object):
    """Represents a corpus of documents used in LSH.
    - docs: list of documents, each a collection of shingle tokens.
    - new_docs: documents with each shingle remapped to an index in 0 .. num_shingles - 1.
    - idx_docs: shingle index -> list of doc ids that contain it.
    """
def __init__(self, docs):
self._docs = docs
self.new_docs, self.idx_docs, self.value_to_idx = self.remap_index()

    def remap_index(self):
        """Remap each shingle token to an index in 0 .. num_shingles - 1."""
        print("Remapping tokens to indices 0 .. num_shingles - 1")
value_to_idx = {}
cnt = 0
new_docs = []
idx_docs = []

for doc_id, doc in enumerate(self._docs):
new_doc = []

for token in doc:
if token in value_to_idx:
# Map id
word_id = value_to_idx[token]
else:
# New token
word_id = cnt
value_to_idx[token] = word_id
idx_docs.append([])
cnt += 1

# Add to doc
new_doc.append(word_id)
# Add doc to list contain idx
idx_docs[word_id].append(doc_id)

new_docs.append(new_doc)

print("Number of shingles = {}".format(len(value_to_idx)))
return new_docs, idx_docs, value_to_idx

def metadata(self):
return self.new_docs, self.idx_docs, self.value_to_idx

def get_idx_docs(self):
return self.idx_docs

def get_num_docs(self):
return len(self._docs)

if __name__ == "__main__":
# Read docs

# Convert to corpus
pass
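
A minimal usage sketch of Corpus (the integer tokens below are made-up stand-ins for the 32-bit shingle hashes produced by shingling.py):

from corpus import Corpus

docs = [[101, 202, 303], [202, 303, 404], [505]]
corpus = Corpus(docs)
print(corpus.new_docs)   # [[0, 1, 2], [1, 2, 3], [4]]
print(corpus.idx_docs)   # [[0], [0, 1], [0, 1], [1], [2]]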

17 changes: 17 additions & 0 deletions local_sensitive_hashing.py
@@ -0,0 +1,17 @@

class LocalSensitiveHashing(object):
    def __init__(self, signatures, k_cool_prime=1000000007):
        """Locality-sensitive hashing over MinHash signatures.
        Args:
        - signatures: MinHash signatures, a k x ndocs matrix; each value in [0, cool_prime - 1].
        - k_cool_prime: a large prime (~1e9), the number of buckets to hash each band into.
          Select here: http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
        """
        self._signatures = signatures
        self._k_cool_prime = k_cool_prime

def tune_bucket(self):
pass

def hashing(self):
pass
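
Both methods above are still stubs. As a hedged sketch of what the banding step could look like (lsh_candidate_pairs is a hypothetical standalone helper, not part of this commit): split the k signature rows into bands, hash each band's column slice into buckets, and report documents sharing a bucket as candidate pairs.

from collections import defaultdict

def lsh_candidate_pairs(signatures, bands, rows_per_band):
    # signatures: k x ndocs matrix with k == bands * rows_per_band
    ndocs = len(signatures[0])
    candidates = set()
    for b in range(bands):
        start = b * rows_per_band
        buckets = defaultdict(list)
        for doc_id in range(ndocs):
            # The band's slice of this doc's signature column is the bucket key
            key = tuple(signatures[start + r][doc_id] for r in range(rows_per_band))
            buckets[key].append(doc_id)
        for ids in buckets.values():
            for i in range(len(ids)):
                for j in range(i + 1, len(ids)):
                    candidates.add((ids[i], ids[j]))
    return candidates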

57 changes: 57 additions & 0 deletions min_hashing.py
@@ -0,0 +1,57 @@
import numpy as np
from utils import choose_random_pseudo_perm_hasher, gen_random_int_hasher
from corpus import Corpus

MAX_INT = 2**31 - 1

class MinHasher(object):
    """MinHash a list of shingle sets."""

    def __init__(self, corpus, k=100):
        """
        - corpus: documents as a Corpus object.
        - k: number of "permutation" hash functions. Default 100.
        """
        self._k = k

        self._ndocs = corpus.get_num_docs()
        self.idx_docs = corpus.get_idx_docs()
        self._hashers = self.get_hashers()

    def get_hashers(self):
        print("Generating {} random bucket hashers".format(self._k))
        return gen_random_int_hasher(self._k)

    def pseudo_perm_hasher(self):
        """Pseudo-permutation hashing (MinHash).
        Return: k x ndocs signature matrix; each value is below the random
        modulus (~1e9 to 2e9) of the corresponding hasher.
        """
        # k hash functions stand in for k row permutations
        signatures = [[MAX_INT] * self._ndocs for _ in range(self._k)]

        # For each row (shingle index)
        for r in range(len(self.idx_docs)):

            # Hash this row with each of the k "permutation" functions
            hrs = []
            for i in range(self._k):
                hasher = self._hashers[i]
                hrs.append(hasher(r))

            # For each doc containing token id r, keep the per-hasher minimum
            for doc_id in self.idx_docs[r]:
                for hidx, hr in enumerate(hrs):
                    if hr < signatures[hidx][doc_id]:
                        signatures[hidx][doc_id] = hr

        return signatures

if __name__ == "__main__":
pass
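
A hedged end-to-end sketch (toy documents with made-up tokens): the fraction of rows where two signature columns agree estimates the Jaccard similarity of the underlying shingle sets.

from corpus import Corpus
from min_hashing import MinHasher

docs = [[1, 2, 3, 4], [2, 3, 4, 5], [9, 10]]
minhasher = MinHasher(Corpus(docs), k=200)
sigs = minhasher.pseudo_perm_hasher()  # k x ndocs

agree = sum(1 for row in sigs if row[0] == row[1])
print("Estimated Jaccard(doc0, doc1):", agree / len(sigs))  # true value: 3/5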




40 changes: 40 additions & 0 deletions shingling.py
@@ -0,0 +1,40 @@
import hashlib
from utils import hashing_md5_to_32bit, preprocessing_doc

class Shingles(object):
    """Shingle documents into sets of k-character shingles,
    then compress each shingle to a 4-byte token."""

    def __init__(self, k, hash_function=None):
        # shingle length, in characters
        self._k = k
        if hash_function is not None:
            self._hash_function = hash_function
        else:
            self._hash_function = hashing_md5_to_32bit


    def shingling(self, doc):
        doc = preprocessing_doc(doc)

        tokens = []

        # First shingle
        shingle = list(doc[:self._k])
        tokens.append(self._hash_function(''.join(shingle)))

        for i in range(self._k, len(doc)):
            # Slide the window to the next shingle
            shingle.pop(0)
            shingle.append(doc[i])

            # Hash the shingle to a 32-bit token
            tokens.append(self._hash_function(''.join(shingle)))

        return set(tokens)

if __name__ == "__main__":
shingler = Shingles(k=4)
print(shingler.shingling('Love the way you lie'))
print(shingler.shingling('Love the way I lie'))
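
Since shingling returns plain Python sets, the Jaccard similarity of the two demo sentences falls out directly; a small sketch reusing the Shingles class above:

shingler = Shingles(k=4)
a = shingler.shingling('Love the way you lie')
b = shingler.shingling('Love the way I lie')
print("Jaccard similarity:", len(a & b) / len(a | b))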

55 changes: 55 additions & 0 deletions utils.py
@@ -0,0 +1,55 @@
import hashlib
import numpy as np

NBIT = 32
MAX_VAL = 2 ** 32

def hashing_md5_to_32bit(s):
# Encode to bytes
    if isinstance(s, str):
s = s.encode('utf-8')

# Hashing using md5
hashobj = hashlib.md5(s)
hexv = hashobj.hexdigest()
val = int(hexv, 16) % MAX_VAL

return val

def preprocessing_doc(doc):
return doc.lower()

def choose_random_pseudo_perm_hasher(cool_prime, k):
    """Draw (a, b) coefficients for k pseudo-permutation hash functions
    of the form h(x) = (a*x + b) % cool_prime.
    cool_prime: a prime slightly larger than the total number of unique shingles.
    Select here: http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
    k: number of hash functions to draw.
    """
coefs = np.random.randint(0, cool_prime, size=(k, 2))
return coefs

def md5_hashing(s):
# Encode to bytes
    if isinstance(s, str):
s = s.encode('utf-8')

# Hashing using md5
hashobj = hashlib.md5(s)
hexv = hashobj.hexdigest()
val = int(hexv, 16)

return val

def random_hasher_integer(big_num):
    """Return a hasher mapping an integer n to md5(str(n)) % big_num."""
    def hashing(n):
n = str(n)
val = md5_hashing(n)
val = val % big_num
return val
return hashing

def gen_random_int_hasher(k):
"""k : number of hasher."""
big_nums = np.random.randint(int(1e9), int(2e9), k)
return [random_hasher_integer(big_num) for big_num in big_nums]
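
Note that choose_random_pseudo_perm_hasher only draws the (a, b) coefficients; a hedged sketch of how they would be applied as the classic universal hash h(x) = (a*x + b) mod cool_prime (make_perm_hashers is a hypothetical helper, not defined in this commit):

def make_perm_hashers(cool_prime, k):
    # Hypothetical helper: wrap each (a, b) pair from
    # choose_random_pseudo_perm_hasher into a callable h(x) = (a*x + b) % p
    coefs = choose_random_pseudo_perm_hasher(cool_prime, k)
    return [lambda x, a=int(a), b=int(b): (a * x + b) % cool_prime
            for a, b in coefs]

hashers = make_perm_hashers(1000000007, k=3)
print([h(42) for h in hashers])  # three pseudo-permuted row indices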
