-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First version. Shingling, min hash, lsh.
- Loading branch information
cuonghn
committed
Jul 5, 2019
0 parents
commit 82529bd
Showing
10 changed files
with
23,343 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
__pycache__/ | ||
.vscode/ | ||
.ipynb_checkpoints/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import hashlib" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"MAX_VAL = 1000000007" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"NBIT = 32\n", | ||
"\n", | ||
"def hashing(s):\n", | ||
" # Encode to bytes\n", | ||
" if type(s) == str:\n", | ||
" s = s.encode('utf-8')\n", | ||
" \n", | ||
" # Hashing using md5\n", | ||
" hashobj = hashlib.md5(s)\n", | ||
" hexv = hashobj.hexdigest()\n", | ||
" val = int(hexv, 16) % MAX_VAL\n", | ||
" \n", | ||
" return val\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"149481674" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"hashing('sea shells by the sea shore')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"hashobj = hashlib.md5(str(149481674).encode('utf-8'))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# hash .obj = hashlib.md5(1232343299)\n", | ||
"hexv = hashobj.hexdigest()\n", | ||
"val = int(hexv, 16) % MAX_VAL" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Shingling " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Min hash " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Lsh" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
|
||
|
||
class Corpus(object):
    """Collection of tokenised documents re-indexed with dense token ids.

    Each distinct token value found in ``docs`` is given an integer id,
    assigned in order of first appearance starting at 0.

    Attributes:
        new_docs: one list per input document, holding the id of every
            token in iteration order (repeats are kept).
        idx_docs: ``idx_docs[token_id]`` lists the index of the containing
            document once per occurrence of that token.
        value_to_idx: dict mapping the original token value to its id.
    """

    def __init__(self, docs):
        """Store ``docs`` (an iterable-of-iterables with ``len``) and index it."""
        self._docs = docs
        remapped = self.remap_index()
        self.new_docs, self.idx_docs, self.value_to_idx = remapped

    def remap_index(self):
        """Build the dense-id view of the stored documents.

        Returns:
            Tuple ``(new_docs, idx_docs, value_to_idx)`` as described on the
            class. Progress messages are printed to stdout.
        """
        print("Remap token to index from 0 -> len(token)")
        value_to_idx = {}
        idx_docs = []
        new_docs = []

        for doc_id, doc in enumerate(self._docs):
            remapped_doc = []

            for token in doc:
                word_id = value_to_idx.get(token)
                if word_id is None:
                    # First sighting: the next free id equals the number of
                    # tokens registered so far.
                    word_id = len(value_to_idx)
                    value_to_idx[token] = word_id
                    idx_docs.append([])

                remapped_doc.append(word_id)
                # Inverted index: remember which document holds this token.
                idx_docs[word_id].append(doc_id)

            new_docs.append(remapped_doc)

        print("Number of shingles = {}".format(len(value_to_idx)))
        return new_docs, idx_docs, value_to_idx

    def metadata(self):
        """Return ``(new_docs, idx_docs, value_to_idx)`` as a tuple."""
        return self.new_docs, self.idx_docs, self.value_to_idx

    def get_idx_docs(self):
        """Return the token-id -> document-indices inverted index."""
        return self.idx_docs

    def get_num_docs(self):
        """Return the number of documents given to the constructor."""
        return len(self._docs)
|
||
if __name__ == "__main__":
    # Placeholder entry point: running this module directly does nothing yet.
    # The two steps below are the author's outline, not implemented code.
    # Read docs

    # Convert to corpus
    pass
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
|
||
class LocalSensitiveHashing(object):
    """Skeleton for locality-sensitive hashing over min-hash signatures.

    Only the constructor does any work; ``tune_bucket`` and ``hashing`` are
    unimplemented placeholders.
    """

    def __init__(self, signatures, k_cool_prime=1000000007):
        """Keep the inputs needed for banding/bucketing.

        Args:
            signatures: min-hash signature matrix. Per the original author's
                note this is a (signature size) x (number of docs) matrix
                whose values lie in ``[0, cool_prime - 1]`` -- not validated
                here.
            k_cool_prime: a prime around 1e9 intended as the number of
                buckets each band is hashed into. A list of primes:
                http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
        """
        self._signatures = signatures
        # Previously accepted but dropped; keep it so the bucketing code
        # can use the caller's choice once it is written.
        self._k_cool_prime = k_cool_prime

    def tune_bucket(self):
        """Placeholder, not implemented yet; returns None."""
        pass

    def hashing(self):
        """Placeholder, not implemented yet; returns None."""
        pass
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import numpy as np | ||
from utils import choose_random_pseudo_perm_hasher, gen_random_int_hasher | ||
from corpus import Corpus | ||
|
||
# Initial "no hash seen yet" value for every signature cell.
MAX_INT = 2**31 - 1


class MinHasher(object):
    """Compute min-hash signatures for the documents of a corpus."""

    def __init__(self, corpus, k=100):
        """Capture the corpus index and create the hash functions.

        Args:
            corpus: object providing ``get_num_docs()`` and
                ``get_idx_docs()`` (e.g. a ``Corpus``); the latter maps each
                token id to the indices of the documents containing it.
            k: number of "permutation" hash functions. Default 100.
        """
        self._k = k

        self._ndocs = corpus.get_num_docs()
        self.idx_docs = corpus.get_idx_docs()
        # Fixed: the method is named get_hashers (get_hasher did not exist).
        self._hashers = self.get_hashers()

    def get_hashers(self):
        """Return ``k`` hash callables from ``gen_random_int_hasher``."""
        print("Generate {} random bucker hashers".format(self._k))
        return gen_random_int_hasher(self._k)

    def pseudo_perm_hasher(self):
        """Build the min-hash signature matrix.

        For every token id (row) each hash function is evaluated once, and
        the value is folded with ``min`` into the cell of every document
        that contains the token.

        Returns:
            A ``k`` x ``ndocs`` list of lists: ``signatures[h][d]`` is the
            smallest value of hasher ``h`` over the tokens of document ``d``.
            A document that appears in no row keeps ``MAX_INT``.
        """
        # One row per hash function, one column per document, so that the
        # signatures[hidx][doc_id] indexing below is in bounds.
        signatures = [[MAX_INT] * self._ndocs for _ in range(self._k)]

        for row, doc_ids in enumerate(self.idx_docs):
            # Hash the row id once per hasher, then reuse it for all docs.
            row_hashes = [hasher(row) for hasher in self._hashers]

            for doc_id in doc_ids:
                for hidx, value in enumerate(row_hashes):
                    if value < signatures[hidx][doc_id]:
                        signatures[hidx][doc_id] = value

        return signatures
|
||
if __name__ == "__main__":
    # Placeholder entry point: running this module directly does nothing yet.
    pass
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import hashlib | ||
from utils import hashing_md5_to_32bit, preprocessing_doc | ||
|
||
class Shingles(object):
    """Split documents into character k-shingles and hash each to a token.

    With the default hash (``hashing_md5_to_32bit``) every shingle is
    compressed to a 4-byte integer token.
    """

    def __init__(self, k, hash_function=None):
        """
        Args:
            k: shingle length in characters.
            hash_function: callable mapping a shingle string to a token;
                defaults to ``hashing_md5_to_32bit`` when None.
        """
        # shingles length
        self._k = k
        if hash_function is not None:
            self._hash_function = hash_function
        else:
            self._hash_function = hashing_md5_to_32bit

    def shingling(self, doc):
        """Return the set of hashed shingles of ``doc``.

        The document is first normalised with ``preprocessing_doc``. Every
        window of ``k`` consecutive characters is hashed with the configured
        hash function. A document shorter than ``k`` yields a single token
        for the whole (possibly empty) text.

        Args:
            doc: the document text.

        Returns:
            A set with one token per distinct shingle.
        """
        doc = preprocessing_doc(doc)
        k = self._k
        # Fixed: honour the hash function chosen in the constructor instead
        # of always calling hashing_md5_to_32bit.
        hash_function = self._hash_function

        # Window start positions 0 .. len(doc) - k; at least one window so a
        # short document still produces a token.
        last_start = max(len(doc) - k, 0)
        return {hash_function(doc[start:start + k])
                for start in range(last_start + 1)}
|
||
if __name__ == "__main__":
    # Quick manual demo: print the token sets of two near-identical lines.
    shingler = Shingles(k=4)
    for text in ('Love the way you lie', 'Love the way I lie'):
        print(shingler.shingling(text))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import hashlib | ||
import numpy as np | ||
|
||
NBIT = 32
# Tokens are reduced modulo this value, i.e. they fit in NBIT bits.
MAX_VAL = 2 ** NBIT


def hashing_md5_to_32bit(s):
    """Hash ``s`` with MD5 and reduce the digest to a 32-bit integer.

    Args:
        s: text (encoded as UTF-8 before hashing) or a bytes-like object.

    Returns:
        The MD5 digest read as an integer, modulo ``MAX_VAL`` (2**32), so
        the result lies in ``[0, 2**32 - 1]``.
    """
    # isinstance (not an exact type check) so str subclasses are encoded
    # too instead of being rejected by hashlib.
    if isinstance(s, str):
        s = s.encode('utf-8')

    hexv = hashlib.md5(s).hexdigest()
    return int(hexv, 16) % MAX_VAL
|
||
def preprocessing_doc(doc):
    """Normalise a document before shingling: return it lower-cased."""
    lowered = doc.lower()
    return lowered
|
||
def choose_random_pseudo_perm_hasher(cool_prime, k):
    """Draw random coefficient pairs for ``k`` pseudo-permutation hashers.

    Args:
        cool_prime: exclusive upper bound of the drawn integers. Per the
            original note, pick a prime slightly larger than the total
            number of shingle sets, e.g. from
            http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
        k: number of hashers, i.e. number of coefficient rows.

    Returns:
        Integer NumPy array of shape ``(k, 2)`` with values in
        ``[0, cool_prime)``. Note that coefficients, not callables, are
        returned.
    """
    shape = (k, 2)
    return np.random.randint(low=0, high=cool_prime, size=shape)
|
||
def md5_hashing(s):
    """Return the full 128-bit MD5 digest of ``s`` as an integer.

    Args:
        s: text (encoded as UTF-8 before hashing) or a bytes-like object.

    Returns:
        The MD5 hex digest interpreted as a base-16 integer.
    """
    # isinstance (not an exact type check) so str subclasses are encoded
    # too instead of being rejected by hashlib.
    if isinstance(s, str):
        s = s.encode('utf-8')

    hexv = hashlib.md5(s).hexdigest()
    return int(hexv, 16)
|
||
def random_hasher_integer(big_num):
    """Create a hash function that maps a value into ``[0, big_num)``.

    Args:
        big_num: modulus of the hash; any integer-like value, including a
            NumPy integer scalar.

    Returns:
        A callable ``hashing(n)`` returning
        ``md5_hashing(str(n)) % big_num`` as a builtin ``int``.
    """
    # The MD5 value is a 128-bit Python int. Taking it modulo a NumPy
    # scalar makes NumPy try to cast it to a fixed-width integer, which
    # overflows, so the modulus is turned into a builtin int once here.
    modulus = int(big_num)

    def hashing(n):
        return md5_hashing(str(n)) % modulus

    return hashing
|
||
def gen_random_int_hasher(k):
    """Create ``k`` random integer hashers.

    Args:
        k: number of hasher.

    Returns:
        A list of ``k`` callables built by ``random_hasher_integer``, each
        with its own modulus drawn uniformly from ``[1e9, 2e9)``.
    """
    big_nums = np.random.randint(int(1e9), int(2e9), k)
    # Hand over builtin ints rather than NumPy scalars: the hashers reduce
    # 128-bit Python ints, which must not be mixed with fixed-width NumPy
    # integers.
    return [random_hasher_integer(int(big_num)) for big_num in big_nums]
|