"""Tests for corpushash.hashers (test_hashers.py).

Importing this module downloads the pages listed in ``urls`` and hashes them
into a ``corpus_test`` directory, so it needs network access.
"""
import base64
import os
import random
import shutil
import urllib
import urllib.request

import hypothesis.strategies
from bs4 import BeautifulSoup
from hypothesis import given

from corpushash.hashers import *
# Output directory for the hashed corpus: <parent of the working directory>/corpus_test
pwd = os.getcwd()
base_path = os.path.dirname(pwd)
test_path = os.path.join(base_path, 'corpus_test')

# Remove whatever an earlier run left behind, then recreate the directory empty.
try:
    shutil.rmtree(test_path)
except FileNotFoundError:
    # the directory does not exist yet, so there is nothing to remove
    pass
os.mkdir(test_path)
# Pages downloaded by setup_data() to build the test corpus.
# The list was taken from http://www.humancomp.org/unichtm/unichtm.htm
urls = ["http://www.humancomp.org/unichtm/calblur8.htm",
        "http://www.humancomp.org/unichtm/unilang8.htm",
        "http://www.humancomp.org/unichtm/banviet8.htm",
        "http://www.humancomp.org/unichtm/croattx8.htm",
        "http://www.humancomp.org/unichtm/danish8.htm",
        "http://www.humancomp.org/unichtm/esperan8.htm",
        "http://www.humancomp.org/unichtm/jpndoc8.htm",
        "http://www.humancomp.org/unichtm/kordoc8.htm",
        "http://www.humancomp.org/unichtm/neural8.htm",
        "http://www.humancomp.org/unichtm/russmnv8.htm",
        "http://www.humancomp.org/unichtm/sample68.htm",
        "http://www.humancomp.org/unichtm/huseyin8.htm",
        "http://www.humancomp.org/unichtm/tongtws8.htm",
        "http://www.humancomp.org/unichtm/armenia8.htm",
        "http://www.humancomp.org/unichtm/maopoem8.htm",
        "http://www.humancomp.org/unichtm/linjilu8.htm",
        "http://www.humancomp.org/unichtm/ulysses8.htm",
        "http://www.humancomp.org/unichtm/zenbibl8.htm"]
def setup_data():
    """Download every page in ``urls`` and hash the resulting corpus.

    Each page is fetched, its ``<script>`` and ``<style>`` elements are
    removed, and the remaining text is split with ``text_split``. The split
    documents are then handed to ``CorpusHash`` together with ``test_path``,
    a random salt length between 1 and 33 and a randomly chosen ``one_salt``
    flag.

    Returns:
        The ``CorpusHash`` instance built from the downloaded documents.

    Raises:
        urllib.error.URLError: if a page cannot be fetched.
    """
    test_corpus = []
    for url in urls:
        # the context manager closes the HTTP response once the body is read
        with urllib.request.urlopen(url) as response:
            html = response.read()
        # name the parser explicitly so the extracted text does not depend on
        # which optional parser libraries happen to be installed
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style"]):
            tag.extract()  # drop non-text content before extracting the text
        text = soup.get_text()
        test_corpus.append(text_split(text))
    # randomise the salt settings so different runs cover different configurations
    salt_length = random.randint(1, 33)
    salt_choice = random.choice([True, False])
    return CorpusHash(test_corpus, test_path, salt_length=salt_length,
                      one_salt=salt_choice)
# Build the hashed corpus once, at import time; the corpus tests below share it.
encoded_corp = setup_data()
# setup_data() picks one_salt at random, so report which mode this run is using
print('one_salt is {}'.format(encoded_corp.one_salt))
@given(hypothesis.strategies.text())
def test_hash_token_works(s):
    """hash_token must return a truthy result for any generated text."""
    assert hash_token(s)
@given(hypothesis.strategies.text())
def test_hash_token_return_token(s):
    """The first value returned by hash_token is a 40-character str."""
    digest, _salt = hash_token(s)
    assert isinstance(digest, str)
    assert len(digest) == 40
@given(hypothesis.strategies.text())
def test_hash_token_return_salt(s):
    """The second value returned by hash_token is a bytes object."""
    _digest, returned_salt = hash_token(s)
    assert isinstance(returned_salt, bytes)
def test_dictionary_length():
    """The encode and decode dictionaries hold the same number of entries."""
    encode_size = len(encoded_corp.encode_dictionary)
    decode_size = len(encoded_corp.decode_dictionary)
    assert encode_size == decode_size
def test_encode_dictionary_types():
    """A random encode_dictionary entry maps a str token to a 40-character str."""
    encode_dictionary = encoded_corp.encode_dictionary
    sampled_token = random.choice(list(encode_dictionary.keys()))
    digest = encode_dictionary[sampled_token]
    assert isinstance(sampled_token, str)
    assert isinstance(digest, str)
    assert len(digest) == 40
def test_decode_dictionary_types():
    """A random decode_dictionary entry maps a 40-character str to (str, str)."""
    decode_dictionary = encoded_corp.decode_dictionary
    digest = random.choice(list(decode_dictionary.keys()))
    original_token, salt_text = decode_dictionary[digest]
    assert isinstance(digest, str)
    assert len(digest) == 40
    assert isinstance(original_token, str)
    assert isinstance(salt_text, str)
    # The salt length is deliberately not compared with encoded_corp.salt_length:
    # the stored salt is text that other tests pass through base64.b85decode,
    # so presumably its length differs from the raw salt length.
def test_encode_decode_dictionaries():
    """Round-trip ten randomly chosen tokens through both dictionaries.

    For each token: look up its hash in encode_dictionary, use that hash to
    get the token and salt back from decode_dictionary, then check that the
    token matches and that re-hashing it with the stored salt reproduces the
    same hash.
    """
    # the key list does not change between draws, so build it once
    tokens = list(encoded_corp.encode_dictionary.keys())
    for _ in range(10):
        enc_token = random.choice(tokens)
        enc_hashed_token = encoded_corp.encode_dictionary[enc_token]
        dec_token, salt = encoded_corp.decode_dictionary[enc_hashed_token]
        assert enc_token == dec_token
        # the salt is stored as base85 text; decode it before re-hashing
        hashed_token, _ = hash_token(enc_token, salt=base64.b85decode(salt))
        assert hashed_token == enc_hashed_token
def test_loading_dictionaries():
    """Hash 15 already-known tokens again and expect the same hashes and salts.

    A second CorpusHash is created on the same test_path from tokens sampled
    out of the first corpus' encode_dictionary. Every sampled token must get
    the hash and the salt recorded by the first corpus (presumably because
    the second instance loads the dictionaries already stored in test_path).
    """
    sampled_tokens = random.sample(list(encoded_corp.encode_dictionary.keys()), 15)
    reloaded_corp = CorpusHash([sampled_tokens], test_path)
    for token in sampled_tokens:
        reloaded_hash = reloaded_corp.encode_dictionary[token]
        original_token, original_salt_text = encoded_corp.decode_dictionary[reloaded_hash]
        # re-hash the token with the salt recorded by the first corpus
        original_salt = base64.b85decode(original_salt_text.encode())
        recomputed_hash, _ = hash_token(token, salt=original_salt)
        assert token == original_token
        assert recomputed_hash == reloaded_hash == encoded_corp.encode_dictionary[token]
        reloaded_salt_text = reloaded_corp.decode_dictionary[reloaded_hash][1]
        assert original_salt_text == reloaded_salt_text
def test_encoding_using_read_hashed_corpus():
    """Each hash read back from the hashed corpus equals _encode_token(token)."""
    document_pairs = zip(encoded_corp.read_hashed_corpus(), encoded_corp.corpus)
    for hashed_document, plain_document in document_pairs:
        # walk both nested documents in step so each hash meets its source token
        token_pairs = zip(walk_nested_list(hashed_document),
                          walk_nested_list(plain_document))
        for stored_hash, token in token_pairs:
            recomputed_hash = encoded_corp._encode_token(token)
            assert recomputed_hash == stored_hash
def test_decoding():
    """Every hash in the hashed corpus decodes back to its source token.

    The hashed corpus and the original corpus are walked in step. For each
    (hash, token) pair the decode_dictionary entry for the hash must hold
    that token, and hashing the token with the stored salt must reproduce
    the hash.
    """
    document_pairs = zip(encoded_corp.read_hashed_corpus(), encoded_corp.corpus)
    for encoded_document, decoded_document in document_pairs:
        token_pairs = zip(walk_nested_list(encoded_document),
                          walk_nested_list(decoded_document))
        for enc_hashed_token, token in token_pairs:
            dec_token, salt = encoded_corp.decode_dictionary[enc_hashed_token]
            # the decoded token used to be looked up and then ignored; check
            # it so the test actually exercises decoding
            assert dec_token == token
            # the salt is stored as base85 text; decode it before re-hashing
            assert hash_token(token, salt=base64.b85decode(salt))[0] == enc_hashed_token
def test_one_salt():
    """Check the stored salts against the corpus' one_salt setting.

    With one_salt True, every decode_dictionary entry must carry the same
    salt. With one_salt False, each entry's salt must differ from the salt
    of the entry visited just before it. Nothing is checked when one_salt is
    not a bool.
    """
    if not isinstance(encoded_corp.one_salt, bool):
        return

    decode_dictionary = encoded_corp.decode_dictionary
    if encoded_corp.one_salt:
        # a single shared salt: every entry must match a randomly chosen one
        reference_hash = random.choice(list(decode_dictionary.keys()))
        reference_salt = decode_dictionary[reference_hash][1]
        for hashed in decode_dictionary:
            assert reference_salt == decode_dictionary[hashed][1]
    else:
        # NOTE(review): only neighbouring entries (in dictionary iteration
        # order) are compared, so this does not prove every salt is unique.
        previous_salt = 0  # sentinel that no stored salt can equal
        for hashed in decode_dictionary:
            current_salt = decode_dictionary[hashed][1]
            assert previous_salt != current_salt
            previous_salt = current_salt
def test_corpus_size():
    """corpus_size equals the number of entries in the public_path directory."""
    entries_on_disk = os.listdir(encoded_corp.public_path)
    assert encoded_corp.corpus_size == len(entries_on_disk)