-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnblearn.py
108 lines (81 loc) · 3.45 KB
/
nblearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
import os, sys, glob
from collections import defaultdict
from collections import Counter
class Learn(object):
    """Naive Bayes spam/ham learner.

    Walks a training-data tree (``self.fname``), tokenizes every ``*.txt``
    file found under ``spam``/``ham`` subdirectories, computes add-one
    (Laplace) smoothed per-token probabilities plus class priors, and
    persists the model as a ``str()``-repr'd dict in ``nbmodel.txt``.
    """
    def __init__(self):
        self.fname = ""                # root directory of the training data
        self.vocabList = []            # distinct tokens across both classes
        self.spamTrainingCount = 0     # contains total spam training data
        self.hamTrainingCount = 0      # contains total ham training data
        self.totalTrainingCount = 0
        self.spamWordCount = {}        # token -> occurrence count in spam
        self.hamWordCount = {}         # token -> occurrence count in ham
        self.spamFiles = []
        self.hamFiles = []
        self.spamWordList = []         # all spam tokens, with repeats
        self.hamWordList = []          # all ham tokens, with repeats
        self.proDict = {'word' : {},
                        'spam' : 0,
                        'ham': 0
                        }
    def getData(self):
        """
        Parse the training data under ``self.fname`` into token lists.

        Fix: the original read the *accumulated* file lists inside the
        ``os.walk`` loop, so a tree with more than one spam (or ham)
        directory re-read earlier files and double-counted their words.
        File paths are now gathered during a single walk and each file
        is read exactly once afterwards.
        """
        for root, dirnames, filenames in os.walk(self.fname):
            if "spam" in dirnames:
                spamdir = os.path.join(root, "spam")
                self.spamFiles.extend(os.path.join(spamdir, x)
                                      for x in os.listdir(spamdir) if x.endswith(".txt"))
            if "ham" in dirnames:
                hamdir = os.path.join(root, "ham")
                self.hamFiles.extend(os.path.join(hamdir, x)
                                     for x in os.listdir(hamdir) if x.endswith(".txt"))
        # latin1 maps every byte to a code point, so reads never fail on
        # arbitrary mail bodies (matches the original encoding choice).
        for sFile in self.spamFiles:
            with open(sFile, "r", encoding="latin1") as f:
                self.spamWordList.extend(f.read().split())
        for hFile in self.hamFiles:
            with open(hFile, "r", encoding="latin1") as f:
                self.hamWordList.extend(f.read().split())
        # Vocabulary = distinct tokens over both classes (order not significant).
        self.vocabList = list(set(self.spamWordList) | set(self.hamWordList))
        return
    def find_token_probability(self):
        """Compute smoothed token probabilities and class priors, then write
        the model to ``nbmodel.txt``.

        ``proDict['word'][w]`` is the pair ``(P(w|spam), P(w|ham))`` with
        add-one smoothing; ``proDict['spam']``/``['ham']`` are the priors.
        Exits with status 1 when no training files were found (avoids a
        ZeroDivisionError) or when the model file cannot be written.
        """
        self.spamWordCount = Counter(self.spamWordList)
        self.hamWordCount = Counter(self.hamWordList)
        self.spamTrainingCount = len(self.spamFiles)
        self.hamTrainingCount = len(self.hamFiles)
        self.totalTrainingCount = self.spamTrainingCount + self.hamTrainingCount
        if self.totalTrainingCount == 0:
            # Guard: the prior computation below divides by this count.
            print("No training files found under", self.fname)
            sys.exit(1)
        self.proDict = {'word': {},
                        'spam': 0,
                        'ham': 0
                        }
        distinctWordLen = len(self.vocabList)
        # Smoothing denominators are loop-invariant; hoist them.
        spamDenom = len(self.spamWordList) + distinctWordLen
        hamDenom = len(self.hamWordList) + distinctWordLen
        for word in self.vocabList:
            self.proDict['word'][word] = ((self.spamWordCount[word] + 1) / spamDenom,
                                          (self.hamWordCount[word] + 1) / hamDenom)
        self.proDict['spam'] = self.spamTrainingCount / self.totalTrainingCount
        self.proDict['ham'] = self.hamTrainingCount / self.totalTrainingCount
        try:
            with open('nbmodel.txt', 'w') as f:
                f.write(str(self.proDict))
        except OSError as e:
            # Was a bare except (and a "FIL IO" typo), which hid the cause.
            print("something went wrong with file IO:", e)
            sys.exit(1)
        return
if __name__ == "__main__":
    # CLI entry point: one required argument, the training-data root.
    if len(sys.argv) != 2:
        print("Usage: python3 nblearn.py /path/to/input")
        sys.exit(1)  # sys.exit, not the site-module exit() builtin
    dataPath = sys.argv[1]
    learn_obj = Learn()
    learn_obj.fname = dataPath
    learn_obj.getData()                  # tokenize all spam/ham training mail
    learn_obj.find_token_probability()   # compute + persist the NB model
    sys.exit(0)