forked from zwChan/Wordembedding-and-semantics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus-stats.py
62 lines (53 loc) · 1.54 KB
/
corpus-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from __future__ import division,print_function
__author__ = 'Jason'
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import math
# Command-line handling: the first positional argument is the path of the
# statistics file to read; without it there is nothing to do.
if len(sys.argv) < 2:
    print("Usage: [stat-file] ", file=sys.stderr)
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the `site` module for interactive use and is not guaranteed to exist
    # (e.g. when the interpreter is started with -S or is embedded/frozen).
    sys.exit(1)
infile = sys.argv[1]
# Parse the statistics file. A well-formed line has exactly two
# whitespace-separated tokens: "<word> <integer frequency>".
freq2word = {}  # frequency -> set of words that occur with that frequency
freqList = []   # every strictly positive frequency, one entry per parsed line
err_cnt = 0     # number of lines skipped because they are malformed
with open(infile) as f:
    # Iterate over the handle directly so the file is streamed line by line
    # instead of being loaded into memory in one go.
    for line in f:
        tokens = line.split()
        if len(tokens) != 2:
            err_cnt += 1
            continue
        word = tokens[0].strip()
        try:
            freq = int(tokens[1])
        except ValueError:
            # A non-numeric count is just another kind of malformed line;
            # skip it like the wrong-token-count case instead of crashing.
            err_cnt += 1
            continue
        if freq > 0:
            freqList.append(freq)
        else:
            # Echo lines carrying a non-positive frequency.
            print(line)
        # Always record the word. Creating the set without adding the current
        # word would drop the first word seen for every frequency and make
        # each per-frequency count one too low.
        freq2word.setdefault(freq, set()).add(word)

# (frequency, number of distinct words with that frequency), highest
# frequency first.
freq_cnt = sorted([(k, len(v)) for k, v in freq2word.items()], reverse=True)
# Number of distinct words whose frequency is below 5.
print(sum(v for k, v in freq_cnt if k < 5))
# unigram
# Plot the smoothed unigram weight of every word against its rank and save
# the figure to disk before showing it.
freqList.sort(reverse=True)
fsum = sum(freqList)
# Relative frequency raised to the power 0.75. The values are not
# renormalised afterwards, so they do not sum to 1.
# NOTE(review): 0.75 is presumably the word2vec negative-sampling exponent;
# confirm with the author.
freqList = [(x / fsum) ** 0.75 for x in freqList]
# Name the figure `fig` so it does not rebind `f`, which the file-reading
# part of this script uses for the input file handle.
fig, ax1 = plt.subplots()
# With a single sequence argument the x values are the 0-based list indices,
# i.e. the rank of each word in the descending frequency order.
plt.plot(freqList, '-')
plt.xlabel("Ranking index of words")
plt.ylabel("Probability")
plt.ylim([0, 0.15])
# Only the rank axis is logarithmic; the y axis stays linear.
ax1.set_xscale('log')
savename = 'ugram.jpg'
fig.savefig(savename, bbox_inches='tight', dpi=200)
plt.show()
plt.close()
print("save image %s" % savename)