-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwordattractionrank.py
57 lines (49 loc) · 1.85 KB
/
wordattractionrank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# coding:utf-8
import os
from configparser import ConfigParser
import gensim
import networkx as nx
from util.edge_feature import calc_dice, euc_distance, calc_force, get_edge_freq
from util.evaluate import evaluate_pagerank
from util.graph import build_graph, dict2list
from util.node_feature import read_lda
from util.text_process import filter_text, read_file, rm_tags, stem2word
def wordattractionrank(name, dataset):
dataset = dataset.lower()
cfg = ConfigParser()
cfg.read(os.path.join('./config', dataset+'.ini'))
window = int(cfg.get('graph', 'window'))
damping = float(cfg.get('graph', 'damping'))
abstract_dir = cfg.get('dataset', 'abstract')
with_tag = cfg.getboolean('dataset', 'with_tag')
cfg.read('./config/global.ini')
vec_path = cfg.get('embedding', 'wiki_vec')
doc_path = os.path.join(abstract_dir, name)
text = read_file(doc_path)
stemdict = stem2word(text)
text_candidate = filter_text(text, with_tag=with_tag)
edge_freq = get_edge_freq(text_candidate, window=window)
wvmodel = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=False)
edge_weight = {}
for edge in edge_freq:
word1 = edge[0]
word2 = edge[1]
try:
distance = 1 - wvmodel.similarity(stemdict[word1], stemdict[word2])
except:
distance = 1
words = text_candidate.split()
tf1 = words.count(word1)
tf2 = words.count(word2)
cf = edge_freq[edge]
force = calc_force(tf1, tf2, distance)
dice = calc_dice(tf1, tf2, cf)
edge_weight[edge] = force * dice
edges = dict2list(edge_weight)
graph = build_graph(edges)
pr = nx.pagerank(graph, alpha=damping)
return pr, graph
if __name__ == "__main__":
datasetlist = ['kdd', 'sigir']
for d in datasetlist:
evaluate_pagerank(d, wordattractionrank)