Skip to content

Commit

Permalink
add a withFlag param to extract_tags
Browse files Browse the repository at this point in the history
  • Loading branch information
jerryday committed Oct 30, 2015
1 parent b6f1ce7 commit 26e339f
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions jieba/analyse/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def set_idf_path(self, idf_path):
self.idf_loader.set_new_path(new_abs_path)
self.idf_freq, self.median_idf = self.idf_loader.get_idf()

def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()):
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
"""
Extract keywords from sentence using TF-IDF algorithm.
Parameter:
Expand All @@ -81,6 +81,9 @@ def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()):
if False, return a list of words.
- allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v', 'nr'].
if the POS of w is not in this list, it will be filtered out.
- withFlag: only takes effect when allowPOS is not empty.
if True, return a list of pair(word, flag) objects, like posseg.cut.
if False, return a list of words.
"""
if allowPOS:
allowPOS = frozenset(allowPOS)
Expand All @@ -92,14 +95,16 @@ def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=()):
if allowPOS:
if w.flag not in allowPOS:
continue
else:
elif not withFlag:
w = w.word
if len(w.strip()) < 2 or w.lower() in self.stop_words:
wc = w.word if allowPOS and withFlag else w
if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
continue
freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values())
for k in freq:
freq[k] *= self.idf_freq.get(k, self.median_idf) / total
kw = k.word if allowPOS and withFlag else k
freq[k] *= self.idf_freq.get(kw, self.median_idf) / total

if withWeight:
tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
Expand Down

0 comments on commit 26e339f

Please sign in to comment.