Skip to content

Commit

Permalink
add pinyin
Browse files Browse the repository at this point in the history
  • Loading branch information
isnowfy committed Dec 5, 2013
1 parent 97072a4 commit 2789f9e
Show file tree
Hide file tree
Showing 2 changed files with 56,083 additions and 2 deletions.
21 changes: 19 additions & 2 deletions snownlp/normal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@

import zh

# Resource files shipped in the same directory as this module.
stop_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'stopwords')
pinyin_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'pinyin.txt')

# Stop words: one entry per line of the stop-word file, whitespace-stripped.
stop = set()
# Pinyin table: the first whitespace-separated token of each line maps to
# the list of the remaining tokens on that line.
pinyin = {}

# Context managers guarantee both files are closed, even if a line fails
# to decode or parse part-way through.
with codecs.open(stop_path, 'r', 'utf-8') as fr:
    for word in fr:
        stop.add(word.strip())

with codecs.open(pinyin_path, 'r', 'utf-8') as fr:
    for word in fr:
        words = word.split()
        if not words:
            # A blank line has no key token; indexing words[0] would raise
            # IndexError, so skip it.
            continue
        pinyin[words[0]] = words[1:]


def filter_stop(words):
Expand All @@ -38,3 +45,13 @@ def get_sentences(doc):
continue
sentences.append(sent)
return sentences


def get_pinyin(word):
    """Return the pinyin syllables for ``word`` as a flat list of strings.

    ``word`` is first looked up as a whole in the module-level ``pinyin``
    table. If it has no entry of its own, each character is looked up
    individually and the readings found are concatenated in order;
    characters without an entry are skipped. An empty list is returned
    when nothing matches.
    """
    if word in pinyin:
        # Return a copy so callers cannot mutate the shared table entry.
        return list(pinyin[word])
    ret = []
    for w in word:
        if w in pinyin:
            # extend (not append) keeps the result a flat list of
            # syllables, the same shape as the whole-word branch above.
            ret.extend(pinyin[w])
    return ret
Loading

0 comments on commit 2789f9e

Please sign in to comment.