Skip to content

Commit

Permalink
Add new features
Browse files Browse the repository at this point in the history
  • Loading branch information
ShawnyXiao committed Dec 8, 2017
1 parent e63c84a commit f9d7b68
Showing 1 changed file with 27 additions and 0 deletions.
27 changes: 27 additions & 0 deletions src/train_w2v_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# coding=utf-8
from collections import defaultdict

import pandas as pd
import param
import util
from gensim.models import Word2Vec

############################ Load data ############################
# Read at most `param.train_num` rows of the prepared corpus CSV.
# `reset_index()` keeps the old index as an extra 'index' column.
df_all = pd.read_csv(
    param.data_path + '/output/corpus/all_data.csv',
    encoding='utf8',
    nrows=param.train_num,
).reset_index()
# Shift the 'penalty' column down by one.
# NOTE(review): presumably converts 1-based labels to 0-based -- confirm.
df_all['penalty'] = df_all['penalty'] - 1

############################ w2v ############################
documents = df_all['content'].values
util.log('documents number %d' % len(documents))

# Tokenize on single spaces.
# NOTE(review): assumes 'content' holds pre-segmented, space-separated
# text with no missing values -- a NaN cell would fail on `.split`.
texts = [document.split(' ') for document in documents]

# Count how often every token occurs across the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Drop rare tokens (fewer than 5 occurrences) before training.
texts = [[token for token in text if frequency[token] >= 5] for text in texts]

util.log('Train Model...')
# NOTE(review): `size` and `iter` are the keyword names used by gensim < 4.0;
# newer releases call them `vector_size` and `epochs` -- confirm the pinned
# gensim version before upgrading.
w2v = Word2Vec(texts, size=300, window=5, iter=15, workers=12, seed=param.seed)
w2v.save(param.data_path + '/output/model/w2v_12w.model')
util.log('Save done!')

0 comments on commit f9d7b68

Please sign in to comment.