Skip to content

Commit

Permalink
Merge branch 'master' of github.com:warmheartli/ChatBotCourse
Browse files Browse the repository at this point in the history
  • Loading branch information
lcdevelop committed Oct 10, 2016
2 parents 5a51fd0 + 39468b7 commit 3ecc840
Show file tree
Hide file tree
Showing 11 changed files with 99 additions and 3 deletions.
3 changes: 2 additions & 1 deletion word2vec/compute-accuracy.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <malloc.h>
//#include <malloc.h>
#include <stdlib.h>
#include <ctype.h>

const long long max_size = 2000; // max length of strings
Expand Down
Empty file modified word2vec/demo-analogy.sh
100644 → 100755
Empty file.
Empty file modified word2vec/demo-classes.sh
100644 → 100755
Empty file.
Empty file modified word2vec/demo-phrase-accuracy.sh
100644 → 100755
Empty file.
Empty file modified word2vec/demo-phrases.sh
100644 → 100755
Empty file.
Empty file modified word2vec/demo-word-accuracy.sh
100644 → 100755
Empty file.
Empty file modified word2vec/demo-word.sh
100644 → 100755
Empty file.
3 changes: 2 additions & 1 deletion word2vec/distance.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <malloc.h>
//#include <malloc.h>
#include <stdlib.h>

const long long max_size = 2000; // max length of strings
const long long N = 40; // number of closest words that will be shown
Expand Down
3 changes: 2 additions & 1 deletion word2vec/word-analogy.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <malloc.h>
//#include <malloc.h>
#include <stdlib.h>

const long long max_size = 2000; // max length of strings
const long long N = 40; // number of closest words that will be shown
Expand Down
31 changes: 31 additions & 0 deletions word_segment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

import jieba
from jieba import analyse

def segment(input, output):
    """Tokenise a text file with jieba and write the tokens to another file.

    Every input line is stripped of surrounding whitespace and cut with
    ``jieba.cut``. Each resulting token is written to the output preceded by
    a single space. No newline is ever written, so the output is one
    continuous, space-separated stream of tokens.

    Args:
        input: Path of the text file to read.
        output: Path of the file to write; it is truncated if it exists.
    """
    # The context manager closes both files even when reading or
    # segmentation raises, so no handle is leaked on error.
    with open(input, "r") as input_file, open(output, "w") as output_file:
        for line in input_file:
            tokens = jieba.cut(line.strip())
            # Leading space before every token keeps the last token of one
            # line separated from the first token of the next.
            output_file.write("".join(" " + token for token in tokens))

if __name__ == '__main__':
if 3 != len(sys.argv):
print "Usage: ", sys.argv[0], "input output"
sys.exit(-1)
segment(sys.argv[1], sys.argv[2]);
62 changes: 62 additions & 0 deletions word_vectors_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# coding:utf-8

import sys
import struct
import math
import numpy as np

reload(sys)
sys.setdefaultencoding( "utf-8" )

# Maximum word length.
# NOTE(review): appears to mirror `max_w` from the word2vec C tools - confirm.
max_w = 50
# Size in bytes of one float component as stored in the binary vectors file.
float_size = 4

def load_vectors(input):
    """Load word vectors from a binary vectors file into a dict.

    The file is read as: one header line holding the word count and the
    vector dimension separated by whitespace, then for every word the bytes
    of the word terminated by a single space, followed by ``size``
    native-order 4-byte floats. Whitespace around a word (such as a newline
    left over from the previous record) is stripped.

    Progress messages and the parsed header values are printed to stdout.

    Args:
        input: Path of the binary vectors file.

    Returns:
        dict mapping each word (decoded from UTF-8) to a float64 numpy array
        of shape ``(size,)``.

    Raises:
        ValueError, IndexError: If the header line is malformed.
        struct.error: If the file ends before all declared vectors are read.
    """
    print("begin load vectors")

    word_vector = {}

    # `with` guarantees the file is closed even if parsing fails midway.
    with open(input, "rb") as input_file:
        # Header: number of words in the vocabulary and vector dimension.
        header = input_file.readline().split()
        words = int(header[0])
        size = int(header[1])
        print("words = %d" % words)
        print("size = %d" % size)

        # One pre-compiled format for a whole vector, so each vector is read
        # and unpacked in a single call instead of float by float.
        record = struct.Struct("%df" % size)

        for _ in range(words):
            # Read one word: collect bytes up to the separating space.
            # read() returns an empty string at EOF, which must end the loop;
            # otherwise a truncated file would spin here forever.
            chars = []
            while True:
                c = input_file.read(1)
                if not c or c == b" ":
                    break
                chars.append(c)
            word = b"".join(chars).strip()

            # Read the word's vector. The array is sized from the header
            # rather than a fixed length, so any dimension is supported.
            # A short read makes unpack() raise struct.error.
            raw = input_file.read(record.size)
            vector = np.array(record.unpack(raw), dtype=float)

            # Store the word and its vector in the dict.
            word_vector[word.decode("utf-8")] = vector

    print("load vectors finish")
    return word_vector

if __name__ == '__main__':
if 2 != len(sys.argv):
print "Usage: ", sys.argv[0], "vectors.bin"
sys.exit(-1)
d = load_vectors(sys.argv[1])
print d[u'真的']

0 comments on commit 3ecc840

Please sign in to comment.