Merge branch 'master' of github.com:warmheartli/ChatBotCourse

eight-corner · Oct 10, 2016 · 3ecc840 · 3ecc840
2 parents 5a51fd0 + 39468b7
commit 3ecc840
Show file tree

Hide file tree

Showing 11 changed files with 99 additions and 3 deletions.
diff --git a/word2vec/compute-accuracy.c b/word2vec/compute-accuracy.c
@@ -16,7 +16,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
-#include <malloc.h>
+//#include <malloc.h>
+#include <stdlib.h>
 #include <ctype.h>
 
 const long long max_size = 2000;         // max length of strings

diff --git a/word2vec/demo-analogy.sh b/word2vec/demo-analogy.sh
diff --git a/word2vec/demo-classes.sh b/word2vec/demo-classes.sh
diff --git a/word2vec/demo-phrase-accuracy.sh b/word2vec/demo-phrase-accuracy.sh
diff --git a/word2vec/demo-phrases.sh b/word2vec/demo-phrases.sh
diff --git a/word2vec/demo-word-accuracy.sh b/word2vec/demo-word-accuracy.sh
diff --git a/word2vec/demo-word.sh b/word2vec/demo-word.sh
diff --git a/word2vec/distance.c b/word2vec/distance.c
@@ -15,7 +15,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
-#include <malloc.h>
+//#include <malloc.h>
+#include <stdlib.h>
 
 const long long max_size = 2000;         // max length of strings
 const long long N = 40;                  // number of closest words that will be shown

diff --git a/word2vec/word-analogy.c b/word2vec/word-analogy.c
@@ -15,7 +15,8 @@
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
-#include <malloc.h>
+//#include <malloc.h>
+#include <stdlib.h>
 
 const long long max_size = 2000;         // max length of strings
 const long long N = 40;                  // number of closest words that will be shown

diff --git a/word_segment.py b/word_segment.py
@@ -0,0 +1,31 @@
+# coding:utf-8
+
+import sys
+reload(sys)
+sys.setdefaultencoding( "utf-8" )
+
+import jieba
+from jieba import analyse
+
+def segment(input, output):
+    input_file = open(input, "r")
+    output_file = open(output, "w")
+    while True:
+        line = input_file.readline()
+        if line:
+            line = line.strip()
+            seg_list = jieba.cut(line)
+            segments = ""
+            for str in seg_list:
+                segments = segments + " " + str
+            output_file.write(segments)
+        else:
+            break
+    input_file.close()
+    output_file.close()
+
+if __name__ == '__main__':
+    if 3 != len(sys.argv):
+        print "Usage: ", sys.argv[0], "input output"
+        sys.exit(-1)
+    segment(sys.argv[1], sys.argv[2]);
diff --git a/word_vectors_loader.py b/word_vectors_loader.py
@@ -0,0 +1,62 @@
+# coding:utf-8
+
+import sys
+import struct
+import math
+import numpy as np
+
+# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
+# the process-wide default for implicit str/unicode conversion can be UTF-8.
+reload(sys)
+sys.setdefaultencoding( "utf-8" )
+
+# Intended upper bound on word length.
+# NOTE(review): no code in this file truncates words to this limit.
+max_w = 50
+# Number of bytes read from the file per vector component.
+float_size = 4
+
+def load_vectors(input):
+    print "begin load vectors"
+
+    input_file = open(input, "rb")
+
+    # 获取词表数目及向量维度
+    words_and_size = input_file.readline()
+    words_and_size = words_and_size.strip()
+    words = long(words_and_size.split(' ')[0])
+    size = long(words_and_size.split(' ')[1])
+    print "words =", words
+    print "size =", size
+
+    word_vector = {}
+
+    for b in range(0, words):
+        a = 0
+        word = ''
+        # 读取一个词
+        while True:
+            c = input_file.read(1)
+            word = word + c
+            if False == c or c == ' ':
+                break
+            if a < max_w and c != '\n':
+                a = a + 1
+        word = word.strip()
+
+        # 读取词向量
+        vector = np.empty([200])
+        for index in range(0, size):
+            m = input_file.read(float_size)
+            (weight,) = struct.unpack('f', m)
+            vector[index] = weight
+
+        # 将词及其对应的向量存到dict中
+        word_vector[word.decode('utf-8')] = vector
+
+    input_file.close()
+
+    print "load vectors finish"
+    return word_vector
+
+if __name__ == '__main__':
+    if 2 != len(sys.argv):
+        print "Usage: ", sys.argv[0], "vectors.bin"
+        sys.exit(-1)
+    d = load_vectors(sys.argv[1])
+    print d[u'真的']