
Commit

20141020 new add files
cmathx committed Oct 21, 2014
1 parent 2c0568c commit 4e1617e
Showing 16 changed files with 608 additions and 385 deletions.
2 changes: 1 addition & 1 deletion code/.idea/code.iml

Some generated files are not rendered by default.

821 changes: 474 additions & 347 deletions code/.idea/workspace.xml

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions code/cjw/CF/ItemCF.py
@@ -1,8 +1,8 @@
# __author__ = 'cjweffort'
# -*- coding: utf-8 -*-
import math
-import Rate
-from cjw.AlreadyPublishNews import *
+# import Rate
+# from cjw.AlreadyPublishNews import *
from cjw.PLSA.plsaRecommend import *

class ItemBasedCF:
@@ -43,7 +43,7 @@ def ItemSimilarity(self):
for i,related_items in C.items():
self.W.setdefault(i,{})
for j,cij in related_items.items():
-self.W[i][j] = cij / (math.sqrt(N[i] * N[j]))
+self.W[i][j] = cij / (math.sqrt(N[i] * N[j]))#(N[i] + N[j] - cij)
return self.W

# read in document similarity based on the topic model results
@@ -69,7 +69,7 @@ def Recommend(self,user,K=3,N=1):
rank = dict()
action_item = self.train[user] # items the user has interacted with, and their ratings
for item,score in action_item.items():
-for j,wj in self.W[item].items():#sorted(self.W[item].items(),key=lambda x:x[1],reverse=True):#[0:K]:
+for j,wj in self.W1[item].items():#sorted(self.W[item].items(),key=lambda x:x[1],reverse=True):#[0:K]:
if j in action_item.keys():
continue
rank.setdefault(j,0)
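For reference, the similarity that ItemSimilarity computes above is cosine-normalized co-occurrence: W[i][j] = C[i][j] / sqrt(N[i] * N[j]), where N[i] counts the users who touched item i and C[i][j] counts the users who touched both i and j; the commented-out denominator (N[i] + N[j] - cij) would give a Jaccard-style variant instead. A minimal runnable sketch of the same idea, using a hypothetical train dict in the {user: {item: rating}} shape the class appears to expect:

import math

# hypothetical toy data, not the repo's training set
train = {
    'u1': {'a': 1, 'b': 1},
    'u2': {'a': 1, 'b': 1, 'c': 1},
    'u3': {'b': 1, 'c': 1},
}

N = {}  # N[i]: number of users who interacted with item i
C = {}  # C[i][j]: number of users who interacted with both i and j
for user, items in train.items():
    for i in items:
        N[i] = N.get(i, 0) + 1
        for j in items:
            if i == j:
                continue
            C.setdefault(i, {})
            C[i][j] = C[i].get(j, 0) + 1

W = {}
for i, related_items in C.items():
    W[i] = {}
    for j, cij in related_items.items():
        # cosine normalization, as in the hunk above
        W[i][j] = cij / math.sqrt(N[i] * N[j])

print(W['a']['b'])  # 2 / sqrt(2 * 3), roughly 0.816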
Binary file modified code/cjw/CF/ItemCF.pyc
Binary file modified code/cjw/CF/Rate.pyc
Binary file modified code/cjw/CF/__init__.pyc
Binary file modified code/cjw/PLSA/__init__.pyc
Binary file removed code/cjw/PLSA/data/.document.csv.swp
16 changes: 8 additions & 8 deletions code/cjw/PLSA/plsaRecommend.py
@@ -23,10 +23,10 @@ def createDocMapAndClickInfo(total_set_file, doc_set_file):
fp_total_set = open(total_set_file, 'r')
if is_write_need_file == False:
fp_doc_set = open(doc_set_file, 'w')
-fp_doc_map1 = open('data//doc_map1.csv', 'w')
-fp_doc_map2 = open('data//doc_map2.csv', 'w')
-fp_doc_click_count = open('data//doc_click_count.csv', 'w')
-fp_user_doc_click_count = open('data//user_doc_click_count.csv', 'w')
+fp_doc_map1 = open('../PLSA/data/doc_map1.csv', 'w')
+fp_doc_map2 = open('../PLSA/data/doc_map2.csv', 'w')
+fp_doc_click_count = open('../PLSA/data/doc_click_count.csv', 'w')
+fp_user_doc_click_count = open('../PLSA/data/user_doc_click_count.csv', 'w')
cnt = 0
pynlpir.open()
for line in fp_total_set:
@@ -44,10 +44,10 @@ def createDocMapAndClickInfo(total_set_file, doc_set_file):
doc_map2[cnt] = word[1]
cnt += 1
if is_write_need_file == False:
-# title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
+title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
content_split_result = pynlpir.nlpir.ParagraphProcess(word[5], True)
# make sure the news id mapping is correct
-fp_doc_set.write('%s\t%s' %(word[1], content_split_result))#, content_split_result))
+fp_doc_set.write('%s\t%s\t%s\n' %(word[1], title_split_result, content_split_result))#, content_split_result))

# doc_map = sorted(doc_map1.items(), key=lambda d:d[1], reverse=False)
if is_write_need_file == False:
@@ -116,7 +116,7 @@ def computeUserItemScoreAndRecommend(user_set, doc_set, doc_map1, doc_click_coun
cluster_click_count.setdefault(clu, 0)
cluster_click_count[clu] += cnt

-fp_recommend = open('E://Plan-Action//CCF//news_recommend//code//recommend//plsa_recommend.csv', 'w')
+fp_recommend = open('../../../code/recommend/plsa_recommend.csv', 'w')
fp_recommend.write('userid,newsid\n')
starttime = datetime.datetime.now()
print 'start to recommend'
@@ -154,7 +154,7 @@ def computeUserItemScoreAndRecommend(user_set, doc_set, doc_map1, doc_click_coun


if __name__ == '__main__':
-total_set_file = 'E://Plan-Action//CCF//news_recommend//code//data//total_set.txt'
+total_set_file = '../../data/total_set.txt'
doc_set_file = 'data//document.csv'
user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count = createDocMapAndClickInfo(total_set_file, doc_set_file)

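The path changes above swap absolute Windows paths for paths relative to the module, which makes the repo portable but still assumes a particular working directory at run time. A hedged alternative is to anchor paths to the script's own location; this sketch only mirrors file names from the diff, and BASE_DIR is an assumption rather than something the repo defines:

import os

# resolve paths against this script's directory, not the working directory
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

doc_map1_path = os.path.join(BASE_DIR, 'data', 'doc_map1.csv')
recommend_path = os.path.normpath(
    os.path.join(BASE_DIR, '..', '..', 'recommend', 'plsa_recommend.csv'))

print(doc_map1_path)
print(recommend_path)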
Binary file modified code/cjw/PLSA/utils.pyc
Binary file modified code/cjw/__init__.pyc
78 changes: 65 additions & 13 deletions code/cjw/gensim/GensimTest.py
@@ -1,6 +1,7 @@
# __author__ = 'cjweffort'
# -*- coding: utf-8 -*-

+import math
from gensim import corpora, models, similarities
import logging

@@ -21,13 +22,18 @@
print dictionary.token2id
print 'convert documents from string form into id-based document vectors'
corpus = [dictionary.doc2bow(text) for text in texts]
-print corpus
+print corpus, type(corpus)
+'''
+Compute term frequency / inverse document frequency by hand: see the end of this file
+'''
+
print 'train a TF-IDF model on the training documents'
tfidf = models.TfidfModel(corpus)
+print tfidf
print 'compute the TF-IDF matrix from the TF-IDF model'
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
-print doc
+print doc, type(doc)
print tfidf.dfs
print tfidf.idfs

@@ -39,26 +45,72 @@
for doc in corpus_lsi:
print doc

-print 'train the LDA model'
-lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
-lda.print_topics(2)
-print 'compute document-topic relevance with the LDA model'
-corpus_lda = lda[corpus_tfidf]
-for doc in corpus_lda:
-print doc
+# print 'train the LDA model'
+# lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
+# lda.print_topics(2)
+# print 'compute document-topic relevance with the LDA model'
+# corpus_lda = lda[corpus_tfidf]
+# for doc in corpus_lda:
+# print doc

+result = (0.67211468809878583 * 0.44124825208697871 - 0.54880682119356106 * 0.83594920480338963) / math.sqrt(0.67211468809878583 * 0.67211468809878583 + \
+0.54880682119356106 * 0.54880682119356106) / math.sqrt(0.44124825208697871 * 0.44124825208697871 + 0.83594920480338963 * 0.83594920480338963)
+print result

print 'build the index'
-index = similarities.MatrixSimilarity(lsi[corpus])
+index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
# print index
print 'vectorize the query document'
-query = 'gold silver truck'
+query = 'Shipment of gold damaged in a fire'
+# query = 'gold silver truck'
query_bow = dictionary.doc2bow(query.lower().split())
print query_bow
+print tfidf[query_bow]
print 'map it into the 2-D topic space with the previously trained LSI model'
-query_lsi = lsi[query_bow]
+query_lsi = lsi[tfidf[query_bow]]
print query_lsi

print 'compute its cosine similarity to the docs in the index'
sims = index[query_lsi]
print list(enumerate(sims))
print 'sort by similarity'
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
print sort_sims

+# word_occur = dict()
+# word_sum = []
+# for ll in corpus:
+# t_sum = 0
+# for tup in ll:
+# word_occur.setdefault(tup[0], 0)
+# word_occur[tup[0]] += 1
+# t_sum += tup[1]
+# word_sum.append(t_sum)
+# tf_list = []
+# idfs_list = []
+# tfidfs_list = []
+# cnt = 0
+# for ll in corpus:
+# t_lst1 = []
+# t_lst2 = []
+# t_lst3 = []
+# for tup in ll:
+# tf = 1.0 * tup[1] / word_sum[cnt]
+# idfs = math.log(3.0 / word_occur[tup[0]])
+# t_lst1.append((tup[0], tf))
+# t_lst2.append((tup[0], idfs))
+# t_lst3.append((tup[0], tf * idfs))
+# tf_list.append(t_lst1)
+# idfs_list.append(t_lst2)
+# tfidfs_list.append(t_lst3)
+# cnt += 1
+#
+# print 'tf_list'
+# for ele in tf_list:
+# print ele
+# print 'idfs_list'
+# for ele in idfs_list:
+# print ele
+# print 'tfidfs_list'
+# for ele in tfidfs_list:
+# print ele
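The two fixes in this file (building the index from lsi[corpus_tfidf] instead of lsi[corpus], and querying with lsi[tfidf[query_bow]] instead of lsi[query_bow]) enforce one rule: documents and queries must pass through the identical bow -> tfidf -> lsi chain before similarities are compared. A self-contained sketch of that chain, with three toy documents that are illustrative only:

from gensim import corpora, models, similarities

texts = [['gold', 'silver', 'truck'],
         ['shipment', 'of', 'gold', 'damaged', 'in', 'a', 'fire'],
         ['delivery', 'of', 'silver', 'arrived', 'in', 'a', 'silver', 'truck']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

# the index lives in the same space the queries will be mapped into
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

query_bow = dictionary.doc2bow('gold silver truck'.lower().split())
query_lsi = lsi[tfidf[query_bow]]  # tfidf first, then lsi, as in the fix above
sims = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])
print(sims)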
45 changes: 40 additions & 5 deletions code/cjw/gensim/TFIDFRecommend.py
@@ -14,7 +14,7 @@
logging.basicConfig(format=' %(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

print 'build the doc mapping table'
-doc_set_file = 'data/document.csv'
+doc_set_file = '../PLSA/data/document.csv'
total_set_file = '../../data/total_set.txt'
user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count = \
createDocMapAndClickInfo(total_set_file, doc_set_file)
@@ -28,12 +28,45 @@
print 'Chinese word segmentation'
corpus = plsaForNewsCluster.Corpus()
original_texts = []
+title_no_key_word = 0
+content_no_key_word = 0
for line in fp_doc_set:
ele = line.split('\t')
-document = plsaForNewsCluster.Document("", ele[1]) #prepare words which have already been segmented
-document.split()
+document = plsaForNewsCluster.Document(ele[1], ele[2]) #prepare words which have already been segmented
+title_tag, content_tag = document.split()
+if title_tag == False:
+title_no_key_word += 1
+if content_tag == False:
+content_no_key_word += 1
corpus.add_document(document)
-original_texts.append(document.content_words)
+original_texts.append(document.title_words)
+print 'titles with no extracted keywords: %d' %title_no_key_word
+print 'contents with no extracted keywords: %d' %content_no_key_word

+fp_title_set = open('/tmp/title.csv', 'w')
+fp_content_set = open('/tmp/content.csv', 'w')
+for doc in corpus.documents:
+tag = True
+for word in doc.getTitleKeyWords():
+if tag == True:
+tag = False
+else:
+fp_title_set.write('\t')
+fp_title_set.write('%s' %word)
+fp_title_set.write('\n')
+# fp_title_set.write(str(unicode(doc.getTitleKeyWords(), 'gbk')) + '\n')
+for doc in corpus.documents:
+tag = True
+for word in doc.getContentKeyWords():
+if tag == True:
+tag = False
+else:
+fp_content_set.write('\t')
+fp_content_set.write('%s' %word)
+fp_content_set.write('\n')
+# fp_content_set.write(str(unicode(doc.getContentKeyWords(), 'gbk')) + '\n')
+fp_title_set.close()
+fp_content_set.close()

LOW_FREQUENCE = 50
print 'remove low-frequency words that occur at most LOW_FREQUENCE times in the corpus'
@@ -92,6 +125,8 @@
lda.print_topics(topic_num)
print 'compute document-topic relevance with the LDA model'
corpus_lda = lda[corpus_tfidf]


# fp_lda_doc_pro = open('data/lda_doc_probability.csv', 'w')
# cnt = 0
# for doc in corpus_lda:
Expand All @@ -106,7 +141,7 @@
# print 'cnt = ', cnt

print 'build the index'
-index = similarities.MatrixSimilarity(lda[corpus])
+index = similarities.MatrixSimilarity(lda[corpus_tfidf])


length = len(texts)
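A side note on the keyword-dump loops added in this file: the first/rest tag flag that decides when to emit a tab can be replaced with str.join, which places the separators in one step. A minimal sketch; docs stands in for corpus.documents and is illustrative only:

# write each document's keywords as one tab-separated line
docs = [['gold', 'price', 'rally'], ['silver', 'truck']]
with open('/tmp/title.csv', 'w') as fp_title_set:
    for words in docs:
        fp_title_set.write('\t'.join(words) + '\n')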
23 changes: 16 additions & 7 deletions code/cjw/gensim/plsaForNewsCluster.py
@@ -19,6 +19,7 @@


def split(paragraph):
+tag = True
words = []
# print paragraph
count = 0
@@ -29,14 +30,15 @@ def split(paragraph):
if len(word) >= 2 and len(word[1]) > 0 and len(word[0]) >= 4:
# only consider nouns, verbs, adjectives, etc., and the word must be longer than one character
cc = word[1][0]
-if cc == 'n' or cc == 's' or cc == 'f'\
-or cc == 'v' or cc == 'a' or cc == 'b' or cc == 'z'\
-or cc == 'r' or cc == 'd':
+if cc == 'n':# or cc == 's' or cc == 'f'\
+# or cc == 'v' or cc == 'a' or cc == 'b' or cc == 'z'\
+# or cc == 'r' or cc == 'd':
words.append(word[0])
count += 1
if count == 0:
-print 'no keywords were extracted!'
-return words
+tag = False
+# print 'no keywords were extracted!'
+return words, tag

class Document(object):

@@ -75,8 +77,15 @@ def split(self):
lowercase everything; preserve contractions; disallow strings that
include non-letters.
'''
-# self.title_words = split(self.title)
-self.content_words = split(self.content)
+self.title_words, title_tag = split(self.title)
+self.content_words, content_tag = split(self.content)
+return title_tag, content_tag

+def getTitleKeyWords(self):
+return self.title_words

+def getContentKeyWords(self):
+return self.content_words

class Corpus(object):

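The reworked split() above keeps only words whose POS tag starts with 'n' (nouns) and signals through the returned tag whether anything survived the filter. A minimal sketch of that contract without the pynlpir dependency; tagged is a hypothetical (word, pos) list standing in for the segmenter's output:

def extract_keywords(tagged):
    # keep nouns only, mirroring the cc == 'n' branch above
    words = [w for w, pos in tagged if pos and pos[0] == 'n']
    tag = len(words) > 0  # False means no keywords were extracted
    return words, tag

tagged = [('gold', 'n'), ('arrived', 'v'), ('quickly', 'd'), ('truck', 'n')]
words, tag = extract_keywords(tagged)
print(words)  # ['gold', 'truck']
print(tag)    # True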
Binary file modified code/cjw/gensim/plsaForNewsCluster.pyc
Binary file modified code/cjw/gensim/utils.pyc
