
Commit

20141020 new add files
cmathx committed Oct 21, 2014
1 parent 2c0568c commit 4e1617e
Showing 16 changed files with 608 additions and 385 deletions.
2 changes: 1 addition & 1 deletion code/.idea/code.iml

Some generated files are not rendered by default.

821 changes: 474 additions & 347 deletions code/.idea/workspace.xml

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions code/cjw/CF/ItemCF.py
@@ -1,8 +1,8 @@
# __author__ = 'cjweffort'
# -*- coding: utf-8 -*-
import math
-import Rate
-from cjw.AlreadyPublishNews import *
+# import Rate
+# from cjw.AlreadyPublishNews import *
from cjw.PLSA.plsaRecommend import *

class ItemBasedCF:
@@ -43,7 +43,7 @@ def ItemSimilarity(self):
for i,related_items in C.items():
self.W.setdefault(i,{})
for j,cij in related_items.items():
-self.W[i][j] = cij / (math.sqrt(N[i] * N[j]))
+self.W[i][j] = cij / (math.sqrt(N[i] * N[j]))#(N[i] + N[j] - cij)
return self.W

# read in document similarity based on the topic model results
@@ -69,7 +69,7 @@ def Recommend(self,user,K=3,N=1):
rank = dict()
action_item = self.train[user] # items the user has interacted with, and their ratings
for item,score in action_item.items():
-for j,wj in self.W[item].items():#sorted(self.W[item].items(),key=lambda x:x[1],reverse=True):#[0:K]:
+for j,wj in self.W1[item].items():#sorted(self.W[item].items(),key=lambda x:x[1],reverse=True):#[0:K]:
if j in action_item.keys():
continue
rank.setdefault(j,0)
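For reference, the similarity that ItemSimilarity computes above is cosine-normalized co-occurrence: W[i][j] = C[i][j] / sqrt(N[i] * N[j]), where N[i] counts the users who touched item i and C[i][j] counts the users who touched both i and j; the commented-out denominator (N[i] + N[j] - cij) would give a Jaccard-style variant instead. A minimal runnable sketch of the same idea, using a hypothetical train dict in the {user: {item: rating}} shape the class appears to expect:

import math

# hypothetical toy data, not the repo's training set
train = {
    'u1': {'a': 1, 'b': 1},
    'u2': {'a': 1, 'b': 1, 'c': 1},
    'u3': {'b': 1, 'c': 1},
}

N = {}  # N[i]: number of users who interacted with item i
C = {}  # C[i][j]: number of users who interacted with both i and j
for user, items in train.items():
    for i in items:
        N[i] = N.get(i, 0) + 1
        for j in items:
            if i == j:
                continue
            C.setdefault(i, {})
            C[i][j] = C[i].get(j, 0) + 1

W = {}
for i, related_items in C.items():
    W[i] = {}
    for j, cij in related_items.items():
        # cosine normalization, as in the hunk above
        W[i][j] = cij / math.sqrt(N[i] * N[j])

print(W['a']['b'])  # 2 / sqrt(2 * 3), roughly 0.816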
Binary file modified code/cjw/CF/ItemCF.pyc
Binary file modified code/cjw/CF/Rate.pyc
Binary file modified code/cjw/CF/__init__.pyc
Binary file modified code/cjw/PLSA/__init__.pyc
Binary file removed code/cjw/PLSA/data/.document.csv.swp
16 changes: 8 additions & 8 deletions code/cjw/PLSA/plsaRecommend.py
@@ -23,10 +23,10 @@ def createDocMapAndClickInfo(total_set_file, doc_set_file):
fp_total_set = open(total_set_file, 'r')
if is_write_need_file == False:
fp_doc_set = open(doc_set_file, 'w')
-fp_doc_map1 = open('data//doc_map1.csv', 'w')
-fp_doc_map2 = open('data//doc_map2.csv', 'w')
-fp_doc_click_count = open('data//doc_click_count.csv', 'w')
-fp_user_doc_click_count = open('data//user_doc_click_count.csv', 'w')
+fp_doc_map1 = open('../PLSA/data/doc_map1.csv', 'w')
+fp_doc_map2 = open('../PLSA/data/doc_map2.csv', 'w')
+fp_doc_click_count = open('../PLSA/data/doc_click_count.csv', 'w')
+fp_user_doc_click_count = open('../PLSA/data/user_doc_click_count.csv', 'w')
cnt = 0
pynlpir.open()
for line in fp_total_set:
@@ -44,10 +44,10 @@ def createDocMapAndClickInfo(total_set_file, doc_set_file):
doc_map2[cnt] = word[1]
cnt += 1
if is_write_need_file == False:
-# title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
+title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
content_split_result = pynlpir.nlpir.ParagraphProcess(word[5], True)
# make sure the news id mapping is correct
-fp_doc_set.write('%s\t%s' %(word[1], content_split_result))#, content_split_result))
+fp_doc_set.write('%s\t%s\t%s\n' %(word[1], title_split_result, content_split_result))#, content_split_result))

# doc_map = sorted(doc_map1.items(), key=lambda d:d[1], reverse=False)
if is_write_need_file == False:
@@ -116,7 +116,7 @@ def computeUserItemScoreAndRecommend(user_set, doc_set, doc_map1, doc_click_coun
cluster_click_count.setdefault(clu, 0)
cluster_click_count[clu] += cnt

-fp_recommend = open('E://Plan-Action//CCF//news_recommend//code//recommend//plsa_recommend.csv', 'w')
+fp_recommend = open('../../../code/recommend/plsa_recommend.csv', 'w')
fp_recommend.write('userid,newsid\n')
starttime = datetime.datetime.now()
print 'start to recommend'
@@ -154,7 +154,7 @@ def computeUserItemScoreAndRecommend(user_set, doc_set, doc_map1, doc_click_coun


if __name__ == '__main__':
-total_set_file = 'E://Plan-Action//CCF//news_recommend//code//data//total_set.txt'
+total_set_file = '../../data/total_set.txt'
doc_set_file = 'data//document.csv'
user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count = createDocMapAndClickInfo(total_set_file, doc_set_file)

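The path changes above swap absolute Windows paths for paths relative to the module, which makes the repo portable but still assumes a particular working directory at run time. A hedged alternative is to anchor paths to the script's own location; this sketch only mirrors file names from the diff, and BASE_DIR is an assumption rather than something the repo defines:

import os

# resolve paths against this script's directory, not the working directory
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

doc_map1_path = os.path.join(BASE_DIR, 'data', 'doc_map1.csv')
recommend_path = os.path.normpath(
    os.path.join(BASE_DIR, '..', '..', 'recommend', 'plsa_recommend.csv'))

print(doc_map1_path)
print(recommend_path)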
Binary file modified code/cjw/PLSA/utils.pyc
Binary file modified code/cjw/__init__.pyc
78 changes: 65 additions & 13 deletions code/cjw/gensim/GensimTest.py
@@ -1,6 +1,7 @@
# __author__ = 'cjweffort'
# -*- coding: utf-8 -*-

+import math
from gensim import corpora, models, similarities
import logging

@@ -21,13 +22,18 @@
print dictionary.token2id
print 'convert documents from string form into id-based document vectors'
corpus = [dictionary.doc2bow(text) for text in texts]
-print corpus
+print corpus, type(corpus)
+'''
+Compute term frequency / inverse document frequency by hand: see the end of this file
+'''
+
print 'train a TF-IDF model on the training documents'
tfidf = models.TfidfModel(corpus)
+print tfidf
print 'compute the TF-IDF matrix from the TF-IDF model'
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
-print doc
+print doc, type(doc)
print tfidf.dfs
print tfidf.idfs

@@ -39,26 +45,72 @@
for doc in corpus_lsi:
print doc

-print 'train the LDA model'
-lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
-lda.print_topics(2)
-print 'compute document-topic relevance with the LDA model'
-corpus_lda = lda[corpus_tfidf]
-for doc in corpus_lda:
-print doc
+# print 'train the LDA model'
+# lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
+# lda.print_topics(2)
+# print 'compute document-topic relevance with the LDA model'
+# corpus_lda = lda[corpus_tfidf]
+# for doc in corpus_lda:
+# print doc

+result = (0.67211468809878583 * 0.44124825208697871 - 0.54880682119356106 * 0.83594920480338963) / math.sqrt(0.67211468809878583 * 0.67211468809878583 + \
+0.54880682119356106 * 0.54880682119356106) / math.sqrt(0.44124825208697871 * 0.44124825208697871 + 0.83594920480338963 * 0.83594920480338963)
+print result

print 'build the index'
-index = similarities.MatrixSimilarity(lsi[corpus])
+index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
# print index
print 'vectorize the query document'
-query = 'gold silver truck'
+query = 'Shipment of gold damaged in a fire'
+# query = 'gold silver truck'
query_bow = dictionary.doc2bow(query.lower().split())
print query_bow
+print tfidf[query_bow]
print 'map it into the 2-D topic space with the previously trained LSI model'
-query_lsi = lsi[query_bow]
+query_lsi = lsi[tfidf[query_bow]]
print query_lsi

print 'compute its cosine similarity to the docs in the index'
sims = index[query_lsi]
print list(enumerate(sims))
print 'sort by similarity'
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
print sort_sims

+# word_occur = dict()
+# word_sum = []
+# for ll in corpus:
+# t_sum = 0
+# for tup in ll:
+# word_occur.setdefault(tup[0], 0)
+# word_occur[tup[0]] += 1
+# t_sum += tup[1]
+# word_sum.append(t_sum)
+# tf_list = []
+# idfs_list = []
+# tfidfs_list = []
+# cnt = 0
+# for ll in corpus:
+# t_lst1 = []
+# t_lst2 = []
+# t_lst3 = []
+# for tup in ll:
+# tf = 1.0 * tup[1] / word_sum[cnt]
+# idfs = math.log(3.0 / word_occur[tup[0]])
+# t_lst1.append((tup[0], tf))
+# t_lst2.append((tup[0], idfs))
+# t_lst3.append((tup[0], tf * idfs))
+# tf_list.append(t_lst1)
+# idfs_list.append(t_lst2)
+# tfidfs_list.append(t_lst3)
+# cnt += 1
+#
+# print 'tf_list'
+# for ele in tf_list:
+# print ele
+# print 'idfs_list'
+# for ele in idfs_list:
+# print ele
+# print 'tfidfs_list'
+# for ele in tfidfs_list:
+# print ele
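The two fixes in this file (building the index from lsi[corpus_tfidf] instead of lsi[corpus], and querying with lsi[tfidf[query_bow]] instead of lsi[query_bow]) enforce one rule: documents and queries must pass through the identical bow -> tfidf -> lsi chain before similarities are compared. A self-contained sketch of that chain, with three toy documents that are illustrative only:

from gensim import corpora, models, similarities

texts = [['gold', 'silver', 'truck'],
         ['shipment', 'of', 'gold', 'damaged', 'in', 'a', 'fire'],
         ['delivery', 'of', 'silver', 'arrived', 'in', 'a', 'silver', 'truck']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

# the index lives in the same space the queries will be mapped into
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

query_bow = dictionary.doc2bow('gold silver truck'.lower().split())
query_lsi = lsi[tfidf[query_bow]]  # tfidf first, then lsi, as in the fix above
sims = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])
print(sims)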
45 changes: 40 additions & 5 deletions code/cjw/gensim/TFIDFRecommend.py
@@ -14,7 +14,7 @@
logging.basicConfig(format=' %(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

print 'build the doc mapping table'
-doc_set_file = 'data/document.csv'
+doc_set_file = '../PLSA/data/document.csv'
total_set_file = '../../data/total_set.txt'
user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count = \
createDocMapAndClickInfo(total_set_file, doc_set_file)
@@ -28,12 +28,45 @@
print 'Chinese word segmentation'
corpus = plsaForNewsCluster.Corpus()
original_texts = []
+title_no_key_word = 0
+content_no_key_word = 0
for line in fp_doc_set:
ele = line.split('\t')
-document = plsaForNewsCluster.Document("", ele[1]) #prepare words which have already been segmented
-document.split()
+document = plsaForNewsCluster.Document(ele[1], ele[2]) #prepare words which have already been segmented
+title_tag, content_tag = document.split()
+if title_tag == False:
+title_no_key_word += 1
+if content_tag == False:
+content_no_key_word += 1
corpus.add_document(document)
-original_texts.append(document.content_words)
+original_texts.append(document.title_words)
+print 'titles with no extracted keywords: %d' %title_no_key_word
+print 'contents with no extracted keywords: %d' %content_no_key_word

+fp_title_set = open('/tmp/title.csv', 'w')
+fp_content_set = open('/tmp/content.csv', 'w')
+for doc in corpus.documents:
+tag = True
+for word in doc.getTitleKeyWords():
+if tag == True:
+tag = False
+else:
+fp_title_set.write('\t')
+fp_title_set.write('%s' %word)
+fp_title_set.write('\n')
+# fp_title_set.write(str(unicode(doc.getTitleKeyWords(), 'gbk')) + '\n')
+for doc in corpus.documents:
+tag = True
+for word in doc.getContentKeyWords():
+if tag == True:
+tag = False
+else:
+fp_content_set.write('\t')
+fp_content_set.write('%s' %word)
+fp_content_set.write('\n')
+# fp_content_set.write(str(unicode(doc.getContentKeyWords(), 'gbk')) + '\n')
+fp_title_set.close()
+fp_content_set.close()

LOW_FREQUENCE = 50
print 'remove low-frequency words that occur at most LOW_FREQUENCE times in the corpus'
@@ -92,6 +125,8 @@
lda.print_topics(topic_num)
print 'compute document-topic relevance with the LDA model'
corpus_lda = lda[corpus_tfidf]


# fp_lda_doc_pro = open('data/lda_doc_probability.csv', 'w')
# cnt = 0
# for doc in corpus_lda:
Expand All @@ -106,7 +141,7 @@
# print 'cnt = ', cnt

print 'build the index'
-index = similarities.MatrixSimilarity(lda[corpus])
+index = similarities.MatrixSimilarity(lda[corpus_tfidf])


length = len(texts)
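A side note on the keyword-dump loops added in this file: the first/rest tag flag that decides when to emit a tab can be replaced with str.join, which places the separators in one step. A minimal sketch; docs stands in for corpus.documents and is illustrative only:

# write each document's keywords as one tab-separated line
docs = [['gold', 'price', 'rally'], ['silver', 'truck']]
with open('/tmp/title.csv', 'w') as fp_title_set:
    for words in docs:
        fp_title_set.write('\t'.join(words) + '\n')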
23 changes: 16 additions & 7 deletions code/cjw/gensim/plsaForNewsCluster.py
@@ -19,6 +19,7 @@


def split(paragraph):
+tag = True
words = []
# print paragraph
count = 0
@@ -29,14 +30,15 @@ def split(paragraph):
if len(word) >= 2 and len(word[1]) > 0 and len(word[0]) >= 4:
# only consider nouns, verbs, adjectives, etc., and the word must be longer than one character
cc = word[1][0]
-if cc == 'n' or cc == 's' or cc == 'f'\
-or cc == 'v' or cc == 'a' or cc == 'b' or cc == 'z'\
-or cc == 'r' or cc == 'd':
+if cc == 'n':# or cc == 's' or cc == 'f'\
+# or cc == 'v' or cc == 'a' or cc == 'b' or cc == 'z'\
+# or cc == 'r' or cc == 'd':
words.append(word[0])
count += 1
if count == 0:
-print 'no keywords were extracted!'
-return words
+tag = False
+# print 'no keywords were extracted!'
+return words, tag

class Document(object):

@@ -75,8 +77,15 @@ def split(self):
lowercase everything; preserve contractions; disallow strings that
include non-letters.
'''
-# self.title_words = split(self.title)
-self.content_words = split(self.content)
+self.title_words, title_tag = split(self.title)
+self.content_words, content_tag = split(self.content)
+return title_tag, content_tag

+def getTitleKeyWords(self):
+return self.title_words

+def getContentKeyWords(self):
+return self.content_words

class Corpus(object):

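The reworked split() above keeps only words whose POS tag starts with 'n' (nouns) and signals through the returned tag whether anything survived the filter. A minimal sketch of that contract without the pynlpir dependency; tagged is a hypothetical (word, pos) list standing in for the segmenter's output:

def extract_keywords(tagged):
    # keep nouns only, mirroring the cc == 'n' branch above
    words = [w for w, pos in tagged if pos and pos[0] == 'n']
    tag = len(words) > 0  # False means no keywords were extracted
    return words, tag

tagged = [('gold', 'n'), ('arrived', 'v'), ('quickly', 'd'), ('truck', 'n')]
words, tag = extract_keywords(tagged)
print(words)  # ['gold', 'truck']
print(tag)    # True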
Binary file modified code/cjw/gensim/plsaForNewsCluster.pyc
Binary file modified code/cjw/gensim/utils.pyc
