Commit

first commit
lichuang committed Jun 8, 2016
0 parents commit dce9018
Showing 29 changed files with 966 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# page-classify
48 changes: 48 additions & 0 deletions binary_classify.py
@@ -0,0 +1,48 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


X = []

# The first three sentences are the training samples
X.append("fuck you")
X.append("fuck you all")
X.append("hello everyone")

# The last two sentences are the test samples
X.append("fuck me")
X.append("hello boy")

# y holds the labels of the training samples
y = [1, 1, 0]

vectorizer = TfidfVectorizer()

# Fit the TF-IDF transform on the first three sentences of X
X_train = vectorizer.fit_transform(X[:-2])

# Transform the last two sentences of X with the TF-IDF fitted above
X_test = vectorizer.transform(X[-2:])

# Train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(X_train, y)

# Predict on the test samples
predictions = classifier.predict(X_test)
print predictions

# True labels of the two test samples
y_test = [1, 0]
false_positive_rate, recall, thresholds = roc_curve(y_test, predictions)
print false_positive_rate, recall, thresholds
roc_auc = auc(false_positive_rate, recall)
plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.show()
10 changes: 10 additions & 0 deletions class_feature.py
@@ -0,0 +1,10 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

from sklearn.feature_extraction import DictVectorizer

# One-hot encode the categorical "city" feature
onehot_encoder = DictVectorizer()
instances = [{'city': '北京'},{'city': '天津'}, {'city': '上海'}]
print(onehot_encoder.fit_transform(instances).toarray())
94 changes: 94 additions & 0 deletions complete_link.py
@@ -0,0 +1,94 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

import re
from conn import Conn
from elasticsearch import Elasticsearch


def FindToken(cutlist, char):
    return char in cutlist

def Cut(cutlist, lines):
    # Split the character list into fragments at every delimiter in cutlist
    l = []
    line = []

    for i in lines:
        if FindToken(cutlist, i):
            l.append("".join(line))
            l.append(i)
            line = []
        else:
            line.append(i)
    return l

def FindLongestSentence(lines):
    # Chinese and ASCII punctuation used as sentence delimiters
    cutlist = "\/[。,,!……!《》<>\"'::?\?、\|“”‘’;]{}(){}【】(){}():?!。,;、~——+%%`:“”\"'‘\n\r".decode('utf8')
    l = Cut(list(cutlist), list(lines.decode('utf8')))
    longest_sentence = ""
    max_len = 0
    for line in l:
        if line.strip() != "":  # a fragment may still contain spaces
            li = line.strip().split()
            for sentence in li:
                if len(sentence) > max_len:
                    max_len = len(sentence)
                    longest_sentence = sentence

    start = lines.decode('utf8').find(longest_sentence)
    end = start + max_len
    utf8_lines = lines.decode('utf8')
    pre = utf8_lines[0:start]
    after = utf8_lines[end:]
    return pre, longest_sentence, after

def removeIllegalChar(line):
    return re.sub('\[|\]|\/|\'|\"|\(|\)|\!|\?|\~', '', line)

es = Elasticsearch()

conn = Conn().getConnection()
cursor = conn.cursor()
upcursor = conn.cursor()
sql = "select id, title, substring_index(content,'相关原创文章,敬请关注',1) from CrawlPage where content not like '%</a>%'"
cursor.execute(sql)
for row in cursor.fetchall():
    id = row[0]
    title = row[1]
    content = row[2]
    title = re.sub('\[|\]|\/|\'|\"|\(|\)|\!|\?|\~|\-', '', title)

    try:
        # Find the best-matching blog post for this page title in the "app" index
        res = es.search(index="app", body={"fields":["title"],"size":1,"query": {"query_string": {"query":title}}})
        for hit in res['hits']['hits']:
            print "process:", id, title
            # Split the longest sentence in the middle and insert a link to the matching post
            pre, sentence, after = FindLongestSentence(content)
            middle = len(sentence) / 2
            left = sentence[0:middle]
            right = sentence[middle:]
            new_content = "%s%s%s%s%s%s%s%s%s" % (
                removeIllegalChar(pre),
                removeIllegalChar(left),
                "<a href='http://www.shareditor.com/blogshow/?blogId=",
                hit['_id'],
                "'>",
                hit['fields']['title'][0],
                "</a>",
                removeIllegalChar(right),
                removeIllegalChar(after))
            update_sql = "update CrawlPage set content=\"%s\" where id=%d" % (new_content, id)
            upcursor.execute(update_sql)
            conn.commit()

    except Exception as e:
        print "Error:"
        print title
        print e
        sys.exit(-1)
11 changes: 11 additions & 0 deletions conn.py
@@ -0,0 +1,11 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import MySQLdb

class Conn:
    """Thin wrapper that hands out MySQL connections to the sharenote2.0 database."""
    def getConnection(self):
        conn = MySQLdb.connect(host="127.0.0.1",user="lichuang",passwd="qwerty",db="sharenote2.0",charset="utf8")
        return conn
13 changes: 13 additions & 0 deletions corpus.py
@@ -0,0 +1,13 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts for a tiny two-document corpus
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game']
vectorizer = CountVectorizer()
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_
28 changes: 28 additions & 0 deletions cut_and_cal_tfidf.py
@@ -0,0 +1,28 @@
# coding:utf-8

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import jieba
import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1",user="lichuang",passwd="qwerty",db="test",charset="utf8")
cursor = conn.cursor()

sql = "select content from page"
cursor.execute(sql)
corpus = []
for content in cursor.fetchall():
    # Segment each page with jieba and join the tokens with spaces
    seg_list = jieba.cut(content[0])
    line = ""
    for seg in seg_list:
        line = line + " " + seg
    corpus.append(line)
conn.commit()
conn.close()

# Term counts first, then TF-IDF weights over the segmented corpus
vectorizer = CountVectorizer()
csr_mat = vectorizer.fit_transform(corpus)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(csr_mat)
word = vectorizer.get_feature_names()
print tfidf.toarray()
5 changes: 5 additions & 0 deletions elasticsearch_api_demo.py
@@ -0,0 +1,5 @@
from elasticsearch import Elasticsearch

es = Elasticsearch()
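# Search the local "app" index for the query string and print how many documents matched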
res = es.search(index="app", body={"fields":["title"],"size":1,"query": {"query_string": {"query":"fdsfsd"}}})
print res['hits']['total']
101 changes: 101 additions & 0 deletions feature_extract.py
@@ -0,0 +1,101 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import jieba
from jieba import analyse
import MySQLdb
import numpy as np

conn = MySQLdb.connect(host="127.0.0.1",user="lichuang",passwd="qwerty",db="sharenote2.0",charset="utf8")

def get_segment():
    # Segment every crawled page with jieba and store the result in the segment column
    cursor = conn.cursor()
    sql = "select id, title, content from CrawlPage"
    cursor.execute(sql)
    jieba.analyse.set_stop_words("stopwords.txt")
    for result in cursor.fetchall():
        id = result[0]
        content = result[1] + ' ' + result[2]
        seg_list = jieba.cut(content)
        line = ""
        for seg in seg_list:
            line = line + " " + seg
        line = line.replace('\'', ' ')
        sql = "update CrawlPage set segment='%s' where id=%d" % (line, id)
        try:
            cursor.execute(sql)
            conn.commit()
        except Exception as e:
            print line
            print e
            sys.exit(-1)
    conn.close()

def feature_dump():
    # Concatenate the segmented pages of each category, compute TF-IDF, and dump
    # each category's weighted terms into tfidf_0 .. tfidf_4
    cursor = conn.cursor()
    category = {}
    category[0] = 'isTec'
    category[1] = 'isSoup'
    category[2] = 'isML'
    category[3] = 'isMath'
    category[4] = 'isNews'

    corpus = []
    for index in range(0, 5):
        sql = "select segment from CrawlPage where " + category[index] + "=1"
        print sql
        cursor.execute(sql)
        line = ""
        for result in cursor.fetchall():
            segment = result[0]
            line = line + " " + segment
        corpus.append(line)

    conn.commit()
    conn.close()

    vectorizer = CountVectorizer()
    csr_mat = vectorizer.fit_transform(corpus)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(csr_mat)
    word = vectorizer.get_feature_names()
    print tfidf.toarray()

    for index in range(0, 5):
        f = open("tfidf_%d" % index, "wb")
        for i in np.argsort(-tfidf.toarray()[index]):
            if tfidf.toarray()[index][i] > 0:
                f.write("%f %s\n" % (tfidf.toarray()[index][i], word[i]))
        f.close()

def feature_extraction():
    # Keep the words whose top TF-IDF weight clearly dominates the runner-up
    # across categories; these are the most discriminative features
    d = {}
    for index in range(0, 5):
        f = open("tfidf_%d" % index, "r")
        lines = f.readlines()
        for line in lines:
            word = line.split(' ')[1][:-1]
            tfidf = line.split(' ')[0]
            if word in d:
                d[word] = np.append(d[word], tfidf)
            else:
                d[word] = np.array(tfidf)
        f.close()

    f = open("features.txt", "wb")
    for word in d:
        if d[word].size >= 2:
            index = np.argsort(d[word])
            if float(d[word][index[d[word].size - 0 - 1]]) - float(d[word][index[d[word].size - 1 - 1]]) > 0.01:
                f.write("%s %s\n" % (word, d[word]))
    f.close()

if __name__ == '__main__':
    #get_segment()
    feature_dump()
    feature_extraction()
63 changes: 63 additions & 0 deletions gen_sitemap.py
@@ -0,0 +1,63 @@
# coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import MySQLdb
from conn import Conn

begin='''<?xml version="1.0" encoding="UTF-8"?>
<urlset
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
'''
end='</urlset>'
subbegin=' <url><loc>'
subend='</loc></url>'
conn = Conn().getConnection()

def addUrl(sitemap, url):
return sitemap+"%s%s%s\n" % (subbegin, url, subend)

def addStaticUrl(sitemap):
sitemap = addUrl(sitemap, 'http://www.shareditor.com/')
sitemap = addUrl(sitemap, 'http://www.shareditor.com/bloglist/1')
sitemap = addUrl(sitemap, 'http://www.shareditor.com/bloglist/2')
sitemap = addUrl(sitemap, 'http://www.shareditor.com/bloglist/3')
sitemap = addUrl(sitemap, 'http://www.shareditor.com/bloglist/4')
sitemap = addUrl(sitemap, 'http://www.shareditor.com/bloglist/5')
sitemap = addUrl(sitemap, 'http://favorite.shareditor.com/favorite/')
sitemap = addUrl(sitemap, 'http://favorite.shareditor.com/favorite/categorylist?category=机器学习')
sitemap = addUrl(sitemap, 'http://favorite.shareditor.com/favorite/categorylist?category=技术文章')
sitemap = addUrl(sitemap, 'http://favorite.shareditor.com/favorite/categorylist?category=新闻资讯')
sitemap = addUrl(sitemap, 'http://favorite.shareditor.com/favorite/categorylist?category=数学知识')
sitemap = addUrl(sitemap, 'http://favorite.shareditor.com/favorite/categorylist?category=鸡汤文章')
return sitemap

def gen_sitemap(sitemap):
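    # Emit one <url> entry for every blog post in the BlogPost table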
cursor = conn.cursor()
sql = "select id from BlogPost"
cursor.execute(sql)
for row in cursor.fetchall():
url='http://www.shareditor.com/blogshow/?blogId=%d' % row[0]
sitemap = addUrl(sitemap, url)
return sitemap

def gen_favoritesitemap(sitemap):
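    # Emit one <url> entry for every crawled page in the CrawlPage table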
cursor = conn.cursor()
sql = "select id from CrawlPage"
cursor.execute(sql)
for row in cursor.fetchall():
url='http://favorite.shareditor.com/favorite/pageshow?pageid=%d' % row[0]
sitemap = addUrl(sitemap, url)
return sitemap

if __name__ == '__main__':
sitemap=begin
sitemap=addStaticUrl(sitemap)
sitemap=gen_sitemap(sitemap)
sitemap=gen_favoritesitemap(sitemap)
sitemap+=end
print sitemap
