
Commit

1.6.1
1. Cache RSS and web-page encodings in the datastore (see the sketch after this list)
2. Optionally append the date to the book title in the metadata
3. Optionally push only articles published within a configured time window
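
A minimal, standalone sketch of the encoding cache described in item 1, assuming Google App Engine's legacy db API and the chardet package; the model and query mirror the UrlEncoding/AutoDecoder code added to books/base.py below, but the helper function itself is illustrative only, not part of the commit.

    import urlparse
    import chardet
    from google.appengine.ext import db

    class UrlEncoding(db.Model):
        # One record per site: the cached encodings of its feed and its pages.
        netloc = db.StringProperty()
        feedenc = db.StringProperty()
        pageenc = db.StringProperty()

    def cached_decode(content, url, isfeed=True):
        """Decode raw bytes, consulting the datastore before running chardet."""
        netloc = urlparse.urlsplit(url)[1]
        urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
        enc = (urlenc.feedenc if isfeed else urlenc.pageenc) if urlenc else None
        if enc:
            try:
                return content.decode(enc)
            except UnicodeDecodeError:
                pass  # stale cache entry; fall through to fresh detection
        enc = chardet.detect(content)['encoding']
        if not enc:
            return content  # detection failed; hand back the raw bytes
        record = urlenc if urlenc else UrlEncoding(netloc=netloc)
        if isfeed:
            record.feedenc = enc
        else:
            record.pageenc = enc
        record.put()  # the next fetch from this site skips chardet
        try:
            return content.decode(enc)
        except UnicodeDecodeError:
            return content  # undecodable; return the raw bytes unchanged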
cdhigh committed Aug 10, 2013
1 parent ee26e6e commit 6475e4c
Showing 27 changed files with 389 additions and 1,960 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@ oGlobo.py
oGlobo.pyc
mh_correio.gif
mh_globo.gif
cv_correio.jpg
cv_globo.jpg
105 changes: 82 additions & 23 deletions books/base.py
@@ -6,10 +6,12 @@
"""

import os, re, urllib, urlparse, random, imghdr, logging
from datetime import datetime
from urllib2 import *
import chardet
from google.appengine.api import urlfetch
from google.appengine.runtime import apiproxy_errors
from google.appengine.ext import db

from bs4 import BeautifulSoup, Comment
from lib import feedparser
@@ -22,10 +24,20 @@

MAX_ASYNC_REQUESTS = 5 # at most 10 is supported; the larger the value, the more likely an over-quota exception

class UrlEncoding(db.Model):
#Cache each site's detected encoding; once detection succeeds, it never needs to be repeated
netloc = db.StringProperty()
feedenc = db.StringProperty()
pageenc = db.StringProperty()

class AutoDecoder:
def __init__(self):
# Wraps the datastore encoding cache and a per-site cache for article encodings
# chardet is very slow, which is why this elaborate caching and special handling is needed
def __init__(self, isfeed=True):
self.encoding = None
def decode(self, content):
self.isfeed = isfeed # True: feed, False: page

def decode(self, content, url):
result = content
if not ALWAYS_CHAR_DETECT and self.encoding: # first try the encoding detected last time
try:
@@ -39,22 +51,61 @@ def decode(self, content):
result = content
else: # remember it for next time to save work
self.encoding = encoding
#also persist it to the datastore
netloc = urlparse.urlsplit(url)[1]
urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
if urlenc:
enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
if enc != encoding:
if self.isfeed:
urlenc.feedenc = encoding
else:
urlenc.pageenc = encoding
urlenc.put()
elif self.isfeed:
UrlEncoding(netloc=netloc,feedenc=encoding).put()
else:
UrlEncoding(netloc=netloc,pageenc=encoding).put()
else: # no previous encoding information yet
self.encoding = chardet.detect(content)['encoding']
netloc = urlparse.urlsplit(url)[1]
urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
if urlenc: #check the datastore first
enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
if enc:
try:
result = content.decode(enc)
except UnicodeDecodeError: # decoding failed; re-detect the encoding
self.encoding = chardet.detect(content)['encoding']
else:
self.encoding = enc
return result
else: #nothing in the datastore yet
self.encoding = chardet.detect(content)['encoding']
else:
self.encoding = chardet.detect(content)['encoding']

#decode with the detected encoding
try:
result = content.decode(self.encoding)
except UnicodeDecodeError: # decoding failed; return the content unchanged
result = content
else:
#persist to the datastore
newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc)
if self.isfeed:
newurlenc.feedenc = self.encoding
else:
newurlenc.pageenc = self.encoding
newurlenc.put()
return result

class BaseFeedBook:
""" base class of Book """
title = ''
__author__ = ''
description = ''
publisher = ''
category = ''
max_articles_per_feed = 30
oldest_article = 7 # only download articles published within this many days; 0 means no limit
host = None # some sites require a Referer header when downloading images; configure it with this parameter
network_timeout = None # None means use the default
fetch_img_via_ssl = False # when the page is served over https, whether to rewrite its image URLs to https as well
@@ -220,6 +271,8 @@ def url_unescape(self, value):
def ParseFeedUrls(self):
""" return list like [(section,title,url,desc),..] """
urls = []
tnow = datetime.utcnow()
urladded = set()
for feed in self.feeds:
section, url = feed[0], feed[1]
isfulltext = feed[2] if len(feed) > 2 else False
@@ -230,23 +283,29 @@ def ParseFeedUrls(self):
if self.feed_encoding:
feed = feedparser.parse(result.content.decode(self.feed_encoding))
else:
feed = feedparser.parse(AutoDecoder().decode(result.content))
feed = feedparser.parse(AutoDecoder(True).decode(result.content,url))

urladded = set() # guard against duplicate articles produced by some RSS feeds
for e in feed['entries'][:self.max_articles_per_feed]:
if self.oldest_article > 0 and hasattr(e, 'published_parsed'):
delta = tnow - datetime(*(e.published_parsed[0:6]))
if delta.days*86400+delta.seconds > 86400*self.oldest_article:
self.log.debug("article '%s' is too old"%e.title)
continue
#support HTTPS
urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
if urlfeed not in urladded:
desc = None
if isfulltext:
if hasattr(e, 'content') and e.content[0].value:
desc = e.content[0].value
elif hasattr(e, 'summary'):
desc = e.summary
else:
self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title)
urls.append((section, e.title, urlfeed, desc))
urladded.add(urlfeed)
if urlfeed in urladded:
continue

desc = None
if isfulltext:
if hasattr(e, 'content') and e.content[0].value:
desc = e.content[0].value
elif hasattr(e, 'summary'):
desc = e.summary
else:
self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title)
urls.append((section, e.title, urlfeed, desc))
urladded.add(urlfeed)
else:
self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
return urls
@@ -260,7 +319,7 @@ def Items(self, opts=None):
urls = self.ParseFeedUrls()
readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
prevsection = ''
decoder = AutoDecoder()
decoder = AutoDecoder(False)
if USE_ASYNC_URLFETCH:
asyncurls = [(i,url) for i,(_a,_b,url,desc) in enumerate(urls) if not desc]
rpcs, i = [], 0
@@ -325,7 +384,7 @@ def Items(self, opts=None):
if self.page_encoding:
article = content.decode(self.page_encoding)
else:
article = decoder.decode(content)
article = decoder.decode(content,url)

#if the item is an image, title holds its MIME type
for title, imgurl, imgfn, content, brief in readability(article,url,opts):
@@ -372,7 +431,7 @@ def fetcharticle(self, url, decoder):
if self.page_encoding:
return content.decode(self.page_encoding)
else:
return decoder.decode(content)
return decoder.decode(content,url)

def readability(self, article, url, opts=None):
#use readability-lxml to extract the full text
@@ -587,7 +646,7 @@ def Items(self, opts=None):
for images: mime, url, filename, content
"""
cnt4debug = 0
decoder = AutoDecoder()
decoder = AutoDecoder(False)
timeout = self.timeout
for section, url in self.feeds:
cnt4debug += 1
@@ -604,7 +663,7 @@
if self.page_encoding:
content = content.decode(self.page_encoding)
else:
content = decoder.decode(content)
content = decoder.decode(content,url)

content = self.preprocess(content)
soup = BeautifulSoup(content, "lxml")
Binary file modified books/base.pyc
Binary file not shown.
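
Item 3 of the commit message is implemented by the oldest_article check added to ParseFeedUrls above; a minimal sketch of that age test, assuming feedparser-style entries that expose a published_parsed time tuple (the helper name and constant are illustrative, not from the repo):

    from datetime import datetime

    OLDEST_ARTICLE = 7  # days; 0 disables the age check, as in BaseFeedBook

    def is_too_old(entry, tnow=None):
        """True if a feed entry was published more than OLDEST_ARTICLE days ago."""
        published = getattr(entry, 'published_parsed', None)
        if OLDEST_ARTICLE <= 0 or not published:
            return False  # no limit configured, or the feed gives no date
        tnow = tnow or datetime.utcnow()
        delta = tnow - datetime(*published[0:6])
        return delta.days * 86400 + delta.seconds > 86400 * OLDEST_ARTICLE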
38 changes: 9 additions & 29 deletions calibre/__init__.py
@@ -3,29 +3,12 @@
__copyright__ = '2008, Kovid Goyal <[email protected]>'
__docformat__ = 'restructuredtext en'

import sys, os, re, time, random, warnings
import sys, os, re, time
from functools import partial

warnings.simplefilter('ignore', DeprecationWarning)
from calibre.constants import (iswindows, isosx, islinux, isfrozen,
isbsd, preferred_encoding, __appname__, __version__, __author__,
win32event, win32api, winerror, fcntl,
filesystem_encoding, plugins, config_dir)

if False and islinux and not getattr(sys, 'frozen', False):
# Imported before PyQt4 to workaround PyQt4 util-linux conflict discovered on gentoo
# See http://bugs.gentoo.org/show_bug.cgi?id=317557
# Importing uuid is slow so get rid of this at some point, maybe in a few
# years when even Debian has caught up
# Also remember to remove it from site.py in the binary builds
import uuid
uuid.uuid4()

if False:
# Prevent pyflakes from complaining
winutil, winutilerror, __appname__, islinux, __version__
fcntl, win32event, isfrozen, __author__
winerror, win32api, isbsd, config_dir
from calibre.constants import (iswindows,
preferred_encoding, __appname__, __version__, __author__,
winerror, filesystem_encoding, plugins, config_dir)

_mt_inited = False
def _init_mimetypes():
@@ -91,10 +74,6 @@ def unicode_path(path, abs=False):
def confirm_config_name(name):
return name + '_again'

_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))

def sanitize_file_name(name, substitute='_', as_unicode=False):
'''
Sanitize the filename `name`. All invalid characters are replaced by `substitute`.
@@ -107,6 +86,8 @@ def sanitize_file_name(name, substitute='_', as_unicode=False):
'''
if isinstance(name, unicode):
name = name.encode(filesystem_encoding, 'ignore')
_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')

one = _filename_sanitize.sub(substitute, name)
one = re.sub(r'\s', ' ', one).strip()
bname, ext = os.path.splitext(one)
@@ -133,6 +114,8 @@ def sanitize_file_name_unicode(name, substitute='_'):
'''
if isbytestring(name):
return sanitize_file_name(name, substitute=substitute, as_unicode=True)
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
chars = [substitute if c in _filename_sanitize_unicode else c for c in
name]
one = u''.join(chars)
@@ -208,9 +191,6 @@ def prints(*args, **kwargs):
file.write(bytes(sep))
file.write(bytes(end))

class CommandLineError(Exception):
pass

def filename_to_utf8(name):
'''Return C{name} encoded in utf8. Unhandled characters are replaced. '''
if isinstance(name, unicode):
@@ -245,11 +225,11 @@ def __exit__(self, *args):


relpath = os.path.relpath
_spat = re.compile(r'^the\s+|^a\s+|^an\s+', re.IGNORECASE)
def english_sort(x, y):
'''
Compare two English phrases ignoring starting prepositions.
'''
_spat = re.compile(r'^the\s+|^a\s+|^an\s+', re.IGNORECASE)
return cmp(_spat.sub('', x), _spat.sub('', y))

def walk(dir):
Expand Down
Binary file modified calibre/__init__.pyc
Binary file not shown.
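
Among the calibre/__init__.py changes above, the _filename_sanitize, _filename_sanitize_unicode and _spat regexes move from module scope into the functions that use them, presumably to keep module import cheap in this trimmed-down App Engine port; a hypothetical micro-example of that trade-off (names are illustrative only):

    import re

    # Module-level compile: the cost is paid once, at import time.
    _SPACES = re.compile(r'\s+')

    def squash_module_level(text):
        return _SPACES.sub(' ', text)

    def squash_in_function(text):
        # In-function compile: import stays cheap, but the pattern is
        # rebuilt on every call, as in the edited sanitize helpers.
        pat = re.compile(r'\s+')
        return pat.sub(' ', text)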
35 changes: 6 additions & 29 deletions calibre/constants.py
@@ -9,36 +9,18 @@
Various run time constants.
'''

import sys, locale, codecs
import sys, codecs

iswindows = False
isosx = False
isnewosx = False
isfreebsd = False
isnetbsd = False
isdragonflybsd = False
isbsd = False
islinux = False
isfrozen = False
isunix = False
isportable = False
ispy3 = False
isxp = False
is64bit = False
isworker = False

try:
preferred_encoding = locale.getpreferredencoding()
codecs.lookup(preferred_encoding)
except:
preferred_encoding = 'utf-8'
#try:
# preferred_encoding = locale.getpreferredencoding()
# codecs.lookup(preferred_encoding)
#except:
preferred_encoding = 'utf-8'

iswindows = False
win32event = None
winerror = None
win32api = None
fcntl = None #if iswindows else importlib.import_module('fcntl')

_osx_ver = None

filesystem_encoding = sys.getfilesystemencoding()
@@ -48,15 +30,10 @@
try:
if codecs.lookup(filesystem_encoding).name == 'ascii':
filesystem_encoding = 'utf-8'
# On linux, unicode arguments to os file functions are coerced to an ascii
# bytestring if sys.getfilesystemencoding() == 'ascii', which is
# just plain dumb. This is fixed by the icu.py module which, when
# imported changes ascii to utf-8
except:
filesystem_encoding = 'utf-8'

DEBUG = False
_cache_dir = None
plugins = None
CONFIG_DIR_MODE = 0700
config_dir = ""
Binary file modified calibre/constants.pyc
Binary file not shown.
8 changes: 4 additions & 4 deletions calibre/customize/__init__.py
@@ -43,10 +43,10 @@ class Plugin(object): # {{{
version = (1, 0, 0)

#: A short string describing what this plugin does
description = _('Does absolutely nothing')
description = 'Does absolutely nothing'

#: The author of this plugin
author = _('Unknown')
author = 'Unknown'

#: When more than one plugin exists for a filetype,
#: the plugins are run in order of decreasing priority
@@ -64,7 +64,7 @@ class Plugin(object): # {{{

#: The type of this plugin. Used for categorizing plugins in the
#: GUI
type = _('Base')
type = 'Base'

def __init__(self, plugin_path):
self.plugin_path = plugin_path
@@ -196,7 +196,7 @@ class FileTypePlugin(Plugin): # {{{
#: on the final file produced by the conversion output plugin.
on_postprocess = False

type = _('File type')
type = 'File type'

def run(self, path_to_ebook):
'''
Binary file modified calibre/customize/__init__.pyc
Binary file not shown.