
Commit

1.6.1
1. Cache RSS and web-page encodings in the datastore (see the sketch after this list)
2. Optionally append the date to the book title in the metadata
3. Optionally push only articles published within a configured time window
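
A minimal, standalone sketch of the encoding cache described in item 1, assuming Google App Engine's legacy db API and the chardet package; the model and query mirror the UrlEncoding/AutoDecoder code added to books/base.py below, but the helper function itself is illustrative only, not part of the commit.

    import urlparse
    import chardet
    from google.appengine.ext import db

    class UrlEncoding(db.Model):
        # One record per site: the cached encodings of its feed and its pages.
        netloc = db.StringProperty()
        feedenc = db.StringProperty()
        pageenc = db.StringProperty()

    def cached_decode(content, url, isfeed=True):
        """Decode raw bytes, consulting the datastore before running chardet."""
        netloc = urlparse.urlsplit(url)[1]
        urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
        enc = (urlenc.feedenc if isfeed else urlenc.pageenc) if urlenc else None
        if enc:
            try:
                return content.decode(enc)
            except UnicodeDecodeError:
                pass  # stale cache entry; fall through to fresh detection
        enc = chardet.detect(content)['encoding']
        if not enc:
            return content  # detection failed; hand back the raw bytes
        record = urlenc if urlenc else UrlEncoding(netloc=netloc)
        if isfeed:
            record.feedenc = enc
        else:
            record.pageenc = enc
        record.put()  # the next fetch from this site skips chardet
        try:
            return content.decode(enc)
        except UnicodeDecodeError:
            return content  # undecodable; return the raw bytes unchanged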
cdhigh committed Aug 10, 2013
1 parent ee26e6e commit 6475e4c
Showing 27 changed files with 389 additions and 1,960 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@ oGlobo.py
oGlobo.pyc
mh_correio.gif
mh_globo.gif
cv_correio.jpg
cv_globo.jpg
105 changes: 82 additions & 23 deletions books/base.py
@@ -6,10 +6,12 @@
"""

import os, re, urllib, urlparse, random, imghdr, logging
from datetime import datetime
from urllib2 import *
import chardet
from google.appengine.api import urlfetch
from google.appengine.runtime import apiproxy_errors
from google.appengine.ext import db

from bs4 import BeautifulSoup, Comment
from lib import feedparser
@@ -22,10 +24,20 @@

MAX_ASYNC_REQUESTS = 5 # at most 10 is supported; the larger the value, the more likely an over-quota exception

class UrlEncoding(db.Model):
#Cache each site's detected encoding; once detection succeeds, it never needs to be repeated
netloc = db.StringProperty()
feedenc = db.StringProperty()
pageenc = db.StringProperty()

class AutoDecoder:
def __init__(self):
# Wraps the datastore encoding cache and a per-site cache for article encodings
# chardet is very slow, which is why this elaborate caching and special handling is needed
def __init__(self, isfeed=True):
self.encoding = None
def decode(self, content):
self.isfeed = isfeed # True: feed, False: page

def decode(self, content, url):
result = content
if not ALWAYS_CHAR_DETECT and self.encoding: # first try the encoding detected last time
try:
@@ -39,22 +51,61 @@ def decode(self, content):
result = content
else: # remember it for next time to save work
self.encoding = encoding
#also persist it to the datastore
netloc = urlparse.urlsplit(url)[1]
urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
if urlenc:
enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
if enc != encoding:
if self.isfeed:
urlenc.feedenc = encoding
else:
urlenc.pageenc = encoding
urlenc.put()
elif self.isfeed:
UrlEncoding(netloc=netloc,feedenc=encoding).put()
else:
UrlEncoding(netloc=netloc,pageenc=encoding).put()
else: # no previous encoding information yet
self.encoding = chardet.detect(content)['encoding']
netloc = urlparse.urlsplit(url)[1]
urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
if urlenc: #check the datastore first
enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
if enc:
try:
result = content.decode(enc)
except UnicodeDecodeError: # decoding failed; re-detect the encoding
self.encoding = chardet.detect(content)['encoding']
else:
self.encoding = enc
return result
else: #nothing in the datastore yet
self.encoding = chardet.detect(content)['encoding']
else:
self.encoding = chardet.detect(content)['encoding']

#decode with the detected encoding
try:
result = content.decode(self.encoding)
except UnicodeDecodeError: # decoding failed; return the content unchanged
result = content
else:
#persist to the datastore
newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc)
if self.isfeed:
newurlenc.feedenc = self.encoding
else:
newurlenc.pageenc = self.encoding
newurlenc.put()
return result

class BaseFeedBook:
""" base class of Book """
title = ''
__author__ = ''
description = ''
publisher = ''
category = ''
max_articles_per_feed = 30
oldest_article = 7 # only download articles published within this many days; 0 means no limit
host = None # some sites require a Referer header when downloading images; configure it with this parameter
network_timeout = None # None means use the default
fetch_img_via_ssl = False # when the page is served over https, whether to rewrite its image URLs to https as well
@@ -220,6 +271,8 @@ def url_unescape(self, value):
def ParseFeedUrls(self):
""" return list like [(section,title,url,desc),..] """
urls = []
tnow = datetime.utcnow()
urladded = set()
for feed in self.feeds:
section, url = feed[0], feed[1]
isfulltext = feed[2] if len(feed) > 2 else False
@@ -230,23 +283,29 @@ def ParseFeedUrls(self):
if self.feed_encoding:
feed = feedparser.parse(result.content.decode(self.feed_encoding))
else:
feed = feedparser.parse(AutoDecoder().decode(result.content))
feed = feedparser.parse(AutoDecoder(True).decode(result.content,url))

urladded = set() # guard against duplicate articles produced by some RSS feeds
for e in feed['entries'][:self.max_articles_per_feed]:
if self.oldest_article > 0 and hasattr(e, 'published_parsed'):
delta = tnow - datetime(*(e.published_parsed[0:6]))
if delta.days*86400+delta.seconds > 86400*self.oldest_article:
self.log.debug("article '%s' is too old"%e.title)
continue
#support HTTPS
urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link
if urlfeed not in urladded:
desc = None
if isfulltext:
if hasattr(e, 'content') and e.content[0].value:
desc = e.content[0].value
elif hasattr(e, 'summary'):
desc = e.summary
else:
self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title)
urls.append((section, e.title, urlfeed, desc))
urladded.add(urlfeed)
if urlfeed in urladded:
continue

desc = None
if isfulltext:
if hasattr(e, 'content') and e.content[0].value:
desc = e.content[0].value
elif hasattr(e, 'summary'):
desc = e.summary
else:
self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title)
urls.append((section, e.title, urlfeed, desc))
urladded.add(urlfeed)
else:
self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))
return urls
@@ -260,7 +319,7 @@ def Items(self, opts=None):
urls = self.ParseFeedUrls()
readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
prevsection = ''
decoder = AutoDecoder()
decoder = AutoDecoder(False)
if USE_ASYNC_URLFETCH:
asyncurls = [(i,url) for i,(_a,_b,url,desc) in enumerate(urls) if not desc]
rpcs, i = [], 0
@@ -325,7 +384,7 @@ def Items(self, opts=None):
if self.page_encoding:
article = content.decode(self.page_encoding)
else:
article = decoder.decode(content)
article = decoder.decode(content,url)

#if the item is an image, title holds its MIME type
for title, imgurl, imgfn, content, brief in readability(article,url,opts):
@@ -372,7 +431,7 @@ def fetcharticle(self, url, decoder):
if self.page_encoding:
return content.decode(self.page_encoding)
else:
return decoder.decode(content)
return decoder.decode(content,url)

def readability(self, article, url, opts=None):
#use readability-lxml to extract the full text
@@ -587,7 +646,7 @@ def Items(self, opts=None):
for images: mime, url, filename, content
"""
cnt4debug = 0
decoder = AutoDecoder()
decoder = AutoDecoder(False)
timeout = self.timeout
for section, url in self.feeds:
cnt4debug += 1
@@ -604,7 +663,7 @@
if self.page_encoding:
content = content.decode(self.page_encoding)
else:
content = decoder.decode(content)
content = decoder.decode(content,url)

content = self.preprocess(content)
soup = BeautifulSoup(content, "lxml")
Binary file modified books/base.pyc
Binary file not shown.
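
Item 3 of the commit message is implemented by the oldest_article check added to ParseFeedUrls above; a minimal sketch of that age test, assuming feedparser-style entries that expose a published_parsed time tuple (the helper name and constant are illustrative, not from the repo):

    from datetime import datetime

    OLDEST_ARTICLE = 7  # days; 0 disables the age check, as in BaseFeedBook

    def is_too_old(entry, tnow=None):
        """True if a feed entry was published more than OLDEST_ARTICLE days ago."""
        published = getattr(entry, 'published_parsed', None)
        if OLDEST_ARTICLE <= 0 or not published:
            return False  # no limit configured, or the feed gives no date
        tnow = tnow or datetime.utcnow()
        delta = tnow - datetime(*published[0:6])
        return delta.days * 86400 + delta.seconds > 86400 * OLDEST_ARTICLE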
38 changes: 9 additions & 29 deletions calibre/__init__.py
@@ -3,29 +3,12 @@
__copyright__ = '2008, Kovid Goyal <[email protected]>'
__docformat__ = 'restructuredtext en'

import sys, os, re, time, random, warnings
import sys, os, re, time
from functools import partial

warnings.simplefilter('ignore', DeprecationWarning)
from calibre.constants import (iswindows, isosx, islinux, isfrozen,
isbsd, preferred_encoding, __appname__, __version__, __author__,
win32event, win32api, winerror, fcntl,
filesystem_encoding, plugins, config_dir)

if False and islinux and not getattr(sys, 'frozen', False):
# Imported before PyQt4 to workaround PyQt4 util-linux conflict discovered on gentoo
# See http://bugs.gentoo.org/show_bug.cgi?id=317557
# Importing uuid is slow so get rid of this at some point, maybe in a few
# years when even Debian has caught up
# Also remember to remove it from site.py in the binary builds
import uuid
uuid.uuid4()

if False:
# Prevent pyflakes from complaining
winutil, winutilerror, __appname__, islinux, __version__
fcntl, win32event, isfrozen, __author__
winerror, win32api, isbsd, config_dir
from calibre.constants import (iswindows,
preferred_encoding, __appname__, __version__, __author__,
winerror, filesystem_encoding, plugins, config_dir)

_mt_inited = False
def _init_mimetypes():
@@ -91,10 +74,6 @@ def unicode_path(path, abs=False):
def confirm_config_name(name):
return name + '_again'

_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))

def sanitize_file_name(name, substitute='_', as_unicode=False):
'''
Sanitize the filename `name`. All invalid characters are replaced by `substitute`.
@@ -107,6 +86,8 @@ def sanitize_file_name(name, substitute='_', as_unicode=False):
'''
if isinstance(name, unicode):
name = name.encode(filesystem_encoding, 'ignore')
_filename_sanitize = re.compile(r'[\xae\0\\|\?\*<":>\+/]')

one = _filename_sanitize.sub(substitute, name)
one = re.sub(r'\s', ' ', one).strip()
bname, ext = os.path.splitext(one)
@@ -133,6 +114,8 @@ def sanitize_file_name_unicode(name, substitute='_'):
'''
if isbytestring(name):
return sanitize_file_name(name, substitute=substitute, as_unicode=True)
_filename_sanitize_unicode = frozenset([u'\\', u'|', u'?', u'*', u'<',
u'"', u':', u'>', u'+', u'/'] + list(map(unichr, xrange(32))))
chars = [substitute if c in _filename_sanitize_unicode else c for c in
name]
one = u''.join(chars)
@@ -208,9 +191,6 @@ def prints(*args, **kwargs):
file.write(bytes(sep))
file.write(bytes(end))

class CommandLineError(Exception):
pass

def filename_to_utf8(name):
'''Return C{name} encoded in utf8. Unhandled characters are replaced. '''
if isinstance(name, unicode):
@@ -245,11 +225,11 @@ def __exit__(self, *args):


relpath = os.path.relpath
_spat = re.compile(r'^the\s+|^a\s+|^an\s+', re.IGNORECASE)
def english_sort(x, y):
'''
Compare two English phrases ignoring starting prepositions.
'''
_spat = re.compile(r'^the\s+|^a\s+|^an\s+', re.IGNORECASE)
return cmp(_spat.sub('', x), _spat.sub('', y))

def walk(dir):
Expand Down
Binary file modified calibre/__init__.pyc
Binary file not shown.
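
Among the calibre/__init__.py changes above, the _filename_sanitize, _filename_sanitize_unicode and _spat regexes move from module scope into the functions that use them, presumably to keep module import cheap in this trimmed-down App Engine port; a hypothetical micro-example of that trade-off (names are illustrative only):

    import re

    # Module-level compile: the cost is paid once, at import time.
    _SPACES = re.compile(r'\s+')

    def squash_module_level(text):
        return _SPACES.sub(' ', text)

    def squash_in_function(text):
        # In-function compile: import stays cheap, but the pattern is
        # rebuilt on every call, as in the edited sanitize helpers.
        pat = re.compile(r'\s+')
        return pat.sub(' ', text)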
35 changes: 6 additions & 29 deletions calibre/constants.py
@@ -9,36 +9,18 @@
Various run time constants.
'''

import sys, locale, codecs
import sys, codecs

iswindows = False
isosx = False
isnewosx = False
isfreebsd = False
isnetbsd = False
isdragonflybsd = False
isbsd = False
islinux = False
isfrozen = False
isunix = False
isportable = False
ispy3 = False
isxp = False
is64bit = False
isworker = False

try:
preferred_encoding = locale.getpreferredencoding()
codecs.lookup(preferred_encoding)
except:
preferred_encoding = 'utf-8'
#try:
# preferred_encoding = locale.getpreferredencoding()
# codecs.lookup(preferred_encoding)
#except:
preferred_encoding = 'utf-8'

iswindows = False
win32event = None
winerror = None
win32api = None
fcntl = None #if iswindows else importlib.import_module('fcntl')

_osx_ver = None

filesystem_encoding = sys.getfilesystemencoding()
@@ -48,15 +30,10 @@
try:
if codecs.lookup(filesystem_encoding).name == 'ascii':
filesystem_encoding = 'utf-8'
# On linux, unicode arguments to os file functions are coerced to an ascii
# bytestring if sys.getfilesystemencoding() == 'ascii', which is
# just plain dumb. This is fixed by the icu.py module which, when
# imported changes ascii to utf-8
except:
filesystem_encoding = 'utf-8'

DEBUG = False
_cache_dir = None
plugins = None
CONFIG_DIR_MODE = 0700
config_dir = ""
Binary file modified calibre/constants.pyc
Binary file not shown.
8 changes: 4 additions & 4 deletions calibre/customize/__init__.py
@@ -43,10 +43,10 @@ class Plugin(object): # {{{
version = (1, 0, 0)

#: A short string describing what this plugin does
description = _('Does absolutely nothing')
description = 'Does absolutely nothing'

#: The author of this plugin
author = _('Unknown')
author = 'Unknown'

#: When more than one plugin exists for a filetype,
#: the plugins are run in order of decreasing priority
@@ -64,7 +64,7 @@ class Plugin(object): # {{{

#: The type of this plugin. Used for categorizing plugins in the
#: GUI
type = _('Base')
type = 'Base'

def __init__(self, plugin_path):
self.plugin_path = plugin_path
@@ -196,7 +196,7 @@ class FileTypePlugin(Plugin): # {{{
#: on the final file produced by the conversion output plugin.
on_postprocess = False

type = _('File type')
type = 'File type'

def run(self, path_to_ebook):
'''
Binary file modified calibre/customize/__init__.pyc
Binary file not shown.