normalize feed

feedreader · Sep 19, 2013 · 813461c · 813461c
1 parent da4a9a6
commit 813461c
Show file tree

Hide file tree

Showing 6 changed files with 292 additions and 96 deletions.
diff --git a/Manifest.txt b/Manifest.txt
@@ -23,6 +23,7 @@ lib/pluto/server/views/layout.erb
 lib/pluto/server/views/sites.erb
 lib/pluto/server/views/timeline.erb
 lib/pluto/updater.rb
+lib/pluto/utils.rb
 lib/pluto/version.rb
 templates/blank.html.erb
 templates/blank.top.html.erb

diff --git a/lib/pluto.rb b/lib/pluto.rb
@@ -33,6 +33,9 @@
 require 'pluto/schema'
 require 'pluto/models'
 require 'pluto/connecter'
+
+require 'pluto/utils'
+
 require 'pluto/updater'
 require 'pluto/fetcher'
 require 'pluto/formatter'

diff --git a/lib/pluto/updater.rb b/lib/pluto/updater.rb
@@ -42,19 +42,6 @@ def fetch_feed( url )
   end
 
 
-  def parse_feed( xml )
-    parser = RSS::Parser.new( xml )
-    parser.do_validate            = false
-    parser.ignore_unknown_element = true
-
-    puts "Parsing feed..."
-    feed = parser.parse
-
-    puts "  feed.class=#{feed.class.name}"
-    feed
-  end
-
-
   def update_subscriptions( config, opts={} )
 
     ## for now - use single site w/ key planet  -- fix!! allow multiple sites (planets)
@@ -138,29 +125,25 @@ def update_feeds( opts={} )
 
       puts "Before parsing feed >#{feed_key}<..."
 
-      feed = parse_feed( feed_xml )
-
-      if feed.class == RSS::Atom::Feed
-        puts "== #{feed.title.content} =="
-      else  ## assume RSS::Rss::Feed
-        puts "==  #{feed.channel.title} =="
-      end
+      feed = FeedUtils::Parser.new( feed_xml ).parse
 
       feed.items.each do |item|
-        if feed.class == RSS::Atom::Feed
-          item_attribs = handle_feed_item_atom( item )
-        else  ## assume RSS::Rss::Feed
-          item_attribs = handle_feed_item_rss( item )
-        end
 
-        # add feed_id fk_ref
-        item_attribs[ :feed_id ] = feed_rec.id
+        item_attribs = {
+          title:        item.title,
+          url:          item.url,
+          content:      item.content,
+          published_at: item.published,
+          feed_id:      feed_rec.id    # add feed_id fk_ref
+        }
 
-        rec = Item.find_by_guid( item_attribs[ :guid ] )
+        rec = Item.find_by_guid( item.guid )
         if rec.nil?
           rec      = Item.new
+          item_attribs[ :guid ] = item.guid
           puts "** NEW"
         else
+          ## todo: check if any attribs changed
           puts "UPDATE"
         end
 
@@ -172,74 +155,7 @@ def update_feeds( opts={} )
   end # method run
 
 
-  def handle_feed_item_atom( item )
-
-        ## todo: if content.content empty use summary for example
-        item_attribs = {
-          title:        item.title.content,
-          url:          item.link.href,
-          published_at: item.updated.content.utc.strftime( "%Y-%m-%d %H:%M" ),
-          # content:   item.content.content,
-        }
-
-        item_attribs[ :guid ] = item.id.content
-
-        if item.summary
-          item_attribs[ :content ] = item.summary.content
-        else
-          if item.content
-            text  = item.content.content.dup
-            ## strip all html tags
-            text = text.gsub( /<[^>]+>/, '' )
-            text = text[ 0..400 ] # get first 400 chars
-            ## todo: check for length if > 400 add ... at the end???
-            item_attribs[ :content ] = text
-          end
-        end
-
-        puts "- #{item.title.content}"
-        puts "  link >#{item.link.href}<"
-        puts "  id (~guid) >#{item.id.content}<"
-
-        ### todo: use/try published first? why? why not?
-        puts "  updated (~pubDate) >#{item.updated.content}< >#{item.updated.content.utc.strftime( "%Y-%m-%d %H:%M" )}< : #{item.updated.content.class.name}"
-        puts
-
-        # puts "*** dump item:"
-        # pp item
-
-        item_attribs
-  end
-
-  def handle_feed_item_rss( item )
-
-       item_attribs = {
-          title:        item.title,
-          url:          item.link,
-          published_at: item.pubDate.utc.strftime( "%Y-%m-%d %H:%M" ),
-          # content:  item.content_encoded,
-        }
-
-        # if item.content_encoded.nil?
-          # puts " using description for content"
-          item_attribs[ :content ] = item.description
-        # end
-
-        item_attribs[ :guid ] = item.guid.content
-
-        puts "- #{item.title}"
-        puts "  link (#{item.link})"
-        puts "  guid (#{item.guid.content})"
-        puts "  pubDate >#{item.pubDate}< >#{item.pubDate.utc.strftime( "%Y-%m-%d %H:%M" )}< : #{item.pubDate.class.name}"
-        puts
-
-        # puts "*** dump item:"
-        # pp item
-
-        item_attribs
-  end
-
-
+
 end # class Fetcher
 
 end # module Pluto
diff --git a/lib/pluto/utils.rb b/lib/pluto/utils.rb
@@ -0,0 +1,190 @@
+
+###########################
+# todo: move to feedutils
+
+module FeedUtils
+
+
+  class Feed
+    attr_accessor :object
+
+    attr_accessor :format   # e.g. atom|rss 2.0|etc.
+    attr_accessor :title
+    attr_accessor :title_type  # e.g. text|html  (optional) -use - why?? why not??
+
+    attr_accessor :items
+
+    def self.create( feed_wild )
+
+      puts "  feed.class=#{feed_wild.class.name}"
+
+      if feed_wild.class == RSS::Atom::Feed   ## fix: use feed_wild.kind_of?( RSS::Atom::Feed )
+        feed = self.create_from_atom( feed_wild )
+      else  ## assume RSS::Rss::Feed
+        feed = self.create_from_rss( feed_wild )
+      end
+
+      puts "== #{feed.format} / #{feed.title} =="
+      feed
+    end
+
+    def self.create_from_atom( atom_feed )
+      feed = Feed.new
+      feed.object = atom_feed
+      feed.title  = atom_feed.title.content
+      feed.format = 'atom'
+
+      items = []
+      atom_feed.items.each do |atom_item|
+        items << Item.create_from_atom( atom_item )
+      end
+      feed.items = items
+
+      feed # return new feed
+    end
+
+    def self.create_from_rss( rss_feed )
+      feed = Feed.new
+      feed.object = rss_feed
+      feed.title  = rss_feed.channel.title
+      feed.format = "rss #{rss_feed.rss_version}"
+
+      items = []
+      rss_feed.items.each do |rss_item|
+        items << Item.create_from_rss( rss_item )
+      end
+      feed.items = items
+
+      feed # return new feed
+    end
+
+  end  # class Feed
+
+
+
+  class Item
+    attr_accessor :object   # original object (e.g. RSS item or Atom entry etc.)
+
+    attr_accessor :title
+    attr_accessor :title_type    # optional for now (text|html) - not yet set
+    attr_accessor :url      # todo: rename to link (use alias) ??
+    attr_accessor :content
+    attr_accessor :content_type  # optional for now (text|html) - not yet set
+
+## todo: add summary (alias description)  ???
+## todo: add author/authors
+## todo: add category/categories
+
+    attr_accessor :updated
+    attr_accessor :published
+
+    attr_accessor :guid     # todo: rename to id (use alias) ??
+
+
+    def self.create_from_atom( atom_item )
+      item = self.new   # Item.new
+      item.object = atom_item
+
+      item.title     = atom_item.title.content
+      item.url       = atom_item.link.href
+
+      ## todo: check if updated or published present
+      #    set 
+      item.updated    =  atom_item.updated.content.utc.strftime( "%Y-%m-%d %H:%M" )
+      item.published  =  item.updated  # fix: check if published is set
+
+      item.guid       =  atom_item.id.content
+
+
+      # todo: move logic to updater or something
+      #  - not part of normalize
+
+      if atom_item.summary
+        item.content = atom_item.summary.content
+      else
+        if atom_item.content
+          text  = atom_item.content.content.dup
+          ## strip all html tags
+          text = text.gsub( /<[^>]+>/, '' )
+          text = text[ 0..400 ] # keep chars 0..400 (inclusive range, i.e. up to 401 chars)
+          ## todo: check for length if > 400 add ... at the end???
+          item.content = text
+        end
+      end
+
+      puts "- #{atom_item.title.content}"
+      puts "  link >#{atom_item.link.href}<"
+      puts "  id (~guid) >#{atom_item.id.content}<"
+
+      ### todo: use/try published first? why? why not?
+      puts "  updated (~pubDate) >#{atom_item.updated.content}< >#{atom_item.updated.content.utc.strftime( "%Y-%m-%d %H:%M" )}< : #{atom_item.updated.content.class.name}"
+      puts
+
+      # puts "*** dump item:"
+      # pp item
+
+      item
+    end # method create_from_atom
+
+    def self.create_from_rss( rss_item )
+
+      item = self.new    # Item.new
+      item.object = rss_item
+
+      item.title     = rss_item.title
+      item.url       = rss_item.link
+
+      ## todo: check if updated or published present
+      #    set 
+      item.published = rss_item.pubDate.utc.strftime( "%Y-%m-%d %H:%M" )
+      item.updated   = item.published
+
+      # content:  item.content_encoded,
+
+        # if item.content_encoded.nil?
+          # puts " using description for content"
+
+      item.content  = rss_item.description
+        # end
+
+      item.guid     = rss_item.guid.content
+
+      puts "- #{rss_item.title}"
+      puts "  link (#{rss_item.link})"
+      puts "  guid (#{rss_item.guid.content})"
+      puts "  pubDate >#{rss_item.pubDate}< >#{rss_item.pubDate.utc.strftime( "%Y-%m-%d %H:%M" )}< : #{rss_item.pubDate.class.name}"
+      puts
+
+      # puts "*** dump item:"
+      # pp item
+
+      item
+    end # method create_from_rss
+
+  end  # class Item
+
+
+  class Parser
+
+    ### Note: let's keep/use the same API as RSS::Parser for now
+    def initialize( xml )
+      @xml = xml
+    end
+
+    def parse
+
+      parser = RSS::Parser.new( @xml )
+      parser.do_validate            = false
+      parser.ignore_unknown_element = true
+
+      puts "Parsing feed..."
+      feed_wild = parser.parse  # not yet normalized
+
+      feed = Feed.create( feed_wild )
+      feed # return new (normalized) feed
+    end
+
+  end  # class Parser
+
+
+end # module FeedUtils