Skip to content

Commit

Permalink
normalize feed
Browse files Browse the repository at this point in the history
  • Loading branch information
geraldb committed Sep 19, 2013
1 parent da4a9a6 commit 813461c
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 96 deletions.
1 change: 1 addition & 0 deletions Manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ lib/pluto/server/views/layout.erb
lib/pluto/server/views/sites.erb
lib/pluto/server/views/timeline.erb
lib/pluto/updater.rb
lib/pluto/utils.rb
lib/pluto/version.rb
templates/blank.html.erb
templates/blank.top.html.erb
Expand Down
3 changes: 3 additions & 0 deletions lib/pluto.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
require 'pluto/schema'
require 'pluto/models'
require 'pluto/connecter'

require 'pluto/utils'

require 'pluto/updater'
require 'pluto/fetcher'
require 'pluto/formatter'
Expand Down
108 changes: 12 additions & 96 deletions lib/pluto/updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,6 @@ def fetch_feed( url )
end


# Parses raw feed XML with RSS::Parser and returns whatever the parser
# produces (not normalized in any way).
#
# xml - the feed source handed straight to RSS::Parser.new
#
# The parser is configured with validation switched off and unknown
# elements ignored before parsing. The class name of the result is logged.
def parse_feed( xml )
  rss_parser = RSS::Parser.new( xml )
  rss_parser.do_validate            = false
  rss_parser.ignore_unknown_element = true

  puts "Parsing feed..."

  # tap lets us log the result's class and still return the result itself
  rss_parser.parse.tap do |parsed|
    puts " feed.class=#{parsed.class.name}"
  end
end


def update_subscriptions( config, opts={} )

## for now - use single site w/ key planet -- fix!! allow multiple sites (planets)
Expand Down Expand Up @@ -138,29 +125,25 @@ def update_feeds( opts={} )

puts "Before parsing feed >#{feed_key}<..."

feed = parse_feed( feed_xml )

if feed.class == RSS::Atom::Feed
puts "== #{feed.title.content} =="
else ## assume RSS::Rss::Feed
puts "== #{feed.channel.title} =="
end
feed = FeedUtils::Parser.new( feed_xml ).parse

feed.items.each do |item|
if feed.class == RSS::Atom::Feed
item_attribs = handle_feed_item_atom( item )
else ## assume RSS::Rss::Feed
item_attribs = handle_feed_item_rss( item )
end

# add feed_id fk_ref
item_attribs[ :feed_id ] = feed_rec.id
item_attribs = {
title: item.title,
url: item.url,
content: item.content,
published_at: item.published,
feed_id: feed_rec.id # add feed_id fk_ref
}

rec = Item.find_by_guid( item_attribs[ :guid ] )
rec = Item.find_by_guid( item.guid )
if rec.nil?
rec = Item.new
item_attribs[ :guid ] = item.guid
puts "** NEW"
else
## todo: check if any attribs changed
puts "UPDATE"
end

Expand All @@ -172,74 +155,7 @@ def update_feeds( opts={} )
end # method run


# Maps a parsed Atom entry onto the attribute hash that update_feeds
# assigns to an Item record.
#
# item - entry object; title, link, updated and id are read unconditionally,
#        summary and content only when present
#
# Returns a Hash with :title, :url, :published_at, :guid and - when the
# entry carries a summary or content - :content. The summary wins; otherwise
# the content is stripped of html tags and cut down to its first 400 chars.
def handle_feed_item_atom( item )
  ## todo: if content.content empty use summary for example
  item_attribs = {
    title:        item.title.content,
    url:          item.link.href,
    published_at: item.updated.content.utc.strftime( "%Y-%m-%d %H:%M" ),
    guid:         item.id.content
  }

  if item.summary
    item_attribs[ :content ] = item.summary.content
  elsif item.content
    ## strip all html tags
    text = item.content.content.gsub( /<[^>]+>/, '' )
    ## keep exactly the first 400 chars (a 0..400 range would keep 401)
    ## todo: check for length if > 400 add ... at the end???
    item_attribs[ :content ] = text[ 0, 400 ]
  end

  puts "- #{item.title.content}"
  puts " link >#{item.link.href}<"
  puts " id (~guid) >#{item.id.content}<"

  ### todo: use/try published first? why? why not?
  puts " updated (~pubDate) >#{item.updated.content}< >#{item_attribs[ :published_at ]}< : #{item.updated.content.class.name}"
  puts

  # puts "*** dump item:"
  # pp item

  item_attribs
end

# Maps a parsed RSS item onto the attribute hash that update_feeds
# assigns to an Item record.
#
# item - item object responding to title, link, pubDate, description, guid
#
# Returns a Hash with :title, :url, :published_at, :content and :guid.
#
# guid and pubDate are optional elements in RSS 2.0, so both are guarded:
# a missing pubDate yields a nil :published_at and a missing guid falls
# back to the item link (instead of raising NoMethodError on nil).
def handle_feed_item_rss( item )
  pub_date     = item.pubDate
  published_at = pub_date && pub_date.utc.strftime( "%Y-%m-%d %H:%M" )

  item_attribs = {
    title:        item.title,
    url:          item.link,
    published_at: published_at,
    # content: item.content_encoded,
  }

  # if item.content_encoded.nil?
  # puts " using description for content"
  item_attribs[ :content ] = item.description
  # end

  guid = item.guid
  item_attribs[ :guid ] = guid ? guid.content : item.link

  puts "- #{item.title}"
  puts " link (#{item.link})"
  puts " guid (#{item_attribs[ :guid ]})"
  puts " pubDate >#{item.pubDate}< >#{published_at}< : #{item.pubDate.class.name}"
  puts

  # puts "*** dump item:"
  # pp item

  item_attribs
end



end # class Fetcher

end # module Pluto
190 changes: 190 additions & 0 deletions lib/pluto/utils.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@

###########################
# todo: move to feedutils

module FeedUtils


# Normalized feed: one common shape for the atom and rss feed objects
# handed over by the parser.
class Feed
  attr_accessor :object      # original (not normalized) feed object

  attr_accessor :format      # e.g. atom|rss 2.0|etc.
  attr_accessor :title
  attr_accessor :title_type  # e.g. text|html (optional) -use - why?? why not??

  attr_accessor :items       # array of normalized items (see Item)

  # Builds a normalized feed from a parsed ("wild") feed object.
  #
  # feed_wild - an RSS::Atom::Feed (or subclass) is treated as atom; anything
  #             else is assumed to be an rss feed responding to channel,
  #             rss_version and items
  #
  # Returns the new Feed.
  def self.create( feed_wild )
    puts " feed.class=#{feed_wild.class.name}"

    # is_a? (not class ==) so that subclasses of the atom feed class
    # are still routed to the atom branch
    feed = if feed_wild.is_a?( RSS::Atom::Feed )
             create_from_atom( feed_wild )
           else ## assume rss feed
             create_from_rss( feed_wild )
           end

    puts "== #{feed.format} / #{feed.title} =="
    feed
  end

  # Builds a normalized feed (format 'atom') from an atom feed object;
  # every entry is converted with Item.create_from_atom.
  def self.create_from_atom( atom_feed )
    feed = Feed.new
    feed.object = atom_feed
    feed.title  = atom_feed.title.content
    feed.format = 'atom'
    feed.items  = atom_feed.items.map { |atom_item| Item.create_from_atom( atom_item ) }

    feed # return new feed
  end

  # Builds a normalized feed (format "rss <version>") from an rss feed
  # object; every item is converted with Item.create_from_rss.
  def self.create_from_rss( rss_feed )
    feed = Feed.new
    feed.object = rss_feed
    feed.title  = rss_feed.channel.title
    feed.format = "rss #{rss_feed.rss_version}"
    feed.items  = rss_feed.items.map { |rss_item| Item.create_from_rss( rss_item ) }

    feed # return new feed
  end

end # class Feed



# Normalized feed item: one common shape for atom entries and rss items.
class Item
  TIMESTAMP_FORMAT   = "%Y-%m-%d %H:%M"  # format of the updated/published strings
  MAX_EXCERPT_LENGTH = 400               # chars kept when excerpting atom content

  attr_accessor :object        # original object (e.g. RSS item or ATOM entry etc.)

  attr_accessor :title
  attr_accessor :title_type    # optional for now (text|html) - not yet set
  attr_accessor :url           # todo: rename to link (use alias) ??
  attr_accessor :content
  attr_accessor :content_type  # optional for now (text|html) - not yet set

  ## todo: add summary (alias description) ???
  ## todo: add author/authors
  ## todo: add category/categories

  attr_accessor :updated       # utc timestamp string (see TIMESTAMP_FORMAT)
  attr_accessor :published     # utc timestamp string (see TIMESTAMP_FORMAT)

  attr_accessor :guid          # todo: rename to id (use alias) ??


  # Builds a normalized item from an atom entry.
  #
  # atom_item - entry object; title, link, updated and id are read
  #             unconditionally, summary and content only when present
  #
  # content is the entry summary if there is one; otherwise the entry
  # content with html tags stripped, cut to MAX_EXCERPT_LENGTH chars;
  # otherwise it stays nil.
  #
  # Returns the new Item.
  def self.create_from_atom( atom_item )
    item = new
    item.object = atom_item

    item.title = atom_item.title.content
    item.url   = atom_item.link.href

    ## todo: check if updated or published present
    item.updated   = atom_item.updated.content.utc.strftime( TIMESTAMP_FORMAT )
    item.published = item.updated # fix: check if published set

    item.guid = atom_item.id.content

    # todo: move logic to updater or something
    #   - not part of normalize
    if atom_item.summary
      item.content = atom_item.summary.content
    elsif atom_item.content
      ## strip all html tags
      text = atom_item.content.content.gsub( /<[^>]+>/, '' )
      ## (start, length) keeps exactly 400 chars; a 0..400 range would keep 401
      ## todo: check for length if > 400 add ... at the end???
      item.content = text[ 0, MAX_EXCERPT_LENGTH ]
    end

    puts "- #{atom_item.title.content}"
    puts " link >#{atom_item.link.href}<"
    puts " id (~guid) >#{atom_item.id.content}<"

    ### todo: use/try published first? why? why not?
    puts " updated (~pubDate) >#{atom_item.updated.content}< >#{item.updated}< : #{atom_item.updated.content.class.name}"
    puts

    # puts "*** dump item:"
    # pp item

    item
  end # method create_from_atom

  # Builds a normalized item from an rss item.
  #
  # rss_item - item object responding to title, link, pubDate,
  #            description and guid
  #
  # guid and pubDate are optional elements in RSS 2.0, so both are guarded:
  # without a pubDate published/updated stay nil, without a guid the item
  # link is used as guid (instead of raising NoMethodError on nil).
  #
  # Returns the new Item.
  def self.create_from_rss( rss_item )
    item = new
    item.object = rss_item

    item.title = rss_item.title
    item.url   = rss_item.link

    ## todo: check if updated or published present
    pub_date = rss_item.pubDate
    item.published = pub_date && pub_date.utc.strftime( TIMESTAMP_FORMAT )
    item.updated   = item.published

    # content: item.content_encoded,

    # if item.content_encoded.nil?
    # puts " using description for content"
    item.content = rss_item.description
    # end

    guid = rss_item.guid
    item.guid = guid ? guid.content : rss_item.link

    puts "- #{rss_item.title}"
    puts " link (#{rss_item.link})"
    puts " guid (#{item.guid})"
    puts " pubDate >#{rss_item.pubDate}< >#{item.published}< : #{rss_item.pubDate.class.name}"
    puts

    # puts "*** dump item:"
    # pp item

    item
  end # method create_from_rss

end # class Item


# Thin wrapper around RSS::Parser that hands back a normalized Feed
# instead of the raw parser result.
class Parser

  ### Note: lets keep/use same API as RSS::Parser for now

  # xml - the feed source, passed on unchanged to RSS::Parser.new
  def initialize( xml )
    @xml = xml
  end

  # Parses the xml with validation switched off and unknown elements
  # ignored, then normalizes the result via Feed.create.
  #
  # Returns the new (normalized) feed.
  def parse
    rss_parser = RSS::Parser.new( @xml )
    rss_parser.do_validate            = false
    rss_parser.ignore_unknown_element = true

    puts "Parsing feed..."

    # the raw parser result is not yet normalized - Feed.create does that
    Feed.create( rss_parser.parse )
  end

end # class Parser


end # module FeedUtils
Loading

0 comments on commit 813461c

Please sign in to comment.