Skip to content

Commit

Permalink
added tags
Browse files Browse the repository at this point in the history
  • Loading branch information
jkraemer committed Oct 23, 2006
1 parent 6c2e1cd commit ebd6386
Show file tree
Hide file tree
Showing 13 changed files with 463 additions and 255 deletions.
7 changes: 7 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
0.3.2
- make RDig compatible with Ferret 0.10.x
- won't work any more with Ferret 0.9.x and before

0.3.1
- Bug fix release: fixed handling of unparseable URLs

0.3.0
- file system crawling
- optional url rewriting before indexing, e.g. for linking to results
Expand Down
2 changes: 1 addition & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ to help building a site search for web sites or intranets. Internally,
Ferret is used for the full text indexing. After creating a config file
for your site, the index can be built with a single call to rdig.

RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
RDig depends on Ferret (>= 0.10.0) and the RubyfulSoup library (>= 1.0.4).

== basic usage

Expand Down
22 changes: 15 additions & 7 deletions lib/rdig.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#++
#

RDIGVERSION = '0.3.1'
RDIGVERSION = '0.3.2'


require 'thread'
Expand All @@ -42,15 +42,13 @@

begin
require 'ferret'
require 'rubyful_soup'
rescue LoadError
require 'rubygems'
require 'ferret'
require 'rubyful_soup'
end

require 'htmlentities/htmlentities'

require 'rdig/content_extractors'
require 'rdig/url_filters'
require 'rdig/search'
Expand Down Expand Up @@ -124,8 +122,17 @@ def configuration
:wait_before_leave => 10
),
:content_extraction => OpenStruct.new(
# settings for html content extraction
:html => OpenStruct.new(
:hpricot => OpenStruct.new(
# css selector for the element containing the page title
:title_tag_selector => 'title',
# might also be a proc returning either an element or a string:
# :title_tag_selector => lambda { |hpricot_doc| ... }
:content_tag_selector => 'body'
# might also be a proc returning either an element or a string:
# :content_tag_selector => lambda { |hpricot_doc| ... }
),
# settings for html content extraction (RubyfulSoup)
:rubyful_soup => OpenStruct.new(
# select the html element that contains the content to index
# by default, we index all inside the body tag:
:content_tag_selector => lambda { |tagsoup|
Expand All @@ -142,7 +149,8 @@ def configuration
:create => true,
:handle_parse_errors => true,
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
:occur_default => Ferret::Search::BooleanClause::Occur::MUST
:occur_default => :must,
:default_field => '*'
)
)
end
Expand Down
231 changes: 6 additions & 225 deletions lib/rdig/content_extractors.rb
Original file line number Diff line number Diff line change
@@ -1,26 +1,3 @@
# override some methods concerned with entity resolving
# to convert them to strings
class BeautifulStoneSoup
  # resolve unknown html entities using the htmlentities lib
  alias :orig_unknown_entityref :unknown_entityref
  def unknown_entityref(ref)
    if HTMLEntities::MAP.has_key?(ref)
      handle_data [HTMLEntities::MAP[ref]].pack('U')
    else
      orig_unknown_entityref ref
    end
  end

  # resolve numeric character references to utf8.
  # +ref+ is the reference body without '&#' and ';', i.e.
  # '8217' (decimal) or 'x2019' (hexadecimal).
  def handle_charref(ref)
    # Check the hexadecimal form first: the previous implementation ran
    # the decimal substitution before the hex one, so a reference like
    # 'x2019' had its embedded digits decoded as decimal characters and
    # the hex pattern never matched the mangled result.
    decoded = if ref =~ /\Ax([0-9a-f]{1,6})\z/i
      [$1.to_i(16)].pack('U')
    else
      ref.gsub(/([0-9]{1,7})/) { [$1.to_i].pack('U') }
    end
    handle_data decoded
  end
end

module RDig

# Contains classes which are used for extracting content and meta data from
Expand All @@ -30,15 +7,6 @@ module ContentExtractors
# process the given +content+ depending on its +content_type+.
# Delegates to ContentExtractor.process, which dispatches to the
# registered extractor instance that can handle the content type.
# (The old hand-rolled case/when dispatch that used to live here as
# commented-out code has been removed — it is dead code superseded by
# the ContentExtractor registry.)
def self.process(content, content_type)
  ContentExtractor.process(content, content_type)
end

# Base class for Content Extractors.
Expand Down Expand Up @@ -72,7 +40,7 @@ def initialize(config)
end

# Whether this extractor is responsible for the given +content_type+.
# Returns a truthy match position or nil/false.
# The nil-guard keeps extractors that never set up a @pattern
# (e.g. because their external helper tool is unavailable) from
# participating in content-type matching at all.
def can_do(content_type)
  @pattern && content_type =~ @pattern
end
end

Expand Down Expand Up @@ -104,197 +72,10 @@ def can_do(content_type)
end
end

# Extract text from pdf content.
#
# Requires the pdftotext and pdfinfo utilities from the
# xpdf-utils package
# (on debian and friends do 'apt-get install xpdf-utils')
#
class PdfContentExtractor < ContentExtractor
  include ExternalAppHelper

  def initialize(config)
    super(config)
    @pattern = /^application\/pdf/
    @pdftotext = 'pdftotext'
    @pdfinfo = 'pdfinfo'
    # probe both helper binaries up front; the extractor is only
    # available when each responds with the expected banner
    @available = [@pdftotext, @pdfinfo].all? { |tool|
      %x{#{tool} -h 2>&1} =~ /Copyright 1996/
    }
  end

  # Writes +content+ to a temp file and returns a hash with
  # :content (stripped plain text) and :title (from pdf meta data, or nil).
  def process(content)
    extracted = {}
    as_file(content) do |tmpfile|
      extracted[:content] = get_content(tmpfile.path).strip
      extracted[:title] = get_title(tmpfile.path)
    end
    extracted
  end

  # plain text dump of the pdf via pdftotext
  def get_content(path_to_tempfile)
    %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
  end

  # extracts the title from pdf meta data
  # needs pdfinfo
  # returns the title or nil if no title was found
  def get_title(path_to_tempfile)
    info = %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'}
    info =~ /title:\s+(.*)$/i ? $1.strip : nil
  rescue
  end
end

# Extract text from word documents
#
# Requires the wvHtml utility
# (on debian and friends do 'apt-get install wv')
class WordContentExtractor < ContentExtractor
  include ExternalAppHelper

  def initialize(config)
    super(config)
    # external word-to-html converter; must be on the PATH
    @wvhtml = 'wvHtml'
    # mime type handled by this extractor (checked via ContentExtractor#can_do)
    @pattern = /^application\/msword/
    # html extractor for parsing wvHtml output; configured to take the
    # title from head/title and the indexable content from the body
    @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
      :html => OpenStruct.new(
        :content_tag_selector => lambda { |tagsoup|
          tagsoup.html.body
        },
        :title_tag_selector => lambda { |tagsoup|
          tagsoup.html.head.title
        }
      )))

    # TODO: better: if $?.exitstatus == 127 (not found)
    # availability probe: look for the author's name in wvHtml's help output
    @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
  end

  # Converts +content+ (raw msword bytes, written to a temp file) to html
  # with wvHtml and runs the html extractor on the converted output.
  # Returns the html extractor's result hash, or {} if nothing was extracted.
  def process(content)
    result = {}
    as_file(content) do |file|
      result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
    end
    return result || {}
  end

end

# extracts title, content and links from html documents
class HtmlContentExtractor < ContentExtractor

  def initialize(config)
    super(config)
    # mime types handled by this extractor (checked via ContentExtractor#can_do)
    @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
  end

  # returns:
  # { :content => 'extracted clear text',
  #   :title => 'Title',
  #   :links => [array of urls] }
  def process(content)
    result = { }
    tag_soup = BeautifulSoup.new(content)
    result[:title] = extract_title(tag_soup)
    result[:links] = extract_links(tag_soup)
    result[:content] = extract_content(tag_soup)
    return result
  end

  # Extracts textual content from the HTML tree.
  #
  # - First, the root element to use is determined using the
  #   +content_element+ method, which itself uses the content_tag_selector
  #   from RDig.configuration.
  # - Then, this element is processed by +extract_text+, which will give
  #   all textual content contained in the root element and all its
  #   children.
  def extract_content(tag_soup)
    content = ''
    ce = content_element(tag_soup)
    ce.children { |child|
      extract_text(child, content)
    } unless ce.nil?
    return content.strip
  end

  # extracts the href attributes of all a tags, except
  # internal links like <a href="#top">
  def extract_links(tagsoup)
    tagsoup.find_all('a').map { |link|
      CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
    }.compact
  end

  # Extracts the title from the given html tree.
  # title_tag may yield a plain String (e.g. an attribute value picked
  # from a meta tag); otherwise the text below the selected element is
  # collected recursively.
  def extract_title(tagsoup)
    the_title_tag = title_tag(tagsoup)
    if the_title_tag.is_a? String
      the_title_tag
    else
      title = ''
      extract_text(the_title_tag, title)
      title.strip
    end
  end

  # Recursively extracts all text contained in the given element,
  # and appends it to content.
  # NOTE(review): relies on the RubyfulSoup node API (NavigableString,
  # Tag#string, Tag#children) — confirm against the rubyful_soup version
  # in use.
  def extract_text(element, content='')
    return nil if element.nil?
    if element.is_a? NavigableString
      # plain text node: drop html comments and surrounding whitespace
      value = strip_comments(element)
      value.strip!
      unless value.empty?
        content << value
        content << ' '
      end
    elsif element.string # it's a Tag, and it has some content string
      # skip inline scripts and styles
      return nil if element.name =~ /^(script|style)$/i
      value = element.string.strip
      unless value.empty?
        content << value
        content << ' '
      end
    else
      # container tag without a direct string: recurse into all children
      element.children { |child|
        extract_text(child, content)
      }
    end
  end

  # Returns the element to extract the title from.
  #
  # This may return a string, e.g. an attribute value selected from a meta
  # tag, too.
  def title_tag(tagsoup)
    if @config.html.title_tag_selector
      @config.html.title_tag_selector.call(tagsoup)
    else
      tagsoup.html.head.title
    end
  end

  # Retrieve the root element to extract document content from
  def content_element(tagsoup)
    if @config.html.content_tag_selector
      @config.html.content_tag_selector.call(tagsoup)
    else
      tagsoup.html.body
    end
  end

  # Return the given string minus all html comments
  def strip_comments(string)
    string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
  end
end

end
end

# load stock content extractors
Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
require f
end
41 changes: 41 additions & 0 deletions lib/rdig/content_extractors/doc.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
module RDig
  module ContentExtractors

    # Extracts text from Microsoft Word documents by converting them to
    # html with the external wvHtml utility and handing the result to the
    # RubyfulSoup based html extractor.
    #
    # Requires the wvHtml utility
    # (on debian and friends do 'apt-get install wv')
    class WordContentExtractor < ContentExtractor
      include ExternalAppHelper

      def initialize(config)
        super(config)
        @wvhtml = 'wvHtml'
        @pattern = /^application\/msword/
        # dedicated html extractor for parsing wvHtml output: title is
        # taken from head/title, content from the whole body element
        soup_config = OpenStruct.new(
          :rubyful_soup => OpenStruct.new(
            :content_tag_selector => lambda { |tagsoup| tagsoup.html.body },
            :title_tag_selector => lambda { |tagsoup| tagsoup.html.head.title }
          ))
        @html_extractor = RubyfulSoupContentExtractor.new(soup_config)

        # TODO: better: if $?.exitstatus == 127 (not found)
        # availability probe: look for the author's name in the help output
        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
      end

      # Converts +content+ (written to a temp file) to html and runs the
      # html extractor on it. Returns the extraction result hash, or {}.
      def process(content)
        extracted = {}
        as_file(content) do |file|
          extracted = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
        end
        extracted || {}
      end

    end

  end
end
Loading

0 comments on commit ebd6386

Please sign in to comment.