Skip to content

Commit

Permalink
added tags
Browse files Browse the repository at this point in the history
  • Loading branch information
jkraemer committed Oct 23, 2006
1 parent 6c2e1cd commit ebd6386
Show file tree
Hide file tree
Showing 13 changed files with 463 additions and 255 deletions.
7 changes: 7 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
0.3.2
- make RDig compatible with Ferret 0.10.x
- won't work any more with Ferret 0.9.x and before

0.3.1
- Bug fix release: fixed handling of unparseable URLs

0.3.0
- file system crawling
- optional url rewriting before indexing, e.g. for linking to results
Expand Down
2 changes: 1 addition & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ to help building a site search for web sites or intranets. Internally,
Ferret is used for the full text indexing. After creating a config file
for your site, the index can be built with a single call to rdig.

RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
RDig depends on Ferret (>= 0.10.0) and the RubyfulSoup library (>= 1.0.4).

== basic usage

Expand Down
22 changes: 15 additions & 7 deletions lib/rdig.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#++
#

RDIGVERSION = '0.3.1'
RDIGVERSION = '0.3.2'


require 'thread'
Expand All @@ -42,15 +42,13 @@

begin
require 'ferret'
require 'rubyful_soup'
rescue LoadError
require 'rubygems'
require 'ferret'
require 'rubyful_soup'
end

require 'htmlentities/htmlentities'

require 'rdig/content_extractors'
require 'rdig/url_filters'
require 'rdig/search'
Expand Down Expand Up @@ -124,8 +122,17 @@ def configuration
:wait_before_leave => 10
),
:content_extraction => OpenStruct.new(
# settings for html content extraction
:html => OpenStruct.new(
:hpricot => OpenStruct.new(
# css selector for the element containing the page title
:title_tag_selector => 'title',
# might also be a proc returning either an element or a string:
# :title_tag_selector => lambda { |hpricot_doc| ... }
:content_tag_selector => 'body'
# might also be a proc returning either an element or a string:
# :content_tag_selector => lambda { |hpricot_doc| ... }
),
# settings for html content extraction (RubyfulSoup)
:rubyful_soup => OpenStruct.new(
# select the html element that contains the content to index
# by default, we index all inside the body tag:
:content_tag_selector => lambda { |tagsoup|
Expand All @@ -142,7 +149,8 @@ def configuration
:create => true,
:handle_parse_errors => true,
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
:occur_default => Ferret::Search::BooleanClause::Occur::MUST
:occur_default => :must,
:default_field => '*'
)
)
end
Expand Down
231 changes: 6 additions & 225 deletions lib/rdig/content_extractors.rb
Original file line number Diff line number Diff line change
@@ -1,26 +1,3 @@
# override some methods concerned with entity resolving
# to convert them to strings
class BeautifulStoneSoup
  # resolve unknown html entities using the htmlentities lib
  alias :orig_unknown_entityref :unknown_entityref
  def unknown_entityref(ref)
    if HTMLEntities::MAP.has_key?(ref)
      handle_data [HTMLEntities::MAP[ref]].pack('U')
    else
      orig_unknown_entityref ref
    end
  end

  # resolve numeric character references to utf8.
  # +ref+ is the reference body without '&#' and ';', i.e.
  # '8217' (decimal) or 'x2019' (hexadecimal).
  def handle_charref(ref)
    # Check the hexadecimal form first: the previous implementation ran
    # the decimal substitution before the hex one, so a reference like
    # 'x2019' had its embedded digits decoded as decimal characters and
    # the hex pattern never matched the mangled result.
    decoded = if ref =~ /\Ax([0-9a-f]{1,6})\z/i
      [$1.to_i(16)].pack('U')
    else
      ref.gsub(/([0-9]{1,7})/) { [$1.to_i].pack('U') }
    end
    handle_data decoded
  end
end

module RDig

# Contains classes which are used for extracting content and meta data from
Expand All @@ -30,15 +7,6 @@ module ContentExtractors
# process the given +content+ depending on its +content_type+.
# Delegates to ContentExtractor.process, which dispatches to the
# registered extractor instance that can handle the content type.
# (The old hand-rolled case/when dispatch that used to live here as
# commented-out code has been removed — it is dead code superseded by
# the ContentExtractor registry.)
def self.process(content, content_type)
  ContentExtractor.process(content, content_type)
end

# Base class for Content Extractors.
Expand Down Expand Up @@ -72,7 +40,7 @@ def initialize(config)
end

# Whether this extractor is responsible for the given +content_type+.
# Returns a truthy match position or nil/false.
# The nil-guard keeps extractors that never set up a @pattern
# (e.g. because their external helper tool is unavailable) from
# participating in content-type matching at all.
def can_do(content_type)
  @pattern && content_type =~ @pattern
end
end

Expand Down Expand Up @@ -104,197 +72,10 @@ def can_do(content_type)
end
end

# Extract text from pdf content.
#
# Requires the pdftotext and pdfinfo utilities from the
# xpdf-utils package
# (on debian and friends do 'apt-get install xpdf-utils')
#
class PdfContentExtractor < ContentExtractor
  include ExternalAppHelper

  def initialize(config)
    super(config)
    @pattern = /^application\/pdf/
    @pdftotext = 'pdftotext'
    @pdfinfo = 'pdfinfo'
    # probe both helper binaries up front; the extractor is only
    # available when each responds with the expected banner
    @available = [@pdftotext, @pdfinfo].all? { |tool|
      %x{#{tool} -h 2>&1} =~ /Copyright 1996/
    }
  end

  # Writes +content+ to a temp file and returns a hash with
  # :content (stripped plain text) and :title (from pdf meta data, or nil).
  def process(content)
    extracted = {}
    as_file(content) do |tmpfile|
      extracted[:content] = get_content(tmpfile.path).strip
      extracted[:title] = get_title(tmpfile.path)
    end
    extracted
  end

  # plain text dump of the pdf via pdftotext
  def get_content(path_to_tempfile)
    %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
  end

  # extracts the title from pdf meta data
  # needs pdfinfo
  # returns the title or nil if no title was found
  def get_title(path_to_tempfile)
    info = %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'}
    info =~ /title:\s+(.*)$/i ? $1.strip : nil
  rescue
  end
end

# Extract text from word documents
#
# Requires the wvHtml utility
# (on debian and friends do 'apt-get install wv')
class WordContentExtractor < ContentExtractor
  include ExternalAppHelper

  def initialize(config)
    super(config)
    # external word-to-html converter; must be on the PATH
    @wvhtml = 'wvHtml'
    # mime type handled by this extractor (checked via ContentExtractor#can_do)
    @pattern = /^application\/msword/
    # html extractor for parsing wvHtml output; configured to take the
    # title from head/title and the indexable content from the body
    @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
      :html => OpenStruct.new(
        :content_tag_selector => lambda { |tagsoup|
          tagsoup.html.body
        },
        :title_tag_selector => lambda { |tagsoup|
          tagsoup.html.head.title
        }
      )))

    # TODO: better: if $?.exitstatus == 127 (not found)
    # availability probe: look for the author's name in wvHtml's help output
    @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
  end

  # Converts +content+ (raw msword bytes, written to a temp file) to html
  # with wvHtml and runs the html extractor on the converted output.
  # Returns the html extractor's result hash, or {} if nothing was extracted.
  def process(content)
    result = {}
    as_file(content) do |file|
      result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
    end
    return result || {}
  end

end

# extracts title, content and links from html documents
class HtmlContentExtractor < ContentExtractor

  def initialize(config)
    super(config)
    # mime types handled by this extractor (checked via ContentExtractor#can_do)
    @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
  end

  # returns:
  # { :content => 'extracted clear text',
  #   :title => 'Title',
  #   :links => [array of urls] }
  def process(content)
    result = { }
    tag_soup = BeautifulSoup.new(content)
    result[:title] = extract_title(tag_soup)
    result[:links] = extract_links(tag_soup)
    result[:content] = extract_content(tag_soup)
    return result
  end

  # Extracts textual content from the HTML tree.
  #
  # - First, the root element to use is determined using the
  #   +content_element+ method, which itself uses the content_tag_selector
  #   from RDig.configuration.
  # - Then, this element is processed by +extract_text+, which will give
  #   all textual content contained in the root element and all its
  #   children.
  def extract_content(tag_soup)
    content = ''
    ce = content_element(tag_soup)
    ce.children { |child|
      extract_text(child, content)
    } unless ce.nil?
    return content.strip
  end

  # extracts the href attributes of all a tags, except
  # internal links like <a href="#top">
  def extract_links(tagsoup)
    tagsoup.find_all('a').map { |link|
      CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
    }.compact
  end

  # Extracts the title from the given html tree.
  # title_tag may yield a plain String (e.g. an attribute value picked
  # from a meta tag); otherwise the text below the selected element is
  # collected recursively.
  def extract_title(tagsoup)
    the_title_tag = title_tag(tagsoup)
    if the_title_tag.is_a? String
      the_title_tag
    else
      title = ''
      extract_text(the_title_tag, title)
      title.strip
    end
  end

  # Recursively extracts all text contained in the given element,
  # and appends it to content.
  # NOTE(review): relies on the RubyfulSoup node API (NavigableString,
  # Tag#string, Tag#children) — confirm against the rubyful_soup version
  # in use.
  def extract_text(element, content='')
    return nil if element.nil?
    if element.is_a? NavigableString
      # plain text node: drop html comments and surrounding whitespace
      value = strip_comments(element)
      value.strip!
      unless value.empty?
        content << value
        content << ' '
      end
    elsif element.string # it's a Tag, and it has some content string
      # skip inline scripts and styles
      return nil if element.name =~ /^(script|style)$/i
      value = element.string.strip
      unless value.empty?
        content << value
        content << ' '
      end
    else
      # container tag without a direct string: recurse into all children
      element.children { |child|
        extract_text(child, content)
      }
    end
  end

  # Returns the element to extract the title from.
  #
  # This may return a string, e.g. an attribute value selected from a meta
  # tag, too.
  def title_tag(tagsoup)
    if @config.html.title_tag_selector
      @config.html.title_tag_selector.call(tagsoup)
    else
      tagsoup.html.head.title
    end
  end

  # Retrieve the root element to extract document content from
  def content_element(tagsoup)
    if @config.html.content_tag_selector
      @config.html.content_tag_selector.call(tagsoup)
    else
      tagsoup.html.body
    end
  end

  # Return the given string minus all html comments
  def strip_comments(string)
    string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
  end
end

end
end

# load stock content extractors
Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
require f
end
41 changes: 41 additions & 0 deletions lib/rdig/content_extractors/doc.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
module RDig
  module ContentExtractors

    # Extracts text from Microsoft Word documents by converting them to
    # html with the external wvHtml utility and handing the result to the
    # RubyfulSoup based html extractor.
    #
    # Requires the wvHtml utility
    # (on debian and friends do 'apt-get install wv')
    class WordContentExtractor < ContentExtractor
      include ExternalAppHelper

      def initialize(config)
        super(config)
        @wvhtml = 'wvHtml'
        @pattern = /^application\/msword/
        # dedicated html extractor for parsing wvHtml output: title is
        # taken from head/title, content from the whole body element
        soup_config = OpenStruct.new(
          :rubyful_soup => OpenStruct.new(
            :content_tag_selector => lambda { |tagsoup| tagsoup.html.body },
            :title_tag_selector => lambda { |tagsoup| tagsoup.html.head.title }
          ))
        @html_extractor = RubyfulSoupContentExtractor.new(soup_config)

        # TODO: better: if $?.exitstatus == 127 (not found)
        # availability probe: look for the author's name in the help output
        @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
      end

      # Converts +content+ (written to a temp file) to html and runs the
      # html extractor on it. Returns the extraction result hash, or {}.
      def process(content)
        extracted = {}
        as_file(content) do |file|
          extracted = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
        end
        extracted || {}
      end

    end

  end
end
Loading

0 comments on commit ebd6386

Please sign in to comment.