Fixed error with scraping display results, minor editing to homepage text
brendan-oconnell · Sep 9, 2023 · a673998 · a673998
1 parent 2a556e5
commit a673998
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 36 deletions.
diff --git a/app/controllers/websites_controller.rb b/app/controllers/websites_controller.rb
@@ -3,17 +3,19 @@
 class WebsitesController < ApplicationController
 
   def create
-    # Pseudo: create the website if new for the user otherwise a new version of the website
+    # create the website if new for the user otherwise a new version of the website
+    # user functionality for the dashboard is currently disabled.
     user_url = "https://www.#{website_params[:url]}"
-    existing_website = Website.find_by_url(user_url)
+    # existing_website = Website.find_by_url(user_url)
 
-    if Website.find_by_url(user_url) && existing_website.user == current_or_guest_user
-      @website = Website.find_by_url(user_url)
-    else
-      @website = Website.new(website_params)
-      @website.url = user_url
-      @website.user = current_or_guest_user unless current_or_guest_user.nil?
-    end
+    # if Website.find_by_url(user_url) && existing_website.user == current_or_guest_user
+    #   @website = Website.find_by_url(user_url)
+    # else
+       @website = Website.new(website_params)
+       @website.url = user_url
+       @website.user = current_or_guest_user unless current_or_guest_user.nil?
+    # end
+    @website.url = user_url
 
     if @website.valid?
       @website.save
@@ -42,19 +44,20 @@ def create_version(website)
     FontsBackgroundsScrapingJob.perform_later(@version, @website)
     ImageScrapingJob.perform_later(@version, @website)
     CarbonApiJob.perform_later(website.url, @version)
-    sleep 20
+    # wait before loading the results page, to give at least some of the scraping and API jobs time to finish.
+    sleep 10
   end
 
   def reuse_recent_version(last_version, url)
     # Pseudo: reuse the version if the new request is less than 24h
-    if last_version && (Time.now.utc - last_version.created_at) < 86_400
-      @version = last_version
-      @version[:carbonapi_updated] = false
-      carbon_infos = nil
-    else
+    # if last_version && (Time.now.utc - last_version.created_at) < 86_400
+    #   @version = last_version
+    #   @version[:carbonapi_updated] = false
+    #   carbon_infos = nil
+    # else
       @version = Version.new
       @version.update(website_id: @website.id)
-    end
+    # end
   end
 
 end
diff --git a/app/jobs/fonts_backgrounds_scraping_job.rb b/app/jobs/fonts_backgrounds_scraping_job.rb
@@ -1,3 +1,4 @@
+require "pry-byebug"
 class FontsBackgroundsScrapingJob < ApplicationJob
   queue_as :default
 
@@ -16,17 +17,17 @@ def perform(version, website)
 
 def stylesheets_scraping(html_doc, website)
   stylesheets = []
-
   html_doc.search("link").each do |link|
     stylesheets << link.attributes["href"].value if link.attributes["rel"].value == "stylesheet"
-    # if stylesheet begins with https://www, don't do anything with it.
-    stylesheets.map! { |stylesheet| control_link_validity(stylesheet) }
+    stylesheets.map! { |stylesheet| control_link_validity(stylesheet, website.url) }
   end
   return stylesheets
 end
 
-def control_link_validity(link)
- link.start_with?("http") ? link : link.insert(0, @website.url)
+def control_link_validity(link, url)
+  # if the stylesheet is a full link (e.g. https://www.nytimes.com/assets/example.css), leave it as is
+  # if it is a relative link, prepend the site URL (e.g. https://www.nytimes.com) so it can be properly scraped
+ link.start_with?("http") ? link : link.insert(0, url)
 end
 
 def fonts_and_backgrounds_scraping(stylesheets)

diff --git a/app/jobs/image_scraping_job.rb b/app/jobs/image_scraping_job.rb
@@ -10,25 +10,27 @@ def perform(version, website)
     html_file = URI.open(website.url).read
     html_doc = Nokogiri::HTML(html_file)
 
-    image_scraping(version, html_doc)
+    image_scraping(version, html_doc, website.url)
   end
 end
 
 private
 
 
-def image_scraping(version, html_doc)
+def image_scraping(version, html_doc, url)
 
   @photos = []
 
   html_doc.search("img").each do |image|
-    # added in code for lazy loading. If page has lazy loading, there won't be an image URL and it should be skipped.
+    # if page has lazy loading, there won't be an image URL and it should be skipped.
     # e.g. nytimes.com
     # if image.attributes["loading"] exists and has a value (e.g. "lazy"), we can't scrape that image.
-    # so don't include it. hence the unless.
+    # so don't include it.
     unless image.attributes["alt"].nil? || image.attributes["loading"]
       src_value = image.attributes["data-src"] ? image.attributes["data-src"].value : image.attributes["src"].value
-      link = control_link_validity(src_value)
+      link = control_link_validity(src_value, url)
+      # https://rubygems.org/gems/fastimage/versions/2.2.5
+      # approximates image size by quickly grabbing dimensions
       dimensions = FastImage.size(link)
       if dimensions
         type = FastImage.type(link)
@@ -48,8 +50,8 @@ def image_scraping(version, html_doc)
 end
 
 
-def control_link_validity(link)
-  link.start_with?("http") ? link : link.insert(0, @website.url)
+def control_link_validity(link, web_url)
+  link.start_with?("http") ? link : link.insert(0, web_url)
 end
 
 def sort_main_photos(photos)

diff --git a/app/views/pages/home.html.erb b/app/views/pages/home.html.erb
@@ -29,10 +29,10 @@
     <div class="spacer"></div>
     <div class="container text-center text-light mt-5 mb-3">
       <br>
-      <p>Type URL <strong>without</strong> https://www, for example lewagon.com.
-      <br>Analysis will take 20 seconds or longer to complete. We're not currently able to analyze all websites.
+      <p>Type URL <strong>without</strong> https://www, for example nytimes.com.
+      <br>Analysis will take 10 seconds or longer to complete. We're not currently able to analyze all websites.
       <br>
-      Here are a few examples that work well: <i>lewagon.com, food.com, pollutingsite.com.</i></p>
+      Here are a few examples that work well: <i>nytimes.com, salon.com, vox.com.</i></p>
     </div>
   </div>
 </div>
diff --git a/app/views/pages/scrapingerror.html.erb b/app/views/pages/scrapingerror.html.erb
@@ -3,8 +3,8 @@
       <h1> We weren't able to successfully analyze your site.</h1>
       <p> Don't worry! You didn't do anything wrong. We're still working on developing this tool so we can successfully analyze data from a wider range of websites. Something simple, such as how you display images on your site, probably tripped us up.</p>
       <p>In the meantime, if you'd like to see what a sample results page looks like, try one of these:</p>
-      <p>www.marmiton.org</p>
-      <p>www.pollutingsite.com - our test site</p>
+      <p>www.nytimes.com</p>
+      <p>www.vox.com</p>
   </div>
 </div>
 

diff --git a/config/routes.rb b/config/routes.rb
@@ -1,13 +1,14 @@
 Rails.application.routes.draw do
+  # users are currently hidden in view, but functionality is built out.
   devise_for :users, components: {registrations: 'registrations', sessions: 'sessions'}
 
   require "sidekiq/web"
   authenticate :user, ->(user) { user.admin? } do
+    # use Sidekiq for background jobs (website carbon API, image scraping, font and background color scraping)
+    # so results page doesn't take too long to load.
     mount Sidekiq::Web => '/sidekiq'
   end
 
-  # For details on the DSL available within this file, see https://guides.rubyonrails.org/routing.html
-
   # pages
   root to: 'pages#home'
 
@@ -20,7 +21,7 @@
   resources :versions, only: :show
 
   # dashboard
-
+  # not currently available in view
   get 'dashboard', to: 'pages#dashboard', as: :dashboard
 
   # footer