Skip to content

Commit

Permalink
Fixed error with scraping display results, minor editing to homepage …
Browse files Browse the repository at this point in the history
…text
  • Loading branch information
brendan-oconnell committed Sep 9, 2023
1 parent 2a556e5 commit a673998
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 36 deletions.
35 changes: 19 additions & 16 deletions app/controllers/websites_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
class WebsitesController < ApplicationController

def create
# Pseudo: create the website if new for the user otherwise a new version of the website
# create the website if new for the user otherwise a new version of the website
# user functionality for the dashboard is currently disabled.
user_url = "https://www.#{website_params[:url]}"
existing_website = Website.find_by_url(user_url)
# existing_website = Website.find_by_url(user_url)

if Website.find_by_url(user_url) && existing_website.user == current_or_guest_user
@website = Website.find_by_url(user_url)
else
@website = Website.new(website_params)
@website.url = user_url
@website.user = current_or_guest_user unless current_or_guest_user.nil?
end
# if Website.find_by_url(user_url) && existing_website.user == current_or_guest_user
# @website = Website.find_by_url(user_url)
# else
@website = Website.new(website_params)
@website.url = user_url
@website.user = current_or_guest_user unless current_or_guest_user.nil?
# end
@website.url = user_url

if @website.valid?
@website.save
Expand Down Expand Up @@ -42,19 +44,20 @@ def create_version(website)
FontsBackgroundsScrapingJob.perform_later(@version, @website)
ImageScrapingJob.perform_later(@version, @website)
CarbonApiJob.perform_later(website.url, @version)
sleep 20
# wait before loading the results page, to give at least some of the scraping and API jobs time to finish.
sleep 10
end

# Sets @version for the current request: either the previous version (when it
# is recent enough) or a brand-new Version tied to @website.
#
# last_version - may be nil (guarded by the `&&` below); otherwise must respond
#                to `created_at` and `[]=`.
# url          - not referenced anywhere in this method body.
#
# NOTE(review): this text comes from a rendered diff, so the live `if`/`else`
# and a commented-out copy of the same branch appear together; confirm which
# one is current before changing the logic.
def reuse_recent_version(last_version, url)
# Pseudo: reuse the version if the new request is less than 24h
# (86_400 is the number of seconds in 24 hours.)
if last_version && (Time.now.utc - last_version.created_at) < 86_400
@version = last_version
# Reset the flag on the reused version (presumably so the carbon API data is refreshed; TODO confirm).
@version[:carbonapi_updated] = false
# Plain local assignment; the value is not read again inside this method.
carbon_infos = nil
else
# if last_version && (Time.now.utc - last_version.created_at) < 86_400
# @version = last_version
# @version[:carbonapi_updated] = false
# carbon_infos = nil
# else
# No recent version to reuse: build a new one and attach it to the current website.
@version = Version.new
@version.update(website_id: @website.id)
end
# end
end

end
11 changes: 6 additions & 5 deletions app/jobs/fonts_backgrounds_scraping_job.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
require "pry-byebug"
class FontsBackgroundsScrapingJob < ApplicationJob
queue_as :default

Expand All @@ -16,17 +17,17 @@ def perform(version, website)

# Collects the href of every <link> element in html_doc whose rel is exactly
# "stylesheet", normalises each one through control_link_validity, and returns
# the resulting array of link strings.
#
# html_doc - parsed document that responds to `search` (a Nokogiri document in the calling job, presumably).
# website  - only `website.url` is read here, as the prefix for relative links.
#
# NOTE(review): `link.attributes["rel"].value` and `link.attributes["href"].value`
# look like they will raise NoMethodError on a <link> that lacks that attribute; confirm.
def stylesheets_scraping(html_doc, website)
stylesheets = []

html_doc.search("link").each do |link|
stylesheets << link.attributes["href"].value if link.attributes["rel"].value == "stylesheet"
# if stylesheet begins with https://www, don't do anything with it.
# NOTE(review): the two map! calls below are the before/after lines of a diff
# (one-argument vs. two-argument control_link_validity). Both run inside the
# loop, so the whole array is re-normalised on every <link> visited.
stylesheets.map! { |stylesheet| control_link_validity(stylesheet) }
stylesheets.map! { |stylesheet| control_link_validity(stylesheet, website.url) }
end
return stylesheets
end

# Returns link unchanged when it already starts with "http"; otherwise
# prefixes it with the site URL and returns the result.
#
# NOTE(review): this span is a rendered diff; the first `def` line and its body
# are the previous one-argument version (reading @website.url), the second
# `def` is the replacement that takes the URL as an argument.
# NOTE: String#insert modifies `link` in place, so the caller's string is mutated.
def control_link_validity(link)
link.start_with?("http") ? link : link.insert(0, @website.url)
def control_link_validity(link, url)
# if stylesheet is a full link, (e.g. https://www.nytimes.com/assets/example.css) leave as is
# if a relative link, add beginning of URL (e.g. https://www.nytimes.com) so it can be properly scraped
link.start_with?("http") ? link : link.insert(0, url)
end

def fonts_and_backgrounds_scraping(stylesheets)
Expand Down
16 changes: 9 additions & 7 deletions app/jobs/image_scraping_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,27 @@ def perform(version, website)
html_file = URI.open(website.url).read
html_doc = Nokogiri::HTML(html_file)

image_scraping(version, html_doc)
image_scraping(version, html_doc, website.url)
end
end

private


def image_scraping(version, html_doc)
def image_scraping(version, html_doc, url)

@photos = []

html_doc.search("img").each do |image|
# added in code for lazy loading. If page has lazy loading, there won't be an image URL and it should be skipped.
# if page has lazy loading, there won't be an image URL and it should be skipped.
# e.g. nytimes.com
# if image.attributes["loading"] exists and has a value (e.g. "lazy"), we can't scrape that image.
# so don't include it. hence the unless.
# so don't include it.
unless image.attributes["alt"].nil? || image.attributes["loading"]
src_value = image.attributes["data-src"] ? image.attributes["data-src"].value : image.attributes["src"].value
link = control_link_validity(src_value)
link = control_link_validity(src_value, url)
# https://rubygems.org/gems/fastimage/versions/2.2.5
# approximates image size by quickly grabbing dimensions
dimensions = FastImage.size(link)
if dimensions
type = FastImage.type(link)
Expand All @@ -48,8 +50,8 @@ def image_scraping(version, html_doc)
end


# Returns link unchanged when it already starts with "http"; otherwise
# prefixes it with the site URL so a relative image path becomes a full URL.
#
# NOTE(review): this span is a rendered diff; the first `def` line and its body
# are the previous one-argument version (reading @website.url), the second
# `def` is the replacement that receives the URL as web_url.
# NOTE: String#insert modifies `link` in place, so the caller's string is mutated.
def control_link_validity(link)
link.start_with?("http") ? link : link.insert(0, @website.url)
def control_link_validity(link, web_url)
link.start_with?("http") ? link : link.insert(0, web_url)
end

def sort_main_photos(photos)
Expand Down
6 changes: 3 additions & 3 deletions app/views/pages/home.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
<div class="spacer"></div>
<div class="container text-center text-light mt-5 mb-3">
<br>
<p>Type URL <strong>without</strong> https://www, for example lewagon.com.
<br>Analysis will take 20 seconds or longer to complete. We're not currently able to analyze all websites.
<p>Type URL <strong>without</strong> https://www, for example nytimes.com.
<br>Analysis will take 10 seconds or longer to complete. We're not currently able to analyze all websites.
<br>
Here are a few examples that work well: <i>lewagon.com, food.com, pollutingsite.com.</i></p>
Here are a few examples that work well: <i>nytimes.com, salon.com, vox.com.</i></p>
</div>
</div>
</div>
4 changes: 2 additions & 2 deletions app/views/pages/scrapingerror.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
<h1> We weren't able to successfully analyze your site.</h1>
<p> Don't worry! You didn't do anything wrong. We're still working on developing this tool so we can successfully analyze data from a wider range of websites. Something simple, such as how you display images on your site, probably tripped us up.</p>
<p>In the meantime, if you'd like to see what a sample results page looks like, try one of these:</p>
<p>www.marmiton.org</p>
<p>www.pollutingsite.com - our test site</p>
<p>www.nytimes.com</p>
<p>www.vox.com</p>
</div>
</div>

Expand Down
7 changes: 4 additions & 3 deletions config/routes.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
Rails.application.routes.draw do
# users are currently hidden in view, but functionality is built out.
devise_for :users, components: {registrations: 'registrations', sessions: 'sessions'}

require "sidekiq/web"
authenticate :user, ->(user) { user.admin? } do
# use Sidekiq for background jobs (website carbon API, image scraping, font and background color scraping)
# so results page doesn't take too long to load.
mount Sidekiq::Web => '/sidekiq'
end

# For details on the DSL available within this file, see https://guides.rubyonrails.org/routing.html

# pages
root to: 'pages#home'

Expand All @@ -20,7 +21,7 @@
resources :versions, only: :show

# dashboard

# not currently available in view
get 'dashboard', to: 'pages#dashboard', as: :dashboard

# footer
Expand Down

0 comments on commit a673998

Please sign in to comment.