-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_mars.py
140 lines (102 loc) · 4.55 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Scraping script
# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests as rq
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser
import time
import pprint as pp
# Seconds to pause after a page visit before its HTML is read; the scraper
# functions in this file pass this value to time.sleep().
time_variable = 1
def mars_website_scrape(browser):
    """Scrape the first listed news item from redplanetscience.com.

    Args:
        browser: Object exposing ``visit(url)`` and an ``html`` attribute
            (the caller in this file passes a splinter ``Browser``).

    Returns:
        A ``(news_title, news_article)`` tuple with the text of the
        "content_title" and "article_teaser_body" divs found inside the
        first ``div.list_text`` element of the page.
    """
    browser.visit("https://redplanetscience.com/")
    # Pause before reading the HTML so the page has time to load.
    time.sleep(time_variable)
    page = bs(browser.html, "html.parser")

    # Both pieces of text live inside the first "div.list_text" element.
    first_item = page.select_one("div.list_text")
    headline = first_item.find("div", class_="content_title").get_text()
    teaser = first_item.find("div", class_="article_teaser_body").get_text()
    return headline, teaser
def nasa_website_scrape(browser):
    """Return the URL of the header image on spaceimages-mars.com.

    Args:
        browser: Object exposing ``visit(url)`` and an ``html`` attribute
            (the caller in this file passes a splinter ``Browser``).

    Returns:
        The site root concatenated with the ``src`` attribute of the first
        ``<img class="headerimage">`` element on the page.
    """
    site_root = "https://spaceimages-mars.com/"
    browser.visit(site_root)
    # Pause before reading the HTML so the page has time to load.
    time.sleep(time_variable)
    page = bs(browser.html, "html.parser")

    header_image = page.find("img", class_="headerimage")
    # The "src" value is appended to the site root unchanged.
    return site_root + header_image.get("src")
def mars_fact_website_scrape():
    """Return the Mars facts table from galaxyfacts-mars.com as HTML.

    Returns:
        The HTML string produced by ``DataFrame.to_html()`` for the second
        table that pandas parses from the page, with columns ``0`` and ``1``
        renamed to "Mars Planet Profile" and "Facts".
    """
    tables = pd.read_html("https://galaxyfacts-mars.com/")
    # Index 1 selects the second table parsed from the page.
    facts_table = tables[1]
    column_names = {0: "Mars Planet Profile", 1: "Facts"}
    return facts_table.rename(columns=column_names).to_html()
def mars_hemi_website_scrape(browser):
    """Collect the title and image URL for each entry on marshemispheres.com.

    The index page is visited first; for every ``div.description`` found on
    it, the entry's link is followed and the first link inside each
    ``div.downloads`` of that detail page is recorded.

    Args:
        browser: Object exposing ``visit(url)`` and an ``html`` attribute
            (the caller in this file passes a splinter ``Browser``).

    Returns:
        A list of ``{"title": ..., "img_url": ...}`` dictionaries in the
        order the entries appear on the index page.
    """
    mars_hemi_url = "https://marshemispheres.com/"
    browser.visit(mars_hemi_url)
    # Pause before reading the HTML so the page has time to load.
    time.sleep(time_variable)
    index_soup = bs(browser.html, "html.parser")

    hemisphere_image_urls = []
    for description in index_soup.find_all("div", class_="description"):
        hemi_title = description.find("h3").get_text()
        detail_url = mars_hemi_url + description.find("a").get("href")

        # Visit the detail page for this hemisphere.
        browser.visit(detail_url)
        # Apply the same load wait used for every other page visit in this
        # file; without it the HTML may be read before the page is ready.
        time.sleep(time_variable)
        detail_soup = bs(browser.html, "html.parser")

        # Record the first link found inside each "downloads" div.
        for downloads in detail_soup.find_all("div", class_="downloads"):
            image_href = downloads.find("a").get("href")
            hemisphere_image_urls.append(
                {
                    "title": hemi_title,
                    "img_url": mars_hemi_url + image_href,
                }
            )
    return hemisphere_image_urls
def scrape_mars_info():
    """Scrape several Mars-related websites and bundle the results.

    A headless Chrome browser is started for the duration of the scrape and
    is always shut down afterwards, including when a scraper raises.

    Returns:
        A dictionary with the keys "mars_news_title", "mars_news_article",
        "nasa_featured_image", "mars_facts" and "mars_hemispheres".

    Raises:
        Any exception raised by the individual scraper functions propagates
        to the caller after the browser has been closed.
    """
    executable_path = {"executable_path": ChromeDriverManager().install()}
    browser = Browser("chrome", **executable_path, headless=True)
    try:
        news_title, news_article = mars_website_scrape(browser)
        featured_image_url = nasa_website_scrape(browser)
        mars_fact_html = mars_fact_website_scrape()
        hemisphere_image_urls = mars_hemi_website_scrape(browser)
    finally:
        # Close the browser so repeated calls do not leave headless Chrome
        # and driver processes running.
        browser.quit()

    return {
        "mars_news_title": news_title,
        "mars_news_article": news_article,
        "nasa_featured_image": featured_image_url,
        "mars_facts": mars_fact_html,
        "mars_hemispheres": hemisphere_image_urls,
    }
# When run as a script, perform one scrape and pretty-print the result.
if __name__ == "__main__":
    scraped = scrape_mars_info()
    pp.pprint(scraped)