Merge branch 'init_scrape'
SKotekal committed Oct 29, 2016
2 parents c437892 + e9b5b44 commit 7e845f1
Showing 2 changed files with 39 additions and 36 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,5 +1,5 @@
-# DHS Project
-Build Illinois Jobslink scraper, API, and front end for Illinois DHS.
+# Illinois JobsLink Project
+Build Illinois Jobslink scraper, API, and front end.


 ## Overview
71 changes: 37 additions & 34 deletions init_scrape.py
@@ -17,30 +17,33 @@


 def scrape():
-    url = SEARCH_URL
-    r = session.get(SEARCH_URL)
-    soup = BeautifulSoup(r.content, "html.parser")
-    # print(soup.prettify().encode('utf-8'))
+    url1 = 'https://illinoisjoblink.illinois.gov/ada/r/search/jobs?is_subsequent_search=false&page=1&per_page=250&refiners=%7B%7D&status=Active&utf8=%E2%9C%93'

-    # Print the urls for each job listing.
-    # Will need to implement stepping through these urls and scraping the
-    # relevant data.
-    listings = soup.find_all("dt")  # Finds all dt tags (elements in the list)
-    for l in listings:
-        # Finds the a tag, which will have the name and the url
-        urls = l.find_all('a')
-        for u in urls:
-            job_url = u['href']  # The href part of the tag will have the url
-            name = u.string  # The name will be in the string part of the a tag
-            id_num = u.string[u.string.find('(') + 1:u.string.find(')')]
-            # Insert the job listing into the database (only the name and url
-            # have been implemented at this point)
-            c.execute(
-                "INSERT INTO listings VALUES (?, ?, ?, 'TODO', 'TODO', 'TODO', 'TODO', 'TODO');", (name, id_num, job_url))
-    # Need to scrape for description, zipcode, wages, education, etc and
-    # put them into the DB. ---> Use above code as a model as well as what
-    # we did in the scraping workshop.
+    # Goes through all 40 pages of job listings, scrapes job url, name, and id
+    # number
+    # There are 40 pages of results, with 250 listings per page. There should
+    # be more, but it's capped here.
+    for n in range(1, 41):
+        page = url1[:87] + str(n) + url1[88:]  # Changes the page= number
+        r = requests.get(page)
+        soup = BeautifulSoup(r.content, "html.parser")
+        listings = soup.find_all("dt")  # Finds all dt tags
+        for l in listings:
+            # Finds the a tag, which will have the name and the url
+            urls = l.find_all('a')
+            for u in urls:
+                # The href part of the tag will have the url
+                job_url = u['href']
+                name = u.string  # The name will be in the string part of the a tag
+                id_num = u.string[u.string.find('(') + 1:u.string.find(')')]

+                # Insert the job listing into the database (only the name and url
+                # have been implemented at this point)
+                c.execute(
+                    "INSERT INTO listings VALUES (?, ?, ?, 'TODO', 'TODO', 'TODO', 'TODO', 'TODO');", (name, id_num, job_url))
+    # Need to scrape for description, zipcode, wages, education, etc and
+    # put them into the DB. ---> Use above code as a model as well as what
+    # we did in the scraping workshop.
     conn.commit()

 if __name__ == '__main__':
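
The slicing in page = url1[:87] + str(n) + url1[88:] works only because character 87 of that exact query string happens to be the page digit, so any edit to the URL silently breaks the pagination. Below is a minimal sketch of the same request with requests building the query string instead; it is not code from this commit, and scrape_page plus the explicit refiners/utf8 entries are illustrative assumptions.

# Sketch only, not part of this commit. Assumes the same endpoint and the same
# markup that init_scrape.py relies on: each listing is a dt tag wrapping an
# a tag whose text holds the name and an id in parentheses.
import requests
from bs4 import BeautifulSoup

SEARCH_URL = 'https://illinoisjoblink.illinois.gov/ada/r/search/jobs'


def scrape_page(page_number):
    params = {
        'is_subsequent_search': 'false',
        'page': page_number,      # requests encodes page=N, no string slicing
        'per_page': 250,
        'refiners': '{}',         # the %7B%7D in the hard-coded URL
        'status': 'Active',
        'utf8': '\u2713',         # the %E2%9C%93 check mark
    }
    r = requests.get(SEARCH_URL, params=params)
    soup = BeautifulSoup(r.content, 'html.parser')
    rows = []
    for dt in soup.find_all('dt'):
        for a in dt.find_all('a'):
            name = a.string or ''
            id_num = name[name.find('(') + 1:name.find(')')]  # id sits between parentheses
            rows.append((name, id_num, a['href']))
    return rows

Calling scrape_page(n) for n in range(1, 41) mirrors the loop in the diff, and the returned hrefs are the detail pages that the description/zipcode/wages TODO would then have to fetch and parse.
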
@@ -49,17 +52,17 @@ def scrape():
     # Code for Illinois Jobs Link Login - TODO: Fix login issues. (Try utf-8
     # encoding??)

-    # soup = BeautifulSoup(session.get(SEARCH_URL).content, "html.parser")
-    # inputs = soup.find_all('input')
-    # token = ''
-    # for t in inputs:
-    #     try:
-    #         if t['name'] == 'authenticity_token':
-    #             token = t['value']
-    #             break
-    #     except KeyError as e:
-    #         pass
-    # # print(soup.prettify().encode('utf-8'))
+    soup = BeautifulSoup(session.get(SEARCH_URL).content, "html.parser")
+    inputs = soup.find_all('input')
+    token = ''
+    for t in inputs:
+        try:
+            if t['name'] == 'authenticity_token':
+                token = t['value']
+                break
+        except KeyError as e:
+            pass
+    # print(soup.prettify().encode('utf-8'))
     # print(token)

     # login_data = dict(v_username=USER_NAME,
@@ -74,7 +77,7 @@

     # print(r.content)

-    scrape()
+    # scrape()

     # Print our entries in the database
     for row in c.execute('SELECT * FROM listings'):
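
The block uncommented above scrapes the Rails authenticity_token hidden input out of the search page, and the TODO near the top of __main__ notes that login still fails. Below is a hedged sketch of posting that token back along with the commented-out login_data; v_username is the only field name visible in the snippet, while v_password, the utf8 field, and the login endpoint are guesses that would need to be read off the real form.

# Sketch only, not part of this commit. 'session' and 'token' are the objects
# init_scrape.py already sets up; login_url must come from the login form's
# action attribute, which is not shown in this diff.
def try_login(session, login_url, token, username, password):
    login_data = {
        'v_username': username,        # field name taken from the commented-out login_data
        'v_password': password,        # guessed field name, not confirmed
        'authenticity_token': token,   # hidden input scraped above
        'utf8': '\u2713',              # Rails forms normally post the check mark as well
    }
    r = session.post(login_url, data=login_data)
    # Rails logins tend to redirect on success, so r.url and the presence of a
    # logout link on the landing page are better signals than the status code alone.
    return r

If the POST still fails with the token included, comparing it against the request a browser sends from the same form (devtools network tab) is a more direct next step than guessing at the utf-8 encoding.
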