Merge branch 'init_scrape'
SKotekal committed Oct 29, 2016
2 parents c437892 + e9b5b44 commit 7e845f1
Showing 2 changed files with 39 additions and 36 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,5 +1,5 @@
-# DHS Project
-Build Illinois Jobslink scraper, API, and front end for Illinois DHS.
+# Illinois JobsLink Project
+Build Illinois Jobslink scraper, API, and front end.


 ## Overview
71 changes: 37 additions & 34 deletions init_scrape.py
@@ -17,30 +17,33 @@


 def scrape():
-    url = SEARCH_URL
-    r = session.get(SEARCH_URL)
-    soup = BeautifulSoup(r.content, "html.parser")
-    # print(soup.prettify().encode('utf-8'))
+    url1 = 'https://illinoisjoblink.illinois.gov/ada/r/search/jobs?is_subsequent_search=false&page=1&per_page=250&refiners=%7B%7D&status=Active&utf8=%E2%9C%93'

-    # Print the urls for each job listing.
-    # Will need to implement stepping through these urls and scraping the
-    # relevant data.
-    listings = soup.find_all("dt")  # Finds all dt tags (elements in the list)
-    for l in listings:
-        # Finds the a tag, which will have the name and the url
-        urls = l.find_all('a')
-        for u in urls:
-            job_url = u['href']  # The href part of the tag will have the url
-            name = u.string  # The name will be in the string part of the a tag
-            id_num = u.string[u.string.find('(') + 1:u.string.find(')')]
-            # Insert the job listing into the database (only the name and url
-            # have been implemented at this point)
-            c.execute(
-                "INSERT INTO listings VALUES (?, ?, ?, 'TODO', 'TODO', 'TODO', 'TODO', 'TODO');", (name, id_num, job_url))
-    # Need to scrape for description, zipcode, wages, education, etc and
-    # put them into the DB. ---> Use above code as a model as well as what
-    # we did in the scraping workshop.
+    # Goes through all 40 pages of job listings, scrapes job url, name, and id
+    # number
+    # There are 40 pages of results, with 250 listings per page. There should
+    # be more, but it's capped here.
+    for n in range(1, 41):
+        page = url1[:87] + str(n) + url1[88:]  # Changes the page= number
+        r = requests.get(page)
+        soup = BeautifulSoup(r.content, "html.parser")
+        listings = soup.find_all("dt")  # Finds all dt tags
+        for l in listings:
+            # Finds the a tag, which will have the name and the url
+            urls = l.find_all('a')
+            for u in urls:
+                # The href part of the tag will have the url
+                job_url = u['href']
+                name = u.string  # The name will be in the string part of the a tag
+                id_num = u.string[u.string.find('(') + 1:u.string.find(')')]

+                # Insert the job listing into the database (only the name and url
+                # have been implemented at this point)
+                c.execute(
+                    "INSERT INTO listings VALUES (?, ?, ?, 'TODO', 'TODO', 'TODO', 'TODO', 'TODO');", (name, id_num, job_url))
+    # Need to scrape for description, zipcode, wages, education, etc and
+    # put them into the DB. ---> Use above code as a model as well as what
+    # we did in the scraping workshop.
     conn.commit()

 if __name__ == '__main__':
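
The slicing in page = url1[:87] + str(n) + url1[88:] works only because character 87 of that exact query string happens to be the page digit, so any edit to the URL silently breaks the pagination. Below is a minimal sketch of the same request with requests building the query string instead; it is not code from this commit, and scrape_page plus the explicit refiners/utf8 entries are illustrative assumptions.

# Sketch only, not part of this commit. Assumes the same endpoint and the same
# markup that init_scrape.py relies on: each listing is a dt tag wrapping an
# a tag whose text holds the name and an id in parentheses.
import requests
from bs4 import BeautifulSoup

SEARCH_URL = 'https://illinoisjoblink.illinois.gov/ada/r/search/jobs'


def scrape_page(page_number):
    params = {
        'is_subsequent_search': 'false',
        'page': page_number,      # requests encodes page=N, no string slicing
        'per_page': 250,
        'refiners': '{}',         # the %7B%7D in the hard-coded URL
        'status': 'Active',
        'utf8': '\u2713',         # the %E2%9C%93 check mark
    }
    r = requests.get(SEARCH_URL, params=params)
    soup = BeautifulSoup(r.content, 'html.parser')
    rows = []
    for dt in soup.find_all('dt'):
        for a in dt.find_all('a'):
            name = a.string or ''
            id_num = name[name.find('(') + 1:name.find(')')]  # id sits between parentheses
            rows.append((name, id_num, a['href']))
    return rows

Calling scrape_page(n) for n in range(1, 41) mirrors the loop in the diff, and the returned hrefs are the detail pages that the description/zipcode/wages TODO would then have to fetch and parse.
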
@@ -49,17 +52,17 @@ def scrape():
     # Code for Illinois Jobs Link Login - TODO: Fix login issues. (Try utf-8
     # encoding??)

-    # soup = BeautifulSoup(session.get(SEARCH_URL).content, "html.parser")
-    # inputs = soup.find_all('input')
-    # token = ''
-    # for t in inputs:
-    #     try:
-    #         if t['name'] == 'authenticity_token':
-    #             token = t['value']
-    #             break
-    #     except KeyError as e:
-    #         pass
-    # # print(soup.prettify().encode('utf-8'))
+    soup = BeautifulSoup(session.get(SEARCH_URL).content, "html.parser")
+    inputs = soup.find_all('input')
+    token = ''
+    for t in inputs:
+        try:
+            if t['name'] == 'authenticity_token':
+                token = t['value']
+                break
+        except KeyError as e:
+            pass
+    # print(soup.prettify().encode('utf-8'))
     # print(token)

     # login_data = dict(v_username=USER_NAME,
@@ -74,7 +77,7 @@

     # print(r.content)

-    scrape()
+    # scrape()

     # Print our entries in the database
     for row in c.execute('SELECT * FROM listings'):
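
The block uncommented above scrapes the Rails authenticity_token hidden input out of the search page, and the TODO near the top of __main__ notes that login still fails. Below is a hedged sketch of posting that token back along with the commented-out login_data; v_username is the only field name visible in the snippet, while v_password, the utf8 field, and the login endpoint are guesses that would need to be read off the real form.

# Sketch only, not part of this commit. 'session' and 'token' are the objects
# init_scrape.py already sets up; login_url must come from the login form's
# action attribute, which is not shown in this diff.
def try_login(session, login_url, token, username, password):
    login_data = {
        'v_username': username,        # field name taken from the commented-out login_data
        'v_password': password,        # guessed field name, not confirmed
        'authenticity_token': token,   # hidden input scraped above
        'utf8': '\u2713',              # Rails forms normally post the check mark as well
    }
    r = session.post(login_url, data=login_data)
    # Rails logins tend to redirect on success, so r.url and the presence of a
    # logout link on the landing page are better signals than the status code alone.
    return r

If the POST still fails with the token included, comparing it against the request a browser sends from the same form (devtools network tab) is a more direct next step than guessing at the utf-8 encoding.
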