Commit 8f589a6 (1 parent: af90c7e)
Showing 1 changed file with 242 additions and 0 deletions.
swatchbharaturban_crawler/spiders/swatchbharaturban_crawler.py
@@ -0,0 +1,242 @@
# -*- coding: utf-8 -*-
import scrapy
import requests


class SwatchbharaturbanCrawlerSpider(scrapy.Spider):
    # name of the spider
    name = "swatchbharaturban_crawler"
    # the domains where the spider is allowed to crawl
    allowed_domains = ["swachhbharaturban.gov.in"]
    # the url where the spider will start crawling
    start_urls = ['http://swachhbharaturban.gov.in/ihhl/RPTApplicationSummary.aspx']
    # delay (in seconds) before downloading a web page
    download_delay = 1

    # path to save data
    data_dir_path = "./swatchbharaturban_crawler/data/"
    raw_dir_path = "./swatchbharaturban_crawler/data/"

    # file name to save data
    filename = "swatchbharat_data.csv"

    # url to push the updated data
    url = None

    # custom settings to write the output to a file
    custom_settings = {
        # fields to be written
        'FEED_EXPORT_FIELDS': [
            "State",
            "District",
            "ULB Name",
            "Ward",
            "No. of Applications Received",
            "No. of Applications Not Verified",
            "No. of Applications Verified",
            "No. of Applications Approved",
            "No. of Applications Approved having Aadhar No.",
            "No. of Applications Rejected",
            "No. of Applications Pullback",
            "No. of Applications Closed",
            "No. of Constructed Toilet Photo",
            "No. of Commenced Toilet Photo",
            "No. of Constructed Toilet Photo through Swachhalaya",
        ],
        # format in which the output should be saved
        'FEED_FORMAT': 'csv',
        # the file path and name to store the data
        'FEED_URI': data_dir_path + filename,
    }
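
    # NOTE (not part of the original commit): on Scrapy 2.1+ the
    # FEED_FORMAT/FEED_URI pair is deprecated in favour of the FEEDS setting;
    # an equivalent sketch would replace the two keys above with
    #   'FEEDS': {data_dir_path + filename: {'format': 'csv'}},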

    def parse(self, response):
        '''
        This function is called by default, and the crawling starts from here.
        It crawls and lists all the states, then calls parse_districts()
        in order to crawl the districts of each state.
        '''

        # iterate over the list made by this css selector to get the list of all states
        for state in response.css('a.lnkButton'):

            # get the target value, to be passed in the form request
            target = state.css("::attr(href)").extract_first().split("'")[1]
            # get the state name
            state_name = state.css("::text").extract_first()
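            # e.g. an href of the form "javascript:__doPostBack('ctl00$...$lnkState','')"
            # yields target == "ctl00$...$lnkState" (illustrative id only; the
            # real ASP.NET control ids on the page differ)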

            # for every state, a FormRequest would be made to extract the districts of the state
            yield scrapy.FormRequest(
                # url where the request would be sent
                'http://swachhbharaturban.gov.in/ihhl/RPTApplicationSummary.aspx',

                # data to be sent along with the form
                formdata={
                    # state name
                    'state': state_name,
                    # information along with the target value would be sent
                    'ctl00$ScriptManager1': 'ctl00$ContentPlaceHolder1$uppnlApplication_id|' + target,
                    # the target value extracted above would be sent
                    '__EVENTTARGET': target,
                    '__EVENTARGUMENT': '',
                    '__LASTFOCUS': '',
                    # the VIEWSTATE would be extracted and passed along with the form ... a property of ASP.NET sites
                    '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                    # the VIEWSTATEGENERATOR is extracted the same way as the VIEWSTATE
                    '__VIEWSTATEGENERATOR': response.css('input#__VIEWSTATEGENERATOR::attr(value)').extract_first(),
                    '__VIEWSTATEENCRYPTED': '',
                    'ctl00$hidden1': '',
                    'ctl00$ddlLanguage': 'en-US',
                    'ctl00$ContentPlaceHolder1$HiddenField1': '',
                    'ctl00$ContentPlaceHolder1$hdnDetail': '',
                    '__ASYNCPOST': 'false'
                },

                # now parse_districts would be called, in order to crawl over the districts of the state
                callback=self.parse_districts
            )

        # NOTE: send_data() is not called here; the upload happens in
        # closed(), once the crawl and the CSV feed are complete.

    def parse_districts(self, response):
        '''
        This function is called by parse(), which crawls over the states.
        For every state, this function crawls over all the districts of the
        state, then calls parse_ulb() in order to crawl the ULBs of each district.
        '''

        # iterate over the list made by this css selector to get the list of all districts
        # (skipping the first link, which is not a district)
        for district in response.css('a.lnkButton')[1:]:

            # get the updated target value, to be passed in the form request
            target = district.css("::attr(href)").extract_first().split("'")[1]
            # get the district name
            district_name = district.css("::text").extract_first()

            # for every district, a FormRequest would be made to extract the ULBs
            # of the district ... here only the new information is added
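            # (note: FormRequest.from_response pre-fills the remaining hidden
            # form fields, __VIEWSTATE and friends, from the page's own form,
            # which is why only the changed values are passed below)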
            yield scrapy.FormRequest.from_response(
                # take the info from the previous response
                response,

                # the data which needs to be updated, sent along with the form
                formdata={
                    # information along with the target value would be sent
                    'ctl00$ScriptManager1': 'ctl00$ContentPlaceHolder1$uppnlApplication_id|' + target,
                    # refresh the EVENTTARGET with the updated value
                    '__EVENTTARGET': target,
                    'ctl00$hidden1': '',
                    'ctl00$ContentPlaceHolder1$HiddenField1': '',
                    'ctl00$ContentPlaceHolder1$hdnDetail': '',
                    # the district name would also be sent along with the request
                    'district': district_name,
                },

                # now parse_ulb would be called, in order to crawl over the ULBs of the district
                callback=self.parse_ulb,
            )

    def parse_ulb(self, response):
        '''
        This function is called by parse_districts(), which crawls over the districts.
        For every district, this function crawls over all the ULBs of the
        district, then calls parse_ward() in order to crawl the wards of each ULB.
        '''

        # iterate over the list made by this css selector to get the list of all ULBs
        # (skipping the first two links, which are not ULBs)
        for ulb in response.css('a.lnkButton')[2:]:

            # get the updated target value, to be passed in the form request
            target = ulb.css("::attr(href)").extract_first().split("'")[1]
            # get the ULB name
            ulb_name = ulb.css("::text").extract_first()

            # for every ULB, a FormRequest would be made to extract the wards of
            # the ULB ... here only the new information is added in the FormRequest
            yield scrapy.FormRequest.from_response(
                # take the info from the previous response
                response,

                # the data which needs to be updated, sent along with the form
                formdata={
                    # information along with the target value would be sent
                    'ctl00$ScriptManager1': 'ctl00$ContentPlaceHolder1$uppnlApplication_id|' + target,
                    # refresh the EVENTTARGET with the updated value
                    '__EVENTTARGET': target,
                    'ctl00$hidden1': '',
                    'ctl00$ContentPlaceHolder1$HiddenField1': '',
                    'ctl00$ContentPlaceHolder1$hdnDetail': '',
                    # the ULB name would also be sent along with the request
                    'ulb': ulb_name,
                },

                # now parse_ward would be called, in order to crawl over the wards of the ULB
                callback=self.parse_ward,
            )

    def parse_ward(self, response):
        '''
        This function is called by parse_ulb(), which crawls over the ULBs.
        For every ULB, this function crawls over all the wards of the ULB.
        This is the lowest level of the data; all the data is collected here.
        '''

        # this css selector would be used to get the list of various names defined on the page
        name_list = response.css("a.lnkButton::text").extract()

        # get the info from the name_list
        state_name = name_list[0]
        district_name = name_list[1]
        ulb_name = name_list[2]

        # this xpath selector would be used to select all the rows, i.e. the complete ward rows
        table = response.xpath("//span[contains(@id,'WARD_NO')]/../..")
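        # (the "/../.." climbs from each WARD_NO span to its parent cell and
        # then to the enclosing table row, so `table` holds one selector per ward row)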

        # iterate over the table rows to get the information about every ward
        for row in table:

            # make a dictionary for every row
            item = {
                # state name ... extracted earlier
                "State": state_name,
                # district name ... extracted earlier
                "District": district_name,
                # ULB name ... extracted earlier
                "ULB Name": ulb_name,
                # Ward name ... extracted using a CSS selector ... the cell that contains 'WARD_NO' in the id
                "Ward": row.css("[id*='WARD_NO']::text").extract_first(),
                # Applications Received ... the cell that contains 'APP_RECEIVED' in the id
                "No. of Applications Received": row.css("[id*='APP_RECEIVED']::text").extract_first(),
                # Applications Not Verified ... the cell that contains 'APP_VERIFIEDNOT' in the id
                "No. of Applications Not Verified": row.css("[id*='APP_VERIFIEDNOT']::text").extract_first(),
                # Applications Verified ... the cell that contains 'APP_VERIFIED' in the id
                "No. of Applications Verified": row.css("[id*='APP_VERIFIED']::text").extract_first(),
                # Applications Approved ... the cell that contains 'APP_APPROVED' in the id
                "No. of Applications Approved": row.css("[id*='APP_APPROVED']::text").extract_first(),
                # Applications Approved having Aadhar No. ... the cell that contains 'APP_APPROVED_AADHAR' in the id
                "No. of Applications Approved having Aadhar No.": row.css("[id*='APP_APPROVED_AADHAR']::text").extract_first(),
                # Applications Rejected ... the cell that contains 'APP_REJECTED' in the id
                "No. of Applications Rejected": row.css("[id*='APP_REJECTED']::text").extract_first(),
                # Applications Pullback ... the cell that contains 'APP_PULLBACK' in the id
                "No. of Applications Pullback": row.css("[id*='APP_PULLBACK']::text").extract_first(),
                # Applications Closed ... the cell that contains 'APP_CLOSED' in the id
                "No. of Applications Closed": row.css("[id*='APP_CLOSED']::text").extract_first(),
                # Constructed Toilet Photo ... the cell that contains 'APP_CTP' in the id
                "No. of Constructed Toilet Photo": row.css("[id*='APP_CTP']::text").extract_first(),
                # Commenced Toilet Photo ... the cell that contains 'APP_INTER' in the id
                "No. of Commenced Toilet Photo": row.css("[id*='APP_INTER']::text").extract_first(),
                # Constructed Toilet Photo through Swachhalaya ... the cell that contains 'APP_CTSWACHHALAYA' in the id
                "No. of Constructed Toilet Photo through Swachhalaya": row.css("[id*='APP_CTSWACHHALAYA']::text").extract_first(),
            }
            yield item

    def closed(self, reason):
        '''
        Called by Scrapy once the spider finishes, by which point the CSV
        feed has been written; the upload is triggered from here.
        '''
        self.send_data()

    def send_data(self):
        '''
        This function is used to send the data after the crawling is complete.
        '''

        # check if the url is not None
        if self.url is not None:
            # open the file and POST it over the url
            with open(self.data_dir_path + self.filename, 'rb') as f:
                requests.post(self.url, data=f)
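

# Usage sketch (assumption, not part of the original commit): run from the
# Scrapy project root; -a sets the spider's `url` attribute, so send_data()
# knows where to POST the CSV once the crawl finishes, e.g.
#   scrapy crawl swatchbharaturban_crawler -a url=http://example.com/upload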