# Scrape updated data from the Charity Commission website
# Tom Wallace
# 5/10/18
# This file reads in the JSON from '1.1 CreatingSubsetofData_np.py' as a pandas dataframe and then scrapes the 2018 income and expenditure of each charity from the Charity Commission website.
# It then creates a new JSON file with the new data which can be easily appended onto the original data, as shown in '4. Combine_UKDA_scrape_Twitter_tw.py'.
################################# Import packages #################################
from bs4 import BeautifulSoup as soup
import requests
import time
from time import sleep
import datetime
import random
import json
import pandas as pd
import os.path
import re
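# For reference, the third-party dependencies under their standard install names:
#   pip install beautifulsoup4 requests pandas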
################################# Functions #################################
def moneyconverter(money): # Convert money strings like '£16.5M' to 16500000 so the figure can be held as an int
    figure = money.text.replace('£', '').replace(',', '') # Remove the currency symbol and any thousands separators
    if 'M' in figure: # Figures are abbreviated in millions 'M' or thousands 'K'
        moneytext = str(round(float(figure.replace('M', '')) * 1000000)) # Parse the abbreviated figure numerically (handles values with or without a decimal point)
    elif 'K' in figure:
        moneytext = str(round(float(figure.replace('K', '')) * 1000))
    else:
        print('*** WARN! Charity number: ', charno, 'income format error ***') # Warn the user if there is a format problem; charno is the module-level loop variable
        moneytext = '-99' # Sentinel so the later int() conversion does not fail, consistent with the missing-value convention used elsewhere
    return moneytext
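# Illustrative conversions (assuming the site's abbreviated money format):
#   '£16.5M' -> '16500000',  '£5M' -> '5000000',  '£2.3K' -> '2300'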
def textformatter(text, string): # Strip text of surrounding whitespace and divide by the return character to form a list
    text = text.replace(string, '')
    text = text.replace('\n', '')
    text = text.replace('\t', '')
    text = text.replace('/', ' or ') # Replace the forward slash, which is a JSON-relevant character, with 'or'
    text = text.split('\r') # Make the block of text into a list, separating elements by the return character
    text = list(filter(None, text)) # There are blank lines in the original block, so this deletes empty elements from the list
    return text
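# Illustrative example (assumed page text; the live markup may differ):
#   textformatter('What the charity does\n\tEducation\r\rHealth\r', 'What the charity does')
#   -> ['Education', 'Health']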
def sleepytime(incometext_list, exptext_list, charno_list, finyeartext_list, trustee_list, staff_list, volunteers_list, companyno_list, survived_list, website_list, what_list, who_list, how_list): # Save a backup of the data and sleep the scraper at an interval set in the main program
    incometext_list = [int(x) for x in incometext_list]
    exptext_list = [int(x) for x in exptext_list]
    charno_list_dump = charno_list[:count] # Charity number is a bit different as it is not appended each time, so this selects it from the full list based on how many charities the run has processed so far
    dicto_part = {'ccnum':charno_list_dump, 'updated_income':incometext_list, 'Updated_expenditure':exptext_list, 'Financial_year_ending':finyeartext_list, 'Trustees':trustee_list, 'Staff':staff_list, 'Volunteers':volunteers_list, 'Company number':companyno_list, \
                  'Survived':survived_list, 'Website':website_list, 'What the charity does':what_list, 'Who the charity helps':who_list, 'How the charity works':how_list}
    df_part = pd.DataFrame(dicto_part)
    df_part.set_index(['ccnum'], inplace=True)
    starttime_str = starttime.strftime("%Y-%m-%d")
    dumppath = './Partial_dumps'
    if not os.path.exists(dumppath): # If that path doesn't exist then make it
        os.makedirs(dumppath)
    dumpfilepath = dumppath + '/partial_dump' + starttime_str + '.json'
    df_part.to_json(path_or_buf=dumpfilepath, orient='index') # Save the dump dataframe out to a JSON
    sleeptime = 100 + random.randint(1, 60) # Generate a sleep time with some randomness to make the scrape look less systematic
    longsleeptime = datetime.datetime.now()
    sleeptime1 = time.strftime('%H:%M:%S', time.gmtime(sleeptime))
    resumetime = longsleeptime + datetime.timedelta(0, sleeptime)
    print('Records up to processing number', count, 'dumped to disk')
    print('Sleeping for', sleeptime1, 'at', longsleeptime.strftime("%H:%M:%S"), 'resuming at', resumetime.strftime("%H:%M:%S")) # Print the time info
    print(' ')
    sleep(sleeptime) # Sleep to reduce load on the remote servers
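# Note: the dump filename only carries the run's start date, so a run started on, say,
# 2018-10-05 writes './Partial_dumps/partial_dump2018-10-05.json' and a second run on
# the same day overwrites that day's earlier partial dump.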
################################# Scraper function #################################
def scrapeorg(charno): # This defines the main charity scraper function - it gets looped over, so everything here is at the individual charity level
    charno = str(charno) # Turn the charity number into a string so it can be concatenated into a URL
    pubregister = 'http://beta.charitycommission.gov.uk/charity-details/?regid=' # This is the base URL for the Charity Commission's database
    webbaddress = pubregister + charno + '&subid=0' # Generate a variable holding each individual charity page by concatenating strings
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} # Spoof the user-agent of the request to return the content seen by Chrome
    rorg = 0
    errorcount = 0
    while rorg == 0: # This block makes the scraper robust to disconnection: retry until a response is received
        try:
            rorg = requests.get(webbaddress, headers=headers)
        except:
            print('Disconnected, sleeping for 5 seconds')
            sleep(5)
            errorcount = errorcount + 1
            if errorcount == 3:
                print('Multiple disconnections, sleeping for 100 mins')
                sleep(100*60)
                errorcount = 0
    try: # This try block wraps the entire parse. If the charity has been removed it still has a page which is mostly blank, so the parse will fail and the charity will be given missing values instead
        html_org = rorg.text # Get the text elements of the page
        soup_org = soup(html_org, 'html.parser') # Parse the text as a BeautifulSoup object
        money = soup_org.find_all("span", class_="big-money") # Find all of the large money text
        income = money[0] # Income is always the first element
        incometext = moneyconverter(income) # Pass to the parsing function
        expend = money[1] # Expenditure is always the second element
        exptext = moneyconverter(expend)
        name1 = soup_org.find("div", class_="charity-heading-panel")
        name = name1.find("h1")
        nametext = name.text
        finyear1 = soup_org.find("div", class_="charity-heading-panel")
        finyear = finyear1.find("em")
        finyeartext = finyear.text
        try: # Not all charities have this information, so it has its own try/except block which can generate missing values if it is not found rather than tripping the main try/except
            whatwhohow = soup_org.find('div', id='plWhatWhoHow', class_='detail-panel') # Need to go deeper into the tags to get these
            whatwhohow = whatwhohow.find_all('div', class_='detail-panel-wrap')
            what = whatwhohow[0].text # 'What' is always element 0
            what = textformatter(what, 'What the charity does')
            who = whatwhohow[1].text # 'Who' is element 1
            who = textformatter(who, 'Who the charity helps')
            how = whatwhohow[2].text # 'How' is element 2
            how = textformatter(how, 'How the charity works')
        except:
            what = '.' # Set missing to a period for now; numpy missing values are applied in another script
            who = '.'
            how = '.'
            print('*** WARN! Charity number: ', charno, 'Area info error ***')
        try:
            staff = soup_org.find("div", class_="detail-25")
            staff = staff.find('span', class_='small-header', text='Employees')
            staff = staff.previous_sibling
            staff = staff.text
            staff = int(staff)
        except:
            staff = 0
        try:
            trustee = soup_org.find("div", class_="detail-25")
            trustee = trustee.find('span', class_='small-header', text='Trustees')
            trustee = trustee.previous_sibling
            trustee = trustee.text
            trustee = int(trustee)
        except:
            trustee = 0
        try:
            volunteers = soup_org.find("div", class_="detail-25")
            volunteers = volunteers.find('span', class_='small-header', text='Volunteers')
            volunteers = volunteers.previous_sibling
            volunteers = volunteers.text
            volunteers = int(volunteers)
        except:
            volunteers = 0
        try:
            companyno = soup_org.find("div", id='ContentPlaceHolderDefault_cp_content_ctl00_CharityDetails_4_TabContainer1_tpOverview_plCompanyNumber', class_="charity-no") # Grab the company ID for charities which have it
            companyno = companyno.text
            companyno = companyno.replace('Company no.', '')
            companyno = companyno.replace('\n', '')
            companyno = companyno.replace('\t', '')
            companyno = companyno.replace('\r', '')
            companyno = companyno.replace(' ', '')
        except:
            companyno = -99 # Set missing to an int so the variable can be kept as an int when the charities are combined together at the end
        try:
            website = soup_org.find_all("div", class_="detail-33")
            website = website[1]
            website = website.find('a', href=re.compile('http'))
            website = website.text
        except:
            website = '.'
        survived = 1 # Track whether the charity survived - if it has an up-to-date record it survived; if this function falls through to 'except' it died
        statustext = "Charity found" # Status text for the output window if the above is successful
        return (incometext, nametext, webbaddress, finyeartext, statustext, exptext, survived, trustee, staff, volunteers, what, who, how, companyno, website) # Push the generated variables back to the main program
    except Exception as e: # If the parse fails, generate fault information and set variables to missing values - this charity died or was removed
        typetext = str(type(e)) # This indicates the type of error
        etext = str(e) # This is the detail of the error
        incometext, companyno, exptext = -99, -99, -99
        trustee, staff, volunteers, survived = 0, 0, 0, 0
        nametext, finyeartext, what, who, how, website = ".", ".", ".", ".", ".", "."
        statustext = "*** WARN! " + typetext + ": " + etext + " ***" # Format the status to be shown to the user
        return (incometext, nametext, webbaddress, finyeartext, statustext, exptext, survived, trustee, staff, volunteers, what, who, how, companyno, website) # Push the missing-value variables back to the main program
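# Illustrative call (the charity number is a placeholder, not taken from the input data):
#   incometext, nametext, webbaddress, *rest = scrapeorg('123456')
# On success the 15-tuple carries the parsed figures; on failure it carries the
# missing-value sentinels set in the except block above.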
################################# Main program #################################
starttime = datetime.datetime.now() # Grab the date and time the run started
print(' ') # Whitespace used to make the output window more readable
print('>>> Run started at', starttime.strftime("%Y-%m-%d %H:%M:%S"), ' <<<') # Header of the output, with the start time
print(' ')
count = 0 # Create the count variable, which is used in the validation at the end of the program run
df1 = pd.read_json(path_or_buf='charity_oneyear.json', orient='index')
charno_list = df1.index.values.tolist()
charno_list_len = len(charno_list)
incometext_list, finyeartext_list, statustext_list, exptext_list, trustee_list, staff_list, volunteers_list, survived_list, what_list, who_list, how_list, companyno_list, \
    website_list = [], [], [], [], [], [], [], [], [], [], [], [], []
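# Note: nametext and webbaddress are returned for the console output below but are not
# accumulated in lists, so they do not appear in the output JSON.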
for charno in charno_list:
    incometext, nametext, webbaddress, finyeartext, statustext, exptext, survived, trustee, staff, volunteers, what, who, how, companyno, website = scrapeorg(charno) # Feed the function each charity number in turn and pick up its returns
    for var, var_list in zip([incometext, finyeartext, statustext, exptext, survived, trustee, staff, volunteers, what, who, \
                              how, companyno, website], [incometext_list, finyeartext_list, statustext_list, exptext_list, survived_list, trustee_list, staff_list, volunteers_list, what_list, who_list, how_list, companyno_list, website_list]):
        var_list.append(var)
    count = count + 1
    print('---------------------------------------------------------------------------')
    runtime = datetime.datetime.now()
    percdone = count/charno_list_len*100 # Calculate the percentage complete (count has already been incremented for this charity)
    print('Processing number:', count, ' | ', 'Time stamp:', runtime.strftime("%H:%M:%S"), ' | ', 'Elapsed time:', runtime-starttime, ' | ', "%.2f" % percdone, '% done')
    print('+++', statustext, '+++')
    print(webbaddress)
    print('Charity number: ', charno)
    print('Name: ', nametext)
    print('---------------------------------------------------------------------------')
    print(' ')
    if count % 10 == 0: # To avoid overloading the server or getting banned, sleep every 10 charities scraped. Through experimentation this was deemed a better strategy than a shorter sleep on every run. This block also saves the current results out in case the run fails later
        sleepytime(incometext_list, exptext_list, charno_list, finyeartext_list, trustee_list, staff_list, volunteers_list, companyno_list, survived_list, website_list, what_list, who_list, how_list)
error_count = survived_list.count(0) # Count how many charities did not survive, to generate the end-of-run validation stats
incometext_list = [int(x) for x in incometext_list] # Turn the lists of number strings into ints
exptext_list = [int(x) for x in exptext_list]
dicto = {'ccnum':charno_list, 'updated_income':incometext_list, 'Updated_expenditure':exptext_list, 'Financial_year_ending':finyeartext_list, 'Trustees':trustee_list, 'Staff':staff_list, 'Volunteers':volunteers_list, 'Company number':companyno_list, \
         'Survived':survived_list, 'Website':website_list, 'What the charity does':what_list, 'Who the charity helps':who_list, 'How the charity works':how_list} # Store the new variables to be appended to the data as a dictionary
df2 = pd.DataFrame(dicto)
df2.set_index(['ccnum'], inplace=True)
df2.to_json(path_or_buf='new_scrape_data.json', orient='index')
finishtime = datetime.datetime.now()
deltatime = finishtime - starttime
print('>>> Finished run at', finishtime.strftime("%H:%M:%S"), '|', 'Run took', deltatime, '|', count, 'of', charno_list_len, 'charities processed <<<') # Show the time at the end of the run and how many charities were processed - this should always equal the input n since the program is robust to errors
print('>>>', error_count, 'returned no financial information <<<') # Show the number of charities which did not have information available - these are dead, Jim
print('>>>', 100-((error_count/charno_list_len)*100), '% of records were successfully updated <<<') # Using the error count, calculate the percentage of records which were successfully retrieved and updated
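# The resulting 'new_scrape_data.json' (indexed by ccnum) is the file that gets appended
# onto the original UKDA data in '4. Combine_UKDA_scrape_Twitter_tw.py', per the header above.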