-
Notifications
You must be signed in to change notification settings - Fork 0
/
faq-scraper.py
64 lines (49 loc) · 1.8 KB
/
faq-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import unicodedata
import pandas as pd
import requests
# Here to add relative link fix later
from urllib.parse import urljoin
# Adds time stamp to output filename
import time
# Timestamp for optional use in the output filename (YYYYMMDD-HHMMSS).
timestr = time.strftime("%Y%m%d-%H%M%S")

# Fetch the FDA COVID-19 FAQ page. A timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP
# error (404/500/...) into an exception instead of silently parsing an
# error page as if it were the FAQ.
r = requests.get(
    'https://www.fda.gov/emergency-preparedness-and-response/coronavirus-disease-2019-covid-19/coronavirus-disease-2019-covid-19-frequently-asked-questions',
    timeout=30,
)
r.raise_for_status()

from bs4 import BeautifulSoup

soup = BeautifulSoup(r.text, 'html.parser')
# Bootstrap-style accordion markup: panel titles carry the questions,
# collapsible panel bodies carry the answers.
questions = soup.select('.panel-heading .panel-title')
answers = soup.select('.panel-collapse .panel-body')
# Removes attributes from html output, except img and a attributes
def remove_attributes(answers):
    """Strip presentation attributes from the given answer elements.

    Every HTML attribute is removed from every tag inside each answer,
    except ``src``/``href`` on ``<a>`` and ``<img>`` tags, so the exported
    HTML keeps links and images but drops classes, ids, styles, etc.

    The elements are mutated in place; the same list is returned for
    caller convenience.

    NOTE(review): the original implementation walked the global ``soup``
    instead of the ``answers`` it was given, mutating the whole document
    as a side effect. This version operates only on the elements actually
    passed in — output-equivalent for this script, since only the answer
    HTML is ever serialized (questions use ``.text`` only).
    """
    keep_tags = ('a', 'img')
    keep_attrs = ('src', 'href')
    for answer in answers:
        # find_all(True) yields every descendant tag; include the answer
        # container itself so its own attributes are stripped too.
        for tag in [answer, *answer.find_all(True)]:
            if tag.name in keep_tags:
                # Copy the keys first because we delete from tag.attrs
                # while iterating over them.
                for attr in list(tag.attrs):
                    if attr not in keep_attrs:
                        del tag.attrs[attr]
            else:
                tag.attrs = {}
    return answers
answers = remove_attributes(answers)

# Each panel title wraps its question text in an <a> accordion toggle;
# extract just the text.
question_list = [question.find('a').text for question in questions]

# Serialize each answer back to an HTML string, rewriting root-relative
# links ("/...") to absolute fda.gov URLs so they resolve outside the site.
# (The vestigial `if answer is not None` guard is gone: elements returned
# by soup.select() are never None.)
answer_list = [
    str(answer).replace('href="/', 'href="https://www.fda.gov/')
    for answer in answers
]

# Pair questions with answers positionally; zip stops at the shorter list,
# so a mismatched panel drops off the end rather than misaligning pairs.
merged_qa = list(zip(question_list, answer_list))

df = pd.DataFrame(merged_qa, columns=['Question', 'Answer'])
# NOTE(review): timestr is computed above for a timestamped filename
# (see the commented-out line below), but the output name is hard-coded;
# kept as-is to preserve existing behavior.
# df.to_csv(r'csv/fda-faq-' + timestr + '.csv', index=False, header=True)
df.to_csv('raksha.csv', index=False, header=True)