# edgar_scraper.py
# import the selenium libraries
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import re
from bs4 import BeautifulSoup
import csv
import requests
import os

# Utility function
def save_to_txt(doc10k, company, date, folder_path):
    """Saves data to a text file in the specified folder."""
    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)
    # Construct the full file path
    file_name = "doc10k_{}_{}.txt".format(company, date)  # Create a file name
    file_path = os.path.join(folder_path, file_name)
    # Open the file in write mode and write the document content
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(doc10k)
    print(f"Saved {file_name} to {file_path}.")


## Class for EDGAR
class EDGAR_scraper:
    def __init__(self):
        options = EdgeOptions()
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Edge(options=options)
        self.edgar_search_results = []  # search-results URLs, one per company
        self.edgar_search_index = []    # (company, filing index URL) pairs
        self.companies = []
        self.filing_doc_htmls = []

    '''
    Get the list of companies that we want to scrape 10-Ks for.
    '''
    def get_companies(self, file_name):
        company_info_df = pd.read_csv(file_name)
        ciks = company_info_df['CIK Number'].tolist()[:2000]  # limit to the first 2000 companies
        self.edgar_search_results = [
            'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-K&dateb=&owner=include&count=100&search_text='.format(cik)
            for cik in ciks
        ]
        self.companies = company_info_df['Company Name'].tolist()
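
    # The input file is assumed (only the column names above are confirmed by this code; the
    # rows here are illustrative) to be a CSV along these lines, one row per company:
    #
    #   Company Name,CIK Number
    #   Apple Inc,320193
    #   Microsoft Corp,789019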

    '''
    The processes below get the index pages (the first step in actually getting the 10-Ks).
    They output a CSV file that contains the links to the pages holding the links to each 10-K.
    '''
    def get_index_pages(self):
        for (search_results, company) in zip(self.edgar_search_results, self.companies):
            self.get_index_page(search_results, company)

    def get_index_page(self, search_results, company):
        # load the EDGAR search-results page
        try:
            self.driver.get(search_results)
            # wait for the website to load up
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'tableFile2')))
            # Collect the href behind each filing's "Documents" button
            filings = self.driver.find_element(By.CLASS_NAME, 'tableFile2').find_elements(By.ID, "documentsbutton")
            for filing in filings:
                try:
                    self.edgar_search_index.append((company, filing.get_attribute('href')))
                except Exception:
                    self.edgar_search_index.append((company, ' '))
        except Exception:
            self.edgar_search_index.append((company, ' '))

    def process_index_pages(self, identifier):
        df = pd.DataFrame(self.edgar_search_index, columns=['Company', 'Index URL'])
        df.to_csv("Company Index Urls {}.csv".format(identifier))

    '''
    The processes below get the 10-K HTML links from each filing index page.
    '''
    def get_10k_htmls(self, index_urls):
        dates = []
        html_urls = []
        for url in index_urls:
            try:
                self.driver.get(url)
                # wait for the website to load up
                WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'tableFile')))
                try:
                    # find the filing date first
                    date_sections = self.driver.find_element(By.CLASS_NAME, 'formGrouping')
                    text = date_sections.text
                    date = re.findall(r'\d{4}-\d{2}-\d{2}', text)
                    dates.append(date[0])
                except Exception:
                    dates.append(' ')
                # now get the html file
                try:
                    # Get the first table (should have the 10-K link)
                    doc_format_table = self.driver.find_element(By.CLASS_NAME, 'tableFile')
                    # Walk the rows until the 10-K entry is found
                    rows = doc_format_table.find_elements(By.TAG_NAME, 'tr')
                    for row in rows:
                        if '10-K' in row.text:
                            link = row.find_element(By.TAG_NAME, 'a')
                            link_url = link.get_attribute("href")
                            print(f'Link found: {link_url}')
                            html_urls.append(link_url)
                            break
                    else:
                        # keep html_urls aligned with dates when no 10-K row is found
                        html_urls.append(' ')
                except Exception:
                    html_urls.append(' ')
            except Exception:
                dates.append(' ')
                html_urls.append(' ')
        return html_urls, dates

    '''
    The processes below get the text from the HTML page that holds the 10-K.
    '''
    def get_10k(self, urls_10k, dates, company_names):
        for (url, date, company) in zip(urls_10k, dates, company_names):
            try:
                self.driver.get(url)
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'body'))  # Wait until the body is loaded
                )
                # Extract the rendered page source
                html_content = self.driver.page_source
                # Use BeautifulSoup to parse the HTML
                soup = BeautifulSoup(html_content, 'html.parser')
                # Extract all text
                doc10k = soup.get_text(separator='\n', strip=True)
                # Write to a file
                save_to_txt(doc10k, company, date, 'docs10k')
            except Exception:
                print('No 10-K data found for {}.'.format(company))


def main():
    # Identifier to make it easier to run the code on a new set of data
    identifier = 1
    # Year you are interested in (e.g. 2024 would be '24')
    yr = '24'
    # initialize the EDGAR scraper
    edgar = EDGAR_scraper()

    '''
    1. First get the index pages
    '''
    try:
        # Attempt to open the file in read mode
        with open('Company Index Urls {}.csv'.format(identifier), mode='r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            print("CSV file was found.")
    except FileNotFoundError:
        print("The file 'Company Index Urls {}.csv' does not exist. Creating a new CSV file.".format(identifier))
        edgar.get_companies('Company Names.csv')
        edgar.get_index_pages()
        edgar.process_index_pages(identifier)
        time.sleep(8)

    '''
    2. Get the HTML links to the 10-Ks from the index pages
    '''
    try:
        # Attempt to open the file in read mode
        with open('Company HTML Urls {}.csv'.format(identifier), mode='r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            print("CSV file content:")
            for row in reader:
                print(row)
    except FileNotFoundError:
        print("The file 'Company HTML Urls {}.csv' does not exist. Creating a new CSV file.".format(identifier))
        index_df = pd.read_csv('Company Index Urls {}.csv'.format(identifier))
        index_df = index_df.dropna()
        # EDGAR accession numbers embed the last two digits of the filing year (e.g. '-24-'),
        # so filter the index URLs on that fragment and keep one filing per company
        index_df = index_df[index_df['Index URL'].str.contains('-{}-'.format(yr))].drop_duplicates(subset=['Company'])
        index_urls = index_df['Index URL'].tolist()
        html_urls, dates = edgar.get_10k_htmls(index_urls)
        index_df['HTML URL'] = html_urls
        index_df['Filing Date'] = dates
        index_df.to_csv('Company HTML Urls {}.csv'.format(identifier))
        time.sleep(8)

    '''
    3. Get the 10-K text, looking only at filings whose date falls in the specified year
    '''
    try:
        # Attempt to open the file in read mode
        with open('Company 10-K {}.csv'.format(identifier), mode='r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            print("CSV file content:")
            for row in reader:
                print(row)
    except FileNotFoundError:
        print("The file 'Company 10-K {}.csv' does not exist. Creating a new CSV file.".format(identifier))
        html_df = pd.read_csv('Company HTML Urls {}.csv'.format(identifier)).drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
        # Filter for the desired years
        year1, year2 = '2024', '2024'
        mask = html_df.apply(lambda x: (year1 in str(x['Filing Date'])) or (year2 in str(x['Filing Date'])), axis=1)
        html_df = html_df[mask]
        # Strip the inline XBRL viewer prefix (/ix?doc=) so the raw HTML document is fetched
        html_df['HTML URL'] = html_df['HTML URL'].apply(lambda x: re.sub(r'/ix\?doc=', '', x))
        # Get the 10-Ks and write them to txt files
        html_url = html_df['HTML URL'].tolist()
        dates = html_df['Filing Date'].tolist()
        company_names = html_df['Company'].tolist()
        edgar.get_10k(html_url, dates, company_names)
        html_df.to_csv('Company 10-K {}.csv'.format(identifier))


if __name__ == '__main__':
    main()
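
# Running this module end-to-end (python edgar_scraper.py) should, per the code above, produce
# 'Company Index Urls 1.csv', 'Company HTML Urls 1.csv', 'Company 10-K 1.csv', plus one text
# file per filing under docs10k/. It assumes a 'Company Names.csv' with 'Company Name' and
# 'CIK Number' columns in the working directory, and a working Microsoft Edge / Edge WebDriver setup.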