# edgar_scraper.py
# import the selenium libraries
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import re
from bs4 import BeautifulSoup
import csv
import requests
import os

# Utility function
def save_to_txt(doc10k, company, date, folder_path):
    """Saves data to a text file in the specified folder."""
    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)
    # Construct the full file path
    file_name = "doc10k_{}_{}.txt".format(company, date)  # Create a file name
    file_path = os.path.join(folder_path, file_name)
    # Open the file in write mode and write the document content
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(doc10k)
    print(f"Saved {file_name} to {file_path}.")


## Class for EDGAR
class EDGAR_scraper:
    def __init__(self):
        options = EdgeOptions()
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Edge(options=options)
        self.edgar_search_results = []  # search-results URLs, one per company
        self.edgar_search_index = []    # (company, filing index URL) pairs
        self.companies = []
        self.filing_doc_htmls = []

    '''
    Get the list of companies that we want to scrape 10-Ks for.
    '''
    def get_companies(self, file_name):
        company_info_df = pd.read_csv(file_name)
        ciks = company_info_df['CIK Number'].tolist()[:2000]  # limit to the first 2000 companies
        self.edgar_search_results = [
            'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-K&dateb=&owner=include&count=100&search_text='.format(cik)
            for cik in ciks
        ]
        self.companies = company_info_df['Company Name'].tolist()
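
    # The input file is assumed (only the column names above are confirmed by this code; the
    # rows here are illustrative) to be a CSV along these lines, one row per company:
    #
    #   Company Name,CIK Number
    #   Apple Inc,320193
    #   Microsoft Corp,789019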

    '''
    The processes below get the index pages (the first step in actually getting the 10-Ks).
    They output a CSV file that contains the links to the pages holding the links to each 10-K.
    '''
    def get_index_pages(self):
        for (search_results, company) in zip(self.edgar_search_results, self.companies):
            self.get_index_page(search_results, company)

    def get_index_page(self, search_results, company):
        # load the EDGAR search-results page
        try:
            self.driver.get(search_results)
            # wait for the website to load up
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'tableFile2')))
            # Collect the href behind each filing's "Documents" button
            filings = self.driver.find_element(By.CLASS_NAME, 'tableFile2').find_elements(By.ID, "documentsbutton")
            for filing in filings:
                try:
                    self.edgar_search_index.append((company, filing.get_attribute('href')))
                except Exception:
                    self.edgar_search_index.append((company, ' '))
        except Exception:
            self.edgar_search_index.append((company, ' '))

    def process_index_pages(self, identifier):
        df = pd.DataFrame(self.edgar_search_index, columns=['Company', 'Index URL'])
        df.to_csv("Company Index Urls {}.csv".format(identifier))

    '''
    The processes below get the 10-K HTML links from each filing index page.
    '''
    def get_10k_htmls(self, index_urls):
        dates = []
        html_urls = []
        for url in index_urls:
            try:
                self.driver.get(url)
                # wait for the website to load up
                WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'tableFile')))
                try:
                    # find the filing date first
                    date_sections = self.driver.find_element(By.CLASS_NAME, 'formGrouping')
                    text = date_sections.text
                    date = re.findall(r'\d{4}-\d{2}-\d{2}', text)
                    dates.append(date[0])
                except Exception:
                    dates.append(' ')
                # now get the html file
                try:
                    # Get the first table (should have the 10-K link)
                    doc_format_table = self.driver.find_element(By.CLASS_NAME, 'tableFile')
                    # Walk the rows until the 10-K entry is found
                    rows = doc_format_table.find_elements(By.TAG_NAME, 'tr')
                    for row in rows:
                        if '10-K' in row.text:
                            link = row.find_element(By.TAG_NAME, 'a')
                            link_url = link.get_attribute("href")
                            print(f'Link found: {link_url}')
                            html_urls.append(link_url)
                            break
                    else:
                        # keep html_urls aligned with dates when no 10-K row is found
                        html_urls.append(' ')
                except Exception:
                    html_urls.append(' ')
            except Exception:
                dates.append(' ')
                html_urls.append(' ')
        return html_urls, dates

    '''
    The processes below get the text from the HTML page that holds the 10-K.
    '''
    def get_10k(self, urls_10k, dates, company_names):
        for (url, date, company) in zip(urls_10k, dates, company_names):
            try:
                self.driver.get(url)
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'body'))  # Wait until the body is loaded
                )
                # Extract the rendered page source
                html_content = self.driver.page_source
                # Use BeautifulSoup to parse the HTML
                soup = BeautifulSoup(html_content, 'html.parser')
                # Extract all text
                doc10k = soup.get_text(separator='\n', strip=True)
                # Write to a file
                save_to_txt(doc10k, company, date, 'docs10k')
            except Exception:
                print('No 10-K data found for {}.'.format(company))


def main():
    # Identifier to make it easier to run the code on a new set of data
    identifier = 1
    # Year you are interested in (e.g. 2024 would be '24')
    yr = '24'
    # initialize the EDGAR scraper
    edgar = EDGAR_scraper()

    '''
    1. First get the index pages
    '''
    try:
        # Attempt to open the file in read mode
        with open('Company Index Urls {}.csv'.format(identifier), mode='r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            print("CSV file was found.")
    except FileNotFoundError:
        print("The file 'Company Index Urls {}.csv' does not exist. Creating a new CSV file.".format(identifier))
        edgar.get_companies('Company Names.csv')
        edgar.get_index_pages()
        edgar.process_index_pages(identifier)
        time.sleep(8)

    '''
    2. Get the HTML links to the 10-Ks from the index pages
    '''
    try:
        # Attempt to open the file in read mode
        with open('Company HTML Urls {}.csv'.format(identifier), mode='r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            print("CSV file content:")
            for row in reader:
                print(row)
    except FileNotFoundError:
        print("The file 'Company HTML Urls {}.csv' does not exist. Creating a new CSV file.".format(identifier))
        index_df = pd.read_csv('Company Index Urls {}.csv'.format(identifier))
        index_df = index_df.dropna()
        # EDGAR accession numbers embed the last two digits of the filing year (e.g. '-24-'),
        # so filter the index URLs on that fragment and keep one filing per company
        index_df = index_df[index_df['Index URL'].str.contains('-{}-'.format(yr))].drop_duplicates(subset=['Company'])
        index_urls = index_df['Index URL'].tolist()
        html_urls, dates = edgar.get_10k_htmls(index_urls)
        index_df['HTML URL'] = html_urls
        index_df['Filing Date'] = dates
        index_df.to_csv('Company HTML Urls {}.csv'.format(identifier))
        time.sleep(8)

    '''
    3. Get the 10-K text, looking only at filings whose date falls in the specified year
    '''
    try:
        # Attempt to open the file in read mode
        with open('Company 10-K {}.csv'.format(identifier), mode='r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            print("CSV file content:")
            for row in reader:
                print(row)
    except FileNotFoundError:
        print("The file 'Company 10-K {}.csv' does not exist. Creating a new CSV file.".format(identifier))
        html_df = pd.read_csv('Company HTML Urls {}.csv'.format(identifier)).drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
        # Filter for the desired years
        year1, year2 = '2024', '2024'
        mask = html_df.apply(lambda x: (year1 in str(x['Filing Date'])) or (year2 in str(x['Filing Date'])), axis=1)
        html_df = html_df[mask]
        # Strip the inline XBRL viewer prefix (/ix?doc=) so the raw HTML document is fetched
        html_df['HTML URL'] = html_df['HTML URL'].apply(lambda x: re.sub(r'/ix\?doc=', '', x))
        # Get the 10-Ks and write them to txt files
        html_url = html_df['HTML URL'].tolist()
        dates = html_df['Filing Date'].tolist()
        company_names = html_df['Company'].tolist()
        edgar.get_10k(html_url, dates, company_names)
        html_df.to_csv('Company 10-K {}.csv'.format(identifier))


if __name__ == '__main__':
    main()
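
# Running this module end-to-end (python edgar_scraper.py) should, per the code above, produce
# 'Company Index Urls 1.csv', 'Company HTML Urls 1.csv', 'Company 10-K 1.csv', plus one text
# file per filing under docs10k/. It assumes a 'Company Names.csv' with 'Company Name' and
# 'CIK Number' columns in the working directory, and a working Microsoft Edge / Edge WebDriver setup.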