Commit 65b727b

Increased scraping speed, added progress bar (mariostoev#94)
d3an authored Apr 3, 2021
1 parent 9dd1d1a commit 65b727b
Showing 20 changed files with 895 additions and 107 deletions.
8 changes: 0 additions & 8 deletions .idea/.gitignore

This file was deleted.

15 changes: 0 additions & 15 deletions .idea/finviz-1.iml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

4 changes: 0 additions & 4 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

4 changes: 2 additions & 2 deletions README.rst
@@ -23,13 +23,13 @@ Downloading & Installation

What is Finviz?
=====
FinViz_ aims to make market information accessible and provides a lot of data in visual snapshots, allowing traders and investors to quickly find the stock, future or forex pair they are looking for. The site provides advanced screeners, market maps, analysis, comparative tools and charts.
FinViz_ aims to make market information accessible and provides a lot of data in visual snapshots, allowing traders and investors to quickly find the stock, future or forex pair they are looking for. The site provides advanced screeners, market maps, analysis, comparative tools, and charts.

.. _FinViz: https://finviz.com/?a=128493348

**Important Information**

Any quotes data displayed on finviz.com is delayed by 15 minutes for NASDAQ, and 20 minutes for NYSE and AMEX. This API should **NOT** be used for live trading, it's main purpuse is financial analysis, research and data scraping.
Any quotes data displayed on finviz.com is delayed by 15 minutes for NASDAQ, and 20 minutes for NYSE and AMEX. This API should **NOT** be used for live trading, it's main purpose is financial analysis, research, and data scraping.

Using Screener
=====
12 changes: 8 additions & 4 deletions example.py
100644 → 100755
@@ -7,17 +7,21 @@
filters = Screener.load_filter_dict()
some_filters = [filters["PEG"]["Under 1"], filters["Exchange"]["AMEX"]]
stock_list = Screener(filters=some_filters, order="ticker")
print(stock_list)

# Use raw filter tags in a list
# filters = ['geo_usa']
filters = ["idx_sp500"] # Shows companies in the S&P500
print("Filtering stocks..")
print("Screening stocks...")
stock_list = Screener(filters=filters, order="ticker")
print("Parsing every stock..")
stock_list.get_ticker_details()
print(stock_list)

print("Retrieving stock data...")
stock_data = stock_list.get_ticker_details()
print(stock_data)

# Export the screener results to CSV file
stock_list.to_csv("sp500.csv")

# Create a SQLite database
# stock_list.to_sqlite('sp500.sqlite')
# stock_list.to_sqlite("sp500.sqlite")
4 changes: 2 additions & 2 deletions finviz/helper_functions/display_functions.py
@@ -3,10 +3,10 @@ def create_table_string(table_list):

col_size = [max(map(len, col)) for col in zip(*table_list)]
format_str = " | ".join([f"{{:<{i}}}" for i in col_size])
table_list.insert(1, ['-' * i for i in col_size])
table_list.insert(1, ["-" * i for i in col_size])

table_string = ""
for item in table_list:
table_string += format_str.format(*item) + '\n'
table_string += format_str.format(*item) + "\n"

return table_string
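
For reference, create_table_string pads each column to the width of its longest cell and inserts a dashed separator row after the header. A minimal usage sketch, with illustrative rows that are not taken from the repository:

from finviz.helper_functions.display_functions import create_table_string

# Hypothetical screener-style rows: one header row followed by data rows.
table = [
    ["Ticker", "Price"],
    ["AAPL", "135.37"],
    ["MSFT", "242.35"],
]

# Prints a fixed-width table with a row of dashes under the header.
print(create_table_string(table))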
59 changes: 33 additions & 26 deletions finviz/helper_functions/request_functions.py
@@ -1,21 +1,24 @@
import asyncio
import time
from typing import Callable, Dict, List

import aiohttp
import requests
import tenacity
import urllib3
from lxml import html
from requests import Response
from tqdm import tqdm
from user_agent import generate_user_agent

from finviz.config import connection_settings
from finviz.helper_functions.error_handling import (ConnectionTimeout,
TooManyRequests)
from finviz.helper_functions.error_handling import ConnectionTimeout

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def http_request_get(url, session=None, payload=None, parse=True):
def http_request_get(
url, session=None, payload=None, parse=True, user_agent=generate_user_agent()
):
""" Sends a GET HTTP request to a website and returns its HTML content and full url address. """

if payload is None:
@@ -27,14 +30,14 @@ def http_request_get(url, session=None, payload=None, parse=True):
url,
params=payload,
verify=False,
headers={"User-Agent": generate_user_agent()},
headers={"User-Agent": user_agent},
)
else:
content = requests.get(
url,
params=payload,
verify=False,
headers={"User-Agent": generate_user_agent()},
headers={"User-Agent": user_agent},
)

content.raise_for_status() # Raise HTTPError for bad requests (4xx or 5xx)
@@ -46,28 +49,24 @@ def http_request_get(url, session=None, payload=None, parse=True):
raise ConnectionTimeout(url)


@tenacity.retry(wait=tenacity.wait_exponential())
def finviz_request(url: str, user_agent: str) -> Response:
response = requests.get(url, headers={"User-Agent": user_agent})
if response.text == "Too many requests.":
raise Exception("Too many requests.")
return response


def sequential_data_scrape(
scrape_func: Callable, urls: List[str], delay: float = 0.5, *args, **kwargs
scrape_func: Callable, urls: List[str], user_agent: str, *args, **kwargs
) -> List[Dict]:
data = []
delay_multiplier = 1.0

for url in urls:
for url in tqdm(urls):
try:
while True:
response = requests.get(
url, headers={"User-Agent": generate_user_agent()}
)
if response.text == "Too many requests.":
time.sleep(delay * delay_multiplier)
delay_multiplier *= 1.5
continue
else:
delay_multiplier = 1.0
break
response = finviz_request(url, user_agent)
kwargs["URL"] = url
data.append(scrape_func(response, *args, **kwargs))
time.sleep(delay)
except Exception as exc:
raise exc

@@ -81,24 +80,33 @@ def __init__(
self,
scrape_function: Callable,
urls: List[str],
user_agent: str,
*args,
css_select: bool = False
):
self.scrape_function = scrape_function
self.urls = urls
self.user_agent = user_agent
self.arguments = args
self.css_select = css_select
self.data = []

async def __http_request__async(
self, url: str, session: aiohttp.ClientSession, user_agent: str
self,
url: str,
session: aiohttp.ClientSession,
):
""" Sends asynchronous http request to URL address and scrapes the webpage. """

try:
async with session.get(url, headers={"User-Agent": user_agent}) as response:
async with session.get(
url, headers={"User-Agent": self.user_agent}
) as response:
page_html = await response.read()

if page_html.decode("utf-8") == "Too many requests.":
raise Exception("Too many requests.")

if self.css_select:
return self.scrape_function(
html.fromstring(page_html), *self.arguments
@@ -115,13 +123,12 @@ async def __async_scraper(self):
limit_per_host=connection_settings["CONCURRENT_CONNECTIONS"]
)
timeout = aiohttp.ClientTimeout(total=connection_settings["CONNECTION_TIMEOUT"])
user_agent = generate_user_agent()

async with aiohttp.ClientSession(
connector=conn, timeout=timeout, headers={"User-Agent": user_agent}
connector=conn, timeout=timeout, headers={"User-Agent": self.user_agent}
) as session:
for url in self.urls:
async_tasks.append(self.__http_request__async(url, session, user_agent))
async_tasks.append(self.__http_request__async(url, session))

self.data = await asyncio.gather(*async_tasks)

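
The speed and progress-bar changes above boil down to two pieces: finviz_request retries with exponential backoff (via tenacity) whenever Finviz returns the literal body "Too many requests.", and sequential_data_scrape wraps the URL loop in tqdm so long scrapes show progress. A condensed, standalone sketch of that pattern, assuming a placeholder ticker URL rather than the URLs built elsewhere in the package:

import requests
import tenacity
from tqdm import tqdm
from user_agent import generate_user_agent

@tenacity.retry(wait=tenacity.wait_exponential())
def fetch(url, user_agent):
    # Any exception raised here makes tenacity retry with exponentially growing waits.
    response = requests.get(url, headers={"User-Agent": user_agent})
    if response.text == "Too many requests.":
        raise Exception("Too many requests.")
    return response

user_agent = generate_user_agent()
urls = ["https://finviz.com/quote.ashx?&t=AAPL"]  # placeholder list for illustration
pages = [fetch(url, user_agent) for url in tqdm(urls)]  # tqdm draws the progress bar

Compared with the previous fixed 0.5-second sleep between requests, backing off only when the rate limit actually triggers is where most of the speed-up comes from.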
4 changes: 2 additions & 2 deletions finviz/helper_functions/scraper_functions.py
@@ -19,7 +19,7 @@ def get_table(page_html: requests.Response, headers, rows=None, **kwargs):
# Skip the first element ([1:]), since it's the headers (we already have it as a constant)
all_rows = [
column.xpath("td//text()")
for column in page_parsed.cssselect('tr[valign="top"]')[1: rows + 1]
for column in page_parsed.cssselect('tr[valign="top"]')[1 : rows + 1]
]

# If rows is different from -2, this function is called from Screener
@@ -80,7 +80,7 @@ def download_chart_image(page_content: requests.Response, **kwargs):


def get_analyst_price_targets_for_export(
ticker=None, page_content=None, last_ratings=5
ticker=None, page_content=None, last_ratings=5
):
analyst_price_targets = []

16 changes: 11 additions & 5 deletions finviz/main_func.py
@@ -88,9 +88,15 @@ def get_all_news():
"""

page_parsed, _ = http_request_get(url=NEWS_URL, parse=True)
all_dates = [row.text_content() for row in page_parsed.cssselect('td[class="nn-date"]')]
all_headlines = [row.text_content() for row in page_parsed.cssselect('a[class="nn-tab-link"]')]
all_links = [row.get('href') for row in page_parsed.cssselect('a[class="nn-tab-link"]')]
all_dates = [
row.text_content() for row in page_parsed.cssselect('td[class="nn-date"]')
]
all_headlines = [
row.text_content() for row in page_parsed.cssselect('a[class="nn-tab-link"]')
]
all_links = [
row.get("href") for row in page_parsed.cssselect('a[class="nn-tab-link"]')
]

return list(zip(all_dates, all_headlines, all_links))

@@ -143,14 +149,14 @@ def get_analyst_price_targets(ticker, last_ratings=5):
for row in ratings_list:
if count == last_ratings:
break
# defalut values for len(row) == 4 , that is there is NO price information
# default values for len(row) == 4 , that is there is NO price information
price_from, price_to = 0, 0
if len(row) == 5:

strings = row[4].split("→")
# print(strings)
if len(strings) == 1:
# if only ONE price is avalable then it is 'price_to' value
# if only ONE price is available then it is 'price_to' value
price_to = strings[0].strip(" ").strip("$")
else:
# both '_from' & '_to' prices available
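
The hunk above normalizes analyst price targets: a rating row may carry no price, a single target, or a "from → to" pair. A small hedged sketch of that parsing step, with made-up sample strings; the handling of the two-price branch is an assumption, since the diff is cut off before it:

for cell in ["$310", "$290 → $325"]:  # hypothetical rating cells for illustration
    price_from, price_to = 0, 0  # defaults when the row has no price information
    strings = cell.split("→")
    if len(strings) == 1:
        # only ONE price is available, so treat it as the 'price_to' value
        price_to = strings[0].strip(" ").strip("$")
    else:
        # assumed: both '_from' and '_to' prices are present
        price_from = strings[0].strip(" ").strip("$")
        price_to = strings[1].strip(" ").strip("$")
    print(price_from, price_to)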
2 changes: 1 addition & 1 deletion finviz/portfolio.py
@@ -91,7 +91,7 @@ def create_portfolio(self, name, file, drop_invalid_ticker=False):
ENGH:CA,1,,1,
(!) For transaction - 1 = BUY, 2 = SELL
(!) Note that if the price is ommited the function will take today's ticker price
(!) Note that if the price is omitted the function will take today's ticker price
"""

data = {
41 changes: 29 additions & 12 deletions finviz/screener.py
@@ -5,11 +5,13 @@
from urllib.parse import urlencode, urlparse

from bs4 import BeautifulSoup
from user_agent import generate_user_agent

import finviz.helper_functions.scraper_functions as scrape
from finviz.helper_functions.display_functions import create_table_string
from finviz.helper_functions.error_handling import InvalidTableType, NoResults
from finviz.helper_functions.request_functions import (http_request_get,
from finviz.helper_functions.request_functions import (Connector,
http_request_get,
sequential_data_scrape)
from finviz.helper_functions.save_data import export_to_csv, export_to_db

@@ -66,7 +68,8 @@
signal="",
table=None,
custom=None,
delay=0.5,
user_agent=generate_user_agent(),
request_method="sequential",
):
"""
Initializes all variables to its values
@@ -118,7 +121,8 @@ def __init__(
self._rows = rows
self._order = order
self._signal = signal
self._delay = delay
self._user_agent = user_agent
self._request_method = request_method

self.analysis = []
self.data = self.__search_screener()
@@ -362,7 +366,7 @@ def get_charts(self, period="d", size="l", chart_type="c", ta="1"):
f"https://finviz.com/chart.ashx?{encoded_payload}&t={row.get('Ticker')}"
for row in self.data
],
self._delay,
self._user_agent,
)

def get_ticker_details(self):
@@ -376,7 +380,7 @@ def get_ticker_details(self):
f"https://finviz.com/quote.ashx?&t={row.get('Ticker')}"
for row in self.data
],
self._delay,
self._user_agent,
)

for entry in ticker_data:
@@ -426,17 +430,30 @@ def __search_screener(self):
"s": self._signal,
"c": ",".join(self._custom),
},
user_agent=self._user_agent,
)

self._rows = self.__check_rows()
self.headers = self.__get_table_headers()
pages_data = sequential_data_scrape(
scrape.get_table,
scrape.get_page_urls(self._page_content, self._rows, self._url),
self._delay,
self.headers,
self._rows,
)

if self._request_method == "async":
async_connector = Connector(
scrape.get_table,
scrape.get_page_urls(self._page_content, self._rows, self._url),
self._user_agent,
self.headers,
self._rows,
css_select=True,
)
pages_data = async_connector.run_connector()
else:
pages_data = sequential_data_scrape(
scrape.get_table,
scrape.get_page_urls(self._page_content, self._rows, self._url),
self._user_agent,
self.headers,
self._rows,
)

data = []
for page in pages_data:
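
In practice, the new constructor arguments let callers keep the default sequential scraper (now with a progress bar) or opt into the asynchronous Connector, while reusing one User-Agent across every request. A short usage sketch, assuming the same S&P 500 filter used in example.py:

from user_agent import generate_user_agent
from finviz.screener import Screener

# Default behaviour: sequential requests with a tqdm progress bar.
stock_list = Screener(filters=["idx_sp500"], order="ticker")

# Opt-in asynchronous scraping with a single pinned User-Agent string.
fast_list = Screener(
    filters=["idx_sp500"],
    order="ticker",
    user_agent=generate_user_agent(),
    request_method="async",
)
stock_data = fast_list.get_ticker_details()

Judging from the hunks above, request_method only changes how the screener result pages are fetched; get_charts and get_ticker_details still go through sequential_data_scrape.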
