Commit 65b727b

Increased scraping speed, added progress bar (mariostoev#94)
d3an authored Apr 3, 2021
1 parent 9dd1d1a commit 65b727b
Showing 20 changed files with 895 additions and 107 deletions.
8 changes: 0 additions & 8 deletions .idea/.gitignore

This file was deleted.

15 changes: 0 additions & 15 deletions .idea/finviz-1.iml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

4 changes: 0 additions & 4 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

4 changes: 2 additions & 2 deletions README.rst
@@ -23,13 +23,13 @@ Downloading & Installation

What is Finviz?
=====
FinViz_ aims to make market information accessible and provides a lot of data in visual snapshots, allowing traders and investors to quickly find the stock, future or forex pair they are looking for. The site provides advanced screeners, market maps, analysis, comparative tools and charts.
FinViz_ aims to make market information accessible and provides a lot of data in visual snapshots, allowing traders and investors to quickly find the stock, future or forex pair they are looking for. The site provides advanced screeners, market maps, analysis, comparative tools, and charts.

.. _FinViz: https://finviz.com/?a=128493348

**Important Information**

Any quotes data displayed on finviz.com is delayed by 15 minutes for NASDAQ, and 20 minutes for NYSE and AMEX. This API should **NOT** be used for live trading, it's main purpuse is financial analysis, research and data scraping.
Any quotes data displayed on finviz.com is delayed by 15 minutes for NASDAQ, and 20 minutes for NYSE and AMEX. This API should **NOT** be used for live trading, it's main purpose is financial analysis, research, and data scraping.

Using Screener
=====
12 changes: 8 additions & 4 deletions example.py
100644 → 100755
@@ -7,17 +7,21 @@
filters = Screener.load_filter_dict()
some_filters = [filters["PEG"]["Under 1"], filters["Exchange"]["AMEX"]]
stock_list = Screener(filters=some_filters, order="ticker")
print(stock_list)

# Use raw filter tags in a list
# filters = ['geo_usa']
filters = ["idx_sp500"] # Shows companies in the S&P500
print("Filtering stocks..")
print("Screening stocks...")
stock_list = Screener(filters=filters, order="ticker")
print("Parsing every stock..")
stock_list.get_ticker_details()
print(stock_list)

print("Retrieving stock data...")
stock_data = stock_list.get_ticker_details()
print(stock_data)

# Export the screener results to CSV file
stock_list.to_csv("sp500.csv")

# Create a SQLite database
# stock_list.to_sqlite('sp500.sqlite')
# stock_list.to_sqlite("sp500.sqlite")
4 changes: 2 additions & 2 deletions finviz/helper_functions/display_functions.py
@@ -3,10 +3,10 @@ def create_table_string(table_list):

col_size = [max(map(len, col)) for col in zip(*table_list)]
format_str = " | ".join([f"{{:<{i}}}" for i in col_size])
table_list.insert(1, ['-' * i for i in col_size])
table_list.insert(1, ["-" * i for i in col_size])

table_string = ""
for item in table_list:
table_string += format_str.format(*item) + '\n'
table_string += format_str.format(*item) + "\n"

return table_string
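
For reference, create_table_string pads each column to the width of its longest cell and inserts a dashed separator row after the header. A minimal usage sketch, with illustrative rows that are not taken from the repository:

from finviz.helper_functions.display_functions import create_table_string

# Hypothetical screener-style rows: one header row followed by data rows.
table = [
    ["Ticker", "Price"],
    ["AAPL", "135.37"],
    ["MSFT", "242.35"],
]

# Prints a fixed-width table with a row of dashes under the header.
print(create_table_string(table))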
59 changes: 33 additions & 26 deletions finviz/helper_functions/request_functions.py
@@ -1,21 +1,24 @@
import asyncio
import time
from typing import Callable, Dict, List

import aiohttp
import requests
import tenacity
import urllib3
from lxml import html
from requests import Response
from tqdm import tqdm
from user_agent import generate_user_agent

from finviz.config import connection_settings
from finviz.helper_functions.error_handling import (ConnectionTimeout,
TooManyRequests)
from finviz.helper_functions.error_handling import ConnectionTimeout

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def http_request_get(url, session=None, payload=None, parse=True):
def http_request_get(
url, session=None, payload=None, parse=True, user_agent=generate_user_agent()
):
""" Sends a GET HTTP request to a website and returns its HTML content and full url address. """

if payload is None:
@@ -27,14 +30,14 @@ def http_request_get(url, session=None, payload=None, parse=True):
url,
params=payload,
verify=False,
headers={"User-Agent": generate_user_agent()},
headers={"User-Agent": user_agent},
)
else:
content = requests.get(
url,
params=payload,
verify=False,
headers={"User-Agent": generate_user_agent()},
headers={"User-Agent": user_agent},
)

content.raise_for_status() # Raise HTTPError for bad requests (4xx or 5xx)
@@ -46,28 +49,24 @@ def http_request_get(url, session=None, payload=None, parse=True):
raise ConnectionTimeout(url)


@tenacity.retry(wait=tenacity.wait_exponential())
def finviz_request(url: str, user_agent: str) -> Response:
response = requests.get(url, headers={"User-Agent": user_agent})
if response.text == "Too many requests.":
raise Exception("Too many requests.")
return response


def sequential_data_scrape(
scrape_func: Callable, urls: List[str], delay: float = 0.5, *args, **kwargs
scrape_func: Callable, urls: List[str], user_agent: str, *args, **kwargs
) -> List[Dict]:
data = []
delay_multiplier = 1.0

for url in urls:
for url in tqdm(urls):
try:
while True:
response = requests.get(
url, headers={"User-Agent": generate_user_agent()}
)
if response.text == "Too many requests.":
time.sleep(delay * delay_multiplier)
delay_multiplier *= 1.5
continue
else:
delay_multiplier = 1.0
break
response = finviz_request(url, user_agent)
kwargs["URL"] = url
data.append(scrape_func(response, *args, **kwargs))
time.sleep(delay)
except Exception as exc:
raise exc

@@ -81,24 +80,33 @@ def __init__(
self,
scrape_function: Callable,
urls: List[str],
user_agent: str,
*args,
css_select: bool = False
):
self.scrape_function = scrape_function
self.urls = urls
self.user_agent = user_agent
self.arguments = args
self.css_select = css_select
self.data = []

async def __http_request__async(
self, url: str, session: aiohttp.ClientSession, user_agent: str
self,
url: str,
session: aiohttp.ClientSession,
):
""" Sends asynchronous http request to URL address and scrapes the webpage. """

try:
async with session.get(url, headers={"User-Agent": user_agent}) as response:
async with session.get(
url, headers={"User-Agent": self.user_agent}
) as response:
page_html = await response.read()

if page_html.decode("utf-8") == "Too many requests.":
raise Exception("Too many requests.")

if self.css_select:
return self.scrape_function(
html.fromstring(page_html), *self.arguments
@@ -115,13 +123,12 @@ async def __async_scraper(self):
limit_per_host=connection_settings["CONCURRENT_CONNECTIONS"]
)
timeout = aiohttp.ClientTimeout(total=connection_settings["CONNECTION_TIMEOUT"])
user_agent = generate_user_agent()

async with aiohttp.ClientSession(
connector=conn, timeout=timeout, headers={"User-Agent": user_agent}
connector=conn, timeout=timeout, headers={"User-Agent": self.user_agent}
) as session:
for url in self.urls:
async_tasks.append(self.__http_request__async(url, session, user_agent))
async_tasks.append(self.__http_request__async(url, session))

self.data = await asyncio.gather(*async_tasks)

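
The speed and progress-bar changes above boil down to two pieces: finviz_request retries with exponential backoff (via tenacity) whenever Finviz returns the literal body "Too many requests.", and sequential_data_scrape wraps the URL loop in tqdm so long scrapes show progress. A condensed, standalone sketch of that pattern, assuming a placeholder ticker URL rather than the URLs built elsewhere in the package:

import requests
import tenacity
from tqdm import tqdm
from user_agent import generate_user_agent

@tenacity.retry(wait=tenacity.wait_exponential())
def fetch(url, user_agent):
    # Any exception raised here makes tenacity retry with exponentially growing waits.
    response = requests.get(url, headers={"User-Agent": user_agent})
    if response.text == "Too many requests.":
        raise Exception("Too many requests.")
    return response

user_agent = generate_user_agent()
urls = ["https://finviz.com/quote.ashx?&t=AAPL"]  # placeholder list for illustration
pages = [fetch(url, user_agent) for url in tqdm(urls)]  # tqdm draws the progress bar

Compared with the previous fixed 0.5-second sleep between requests, backing off only when the rate limit actually triggers is where most of the speed-up comes from.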
4 changes: 2 additions & 2 deletions finviz/helper_functions/scraper_functions.py
@@ -19,7 +19,7 @@ def get_table(page_html: requests.Response, headers, rows=None, **kwargs):
# Skip the first element ([1:]), since it's the headers (we already have it as a constant)
all_rows = [
column.xpath("td//text()")
for column in page_parsed.cssselect('tr[valign="top"]')[1: rows + 1]
for column in page_parsed.cssselect('tr[valign="top"]')[1 : rows + 1]
]

# If rows is different from -2, this function is called from Screener
@@ -80,7 +80,7 @@ def download_chart_image(page_content: requests.Response, **kwargs):


def get_analyst_price_targets_for_export(
ticker=None, page_content=None, last_ratings=5
ticker=None, page_content=None, last_ratings=5
):
analyst_price_targets = []

16 changes: 11 additions & 5 deletions finviz/main_func.py
@@ -88,9 +88,15 @@ def get_all_news():
"""

page_parsed, _ = http_request_get(url=NEWS_URL, parse=True)
all_dates = [row.text_content() for row in page_parsed.cssselect('td[class="nn-date"]')]
all_headlines = [row.text_content() for row in page_parsed.cssselect('a[class="nn-tab-link"]')]
all_links = [row.get('href') for row in page_parsed.cssselect('a[class="nn-tab-link"]')]
all_dates = [
row.text_content() for row in page_parsed.cssselect('td[class="nn-date"]')
]
all_headlines = [
row.text_content() for row in page_parsed.cssselect('a[class="nn-tab-link"]')
]
all_links = [
row.get("href") for row in page_parsed.cssselect('a[class="nn-tab-link"]')
]

return list(zip(all_dates, all_headlines, all_links))

@@ -143,14 +149,14 @@ def get_analyst_price_targets(ticker, last_ratings=5):
for row in ratings_list:
if count == last_ratings:
break
# defalut values for len(row) == 4 , that is there is NO price information
# default values for len(row) == 4 , that is there is NO price information
price_from, price_to = 0, 0
if len(row) == 5:

strings = row[4].split("→")
# print(strings)
if len(strings) == 1:
# if only ONE price is avalable then it is 'price_to' value
# if only ONE price is available then it is 'price_to' value
price_to = strings[0].strip(" ").strip("$")
else:
# both '_from' & '_to' prices available
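
The hunk above normalizes analyst price targets: a rating row may carry no price, a single target, or a "from → to" pair. A small hedged sketch of that parsing step, with made-up sample strings; the handling of the two-price branch is an assumption, since the diff is cut off before it:

for cell in ["$310", "$290 → $325"]:  # hypothetical rating cells for illustration
    price_from, price_to = 0, 0  # defaults when the row has no price information
    strings = cell.split("→")
    if len(strings) == 1:
        # only ONE price is available, so treat it as the 'price_to' value
        price_to = strings[0].strip(" ").strip("$")
    else:
        # assumed: both '_from' and '_to' prices are present
        price_from = strings[0].strip(" ").strip("$")
        price_to = strings[1].strip(" ").strip("$")
    print(price_from, price_to)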
2 changes: 1 addition & 1 deletion finviz/portfolio.py
@@ -91,7 +91,7 @@ def create_portfolio(self, name, file, drop_invalid_ticker=False):
ENGH:CA,1,,1,
(!) For transaction - 1 = BUY, 2 = SELL
(!) Note that if the price is ommited the function will take today's ticker price
(!) Note that if the price is omitted the function will take today's ticker price
"""

data = {
41 changes: 29 additions & 12 deletions finviz/screener.py
@@ -5,11 +5,13 @@
from urllib.parse import urlencode, urlparse

from bs4 import BeautifulSoup
from user_agent import generate_user_agent

import finviz.helper_functions.scraper_functions as scrape
from finviz.helper_functions.display_functions import create_table_string
from finviz.helper_functions.error_handling import InvalidTableType, NoResults
from finviz.helper_functions.request_functions import (http_request_get,
from finviz.helper_functions.request_functions import (Connector,
http_request_get,
sequential_data_scrape)
from finviz.helper_functions.save_data import export_to_csv, export_to_db

@@ -66,7 +68,8 @@
signal="",
table=None,
custom=None,
delay=0.5,
user_agent=generate_user_agent(),
request_method="sequential",
):
"""
Initializes all variables to its values
@@ -118,7 +121,8 @@ def __init__(
self._rows = rows
self._order = order
self._signal = signal
self._delay = delay
self._user_agent = user_agent
self._request_method = request_method

self.analysis = []
self.data = self.__search_screener()
@@ -362,7 +366,7 @@ def get_charts(self, period="d", size="l", chart_type="c", ta="1"):
f"https://finviz.com/chart.ashx?{encoded_payload}&t={row.get('Ticker')}"
for row in self.data
],
self._delay,
self._user_agent,
)

def get_ticker_details(self):
@@ -376,7 +380,7 @@ def get_ticker_details(self):
f"https://finviz.com/quote.ashx?&t={row.get('Ticker')}"
for row in self.data
],
self._delay,
self._user_agent,
)

for entry in ticker_data:
@@ -426,17 +430,30 @@ def __search_screener(self):
"s": self._signal,
"c": ",".join(self._custom),
},
user_agent=self._user_agent,
)

self._rows = self.__check_rows()
self.headers = self.__get_table_headers()
pages_data = sequential_data_scrape(
scrape.get_table,
scrape.get_page_urls(self._page_content, self._rows, self._url),
self._delay,
self.headers,
self._rows,
)

if self._request_method == "async":
async_connector = Connector(
scrape.get_table,
scrape.get_page_urls(self._page_content, self._rows, self._url),
self._user_agent,
self.headers,
self._rows,
css_select=True,
)
pages_data = async_connector.run_connector()
else:
pages_data = sequential_data_scrape(
scrape.get_table,
scrape.get_page_urls(self._page_content, self._rows, self._url),
self._user_agent,
self.headers,
self._rows,
)

data = []
for page in pages_data:
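
In practice, the new constructor arguments let callers keep the default sequential scraper (now with a progress bar) or opt into the asynchronous Connector, while reusing one User-Agent across every request. A short usage sketch, assuming the same S&P 500 filter used in example.py:

from user_agent import generate_user_agent
from finviz.screener import Screener

# Default behaviour: sequential requests with a tqdm progress bar.
stock_list = Screener(filters=["idx_sp500"], order="ticker")

# Opt-in asynchronous scraping with a single pinned User-Agent string.
fast_list = Screener(
    filters=["idx_sp500"],
    order="ticker",
    user_agent=generate_user_agent(),
    request_method="async",
)
stock_data = fast_list.get_ticker_details()

Judging from the hunks above, request_method only changes how the screener result pages are fetched; get_charts and get_ticker_details still go through sequential_data_scrape.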
