Skip to content

Commit

Permalink
Refactor project structure
Browse files Browse the repository at this point in the history
  • Loading branch information
nirantak committed Oct 13, 2021
1 parent a25a8cc commit fa65055
Show file tree
Hide file tree
Showing 13 changed files with 85 additions and 55 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.idea
.vscode
screenshots/*.png
cookies/*.pkl
*.pkl
out/**

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ playwright install

## Usage

See [scripts/README.md](scripts/) for usage instructions.
See [scrapers/README.md](scrapers/) for usage instructions.

Samples present in [demo/](demo/).

Expand Down
4 changes: 2 additions & 2 deletions demo/basic_playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

DEBUG: bool = False
OPTS: dict[str, Any] = {
"ss_dir": "./screenshots",
"out_dir": "./out",
"headless": True,
"slow_mo": 0,
}
Expand All @@ -25,7 +25,7 @@
)
page = browser.new_page()
page.goto("http://whatsmyuseragent.org/")
page.screenshot(path=f"{OPTS['ss_dir']}/user_agent.png")
page.screenshot(path=f"{OPTS['out_dir']}/user_agent.png")
print(f"Title: \t\t{page.title()}")
print(f"User Agent: \t{page.inner_text('.user-agent').strip()}")
print(f"IP: \t\t{page.inner_text('.ip-address').split(':')[-1].strip()}")
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions sample.env
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
DEBUG=true
IG_USERNAME="your_username_here"
IG_PASSWORD="your_password_here"
TV_TIME_USERID="your_userid_here"
Expand Down
6 changes: 3 additions & 3 deletions scripts/README.md → scrapers/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

- [Scrapers](#scrapers)
- [Table of Contents](#table-of-contents)
- [Scripts](#scripts)
- [Usage](#usage)
- [TV Time](#tv-time)
- [Archived](#archived)

## Scripts
## Usage

### TV Time

Expand All @@ -21,7 +21,7 @@
- To get the list of all your TV Shows, run the script:

```bash
python scripts/tv_time.py [public|private]
python -m scrapers.tv_time [public|private]
```

- If the required environment variables are not set, the script will prompt for the missing values.
Expand Down
File renamed without changes.
10 changes: 5 additions & 5 deletions scripts/archive/README.md → scrapers/archive/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Archived
# Archived Scrapers

## Table of Contents

- [Archived](#archived)
- [Archived Scrapers](#archived-scrapers)
- [Table of Contents](#table-of-contents)
- [Usage](#usage)
- [Ticket Price Scraper](#ticket-price-scraper)
Expand All @@ -18,7 +18,7 @@ Fill all variables in the top `# Config` section.
Run script

```bash
python scripts/archive/ticket_prices.py
python ticket_prices.py
```

### Instagram Followers Scraper
Expand All @@ -28,9 +28,9 @@ Rename file `sample.env` as `.env`, and fill all environment variables (username
Run script

```bash
python scripts/archive/instagram.py
python instagram.py
```

## Requirements

1. [ChromeDriver](https://sites.google.com/a/chromium.org/chromedriver/downloads)
1. [ChromeDriver](https://sites.google.com/a/chromium.org/chromedriver/downloads) in the same directory as this script.
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

# Config
load_dotenv()
CHROMEDRIVER_PATH: str = "./drivers/chromedriver"
COOKIES_PATH: str = "./cookies"
CHROMEDRIVER_PATH: str = "./chromedriver"
COOKIES_PATH: str = "."
USERNAME: str = os.environ["IG_USERNAME"]
PASSWORD: str = os.environ["IG_PASSWORD"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@

# Config
URL: str = "full-url-here"
SCREENSHOT_PATH: str = "./screenshots"
CHROMEDRIVER_PATH: str = "./drivers/chromedriver"
SCREENSHOT_PATH: str = "."
CHROMEDRIVER_PATH: str = "./chromedriver"
locale.setlocale(locale.LC_ALL, "en_US.UTF8")


Expand Down
48 changes: 10 additions & 38 deletions scripts/tv_time.py → scrapers/tv_time.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,15 @@
#!/usr/bin/env python3
# Run: python -m scrapers.tv_time [public|private]

import os
import sys
from typing import Any

from dotenv import load_dotenv
from playwright.sync_api import Page, sync_playwright
from .utils import OPTS, Page, get_input, run_playwright

load_dotenv()
TV_TIME: str = "https://www.tvtime.com"
DEBUG: bool = True
OPTS: dict[str, Any] = {
"userid": os.environ.get("TV_TIME_USERID"),
"username": os.environ.get("TV_TIME_USERNAME"),
"password": os.environ.get("TV_TIME_PASSWORD"),
"ss_dir": "./screenshots",
"headless": True,
"slow_mo": 0,
}

if DEBUG:
# Set DEBUG = True to see the browser in action
os.environ["PWDEBUG"] = "console"
# os.environ["PWDEBUG"] = "1"
OPTS["headless"] = False
OPTS["slow_mo"] = 200


def get_input(fields: list[str]) -> list[str]:
res = []
for field in fields:
OPTS[field] = (OPTS[field] or input(f"Enter {field.upper()}: ")).strip()
res.append(OPTS[field])
return res
OPTS["userid"] = os.environ.get("TV_TIME_USERID")
OPTS["username"] = os.environ.get("TV_TIME_USERNAME")
OPTS["password"] = os.environ.get("TV_TIME_PASSWORD")


def login(page: Page) -> None:
Expand Down Expand Up @@ -81,25 +58,22 @@ def get_all_shows(page: Page) -> list[tuple[str, str]]:
def get_stats_screenshot(page: Page) -> None:
page.goto(f"/en/user/{OPTS['userid']}/profile")
page.click("text=Stats")
page.screenshot(path=f"{OPTS['ss_dir']}/tv_time_stats.png", full_page=True)
page.screenshot(path=f"{OPTS['out_dir']}/tv_time_stats.png", full_page=True)


if __name__ == "__main__":
"""
For running in a repl, do:
```python
from scripts.tv_time import *
from scrapers.tv_time import *
play = sync_playwright().start()
```
followed by any commands you want to run.
"""
mode = sys.argv[1] if len(sys.argv) > 1 else "private"

with sync_playwright() as play:
browser = play.chromium.launch(
headless=OPTS["headless"], slow_mo=OPTS["slow_mo"]
)
page = browser.new_page(base_url=TV_TIME)
with run_playwright("chromium", TV_TIME) as page:
page: Page
page.goto("/")
page.click(".optanon-alert-box-close")

Expand All @@ -116,7 +90,5 @@ def get_stats_screenshot(page: Page) -> None:
print("\nInvalid mode.\n")
sys.exit(1)

res = get_all_shows(page)
get_all_shows(page)
get_stats_screenshot(page)

browser.close()
57 changes: 57 additions & 0 deletions scrapers/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path
from typing import Any

from dotenv import load_dotenv
from playwright.sync_api import (
    Browser,
    BrowserType,
    Page,
    Playwright,
    sync_playwright,
)

# Pull variables from a local .env file (if present) into os.environ before
# any of the settings below are read.
load_dotenv()

# Two levels up from this file, i.e. the directory that contains `scrapers/`.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Debug mode is on only when the DEBUG environment variable equals "true"
# (case-insensitive); any other value, or no value, leaves it off.
DEBUG: bool = os.environ.get("DEBUG", "false").lower() == "true"
# Shared runtime options. Scraper modules that import OPTS add their own keys
# (e.g. credentials) on top of these defaults.
OPTS: dict[str, Any] = {
    "out_dir": PROJECT_ROOT / "out",
    "headless": True,
    "slow_mo": 0,
}

if DEBUG:
    # Set DEBUG=true in the environment (or .env) to see the browser in action:
    # the browser is launched with a visible window and slowed-down actions.
    os.environ["PWDEBUG"] = "console"
    # os.environ["PWDEBUG"] = "1"
    OPTS["headless"] = False
    OPTS["slow_mo"] = 200


def get_input(fields: list[str]) -> list[str]:
    """
    Resolve each named option, prompting on stdin for any that is unset.

    For every name in `fields`, the current value in OPTS is used when it is
    truthy; otherwise the user is asked to type one. The value is stripped of
    surrounding whitespace, written back to OPTS, and returned in the same
    order as `fields`.

    Raises KeyError if a name has no entry in OPTS at all.
    """
    collected: list[str] = []
    for name in fields:
        current = OPTS[name]
        if not current:
            # Nothing configured for this option, so ask interactively.
            current = input(f"Enter {name.upper()}: ")
        cleaned = current.strip()
        OPTS[name] = cleaned
        collected.append(cleaned)
    return collected


@contextmanager
def run_playwright(
    browser_type: str, base_url: str | None = None
) -> Iterator[Page]:
    """
    Launch a browser, yield a fresh page, and clean everything up on exit.

    browser_type: 'chromium', 'webkit' or 'firefox'
    base_url: optional base URL handed to the new page.

    The launch options (`headless`, `slow_mo`) are read from OPTS.

    Yields the newly opened Page. The browser is closed and Playwright is
    stopped when the `with` block exits, including when it exits with an
    exception or when opening the page itself fails.

    Raises ValueError if `browser_type` is not one of the supported names.
    """
    supported = ("chromium", "webkit", "firefox")
    if browser_type not in supported:
        # Fail before starting Playwright so nothing needs tearing down.
        raise ValueError(
            f"Unsupported browser_type {browser_type!r}; expected one of {supported}"
        )

    # The context manager form guarantees Playwright is stopped even if the
    # browser launch below raises.
    with sync_playwright() as play:
        play: Playwright
        browser: BrowserType = getattr(play, browser_type)
        window: Browser = browser.launch(
            headless=OPTS["headless"], slow_mo=OPTS["slow_mo"]
        )
        try:
            # new_page sits inside the try so a failure here still closes
            # the already-launched browser.
            page: Page = window.new_page(base_url=base_url)
            yield page
        finally:
            window.close()
Empty file removed screenshots/.gitkeep
Empty file.

0 comments on commit fa65055

Please sign in to comment.