# fetch_blog_content.py — 57 lines (50 loc), from a fork of simonw/datasette.io
# (GitHub page chrome and line-number gutter removed from this scrape)
import urllib
import urllib.parse

import click
import feedparser
import httpx
import sqlite_utils
def all_entries(url):
    """Yield every entry from the Atom feed at *url*, following pagination.

    Pagination uses the HTTP ``Link: rel="next"`` response header (resolved
    relative to the current page URL). Each yielded dict has keys:
    id, title, published, url (fragment stripped), body, tags.
    """
    while url:
        response = httpx.get(url, timeout=30)
        # Fail loudly on HTTP errors rather than parsing an error page
        # as an empty feed and silently yielding nothing.
        response.raise_for_status()
        entries = feedparser.parse(response.text)["entries"]
        yield from (
            {
                "id": entry["id"],
                "title": entry["title"],
                "published": entry["published"],
                # Drop any #fragment so the same post always gets one URL
                "url": entry["link"].split("#")[0],
                "body": entry["summary"],
                # Entries with no tags omit the key entirely — default to []
                "tags": [t["term"] for t in entry.get("tags", [])],
            }
            for entry in entries
        )
        next_url = response.links.get("next", {}).get("url")
        if next_url:
            # rel="next" may be relative; resolve against the current page
            url = urllib.parse.urljoin(url, next_url)
        else:
            break
@click.command()
@click.argument(
    "db_filename",
    type=click.Path(file_okay=True, dir_okay=False),
)
@click.argument("tags", type=str, nargs=-1)
@click.option(
    "--refresh",
    is_flag=True,
    help="Pull everything rather than stopping at first already-seen item",
)
def cli(db_filename, tags, refresh):
    """Fetch blog entries for TAGS from simonwillison.net into DB_FILENAME.

    Without --refresh, fetching a tag stops at the first already-stored
    entry (feeds are newest-first). With --refresh, the entries table is
    dropped and everything is re-fetched.
    """
    db = sqlite_utils.Database(db_filename)
    table = db["entries"]
    if refresh:
        seen_ids = set()
        # ignore=True: a fresh database has no entries table yet — don't error
        table.drop(ignore=True)
    else:
        # Iterating .rows on a missing table raises OperationalError,
        # so only read existing IDs when the table actually exists.
        seen_ids = {row["id"] for row in table.rows} if table.exists() else set()
    for tag in tags:
        for entry in all_entries("https://simonwillison.net/tags/{}.atom".format(tag)):
            if entry["id"] in seen_ids:
                # Newest-first feed: everything after this is already stored
                break
            table.insert(entry, pk="id", replace=True)
# Standard script entry point: run the click CLI only when executed directly.
if __name__ == "__main__":
    cli()