Skip to content

Commit

Permalink
dailylesson & 'all' args (jachinlin#51)
Browse files Browse the repository at this point in the history
* update readme

* support 'all' argument for all clis

* add dailylesson cli

* update version to 1.0.0
  • Loading branch information
jachinlin authored Oct 12, 2019
1 parent 097893c commit f5a37f8
Show file tree
Hide file tree
Showing 11 changed files with 222 additions and 35 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# 把极客时间装进Kindle
<p align="center">
<img width="80%" src="https://raw.githubusercontent.com/jachinlin/jachinlin.github.io/master/img/gk-mp4.gif" alt="左耳听风">
</p>

# 把极客时间装进 Kindle

[![travis](https://travis-ci.org/jachinlin/geektime_dl.svg?branch=master)](https://travis-ci.org/jachinlin/geektime_dl)
[![codecov](https://codecov.io/gh/jachinlin/geektime_dl/branch/master/graph/badge.svg)](https://codecov.io/gh/jachinlin/geektime_dl)
[![Python versions](https://img.shields.io/pypi/pyversions/geektime-dl.svg)](https://pypi.org/project/geektime-dl/)
[![PyPI](https://img.shields.io/pypi/v/geektime-dl.svg)](https://pypi.org/project/geektime-dl/)
[![Actions Status](https://github.com/jachinlin/geektime_dl/workflows/Python%20package/badge.svg)](https://github.com/jachinlin/geektime_dl/actions)


<p align="center" style="margin:70px">
<img src="https://raw.githubusercontent.com/jachinlin/jachinlin.github.io/master/img/gk-mp4.gif" alt="左耳听风">
</p>

极客时间专栏文章的质量都是非常高的,比如耗子哥的《左耳听风》、朱赟的《朱赟的技术管理课》和王天一的《人工智能基础课》,都是我非常喜欢的专栏。这些专栏深入浅出,将知识和经验传授于读者,都是值得多次阅读的。

然而,每当有空闲时间时,都需要掏出手机才能阅读专栏文章,这在某些情况下是很不便的,尤其是坐地铁且没有网络时。作为一个 Kindle 党,最好的解决方案就是 Kindle 电子书。于是有了这个项目。
Expand Down
11 changes: 9 additions & 2 deletions geektime_dl/cli/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import configparser
import argparse
import functools
from typing import List

from geektime_dl.utils.log import logger
from geektime_dl.data_client import get_data_client, DataClient
Expand Down Expand Up @@ -87,8 +88,14 @@ def get_data_client(cfg: dict) -> DataClient:
"Use '{} login --help' for help.\n".format(
sys.argv[0].split(os.path.sep)[-1]))

@staticmethod
def parse_course_ids(ids_str: str) -> list:
def get_all_course_ids(self, dc: DataClient, type_: str) -> List[int]:
    """Resolve an ``all*`` selector into a list of course ids.

    Placeholder on the base command: it always raises, so a command that
    wants to accept ``all`` style arguments has to override it.
    ``parse_course_ids`` calls this with the raw argument string as
    ``type_`` whenever that string starts with ``'all'``.

    :param dc: data client handed through from ``parse_course_ids``
    :param type_: the raw selector string (starts with ``'all'``)
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError

def parse_course_ids(self, ids_str: str, dc: DataClient) -> List[int]:

if ids_str.startswith('all'):
return self.get_all_course_ids(dc, type_=ids_str)

def _int(num):
try:
return int(num)
Expand Down
113 changes: 102 additions & 11 deletions geektime_dl/cli/dailylesson.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,110 @@
# coding=utf8

from geektime_dl.cli import Command
import os
import sys

from termcolor import colored

from geektime_dl.data_client.gk_apis import GkApiError
from geektime_dl.utils.ebook import Render
from geektime_dl.utils.m3u8_downloader import Downloader
from geektime_dl.cli import Command, add_argument


class Daily(Command):
    # The docstring text is kept verbatim (minus a stray trailing `""`
    # literal): it may be surfaced as CLI help text -- TODO confirm.
    # English: "Save daily-lesson videos".
    """保存每日一课视频"""

    def get_all_course_ids(self, dc, type_: str):
        """Return the ids of every video collection reported by ``dc``.

        :param dc: data client providing ``get_video_collection_list()``
        :param type_: raw ``all*`` selector; not inspected here, so any
            selector yields the complete list
        :return: list of collection ids converted to ``int``
        """
        return [int(c['collection_id'])
                for c in dc.get_video_collection_list()]

    @add_argument("collection_ids", type=str,
                  help="specify the target video collection ids")
    @add_argument("--url-only", dest="url_only", action='store_true',
                  default=False, help="download mp3/mp4 url only")
    @add_argument("--hd-only", dest="hd_only", action='store_true',
                  default=False, help="download mp4 with high quality")
    @add_argument("--workers", dest="workers", type=int, save=True,
                  help="specify the number of threads to download mp3/mp4")
    def run(self, cfg: dict):
        """Download (or list the urls of) the requested video collections.

        Reads ``collection_ids``, ``url_only``, ``hd_only``, ``workers`` and
        ``output_folder`` from ``cfg``. Each collection gets its own
        sub-folder named after the collection title.

        :param cfg: parsed command configuration
        """
        dc = self.get_data_client(cfg)
        collection_ids = self.parse_course_ids(cfg['collection_ids'], dc)
        output_folder = self._format_output_folder(cfg)

        dl = Downloader(output_folder, workers=cfg['workers'])

        for collection_id in collection_ids:
            try:
                course_data = dc.get_video_collection_intro(collection_id)
            except GkApiError as e:
                # Report the failure and carry on with the remaining ids.
                sys.stderr.write('{}\n\n'.format(e))
                continue

            out_dir = os.path.join(
                output_folder,
                Render.format_file_name(course_data['title']))
            os.makedirs(out_dir, exist_ok=True)

            # fetch raw data
            print(colored('开始下载视频:{}-{}'.format(
                collection_id, course_data['title']), 'green'))
            pbar_desc = '数据爬取中:{}'.format(course_data['title'][:10])
            data = dc.get_video_collection_content(
                collection_id, pbar_desc=pbar_desc)

            # save url
            if cfg['url_only']:
                self._parse_and_save_url(course_data, data, out_dir)
                continue

            # download mp4
            quality_suffix = '.hd' if cfg['hd_only'] else '.sd'
            for post in data:
                fn = (Render.format_file_name(post['article_title']) +
                      quality_suffix)
                # An existing "<name>.<quality>.ts" file is treated as an
                # already-downloaded video and skipped.
                if os.path.isfile(os.path.join(out_dir, fn) + '.ts'):
                    sys.stdout.write(fn + ' exists\n')
                    continue
                url = self._parse_url(post, cfg['hd_only'])
                if url:
                    dl.run(url, os.path.join(out_dir, fn))

        # One downloader serves every collection, so shut it down once,
        # after the whole batch has been queued.
        dl.shutdown()

    @staticmethod
    def _format_output_folder(cfg):
        """Return ``<output_folder>/dailylesson`` (``~`` expanded), creating it.

        :param cfg: configuration holding ``output_folder``
        :return: the absolute-or-relative folder path as a string
        """
        output_folder = os.path.expanduser(
            os.path.join(cfg['output_folder'], 'dailylesson'))
        os.makedirs(output_folder, exist_ok=True)
        return output_folder

    @staticmethod
    def _parse_and_save_url(course_intro, course_data, out_dir):
        """Write the hd/sd urls of every post to ``<title>.mp4.txt``.

        :param course_intro: collection intro; only ``title`` is read
        :param course_data: iterable of post dicts (``article_title`` and an
            optional ``video_media_map``)
        :param out_dir: folder the text file is written into
        """
        title = Render.format_file_name(course_intro['title'])
        fn = os.path.join(out_dir, '{}.mp4.txt'.format(title))

        entries = []
        for post in course_data:
            media_map = post.get('video_media_map') or {}
            # A missing url is written out as the text "None".
            entries.append("{}:\n{}\n{}\n\n".format(
                Render.format_file_name(post['article_title']),
                media_map.get('hd', {}).get('url'),
                media_map.get('sd', {}).get('url')))

        # Titles are typically non-ASCII; pin the encoding so the write does
        # not depend on the platform's locale default.
        with open(fn, 'w', encoding='utf-8') as f:
            f.write('\n'.join(entries))

        sys.stdout.write('视频链接下载完成:{}\n\n'.format(fn))

    @staticmethod
    def _parse_url(post_content: dict, hd_only: bool):
        """Pick the video url to download for one post.

        :param post_content: post dict; only ``video_media_map`` is read
        :param hd_only: prefer the hd stream when ``True``
        :return: the chosen url, or ``None`` when the post has none
        """
        media_map = post_content.get('video_media_map') or {}
        sd_url = (media_map.get('sd') or {}).get('url')
        if not hd_only:
            return sd_url
        # Some posts have an sd mp4 only, so fall back to it. The fallback
        # reads the same guarded ``video_media_map`` as every other lookup;
        # the previous ``post_content['video_media']`` access could raise on
        # exactly those posts.
        return (media_map.get('hd') or {}).get('url') or sd_url
21 changes: 18 additions & 3 deletions geektime_dl/cli/ebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import json
import datetime
from typing import List

from termcolor import colored
from kindle_maker import make_mobi
Expand All @@ -13,6 +14,7 @@
from geektime_dl.utils.ebook import Render
from geektime_dl.utils.mail import send_to_kindle
from geektime_dl.data_client.gk_apis import GkApiError
from geektime_dl.data_client import DataClient


class EBook(Command):
Expand Down Expand Up @@ -73,6 +75,20 @@ def _render_source_files(self, course_intro: dict, course_content: list,
continue
render.render_article_html(title, article['article_content'])

def get_all_course_ids(self, dc: DataClient, type_: str) -> List[int]:
    """Expand an ``all*`` selector into course ids for e-book generation.

    Candidates are the entries of groups ``'1'`` and ``'2'`` of the course
    list returned by ``dc.get_course_list()``.

    :param dc: data client used to fetch the course list
    :param type_: ``'all'`` (every candidate), ``'all-sub'`` (``had_sub`` is
        truthy) or ``'all-done'`` (``had_sub`` is truthy and
        ``self.is_course_finished`` accepts the course); any other value
        selects nothing
    :return: matching course ids as ``int``, in listing order
    """
    listing = dc.get_course_list()
    candidates = listing['1']['list'] + listing['2']['list']

    def _selected(course) -> bool:
        if type_ == 'all':
            return True
        if type_ == 'all-sub':
            return bool(course['had_sub'])
        if type_ == 'all-done':
            return bool(course['had_sub'] and
                        self.is_course_finished(course))
        return False

    return [int(course['id']) for course in candidates if _selected(course)]

@add_argument("course_ids", type=str,
help="specify the target course ids")
@add_argument("--force", dest="force", action='store_true', default=False,
Expand All @@ -96,10 +112,9 @@ def _render_source_files(self, course_intro: dict, course_content: list,
help="specify the kindle receiver email")
def run(self, cfg: dict) -> None:

course_ids = self.parse_course_ids(cfg['course_ids'])
output_folder = self._format_output_folder(cfg)

dc = self.get_data_client(cfg)
course_ids = self.parse_course_ids(cfg['course_ids'], dc)
output_folder = self._format_output_folder(cfg)

for course_id in course_ids:
try:
Expand Down
20 changes: 18 additions & 2 deletions geektime_dl/cli/mp3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import sys
from typing import List

from termcolor import colored

Expand All @@ -14,6 +15,21 @@
class Mp3(Command):
"""保存专栏音频"""

def get_all_course_ids(self, dc, type_: str) -> List[int]:
    """Expand an ``all*`` selector into course ids for audio download.

    Only group ``'1'`` of the course list returned by
    ``dc.get_course_list()`` is considered.

    :param dc: data client used to fetch the course list
    :param type_: ``'all'``, ``'all-sub'`` (``had_sub`` is truthy) or
        ``'all-done'`` (``had_sub`` is truthy and
        ``self.is_course_finished`` accepts the course); any other value
        selects nothing
    :return: matching course ids as ``int``, in listing order
    """
    courses = dc.get_course_list()['1']['list']

    # One predicate per supported selector; unknown selectors match nothing.
    predicates = {
        'all': lambda course: True,
        'all-sub': lambda course: course['had_sub'],
        'all-done': lambda course: (course['had_sub'] and
                                    self.is_course_finished(course)),
    }
    keep = predicates.get(type_)
    if keep is None:
        return []

    return [int(course['id']) for course in courses if keep(course)]

@add_argument("course_ids", type=str,
help="specify the target course ids")
@add_argument("--url-only", dest="url_only", action='store_true',
Expand All @@ -22,10 +38,10 @@ class Mp3(Command):
help="specify the number of threads to download mp3/mp4")
def run(self, cfg: dict):

course_ids = self.parse_course_ids(cfg['course_ids'])
dc = self.get_data_client(cfg)
course_ids = self.parse_course_ids(cfg['course_ids'], dc)
output_folder = self._format_out_folder(cfg)

dc = self.get_data_client(cfg)
dl = Downloader(output_folder, workers=cfg['workers'])

for course_id in course_ids:
Expand Down
19 changes: 17 additions & 2 deletions geektime_dl/cli/mp4.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@
class Mp4(Command):
"""保存视频课程视频"""

def get_all_course_ids(self, dc, type_: str):
    """Expand an ``all*`` selector into course ids for video download.

    Only group ``'3'`` of the course list returned by
    ``dc.get_course_list()`` is considered.

    :param dc: data client used to fetch the course list
    :param type_: ``'all'``, ``'all-sub'`` (``had_sub`` is truthy) or
        ``'all-done'`` (``had_sub`` is truthy and
        ``self.is_course_finished`` accepts the course); any other value
        selects nothing
    :return: matching course ids as ``int``, in listing order
    """
    selected = []
    for course in dc.get_course_list()['3']['list']:
        if type_ == 'all':
            wanted = True
        elif type_ == 'all-sub':
            wanted = course['had_sub']
        elif type_ == 'all-done':
            wanted = course['had_sub'] and self.is_course_finished(course)
        else:
            wanted = False

        if wanted:
            selected.append(int(course['id']))

    return selected

@add_argument("course_ids", type=str,
help="specify the target course ids")
@add_argument("--url-only", dest="url_only", action='store_true',
Expand All @@ -24,10 +39,10 @@ class Mp4(Command):
help="specify the number of threads to download mp3/mp4")
def run(self, cfg: dict):

course_ids = self.parse_course_ids(cfg['course_ids'])
dc = self.get_data_client(cfg)
course_ids = self.parse_course_ids(cfg['course_ids'], dc)
output_folder = self._format_output_folder(cfg)

dc = self.get_data_client(cfg)
dl = Downloader(output_folder, workers=cfg['workers'])

for course_id in course_ids:
Expand Down
50 changes: 47 additions & 3 deletions geektime_dl/data_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from tqdm import tqdm

from geektime_dl.data_client.gk_apis import GkApiClient
from geektime_dl.utils import Singleton, synchronized
from geektime_dl.utils import synchronized


def _local_storage(table: str):
Expand Down Expand Up @@ -40,13 +40,17 @@ def wrap(self: 'DataClient', *args, **kwargs):
return decorator


class DataClient(metaclass=Singleton):
class DataClient:

def __init__(self, gk: GkApiClient, db: TinyDB):
self._gk = gk
self.db = db
self._lock = threading.Lock() # tinydb 线程不安全

@property
def gk(self):
    """Expose the API client that was passed to the constructor."""
    return self._gk

def get_course_list(self, **kwargs) -> dict:
"""
获取课程列表
Expand Down Expand Up @@ -103,6 +107,39 @@ def get_video_collection_list(self, **kwargs) -> list:
"""
return self._gk.get_video_collection_list()

@synchronized()
@_local_storage('video-collection')
def get_video_collection_intro(self, collection_id: int, **kwargs) -> dict:
    """Fetch the introduction of one daily-lesson video collection.

    Delegates to the API client's ``get_video_collection_intro``.

    :param collection_id: id of the video collection
    :param kwargs: not read in this body; presumably consumed by the
        decorators (e.g. a ``force`` flag) -- confirm against
        ``_local_storage``
    :return: whatever the API client returns for the collection
    """
    return self._gk.get_video_collection_intro(collection_id)

@synchronized()
@_local_storage('daily')
def get_daily_content(self, video_id: int, **kwargs) -> dict:
    """Fetch the content of a single daily-lesson video.

    Delegates to the API client's ``get_post_content``.

    :param video_id: id of the video post
    :param kwargs: not read in this body; presumably consumed by the
        decorators (e.g. a ``force`` flag) -- confirm against
        ``_local_storage``
    :return: whatever the API client returns for the post
    """
    return self._gk.get_post_content(video_id)

def get_video_collection_content(self, collection_id: int, force: bool = False,
                                 pbar=True, pbar_desc='') -> list:
    """Fetch the content of every video in one daily-lesson collection.

    The video list comes from the API client's ``get_video_list_of``; each
    entry's ``article_id`` is then resolved through ``get_daily_content``.

    :param collection_id: id of the video collection
    :param force: forwarded as ``force=`` to ``get_daily_content``
    :param pbar: wrap the iteration in a ``tqdm`` progress bar when truthy
    :param pbar_desc: description shown on the progress bar
    :return: list of per-video content, in listing order
    """
    videos = self._gk.get_video_list_of(collection_id)
    if pbar:
        videos = tqdm(videos)
        videos.set_description(pbar_desc)

    return [self.get_daily_content(video['article_id'], force=force)
            for video in videos]


class _JSONStorage(JSONStorage):
"""
Expand Down Expand Up @@ -139,7 +176,14 @@ def close(self):
pass


dc_global = None


def get_data_client(cfg: dict) -> DataClient:
global dc_global
if dc_global is not None:
return dc_global

gk = GkApiClient(
account=cfg['account'],
password=cfg['password'],
Expand All @@ -150,7 +194,7 @@ def get_data_client(cfg: dict) -> DataClient:
f = os.path.expanduser(
os.path.join(cfg['output_folder'], 'geektime-localstorage.json'))
db = TinyDB(f, storage=_JSONStorage)

dc = DataClient(gk, db)
dc_global = dc

return dc
2 changes: 1 addition & 1 deletion geektime_dl/data_client/gk_apis.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def get_video_collection_list(self) -> list:
"""每日一课合辑列表"""
# 没分析出接口
ids = list(range(3, 82)) + list(range(104, 141))
return [{'collection_id': id_ for id_ in ids}]
return [{'collection_id': id_} for id_ in ids]

@_retry
def get_video_list_of(self, collection_id: int) -> list:
Expand Down
Loading

0 comments on commit f5a37f8

Please sign in to comment.