Skip to content

Commit

Permalink
CHANGES: 01, 02
Browse files Browse the repository at this point in the history
  • Loading branch information
FLZ101 committed Sep 8, 2019
1 parent 4c38d34 commit 066fdc8
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 49 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
## 0.1.2

* (01) Removes `--email` and `--password`
* (02) Supports downloading "references" of a course

## 0.1.1

* Add `--cookies`
* (01) Adds `--cookies`

## 0.1.0
111 changes: 88 additions & 23 deletions dl_coursera/Crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pickle
import os
import logging
from http.cookiejar import MozillaCookieJar

from http.cookiejar import CookieJar, MozillaCookieJar

from .define import *

Expand All @@ -14,25 +15,24 @@


class Crawler:
def __init__(self, *, ts, sess, email, password, cookies):
def __init__(self, *, ts, sess, cookies):
self._ts = ts
self._loggedin = False

def attach(func):
setattr(self, func.__name__, func)
return func

@attach
def login():
if cookies:
_ = MozillaCookieJar()
_.load(cookies)
sess.cookies = _
if isinstance(cookies, CookieJar):
cj = cookies
else:
resp = sess.get(URL_ROOT)
resp = sess.post(URL_LOGIN(resp.cookies['CSRF3-Token']),
data={'email': email, 'password': password})
cj = MozillaCookieJar()
cj.load(cookies)

assert resp.status_code == 200
sess.cookies = cj
self._loggedin = True

@attach
@ts.register_task(
Expand Down Expand Up @@ -70,7 +70,7 @@ def crawl_spec(*, spec):
@attach
@ts.register_task(
priority=PRIO_COURSE, ttl=3,
format_kwargs=lambda _: format_dict({'source': _['course']['slug']})
format_kwargs=lambda _: format_dict({'cource': _['course']['slug']})
)
def crawl_course(*, course):
resp = sess.get(URL_COURSE_1(course['slug']))
Expand All @@ -82,6 +82,8 @@ def crawl_course(*, course):

assert course['slug'] == _['slug']

# ------

resp = sess.get(URL_COURSE_2(course['slug']))
d = resp.json()['linked']

Expand All @@ -94,6 +96,10 @@ def crawl_course(*, course):
id2item[_['id']] = CourseMaterialNotebook(id_=_['id'], name=_['name'], slug=_['slug'])
elif typeName == 'supplement':
id2item[_['id']] = CourseMaterialSupplement(id_=_['id'], name=_['name'], slug=_['slug'])
elif typeName in ['exam', 'quiz', 'phasedPeer']:
pass
else:
logging.warning('[crawl_course] unknown typeName=%s\n%s' % (typeName, _))

id2lesson = {}
for _ in d['onDemandCourseMaterialLessons.v1']:
Expand All @@ -117,6 +123,10 @@ def crawl_course(*, course):
if len(module['lessons']) > 0:
course['modules'].append(module)

# ------

crawl_course_references(course=course)

for module in course['modules']:
for lesson in module['lessons']:
for item in lesson['items']:
Expand All @@ -125,6 +135,54 @@ def crawl_course(*, course):
elif item['type'] == 'Supplement':
crawl_supplement(course=course, supplement=item)

def _crawl_course_ref(course, id_ref=None):
    """Fetch reference data for ``course`` and record it on the course.

    When ``id_ref`` is falsy the request URL comes from
    ``URL_COURSE_REFERENCES``; otherwise it comes from
    ``URL_COURSE_REFERENCE`` for that single reference id.

    Every entry of the response's ``elements`` is appended to
    ``course['references']`` as a ``CourseReference``.  Each ``cml`` entry of
    ``linked['openCourseAssets.v1']`` is converted to HTML and stored as the
    ``item`` of the reference whose asset id equals that entry's id.  The
    reference ids returned by ``cml.get_resources()`` are then passed to
    ``crawl_course_reference``.  Entries of any other type are only logged.
    """
    if id_ref:
        url = URL_COURSE_REFERENCE(course['id'], id_ref)
    else:
        url = URL_COURSE_REFERENCES(course['id'])
    payload = sess.get(url).json()

    # Map the asset id named by each reference to the reference object, so the
    # linked assets below can be attached to the right reference.
    ref_by_asset_id = {}
    for element in payload['elements']:
        ref = CourseReference(id_=element['shortId'], name=element['name'], slug=element['slug'])
        course['references'].append(ref)

        content = element['content']['org.coursera.ondemand.reference.AssetReferenceContent']
        ref_by_asset_id[content['assetId']] = ref

    for asset in payload['linked']['openCourseAssets.v1']:
        typeName = asset['typeName']
        if typeName != 'cml':
            logging.warning("[_crawl_course_ref] unknown typeName=%s\n%s" % (typeName, asset))
            continue

        cml = CML(asset['definition']['value'])
        assets, assetIDs, refids = cml.get_resources()
        assets += crawl_assets(assetIDs)
        rendered = cml.to_html(assets=assets)

        ref_by_asset_id[asset['id']]['item'] = CourseMaterialSupplementItemCML(html=rendered, assets=assets)

        # Follow references mentioned inside this CML document.
        for refid in refids:
            crawl_course_reference(course=course, id_ref=refid)

@ts.register_task(
    priority=PRIO_COURSE_MATERIAL, ttl=3,
    # Key is 'course' (it was misspelled 'cource'), matching the key used by
    # the lecture task's format_kwargs.
    format_kwargs=lambda _: format_dict({'course': _['course']['slug']})
)
def crawl_course_references(*, course):
    """Crawl the reference listing of ``course``.

    Registered through ``ts.register_task`` with priority
    ``PRIO_COURSE_MATERIAL`` and ``ttl=3``.  Delegates to
    ``_crawl_course_ref`` without a reference id, which requests the
    course-wide listing and fills ``course['references']``.
    """
    _crawl_course_ref(course)

@ts.register_task(
    priority=PRIO_COURSE_MATERIAL, ttl=3,
    # Key is 'course' (it was misspelled 'cource'), matching the key used by
    # the lecture task's format_kwargs.
    format_kwargs=lambda _: format_dict({'course': _['course']['slug']})
)
def crawl_course_reference(*, course, id_ref):
    """Crawl the single reference ``id_ref`` unless it is already recorded.

    Registered through ``ts.register_task`` with priority
    ``PRIO_COURSE_MATERIAL`` and ``ttl=3``.  Does nothing when
    ``course['references']`` already holds a reference with this id;
    otherwise delegates to ``_crawl_course_ref``.
    """
    # The early return keeps references that point at each other from being
    # fetched again.
    if any(id_ref == ref['id'] for ref in course['references']):
        return
    _crawl_course_ref(course, id_ref)

@ts.register_task(
priority=PRIO_COURSE_MATERIAL, ttl=3,
format_kwargs=lambda _: format_dict({'course': _['course']['slug'],
Expand All @@ -140,27 +198,28 @@ def crawl_lecture(*, course, lecture):
if url_subtitle is not None:
url_subtitle = URL_ROOT + url_subtitle

url_video = None
for reso in ['720p', '540p', '360p']:
url_video = _['sources']['byResolution'].get(reso)
if url_video is not None:
break

assert url_video is not None

_ = _['sources']['byResolution']
url_video = _[sorted(_.keys())[-1]] # choose the video with highest resolution
url_video = url_video['mp4VideoUrl']

lecture['videos'].append(Video(url_video=url_video, url_subtitle=url_subtitle))

# lecture assets
resp = sess.get(URL_LECTURE_2(course['id'], lecture['id']))
d = resp.json()

assets = []
assetIDs = []
for _ in d['linked']['openCourseAssets.v1']:
typeName = _['typeName']
if typeName == 'asset':
assetIDs.append(_['definition']['assetId'])
assets = crawl_assets(assetIDs)
elif typeName == 'url':
assets.append(Asset(id_=_['id'], url=_['definition']['url'], name=_['definition']['name']))
else:
logging.warning("[crawl_lecture] unknown typeName=%s\n%s" % (typeName, _))

assets += crawl_assets(assetIDs)
lecture['assets'] = assets

@ts.register_task(
Expand All @@ -176,12 +235,17 @@ def crawl_supplement(course, supplement):
typeName = _['typeName']
if typeName == 'cml':
cml = CML(_['definition']['value'])
assets, assetIDs = cml.get_assets()
assets, assetIDs, refids = cml.get_resources()
assets += crawl_assets(assetIDs)
html = cml.to_html(assets=assets)

supplement['items'].append(CourseMaterialSupplementItemCML(html=html, assets=assets))

for refid in refids:
crawl_course_reference(course=course, id_ref=refid)
else:
logging.warning("[crawl_supplement] unknown typeName=%s\n%s" % (typeName, _))

def crawl_assets(ids):
if len(ids) == 0:
return []
Expand All @@ -194,7 +258,6 @@ def crawl_assets(ids):
id_ = _['id']
url = _['url']['url']
name = _asset_name(_['name'], _['fileExtension'])

assets.append(Asset(id_=id_, url=url, name=name))

assert len(assets) == len(ids)
Expand All @@ -207,7 +270,9 @@ def _asset_name(name, fileExtension):
return name

def crawl(self, *, slug, isSpec):
self.login()
if not self._loggedin:
self.login()

if isSpec:
res = Spec(slug=slug)
self.crawl_spec(spec=res)
Expand Down
52 changes: 46 additions & 6 deletions dl_coursera/DLTaskGatherer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import os
import zipfile
import io
import re
import logging

from .lib.ExploringTree import ExploringTree

from .markup import render_supplement

from .resource import load_resource
from .define import URL_ROOT
from .markup import CML


def _shorten_slug(x):
Expand All @@ -23,8 +26,7 @@ def __init__(self, *, soc, outdir): # "soc" means "spec or course"
self._outdir = outdir

self._et = ExploringTree()
with self._et:
self._resource_node = self._et.jump('%s/resource' % self._soc['slug'])
self._resource_node = self._et.see('%s/resource' % self._soc['slug'])

self._dl_tasks = []
self._file_tasks = []
Expand Down Expand Up @@ -81,9 +83,40 @@ def _gather_course(self, course, i=None):
with self._et:
self._down(course['slug'], i)

self._gather_course_references(course)

for _i, module in enumerate(course['modules']):
self._gather_module(module, _i)

def _gather_course_references(self, course):
with self._et:
self._down('references')

refid2node = {}
for i, ref in enumerate(course['references']):
item = ref['item']
if item['type'] == 'CML':
refid2node[ref['id']] = self._et.see('%02d@%s.html' % (i + 1, ref['slug']))

def fn_a(a):
_refid = a.get('refid')
if not _refid:
return

_node = refid2node.get(_refid)
if not _node:
logging.warning('[fn_a] unknown _refid=%s\n%s' %
(_refid, {_1: _2.abspath() for _1, _2 in refid2node.items()}))
else:
a['href'] = self._et.relpathTo(_node)

self._fn_a = fn_a

for i, ref in enumerate(course['references']):
item = ref['item']
if item['type'] == 'CML':
self._gather_cml(item, i, ref)

def _gather_module(self, module, i):
_shorten_slug(module)

Expand Down Expand Up @@ -133,10 +166,17 @@ def _gather_supplement(self, supplement, i):
self._gather_cml(item, _i, supplement)

def _gather_cml(self, cml, i, supplement):
_data = render_supplement(content=cml['html'],
resource_path=self._resource_path(),
title='%s' % supplement['name']).encode('UTF-8')
self._add_file_task(_data, self._see('%[email protected]' % (i + 1)))
import bs4
html = cml['html']
html = bs4.BeautifulSoup(html, 'html.parser')
for a in html.find_all('a'):
self._fn_a(a)
html = str(html)

data = render_supplement(content=html,
resource_path=self._resource_path(),
title='%s' % supplement['name']).encode('UTF-8')
self._add_file_task(data, self._see('%02d@%s.html' % (i + 1, supplement['slug'])))

for asset in cml['assets']:
self._gather_asset(asset)
Expand Down
24 changes: 20 additions & 4 deletions dl_coursera/define.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@
URL_ROOT = 'https://www.coursera.org'


def URL_LOGIN(csrf3_token):
    """Return the ``/api/login/v3Ssr`` URL carrying ``csrf3_token`` as its ``csrf3-token`` query value."""
    template = '/api/login/v3Ssr?csrf3-token={}'
    return URL_ROOT + template.format(csrf3_token)


def URL_SPEC(slug):
    """Return the ``onDemandSpecializations.v1`` query URL (``q=slug``) for the given ``slug``."""
    template = '/api/onDemandSpecializations.v1?q=slug&slug={}&fields=courseIds,interchangeableCourseIds,launchedAt,logo,memberships,metadata,partnerIds,premiumExperienceVariant,onDemandSpecializationMemberships.v1(suggestedSessionSchedule),onDemandSpecializationSuggestedSchedule.v1(suggestedSessions),partners.v1(homeLink,name),courses.v1(courseProgress,description,membershipIds,startDate,v2Details,vcMembershipIds),v2Details.v1(onDemandSessions,plannedLaunchDate),memberships.v1(grade,vcMembershipId),vcMemberships.v1(certificateCodeWithGrade)&includes=courseIds,memberships,partnerIds,onDemandSpecializationMemberships.v1(suggestedSessionSchedule),courses.v1(courseProgress,membershipIds,v2Details,vcMembershipIds),v2Details.v1(onDemandSessions)'
    return URL_ROOT + template.format(slug)

Expand All @@ -21,6 +17,14 @@ def URL_COURSE_2(slug):
return URL_ROOT + '/api/onDemandCourseMaterials.v2/?q=slug&slug={}&includes=modules%2Clessons%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2Citems%2Ctracks%2CgradePolicy&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional%2ClearningObjectives)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight%2CisRequiredForPassing)%2ConDemandCourseMaterialItems.v2(name%2Cslug%2CtimeCommitment%2CcontentSummary%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId%2ClockedStatus%2CitemLockSummary)%2ConDemandCourseMaterialTracks.v1(passablesCount)&showLockedItems=true'.format(slug)


def URL_COURSE_REFERENCES(id_course):
    """Return the ``onDemandReferences.v1`` query URL (``q=courseListed``) for the course id ``id_course``."""
    template = '/api/onDemandReferences.v1/?courseId={}&q=courseListed&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets'
    return URL_ROOT + template.format(id_course)


def URL_COURSE_REFERENCE(id_course, id_ref):
    """Return the ``onDemandReferences.v1`` query URL (``q=shortId``) for reference ``id_ref`` of course ``id_course``."""
    template = '/api/onDemandReferences.v1/?courseId={}&q=shortId&shortId={}&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets'
    return URL_ROOT + template.format(id_course, id_ref)


def URL_LECTURE_1(id_course, id_lecture):
    """Return the ``onDemandLectureVideos.v1`` URL for the ``<id_course>~<id_lecture>`` pair, including video sources and subtitles fields."""
    template = '/api/onDemandLectureVideos.v1/{}~{}?includes=video&fields=onDemandVideos.v1(sources%2Csubtitles%2CsubtitlesVtt%2CsubtitlesTxt)'
    return URL_ROOT + template.format(id_course, id_lecture)

Expand Down Expand Up @@ -57,6 +61,18 @@ def __init__(self, *, id_=None, name=None, slug=None):
self['name'] = name
self['slug'] = slug
self['modules'] = []
self['references'] = []


class CourseReference(MyDict):
    """Dict-style record for one course reference.

    Keys set on construction, in this order: ``type`` (always
    ``'CourseReference'``), ``id``, ``name``, ``slug`` and ``item``.
    """

    def __init__(self, *, id_=None, name=None, slug=None, item=None):
        super().__init__()

        # Assign through item access, one key at a time, in a fixed order.
        initial = (
            ('type', 'CourseReference'),
            ('id', id_),
            ('name', name),
            ('slug', slug),
            ('item', item),
        )
        for key, value in initial:
            self[key] = value


class CourseMaterialModule(MyDict):
Expand Down
Loading

0 comments on commit 066fdc8

Please sign in to comment.