Skip to content

Commit

Permalink
[extractors] Use new framework for existing embeds (yt-dlp#4307)
Browse files Browse the repository at this point in the history
`Brightcove` is difficult to migrate because its subclasses may depend
on the signature of the current functions. So it is left as-is for now.

Note: Tests have not been migrated
  • Loading branch information
pukkandan committed Aug 1, 2022
1 parent 1e8fe57 commit bfd973e
Show file tree
Hide file tree
Showing 138 changed files with 500 additions and 1,910 deletions.
3 changes: 2 additions & 1 deletion yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@
DWIE,
DWArticleIE,
)
from .eagleplatform import EaglePlatformIE
from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE
from .egghead import (
Expand Down Expand Up @@ -1555,6 +1555,7 @@
SharedIE,
VivoIE,
)
from .sharevideos import ShareVideosEmbedIE
from .shemaroome import ShemarooMeIE
from .showroomlive import ShowRoomLiveIE
from .simplecast import (
Expand Down
1 change: 1 addition & 0 deletions yt_dlp/extractor/adobetv.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ def _real_extract(self, url):
class AdobeTVVideoIE(AdobeTVBaseIE):
IE_NAME = 'adobetv:video'
_VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]']

_TEST = {
# From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
Expand Down
15 changes: 2 additions & 13 deletions yt_dlp/extractor/ant1newsgr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
import urllib.parse

from .common import InfoExtractor
Expand All @@ -7,7 +6,6 @@
ExtractorError,
determine_ext,
scale_thumbnails_to_max_format_width,
unescapeHTML,
)


Expand Down Expand Up @@ -91,7 +89,7 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
if not embed_urls:
raise ExtractorError('no videos found for %s' % video_id, expected=True)
return self.playlist_from_matches(
Expand All @@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
IE_DESC = 'ant1news.gr embedded videos'
_BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
_VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
_API_PATH = '/news/templates/data/jsonPlayer'

_TESTS = [{
Expand All @@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
},
}]

@classmethod
def _extract_urls(cls, webpage):
_EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
_EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
for mobj in re.finditer(_EMBED_RE, webpage):
url = unescapeHTML(mobj.group('url'))
if not cls.suitable(url):
continue
yield url

def _real_extract(self, url):
video_id = self._match_id(url)

Expand Down
30 changes: 8 additions & 22 deletions yt_dlp/extractor/anvato.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,30 +340,16 @@ def _get_anvato_videos(self, access_key, video_id):
'subtitles': subtitles,
}

@staticmethod
def _extract_urls(ie, webpage, video_id):
entries = []
for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
anvplayer_data = ie._parse_json(
mobj.group('anvp'), video_id, transform_source=unescapeHTML,
fatal=False)
if not anvplayer_data:
continue
video = anvplayer_data.get('video')
if not isinstance(video, compat_str) or not video.isdigit():
continue
access_key = anvplayer_data.get('accessKey')
if not access_key:
mcp = anvplayer_data.get('mcp')
if mcp:
access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
mcp.lower())
@classmethod
def _extract_from_webpage(cls, url, webpage):
for mobj in re.finditer(cls._ANVP_RE, webpage):
anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {}
video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey')
if not access_key:
access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
if not (video_id or '').isdigit() or not access_key:
continue
entries.append(ie.url_result(
'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
video_id=video))
return entries
yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id)

def _extract_anvato_videos(self, webpage, video_id):
anvplayer_data = self._parse_json(
Expand Down
11 changes: 1 addition & 10 deletions yt_dlp/extractor/apa.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import re

from .common import InfoExtractor
from ..utils import (
determine_ext,
Expand All @@ -10,6 +8,7 @@

class APAIE(InfoExtractor):
_VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1']
_TESTS = [{
'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
'md5': '2b12292faeb0a7d930c778c7a5b4759b',
Expand All @@ -30,14 +29,6 @@ class APAIE(InfoExtractor):
'only_matching': True,
}]

@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
webpage)]

def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id, base_url = mobj.group('id', 'base_url')
Expand Down
1 change: 1 addition & 0 deletions yt_dlp/extractor/aparat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

class AparatIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
_EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"']

_TESTS = [{
'url': 'http://www.aparat.com/v/wP8On',
Expand Down
4 changes: 2 additions & 2 deletions yt_dlp/extractor/arcpublishing.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ class ArcPublishingIE(InfoExtractor):
], 'video-api-cdn.%s.arcpublishing.com/api'),
]

@staticmethod
def _extract_urls(webpage):
@classmethod
def _extract_embed_urls(cls, url, webpage):
entries = []
# https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):
Expand Down
13 changes: 2 additions & 11 deletions yt_dlp/extractor/arkena.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import re

from .common import InfoExtractor
from ..utils import (
ExtractorError,
Expand All @@ -19,6 +17,8 @@ class ArkenaIE(InfoExtractor):
play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
)
'''
# See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1']
_TESTS = [{
'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310',
'md5': '97f117754e5f3c020f5f26da4a44ebaf',
Expand Down Expand Up @@ -50,15 +50,6 @@ class ArkenaIE(InfoExtractor):
'only_matching': True,
}]

@staticmethod
def _extract_url(webpage):
# See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
mobj = re.search(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
webpage)
if mobj:
return mobj.group('url')

def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
Expand Down
7 changes: 1 addition & 6 deletions yt_dlp/extractor/arte.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ def _real_extract(self, url):

class ArteTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
_TESTS = [{
'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
Expand All @@ -219,12 +220,6 @@ class ArteTVEmbedIE(InfoExtractor):
'only_matching': True,
}]

@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
webpage)]

def _real_extract(self, url):
qs = parse_qs(url)
json_url = qs['json_url'][0]
Expand Down
1 change: 1 addition & 0 deletions yt_dlp/extractor/bandcamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
Expand Down
1 change: 1 addition & 0 deletions yt_dlp/extractor/bbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class BBCCoUkIE(InfoExtractor):
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
_EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']

_LOGIN_URL = 'https://account.bbc.com/signin'
_NETRC_MACHINE = 'bbc'
Expand Down
9 changes: 1 addition & 8 deletions yt_dlp/extractor/bitchute.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

class BitChuteIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
_EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
'md5': '7e427d7ed7af5a75b5855705ec750e2b',
Expand All @@ -33,14 +34,6 @@ class BitChuteIE(InfoExtractor):
'only_matching': True,
}]

@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
webpage)]

def _real_extract(self, url):
video_id = self._match_id(url)

Expand Down
8 changes: 1 addition & 7 deletions yt_dlp/extractor/blogger.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import re

from ..utils import (
mimetype2ext,
parse_duration,
Expand All @@ -13,7 +11,7 @@
class BloggerIE(InfoExtractor):
IE_NAME = 'blogger.com'
_VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
_VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''
_EMBED_REGEX = [r'''<iframe[^>]+src=["'](?P<url>(?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''']
_TESTS = [{
'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
Expand All @@ -26,10 +24,6 @@ class BloggerIE(InfoExtractor):
}
}]

@staticmethod
def _extract_urls(webpage):
return re.findall(BloggerIE._VALID_EMBED, webpage)

def _real_extract(self, url):
token_id = self._match_id(url)
webpage = self._download_webpage(url, token_id)
Expand Down
2 changes: 1 addition & 1 deletion yt_dlp/extractor/buzzfeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def _real_extract(self, url):
continue
entries.append(self.url_result(video['url']))

facebook_urls = FacebookIE._extract_urls(webpage)
facebook_urls = FacebookIE._extract_embed_urls(url, webpage)
entries.extend([
self.url_result(facebook_url)
for facebook_url in facebook_urls])
Expand Down
7 changes: 1 addition & 6 deletions yt_dlp/extractor/channel9.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class Channel9IE(InfoExtractor):
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
_VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b']

_TESTS = [{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
Expand Down Expand Up @@ -78,12 +79,6 @@ class Channel9IE(InfoExtractor):

_RSS_URL = 'http://channel9.msdn.com/%s/RSS'

@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
webpage)

def _extract_list(self, video_id, rss_url=None):
if not rss_url:
rss_url = self._RSS_URL % video_id
Expand Down
2 changes: 2 additions & 0 deletions yt_dlp/extractor/cinchcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

class CinchcastIE(InfoExtractor):
_VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
_EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1']

_TESTS = [{
'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
'info_dict': {
Expand Down
10 changes: 1 addition & 9 deletions yt_dlp/extractor/cloudflarestream.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import base64
import re

from .common import InfoExtractor

Expand All @@ -16,6 +15,7 @@ class CloudflareStreamIE(InfoExtractor):
)
(?P<id>%s)
''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
_EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1']
_TESTS = [{
'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
'info_dict': {
Expand All @@ -37,14 +37,6 @@ class CloudflareStreamIE(InfoExtractor):
'only_matching': True,
}]

@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE),
webpage)]

def _real_extract(self, url):
video_id = self._match_id(url)
domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net'
Expand Down
5 changes: 5 additions & 0 deletions yt_dlp/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3882,6 +3882,11 @@ def _extract_embed_urls(cls, url, webpage):
class StopExtraction(Exception):
pass

@classmethod
def _extract_url(cls, webpage): # TODO: Remove
"""Only for compatibility with some older extractors"""
return next(iter(cls._extract_embed_urls(None, webpage) or []), None)


class SearchInfoExtractor(InfoExtractor):
"""
Expand Down
5 changes: 4 additions & 1 deletion yt_dlp/extractor/condenast.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ class CondeNastIE(InfoExtractor):
)''' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))

EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
_EMBED_REGEX = [r'''(?x)
<(?:iframe|script)[^>]+?src=(["\'])(?P<url>
(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?
)\1''' % '|'.join(_SITES.keys())]

_TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
Expand Down
2 changes: 2 additions & 0 deletions yt_dlp/extractor/crooksandliars.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

class CrooksAndLiarsIE(InfoExtractor):
_VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)'
_EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1']

_TESTS = [{
'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi',
'info_dict': {
Expand Down
2 changes: 1 addition & 1 deletion yt_dlp/extractor/cspan.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def add_referer(formats):
video_id = m.group('id')
video_type = 'program' if m.group('type') == 'prog' else 'clip'
else:
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
senate_isvp_url = SenateISVPIE._extract_url(webpage)
if senate_isvp_url:
title = self._og_search_title(webpage)
surl = smuggle_url(senate_isvp_url, {'force_title': title})
Expand Down
Loading

0 comments on commit bfd973e

Please sign in to comment.