[extractors] Use new framework for existing embeds (yt-dlp#4307)

`Brightcove` is difficult to migrate because its subclasses may depend on the signature of the current functions, so it is left as-is for now. Note: Tests have not been migrated.
mingming-ctr · Aug 1, 2022 · bfd973e · bfd973e
1 parent 1e8fe57
commit bfd973e
Show file tree

Hide file tree

Showing 138 changed files with 500 additions and 1,910 deletions.
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
@@ -446,7 +446,7 @@
     DWIE,
     DWArticleIE,
 )
-from .eagleplatform import EaglePlatformIE
+from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
 from .egghead import (
@@ -1555,6 +1555,7 @@
     SharedIE,
     VivoIE,
 )
+from .sharevideos import ShareVideosEmbedIE
 from .shemaroome import ShemarooMeIE
 from .showroomlive import ShowRoomLiveIE
 from .simplecast import (

diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py
@@ -232,6 +232,7 @@ def _real_extract(self, url):
 class AdobeTVVideoIE(AdobeTVBaseIE):
     IE_NAME = 'adobetv:video'
     _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]']
 
     _TEST = {
         # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners

diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py
@@ -1,4 +1,3 @@
-import re
 import urllib.parse
 
 from .common import InfoExtractor
@@ -7,7 +6,6 @@
     ExtractorError,
     determine_ext,
     scale_thumbnails_to_max_format_width,
-    unescapeHTML,
 )
 
 
@@ -91,7 +89,7 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
         info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
-        embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
+        embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
         if not embed_urls:
             raise ExtractorError('no videos found for %s' % video_id, expected=True)
         return self.playlist_from_matches(
@@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
     IE_DESC = 'ant1news.gr embedded videos'
     _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
     _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
     _API_PATH = '/news/templates/data/jsonPlayer'
 
     _TESTS = [{
@@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
         },
     }]
 
-    @classmethod
-    def _extract_urls(cls, webpage):
-        _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
-        _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)'
-        for mobj in re.finditer(_EMBED_RE, webpage):
-            url = unescapeHTML(mobj.group('url'))
-            if not cls.suitable(url):
-                continue
-            yield url
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
 

diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py
@@ -340,30 +340,16 @@ def _get_anvato_videos(self, access_key, video_id):
             'subtitles': subtitles,
         }
 
-    @staticmethod
-    def _extract_urls(ie, webpage, video_id):
-        entries = []
-        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
-            anvplayer_data = ie._parse_json(
-                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
-                fatal=False)
-            if not anvplayer_data:
-                continue
-            video = anvplayer_data.get('video')
-            if not isinstance(video, compat_str) or not video.isdigit():
-                continue
-            access_key = anvplayer_data.get('accessKey')
-            if not access_key:
-                mcp = anvplayer_data.get('mcp')
-                if mcp:
-                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
-                        mcp.lower())
+    @classmethod
+    def _extract_from_webpage(cls, url, webpage):
+        for mobj in re.finditer(cls._ANVP_RE, webpage):
+            anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {}
+            video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey')
             if not access_key:
+                access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
+            if not (video_id or '').isdigit() or not access_key:
                 continue
-            entries.append(ie.url_result(
-                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
-                video_id=video))
-        return entries
+            yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id)
 
     def _extract_anvato_videos(self, webpage, video_id):
         anvplayer_data = self._parse_json(

diff --git a/yt_dlp/extractor/apa.py b/yt_dlp/extractor/apa.py
@@ -1,5 +1,3 @@
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
@@ -10,6 +8,7 @@
 
 class APAIE(InfoExtractor):
     _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1']
     _TESTS = [{
         'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
         'md5': '2b12292faeb0a7d930c778c7a5b4759b',
@@ -30,14 +29,6 @@ class APAIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
-                webpage)]
-
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         video_id, base_url = mobj.group('id', 'base_url')

diff --git a/yt_dlp/extractor/aparat.py b/yt_dlp/extractor/aparat.py
@@ -10,6 +10,7 @@
 
 class AparatIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+    _EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"']
 
     _TESTS = [{
         'url': 'http://www.aparat.com/v/wP8On',

diff --git a/yt_dlp/extractor/arcpublishing.py b/yt_dlp/extractor/arcpublishing.py
@@ -70,8 +70,8 @@ class ArcPublishingIE(InfoExtractor):
         ], 'video-api-cdn.%s.arcpublishing.com/api'),
     ]
 
-    @staticmethod
-    def _extract_urls(webpage):
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
         entries = []
         # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
         for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):

diff --git a/yt_dlp/extractor/arkena.py b/yt_dlp/extractor/arkena.py
@@ -1,5 +1,3 @@
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
@@ -19,6 +17,8 @@ class ArkenaIE(InfoExtractor):
                                 play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
                             )
                         '''
+    # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
+    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1']
     _TESTS = [{
         'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310',
         'md5': '97f117754e5f3c020f5f26da4a44ebaf',
@@ -50,15 +50,6 @@ class ArkenaIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def _extract_url(webpage):
-        # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
-        mobj = re.search(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         video_id = mobj.group('id')

diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py
@@ -204,6 +204,7 @@ def _real_extract(self, url):
 
 class ArteTVEmbedIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
+    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
     _TESTS = [{
         'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
         'info_dict': {
@@ -219,12 +220,6 @@ class ArteTVEmbedIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def _extract_urls(webpage):
-        return [url for _, url in re.findall(
-            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
-            webpage)]
-
     def _real_extract(self, url):
         qs = parse_qs(url)
         json_url = qs['json_url'][0]

diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py
@@ -22,6 +22,7 @@
 
 class BandcampIE(InfoExtractor):
     _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
+    _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"']
     _TESTS = [{
         'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
         'md5': 'c557841d5e50261777a6585648adf439',

diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py
@@ -46,6 +46,7 @@ class BBCCoUkIE(InfoExtractor):
                         )
                         (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                     ''' % _ID_REGEX
+    _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
 
     _LOGIN_URL = 'https://account.bbc.com/signin'
     _NETRC_MACHINE = 'bbc'

diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py
@@ -13,6 +13,7 @@
 
 class BitChuteIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
+    _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
     _TESTS = [{
         'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
         'md5': '7e427d7ed7af5a75b5855705ec750e2b',
@@ -33,14 +34,6 @@ class BitChuteIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
-                webpage)]
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
 

diff --git a/yt_dlp/extractor/blogger.py b/yt_dlp/extractor/blogger.py
@@ -1,5 +1,3 @@
-import re
-
 from ..utils import (
     mimetype2ext,
     parse_duration,
@@ -13,7 +11,7 @@
 class BloggerIE(InfoExtractor):
     IE_NAME = 'blogger.com'
     _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
-    _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''
+    _EMBED_REGEX = [r'''<iframe[^>]+src=["'](?P<url>(?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''']
     _TESTS = [{
         'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
         'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
@@ -26,10 +24,6 @@ class BloggerIE(InfoExtractor):
         }
     }]
 
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(BloggerIE._VALID_EMBED, webpage)
-
     def _real_extract(self, url):
         token_id = self._match_id(url)
         webpage = self._download_webpage(url, token_id)

diff --git a/yt_dlp/extractor/buzzfeed.py b/yt_dlp/extractor/buzzfeed.py
@@ -81,7 +81,7 @@ def _real_extract(self, url):
                 continue
             entries.append(self.url_result(video['url']))
 
-        facebook_urls = FacebookIE._extract_urls(webpage)
+        facebook_urls = FacebookIE._extract_embed_urls(url, webpage)
         entries.extend([
             self.url_result(facebook_url)
             for facebook_url in facebook_urls])

diff --git a/yt_dlp/extractor/channel9.py b/yt_dlp/extractor/channel9.py
@@ -14,6 +14,7 @@ class Channel9IE(InfoExtractor):
     IE_DESC = 'Channel 9'
     IE_NAME = 'channel9'
     _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b']
 
     _TESTS = [{
         'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
@@ -78,12 +79,6 @@ class Channel9IE(InfoExtractor):
 
     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
 
-    @staticmethod
-    def _extract_urls(webpage):
-        return re.findall(
-            r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
-            webpage)
-
     def _extract_list(self, video_id, rss_url=None):
         if not rss_url:
             rss_url = self._RSS_URL % video_id

diff --git a/yt_dlp/extractor/cinchcast.py b/yt_dlp/extractor/cinchcast.py
@@ -7,6 +7,8 @@
 
 class CinchcastIE(InfoExtractor):
     _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
+    _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1']
+
     _TESTS = [{
         'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
         'info_dict': {

diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py
@@ -1,5 +1,4 @@
 import base64
-import re
 
 from .common import InfoExtractor
 
@@ -16,6 +15,7 @@ class CloudflareStreamIE(InfoExtractor):
                         )
                         (?P<id>%s)
                     ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
+    _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1']
     _TESTS = [{
         'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
         'info_dict': {
@@ -37,14 +37,6 @@ class CloudflareStreamIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def _extract_urls(webpage):
-        return [
-            mobj.group('url')
-            for mobj in re.finditer(
-                r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE),
-                webpage)]
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
         domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net'

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
@@ -3882,6 +3882,11 @@ def _extract_embed_urls(cls, url, webpage):
     class StopExtraction(Exception):
         pass
 
+    @classmethod
+    def _extract_url(cls, webpage):  # TODO: Remove
+        """Only for compatibility with some older extractors"""
+        return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
+
 
 class SearchInfoExtractor(InfoExtractor):
     """

diff --git a/yt_dlp/extractor/condenast.py b/yt_dlp/extractor/condenast.py
@@ -58,7 +58,10 @@ class CondeNastIE(InfoExtractor):
         )''' % '|'.join(_SITES.keys())
     IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
 
-    EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
+    _EMBED_REGEX = [r'''(?x)
+        <(?:iframe|script)[^>]+?src=(["\'])(?P<url>
+            (?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?
+        )\1''' % '|'.join(_SITES.keys())]
 
     _TESTS = [{
         'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',

diff --git a/yt_dlp/extractor/crooksandliars.py b/yt_dlp/extractor/crooksandliars.py
@@ -7,6 +7,8 @@
 
 class CrooksAndLiarsIE(InfoExtractor):
     _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)'
+    _EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1']
+
     _TESTS = [{
         'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi',
         'info_dict': {

diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py
@@ -163,7 +163,7 @@ def add_referer(formats):
                 video_id = m.group('id')
                 video_type = 'program' if m.group('type') == 'prog' else 'clip'
             else:
-                senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+                senate_isvp_url = SenateISVPIE._extract_url(webpage)
                 if senate_isvp_url:
                     title = self._og_search_title(webpage)
                     surl = smuggle_url(senate_isvp_url, {'force_title': title})