|
@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor): |
|
|
return |
|
|
return |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
    """Shared scraping/pagination logic for playlist-like YouTube pages.

    Subclasses must define ``_VIDEO_RE``, a regex with a mandatory ``id``
    group (the 11-character video id), an optional ``title`` group and an
    optional ``index`` group (the link's position within the playlist page).
    """

    # Extract the video ids from the playlist pages
    def _entries(self, page, playlist_id):
        """Yield ``url_result`` entries for every video reachable from *page*.

        Follows the AJAX "Load more" widget (``data-uix-load-more-href``)
        until it disappears or an empty continuation is returned.

        page        -- HTML of the first playlist/channel page
        playlist_id -- id used in download progress messages
        """
        # On the first page both the video links and the "load more" button
        # live in the same HTML; subsequent pages deliver them as separate
        # JSON fields ('content_html' / 'load_more_widget_html').
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            for video_id, video_title in self.extract_videos_from_page(content_html):
                yield self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

    def extract_videos_from_page(self, page):
        """Return an iterable of ``(video_id, video_title)`` pairs from *page*.

        Duplicate ids are collapsed; a later duplicate may supply a title the
        first occurrence lacked. Titles may be None when the regex has no
        (or an unmatched) ``title`` group.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist
            # (not sure if still actual). Fixed: the guard must test the
            # 'index' group — 'id' is an 11-char video id and can never be
            # '0', so the previous check ('id' == '0') never skipped anything.
            if 'index' in mobj.groupdict() and mobj.group('index') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            try:
                # Duplicate id already recorded: keep the first slot, but
                # backfill its title if it was empty.
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubeIE(YoutubeBaseInfoExtractor): |
|
|
class YoutubeIE(YoutubeBaseInfoExtractor): |
|
|
IE_DESC = 'YouTube.com' |
|
|
IE_DESC = 'YouTube.com' |
|
|
_VALID_URL = r"""(?x)^ |
|
|
_VALID_URL = r"""(?x)^ |
|
@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
|
|
|
|
|
|
class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): |
|
|
IE_DESC = 'YouTube.com playlists' |
|
|
IE_DESC = 'YouTube.com playlists' |
|
|
_VALID_URL = r"""(?x)(?: |
|
|
_VALID_URL = r"""(?x)(?: |
|
|
(?:https?://)? |
|
|
(?:https?://)? |
|
@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) |
|
|
((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) |
|
|
)""" |
|
|
)""" |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' |
|
|
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' |
|
|
|
|
|
|
|
|
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' |
|
|
IE_NAME = 'youtube:playlist' |
|
|
IE_NAME = 'youtube:playlist' |
|
|
_TESTS = [{ |
|
|
_TESTS = [{ |
|
|
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', |
|
|
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', |
|
@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
else: |
|
|
else: |
|
|
self.report_warning('Youtube gives an alert message: ' + match) |
|
|
self.report_warning('Youtube gives an alert message: ' + match) |
|
|
|
|
|
|
|
|
# Extract the video ids from the playlist pages |
|
|
|
|
|
def _entries(): |
|
|
|
|
|
more_widget_html = content_html = page |
|
|
|
|
|
for page_num in itertools.count(1): |
|
|
|
|
|
matches = re.finditer(self._VIDEO_RE, content_html) |
|
|
|
|
|
# We remove the duplicates and the link with index 0 |
|
|
|
|
|
# (it's not the first video of the playlist) |
|
|
|
|
|
new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') |
|
|
|
|
|
for vid_id in new_ids: |
|
|
|
|
|
yield self.url_result(vid_id, 'Youtube', video_id=vid_id) |
|
|
|
|
|
|
|
|
|
|
|
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) |
|
|
|
|
|
if not mobj: |
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
more = self._download_json( |
|
|
|
|
|
'https://youtube.com/%s' % mobj.group('more'), playlist_id, |
|
|
|
|
|
'Downloading page #%s' % page_num, |
|
|
|
|
|
transform_source=uppercase_escape) |
|
|
|
|
|
content_html = more['content_html'] |
|
|
|
|
|
if not content_html.strip(): |
|
|
|
|
|
# Some webpages show a "Load more" button but they don't |
|
|
|
|
|
# have more videos |
|
|
|
|
|
break |
|
|
|
|
|
more_widget_html = more['load_more_widget_html'] |
|
|
|
|
|
|
|
|
|
|
|
playlist_title = self._html_search_regex( |
|
|
playlist_title = self._html_search_regex( |
|
|
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', |
|
|
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', |
|
|
page, 'title') |
|
|
page, 'title') |
|
|
|
|
|
|
|
|
return self.playlist_result(_entries(), playlist_id, playlist_title) |
|
|
|
|
|
|
|
|
return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) |
|
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
def _real_extract(self, url): |
|
|
# Extract playlist id |
|
|
# Extract playlist id |
|
@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
return self._extract_playlist(playlist_id) |
|
|
return self._extract_playlist(playlist_id) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubeChannelIE(InfoExtractor): |
|
|
|
|
|
|
|
|
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): |
|
|
IE_DESC = 'YouTube.com channels' |
|
|
IE_DESC = 'YouTube.com channels' |
|
|
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' |
|
|
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' |
|
|
|
|
|
_VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' |
|
|
IE_NAME = 'youtube:channel' |
|
|
IE_NAME = 'youtube:channel' |
|
|
_TESTS = [{ |
|
|
_TESTS = [{ |
|
|
'note': 'paginated channel', |
|
|
'note': 'paginated channel', |
|
@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor): |
|
|
} |
|
|
} |
|
|
}] |
|
|
}] |
|
|
|
|
|
|
|
|
@staticmethod |
|
|
|
|
|
def extract_videos_from_page(page): |
|
|
|
|
|
ids_in_page = [] |
|
|
|
|
|
titles_in_page = [] |
|
|
|
|
|
for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): |
|
|
|
|
|
video_id = mobj.group('id') |
|
|
|
|
|
video_title = unescapeHTML(mobj.group('title')) |
|
|
|
|
|
try: |
|
|
|
|
|
idx = ids_in_page.index(video_id) |
|
|
|
|
|
if video_title and not titles_in_page[idx]: |
|
|
|
|
|
titles_in_page[idx] = video_title |
|
|
|
|
|
except ValueError: |
|
|
|
|
|
ids_in_page.append(video_id) |
|
|
|
|
|
titles_in_page.append(video_title) |
|
|
|
|
|
return zip(ids_in_page, titles_in_page) |
|
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
def _real_extract(self, url): |
|
|
channel_id = self._match_id(url) |
|
|
channel_id = self._match_id(url) |
|
|
|
|
|
|
|
@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor): |
|
|
for video_id, video_title in self.extract_videos_from_page(channel_page)] |
|
|
for video_id, video_title in self.extract_videos_from_page(channel_page)] |
|
|
return self.playlist_result(entries, channel_id) |
|
|
return self.playlist_result(entries, channel_id) |
|
|
|
|
|
|
|
|
def _entries(): |
|
|
|
|
|
more_widget_html = content_html = channel_page |
|
|
|
|
|
for pagenum in itertools.count(1): |
|
|
|
|
|
|
|
|
|
|
|
for video_id, video_title in self.extract_videos_from_page(content_html): |
|
|
|
|
|
yield self.url_result( |
|
|
|
|
|
video_id, 'Youtube', video_id=video_id, |
|
|
|
|
|
video_title=video_title) |
|
|
|
|
|
|
|
|
|
|
|
mobj = re.search( |
|
|
|
|
|
r'data-uix-load-more-href="/?(?P<more>[^"]+)"', |
|
|
|
|
|
more_widget_html) |
|
|
|
|
|
if not mobj: |
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
more = self._download_json( |
|
|
|
|
|
'https://youtube.com/%s' % mobj.group('more'), channel_id, |
|
|
|
|
|
'Downloading page #%s' % (pagenum + 1), |
|
|
|
|
|
transform_source=uppercase_escape) |
|
|
|
|
|
content_html = more['content_html'] |
|
|
|
|
|
more_widget_html = more['load_more_widget_html'] |
|
|
|
|
|
|
|
|
|
|
|
return self.playlist_result(_entries(), channel_id) |
|
|
|
|
|
|
|
|
return self.playlist_result(self._entries(channel_page, channel_id), channel_id) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubeUserIE(YoutubeChannelIE): |
|
|
class YoutubeUserIE(YoutubeChannelIE): |
|
|