|
@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor): |
|
|
return |
|
|
return |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
    """Shared scraping/pagination logic for playlist-like YouTube pages.

    Subclasses must define ``_VIDEO_RE``, a regex with a mandatory ``id``
    group (the 11-character video id), an optional ``title`` group and an
    optional ``index`` group (the link's position within the playlist page).
    """

    # Extract the video ids from the playlist pages
    def _entries(self, page, playlist_id):
        """Yield ``url_result`` entries for every video reachable from *page*.

        Follows the AJAX "Load more" widget (``data-uix-load-more-href``)
        until it disappears or an empty continuation is returned.

        page        -- HTML of the first playlist/channel page
        playlist_id -- id used in download progress messages
        """
        # On the first page both the video links and the "load more" button
        # live in the same HTML; subsequent pages deliver them as separate
        # JSON fields ('content_html' / 'load_more_widget_html').
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            for video_id, video_title in self.extract_videos_from_page(content_html):
                yield self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

    def extract_videos_from_page(self, page):
        """Return an iterable of ``(video_id, video_title)`` pairs from *page*.

        Duplicate ids are collapsed; a later duplicate may supply a title the
        first occurrence lacked. Titles may be None when the regex has no
        (or an unmatched) ``title`` group.
        """
        ids_in_page = []
        titles_in_page = []
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist
            # (not sure if still actual). Fixed: the guard must test the
            # 'index' group — 'id' is an 11-char video id and can never be
            # '0', so the previous check ('id' == '0') never skipped anything.
            if 'index' in mobj.groupdict() and mobj.group('index') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            try:
                # Duplicate id already recorded: keep the first slot, but
                # backfill its title if it was empty.
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            except ValueError:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubeIE(YoutubeBaseInfoExtractor): |
|
|
class YoutubeIE(YoutubeBaseInfoExtractor): |
|
|
IE_DESC = 'YouTube.com' |
|
|
IE_DESC = 'YouTube.com' |
|
|
_VALID_URL = r"""(?x)^ |
|
|
_VALID_URL = r"""(?x)^ |
|
@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
|
|
|
|
|
|
class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): |
|
|
IE_DESC = 'YouTube.com playlists' |
|
|
IE_DESC = 'YouTube.com playlists' |
|
|
_VALID_URL = r"""(?x)(?: |
|
|
_VALID_URL = r"""(?x)(?: |
|
|
(?:https?://)? |
|
|
(?:https?://)? |
|
@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) |
|
|
((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) |
|
|
)""" |
|
|
)""" |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' |
|
|
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' |
|
|
|
|
|
|
|
|
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' |
|
|
IE_NAME = 'youtube:playlist' |
|
|
IE_NAME = 'youtube:playlist' |
|
|
_TESTS = [{ |
|
|
_TESTS = [{ |
|
|
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', |
|
|
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', |
|
@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
else: |
|
|
else: |
|
|
self.report_warning('Youtube gives an alert message: ' + match) |
|
|
self.report_warning('Youtube gives an alert message: ' + match) |
|
|
|
|
|
|
|
|
# Extract the video ids from the playlist pages |
|
|
|
|
|
def _entries(): |
|
|
|
|
|
more_widget_html = content_html = page |
|
|
|
|
|
for page_num in itertools.count(1): |
|
|
|
|
|
matches = re.finditer(self._VIDEO_RE, content_html) |
|
|
|
|
|
# We remove the duplicates and the link with index 0 |
|
|
|
|
|
# (it's not the first video of the playlist) |
|
|
|
|
|
new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') |
|
|
|
|
|
for vid_id in new_ids: |
|
|
|
|
|
yield self.url_result(vid_id, 'Youtube', video_id=vid_id) |
|
|
|
|
|
|
|
|
|
|
|
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) |
|
|
|
|
|
if not mobj: |
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
more = self._download_json( |
|
|
|
|
|
'https://youtube.com/%s' % mobj.group('more'), playlist_id, |
|
|
|
|
|
'Downloading page #%s' % page_num, |
|
|
|
|
|
transform_source=uppercase_escape) |
|
|
|
|
|
content_html = more['content_html'] |
|
|
|
|
|
if not content_html.strip(): |
|
|
|
|
|
# Some webpages show a "Load more" button but they don't |
|
|
|
|
|
# have more videos |
|
|
|
|
|
break |
|
|
|
|
|
more_widget_html = more['load_more_widget_html'] |
|
|
|
|
|
|
|
|
|
|
|
playlist_title = self._html_search_regex( |
|
|
playlist_title = self._html_search_regex( |
|
|
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', |
|
|
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', |
|
|
page, 'title') |
|
|
page, 'title') |
|
|
|
|
|
|
|
|
return self.playlist_result(_entries(), playlist_id, playlist_title) |
|
|
|
|
|
|
|
|
return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) |
|
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
def _real_extract(self, url): |
|
|
# Extract playlist id |
|
|
# Extract playlist id |
|
@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): |
|
|
return self._extract_playlist(playlist_id) |
|
|
return self._extract_playlist(playlist_id) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubeChannelIE(InfoExtractor): |
|
|
|
|
|
|
|
|
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): |
|
|
IE_DESC = 'YouTube.com channels' |
|
|
IE_DESC = 'YouTube.com channels' |
|
|
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' |
|
|
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' |
|
|
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' |
|
|
|
|
|
_VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' |
|
|
IE_NAME = 'youtube:channel' |
|
|
IE_NAME = 'youtube:channel' |
|
|
_TESTS = [{ |
|
|
_TESTS = [{ |
|
|
'note': 'paginated channel', |
|
|
'note': 'paginated channel', |
|
@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor): |
|
|
} |
|
|
} |
|
|
}] |
|
|
}] |
|
|
|
|
|
|
|
|
@staticmethod |
|
|
|
|
|
def extract_videos_from_page(page): |
|
|
|
|
|
ids_in_page = [] |
|
|
|
|
|
titles_in_page = [] |
|
|
|
|
|
for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): |
|
|
|
|
|
video_id = mobj.group('id') |
|
|
|
|
|
video_title = unescapeHTML(mobj.group('title')) |
|
|
|
|
|
try: |
|
|
|
|
|
idx = ids_in_page.index(video_id) |
|
|
|
|
|
if video_title and not titles_in_page[idx]: |
|
|
|
|
|
titles_in_page[idx] = video_title |
|
|
|
|
|
except ValueError: |
|
|
|
|
|
ids_in_page.append(video_id) |
|
|
|
|
|
titles_in_page.append(video_title) |
|
|
|
|
|
return zip(ids_in_page, titles_in_page) |
|
|
|
|
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
def _real_extract(self, url): |
|
|
channel_id = self._match_id(url) |
|
|
channel_id = self._match_id(url) |
|
|
|
|
|
|
|
@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor): |
|
|
for video_id, video_title in self.extract_videos_from_page(channel_page)] |
|
|
for video_id, video_title in self.extract_videos_from_page(channel_page)] |
|
|
return self.playlist_result(entries, channel_id) |
|
|
return self.playlist_result(entries, channel_id) |
|
|
|
|
|
|
|
|
def _entries(): |
|
|
|
|
|
more_widget_html = content_html = channel_page |
|
|
|
|
|
for pagenum in itertools.count(1): |
|
|
|
|
|
|
|
|
|
|
|
for video_id, video_title in self.extract_videos_from_page(content_html): |
|
|
|
|
|
yield self.url_result( |
|
|
|
|
|
video_id, 'Youtube', video_id=video_id, |
|
|
|
|
|
video_title=video_title) |
|
|
|
|
|
|
|
|
|
|
|
mobj = re.search( |
|
|
|
|
|
r'data-uix-load-more-href="/?(?P<more>[^"]+)"', |
|
|
|
|
|
more_widget_html) |
|
|
|
|
|
if not mobj: |
|
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
more = self._download_json( |
|
|
|
|
|
'https://youtube.com/%s' % mobj.group('more'), channel_id, |
|
|
|
|
|
'Downloading page #%s' % (pagenum + 1), |
|
|
|
|
|
transform_source=uppercase_escape) |
|
|
|
|
|
content_html = more['content_html'] |
|
|
|
|
|
more_widget_html = more['load_more_widget_html'] |
|
|
|
|
|
|
|
|
|
|
|
return self.playlist_result(_entries(), channel_id) |
|
|
|
|
|
|
|
|
return self.playlist_result(self._entries(channel_page, channel_id), channel_id) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YoutubeUserIE(YoutubeChannelIE): |
|
|
class YoutubeUserIE(YoutubeChannelIE): |
|
|