Browse Source

[vk] improve extraction

- add support for Odnoklassniki embeds
- update tests
- extract more videos from user lists (closes #4470)
- fix wall post audio extraction (closes #18332)
- improve error detection (closes #22568)
master
Remita Amine 5 years ago
parent
commit
3c989818e7
1 changed files with 173 additions and 156 deletions
  1. 329
      youtube_dl/extractor/vk.py

329
youtube_dl/extractor/vk.py

@ -12,7 +12,6 @@ from ..utils import (
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
remove_start,
str_or_none, str_or_none,
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
@ -21,6 +20,7 @@ from ..utils import (
urlencode_postdata, urlencode_postdata,
) )
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
from .odnoklassniki import OdnoklassnikiIE
from .pladform import PladformIE from .pladform import PladformIE
from .vimeo import VimeoIE from .vimeo import VimeoIE
from .youtube import YoutubeIE from .youtube import YoutubeIE
@ -60,6 +60,18 @@ class VKBaseIE(InfoExtractor):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _download_payload(self, path, video_id, data, fatal=True):
    """POST ``data`` to ``https://vk.com/<path>.php`` and return the payload.

    The JSON response is expected to carry a two-item ``payload`` entry:
    a status code followed by the actual payload, which is what is returned.

    :param path: script name without the ``.php`` suffix (e.g. ``al_video``).
    :param video_id: identifier passed on to ``_download_json``.
    :param data: dict of form fields; it is not modified, ``al=1`` is added
        to a private copy before the request is sent.
    :param fatal: forwarded to ``_download_json``.  When false and the
        download yields nothing, ``None`` is returned.
    :raises ExtractorError: when the status code is ``'8'``; the message is
        taken from the first payload item.  Status code ``'3'`` is reported
        through ``raise_login_required()``.
    """
    # Copy so that the caller's dict is not silently extended with 'al'.
    form = dict(data)
    form['al'] = 1
    response = self._download_json(
        'https://vk.com/%s.php' % path, video_id,
        data=urlencode_postdata(form), fatal=fatal,
        headers={'X-Requested-With': 'XMLHttpRequest'})
    # NOTE(review): assumes _download_json returns a falsy value when a
    # non-fatal request fails - confirm against InfoExtractor.  Without this
    # guard the subscript below turns a tolerated failure into a crash.
    if not fatal and not response:
        return None
    code, payload = response['payload']
    if code == '3':
        self.raise_login_required()
    elif code == '8':
        # The first and last characters of the message are dropped
        # (presumably surrounding quotes - TODO confirm) before cleaning.
        raise ExtractorError(clean_html(payload[0][1:-1]), expected=True)
    return payload
class VKIE(VKBaseIE): class VKIE(VKBaseIE):
IE_NAME = 'vk' IE_NAME = 'vk'
@ -96,7 +108,6 @@ class VKIE(VKBaseIE):
}, },
{ {
'url': 'http://vk.com/video205387401_165548505', 'url': 'http://vk.com/video205387401_165548505',
'md5': '6c0aeb2e90396ba97035b9cbde548700',
'info_dict': { 'info_dict': {
'id': '205387401_165548505', 'id': '205387401_165548505',
'ext': 'mp4', 'ext': 'mp4',
@ -110,18 +121,18 @@ class VKIE(VKBaseIE):
}, },
{ {
'note': 'Embedded video', 'note': 'Embedded video',
'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa',
'md5': '7babad3b85ea2e91948005b1b8b0cb84',
'info_dict': { 'info_dict': {
'id': '32194266_162925554',
'id': '-77521_162222515',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Vladimir Gavrin',
'title': 'Lin Dan',
'duration': 101,
'upload_date': '20120730',
'view_count': int,
'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'title': 'ProtivoGunz - Хуёвая песня',
'duration': 195,
'upload_date': '20120212',
'timestamp': 1329049880,
'uploader_id': '-77521',
}, },
'skip': 'This video has been removed from public access.',
}, },
{ {
# VIDEO NOW REMOVED # VIDEO NOW REMOVED
@ -138,18 +149,19 @@ class VKIE(VKBaseIE):
'upload_date': '20121218', 'upload_date': '20121218',
'view_count': int, 'view_count': int,
}, },
'skip': 'Requires vk account credentials',
'skip': 'Removed',
}, },
{ {
'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
'md5': '4d7a5ef8cf114dfa09577e57b2993202',
'info_dict': { 'info_dict': {
'id': '-43215063_168067957', 'id': '-43215063_168067957',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Киномания - лучшее из мира кино',
'uploader': 'Bro Mazter',
'title': ' ', 'title': ' ',
'duration': 7291, 'duration': 7291,
'upload_date': '20140328', 'upload_date': '20140328',
'uploader_id': '223413403',
'timestamp': 1396018030,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
}, },
@ -165,7 +177,7 @@ class VKIE(VKBaseIE):
'upload_date': '20140626', 'upload_date': '20140626',
'view_count': int, 'view_count': int,
}, },
'skip': 'Only works from Russia',
'skip': 'Removed',
}, },
{ {
# video (removed?) only available with list id # video (removed?) only available with list id
@ -247,6 +259,9 @@ class VKIE(VKBaseIE):
'uploader_id': '-387766', 'uploader_id': '-387766',
'timestamp': 1475137527, 'timestamp': 1475137527,
}, },
'params': {
'skip_download': True,
},
}, },
{ {
# live stream, hls and rtmp links, most likely already finished live # live stream, hls and rtmp links, most likely already finished live
@ -288,80 +303,94 @@ class VKIE(VKBaseIE):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid') video_id = mobj.group('videoid')
mv_data = {}
if video_id: if video_id:
info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id
data = {
'act': 'show_inline',
'video': video_id,
}
# Some videos (removed?) can only be downloaded with list id specified # Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id') list_id = mobj.group('list_id')
if list_id: if list_id:
info_url += '&list=%s' % list_id
data['list'] = list_id
payload = self._download_payload('al_video', video_id, data)
info_page = payload[1]
opts = payload[-1]
mv_data = opts.get('mvData') or {}
player = opts.get('player') or {}
else: else:
info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_page = self._download_webpage(info_url, video_id)
info_page = self._download_webpage(
'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id)
error_message = self._html_search_regex(
[r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
info_page, 'error message', default=None)
if error_message:
raise ExtractorError(error_message, expected=True)
error_message = self._html_search_regex(
[r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
info_page, 'error message', default=None)
if error_message:
raise ExtractorError(error_message, expected=True)
if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
raise ExtractorError(
'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
expected=True)
if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
raise ExtractorError(
'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
expected=True)
ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.'
ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.'
ERRORS = {
r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
ERROR_COPYRIGHT,
ERRORS = {
r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
ERROR_COPYRIGHT,
r'>The video .*? was removed from public access by request of the copyright holder.<':
ERROR_COPYRIGHT,
r'>The video .*? was removed from public access by request of the copyright holder.<':
ERROR_COPYRIGHT,
r'<!>Please log in or <':
'Video %s is only available for registered users, '
'use --username and --password options to provide account credentials.',
r'<!>Please log in or <':
'Video %s is only available for registered users, '
'use --username and --password options to provide account credentials.',
r'<!>Unknown error':
'Video %s does not exist.',
r'<!>Unknown error':
'Video %s does not exist.',
r'<!>Видео временно недоступно':
'Video %s is temporarily unavailable.',
r'<!>Видео временно недоступно':
'Video %s is temporarily unavailable.',
r'<!>Access denied':
'Access denied to video %s.',
r'<!>Access denied':
'Access denied to video %s.',
r'<!>Видеозапись недоступна, так как её автор был заблокирован.':
'Video %s is no longer available, because its author has been blocked.',
r'<!>Видеозапись недоступна, так как её автор был заблокирован.':
'Video %s is no longer available, because its author has been blocked.',
r'<!>This video is no longer available, because its author has been blocked.':
'Video %s is no longer available, because its author has been blocked.',
r'<!>This video is no longer available, because its author has been blocked.':
'Video %s is no longer available, because its author has been blocked.',
r'<!>This video is no longer available, because it has been deleted.':
'Video %s is no longer available, because it has been deleted.',
r'<!>This video is no longer available, because it has been deleted.':
'Video %s is no longer available, because it has been deleted.',
r'<!>The video .+? is not available in your region.':
'Video %s is not available in your region.',
}
r'<!>The video .+? is not available in your region.':
'Video %s is not available in your region.',
}
for error_re, error_msg in ERRORS.items():
if re.search(error_re, info_page):
raise ExtractorError(error_msg % video_id, expected=True)
for error_re, error_msg in ERRORS.items():
if re.search(error_re, info_page):
raise ExtractorError(error_msg % video_id, expected=True)
player = self._parse_json(self._search_regex(
r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n',
info_page, 'player params'), video_id)
youtube_url = YoutubeIE._extract_url(info_page) youtube_url = YoutubeIE._extract_url(info_page)
if youtube_url: if youtube_url:
return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
return self.url_result(youtube_url, YoutubeIE.ie_key())
vimeo_url = VimeoIE._extract_url(url, info_page) vimeo_url = VimeoIE._extract_url(url, info_page)
if vimeo_url is not None: if vimeo_url is not None:
return self.url_result(vimeo_url)
return self.url_result(vimeo_url, VimeoIE.ie_key())
pladform_url = PladformIE._extract_url(info_page) pladform_url = PladformIE._extract_url(info_page)
if pladform_url: if pladform_url:
return self.url_result(pladform_url)
return self.url_result(pladform_url, PladformIE.ie_key())
m_rutube = re.search( m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page)
@ -374,6 +403,10 @@ class VKIE(VKBaseIE):
if dailymotion_urls: if dailymotion_urls:
return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page)
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts: if m_opts:
m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
@ -383,38 +416,7 @@ class VKIE(VKBaseIE):
opts_url = 'http:' + opts_url opts_url = 'http:' + opts_url
return self.url_result(opts_url) return self.url_result(opts_url)
# vars does not look to be served anymore since 24.10.2016
data = self._parse_json(
self._search_regex(
r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'),
video_id, fatal=False)
# <!json> is served instead
if not data:
data = self._parse_json(
self._search_regex(
[r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'],
info_page, 'json', default='{}'),
video_id)
if data:
data = data['player']['params'][0]
if not data:
data = self._parse_json(
self._search_regex(
r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page,
'player params', default='{}'),
video_id)
if data:
data = data['params'][0]
# <!--{...}
if not data:
data = self._parse_json(
self._search_regex(
r'<!--\s*({.+})', info_page, 'payload'),
video_id)['payload'][-1][-1]['player']['params'][0]
data = player['params'][0]
title = unescapeHTML(data['md_title']) title = unescapeHTML(data['md_title'])
# 2 = live # 2 = live
@ -463,12 +465,12 @@ class VKIE(VKBaseIE):
'title': title, 'title': title,
'thumbnail': data.get('jpg'), 'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'), 'uploader': data.get('md_author'),
'uploader_id': str_or_none(data.get('author_id')),
'duration': data.get('duration'),
'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')),
'duration': int_or_none(data.get('duration') or mv_data.get('duration')),
'timestamp': timestamp, 'timestamp': timestamp,
'view_count': view_count, 'view_count': view_count,
'like_count': int_or_none(data.get('liked')),
'dislike_count': int_or_none(data.get('nolikes')),
'like_count': int_or_none(mv_data.get('likes')),
'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live, 'is_live': is_live,
} }
@ -482,7 +484,6 @@ class VKUserVideosIE(VKBaseIE):
'url': 'http://vk.com/videos205387401', 'url': 'http://vk.com/videos205387401',
'info_dict': { 'info_dict': {
'id': '205387401', 'id': '205387401',
'title': "Tom Cruise's Videos",
}, },
'playlist_mincount': 4, 'playlist_mincount': 4,
}, { }, {
@ -498,22 +499,25 @@ class VKUserVideosIE(VKBaseIE):
'url': 'http://new.vk.com/videos205387401', 'url': 'http://new.vk.com/videos205387401',
'only_matching': True, 'only_matching': True,
}] }]
_VIDEO = collections.namedtuple(
'Video', ['owner_id', 'id', 'thumb', 'title', 'flags', 'duration', 'hash', 'moder_acts', 'owner', 'date', 'views', 'platform', 'blocked', 'music_video_meta'])
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
l = self._download_payload('al_video', page_id, {
'act': 'load_videos_silent',
'oid': page_id,
})[0]['']['list']
entries = [
self.url_result(
'http://vk.com/video' + video_id, 'VK', video_id=video_id)
for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))]
title = unescapeHTML(self._search_regex(
r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
webpage, 'title', default=page_id))
entries = []
for video in l:
v = self._VIDEO._make(video)
video_id = '%d_%d' % (v.owner_id, v.id)
entries.append(self.url_result(
'http://vk.com/video' + video_id, 'VK', video_id=video_id))
return self.playlist_result(entries, page_id, title)
return self.playlist_result(entries, page_id)
class VKWallPostIE(VKBaseIE): class VKWallPostIE(VKBaseIE):
@ -523,15 +527,15 @@ class VKWallPostIE(VKBaseIE):
# public page URL, audio playlist # public page URL, audio playlist
'url': 'https://vk.com/bs.official?w=wall-23538238_35', 'url': 'https://vk.com/bs.official?w=wall-23538238_35',
'info_dict': { 'info_dict': {
'id': '23538238_35',
'title': 'Black Shadow - Wall post 23538238_35',
'id': '-23538238_35',
'title': 'Black Shadow - Wall post -23538238_35',
'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
}, },
'playlist': [{ 'playlist': [{
'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
'info_dict': { 'info_dict': {
'id': '135220665_111806521', 'id': '135220665_111806521',
'ext': 'mp3',
'ext': 'mp4',
'title': 'Black Shadow - Слепое Верование', 'title': 'Black Shadow - Слепое Верование',
'duration': 370, 'duration': 370,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
@ -542,18 +546,16 @@ class VKWallPostIE(VKBaseIE):
'md5': '4cc7e804579122b17ea95af7834c9233', 'md5': '4cc7e804579122b17ea95af7834c9233',
'info_dict': { 'info_dict': {
'id': '135220665_111802303', 'id': '135220665_111802303',
'ext': 'mp3',
'ext': 'mp4',
'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
'duration': 423, 'duration': 423,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
'artist': 'Black Shadow', 'artist': 'Black Shadow',
'track': 'Война - Негасимое Бездны Пламя!', 'track': 'Война - Негасимое Бездны Пламя!',
}, },
'params': {
'skip_download': True,
},
}], }],
'params': { 'params': {
'skip_download': True,
'usenetrc': True, 'usenetrc': True,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
@ -562,7 +564,7 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://vk.com/wall85155021_6319', 'url': 'https://vk.com/wall85155021_6319',
'info_dict': { 'info_dict': {
'id': '85155021_6319', 'id': '85155021_6319',
'title': 'Sergey Gorbunov - Wall post 85155021_6319',
'title': 'Сергей Горбунов - Wall post 85155021_6319',
}, },
'playlist_count': 1, 'playlist_count': 1,
'params': { 'params': {
@ -578,58 +580,73 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://m.vk.com/wall-23538238_35', 'url': 'https://m.vk.com/wall-23538238_35',
'only_matching': True, 'only_matching': True,
}] }]
_BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='
_AUDIO = collections.namedtuple(
'Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads', 'subtitle', 'main_artists', 'feat_artists', 'album', 'track_code', 'restriction', 'album_part', 'new_stats', 'access_key'])
def _decode(self, enc):
dec = ''
e = n = 0
for c in enc:
r = self._BASE64_CHARS.index(c)
cond = n % 4
e = 64 * e + r if cond else r
n += 1
if cond:
dec += chr(255 & e >> (-2 * n & 6))
return dec
def _unmask_url(self, mask_url, vk_id):
if 'audio_api_unavailable' in mask_url:
extra = mask_url.split('?extra=')[1].split('#')
func, base = self._decode(extra[1]).split(chr(11))
assert (func == 'i')
mask_url = list(self._decode(extra[0]))
url_len = len(mask_url)
indexes = [None] * url_len
index = int(base) ^ vk_id
for n in range(url_len - 1, -1, -1):
index = (url_len * (n + 1) ^ index + n) % url_len
indexes[n] = index
for n in range(1, url_len):
c = mask_url[n]
index = indexes[url_len - 1 - n]
mask_url[n] = mask_url[index]
mask_url[index] = c
mask_url = ''.join(mask_url)
return mask_url
def _real_extract(self, url): def _real_extract(self, url):
post_id = self._match_id(url) post_id = self._match_id(url)
wall_url = 'https://vk.com/wall%s' % post_id
post_id = remove_start(post_id, '-')
webpage = self._download_webpage(wall_url, post_id)
error = self._html_search_regex(
r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)',
webpage, 'error', default=None)
if error:
raise ExtractorError('VK said: %s' % error, expected=True)
webpage = self._download_payload('wkview', post_id, {
'act': 'show',
'w': 'wall' + post_id,
})[1]
description = clean_html(get_element_by_class('wall_post_text', webpage)) description = clean_html(get_element_by_class('wall_post_text', webpage))
uploader = clean_html(get_element_by_class('author', webpage)) uploader = clean_html(get_element_by_class('author', webpage))
thumbnail = self._og_search_thumbnail(webpage)
entries = [] entries = []
audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage)
if audio_ids:
al_audio = self._download_webpage(
'https://vk.com/al_audio.php', post_id,
note='Downloading audio info', fatal=False,
data=urlencode_postdata({
'act': 'reload_audio',
'al': '1',
'ids': ','.join(audio_ids)
}))
if al_audio:
Audio = collections.namedtuple(
'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration'])
audios = self._parse_json(
self._search_regex(
r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'),
post_id, fatal=False, transform_source=unescapeHTML)
if isinstance(audios, list):
for audio in audios:
a = Audio._make(audio[:6])
entries.append({
'id': '%s_%s' % (a.user_id, a.id),
'url': a.url,
'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id,
'thumbnail': thumbnail,
'duration': a.duration,
'uploader': uploader,
'artist': a.artist,
'track': a.track,
})
for audio in re.findall(r'data-audio="([^"]+)', webpage):
audio = self._parse_json(unescapeHTML(audio), post_id)
a = self._AUDIO._make(audio)
if not a.url:
continue
title = unescapeHTML(a.title)
entries.append({
'id': '%s_%s' % (a.owner_id, a.id),
'url': self._unmask_url(a.url, a.ads['vk_id']),
'title': '%s - %s' % (a.performer, title) if a.performer else title,
'thumbnail': a.cover_url.split(',') if a.cover_url else None,
'duration': a.duration,
'uploader': uploader,
'artist': a.performer,
'track': title,
'ext': 'mp4',
'protocol': 'm3u8',
})
for video in re.finditer( for video in re.finditer(
r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):

Loading…
Cancel
Save