From a662489877aa8d88b898a4984c2e580b9edbe7de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Nov 2015 05:09:50 +0600 Subject: [PATCH] [brightcove:embedinpage] Make more robust and extract rtmp streams --- youtube_dl/extractor/brightcove.py | 109 ++++++++++++++++++++--------- 1 file changed, 75 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 2c7d968a8..2ad35ec90 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -23,6 +23,7 @@ from ..utils import ( unescapeHTML, unsmuggle_url, js_to_json, + float_or_none, int_or_none, parse_iso8601, extract_attributes, @@ -353,7 +354,7 @@ class BrightcoveIE(InfoExtractor): class BrightcoveInPageEmbedIE(InfoExtractor): - _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/(?P[\da-f-]+)_(?P[a-z]+)/index\.html\?.*videoId=(?P\d+)' _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -385,59 +386,99 @@ class BrightcoveInPageEmbedIE(InfoExtractor): def _real_extract(self, url): account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + webpage = self._download_webpage( + 'http://players.brightcove.net/%s/%s_%s/index.min.js' + % (account_id, player_id, embed), video_id) - catalog = self._parse_json( - js_to_json( - self._search_regex( - r'catalog\(({[^}]+})\);', - webpage, - 'catalog' - ) - ), - video_id - ) - policy_key = catalog['policyKey'] + policy_key = None + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') req = compat_urllib_request.Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' + % (account_id, video_id), headers={'Accept': 'application/json;pk=%s' % policy_key}) json_data = self._download_json(req, video_id) title = json_data['name'] - description = json_data.get('description') - thumbnail = json_data.get('thumbnail') - timestamp = parse_iso8601(json_data.get('published_at')) - duration = int_or_none(json_data.get('duration')) formats = [] - for source in json_data.get('sources'): + for source in json_data.get('sources', []): source_type = source.get('type') + src = source.get('src') if source_type == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + if not src: + continue + m3u8_formats = self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: - src = source.get('src') or source.get('streaming_src') - if src: - formats.append({ - 'url': src, - 'tbr': source.get('avg_bitrate'), - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - 'filesize': source.get('size'), - 'container': source.get('container'), - 'vcodec': source.get('codec'), - 'ext': source.get('container').lower(), - }) + streaming_src = source.get('streaming_src') + stream_name, app_name = source.get('stream_name'), source.get('app_name') + if not src and not streaming_src and (not stream_name or not app_name): + continue + tbr = float_or_none(source.get('avg_bitrate'), 1000) + height = int_or_none(source.get('height')) + f = { + 'tbr': tbr, + 'width': int_or_none(source.get('width')), + 'height': height, + 'filesize': int_or_none(source.get('size')), + 'container': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), + } + def build_format_id(kind): + format_id = kind + if tbr: + format_id += '-%dk' % int(tbr) + if height: + format_id += '-%dp' % height + return format_id + + if src or streaming_src: + f.update({ + 'url': src or streaming_src, + 'format_id': build_format_id('http' if src else 'http-streaming'), + 'preference': 2 if src else 1, + }) + else: + f.update({ + 'url': app_name, + 'play_path': stream_name, + 'format_id': build_format_id('rtmp'), + }) + formats.append(f) self._sort_formats(formats) + description = json_data.get('description') + thumbnail = json_data.get('thumbnail') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = float_or_none(json_data.get('duration'), 1000) + tags = json_data.get('tags', []) + return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, - 'timestamp': timestamp, 'duration': duration, - 'formats': formats, + 'timestamp': timestamp, 'uploader_id': account_id, + 'formats': formats, + 'tags': tags, }