555 lines
22 KiB

10 years ago
10 years ago
9 years ago
  1. from __future__ import unicode_literals
  2. import xml.etree.ElementTree
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. parse_duration,
  7. int_or_none,
  8. )
  9. from ..compat import compat_HTTPError
  10. import re
  class BBCCoUkIE(InfoExtractor):
      """Extractor for bbc.co.uk programme, iPlayer episode/playlist and music clip pages.

      The page URL yields an 8 character id.  The extractor resolves it to a
      programme id (vpid), either from the web page or from the playlist
      endpoints, and then queries the media selector service for the available
      audio/video connections and captions.
      """

      IE_NAME = 'bbc.co.uk'
      IE_DESC = 'BBC iPlayer'
      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'

      # Media selector endpoint; '%s' is replaced with the programme id (vpid).
      mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'

      _TESTS = [
          {
              'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
              'info_dict': {
                  'id': 'b039d07m',
                  'ext': 'flv',
                  'title': 'Kaleidoscope, Leonard Cohen',
                  'description': 'The Canadian poet and songwriter reflects on his musical career.',
                  'duration': 1740,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              }
          },
          {
              'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
              'info_dict': {
                  'id': 'b00yng1d',
                  'ext': 'flv',
                  'title': 'The Man in Black: Series 3: The Printed Name',
                  'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
                  'duration': 1800,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              },
              'skip': 'Episode is no longer available on BBC iPlayer Radio',
          },
          {
              'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
              'info_dict': {
                  'id': 'b00yng1d',
                  'ext': 'flv',
                  'title': 'The Voice UK: Series 3: Blind Auditions 5',
                  'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
                  'duration': 5100,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              },
              'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
          },
          {
              'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
              'info_dict': {
                  'id': 'b03k3pb7',
                  'ext': 'flv',
                  'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
                  'description': '2. Invasion',
                  'duration': 3600,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              },
              'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
          }, {
              'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
              'info_dict': {
                  'id': 'b04v209v',
                  'ext': 'flv',
                  'title': 'Pete Tong, The Essential New Tune Special',
                  'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
                  'duration': 10800,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              }
          }, {
              'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
              'note': 'Audio',
              'info_dict': {
                  'id': 'p02frcch',
                  'ext': 'flv',
                  'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
                  'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
                  'duration': 3507,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              }
          }, {
              'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
              'note': 'Video',
              'info_dict': {
                  'id': 'p025c103',
                  'ext': 'flv',
                  'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
                  'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
                  'duration': 226,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              }
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
              'info_dict': {
                  'id': 'p02n76xf',
                  'ext': 'flv',
                  'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
                  'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
                  'duration': 3540,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              },
              'skip': 'geolocation',
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
              'info_dict': {
                  'id': 'b05zmgw1',
                  'ext': 'flv',
                  'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
                  'title': 'Royal Academy Summer Exhibition',
                  'duration': 3540,
              },
              'params': {
                  # rtmp download
                  'skip_download': True,
              },
              'skip': 'geolocation',
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
              'only_matching': True,
          }, {
              'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
              'only_matching': True,
          }, {
              'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
              'only_matching': True,
          }
      ]

      def _extract_asx_playlist(self, connection, programme_id):
          """Download the ASX playlist referenced by *connection* and return its ref URLs."""
          asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
          return [ref.get('href') for ref in asx.findall('./Entry/ref')]

      def _extract_connection(self, connection, programme_id):
          """Turn one mediaselector <connection> element into a list of format dicts.

          Only the 'http' and 'rtmp' protocols are handled; any other protocol
          yields an empty list.
          """
          formats = []
          protocol = connection.get('protocol')
          supplier = connection.get('supplier')
          if protocol == 'http':
              href = connection.get('href')
              # ASX playlist
              if supplier == 'asx':
                  for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
                      formats.append({
                          'url': ref,
                          'format_id': 'ref%s_%s' % (i, supplier),
                      })
              # Direct link
              else:
                  formats.append({
                      'url': href,
                      'format_id': supplier,
                  })
          elif protocol == 'rtmp':
              application = connection.get('application', 'ondemand')
              auth_string = connection.get('authString')
              identifier = connection.get('identifier')
              server = connection.get('server')
              formats.append({
                  'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
                  'play_path': identifier,
                  'app': '%s?%s' % (application, auth_string),
                  'page_url': 'http://www.bbc.co.uk',
                  'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
                  'rtmp_live': False,
                  'ext': 'flv',
                  'format_id': supplier,
              })
          return formats

      def _extract_items(self, playlist):
          """Return the <item> elements of a legacy playlist document."""
          return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')

      def _extract_medias(self, media_selection):
          """Return the <media> elements of a media selection document.

          Raises ExtractorError (expected) when the document carries an <error>
          element instead.
          """
          error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
          if error is not None:
              raise ExtractorError(
                  '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
          return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')

      def _extract_connections(self, media):
          """Return the <connection> children of a <media> element."""
          return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')

      def _extract_video(self, media, programme_id):
          """Build format dicts for every connection of a video <media> element."""
          formats = []
          # These attributes are optional; int_or_none avoids a TypeError from
          # int(None) when one of them is absent.
          vbr = int_or_none(media.get('bitrate'))
          vcodec = media.get('encoding')
          service = media.get('service')
          width = int_or_none(media.get('width'))
          height = int_or_none(media.get('height'))
          file_size = int_or_none(media.get('media_file_size'))
          for connection in self._extract_connections(media):
              conn_formats = self._extract_connection(connection, programme_id)
              for fmt in conn_formats:
                  fmt.update({
                      'format_id': '%s_%s' % (service, fmt['format_id']),
                      'width': width,
                      'height': height,
                      'vbr': vbr,
                      'vcodec': vcodec,
                      'filesize': file_size,
                  })
              formats.extend(conn_formats)
          return formats

      def _extract_audio(self, media, programme_id):
          """Build format dicts for every connection of an audio <media> element."""
          formats = []
          # bitrate is optional; see _extract_video.
          abr = int_or_none(media.get('bitrate'))
          acodec = media.get('encoding')
          service = media.get('service')
          for connection in self._extract_connections(media):
              conn_formats = self._extract_connection(connection, programme_id)
              for fmt in conn_formats:
                  fmt.update({
                      'format_id': '%s_%s' % (service, fmt['format_id']),
                      'abr': abr,
                      'acodec': acodec,
                  })
              formats.extend(conn_formats)
          return formats

      def _get_subtitles(self, media, programme_id):
          """Return a dict mapping language code to a list with one TTML subtitle entry.

          Each captions document is downloaded only to read its xml:lang
          attribute ('en' when missing); the entry itself points at the URL.
          """
          subtitles = {}
          for connection in self._extract_connections(media):
              captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
              lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
              subtitles[lang] = [
                  {
                      'url': connection.get('href'),
                      'ext': 'ttml',
                  },
              ]
          return subtitles

      def _download_media_selector(self, programme_id):
          """Query the media selector for *programme_id*.

          Returns a (formats, subtitles) tuple; subtitles is None when the
          response contains no captions media.
          """
          try:
              media_selection = self._download_xml(
                  self.mediaselector_url % programme_id,
                  programme_id, 'Downloading media selection XML')
          except ExtractorError as ee:
              if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
                  # A 403 response body is itself parsed as a media selection
                  # document, so _extract_medias can report its <error> element.
                  media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
              else:
                  raise

          formats = []
          subtitles = None

          for media in self._extract_medias(media_selection):
              kind = media.get('kind')
              if kind == 'audio':
                  formats.extend(self._extract_audio(media, programme_id))
              elif kind == 'video':
                  formats.extend(self._extract_video(media, programme_id))
              elif kind == 'captions':
                  subtitles = self.extract_subtitles(media, programme_id)

          return formats, subtitles

      def _download_playlist(self, playlist_id):
          """Resolve *playlist_id* to its first playable programme.

          The JSON playlist is tried first; on HTTP 404, or when it yields no
          programme item, the legacy XML playlist is used instead.

          Returns a (programme_id, title, description, duration, formats,
          subtitles) tuple.  Raises ExtractorError when the episode is
          unavailable or no programme item can be found.
          """
          try:
              playlist = self._download_json(
                  'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
                  playlist_id, 'Downloading playlist JSON')

              version = playlist.get('defaultAvailableVersion')
              if version:
                  smp_config = version['smpConfig']
                  title = smp_config['title']
                  description = smp_config['summary']
                  for item in smp_config['items']:
                      kind = item['kind']
                      if kind != 'programme' and kind != 'radioProgramme':
                          continue
                      programme_id = item.get('vpid')
                      # duration may be missing from the item
                      duration = int_or_none(item.get('duration'))
                      formats, subtitles = self._download_media_selector(programme_id)
                  return programme_id, title, description, duration, formats, subtitles
          except ExtractorError as ee:
              if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
                  raise

          # fallback to legacy playlist
          playlist = self._download_xml(
              'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
              playlist_id, 'Downloading legacy playlist XML')

          no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
          if no_items is not None:
              reason = no_items.get('reason')
              if reason == 'preAvailability':
                  msg = 'Episode %s is not yet available' % playlist_id
              elif reason == 'postAvailability':
                  msg = 'Episode %s is no longer available' % playlist_id
              elif reason == 'noMedia':
                  msg = 'Episode %s is not currently available' % playlist_id
              else:
                  msg = 'Episode %s is not available: %s' % (playlist_id, reason)
              raise ExtractorError(msg, expected=True)

          for item in self._extract_items(playlist):
              kind = item.get('kind')
              if kind != 'programme' and kind != 'radioProgramme':
                  continue
              title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
              description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
              programme_id = item.get('identifier')
              duration = int_or_none(item.get('duration'))
              formats, subtitles = self._download_media_selector(programme_id)
              return programme_id, title, description, duration, formats, subtitles

          # Without this the method would return None and the caller would fail
          # on tuple unpacking with an unhelpful TypeError.
          raise ExtractorError('No programme found in playlist %s' % playlist_id)

      def _real_extract(self, url):
          """Extract a single programme/clip info dict from *url*."""
          group_id = self._match_id(url)

          webpage = self._download_webpage(url, group_id, 'Downloading video page')

          programme_id = None
          # Must be initialised: when the vpid is only found by the regex below,
          # no other code path assigns duration before it is used.
          duration = None

          tviplayer = self._search_regex(
              r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
              webpage, 'player', default=None)

          if tviplayer:
              player = self._parse_json(tviplayer, group_id).get('player', {})
              duration = int_or_none(player.get('duration'))
              programme_id = player.get('vpid')

          if not programme_id:
              programme_id = self._search_regex(
                  r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)

          if programme_id:
              formats, subtitles = self._download_media_selector(programme_id)
              title = self._og_search_title(webpage)
              description = self._search_regex(
                  r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
                  webpage, 'description', fatal=False)
          else:
              programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)

          self._sort_formats(formats)

          return {
              'id': programme_id,
              'title': title,
              'description': description,
              'thumbnail': self._og_search_thumbnail(webpage, default=None),
              'duration': duration,
              'formats': formats,
              'subtitles': subtitles,
          }
  class BBCNewsIE(BBCCoUkIE):
      """Extractor for bbc.com article pages, returned as a playlist of embedded media.

      Media entries are discovered from JSON fragments embedded in the page.
      Formats come from the media selector service, from an inline
      'sourceFiles' mapping, or from a playlist.sxml fallback.
      """

      IE_NAME = 'bbc.com'
      IE_DESC = 'BBC news'
      _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P<id>[^/]+)$'

      # Media selector endpoint; '%s' is replaced with the programme id.
      mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'

      _TESTS = [{
          'url': 'http://www.bbc.com/news/world-europe-32668511',
          'info_dict': {
              'id': 'world-europe-32668511',
              'title': 'Russia stages massive WW2 parade despite Western boycott',
          },
          'playlist_count': 2,
      }, {
          'url': 'http://www.bbc.com/news/business-28299555',
          'info_dict': {
              'id': 'business-28299555',
              'title': 'Farnborough Airshow: Video highlights',
          },
          'playlist_count': 9,
      }, {
          'url': 'http://www.bbc.com/news/world-europe-32041533',
          'note': 'Video',
          'info_dict': {
              'id': 'p02mprgb',
              'ext': 'mp4',
              'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
              'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
              'duration': 47,
              'upload_date': '20150324',
              'uploader': 'BBC News',
          },
          'params': {
              'skip_download': True,
          }
      }, {
          'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
          'note': 'Video',
          'info_dict': {
              'id': 'NA',
              'ext': 'mp4',
              'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
              'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde',
              'duration': 47,
              'upload_date': '20150615',
              'uploader': 'BBC News',
          },
          'params': {
              'skip_download': True,
          }
      }, {
          'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
          'note': 'Video',
          'info_dict': {
              'id': '39275083',
              'ext': 'mp4',
              'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
              'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n',
              'duration': 87,
              'upload_date': '20150619',
              'uploader': 'BBC News',
          },
          'params': {
              'skip_download': True,
          }
      }]

      def _extract_media_entries(self, webpage, list_id):
          """Return the list of JSON media entries embedded in *webpage*.

          Three strategies are tried in order; each later one runs only when
          the previous ones found nothing.  Real lists are built (rather than
          map objects) so that emptiness checks and append() also work on
          Python 3.
          """
          # works with bbc.com/news/something-something-123456 articles
          entries = [
              self._parse_json(meta, list_id)
              for meta in re.findall(r"data-media-meta='({[^']+})'", webpage)]

          if not entries:
              # http://www.bbc.com/news/video_and_audio/international
              # and single-video articles
              masset = self._html_search_regex(
                  r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None)
              if masset:
                  jmasset = self._parse_json(masset, list_id)
                  for val in jmasset.get('videos', {}).values():
                      for skey, sval in val.items():
                          sval['id'] = skey
                          entries.append(sval)

          if not entries:
              # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
              # in http://www.bbc.com/news/video_and_audio/international
              # prone to breaking if entries have sourceFiles list
              entries = [
                  self._parse_json(blob, list_id)
                  for blob in re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)]

          return entries

      def _real_extract(self, url):
          """Extract every media entry of the article at *url* as a playlist.

          Raises ExtractorError when the page has no media entries or when an
          entry yields no formats.
          """
          list_id = self._match_id(url)
          webpage = self._download_webpage(url, list_id)

          list_title = self._html_search_regex(r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'list title')

          pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
          if pubdate:
              pubdate = pubdate.replace('-', '')

          jsent = self._extract_media_entries(webpage, list_id)
          if not jsent:
              raise ExtractorError('No video found', expected=True)

          ret = []
          for jent in jsent:
              programme_id = jent.get('externalId')
              xml_url = jent.get('href')

              title = jent.get('caption', list_title)

              duration = parse_duration(jent.get('duration'))
              description = list_title
              if jent.get('caption'):
                  description += ' - ' + jent.get('caption')
              thumbnail = None
              if 'image' in jent:
                  thumbnail = jent['image'].get('href')

              formats = []
              # Subtitles are a dict keyed by language, matching _get_subtitles.
              subtitles = {}

              if programme_id:
                  formats, subtitles = self._download_media_selector(programme_id)
              elif 'sourceFiles' in jent:
                  # mediaselector not used at
                  # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu
                  for key, val in jent['sourceFiles'].items():
                      formats.append({
                          'ext': val.get('encoding'),
                          'url': val.get('url'),
                          # filesize may be absent
                          'filesize': int_or_none(val.get('filesize')),
                          'format_id': key
                      })
              elif xml_url:
                  # Cheap fallback
                  # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
                  # programme_id is empty in this branch, so list_id is used as
                  # the id shown in the download message.
                  playlist_sxml = self._download_webpage(
                      xml_url, list_id, 'Downloading playlist.sxml for externalId (fallback)')
                  programme_id = self._search_regex(
                      r'<mediator [^>]*identifier="(.+?)"', playlist_sxml, 'playlist.sxml (externalId fallback)')
                  formats, subtitles = self._download_media_selector(programme_id)

              if not formats:
                  raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n')

              self._sort_formats(formats)

              # Prefer the programme id, then the entry's own id, then a placeholder.
              media_id = jent.get('id') if programme_id is None else programme_id
              if media_id is None:
                  media_id = 'NA'

              ret.append({
                  'id': media_id,
                  'uploader': 'BBC News',
                  'upload_date': pubdate,
                  'title': title,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'formats': formats,
                  'subtitles': subtitles,
              })

          if ret:
              return self.playlist_result(ret, list_id, list_title)
          raise ExtractorError('No video found', expected=True)