You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

127 lines
4.8 KiB

11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
  1. import re
  2. import json
  3. import xml.etree.ElementTree
  4. import datetime
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. compat_HTTPError,
  8. ExtractorError,
  9. )
  10. class VevoIE(InfoExtractor):
  11. """
  12. Accepts urls from vevo.com or in the format 'vevo:{id}'
  13. (currently used by MTVIE)
  14. """
  15. _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'
  16. _TESTS = [{
  17. u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
  18. u'file': u'GB1101300280.mp4',
  19. u"md5": u"06bea460acb744eab74a9d7dcb4bfd61",
  20. u'info_dict': {
  21. u"upload_date": u"20130624",
  22. u"uploader": u"Hurts",
  23. u"title": u"Somebody to Die For",
  24. u"duration": 230,
  25. u"width": 1920,
  26. u"height": 1080,
  27. }
  28. }]
  29. _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
  30. def _formats_from_json(self, video_info):
  31. last_version = {'version': -1}
  32. for version in video_info['videoVersions']:
  33. # These are the HTTP downloads, other types are for different manifests
  34. if version['sourceType'] == 2:
  35. if version['version'] > last_version['version']:
  36. last_version = version
  37. if last_version['version'] == -1:
  38. raise ExtractorError(u'Unable to extract last version of the video')
  39. renditions = xml.etree.ElementTree.fromstring(last_version['data'])
  40. formats = []
  41. # Already sorted from worst to best quality
  42. for rend in renditions.findall('rendition'):
  43. attr = rend.attrib
  44. format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr
  45. formats.append({
  46. 'url': attr['url'],
  47. 'format_id': attr['name'],
  48. 'format_note': format_note,
  49. 'height': int(attr['frameheight']),
  50. 'width': int(attr['frameWidth']),
  51. })
  52. return formats
  53. def _formats_from_smil(self, smil_xml):
  54. formats = []
  55. smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
  56. els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
  57. for el in els:
  58. src = el.attrib['src']
  59. m = re.match(r'''(?xi)
  60. (?P<ext>[a-z0-9]+):
  61. (?P<path>
  62. [/a-z0-9]+ # The directory and main part of the URL
  63. _(?P<cbr>[0-9]+)k
  64. _(?P<width>[0-9]+)x(?P<height>[0-9]+)
  65. _(?P<vcodec>[a-z0-9]+)
  66. _(?P<vbr>[0-9]+)
  67. _(?P<acodec>[a-z0-9]+)
  68. _(?P<abr>[0-9]+)
  69. \.[a-z0-9]+ # File extension
  70. )''', src)
  71. if not m:
  72. continue
  73. format_url = self._SMIL_BASE_URL + m.group('path')
  74. format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' %
  75. m.groupdict())
  76. formats.append({
  77. 'url': format_url,
  78. 'format_id': u'SMIL_' + m.group('cbr'),
  79. 'format_note': format_note,
  80. 'ext': m.group('ext'),
  81. 'width': int(m.group('width')),
  82. 'height': int(m.group('height')),
  83. })
  84. return formats
  85. def _real_extract(self, url):
  86. mobj = re.match(self._VALID_URL, url)
  87. video_id = mobj.group('id')
  88. json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
  89. info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
  90. video_info = json.loads(info_json)['video']
  91. formats = self._formats_from_json(video_info)
  92. try:
  93. smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
  94. self._SMIL_BASE_URL, video_id, video_id.lower())
  95. smil_xml = self._download_webpage(smil_url, video_id,
  96. u'Downloading SMIL info')
  97. formats.extend(self._formats_from_smil(smil_xml))
  98. except ExtractorError as ee:
  99. if not isinstance(ee.cause, compat_HTTPError):
  100. raise
  101. self._downloader.report_warning(
  102. u'Cannot download SMIL information, falling back to JSON ..')
  103. timestamp_ms = int(self._search_regex(
  104. r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))
  105. upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
  106. info = {
  107. 'id': video_id,
  108. 'title': video_info['title'],
  109. 'formats': formats,
  110. 'thumbnail': video_info['imageUrl'],
  111. 'upload_date': upload_date.strftime('%Y%m%d'),
  112. 'uploader': video_info['mainArtists'][0]['artistName'],
  113. 'duration': video_info['duration'],
  114. }
  115. return info