import re
import json
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    find_xpath_attr,
    unified_strdate,
)


class ArteTvIE(InfoExtractor):
    """
    There are two sources of video on arte.tv: videos.arte.tv and
    www.arte.tv/guide; the extraction process is different for each one.
    The videos expire in 7 days, so we can't add tests.
    """
    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
    _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
    _LIVE_URL = r'index-[0-9]+\.html$'
    IE_NAME = u'arte.tv'

    @classmethod
    def suitable(cls, url):
        return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))
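
    # Illustrative URL shapes handled by the three patterns above. Only the
    # AJT guide example is documented elsewhere in this file; the other slugs
    # are made-up placeholders that merely satisfy the regexes:
    #   _EMISSION_URL: http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
    #   _VIDEOS_URL:   http://videos.arte.tv/fr/videos/some-title--1234567.html
    #   _LIVEWEB_URL:  http://liveweb.arte.tv/fr/video/some_event_name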

    # TODO implement Live Stream
    # from ..utils import compat_urllib_parse
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path', u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url', u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        mobj = re.match(self._EMISSION_URL, url)
        if mobj is not None:
            lang = mobj.group('lang')
            # This is not a real id; it can be, for example, AJT for the news:
            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
            video_id = mobj.group('id')
            return self._extract_emission(url, video_id, lang)

        mobj = re.match(self._VIDEOS_URL, url)
        if mobj is not None:
            id = mobj.group('id')
            lang = mobj.group('lang')
            return self._extract_video(url, id, lang)

        mobj = re.match(self._LIVEWEB_URL, url)
        if mobj is not None:
            name = mobj.group('name')
            lang = mobj.group('lang')
            return self._extract_liveweb(url, name, lang)

        # Live index pages are recognised but not supported yet
        if re.search(self._LIVE_URL, url) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

    def _extract_emission(self, url, video_id, lang):
        """Extract from www.arte.tv/guide"""
        webpage = self._download_webpage(url, video_id)
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')

        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        info = json.loads(json_info)
        player_info = info['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            'description': player_info.get('VDE'),
            'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
            'thumbnail': player_info['programImage'],
            'ext': 'flv',
        }

        formats = player_info['VSR'].values()

        def _match_lang(f):
            # Return true if that format is in the language of the url
            if lang == 'fr':
                l = 'F'
            elif lang == 'de':
                l = 'A'
            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
            return any(re.match(r, f['versionCode']) for r in regexes)

        # Some formats may not be in the same language as the url
        formats = filter(_match_lang, formats)
        # Some formats use the m3u8 protocol
        formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats)
        # We order the formats by quality
        formats = sorted(formats, key=lambda f: int(f['height']))
        # Prefer videos without subtitles in the same language
        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None)
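        # Both sorts above are stable, so the list ends up grouped by the
        # subtitle criterion with ascending height inside each group; the last
        # element is therefore the tallest format without same-language
        # subtitles (when one exists), which is what the #980 workaround
        # below ends up picking.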
        # Pick the best quality
        def _format(format_info):
            info = {
                'ext': 'flv',
                'width': format_info.get('width'),
                'height': format_info.get('height'),
            }
            if format_info['mediaType'] == u'rtmp':
                info['url'] = format_info['streamer']
                info['play_path'] = 'mp4:' + format_info['url']
            else:
                info['url'] = format_info['url']
            return info
        info_dict['formats'] = [_format(f) for f in formats]
        # TODO: Remove when #980 has been merged
        info_dict.update(info_dict['formats'][-1])

        return info_dict
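
    # Rough shape of the videoJsonPlayer payload consumed above, inferred only
    # from the fields this extractor reads (sample values are placeholders,
    # not taken from a real response):
    #
    #   {"videoJsonPlayer": {
    #       "VID": "...", "VTI": "<title>", "VDE": "<description>",
    #       "VDA": "<date> <time>", "programImage": "<thumbnail url>",
    #       "VSR": {"<format name>": {"versionCode": "VOF", "videoFormat": "...",
    #                                 "height": 406, "width": 720,
    #                                 "mediaType": "rtmp", "streamer": "...",
    #                                 "url": "..."}, ...}}}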

    def _extract_video(self, url, video_id, lang):
        """Extract from videos.arte.tv"""
        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
        config_xml_url = config_node.attrib['ref']
        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

        video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))

        def _key(m):
            quality = m.group('quality')
            if quality == 'hd':
                return 2
            else:
                return 1
        # We pick the best quality
        video_urls = sorted(video_urls, key=_key)
        video_url = list(video_urls)[-1].group('url')

        title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
        thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
                                            config_xml, 'thumbnail')
        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'url': video_url,
            'ext': 'flv',
        }
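
    # The player config XML scraped above looks roughly like this; the layout
    # is reconstructed from the regexes used, not from a captured response,
    # and the "sd" quality label is only illustrative:
    #
    #   <name>Some title</name>
    #   <firstThumbnailUrl>http://...</firstThumbnailUrl>
    #   <url quality="hd">rtmp://...</url>
    #   <url quality="sd">rtmp://...</url>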

    def _extract_liveweb(self, url, name, lang):
        """Extract from http://liveweb.arte.tv/"""
        webpage = self._download_webpage(url, name)
        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
        config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
                                            video_id, u'Downloading information')
        config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
        event_doc = config_doc.find('event')
        video_doc = event_doc.find('video')
        url_node = video_doc.find('urlHd')
        if url_node is None:
            # Fall back to the SD stream when no HD url is available
            url_node = video_doc.find('urlSd')

        return {
            'id': video_id,
            'title': event_doc.find('name%s' % lang.capitalize()).text,
            'url': url_node.text.replace('MP4', 'mp4'),
            'ext': 'flv',
            'thumbnail': self._og_search_thumbnail(webpage),
        }
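

# A minimal sketch of how this extractor might be exercised through
# youtube-dl's public API, kept as a comment because guide videos expire
# after 7 days, so no automated tests are bundled. The URL is the AJT
# example already referenced in _real_extract:
#
#   import youtube_dl
#   ydl = youtube_dl.YoutubeDL()
#   info = ydl.extract_info('http://www.arte.tv/guide/fr/emissions/AJT/arte-journal',
#                           download=False)
#   print(info['title'], info['url'])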