You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

233 lines
9.0 KiB

  1. import re
  2. import json
  3. import itertools
  4. from .common import InfoExtractor
  5. from .subtitles import SubtitlesInfoExtractor
  6. from ..utils import (
  7. compat_urllib_request,
  8. compat_str,
  9. get_element_by_attribute,
  10. get_element_by_id,
  11. orderedSet,
  12. str_to_int,
  13. ExtractorError,
  14. )
  15. class DailymotionBaseInfoExtractor(InfoExtractor):
  16. @staticmethod
  17. def _build_request(url):
  18. """Build a request with the family filter disabled"""
  19. request = compat_urllib_request.Request(url)
  20. request.add_header('Cookie', 'family_filter=off')
  21. request.add_header('Cookie', 'ff=off')
  22. return request
  23. class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
  24. """Information Extractor for Dailymotion"""
  25. _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
  26. IE_NAME = u'dailymotion'
  27. _FORMATS = [
  28. (u'stream_h264_ld_url', u'ld'),
  29. (u'stream_h264_url', u'standard'),
  30. (u'stream_h264_hq_url', u'hq'),
  31. (u'stream_h264_hd_url', u'hd'),
  32. (u'stream_h264_hd1080_url', u'hd180'),
  33. ]
  34. _TESTS = [
  35. {
  36. u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
  37. u'file': u'x33vw9.mp4',
  38. u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
  39. u'info_dict': {
  40. u"uploader": u"Amphora Alex and Van .",
  41. u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
  42. }
  43. },
  44. # Vevo video
  45. {
  46. u'url': u'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
  47. u'file': u'USUV71301934.mp4',
  48. u'info_dict': {
  49. u'title': u'Roar (Official)',
  50. u'uploader': u'Katy Perry',
  51. u'upload_date': u'20130905',
  52. },
  53. u'params': {
  54. u'skip_download': True,
  55. },
  56. u'skip': u'VEVO is only available in some countries',
  57. },
  58. # age-restricted video
  59. {
  60. u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
  61. u'file': u'xyh2zz.mp4',
  62. u'md5': u'0d667a7b9cebecc3c89ee93099c4159d',
  63. u'info_dict': {
  64. u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
  65. u'uploader': 'HotWaves1012',
  66. u'age_limit': 18,
  67. }
  68. }
  69. ]
  70. def _real_extract(self, url):
  71. # Extract id and simplified title from URL
  72. mobj = re.match(self._VALID_URL, url)
  73. video_id = mobj.group(1).split('_')[0].split('?')[0]
  74. url = 'http://www.dailymotion.com/video/%s' % video_id
  75. # Retrieve video webpage to extract further information
  76. request = self._build_request(url)
  77. webpage = self._download_webpage(request, video_id)
  78. # Extract URL, uploader and title from webpage
  79. self.report_extraction(video_id)
  80. # It may just embed a vevo video:
  81. m_vevo = re.search(
  82. r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?videoId=(?P<id>[\w]*)',
  83. webpage)
  84. if m_vevo is not None:
  85. vevo_id = m_vevo.group('id')
  86. self.to_screen(u'Vevo video detected: %s' % vevo_id)
  87. return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo')
  88. video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
  89. # Looking for official user
  90. r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
  91. webpage, 'video uploader', fatal=False)
  92. age_limit = self._rta_search(webpage)
  93. video_upload_date = None
  94. mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
  95. if mobj is not None:
  96. video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
  97. embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
  98. embed_page = self._download_webpage(embed_url, video_id,
  99. u'Downloading embed page')
  100. info = self._search_regex(r'var info = ({.*?}),$', embed_page,
  101. 'video info', flags=re.MULTILINE)
  102. info = json.loads(info)
  103. if info.get('error') is not None:
  104. msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
  105. raise ExtractorError(msg, expected=True)
  106. formats = []
  107. for (key, format_id) in self._FORMATS:
  108. video_url = info.get(key)
  109. if video_url is not None:
  110. m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
  111. if m_size is not None:
  112. width, height = m_size.group(1), m_size.group(2)
  113. else:
  114. width, height = None, None
  115. formats.append({
  116. 'url': video_url,
  117. 'ext': 'mp4',
  118. 'format_id': format_id,
  119. 'width': width,
  120. 'height': height,
  121. })
  122. if not formats:
  123. raise ExtractorError(u'Unable to extract video URL')
  124. # subtitles
  125. video_subtitles = self.extract_subtitles(video_id, webpage)
  126. if self._downloader.params.get('listsubtitles', False):
  127. self._list_available_subtitles(video_id, webpage)
  128. return
  129. view_count = str_to_int(self._search_regex(
  130. r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count'))
  131. return {
  132. 'id': video_id,
  133. 'formats': formats,
  134. 'uploader': video_uploader,
  135. 'upload_date': video_upload_date,
  136. 'title': self._og_search_title(webpage),
  137. 'subtitles': video_subtitles,
  138. 'thumbnail': info['thumbnail_url'],
  139. 'age_limit': age_limit,
  140. 'view_count': view_count,
  141. }
  142. def _get_available_subtitles(self, video_id, webpage):
  143. try:
  144. sub_list = self._download_webpage(
  145. 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
  146. video_id, note=False)
  147. except ExtractorError as err:
  148. self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
  149. return {}
  150. info = json.loads(sub_list)
  151. if (info['total'] > 0):
  152. sub_lang_list = dict((l['language'], l['url']) for l in info['list'])
  153. return sub_lang_list
  154. self._downloader.report_warning(u'video doesn\'t have subtitles')
  155. return {}
  156. class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
  157. IE_NAME = u'dailymotion:playlist'
  158. _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
  159. _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
  160. _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
  161. def _extract_entries(self, id):
  162. video_ids = []
  163. for pagenum in itertools.count(1):
  164. request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum))
  165. webpage = self._download_webpage(request,
  166. id, u'Downloading page %s' % pagenum)
  167. playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
  168. video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
  169. if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
  170. break
  171. return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
  172. for video_id in orderedSet(video_ids)]
  173. def _real_extract(self, url):
  174. mobj = re.match(self._VALID_URL, url)
  175. playlist_id = mobj.group('id')
  176. webpage = self._download_webpage(url, playlist_id)
  177. return {'_type': 'playlist',
  178. 'id': playlist_id,
  179. 'title': get_element_by_id(u'playlist_name', webpage),
  180. 'entries': self._extract_entries(playlist_id),
  181. }
  182. class DailymotionUserIE(DailymotionPlaylistIE):
  183. IE_NAME = u'dailymotion:user'
  184. _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
  185. _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>'
  186. _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
  187. def _real_extract(self, url):
  188. mobj = re.match(self._VALID_URL, url)
  189. user = mobj.group('user')
  190. webpage = self._download_webpage(url, user)
  191. full_user = self._html_search_regex(
  192. r'<a class="label" href="/%s".*?>(.*?)</' % re.escape(user),
  193. webpage, u'user', flags=re.DOTALL)
  194. return {
  195. '_type': 'playlist',
  196. 'id': user,
  197. 'title': full_user,
  198. 'entries': self._extract_entries(user),
  199. }