You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

290 lines
12 KiB

  1. # encoding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import json
  5. import base64
  6. import zlib
  7. import xml.etree.ElementTree
  8. from hashlib import sha1
  9. from math import pow, sqrt, floor
  10. from .subtitles import SubtitlesInfoExtractor
  11. from ..utils import (
  12. ExtractorError,
  13. compat_urllib_parse,
  14. compat_urllib_request,
  15. bytes_to_intlist,
  16. intlist_to_bytes,
  17. unified_strdate,
  18. clean_html,
  19. urlencode_postdata,
  20. )
  21. from ..aes import (
  22. aes_cbc_decrypt,
  23. inc,
  24. )
  25. class CrunchyrollIE(SubtitlesInfoExtractor):
  26. _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
  27. _TEST = {
  28. 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
  29. #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
  30. 'info_dict': {
  31. 'id': '645513',
  32. 'ext': 'flv',
  33. 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
  34. 'description': 'md5:2d17137920c64f2f49981a7797d275ef',
  35. 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
  36. 'uploader': 'Yomiuri Telecasting Corporation (YTV)',
  37. 'upload_date': '20131013',
  38. 'url': 're:(?!.*&amp)',
  39. },
  40. 'params': {
  41. # rtmp
  42. 'skip_download': True,
  43. },
  44. }
  45. _FORMAT_IDS = {
  46. '360': ('60', '106'),
  47. '480': ('61', '106'),
  48. '720': ('62', '106'),
  49. '1080': ('80', '108'),
  50. }
  51. def _login(self):
  52. (username, password) = self._get_login_info()
  53. if username is None:
  54. return
  55. self.report_login()
  56. login_url = 'https://www.crunchyroll.com/?a=formhandler'
  57. data = urlencode_postdata({
  58. 'formname': 'RpcApiUser_Login',
  59. 'name': username,
  60. 'password': password,
  61. })
  62. login_request = compat_urllib_request.Request(login_url, data)
  63. login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
  64. self._download_webpage(login_request, None, False, 'Wrong login info')
  65. def _real_initialize(self):
  66. self._login()
  67. def _decrypt_subtitles(self, data, iv, id):
  68. data = bytes_to_intlist(data)
  69. iv = bytes_to_intlist(iv)
  70. id = int(id)
  71. def obfuscate_key_aux(count, modulo, start):
  72. output = list(start)
  73. for _ in range(count):
  74. output.append(output[-1] + output[-2])
  75. # cut off start values
  76. output = output[2:]
  77. output = list(map(lambda x: x % modulo + 33, output))
  78. return output
  79. def obfuscate_key(key):
  80. num1 = int(floor(pow(2, 25) * sqrt(6.9)))
  81. num2 = (num1 ^ key) << 5
  82. num3 = key ^ num1
  83. num4 = num3 ^ (num3 >> 3) ^ num2
  84. prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
  85. shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
  86. # Extend 160 Bit hash to 256 Bit
  87. return shaHash + [0] * 12
  88. key = obfuscate_key(id)
  89. class Counter:
  90. __value = iv
  91. def next_value(self):
  92. temp = self.__value
  93. self.__value = inc(self.__value)
  94. return temp
  95. decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
  96. return zlib.decompress(decrypted_data)
  97. def _convert_subtitles_to_srt(self, subtitles):
  98. output = ''
  99. for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
  100. start = start.replace('.', ',')
  101. end = end.replace('.', ',')
  102. text = clean_html(text)
  103. text = text.replace('\\N', '\n')
  104. if not text:
  105. continue
  106. output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
  107. return output
  108. def _convert_subtitles_to_ass(self, subtitles):
  109. output = ''
  110. def ass_bool(strvalue):
  111. assvalue = '0'
  112. if strvalue == '1':
  113. assvalue = '-1'
  114. return assvalue
  115. sub_root = xml.etree.ElementTree.fromstring(subtitles)
  116. if not sub_root:
  117. return output
  118. output = '[Script Info]\n'
  119. output += 'Title: %s\n' % sub_root.attrib["title"]
  120. output += 'ScriptType: v4.00+\n'
  121. output += 'WrapStyle: %s\n' % sub_root.attrib["wrap_style"]
  122. output += 'PlayResX: %s\n' % sub_root.attrib["play_res_x"]
  123. output += 'PlayResY: %s\n' % sub_root.attrib["play_res_y"]
  124. output += """ScaledBorderAndShadow: yes
  125. [V4+ Styles]
  126. Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
  127. """
  128. for style in sub_root.findall('./styles/style'):
  129. output += 'Style: ' + style.attrib["name"]
  130. output += ',' + style.attrib["font_name"]
  131. output += ',' + style.attrib["font_size"]
  132. output += ',' + style.attrib["primary_colour"]
  133. output += ',' + style.attrib["secondary_colour"]
  134. output += ',' + style.attrib["outline_colour"]
  135. output += ',' + style.attrib["back_colour"]
  136. output += ',' + ass_bool(style.attrib["bold"])
  137. output += ',' + ass_bool(style.attrib["italic"])
  138. output += ',' + ass_bool(style.attrib["underline"])
  139. output += ',' + ass_bool(style.attrib["strikeout"])
  140. output += ',' + style.attrib["scale_x"]
  141. output += ',' + style.attrib["scale_y"]
  142. output += ',' + style.attrib["spacing"]
  143. output += ',' + style.attrib["angle"]
  144. output += ',' + style.attrib["border_style"]
  145. output += ',' + style.attrib["outline"]
  146. output += ',' + style.attrib["shadow"]
  147. output += ',' + style.attrib["alignment"]
  148. output += ',' + style.attrib["margin_l"]
  149. output += ',' + style.attrib["margin_r"]
  150. output += ',' + style.attrib["margin_v"]
  151. output += ',' + style.attrib["encoding"]
  152. output += '\n'
  153. output += """
  154. [Events]
  155. Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
  156. """
  157. for event in sub_root.findall('./events/event'):
  158. output += 'Dialogue: 0'
  159. output += ',' + event.attrib["start"]
  160. output += ',' + event.attrib["end"]
  161. output += ',' + event.attrib["style"]
  162. output += ',' + event.attrib["name"]
  163. output += ',' + event.attrib["margin_l"]
  164. output += ',' + event.attrib["margin_r"]
  165. output += ',' + event.attrib["margin_v"]
  166. output += ',' + event.attrib["effect"]
  167. output += ',' + event.attrib["text"]
  168. output += '\n'
  169. return output
  170. def _real_extract(self,url):
  171. mobj = re.match(self._VALID_URL, url)
  172. video_id = mobj.group('video_id')
  173. if mobj.group('prefix') == 'm':
  174. mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
  175. webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url')
  176. else:
  177. webpage_url = 'http://www.' + mobj.group('url')
  178. webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
  179. note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='')
  180. if note_m:
  181. raise ExtractorError(note_m)
  182. mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
  183. if mobj:
  184. msg = json.loads(mobj.group('msg'))
  185. if msg.get('type') == 'error':
  186. raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
  187. video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
  188. video_title = re.sub(r' {2,}', ' ', video_title)
  189. video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
  190. if not video_description:
  191. video_description = None
  192. video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)
  193. if video_upload_date:
  194. video_upload_date = unified_strdate(video_upload_date)
  195. video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
  196. playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
  197. playerdata_req = compat_urllib_request.Request(playerdata_url)
  198. playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
  199. playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  200. playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
  201. stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
  202. video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
  203. formats = []
  204. for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage):
  205. stream_quality, stream_format = self._FORMAT_IDS[fmt]
  206. video_format = fmt+'p'
  207. streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
  208. # urlencode doesn't work!
  209. streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format
  210. streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  211. streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
  212. streamdata = self._download_xml(
  213. streamdata_req, video_id,
  214. note='Downloading media info for %s' % video_format)
  215. video_url = streamdata.find('.//host').text
  216. video_play_path = streamdata.find('.//file').text
  217. formats.append({
  218. 'url': video_url,
  219. 'play_path': video_play_path,
  220. 'ext': 'flv',
  221. 'format': video_format,
  222. 'format_id': video_format,
  223. })
  224. subtitles = {}
  225. sub_format = self._downloader.params.get('subtitlesformat', 'srt')
  226. for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
  227. sub_page = self._download_webpage('http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\
  228. video_id, note='Downloading subtitles for '+sub_name)
  229. id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
  230. iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
  231. data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
  232. if not id or not iv or not data:
  233. continue
  234. id = int(id)
  235. iv = base64.b64decode(iv)
  236. data = base64.b64decode(data)
  237. subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
  238. lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
  239. if not lang_code:
  240. continue
  241. if sub_format == 'ass':
  242. subtitles[lang_code] = self._convert_subtitles_to_ass(subtitle)
  243. else:
  244. subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
  245. if self._downloader.params.get('listsubtitles', False):
  246. self._list_available_subtitles(video_id, subtitles)
  247. return
  248. return {
  249. 'id': video_id,
  250. 'title': video_title,
  251. 'description': video_description,
  252. 'thumbnail': video_thumbnail,
  253. 'uploader': video_uploader,
  254. 'upload_date': video_upload_date,
  255. 'subtitles': subtitles,
  256. 'formats': formats,
  257. }