You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

224 lines
8.2 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .subtitles import SubtitlesInfoExtractor
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. compat_urllib_parse,
  8. compat_urllib_request,
  9. ExtractorError,
  10. int_or_none,
  11. compat_str,
  12. )
  13. class LyndaIE(SubtitlesInfoExtractor):
  14. IE_NAME = 'lynda'
  15. IE_DESC = 'lynda.com videos'
  16. _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html'
  17. _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
  18. _NETRC_MACHINE = 'lynda'
  19. _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
  20. _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
  21. ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
  22. _TEST = {
  23. 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
  24. 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
  25. 'info_dict': {
  26. 'id': '114408',
  27. 'ext': 'mp4',
  28. 'title': 'Using the exercise files',
  29. 'duration': 68
  30. }
  31. }
  32. def _real_initialize(self):
  33. self._login()
  34. def _real_extract(self, url):
  35. mobj = re.match(self._VALID_URL, url)
  36. video_id = mobj.group(1)
  37. page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
  38. 'Downloading video JSON')
  39. video_json = json.loads(page)
  40. if 'Status' in video_json:
  41. raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True)
  42. if video_json['HasAccess'] is False:
  43. raise ExtractorError(
  44. 'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
  45. video_id = compat_str(video_json['ID'])
  46. duration = video_json['DurationInSeconds']
  47. title = video_json['Title']
  48. formats = []
  49. fmts = video_json.get('Formats')
  50. if fmts:
  51. formats.extend([
  52. {
  53. 'url': fmt['Url'],
  54. 'ext': fmt['Extension'],
  55. 'width': fmt['Width'],
  56. 'height': fmt['Height'],
  57. 'filesize': fmt['FileSize'],
  58. 'format_id': str(fmt['Resolution'])
  59. } for fmt in fmts])
  60. prioritized_streams = video_json.get('PrioritizedStreams')
  61. if prioritized_streams:
  62. formats.extend([
  63. {
  64. 'url': video_url,
  65. 'width': int_or_none(format_id),
  66. 'format_id': format_id,
  67. } for format_id, video_url in prioritized_streams['0'].items()
  68. ])
  69. self._sort_formats(formats)
  70. if self._downloader.params.get('listsubtitles', False):
  71. self._list_available_subtitles(video_id, page)
  72. return
  73. subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page))
  74. return {
  75. 'id': video_id,
  76. 'title': title,
  77. 'duration': duration,
  78. 'subtitles': subtitles,
  79. 'formats': formats
  80. }
  81. def _login(self):
  82. (username, password) = self._get_login_info()
  83. if username is None:
  84. return
  85. login_form = {
  86. 'username': username,
  87. 'password': password,
  88. 'remember': 'false',
  89. 'stayPut': 'false'
  90. }
  91. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
  92. login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
  93. # Not (yet) logged in
  94. m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
  95. if m is not None:
  96. response = m.group('json')
  97. response_json = json.loads(response)
  98. state = response_json['state']
  99. if state == 'notlogged':
  100. raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
  101. # This is when we get popup:
  102. # > You're already logged in to lynda.com on two devices.
  103. # > If you log in here, we'll log you out of another device.
  104. # So, we need to confirm this.
  105. if state == 'conflicted':
  106. confirm_form = {
  107. 'username': '',
  108. 'password': '',
  109. 'resolve': 'true',
  110. 'remember': 'false',
  111. 'stayPut': 'false',
  112. }
  113. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
  114. login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device')
  115. if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
  116. raise ExtractorError('Unable to log in')
  117. def _fix_subtitles(self, subtitles):
  118. if subtitles is None:
  119. return subtitles # subtitles not requested
  120. fixed_subtitles = {}
  121. for k, v in subtitles.items():
  122. subs = json.loads(v)
  123. if len(subs) == 0:
  124. continue
  125. srt = ''
  126. for pos in range(0, len(subs) - 1):
  127. seq_current = subs[pos]
  128. m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
  129. if m_current is None:
  130. continue
  131. seq_next = subs[pos + 1]
  132. m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
  133. if m_next is None:
  134. continue
  135. appear_time = m_current.group('timecode')
  136. disappear_time = m_next.group('timecode')
  137. text = seq_current['Caption']
  138. srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text)
  139. if srt:
  140. fixed_subtitles[k] = srt
  141. return fixed_subtitles
  142. def _get_available_subtitles(self, video_id, webpage):
  143. url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
  144. sub = self._download_webpage(url, None, False)
  145. sub_json = json.loads(sub)
  146. return {'en': url} if len(sub_json) > 0 else {}
  147. class LyndaCourseIE(InfoExtractor):
  148. IE_NAME = 'lynda:course'
  149. IE_DESC = 'lynda.com online courses'
  150. # Course link equals to welcome/introduction video link of same course
  151. # We will recognize it as course link
  152. _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html'
  153. def _real_extract(self, url):
  154. mobj = re.match(self._VALID_URL, url)
  155. course_path = mobj.group('coursepath')
  156. course_id = mobj.group('courseid')
  157. page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
  158. course_id, 'Downloading course JSON')
  159. course_json = json.loads(page)
  160. if 'Status' in course_json and course_json['Status'] == 'NotFound':
  161. raise ExtractorError('Course %s does not exist' % course_id, expected=True)
  162. unaccessible_videos = 0
  163. videos = []
  164. (username, _) = self._get_login_info()
  165. # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
  166. # by single video API anymore
  167. for chapter in course_json['Chapters']:
  168. for video in chapter['Videos']:
  169. if username is None and video['HasAccess'] is False:
  170. unaccessible_videos += 1
  171. continue
  172. videos.append(video['ID'])
  173. if unaccessible_videos > 0:
  174. self._downloader.report_warning('%s videos are only available for members and will not be downloaded. '
  175. % unaccessible_videos + LyndaIE.ACCOUNT_CREDENTIALS_HINT)
  176. entries = [
  177. self.url_result('http://www.lynda.com/%s/%s-4.html' %
  178. (course_path, video_id),
  179. 'Lynda')
  180. for video_id in videos]
  181. course_title = course_json['Title']
  182. return self.playlist_result(entries, course_id, course_title)