You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

227 lines
8.2 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .subtitles import SubtitlesInfoExtractor
  5. from .common import InfoExtractor
  6. from ..compat import (
  7. compat_str,
  8. compat_urllib_parse,
  9. compat_urllib_request,
  10. )
  11. from ..utils import (
  12. ExtractorError,
  13. int_or_none,
  14. )
  15. class LyndaIE(SubtitlesInfoExtractor):
  16. IE_NAME = 'lynda'
  17. IE_DESC = 'lynda.com videos'
  18. _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html'
  19. _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
  20. _NETRC_MACHINE = 'lynda'
  21. _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
  22. _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
  23. ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
  24. _TEST = {
  25. 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
  26. 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
  27. 'info_dict': {
  28. 'id': '114408',
  29. 'ext': 'mp4',
  30. 'title': 'Using the exercise files',
  31. 'duration': 68
  32. }
  33. }
  34. def _real_initialize(self):
  35. self._login()
  36. def _real_extract(self, url):
  37. mobj = re.match(self._VALID_URL, url)
  38. video_id = mobj.group(1)
  39. page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
  40. 'Downloading video JSON')
  41. video_json = json.loads(page)
  42. if 'Status' in video_json:
  43. raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True)
  44. if video_json['HasAccess'] is False:
  45. raise ExtractorError(
  46. 'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
  47. video_id = compat_str(video_json['ID'])
  48. duration = video_json['DurationInSeconds']
  49. title = video_json['Title']
  50. formats = []
  51. fmts = video_json.get('Formats')
  52. if fmts:
  53. formats.extend([
  54. {
  55. 'url': fmt['Url'],
  56. 'ext': fmt['Extension'],
  57. 'width': fmt['Width'],
  58. 'height': fmt['Height'],
  59. 'filesize': fmt['FileSize'],
  60. 'format_id': str(fmt['Resolution'])
  61. } for fmt in fmts])
  62. prioritized_streams = video_json.get('PrioritizedStreams')
  63. if prioritized_streams:
  64. formats.extend([
  65. {
  66. 'url': video_url,
  67. 'width': int_or_none(format_id),
  68. 'format_id': format_id,
  69. } for format_id, video_url in prioritized_streams['0'].items()
  70. ])
  71. self._check_formats(formats, video_id)
  72. self._sort_formats(formats)
  73. if self._downloader.params.get('listsubtitles', False):
  74. self._list_available_subtitles(video_id, page)
  75. return
  76. subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page))
  77. return {
  78. 'id': video_id,
  79. 'title': title,
  80. 'duration': duration,
  81. 'subtitles': subtitles,
  82. 'formats': formats
  83. }
  84. def _login(self):
  85. (username, password) = self._get_login_info()
  86. if username is None:
  87. return
  88. login_form = {
  89. 'username': username,
  90. 'password': password,
  91. 'remember': 'false',
  92. 'stayPut': 'false'
  93. }
  94. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
  95. login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
  96. # Not (yet) logged in
  97. m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
  98. if m is not None:
  99. response = m.group('json')
  100. response_json = json.loads(response)
  101. state = response_json['state']
  102. if state == 'notlogged':
  103. raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
  104. # This is when we get popup:
  105. # > You're already logged in to lynda.com on two devices.
  106. # > If you log in here, we'll log you out of another device.
  107. # So, we need to confirm this.
  108. if state == 'conflicted':
  109. confirm_form = {
  110. 'username': '',
  111. 'password': '',
  112. 'resolve': 'true',
  113. 'remember': 'false',
  114. 'stayPut': 'false',
  115. }
  116. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
  117. login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device')
  118. if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
  119. raise ExtractorError('Unable to log in')
  120. def _fix_subtitles(self, subtitles):
  121. if subtitles is None:
  122. return subtitles # subtitles not requested
  123. fixed_subtitles = {}
  124. for k, v in subtitles.items():
  125. subs = json.loads(v)
  126. if len(subs) == 0:
  127. continue
  128. srt = ''
  129. for pos in range(0, len(subs) - 1):
  130. seq_current = subs[pos]
  131. m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
  132. if m_current is None:
  133. continue
  134. seq_next = subs[pos + 1]
  135. m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
  136. if m_next is None:
  137. continue
  138. appear_time = m_current.group('timecode')
  139. disappear_time = m_next.group('timecode')
  140. text = seq_current['Caption']
  141. srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text)
  142. if srt:
  143. fixed_subtitles[k] = srt
  144. return fixed_subtitles
  145. def _get_available_subtitles(self, video_id, webpage):
  146. url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
  147. sub = self._download_webpage(url, None, False)
  148. sub_json = json.loads(sub)
  149. return {'en': url} if len(sub_json) > 0 else {}
  150. class LyndaCourseIE(InfoExtractor):
  151. IE_NAME = 'lynda:course'
  152. IE_DESC = 'lynda.com online courses'
  153. # Course link equals to welcome/introduction video link of same course
  154. # We will recognize it as course link
  155. _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html'
  156. def _real_extract(self, url):
  157. mobj = re.match(self._VALID_URL, url)
  158. course_path = mobj.group('coursepath')
  159. course_id = mobj.group('courseid')
  160. page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
  161. course_id, 'Downloading course JSON')
  162. course_json = json.loads(page)
  163. if 'Status' in course_json and course_json['Status'] == 'NotFound':
  164. raise ExtractorError('Course %s does not exist' % course_id, expected=True)
  165. unaccessible_videos = 0
  166. videos = []
  167. (username, _) = self._get_login_info()
  168. # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
  169. # by single video API anymore
  170. for chapter in course_json['Chapters']:
  171. for video in chapter['Videos']:
  172. if username is None and video['HasAccess'] is False:
  173. unaccessible_videos += 1
  174. continue
  175. videos.append(video['ID'])
  176. if unaccessible_videos > 0:
  177. self._downloader.report_warning('%s videos are only available for members and will not be downloaded. '
  178. % unaccessible_videos + LyndaIE.ACCOUNT_CREDENTIALS_HINT)
  179. entries = [
  180. self.url_result('http://www.lynda.com/%s/%s-4.html' %
  181. (course_path, video_id),
  182. 'Lynda')
  183. for video_id in videos]
  184. course_title = course_json['Title']
  185. return self.playlist_result(entries, course_id, course_title)