You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

226 lines
8.2 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .subtitles import SubtitlesInfoExtractor
  5. from .common import InfoExtractor
  6. from ..compat import (
  7. compat_str,
  8. compat_urllib_parse,
  9. compat_urllib_request,
  10. )
  11. from ..utils import (
  12. ExtractorError,
  13. int_or_none,
  14. )
  15. class LyndaIE(SubtitlesInfoExtractor):
  16. IE_NAME = 'lynda'
  17. IE_DESC = 'lynda.com videos'
  18. _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html'
  19. _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
  20. _NETRC_MACHINE = 'lynda'
  21. _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
  22. _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
  23. ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
  24. _TEST = {
  25. 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
  26. 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
  27. 'info_dict': {
  28. 'id': '114408',
  29. 'ext': 'mp4',
  30. 'title': 'Using the exercise files',
  31. 'duration': 68
  32. }
  33. }
  34. def _real_initialize(self):
  35. self._login()
  36. def _real_extract(self, url):
  37. mobj = re.match(self._VALID_URL, url)
  38. video_id = mobj.group(1)
  39. page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
  40. 'Downloading video JSON')
  41. video_json = json.loads(page)
  42. if 'Status' in video_json:
  43. raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True)
  44. if video_json['HasAccess'] is False:
  45. raise ExtractorError(
  46. 'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
  47. video_id = compat_str(video_json['ID'])
  48. duration = video_json['DurationInSeconds']
  49. title = video_json['Title']
  50. formats = []
  51. fmts = video_json.get('Formats')
  52. if fmts:
  53. formats.extend([
  54. {
  55. 'url': fmt['Url'],
  56. 'ext': fmt['Extension'],
  57. 'width': fmt['Width'],
  58. 'height': fmt['Height'],
  59. 'filesize': fmt['FileSize'],
  60. 'format_id': str(fmt['Resolution'])
  61. } for fmt in fmts])
  62. prioritized_streams = video_json.get('PrioritizedStreams')
  63. if prioritized_streams:
  64. formats.extend([
  65. {
  66. 'url': video_url,
  67. 'width': int_or_none(format_id),
  68. 'format_id': format_id,
  69. } for format_id, video_url in prioritized_streams['0'].items()
  70. ])
  71. self._sort_formats(formats)
  72. if self._downloader.params.get('listsubtitles', False):
  73. self._list_available_subtitles(video_id, page)
  74. return
  75. subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page))
  76. return {
  77. 'id': video_id,
  78. 'title': title,
  79. 'duration': duration,
  80. 'subtitles': subtitles,
  81. 'formats': formats
  82. }
  83. def _login(self):
  84. (username, password) = self._get_login_info()
  85. if username is None:
  86. return
  87. login_form = {
  88. 'username': username,
  89. 'password': password,
  90. 'remember': 'false',
  91. 'stayPut': 'false'
  92. }
  93. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
  94. login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
  95. # Not (yet) logged in
  96. m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
  97. if m is not None:
  98. response = m.group('json')
  99. response_json = json.loads(response)
  100. state = response_json['state']
  101. if state == 'notlogged':
  102. raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
  103. # This is when we get popup:
  104. # > You're already logged in to lynda.com on two devices.
  105. # > If you log in here, we'll log you out of another device.
  106. # So, we need to confirm this.
  107. if state == 'conflicted':
  108. confirm_form = {
  109. 'username': '',
  110. 'password': '',
  111. 'resolve': 'true',
  112. 'remember': 'false',
  113. 'stayPut': 'false',
  114. }
  115. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
  116. login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device')
  117. if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
  118. raise ExtractorError('Unable to log in')
  119. def _fix_subtitles(self, subtitles):
  120. if subtitles is None:
  121. return subtitles # subtitles not requested
  122. fixed_subtitles = {}
  123. for k, v in subtitles.items():
  124. subs = json.loads(v)
  125. if len(subs) == 0:
  126. continue
  127. srt = ''
  128. for pos in range(0, len(subs) - 1):
  129. seq_current = subs[pos]
  130. m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
  131. if m_current is None:
  132. continue
  133. seq_next = subs[pos + 1]
  134. m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
  135. if m_next is None:
  136. continue
  137. appear_time = m_current.group('timecode')
  138. disappear_time = m_next.group('timecode')
  139. text = seq_current['Caption']
  140. srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text)
  141. if srt:
  142. fixed_subtitles[k] = srt
  143. return fixed_subtitles
  144. def _get_available_subtitles(self, video_id, webpage):
  145. url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
  146. sub = self._download_webpage(url, None, False)
  147. sub_json = json.loads(sub)
  148. return {'en': url} if len(sub_json) > 0 else {}
  149. class LyndaCourseIE(InfoExtractor):
  150. IE_NAME = 'lynda:course'
  151. IE_DESC = 'lynda.com online courses'
  152. # Course link equals to welcome/introduction video link of same course
  153. # We will recognize it as course link
  154. _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html'
  155. def _real_extract(self, url):
  156. mobj = re.match(self._VALID_URL, url)
  157. course_path = mobj.group('coursepath')
  158. course_id = mobj.group('courseid')
  159. page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
  160. course_id, 'Downloading course JSON')
  161. course_json = json.loads(page)
  162. if 'Status' in course_json and course_json['Status'] == 'NotFound':
  163. raise ExtractorError('Course %s does not exist' % course_id, expected=True)
  164. unaccessible_videos = 0
  165. videos = []
  166. (username, _) = self._get_login_info()
  167. # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
  168. # by single video API anymore
  169. for chapter in course_json['Chapters']:
  170. for video in chapter['Videos']:
  171. if username is None and video['HasAccess'] is False:
  172. unaccessible_videos += 1
  173. continue
  174. videos.append(video['ID'])
  175. if unaccessible_videos > 0:
  176. self._downloader.report_warning('%s videos are only available for members and will not be downloaded. '
  177. % unaccessible_videos + LyndaIE.ACCOUNT_CREDENTIALS_HINT)
  178. entries = [
  179. self.url_result('http://www.lynda.com/%s/%s-4.html' %
  180. (course_path, video_id),
  181. 'Lynda')
  182. for video_id in videos]
  183. course_title = course_json['Title']
  184. return self.playlist_result(entries, course_id, course_title)