You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

156 lines
5.9 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .common import InfoExtractor
  5. from .youtube import YoutubeIE
  6. from ..utils import (
  7. clean_html,
  8. ExtractorError,
  9. get_element_by_id,
  10. )
  11. class TechTVMITIE(InfoExtractor):
  12. IE_NAME = 'techtv.mit.edu'
  13. _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'
  14. _TEST = {
  15. 'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
  16. 'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
  17. 'info_dict': {
  18. 'id': '25418',
  19. 'ext': 'mp4',
  20. 'title': 'MIT DNA and Protein Sets',
  21. 'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
  22. },
  23. }
  24. def _real_extract(self, url):
  25. video_id = self._match_id(url)
  26. raw_page = self._download_webpage(
  27. 'http://techtv.mit.edu/videos/%s' % video_id, video_id)
  28. clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
  29. base_url = self._proto_relative_url(self._search_regex(
  30. r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')
  31. formats_json = self._search_regex(
  32. r'bitrates: (\[.+?\])', raw_page, 'video formats')
  33. formats_mit = json.loads(formats_json)
  34. formats = [
  35. {
  36. 'format_id': f['label'],
  37. 'url': base_url + f['url'].partition(':')[2],
  38. 'ext': f['url'].partition(':')[0],
  39. 'format': f['label'],
  40. 'width': f['width'],
  41. 'vbr': f['bitrate'],
  42. }
  43. for f in formats_mit
  44. ]
  45. title = get_element_by_id('edit-title', clean_page)
  46. description = clean_html(get_element_by_id('edit-description', clean_page))
  47. thumbnail = self._search_regex(
  48. r'playlist:.*?url: \'(.+?)\'',
  49. raw_page, 'thumbnail', flags=re.DOTALL)
  50. return {
  51. 'id': video_id,
  52. 'title': title,
  53. 'formats': formats,
  54. 'description': description,
  55. 'thumbnail': thumbnail,
  56. }
  57. class MITIE(TechTVMITIE):
  58. IE_NAME = 'video.mit.edu'
  59. _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
  60. _TEST = {
  61. 'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
  62. 'md5': '7db01d5ccc1895fc5010e9c9e13648da',
  63. 'info_dict': {
  64. 'id': '21783',
  65. 'ext': 'mp4',
  66. 'title': 'The Government is Profiling You',
  67. 'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
  68. },
  69. }
  70. def _real_extract(self, url):
  71. mobj = re.match(self._VALID_URL, url)
  72. page_title = mobj.group('title')
  73. webpage = self._download_webpage(url, page_title)
  74. embed_url = self._search_regex(
  75. r'<iframe .*?src="(.+?)"', webpage, 'embed url')
  76. return self.url_result(embed_url, ie='TechTVMIT')
  77. class OCWMITIE(InfoExtractor):
  78. IE_NAME = 'ocw.mit.edu'
  79. _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
  80. _BASE_URL = 'http://ocw.mit.edu/'
  81. _TESTS = [
  82. {
  83. 'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
  84. 'info_dict': {
  85. 'id': 'EObHWIEKGjA',
  86. 'ext': 'mp4',
  87. 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
  88. 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
  89. 'upload_date': '20121109',
  90. 'uploader_id': 'MIT',
  91. 'uploader': 'MIT OpenCourseWare',
  92. }
  93. },
  94. {
  95. 'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
  96. 'info_dict': {
  97. 'id': '7K1sB05pE0A',
  98. 'ext': 'mp4',
  99. 'title': 'Session 1: Introduction to Derivatives',
  100. 'upload_date': '20090818',
  101. 'uploader_id': 'MIT',
  102. 'uploader': 'MIT OpenCourseWare',
  103. 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
  104. }
  105. }
  106. ]
  107. def _real_extract(self, url):
  108. mobj = re.match(self._VALID_URL, url)
  109. topic = mobj.group('topic')
  110. webpage = self._download_webpage(url, topic)
  111. title = self._html_search_meta('WT.cg_s', webpage)
  112. description = self._html_search_meta('Description', webpage)
  113. # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
  114. embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
  115. if embed_chapter_media:
  116. metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
  117. metadata = re.split(r', ?', metadata)
  118. yt = metadata[1]
  119. else:
  120. # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
  121. embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
  122. if embed_media:
  123. metadata = re.sub(r'[\'"]', '', embed_media.group(1))
  124. metadata = re.split(r', ?', metadata)
  125. yt = metadata[1]
  126. else:
  127. raise ExtractorError('Unable to find embedded YouTube video.')
  128. video_id = YoutubeIE.extract_id(yt)
  129. return {
  130. '_type': 'url_transparent',
  131. 'id': video_id,
  132. 'title': title,
  133. 'description': description,
  134. 'url': yt,
  135. 'ie_key': 'Youtube',
  136. }