You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

123 lines
4.2 KiB

  1. import itertools
  2. import json
  3. import re
  4. from .common import InfoExtractor, SearchInfoExtractor
  5. from ..utils import (
  6. compat_urllib_parse,
  7. compat_urlparse,
  8. determine_ext,
  9. clean_html,
  10. )
  11. class YahooIE(InfoExtractor):
  12. IE_DESC = u'Yahoo screen'
  13. _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
  14. _TESTS = [
  15. {
  16. u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
  17. u'file': u'214727115.flv',
  18. u'info_dict': {
  19. u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
  20. u'description': u'Julian and Travis watch Julian Smith',
  21. },
  22. u'params': {
  23. # Requires rtmpdump
  24. u'skip_download': True,
  25. },
  26. },
  27. {
  28. u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
  29. u'file': u'103000935.flv',
  30. u'info_dict': {
  31. u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
  32. u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
  33. },
  34. u'params': {
  35. # Requires rtmpdump
  36. u'skip_download': True,
  37. },
  38. },
  39. ]
  40. def _real_extract(self, url):
  41. mobj = re.match(self._VALID_URL, url)
  42. video_id = mobj.group('id')
  43. webpage = self._download_webpage(url, video_id)
  44. items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$',
  45. webpage, u'items', flags=re.MULTILINE)
  46. items = json.loads(items_json)
  47. info = items['mediaItems']['query']['results']['mediaObj'][0]
  48. meta = info['meta']
  49. formats = []
  50. for s in info['streams']:
  51. format_info = {
  52. 'width': s.get('width'),
  53. 'height': s.get('height'),
  54. 'bitrate': s.get('bitrate'),
  55. }
  56. host = s['host']
  57. path = s['path']
  58. if host.startswith('rtmp'):
  59. format_info.update({
  60. 'url': host,
  61. 'play_path': path,
  62. 'ext': 'flv',
  63. })
  64. else:
  65. format_url = compat_urlparse.urljoin(host, path)
  66. format_info['url'] = format_url
  67. format_info['ext'] = determine_ext(format_url)
  68. formats.append(format_info)
  69. formats = sorted(formats, key=lambda f:(f['height'], f['width']))
  70. info = {
  71. 'id': video_id,
  72. 'title': meta['title'],
  73. 'formats': formats,
  74. 'description': clean_html(meta['description']),
  75. 'thumbnail': meta['thumbnail'],
  76. }
  77. # TODO: Remove when #980 has been merged
  78. info.update(formats[-1])
  79. return info
  80. class YahooSearchIE(SearchInfoExtractor):
  81. IE_DESC = u'Yahoo screen search'
  82. _MAX_RESULTS = 1000
  83. IE_NAME = u'screen.yahoo:search'
  84. _SEARCH_KEY = 'yvsearch'
  85. def _get_n_results(self, query, n):
  86. """Get a specified number of results for a query"""
  87. res = {
  88. '_type': 'playlist',
  89. 'id': query,
  90. 'entries': []
  91. }
  92. for pagenum in itertools.count(0):
  93. result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
  94. webpage = self._download_webpage(result_url, query,
  95. note='Downloading results page '+str(pagenum+1))
  96. info = json.loads(webpage)
  97. m = info[u'm']
  98. results = info[u'results']
  99. for (i, r) in enumerate(results):
  100. if (pagenum * 30) +i >= n:
  101. break
  102. mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
  103. e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
  104. res['entries'].append(e)
  105. if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
  106. break
  107. return res