You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

85 lines
3.5 KiB

11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
  1. # coding: utf-8
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. compat_urllib_parse,
  7. unified_strdate,
  8. )
  9. class WatIE(InfoExtractor):
  10. _WORKING = False
  11. _VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html'
  12. IE_NAME = 'wat.tv'
  13. _TEST = {
  14. u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
  15. u'file': u'10631273.mp4',
  16. u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a',
  17. u'info_dict': {
  18. u'title': u'World War Z - Philadelphia VOST',
  19. u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
  20. }
  21. }
  22. def download_video_info(self, real_id):
  23. # 'contentv4' is used in the website, but it also returns the related
  24. # videos, we don't need them
  25. info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info')
  26. info = json.loads(info)
  27. return info['media']
  28. def _real_extract(self, url):
  29. def real_id_for_chapter(chapter):
  30. return chapter['tc_start'].split('-')[0]
  31. mobj = re.match(self._VALID_URL, url)
  32. short_id = mobj.group('shortID')
  33. webpage = self._download_webpage(url, short_id)
  34. real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
  35. video_info = self.download_video_info(real_id)
  36. chapters = video_info['chapters']
  37. first_chapter = chapters[0]
  38. if real_id_for_chapter(first_chapter) != real_id:
  39. self.to_screen('Multipart video detected')
  40. chapter_urls = []
  41. for chapter in chapters:
  42. chapter_id = real_id_for_chapter(chapter)
  43. # Yes, when we this chapter is processed by WatIE,
  44. # it will download the info again
  45. chapter_info = self.download_video_info(chapter_id)
  46. chapter_urls.append(chapter_info['url'])
  47. entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
  48. return self.playlist_result(entries, real_id, video_info['title'])
  49. # Otherwise we can continue and extract just one part, we have to use
  50. # the short id for getting the video url
  51. player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id,
  52. 'html5': '1'})
  53. player_info = self._download_webpage('http://www.wat.tv/player?' + player_data,
  54. real_id, u'Downloading player info')
  55. player = json.loads(player_info)['player']
  56. html5_player = self._html_search_regex(r'iframe src="(.*?)"', player,
  57. 'html5 player')
  58. player_webpage = self._download_webpage(html5_player, real_id,
  59. u'Downloading player webpage')
  60. video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage,
  61. 'video url')
  62. info = {'id': real_id,
  63. 'url': video_url,
  64. 'ext': 'mp4',
  65. 'title': first_chapter['title'],
  66. 'thumbnail': first_chapter['preview'],
  67. 'description': first_chapter['description'],
  68. 'view_count': video_info['views'],
  69. }
  70. if 'date_diffusion' in first_chapter:
  71. info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
  72. return info