Merge remote-tracking branch 'upstream/master'

11 years ago · f4371f4784
11 changed files with 219 additions and 64 deletions
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@ -37,6 +37,8 @@ class TestAllURLsMatching(unittest.TestCase):
        assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
        assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
        self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
        # Top tracks
        assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
    def test_youtube_matching(self):
        self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@ -117,6 +117,13 @@ class TestYoutubeLists(unittest.TestCase):
        original_video = entries[0]
        self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
    def test_youtube_toptracks(self):
        dl = FakeYDL()
        ie = YoutubePlaylistIE(dl)
        result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
        entries = result['entries']
        self.assertEqual(len(entries), 100)
    def test_youtube_toplist(self):
        dl = FakeYDL()
        ie = YoutubeTopListIE(dl)
--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@ -41,6 +41,7 @@ __authors__  = (
    'Chris Gahan',
    'Saimadhav Heblikar',
    'Mike Col',
    'Andreas Schmitz',
 )
 __license__ = 'Public Domain'
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -143,8 +143,10 @@ from .myvideo import MyVideoIE
 from .naver import NaverIE
 from .nba import NBAIE
 from .nbc import NBCNewsIE
 from .ndr import NDRIE
 from .ndtv import NDTVIE
 from .newgrounds import NewgroundsIE
 from .nfb import NFBIE
 from .nhl import NHLIE, NHLVideocenterIE
 from .niconico import NiconicoIE
 from .ninegag import NineGagIE
--- a/youtube_dl/extractor/chilloutzone.py
+++ b/youtube_dl/extractor/chilloutzone.py
@ -1,14 +1,18 @@
 from __future__ import unicode_literals
 import re
 import base64
 import urllib
 import json
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
    ExtractorError
 )
 video_container = ('.mp4', '.mkv', '.flv')
 class ChilloutzoneIE(InfoExtractor):
    _VALID_URL = r'(?:https?://)?(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+).html'
    _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
    _TEST = {
        'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
        'md5': 'a76f3457e813ea0037e5244f509e66d1',
@ -16,6 +20,7 @@ class ChilloutzoneIE(InfoExtractor):
            'id': 'enemene-meck-alle-katzen-weg',
            'ext': 'mp4',
            'title': 'Enemene Meck - Alle Katzen weg',
            'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
        },
    }
@ -23,71 +28,45 @@ class ChilloutzoneIE(InfoExtractor):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'http://www.chilloutzone.net/video/' + video_id + '.html'
        webpage = self._download_webpage(url, video_id)
        # Log that we are starting to download the page
        self.report_download_webpage(webpage_url)
        webpage = self._download_webpage(webpage_url, video_id)
        # Log that we are starting to parse the page
        self.report_extraction(video_id)        
        # Find base64 decoded file info
        base64_video_info = self._html_search_regex(r'var cozVidData = "(.+?)";', webpage, u'video Data')
        # decode string and find video file
        base64_video_info = self._html_search_regex(
            r'var cozVidData = "(.+?)";', webpage, 'video data')
        decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
        video_info_dict = json.loads(decoded_video_info)
        # get video information from dict
        media_url = video_info_dict['mediaUrl']
        description = video_info_dict['description']
        video_url = video_info_dict['mediaUrl']
        description = clean_html(video_info_dict.get('description'))
        title = video_info_dict['title']
        native_platform = video_info_dict['nativePlatform']
        native_video_id = video_info_dict['nativeVideoId']
        source_priority = video_info_dict['sourcePriority']
        # Start video extraction
        video_url = ''
        # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
        if native_platform == None:
            # Look for other video urls
            video_url = self._html_search_regex(r'<iframe.* src="(.+?)".*', webpage, u'fallback Video URL')
            if 'youtube' in video_url:
                self.to_screen(u'Youtube video detected:')
                return self.url_result(video_url, ie='Youtube')
        # For debugging purposes
        #print video_info_dict
        #print native_platform
        #print native_video_id
        #print source_priority
        #print media_url
        if native_platform is None:
            youtube_url = self._html_search_regex(
                r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
                webpage, 'fallback video URL', default=None)
            if youtube_url is not None:
                return self.url_result(youtube_url, ie='Youtube')
        # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
        # the own CDN
        if source_priority == 'native':
            if native_platform == 'youtube':
                self.to_screen(u'Youtube video detected:')
                video_url = 'https://www.youtube.com/watch?v=' + native_video_id
                return self.url_result(video_url, ie='Youtube') 
                return self.url_result(video_id, ie='Youtube')
            if native_platform == 'vimeo':
                self.to_screen(u'Vimeo video detected:')
                video_url = 'http://vimeo.com/' + native_video_id
                return self.url_result(video_url, ie='Vimeo')
        # No redirect, use coz media url
        video_url = media_url
        if video_url.endswith('.mp4') == False:
            self.report_warning(u'Url does not contain a video container')
            return []
                return self.url_result(
                    'http://vimeo.com/' + native_video_id, ie='Vimeo')
        if not video_url:
            raise ExtractorError('No video found')
        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     title,
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
        }
--- a/youtube_dl/extractor/elpais.py
+++ b/youtube_dl/extractor/elpais.py
@ -9,7 +9,7 @@ from ..utils import unified_strdate
 class ElPaisIE(InfoExtractor):
    _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
    IE_DESCR = 'El País'
    IE_DESC = 'El País'
    _TEST = {
        'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
--- a/youtube_dl/extractor/mooshare.py
+++ b/youtube_dl/extractor/mooshare.py
@ -61,7 +61,7 @@ class MooshareIE(InfoExtractor):
        }
        request = compat_urllib_request.Request(
            'http://mooshare.biz/8dqtk4bjbp8g', compat_urllib_parse.urlencode(download_form))
            'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        self.to_screen('%s: Waiting for timeout' % video_id)
@ -111,4 +111,4 @@ class MooshareIE(InfoExtractor):
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
        }
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@ -0,0 +1,89 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import ExtractorError
 class NDRIE(InfoExtractor):
    IE_NAME = 'ndr'
    IE_DESC = 'NDR.de - Mediathek'
    _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
    _TESTS = [
        # video
        {
            'url': 'http://www.ndr.de/fernsehen/sendungen/hallo_niedersachsen/media/hallonds19925.html',
            'md5': '20eba151ff165f386643dad9c1da08f7',
            'info_dict': {
                'id': '19925',
                'ext': 'mp4',
                'title': 'Hallo Niedersachsen  ',
                'description': 'Bei Hallo Niedersachsen um 19:30 Uhr erfahren Sie alles, was am Tag in Niedersachsen los war.',
                'duration': 1722,
            },
        },
        # audio
        {
            'url': 'http://www.ndr.de/903/audio191719.html',
            'md5': '41ed601768534dd18a9ae34d84798129',
            'info_dict': {
                'id': '191719',
                'ext': 'mp3',
                'title': '"Es war schockierend"',
                'description': 'md5:ed7ff8364793545021a6355b97e95f10',
                'duration': 112,
            }
        }
    ]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        page = self._download_webpage(url, video_id, 'Downloading page')
        title = self._og_search_title(page)
        description = self._og_search_description(page)
        mobj = re.search(
            r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
            page)
        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
        formats = []
        mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
        if mp3_url:
            formats.append({
                'url': mp3_url.group('audio'),
                'format_id': 'mp3',
            })
        thumbnail = None
        video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
        if video_url:
            thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
                page, 'thumbnail', fatal=False)
            if thumbnail:
                thumbnail = 'http://www.ndr.de' + thumbnail
            for format_id in ['lo', 'hi', 'hq']:
                formats.append({
                    'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
                    'format_id': format_id,
                })
        if not formats:
            raise ExtractorError('No media links available for %s' % video_id)
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
--- a/youtube_dl/extractor/nfb.py
+++ b/youtube_dl/extractor/nfb.py
@ -0,0 +1,76 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    compat_urllib_request,
    compat_urllib_parse,
 )
 class NFBIE(InfoExtractor):
    IE_NAME = 'nfb'
    IE_DESC = 'National Film Board of Canada'
    _VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
    _TEST = {
        'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
        'info_dict': {
            'id': 'qallunaat_why_white_people_are_funny',
            'ext': 'mp4',
            'title': 'Qallunaat! Why White People Are Funny ',
            'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
            'duration': 3128,
            'uploader': 'Mark Sandiford',
            'uploader_id': 'mark-sandiford',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        page = self._download_webpage('https://www.nfb.ca/film/%s' % video_id, video_id, 'Downloading film page')
        uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
            page, 'director id', fatal=False)
        uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
            page, 'director name', fatal=False)
        request = compat_urllib_request.Request('https://www.nfb.ca/film/%s/player_config' % video_id,
            compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
        config = self._download_xml(request, video_id, 'Downloading player config XML')
        thumbnail = config.find("./player/stream/media[@type='posterImage']/assets/asset[@quality='high']/default/url").text
        video = config.find("./player/stream/media[@type='video']")
        duration = int(video.get('duration'))
        title = video.find('title').text
        description = video.find('description').text
        # It seems assets always go from lower to better quality, so no need to sort
        formats = [{
            'url': x.find('default/streamerURI').text + '/',
            'play_path': x.find('default/url').text,
            'rtmp_live': False,
            'ext': 'mp4',
            'format_id': x.get('quality'),
        } for x in video.findall('assets/asset')]
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'formats': formats,
        }
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -1422,7 +1422,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
@ -1431,7 +1431,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        (
                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots 
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
@ -1441,11 +1445,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'
    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_initialize(self):
        self._login()
@ -1469,7 +1468,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@ -1,2 +1,2 @@
 __version__ = '2014.02.06.1'
 __version__ = '2014.02.06.3'