Merge remote-tracking branch 'upstream/master'

11 years ago · 99859d436c
10 changed files with 127 additions and 18 deletions
--- a/devscripts/youtube_genalgo.py
+++ b/devscripts/youtube_genalgo.py
@ -26,9 +26,9 @@ tests = [
    # 85
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<",
     ".>/?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ0q876543r1mnbvcx9asdfghjklpoiuyt2"),
-    # 84
+    # 84 - vflh9ybst 2013/08/23 (sporadic)
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
-     "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWQ09876543q1mnbvcxzasdfghjklpoiuew2"),
+     "yuioplkjhgfdsazxcvbnm1234567890QWERrYUIOPLKqHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<"),
    # 83
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
     ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"),
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -29,6 +29,7 @@ from .gametrailers import GametrailersIE
 from .generic import GenericIE
 from .googleplus import GooglePlusIE
 from .googlesearch import GoogleSearchIE
+from .hark import HarkIE
 from .hotnewhiphop import HotNewHipHopIE
 from .howcast import HowcastIE
 from .hypem import HypemIE
@ -57,6 +58,7 @@ from .pornotube import PornotubeIE
 from .rbmaradio import RBMARadioIE
 from .redtube import RedTubeIE
 from .ringtv import RingTVIE
+from .ro220 import Ro220IE
 from .roxwel import RoxwelIE
 from .rtlnow import RTLnowIE
 from .sina import SinaIE
@ -116,12 +118,14 @@ _ALL_CLASSES = [
 ]
 _ALL_CLASSES.append(GenericIE)

+
 def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [klass() for klass in _ALL_CLASSES]

+
 def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    return globals()[ie_name+'IE']
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@ -12,8 +12,8 @@ class C56IE(InfoExtractor):

    _TEST ={
        u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html',
-        u'file': u'93440716.mp4',
-        u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e',
+        u'file': u'93440716.flv',
+        u'md5': u'e59995ac63d0457783ea05f93f12a866',
        u'info_dict': {
            u'title': u'网事知多少 第32期：车怒',
        },
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@ -21,7 +21,7 @@ class DailymotionIE(InfoExtractor):
        u'file': u'x33vw9.mp4',
        u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
        u'info_dict': {
-            u"uploader": u"Alex and Van .", 
+            u"uploader": u"Amphora Alex and Van .", 
            u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
        }
    }
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -7,12 +7,14 @@ from .common import InfoExtractor
 from ..utils import (
    compat_urllib_error,
    compat_urllib_parse,
+    compat_urllib_parse_urlparse,
    compat_urllib_request,

    ExtractorError,
 )
 from .brightcove import BrightcoveIE

+
 class GenericIE(InfoExtractor):
    IE_DESC = u'Generic downloader that works on some sites'
    _VALID_URL = r'.*'
@ -23,7 +25,7 @@ class GenericIE(InfoExtractor):
            u'file': u'13601338388002.mp4',
            u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
            u'info_dict': {
-                u"uploader": u"www.hodiho.fr", 
+                u"uploader": u"www.hodiho.fr",
                u"title": u"R\u00e9gis plante sa Jeep"
            }
        },
@ -124,7 +126,7 @@ class GenericIE(InfoExtractor):
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
-        # Look for BrigthCove:
+        # Look for BrightCove:
        m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
        if m_brightcove is not None:
            self.to_screen(u'Brightcove video detected.')
@ -161,6 +163,10 @@ class GenericIE(InfoExtractor):
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
+        if video_url.startswith('//'):
+            video_url = compat_urllib_parse_urlparse(url).scheme + ':' + video_url
+        if '://' not in video_url:
+            video_url = url + ('' if url.endswith('/') else '/') + video_url
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
--- a/youtube_dl/extractor/hark.py
+++ b/youtube_dl/extractor/hark.py
@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class HarkIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+'
+    _TEST = {
+        u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
+        u'file': u'mmbzyhkgny.mp3',
+        u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
+        u'info_dict': {
+            u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ",
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id)
+        webpage = self._download_webpage(embed_url, video_id)
+
+        final_url = self._search_regex(r'src="(.+?).mp3"',
+                                webpage, 'video url')+'.mp3'
+        title = self._html_search_regex(r'<title>(.+?)</title>',
+                                webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace(
+                                'Sound Clip , Quote, MP3, and Ringtone - Hark','')
+
+        return {'id': video_id,
+                'url' : final_url,
+                'title': title,
+                'ext': determine_ext(final_url),
+                }
--- a/youtube_dl/extractor/ro220.py
+++ b/youtube_dl/extractor/ro220.py
@ -0,0 +1,42 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    compat_parse_qs,
+)
+
+
+class Ro220IE(InfoExtractor):
+    IE_NAME = '220.ro'
+    _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)'
+    _TEST = {
+        u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/",
+        u'file': u'LYV6doKo7f.mp4',
+        u'md5': u'03af18b73a07b4088753930db7a34add',
+        u'info_dict': {
+            u"title": u"Luati-le Banii sez 4 ep 1",
+            u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.",
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+
+        webpage = self._download_webpage(url, video_id)
+        flashVars_str = self._search_regex(
+            r'<param name="flashVars" value="([^"]+)"',
+            webpage, u'flashVars')
+        flashVars = compat_parse_qs(flashVars_str)
+
+        info = {
+            '_type': 'video',
+            'id': video_id,
+            'ext': 'mp4',
+            'url': flashVars['videoURL'][0],
+            'title': flashVars['title'][0],
+            'description': clean_html(flashVars['desc'][0]),
+            'thumbnail': flashVars['preview'][0],
+        }
+        return info
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@ -8,8 +8,8 @@ from ..utils import (
 )

 class RTLnowIE(InfoExtractor):
-    """Information Extractor for RTLnow, RTL2now and VOXnow"""
-    _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl(?:(?P<is_rtl2>2)|-)now\.rtl(?(is_rtl2)2|)\.de/|(?:www\.)?voxnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
+    """Information Extractor for RTL NOW, RTL2 NOW, SUPER RTL NOW and VOX NOW"""
+    _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?superrtlnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
    _TESTS = [{
        u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
        u'file': u'90419.flv',
@ -48,6 +48,19 @@ class RTLnowIE(InfoExtractor):
        u'params': {
            u'skip_download': True,
        },
+    },
+    {
+        u'url': u'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
+        u'file': u'99205.flv',
+        u'info_dict': {
+            u'upload_date': u'20080928', 
+            u'title': u'Medicopter 117 - Angst!',
+            u'description': u'Angst!',
+            u'thumbnail': u'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg'
+        },
+        u'params': {
+            u'skip_download': True,
+        },
    }]

    def _real_extract(self,url):
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -427,7 +427,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        elif len(s) == 85:
            return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27]
        elif len(s) == 84:
-            return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27]
+            return s[5:40] + s[3] + s[41:48] + s[0] + s[49:84]
        elif len(s) == 83:
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
        elif len(s) == 82:
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -476,7 +476,7 @@ def formatSeconds(secs):
 def make_HTTPS_handler(opts):
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
-        return compat_urllib_request.HTTPSHandler()
+        return YoutubeDLHandlerHTTPS()
    else:
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
@ -485,7 +485,7 @@ def make_HTTPS_handler(opts):
        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
-        return compat_urllib_request.HTTPSHandler(context=context)
+        return YoutubeDLHandlerHTTPS(context=context)

 class ExtractorError(Exception):
    """Error during info extraction."""
@ -569,7 +569,8 @@ class ContentTooShortError(Exception):
        self.downloaded = downloaded
        self.expected = expected

-class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
+
+class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
@ -602,8 +603,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
        ret.code = code
        return ret

-    def http_request(self, req):
-        for h,v in std_headers.items():
+    def _http_request(self, req):
+        for h, v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
@ -618,7 +619,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
            del req.headers['Youtubedl-user-agent']
        return req

-    def http_response(self, req, resp):
+    def _http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
@ -632,8 +633,16 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
            resp.msg = old_resp.msg
        return resp

-    https_request = http_request
-    https_response = http_response
+
+class YoutubeDLHandler(YoutubeDLHandler_Template, compat_urllib_request.HTTPHandler):
+    http_request = YoutubeDLHandler_Template._http_request
+    http_response = YoutubeDLHandler_Template._http_response
+
+
+class YoutubeDLHandlerHTTPS(YoutubeDLHandler_Template, compat_urllib_request.HTTPSHandler):
+    https_request = YoutubeDLHandler_Template._http_request
+    https_response = YoutubeDLHandler_Template._http_response
+

 def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""