2 changed files with 98 additions and 0 deletions
@@ -0,0 +1,97 @@
# encoding: utf-8

import re
import json
import time
import logging

from .common import InfoExtractor
from ..utils import compat_urllib_request


class SohuIE(InfoExtractor):
    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'

    _TEST = {
        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
        u'file': u'382479172.flv',
        u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
        u'info_dict': {
            u'title': u'The Illest - Far East Movement Riff Raff',
        },
    }

    def _clean_html(self, string):
        # Strip HTML tags from the title and collapse runs of whitespace
        # into single spaces.
        tags = re.findall(r'<.+?>', string)
        for t in tags:
            string = string.replace(t, ' ')
        for i in range(2):
            spaces = re.findall(r'\s+', string)
            for s in spaces:
                string = string.replace(s, ' ')
            string = string.strip()
        return string

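    # Extraction flow, as implemented below: fetch the watch page, pull the
    # numeric `vid` out of the player JavaScript, ask the VRS JSON API for the
    # clip list of the best available quality, then resolve each part through
    # an intermediate "allot" server to obtain the final download URLs.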
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
        compiled = re.compile(pattern, re.DOTALL)
        title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
        title = self._clean_html(title)

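        # The player JavaScript embeds the video's VRS id as `var vid="..."`;
        # without it the JSON API below cannot be queried.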
        pattern = re.compile(r'var vid="(\d+)"')
        result = re.search(pattern, webpage)
        if not result:
            logging.info('[Sohu] could not get vid')
            return None
        vid = result.group(1)
        logging.info('vid: %s' % vid)

        base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
        url_1 = base_url_1 + vid
        logging.info('json url: %s' % url_1)
        json_1 = json.loads(compat_urllib_request.urlopen(url_1).read())

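        # Fields read from the VRS response below: data.oriVid / superVid /
        # highVid / norVid (alternative quality ids, 0 when unavailable),
        # allot, prot, data.clipsURL, data.su and data.totalBlocks.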
        # Pick the vid of the highest-definition version that is available.
        vids = []
        qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
        for vid_name in qualities:
            vids.append(json_1['data'][vid_name])
        clearest_vid = 0
        for i, v in enumerate(vids):
            if v != 0:
                clearest_vid = v
                logging.info('quality definition: %s' % qualities[i][:-3])
                break
        if not clearest_vid:
            logging.warning('could not find valid clearest_vid')
            return None

        if int(vid) != clearest_vid:
            # Re-request the metadata for the highest-definition version.
            url_1 = '%s%d' % (base_url_1, clearest_vid)
            logging.info('highest definition json url: %s' % url_1)
            json_1 = json.loads(compat_urllib_request.urlopen(url_1).read())

        allot = json_1['allot']
        prot = json_1['prot']
        clipsURL = json_1['data']['clipsURL']
        su = json_1['data']['su']
        num_of_parts = json_1['data']['totalBlocks']
        logging.info('Total parts: %d' % num_of_parts)

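        # The response from the 'allot' host appears to be a pipe-separated
        # record: field 0 is the base of the real download URL and field 3 is
        # used as the `key` query parameter.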
        # Intermediate URL template: http://allot/?prot=prot&file=clipsURL[i]&new=su[i]
        files_info = []
        for i in range(num_of_parts):
            middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i])
            logging.info('middle url part %d: %s' % (i, middle_url))
            middle_info = compat_urllib_request.urlopen(middle_url).read().split('|')
            download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3])

            info = {
                'id': '%s_part%02d' % (video_id, i + 1),
                'title': title,
                'url': download_url,
                'ext': 'mp4',
            }
            files_info.append(info)
            # Be gentle with the server between part requests.
            time.sleep(1)

        return files_info