Browse Source

[theintercept] Add new extractor

master
j 9 years ago
parent
commit
2be689b7e2
2 changed files with 69 additions and 0 deletions
  1. 1
      youtube_dl/extractor/__init__.py
  2. 68
      youtube_dl/extractor/theintercept.py

1
youtube_dl/extractor/__init__.py

@ -657,6 +657,7 @@ from .tenplay import TenPlayIE
from .testurl import TestURLIE from .testurl import TestURLIE
from .testtube import TestTubeIE from .testtube import TestTubeIE
from .tf1 import TF1IE from .tf1 import TF1IE
from .theintercept import TheInterceptIE
from .theonion import TheOnionIE from .theonion import TheOnionIE
from .theplatform import ( from .theplatform import (
ThePlatformIE, ThePlatformIE,

68
youtube_dl/extractor/theintercept.py

@ -0,0 +1,68 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class TheInterceptIE(InfoExtractor):
_VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>.+?)/'
_TESTS = [{
'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/',
'info_dict': {
'id': 'thisisacoup-episode-four-surrender-or-die',
'ext': 'mp4',
'title': '#ThisIsACoup – Episode Four: Surrender or Die',
'upload_date': '20151218',
'description': 'md5:74dd27f0e2fbd50817829f97eaa33140',
}
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
mobj = re.search(r'initialStoreTree =(?P<json_data>.+})', webpage)
if mobj is None:
raise ExtractorError('Unable to extract initialStoreTree')
json_data = self._parse_json(mobj.group('json_data'), display_id)
info = None
for post in json_data['resources']['posts'].values():
if post['slug'] == display_id:
info = post
break
if info is None:
raise ExtractorError('Unable to find info for %s'%display_id)
title = info['title']
description = info['excerpt']
upload_date = info['date'][:10].replace('-', '')
video_id = info['fov_videoid']
creator = ','.join([a['display_name'] for a in info['authors']])
thumbnail = self._og_search_property('image', webpage)
content_id = thumbnail.split('/')[-1].split('.')[0]
content_url = 'https://content.jwplatform.com/jw6/{content_id}.xml'.format(content_id=content_id)
content = self._download_xml(content_url, video_id)
formats = []
for source in content.findall('.//{http://rss.jwpcdn.com/}source'):
if source.attrib['file'].endswith('.m3u8'):
formats.extend(self._extract_m3u8_formats(
source.attrib['file'], video_id, 'mp4', preference=1, m3u8_id='hls'))
return {
'creator': creator,
'description': description,
'display_id': display_id,
'formats': formats,
'id': video_id,
'id': video_id,
'thumbnail': thumbnail,
'title': title,
'upload_date': upload_date,
}
Loading…
Cancel
Save