Browse Source

[dw] add support for article pages(closes #8790)

master
remitamine 8 years ago
parent
commit
36bb63e084
2 changed files with 35 additions and 6 deletions
  1. 5
      youtube_dl/extractor/__init__.py
  2. 36
      youtube_dl/extractor/dw.py

5
youtube_dl/extractor/__init__.py

@ -189,7 +189,10 @@ from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE from .discovery import DiscoveryIE
from .dropbox import DropboxIE from .dropbox import DropboxIE
from .dw import DWIE
from .dw import (
DWIE,
DWArticleIE,
)
from .eagleplatform import EaglePlatformIE from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE from .echomsk import EchoMskIE

36
youtube_dl/extractor/dw.py

@ -3,9 +3,11 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import int_or_none from ..utils import int_or_none
from ..compat import compat_urlparse
class DWIE(InfoExtractor): class DWIE(InfoExtractor):
IE_NAME = 'dw'
_VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
# video # video
@ -32,16 +34,15 @@ class DWIE(InfoExtractor):
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
media_id = self._match_id(url)
webpage = self._download_webpage(url, media_id)
hidden_inputs = self._hidden_inputs(webpage) hidden_inputs = self._hidden_inputs(webpage)
title = hidden_inputs['media_title'] title = hidden_inputs['media_title']
formats = [] formats = []
if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
formats = self._extract_smil_formats( formats = self._extract_smil_formats(
'http://www.dw.com/smil/v-%s' % video_id, video_id,
'http://www.dw.com/smil/v-%s' % media_id, media_id,
transform_source=lambda s: s.replace( transform_source=lambda s: s.replace(
'rtmp://tv-od.dw.de/flash/', 'rtmp://tv-od.dw.de/flash/',
'http://tv-download.dw.de/dwtv_video/flv/')) 'http://tv-download.dw.de/dwtv_video/flv/'))
@ -49,7 +50,7 @@ class DWIE(InfoExtractor):
formats = [{'url': hidden_inputs['file_name']}] formats = [{'url': hidden_inputs['file_name']}]
return { return {
'id': video_id,
'id': media_id,
'title': title, 'title': title,
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
'thumbnail': hidden_inputs.get('preview_image'), 'thumbnail': hidden_inputs.get('preview_image'),
@ -57,3 +58,28 @@ class DWIE(InfoExtractor):
'upload_date': hidden_inputs.get('display_date'), 'upload_date': hidden_inputs.get('display_date'),
'formats': formats, 'formats': formats,
} }
class DWArticleIE(InfoExtractor):
IE_NAME = 'dw:article'
_VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
_TEST = {
'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
'md5': '8ca657f9d068bbef74d6fc38b97fc869',
'info_dict': {
'id': '19105868',
'ext': 'mp4',
'title': 'The harsh life of refugees in Idomeni',
'description': 'md5:196015cc7e48ebf474db9399420043c7',
'upload_date': '20160310',
}
}
def _real_extract(self, url):
article_id = self._match_id(url)
webpage = self._download_webpage(url, article_id)
hidden_inputs = self._hidden_inputs(webpage)
media_id = hidden_inputs['media_id']
media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
media_url = compat_urlparse.urljoin(url, media_path)
return self.url_result(media_url, 'DW', media_id)
Loading…
Cancel
Save