|
|
@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor): |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
class ArteTVPlus7IE(InfoExtractor): |
|
|
|
IE_NAME = 'arte.tv:+7' |
|
|
|
_VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' |
|
|
|
|
|
|
|
class ArteTVBaseIE(InfoExtractor): |
|
|
|
@classmethod |
|
|
|
def _extract_url_info(cls, url): |
|
|
|
mobj = re.match(cls._VALID_URL, url) |
|
|
@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor): |
|
|
|
video_id = mobj.group('id') |
|
|
|
return video_id, lang |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
video_id, lang = self._extract_url_info(url) |
|
|
|
webpage = self._download_webpage(url, video_id) |
|
|
|
return self._extract_from_webpage(webpage, video_id, lang) |
|
|
|
|
|
|
|
def _extract_from_webpage(self, webpage, video_id, lang): |
|
|
|
patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') |
|
|
|
ids = (video_id, '') |
|
|
|
# some pages contain multiple videos (like |
|
|
|
# http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), |
|
|
|
# so we first try to look for json URLs that contain the video id from |
|
|
|
# the 'vid' parameter. |
|
|
|
patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] |
|
|
|
json_url = self._html_search_regex( |
|
|
|
patterns, webpage, 'json vp url', default=None) |
|
|
|
if not json_url: |
|
|
|
def find_iframe_url(webpage, default=NO_DEFAULT): |
|
|
|
return self._html_search_regex( |
|
|
|
r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', |
|
|
|
webpage, 'iframe url', group='url', default=default) |
|
|
|
|
|
|
|
iframe_url = find_iframe_url(webpage, None) |
|
|
|
if not iframe_url: |
|
|
|
embed_url = self._html_search_regex( |
|
|
|
r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) |
|
|
|
if embed_url: |
|
|
|
player = self._download_json( |
|
|
|
embed_url, video_id, 'Downloading player page') |
|
|
|
iframe_url = find_iframe_url(player['html']) |
|
|
|
# en and es URLs produce react-based pages with different layout (e.g. |
|
|
|
# http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) |
|
|
|
if not iframe_url: |
|
|
|
program = self._search_regex( |
|
|
|
r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', |
|
|
|
webpage, 'program', default=None) |
|
|
|
if program: |
|
|
|
embed_html = self._parse_json(program, video_id) |
|
|
|
if embed_html: |
|
|
|
iframe_url = find_iframe_url(embed_html['embed_html']) |
|
|
|
if iframe_url: |
|
|
|
json_url = compat_parse_qs( |
|
|
|
compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] |
|
|
|
if json_url: |
|
|
|
title = self._search_regex( |
|
|
|
r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', |
|
|
|
webpage, 'title', default=None, group='title') |
|
|
|
return self._extract_from_json_url(json_url, video_id, lang, title=title) |
|
|
|
# Different kind of embed URL (e.g. |
|
|
|
# http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) |
|
|
|
embed_url = self._search_regex( |
|
|
|
r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', |
|
|
|
webpage, 'embed url', group='url') |
|
|
|
return self.url_result(embed_url) |
|
|
|
|
|
|
|
def _extract_from_json_url(self, json_url, video_id, lang, title=None): |
|
|
|
info = self._download_json(json_url, video_id) |
|
|
|
player_info = info['videoJsonPlayer'] |
|
|
@ -235,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor): |
|
|
|
return info_dict |
|
|
|
|
|
|
|
|
|
|
|
class ArteTVPlus7IE(ArteTVBaseIE): |
|
|
|
IE_NAME = 'arte.tv:+7' |
|
|
|
_VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' |
|
|
|
|
|
|
|
_TESTS = [{ |
|
|
|
'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', |
|
|
|
'only_matching': True, |
|
|
|
}] |
|
|
|
|
|
|
|
@classmethod |
|
|
|
def suitable(cls, url): |
|
|
|
return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
video_id, lang = self._extract_url_info(url) |
|
|
|
webpage = self._download_webpage(url, video_id) |
|
|
|
return self._extract_from_webpage(webpage, video_id, lang) |
|
|
|
|
|
|
|
def _extract_from_webpage(self, webpage, video_id, lang): |
|
|
|
patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') |
|
|
|
ids = (video_id, '') |
|
|
|
# some pages contain multiple videos (like |
|
|
|
# http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), |
|
|
|
# so we first try to look for json URLs that contain the video id from |
|
|
|
# the 'vid' parameter. |
|
|
|
patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] |
|
|
|
json_url = self._html_search_regex( |
|
|
|
patterns, webpage, 'json vp url', default=None) |
|
|
|
if not json_url: |
|
|
|
def find_iframe_url(webpage, default=NO_DEFAULT): |
|
|
|
return self._html_search_regex( |
|
|
|
r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', |
|
|
|
webpage, 'iframe url', group='url', default=default) |
|
|
|
|
|
|
|
iframe_url = find_iframe_url(webpage, None) |
|
|
|
if not iframe_url: |
|
|
|
embed_url = self._html_search_regex( |
|
|
|
r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) |
|
|
|
if embed_url: |
|
|
|
player = self._download_json( |
|
|
|
embed_url, video_id, 'Downloading player page') |
|
|
|
iframe_url = find_iframe_url(player['html']) |
|
|
|
# en and es URLs produce react-based pages with different layout (e.g. |
|
|
|
# http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) |
|
|
|
if not iframe_url: |
|
|
|
program = self._search_regex( |
|
|
|
r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', |
|
|
|
webpage, 'program', default=None) |
|
|
|
if program: |
|
|
|
embed_html = self._parse_json(program, video_id) |
|
|
|
if embed_html: |
|
|
|
iframe_url = find_iframe_url(embed_html['embed_html']) |
|
|
|
if iframe_url: |
|
|
|
json_url = compat_parse_qs( |
|
|
|
compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] |
|
|
|
if json_url: |
|
|
|
title = self._search_regex( |
|
|
|
r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', |
|
|
|
webpage, 'title', default=None, group='title') |
|
|
|
return self._extract_from_json_url(json_url, video_id, lang, title=title) |
|
|
|
# Different kind of embed URL (e.g. |
|
|
|
# http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) |
|
|
|
embed_url = self._search_regex( |
|
|
|
r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', |
|
|
|
webpage, 'embed url', group='url') |
|
|
|
return self.url_result(embed_url) |
|
|
|
|
|
|
|
|
|
|
|
# It also uses the arte_vp_url url from the webpage to extract the information |
|
|
|
class ArteTVCreativeIE(ArteTVPlus7IE): |
|
|
|
IE_NAME = 'arte.tv:creative' |
|
|
@ -267,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): |
|
|
|
IE_NAME = 'arte.tv:info' |
|
|
|
_VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' |
|
|
|
|
|
|
|
_TEST = { |
|
|
|
_TESTS = [{ |
|
|
|
'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', |
|
|
|
'info_dict': { |
|
|
|
'id': '067528-000-A', |
|
|
@ -275,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): |
|
|
|
'title': 'Service civique, un cache misère ?', |
|
|
|
'upload_date': '20160403', |
|
|
|
}, |
|
|
|
} |
|
|
|
}] |
|
|
|
|
|
|
|
|
|
|
|
class ArteTVFutureIE(ArteTVPlus7IE): |
|
|
@ -300,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE): |
|
|
|
IE_NAME = 'arte.tv:ddc' |
|
|
|
_VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' |
|
|
|
|
|
|
|
_TESTS = [] |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
video_id, lang = self._extract_url_info(url) |
|
|
|
if lang == 'folge': |
|
|
@ -318,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE): |
|
|
|
IE_NAME = 'arte.tv:concert' |
|
|
|
_VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' |
|
|
|
|
|
|
|
_TEST = { |
|
|
|
_TESTS = [{ |
|
|
|
'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', |
|
|
|
'md5': '9ea035b7bd69696b67aa2ccaaa218161', |
|
|
|
'info_dict': { |
|
|
@ -328,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE): |
|
|
|
'upload_date': '20140128', |
|
|
|
'description': 'md5:486eb08f991552ade77439fe6d82c305', |
|
|
|
}, |
|
|
|
} |
|
|
|
}] |
|
|
|
|
|
|
|
|
|
|
|
class ArteTVCinemaIE(ArteTVPlus7IE): |
|
|
|
IE_NAME = 'arte.tv:cinema' |
|
|
|
_VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' |
|
|
|
|
|
|
|
_TEST = { |
|
|
|
_TESTS = [{ |
|
|
|
'url': 'http://cinema.arte.tv/de/node/38291', |
|
|
|
'md5': '6b275511a5107c60bacbeeda368c3aa1', |
|
|
|
'info_dict': { |
|
|
@ -345,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE): |
|
|
|
'upload_date': '20160122', |
|
|
|
'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', |
|
|
|
}, |
|
|
|
} |
|
|
|
}] |
|
|
|
|
|
|
|
|
|
|
|
class ArteTVMagazineIE(ArteTVPlus7IE): |
|
|
@ -390,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE): |
|
|
|
) |
|
|
|
''' |
|
|
|
|
|
|
|
_TESTS = [] |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
mobj = re.match(self._VALID_URL, url) |
|
|
|
video_id = mobj.group('id') |
|
|
|
lang = mobj.group('lang') |
|
|
|
json_url = mobj.group('json_url') |
|
|
|
return self._extract_from_json_url(json_url, video_id, lang) |
|
|
|
|
|
|
|
|
|
|
|
class ArteTVPlaylistIE(ArteTVBaseIE): |
|
|
|
IE_NAME = 'arte.tv:playlist' |
|
|
|
_VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' |
|
|
|
|
|
|
|
_TESTS = [{ |
|
|
|
'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', |
|
|
|
'info_dict': { |
|
|
|
'id': 'PL-013263', |
|
|
|
'title': 'Areva & Uramin', |
|
|
|
}, |
|
|
|
'playlist_mincount': 6, |
|
|
|
}, { |
|
|
|
'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', |
|
|
|
'only_matching': True, |
|
|
|
}] |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
playlist_id, lang = self._extract_url_info(url) |
|
|
|
collection = self._download_json( |
|
|
|
'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' |
|
|
|
% (lang, playlist_id), playlist_id) |
|
|
|
title = collection.get('title') |
|
|
|
description = collection.get('shortDescription') or collection.get('teaserText') |
|
|
|
entries = [ |
|
|
|
self._extract_from_json_url( |
|
|
|
video['jsonUrl'], video.get('programId') or playlist_id, lang) |
|
|
|
for video in collection['videos'] if video.get('jsonUrl')] |
|
|
|
return self.playlist_result(entries, playlist_id, title, description) |