Browse Source

[CSpan] Add detection for Senate ISVP. Closes #5302

master
Yen Chi Hsuan 9 years ago
parent
commit
2fe1b5bd2a
3 changed files with 41 additions and 3 deletions
  1. 18
      youtube_dl/extractor/cspan.py
  2. 6
      youtube_dl/extractor/generic.py
  3. 20
      youtube_dl/extractor/senateisvp.py

18
youtube_dl/extractor/cspan.py

@ -7,7 +7,9 @@ from ..utils import (
int_or_none, int_or_none,
unescapeHTML, unescapeHTML,
find_xpath_attr, find_xpath_attr,
smuggle_url,
) )
from .senateisvp import SenateISVPIE
class CSpanIE(InfoExtractor): class CSpanIE(InfoExtractor):
@ -40,6 +42,15 @@ class CSpanIE(InfoExtractor):
'title': 'General Motors Ignition Switch Recall', 'title': 'General Motors Ignition Switch Recall',
}, },
'playlist_duration_sum': 14855, 'playlist_duration_sum': 14855,
}, {
# Video from senate.gov
'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
'md5': '7314c4b96dad66dd8e63dc3518ceaa6f',
'info_dict': {
'id': 'judiciary031715',
'ext': 'flv',
'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
}
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -56,7 +67,7 @@ class CSpanIE(InfoExtractor):
# present, otherwise this is a stripped version # present, otherwise this is a stripped version
r'<p class=\'initial\'>(.*?)</p>' r'<p class=\'initial\'>(.*?)</p>'
], ],
webpage, 'description', flags=re.DOTALL)
webpage, 'description', flags=re.DOTALL, default=None)
info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
data = self._download_json(info_url, video_id) data = self._download_json(info_url, video_id)
@ -68,6 +79,11 @@ class CSpanIE(InfoExtractor):
title = find_xpath_attr(doc, './/string', 'name', 'title').text title = find_xpath_attr(doc, './/string', 'name', 'title').text
thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
if senate_isvp_url:
surl = smuggle_url(senate_isvp_url, {'force_title': title})
return self.url_result(surl, 'SenateISVP', video_id, title)
files = data['video']['files'] files = data['video']['files']
entries = [{ entries = [{

6
youtube_dl/extractor/generic.py

@ -35,6 +35,7 @@ from .rutv import RUTVIE
from .smotri import SmotriIE from .smotri import SmotriIE
from .condenast import CondeNastIE from .condenast import CondeNastIE
from .udn import UDNEmbedIE from .udn import UDNEmbedIE
from .senateisvp import SenateISVPIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1365,6 +1366,11 @@ class GenericIE(InfoExtractor):
return self.url_result( return self.url_result(
compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
# Look for Senate ISVP iframe
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
if senate_isvp_url:
return self.url_result(surl, 'SenateISVP')
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True

20
youtube_dl/extractor/senateisvp.py

@ -3,7 +3,10 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError
from ..utils import (
ExtractorError,
unsmuggle_url,
)
from ..compat import ( from ..compat import (
compat_parse_qs, compat_parse_qs,
compat_urlparse, compat_urlparse,
@ -73,12 +76,22 @@ class SenateISVPIE(InfoExtractor):
} }
}] }]
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')
def _get_info_for_comm(self, committee): def _get_info_for_comm(self, committee):
for entry in self._COMM_MAP: for entry in self._COMM_MAP:
if entry[0] == committee: if entry[0] == committee:
return entry[1:] return entry[1:]
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs')) qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
raise ExtractorError('Invalid URL', expected=True) raise ExtractorError('Invalid URL', expected=True)
@ -87,7 +100,10 @@ class SenateISVPIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
if smuggled_data.get('force_title'):
title = smuggled_data['force_title']
else:
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
poster = qs.get('poster') poster = qs.get('poster')
if poster: if poster:
thumbnail = poster[0] thumbnail = poster[0]

Loading…
Cancel
Save