13 changed files with 381 additions and 36 deletions
Unified View
Diff Options
-
4devscripts/youtube_genalgo.py
-
28youtube_dl/FileDownloader.py
-
4youtube_dl/__init__.py
-
4youtube_dl/extractor/__init__.py
-
76youtube_dl/extractor/addanime.py
-
167youtube_dl/extractor/appletrailers.py
-
58youtube_dl/extractor/cnn.py
-
2youtube_dl/extractor/common.py
-
4youtube_dl/extractor/googleplus.py
-
33youtube_dl/extractor/nbc.py
-
2youtube_dl/extractor/youtube.py
-
33youtube_dl/utils.py
-
2youtube_dl/version.py
@ -0,0 +1,76 @@ |
|||||
|
import ast |
||||
|
import re |
||||
|
|
||||
|
from .common import InfoExtractor |
||||
|
from ..utils import ( |
||||
|
compat_HTTPError, |
||||
|
compat_str, |
||||
|
compat_urllib_parse, |
||||
|
compat_urllib_parse_urlparse, |
||||
|
|
||||
|
ExtractorError, |
||||
|
) |
||||
|
|
||||
|
|
||||
|
class AddAnimeIE(InfoExtractor):
    """Extractor for add-anime.net ``watch_video.php`` pages.

    The first request for the video page may fail with an HTTP error whose
    body carries a "challenge-form" (an anti-bot check; the ``jschl_*``
    field names suggest Cloudflare, but that is not confirmed by this
    file).  When that happens the arithmetic task embedded in the error
    page is solved, the answer is submitted to the form's action URL and
    the video page is requested a second time.
    """

    # The dot of "watch_video.php" is escaped so that only a literal dot
    # matches (an unescaped "." would accept any character there).
    _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'AddAnime'
    _TEST = {
        u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
        u'file': u'24MR3YO5SAS9.flv',
        u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1',
        u'info_dict': {
            u"description": u"One Piece 606",
            u"title": u"One Piece 606"
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the video addressed by *url*.

        Raises ExtractorError when the download fails for a reason other
        than an HTTP error, or when the challenge page does not contain the
        expected form fields or arithmetic task.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')

        # Only the download itself can raise the ExtractorError handled
        # below, so the try body is kept to that single call.
        try:
            webpage = self._download_webpage(url, video_id)
        except ExtractorError as ee:
            if not isinstance(ee.cause, compat_HTTPError):
                raise

            # The body of the HTTP error response holds the challenge form.
            redir_webpage = ee.cause.read().decode('utf-8')
            action = self._search_regex(
                r'<form id="challenge-form" action="([^"]+)"',
                redir_webpage, u'Redirect form')
            vc = self._search_regex(
                r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
                redir_webpage, u'redirect vc value')
            av = re.search(
                r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
                redir_webpage)
            if av is None:
                raise ExtractorError(u'Cannot find redirect math task')
            # The page's expression is "x + y * z"; multiplication binds
            # tighter than addition, exactly as written here.
            av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3))

            parsed_url = compat_urllib_parse_urlparse(url)
            # The submitted answer is the task result plus the length of
            # the host name.
            av_val = av_res + len(parsed_url.netloc)
            confirm_url = (
                parsed_url.scheme + u'://' + parsed_url.netloc +
                action + '?' +
                compat_urllib_parse.urlencode({
                    'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
            self._download_webpage(
                confirm_url, video_id,
                note=u'Confirming after redirect')
            # Retry the original page now that the answer was submitted.
            webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r"var normal_video_file = '(.*?)';",
                                       webpage, u'video file URL')
        video_title = self._og_search_title(webpage)
        video_description = self._og_search_description(webpage)

        return {
            '_type': 'video',
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'description': video_description
        }
@ -0,0 +1,167 @@ |
|||||
|
import re |
||||
|
import xml.etree.ElementTree |
||||
|
|
||||
|
from .common import InfoExtractor |
||||
|
from ..utils import ( |
||||
|
determine_ext, |
||||
|
ExtractorError, |
||||
|
) |
||||
|
|
||||
|
|
||||
|
class AppleTrailersIE(InfoExtractor):
    """Extractor for movie pages on trailers.apple.com.

    A movie page is turned into a playlist with one entry per ``<li>`` of
    the movie's ``includes/playlists/web.inc`` snippet.  For every entry the
    available sizes are looked up through the per-size pages and the
    per-trailer detail pages, which finally contain the media link.
    """

    # Dots in the host name are escaped so they only match literal dots.
    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
    _TEST = {
        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
        u"playlist": [
            {
                u"file": u"manofsteel-trailer4.mov",
                u"md5": u"11874af099d480cc09e103b189805d5f",
                u"info_dict": {
                    u"duration": 111,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",
                    u"title": u"Trailer 4",
                    u"upload_date": u"20130523",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-trailer3.mov",
                u"md5": u"07a0a262aae5afe68120eed61137ab34",
                u"info_dict": {
                    u"duration": 182,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",
                    u"title": u"Trailer 3",
                    u"upload_date": u"20130417",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-trailer.mov",
                u"md5": u"e401fde0813008e3307e54b6f384cff1",
                u"info_dict": {
                    u"duration": 148,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",
                    u"title": u"Trailer",
                    u"upload_date": u"20121212",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-teaser.mov",
                u"md5": u"76b392f2ae9e7c98b22913c10a639c97",
                u"info_dict": {
                    u"duration": 93,
                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",
                    u"title": u"Teaser",
                    u"upload_date": u"20120721",
                    u"uploader_id": u"wb",
                },
            }
        ]
    }

    def _real_extract(self, url):
        """Return a playlist dict with one video entry per listed trailer.

        Raises ExtractorError when a detail page does not contain the
        expected ``movieLink`` anchor, when the media link is not a ``.mov``
        file, or when no format could be found for a trailer.
        """
        mobj = re.match(self._VALID_URL, url)
        movie = mobj.group('movie')
        uploader_id = mobj.group('company')

        playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc'
        playlist_snippet = self._download_webpage(playlist_url, movie)
        # Script elements are dropped and the snippet is wrapped in a single
        # root element before it is handed to the XML parser.
        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet)
        playlist_html = u'<html>' + playlist_cleaned + u'</html>'

        # Parsed size pages, keyed by size id, shared by all trailers so
        # that each size page is downloaded only once.
        size_cache = {}

        doc = xml.etree.ElementTree.fromstring(playlist_html)
        playlist = []
        for li in doc.findall('./div/ul/li'):
            title = li.find('.//h3').text
            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
            thumbnail = li.find('.//img').attrib['src']

            date_el = li.find('.//p')
            upload_date = None
            m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text)
            if m:
                # The page gives a two-digit year; "20" is prepended.
                upload_date = u'20' + m.group('year') + m.group('month') + m.group('day')
            runtime_el = date_el.find('./br')
            m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail)
            duration = None
            if m:
                duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))

            formats = []
            for formats_el in li.findall('.//a'):
                # .get() so that anchors without a class attribute are
                # skipped instead of raising KeyError.
                if formats_el.get('class') != 'OverlayPanel':
                    continue
                target = formats_el.attrib['target']

                format_code = formats_el.text
                if 'Automatic' in format_code:
                    continue

                size_q = formats_el.attrib['href']
                size_id = size_q.rpartition('#videos-')[2]
                if size_id not in size_cache:
                    size_url = url + size_q
                    sizepage_html = self._download_webpage(
                        size_url, movie,
                        note=u'Downloading size info %s' % size_id,
                        errnote=u'Error while downloading size info %s' % size_id,
                    )
                    _doc = xml.etree.ElementTree.fromstring(sizepage_html)
                    size_cache[size_id] = _doc

                sizepage_doc = size_cache[size_id]
                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a')
                for vid_a in links:
                    href = vid_a.get('href')
                    if not href.endswith(target):
                        continue
                    detail_q = href.partition('#')[0]
                    detail_url = url + '/' + detail_q

                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q)
                    detail_id = m.group('detail_id')

                    detail_html = self._download_webpage(
                        detail_url, movie,
                        note=u'Downloading detail %s %s' % (detail_id, size_id),
                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id)
                    )
                    detail_doc = xml.etree.ElementTree.fromstring(detail_html)
                    movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a')
                    # Explicit checks instead of assert: assertions are
                    # removed when Python runs with -O.
                    if movie_link_el is None or movie_link_el.get('class') != 'movieLink':
                        raise ExtractorError(
                            u'Cannot find movie link in detail %s %s' % (detail_id, size_id))
                    # NOTE(review): this replaces every underscore in the
                    # link, not only the one before the size suffix --
                    # confirm that is intended.
                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h')
                    ext = determine_ext(movie_link)
                    if ext != 'mov':
                        raise ExtractorError(
                            u'Unexpected file extension %s in detail %s %s' % (ext, detail_id, size_id))

                    formats.append({
                        'format': format_code,
                        'ext': ext,
                        'url': movie_link,
                    })

            if not formats:
                # Without this check the formats[-1] lookups below would
                # fail with a bare IndexError.
                raise ExtractorError(u'No formats found for %s' % video_id)

            info = {
                '_type': 'video',
                'id': video_id,
                'title': title,
                'formats': formats,
                'duration': duration,
                'thumbnail': thumbnail,
                'upload_date': upload_date,
                'uploader_id': uploader_id,
                'user_agent': 'QuickTime compatible (youtube-dl)',
            }
            # TODO: Remove when #980 has been merged
            info['url'] = formats[-1]['url']
            info['ext'] = formats[-1]['ext']

            playlist.append(info)

        return {
            '_type': 'playlist',
            'id': movie,
            'entries': playlist,
        }
@ -0,0 +1,58 @@ |
|||||
|
import re |
||||
|
import xml.etree.ElementTree |
||||
|
|
||||
|
from .common import InfoExtractor |
||||
|
from ..utils import determine_ext |
||||
|
|
||||
|
|
||||
|
class CNNIE(InfoExtractor):
    """Extractor for videos on cnn.com / edition.cnn.com.

    The video path taken from the URL is used to fetch an ``index.xml``
    metadata document, from which the title, description, thumbnails and
    the media file with the largest (width, height, bitrate) are read.
    """

    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/
        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''

    _TESTS = [{
        u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
        u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
        u'md5': u'3e6121ea48df7e2259fe73a0628605c4',
        u'info_dict': {
            u'title': u'Nadal wins 8th French Open title',
            u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
        },
    },
    {
        u"url": u"http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
        u"file": u"us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
        u"md5": u"b5cc60c60a3477d185af8f19a2a26f4e",
        u"info_dict": {
            u"title": "Student's epic speech stuns new freshmen",
            u"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\""
        }
    }]

    def _real_extract(self, url):
        """Return the info dict for the CNN video addressed by *url*.

        The ``thumbnail`` key is only present when the metadata lists at
        least one image; ``thumbnails`` is always present (possibly empty).
        """
        mobj = re.match(self._VALID_URL, url)
        path = mobj.group('path')
        page_title = mobj.group('title')
        info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
        info_xml = self._download_webpage(info_url, page_title)
        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))

        # Each entry is (width, height, bitrate, path) so that plain tuple
        # ordering puts the largest variant last.  A missing "_<n>k" suffix
        # counts as bitrate 0.
        formats = []
        for f in info.findall('files/file'):
            mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?', f.attrib['bitrate'])
            if mf is not None:
                formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text))
        formats = sorted(formats)
        (_, _, _, video_path) = formats[-1]
        video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path

        # Sorted by (height, width), smallest first.
        thumbnails = sorted([((int(t.attrib['height']), int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
        thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]

        result = {
            'id': info.attrib['id'],
            'title': info.find('headline').text,
            'url': video_url,
            'ext': determine_ext(video_url),
            'thumbnails': thumbs_dict,
            'description': info.find('description').text,
        }
        # Indexing an empty list would raise IndexError, so the largest
        # thumbnail is only reported when the metadata listed any image.
        if thumbnails:
            result['thumbnail'] = thumbnails[-1][1]
        return result
@ -0,0 +1,33 @@ |
|||||
|
import re |
||||
|
import xml.etree.ElementTree |
||||
|
|
||||
|
from .common import InfoExtractor |
||||
|
from ..utils import find_xpath_attr, compat_str |
||||
|
|
||||
|
|
||||
|
class NBCNewsIE(InfoExtractor):
    """Extractor for nbcnews.com video pages.

    The numeric id from the URL is used to fetch an XML document whose
    ``video`` element supplies the headline, caption and media links.
    """

    _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'

    _TEST = {
        u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
        u'file': u'52753292.flv',
        u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
        u'info_dict': {
            u'title': u'Crew emerges after four-month Mars food study',
            u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the NBC News video addressed by *url*."""
        video_id = re.match(self._VALID_URL, url).group('id')

        metadata_url = 'http://www.nbcnews.com/id/%s/displaymode/1219' % video_id
        metadata_xml = self._download_webpage(metadata_url, video_id)
        root = xml.etree.ElementTree.fromstring(metadata_xml.encode('utf-8'))
        video_el = root.find('video')

        # Values are looked up in the same order as the keys are listed in
        # the returned dict.
        headline = video_el.find('headline').text
        flash_url = find_xpath_attr(video_el, 'media', 'type', 'flashVideo').text
        caption = compat_str(video_el.find('caption').text)
        thumbnail_url = find_xpath_attr(video_el, 'media', 'type', 'thumbnail').text

        return {
            'id': video_id,
            'title': headline,
            'ext': 'flv',
            'url': flash_url,
            'description': caption,
            'thumbnail': thumbnail_url,
        }
@ -1,2 +1,2 @@ |
|||||
|
|
||||
__version__ = '2013.08.27' |
|
||||
|
__version__ = '2013.08.28' |
Write
Preview
Loading…
Cancel
Save