Browse Source

[googledrive] Add new extractor

master
remitamine 9 years ago
parent
commit
984e4d4875
2 changed files with 107 additions and 0 deletions
  1. 1
      youtube_dl/extractor/__init__.py
  2. 106
      youtube_dl/extractor/googledrive.py

1
youtube_dl/extractor/__init__.py

@ -209,6 +209,7 @@ from .globo import GloboIE
from .godtube import GodTubeIE
from .goldenmoustache import GoldenMoustacheIE
from .golem import GolemIE
from .googledrive import GoogleDriveIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE

106
youtube_dl/extractor/googledrive.py

@ -0,0 +1,106 @@
from .common import InfoExtractor
from ..utils import RegexNotFoundError
class GoogleDriveIE(InfoExtractor):
    """Information extractor for videos hosted on Google Drive / Google Docs.

    The file page is downloaded and the quoted ``"key","value"`` pairs
    embedded in it (``title``, ``fmt_stream_map``, ``fmt_list``,
    ``length_seconds``) are scraped with regular expressions.
    """

    _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)'
    _TEST = {
        'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1',
        'info_dict': {
            'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U',
            'ext': 'mp4',
            'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4',
        }
    }
    # Container extension for every format id this extractor knows about.
    # The ids are the first field of each fmt_stream_map / fmt_list entry.
    _formats = {
        '5': {'ext': 'flv'},
        '6': {'ext': 'flv'},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp'},
        '18': {'ext': 'mp4'},
        '22': {'ext': 'mp4'},
        '34': {'ext': 'flv'},
        '35': {'ext': 'flv'},
        '36': {'ext': '3gp'},
        '37': {'ext': 'mp4'},
        '38': {'ext': 'mp4'},
        '43': {'ext': 'webm'},
        '44': {'ext': 'webm'},
        '45': {'ext': 'webm'},
        '46': {'ext': 'webm'},
        '59': {'ext': 'mp4'}
    }

    def _real_extract(self, url):
        """Scrape the Drive file page for *url* and build the info dict.

        Returns a dict with ``id``, ``title``, ``duration`` and ``formats``.
        When the page lacks the expected video fields, a warning is reported
        (the page's own ``reason`` text if present, otherwise
        ``'not a video'``) and ``None`` is returned.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://docs.google.com/file/d/' + video_id, video_id,
            encoding='unicode_escape')

        try:
            title = self._html_search_regex(
                r'"title","(?P<title>.*?)"',
                webpage, 'title', group='title')
            fmt_stream_map = self._html_search_regex(
                r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"',
                webpage, 'fmt_stream_map', group='fmt_stream_map')
            fmt_list = self._html_search_regex(
                r'"fmt_list","(?P<fmt_list>.*?)"',
                webpage, 'fmt_list', group='fmt_list')
            length_seconds = self._html_search_regex(
                r'"length_seconds","(?P<length_seconds>.*?)"',
                webpage, 'length_seconds', group='length_seconds')
        except RegexNotFoundError:
            # One of the video fields is missing: prefer the explanation the
            # page itself gives, and fall back to a generic message.
            try:
                reason = self._html_search_regex(
                    r'"reason","(?P<reason>.*?)"',
                    webpage, 'reason', group='reason')
            except RegexNotFoundError:
                reason = 'not a video'
            self.report_warning(reason)
            return

        formats = []
        # fmt_stream_map entries look like "<format id>|<url>" and fmt_list
        # entries like "<format id>/<width>x<height>/..."; both lists are
        # walked in lockstep. zip() stops at the shorter list, so a length
        # mismatch no longer raises IndexError.
        for stream, fmt_desc in zip(
                fmt_stream_map.split(','), fmt_list.split(',')):
            # Split on the first '|' only, so a URL that itself contains '|'
            # stays intact; skip entries that have no separator at all.
            fmt_id, separator, fmt_url = stream.partition('|')
            if not separator:
                continue
            resolution = fmt_desc.split('/')[1]
            width, height = resolution.split('x')
            fmt = {
                'url': fmt_url,
                'format_id': fmt_id,
                'resolution': resolution,
                'width': int(width),
                'height': int(height),
            }
            # Unknown format ids previously raised KeyError. Leave 'ext'
            # unset for them instead of guessing.
            # NOTE(review): confirm downstream code supplies a default
            # extension when 'ext' is absent.
            ext = self._formats.get(fmt_id, {}).get('ext')
            if ext:
                fmt['ext'] = ext
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'duration': int(length_seconds),
            'formats': formats
        }
Loading…
Cancel
Save