From ecb3bfe543ada5e4433522efc05c49a71da8b03f Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Mon, 7 Nov 2011 18:02:10 -0800 Subject: [PATCH 1/8] going home and need to upload what little i did --- youtube-dl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube-dl b/youtube-dl index 30a02e5cd..cb31d13c9 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3481,6 +3481,13 @@ class XVideosIE(InfoExtractor): self._downloader.trouble(u'\nERROR: unable to download ' + video_id) +class SoundcloudIE(InformationExtractor): + """Information extractor for soundcloud.com""" + + _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/(\w\d-]+)' + IE_NAME = u'soundcloud' + + class PostProcessor(object): """Post Processor class. From 40306424b1fa34284060740e0d8dd0c2859054b2 Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Tue, 8 Nov 2011 00:03:35 -0800 Subject: [PATCH 2/8] work on soundcloud information extractor...need to talk to youtube-dl guys --- youtube-dl | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index cb31d13c9..263ae6540 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3484,9 +3484,63 @@ class XVideosIE(InfoExtractor): class SoundcloudIE(InformationExtractor): """Information extractor for soundcloud.com""" - _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/(\w\d-]+)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'soundcloud' + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _real_initialize(self): + return + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + # extract uploader + uploader = mobj.group(3).decode('utf-8') + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(4).decode('utf-8') + simple_title = uploader + '-' + slug_title + + self.report_webpage('%s/%s' % (uploader, slug_title)) + + request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title)) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + self.report_extraction('%s/%s' % (uploader, slug_title)) + + # extract video_id (soundcloud uid of song) + mobj = re.search + + try: + self._download.process_info({ + 'id': video_id, + 'url': video_url, + 'uploader': uploader, + 'upload_date': u'NA', + 'title': video_title, + 'stitle': simple_title, + 'ext': u'mp3', + 'format': u'NA', + 'player_url': None, + }) class PostProcessor(object): """Post Processor class. From 073d7a5985ad6ffef98db72217ba3c1f6d0ecd5c Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Wed, 9 Nov 2011 01:52:36 -0800 Subject: [PATCH 3/8] extracted all of the soundcloud information including description (not tested), need to hook into filedownloader --- youtube-dl | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/youtube-dl b/youtube-dl index 263ae6540..48616015d 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3482,7 +3482,13 @@ class XVideosIE(InfoExtractor): class SoundcloudIE(InformationExtractor): - """Information extractor for soundcloud.com""" + """Information extractor for soundcloud.com + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from an url composed + of the stream token and uid + """ _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'soundcloud' @@ -3509,7 +3515,7 @@ class SoundcloudIE(InformationExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - # extract uploader + # extract uploader (which is in the url) uploader = mobj.group(3).decode('utf-8') # extract simple title (uploader + slug of song title) slug_title = mobj.group(4).decode('utf-8') @@ -3526,20 +3532,42 @@ class SoundcloudIE(InformationExtractor): self.report_extraction('%s/%s' % (uploader, slug_title)) - # extract video_id (soundcloud uid of song) - mobj = re.search + # extract uid and access token + mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', page) + if mobj: + video_id = match.group(1) + stream_token = match.group(2) + + # construct media url (with uid/token) to request song + mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" + mediaURL = mediaURL % (video_id, stream_token) + + # description + description = u'No description available' + mobj = re.search('track-description-value">

(.*?)

', page) + if mobj: + description = mobj.group(1) + + # upload date + mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)", page) + if mobj: + try: + upload_date = datetime.datetime.strptime(match.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') + except: + pass try: self._download.process_info({ 'id': video_id, 'url': video_url, 'uploader': uploader, - 'upload_date': u'NA', + 'upload_date': upload_date, 'title': video_title, 'stitle': simple_title, 'ext': u'mp3', 'format': u'NA', 'player_url': None, + 'description': description }) class PostProcessor(object): From b20d4f8626783ae61f5865a4d9aa3f460053c9a4 Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Thu, 10 Nov 2011 01:04:33 -0800 Subject: [PATCH 4/8] changed spaces to tabs (by yt-dl standards), fixed bugs, but still won't download. need to figure out how the whole process works to integrate correctly --- youtube-dl | 93 +++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/youtube-dl b/youtube-dl index 48616015d..6eafc30b1 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3481,20 +3481,20 @@ class XVideosIE(InfoExtractor): self._downloader.trouble(u'\nERROR: unable to download ' + video_id) -class SoundcloudIE(InformationExtractor): +class SoundcloudIE(InfoExtractor): """Information extractor for soundcloud.com - To access the media, the uid of the song and a stream token - must be extracted from the page source and the script must make - a request to media.soundcloud.com/crossdomain.xml. Then - the media can be grabbed by requesting from an url composed - of the stream token and uid - """ + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from an url composed + of the stream token and uid + """ _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' IE_NAME = u'soundcloud' - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) def report_webpage(self, video_id): """Report information extraction.""" @@ -3504,8 +3504,8 @@ class SoundcloudIE(InformationExtractor): """Report information extraction.""" self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) - def _real_initialize(self): - return + def _real_initialize(self): + return def _real_extract(self, url): htmlParser = HTMLParser.HTMLParser() @@ -3515,10 +3515,10 @@ class SoundcloudIE(InformationExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - # extract uploader (which is in the url) - uploader = mobj.group(3).decode('utf-8') - # extract simple title (uploader + slug of song title) - slug_title = mobj.group(4).decode('utf-8') + # extract uploader (which is in the url) + uploader = mobj.group(1).decode('utf-8') + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(2).decode('utf-8') simple_title = uploader + '-' + slug_title self.report_webpage('%s/%s' % (uploader, slug_title)) @@ -3532,32 +3532,36 @@ class SoundcloudIE(InformationExtractor): self.report_extraction('%s/%s' % (uploader, slug_title)) - # extract uid and access token - mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', page) - if mobj: - video_id = match.group(1) - stream_token = match.group(2) - - # construct media url (with uid/token) to request song - mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" - mediaURL = mediaURL % (video_id, stream_token) - - # description - description = u'No description available' - mobj = re.search('track-description-value">

(.*?)

', page) - if mobj: - description = mobj.group(1) - - # upload date - mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)", page) - if mobj: - try: - upload_date = datetime.datetime.strptime(match.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') - except: - pass - - try: - self._download.process_info({ + # extract uid and access token + mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', page) + if mobj: + video_id = match.group(1) + stream_token = match.group(2) + + # construct media url (with uid/token) to request song + mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" + mediaURL = mediaURL % (video_id, stream_token) + + # description + description = u'No description available' + mobj = re.search('track-description-value">

(.*?)

', page) + if mobj: + description = mobj.group(1) + + # upload date + mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)", page) + if mobj: + try: + upload_date = datetime.datetime.strptime(match.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') + except: + pass + + # for soundcloud, a request must be made to a cross domain to establish + # needed cookies + request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers) + + try: + self._downloader.process_info({ 'id': video_id, 'url': video_url, 'uploader': uploader, @@ -3567,8 +3571,10 @@ class SoundcloudIE(InformationExtractor): 'ext': u'mp3', 'format': u'NA', 'player_url': None, - 'description': description - }) + 'description': description + }) + except UnavailableVideoError: + self._downloader.trouble(u'\nERROR: unable to download video') class PostProcessor(object): """Post Processor class. @@ -3966,6 +3972,7 @@ def gen_extractors(): EscapistIE(), CollegeHumorIE(), XVideosIE(), + SoundcloudIE(), GenericIE() ] From 871be928a89be62f89ec21a1d9cb700dbddcaeca Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Sat, 12 Nov 2011 16:48:43 -0800 Subject: [PATCH 5/8] now downloads soundcloud songs, need to polish title grabbing and file naming --- youtube-dl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube-dl b/youtube-dl index 6eafc30b1..949d2e532 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3533,10 +3533,10 @@ class SoundcloudIE(InfoExtractor): self.report_extraction('%s/%s' % (uploader, slug_title)) # extract uid and access token - mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', page) + mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage) if mobj: - video_id = match.group(1) - stream_token = match.group(2) + video_id = mobj.group(1) + stream_token = mobj.group(2) # construct media url (with uid/token) to request song mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" @@ -3544,17 +3544,18 @@ class SoundcloudIE(InfoExtractor): # description description = u'No description available' - mobj = re.search('track-description-value">

(.*?)

', page) + mobj = re.search('track-description-value">

(.*?)

', webpage) if mobj: description = mobj.group(1) # upload date - mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)", page) + upload_date = None + mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)", webpage) if mobj: try: - upload_date = datetime.datetime.strptime(match.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') - except: - pass + upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') + except Exception as e: + print str(e) # for soundcloud, a request must be made to a cross domain to establish # needed cookies @@ -3563,10 +3564,10 @@ class SoundcloudIE(InfoExtractor): try: self._downloader.process_info({ 'id': video_id, - 'url': video_url, + 'url': mediaURL, 'uploader': uploader, 'upload_date': upload_date, - 'title': video_title, + 'title': simple_title, 'stitle': simple_title, 'ext': u'mp3', 'format': u'NA', From ec574c2c416eb2b2238841d3f12c66603af29b2a Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Sat, 12 Nov 2011 17:08:40 -0800 Subject: [PATCH 6/8] extracts full title from source --- youtube-dl | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube-dl b/youtube-dl index 949d2e532..f717f35ad 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3532,13 +3532,18 @@ class SoundcloudIE(InfoExtractor): self.report_extraction('%s/%s' % (uploader, slug_title)) - # extract uid and access token + # extract uid and stream token that soundcloud hands out for access mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage) if mobj: video_id = mobj.group(1) stream_token = mobj.group(2) - # construct media url (with uid/token) to request song + # extract unsimplified title + mobj = re.search('"title":"(.*?)",', webpage) + if mobj: + title = mobj.group(1) + + # construct media url (with uid/token) mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" mediaURL = mediaURL % (video_id, stream_token) @@ -3557,22 +3562,21 @@ class SoundcloudIE(InfoExtractor): except Exception as e: print str(e) - # for soundcloud, a request must be made to a cross domain to establish - # needed cookies + # for soundcloud, a request to a cross domain is required for cookies request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers) try: self._downloader.process_info({ - 'id': video_id, + 'id': video_id.decode('utf-8'), 'url': mediaURL, - 'uploader': uploader, + 'uploader': uploader.decode('utf-8'), 'upload_date': upload_date, - 'title': simple_title, - 'stitle': simple_title, + 'title': simple_title.decode('utf-8'), + 'stitle': simple_title.decode('utf-8'), 'ext': u'mp3', 'format': u'NA', 'player_url': None, - 'description': description + 'description': description.decode('utf-8') }) except UnavailableVideoError: self._downloader.trouble(u'\nERROR: unable to download video') From 208c4b9128d1476a259cb416f4662d2574974233 Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Sat, 12 Nov 2011 17:10:21 -0800 Subject: [PATCH 7/8] added whitespace below soundcloudIE class --- youtube-dl | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube-dl b/youtube-dl index f717f35ad..2e29bad75 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3581,6 +3581,7 @@ class SoundcloudIE(InfoExtractor): except UnavailableVideoError: self._downloader.trouble(u'\nERROR: unable to download video') + class PostProcessor(object): """Post Processor class. From 38348005b3f4d30ed8e4ca4784decce4f7631f42 Mon Sep 17 00:00:00 2001 From: Kevin Ngo Date: Sat, 12 Nov 2011 17:28:26 -0800 Subject: [PATCH 8/8] removed weird indent --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index 2e29bad75..f85aa8e42 100755 --- a/youtube-dl +++ b/youtube-dl @@ -3978,7 +3978,7 @@ def gen_extractors(): EscapistIE(), CollegeHumorIE(), XVideosIE(), - SoundcloudIE(), + SoundcloudIE(), GenericIE() ]