Browse Source

Add an extractor for orf.at (closes #1346)

Make find_xpath_attr also accept numbers in the value
master
Jaime Marquínez Ferrándiz 11 years ago
parent
commit
545434670b
3 changed files with 67 additions and 1 deletions
  1. 1
      youtube_dl/extractor/__init__.py
  2. 65
      youtube_dl/extractor/orf.py
  3. 2
      youtube_dl/utils.py

1
youtube_dl/extractor/__init__.py

@ -59,6 +59,7 @@ from .myvideo import MyVideoIE
from .nba import NBAIE
from .nbc import NBCNewsIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .pornotube import PornotubeIE

65
youtube_dl/extractor/orf.py

@ -0,0 +1,65 @@
import re
import xml.etree.ElementTree
import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
ExtractorError,
find_xpath_attr,
)
class ORFIE(InfoExtractor):
_VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
_TEST = {
u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',
u'file': u'6566957.flv',
u'info_dict': {
u'title': u'Wetter',
u'description': u'Christa Kummer, Marcus Wadsak und Kollegen präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',
},
u'params': {
# It uses rtmp
u'skip_download': True,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
playlist = json.loads(playlist_json)
videos = []
ns = '{http://tempuri.org/XMLSchema.xsd}'
xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
webpage_description = self._og_search_description(webpage)
for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
# Get best quality url
rtmp_url = None
for q in ['Q6A', 'Q4A', 'Q1A']:
video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
if video_url is not None:
rtmp_url = video_url.text
break
if rtmp_url is None:
raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
description = self._html_search_regex(
r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
u'description', default=webpage_description, flags=re.DOTALL)
videos.append({
'_type': 'video',
'id': info['id'],
'title': info['title'],
'url': rtmp_url,
'ext': 'flv',
'description': description,
})
return videos

2
youtube_dl/utils.py

@ -213,7 +213,7 @@ if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z]+$', key)
assert re.match(r'^[a-zA-Z@\s]*$', val)
assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
expr = xpath + u"[@%s='%s']" % (key, val)
return node.find(expr)
else:

Loading…
Cancel
Save