[yahoo] Fix video extraction (fixes #1521)

There's no need to use two different extraction methods.
Videos can now also be downloaded over HTTP when possible.
The test for RTMP videos is now run as well, but it skips the download.
master
Jaime Marquínez Ferrándiz 11 years ago
parent 123c10608d
commit 9c15e9de84
  1. 138
      youtube_dl/extractor/yahoo.py

@ -1,4 +1,3 @@
import datetime
import itertools import itertools
import json import json
import re import re
@ -6,86 +5,85 @@ import re
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from ..utils import ( from ..utils import (
compat_urllib_parse, compat_urllib_parse,
compat_urlparse,
ExtractorError, determine_ext,
clean_html,
) )
class YahooIE(InfoExtractor): class YahooIE(InfoExtractor):
IE_DESC = u'Yahoo screen' IE_DESC = u'Yahoo screen'
_VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
_TEST = { _TESTS = [
u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', {
u'file': u'214727115.flv', u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
u'md5': u'2e717f169c1be93d84d3794a00d4a325', u'file': u'214727115.mp4',
u'info_dict': { u'info_dict': {
u"title": u"Julian Smith & Travis Legg Watch Julian Smith" u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
u'description': u'Julian and Travis watch Julian Smith',
},
},
{
u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
u'file': u'103000935.flv',
u'info_dict': {
u'title': u'The Cougar Lies with Spanish Moss',
u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
u'params': {
# Requires rtmpdump
u'skip_download': True,
},
}, },
u'skip': u'Requires rtmpdump' ]
}
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$',
if m_id is None: webpage, u'items', flags=re.MULTILINE)
# TODO: Check which url parameters are required items = json.loads(items_json)
info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id info = items['mediaItems']['query']['results']['mediaObj'][0]
webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') meta = info['meta']
info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.* formats = []
<media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.* for s in info['streams']:
<media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB" format_info = {
''' 'width': s.get('width'),
self.report_extraction(video_id) 'height': s.get('height'),
m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL) 'bitrate': s.get('bitrate'),
if m_info is None: }
raise ExtractorError(u'Unable to extract video info')
video_title = m_info.group('title') host = s['host']
video_description = m_info.group('description') path = s['path']
video_thumb = m_info.group('thumb') if host.startswith('rtmp'):
video_date = m_info.group('date') format_info.update({
video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d') 'url': host,
'play_path': path,
# TODO: Find a way to get mp4 videos 'ext': 'flv',
rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id })
webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage') else:
m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage) format_url = compat_urlparse.urljoin(host, path)
video_url = m_rest.group('url') format_info['url'] = format_url
video_path = m_rest.group('path') format_info['ext'] = determine_ext(format_url)
if m_rest is None:
raise ExtractorError(u'Unable to extract video url') formats.append(format_info)
formats = sorted(formats, key=lambda f:(f['height'], f['width']))
else: # We have to use a different method if another id is defined
long_id = m_id.group('new_id') info = {
info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335' 'id': video_id,
webpage = self._download_webpage(info_url, video_id, u'Downloading info json') 'title': meta['title'],
json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1) 'formats': formats,
info = json.loads(json_str) 'description': clean_html(meta['description']),
res = info[u'query'][u'results'][u'mediaObj'][0] 'thumbnail': meta['thumbnail'],
stream = res[u'streams'][0] }
video_path = stream[u'path'] # TODO: Remove when #980 has been merged
video_url = stream[u'host'] info.update(formats[-1])
meta = res[u'meta']
video_title = meta[u'title'] return info
video_description = meta[u'description']
video_thumb = meta[u'thumbnail']
video_date = None # I can't find it
info_dict = {
'id': video_id,
'url': video_url,
'play_path': video_path,
'title':video_title,
'description': video_description,
'thumbnail': video_thumb,
'upload_date': video_date,
'ext': 'flv',
}
return info_dict
class YahooSearchIE(SearchInfoExtractor): class YahooSearchIE(SearchInfoExtractor):
IE_DESC = u'Yahoo screen search' IE_DESC = u'Yahoo screen search'

Loading…
Cancel
Save