From c938c35f957ea069eed824131ca908608853abe9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sat, 2 May 2015 07:18:22 +0600
Subject: [PATCH] [iconosquare] Fix extraction

---
 youtube_dl/extractor/iconosquare.py | 57 ++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py
index 370e86e5a..70e4c0d41 100644
--- a/youtube_dl/extractor/iconosquare.py
+++ b/youtube_dl/extractor/iconosquare.py
@@ -1,36 +1,75 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import int_or_none
 
 
 class IconosquareIE(InfoExtractor):
-    _VALID_URL = r'https?://(www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
     _TEST = {
         'url': 'http://statigr.am/p/522207370455279102_24101272',
         'md5': '6eb93b882a3ded7c378ee1d6884b1814',
         'info_dict': {
             'id': '522207370455279102_24101272',
             'ext': 'mp4',
-            'uploader_id': 'aguynamedpatrick',
-            'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
+            'title': 'Instagram media by @aguynamedpatrick (Patrick Janelle)',
             'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
+            'timestamp': 1376471991,
+            'upload_date': '20130814',
+            'uploader': 'aguynamedpatrick',
+            'uploader_id': '24101272',
+            'comment_count': int,
+            'like_count': int,
         },
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+
         webpage = self._download_webpage(url, video_id)
+
+        media = self._parse_json(
+            self._search_regex(
+                r'window\.media\s*=\s*({.+?});\n', webpage, 'media'),
+            video_id)
+
+        formats = [{
+            'url': f['url'],
+            'format_id': format_id,
+            'width': int_or_none(f.get('width')),
+            'height': int_or_none(f.get('height'))
+        } for format_id, f in media['videos'].items()]
+        self._sort_formats(formats)
+
         title = self._html_search_regex(
             r'<title>(.+?)(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)</title>',
             webpage, 'title')
-        uploader_id = self._html_search_regex(
-            r'@([^ ]+)', title, 'uploader name', fatal=False)
+
+        timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time'))
+        description = media.get('caption', {}).get('text')
+
+        uploader = media.get('user', {}).get('username')
+        uploader_id = media.get('user', {}).get('id')
+
+        comment_count = int_or_none(media.get('comments', {}).get('count'))
+        like_count = int_or_none(media.get('likes', {}).get('count'))
+
+        thumbnails = [{
+            'url': t['url'],
+            'id': thumbnail_id,
+            'width': int_or_none(t.get('width')),
+            'height': int_or_none(t.get('height'))
+        } for thumbnail_id, t in media.get('images', {}).items()]
 
         return {
             'id': video_id,
-            'url': self._og_search_video_url(webpage),
             'title': title,
-            'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'uploader_id': uploader_id
+            'description': description,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'comment_count': comment_count,
+            'like_count': like_count,
+            'formats': formats,
         }