[platzi] Add extractor (closes #20562)
parent
19591facea
commit
c701472fc9
2 changed files with 221 additions and 0 deletions
@ -0,0 +1,217 @@ |
||||
# coding: utf-8 |
||||
from __future__ import unicode_literals |
||||
|
||||
from .common import InfoExtractor |
||||
from ..compat import ( |
||||
compat_b64decode, |
||||
compat_str, |
||||
) |
||||
from ..utils import ( |
||||
clean_html, |
||||
ExtractorError, |
||||
int_or_none, |
||||
str_or_none, |
||||
try_get, |
||||
url_or_none, |
||||
urlencode_postdata, |
||||
urljoin, |
||||
) |
||||
|
||||
|
||||
class PlatziIE(InfoExtractor): |
||||
_VALID_URL = r'''(?x) |
||||
https?:// |
||||
(?: |
||||
platzi\.com/clases| # es version |
||||
courses\.platzi\.com/classes # en version |
||||
)/[^/]+/(?P<id>\d+)-[^/?\#&]+ |
||||
''' |
||||
_LOGIN_URL = 'https://platzi.com/login/' |
||||
_NETRC_MACHINE = 'platzi' |
||||
|
||||
_TESTS = [{ |
||||
'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', |
||||
'md5': '8f56448241005b561c10f11a595b37e3', |
||||
'info_dict': { |
||||
'id': '12074', |
||||
'ext': 'mp4', |
||||
'title': 'Creando nuestra primera página', |
||||
'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', |
||||
'duration': 420, |
||||
}, |
||||
'skip': 'Requires platzi account credentials', |
||||
}, { |
||||
'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', |
||||
'info_dict': { |
||||
'id': '13430', |
||||
'ext': 'mp4', |
||||
'title': 'Background', |
||||
'description': 'md5:49c83c09404b15e6e71defaf87f6b305', |
||||
'duration': 360, |
||||
}, |
||||
'skip': 'Requires platzi account credentials', |
||||
'params': { |
||||
'skip_download': True, |
||||
}, |
||||
}] |
||||
|
||||
def _real_initialize(self): |
||||
self._login() |
||||
|
||||
def _login(self): |
||||
username, password = self._get_login_info() |
||||
if username is None: |
||||
return |
||||
|
||||
login_page = self._download_webpage( |
||||
self._LOGIN_URL, None, 'Downloading login page') |
||||
|
||||
login_form = self._hidden_inputs(login_page) |
||||
|
||||
login_form.update({ |
||||
'email': username, |
||||
'password': password, |
||||
}) |
||||
|
||||
urlh = self._request_webpage( |
||||
self._LOGIN_URL, None, 'Logging in', |
||||
data=urlencode_postdata(login_form), |
||||
headers={'Referer': self._LOGIN_URL}) |
||||
|
||||
# login succeeded |
||||
if 'platzi.com/login' not in compat_str(urlh.geturl()): |
||||
return |
||||
|
||||
login_error = self._webpage_read_content( |
||||
urlh, self._LOGIN_URL, None, 'Downloading login error page') |
||||
|
||||
login = self._parse_json( |
||||
self._search_regex( |
||||
r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), |
||||
None) |
||||
|
||||
for kind in ('error', 'password', 'nonFields'): |
||||
error = str_or_none(login.get('%sError' % kind)) |
||||
if error: |
||||
raise ExtractorError( |
||||
'Unable to login: %s' % error, expected=True) |
||||
raise ExtractorError('Unable to log in') |
||||
|
||||
def _real_extract(self, url): |
||||
lecture_id = self._match_id(url) |
||||
|
||||
webpage = self._download_webpage(url, lecture_id) |
||||
|
||||
data = self._parse_json( |
||||
self._search_regex( |
||||
r'client_data\s*=\s*({.+?})\s*;', webpage, 'client data'), |
||||
lecture_id) |
||||
|
||||
material = data['initialState']['material'] |
||||
desc = material['description'] |
||||
title = desc['title'] |
||||
|
||||
formats = [] |
||||
for server_id, server in material['videos'].items(): |
||||
if not isinstance(server, dict): |
||||
continue |
||||
for format_id in ('hls', 'dash'): |
||||
format_url = url_or_none(server.get(format_id)) |
||||
if not format_url: |
||||
continue |
||||
if format_id == 'hls': |
||||
formats.extend(self._extract_m3u8_formats( |
||||
format_url, lecture_id, 'mp4', |
||||
entry_protocol='m3u8_native', m3u8_id=format_id, |
||||
note='Downloading %s m3u8 information' % server_id, |
||||
fatal=False)) |
||||
elif format_id == 'dash': |
||||
formats.extend(self._extract_mpd_formats( |
||||
format_url, lecture_id, mpd_id=format_id, |
||||
note='Downloading %s MPD manifest' % server_id, |
||||
fatal=False)) |
||||
self._sort_formats(formats) |
||||
|
||||
content = str_or_none(desc.get('content')) |
||||
description = (clean_html(compat_b64decode(content).decode('utf-8')) |
||||
if content else None) |
||||
duration = int_or_none(material.get('duration'), invscale=60) |
||||
|
||||
return { |
||||
'id': lecture_id, |
||||
'title': title, |
||||
'description': description, |
||||
'duration': duration, |
||||
'formats': formats, |
||||
} |
||||
|
||||
|
||||
class PlatziCourseIE(InfoExtractor): |
||||
_VALID_URL = r'''(?x) |
||||
https?:// |
||||
(?: |
||||
platzi\.com/clases| # es version |
||||
courses\.platzi\.com/classes # en version |
||||
)/(?P<id>[^/?\#&]+) |
||||
''' |
||||
_TESTS = [{ |
||||
'url': 'https://platzi.com/clases/next-js/', |
||||
'info_dict': { |
||||
'id': '1311', |
||||
'title': 'Curso de Next.js', |
||||
}, |
||||
'playlist_count': 22, |
||||
}, { |
||||
'url': 'https://courses.platzi.com/classes/communication-codestream/', |
||||
'info_dict': { |
||||
'id': '1367', |
||||
'title': 'Codestream Course', |
||||
}, |
||||
'playlist_count': 14, |
||||
}] |
||||
|
||||
@classmethod |
||||
def suitable(cls, url): |
||||
return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) |
||||
|
||||
def _real_extract(self, url): |
||||
course_name = self._match_id(url) |
||||
|
||||
webpage = self._download_webpage(url, course_name) |
||||
|
||||
props = self._parse_json( |
||||
self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'), |
||||
course_name)['initialProps'] |
||||
|
||||
entries = [] |
||||
for chapter_num, chapter in enumerate(props['concepts'], 1): |
||||
if not isinstance(chapter, dict): |
||||
continue |
||||
materials = chapter.get('materials') |
||||
if not materials or not isinstance(materials, list): |
||||
continue |
||||
chapter_title = chapter.get('title') |
||||
chapter_id = str_or_none(chapter.get('id')) |
||||
for material in materials: |
||||
if not isinstance(material, dict): |
||||
continue |
||||
if material.get('material_type') != 'video': |
||||
continue |
||||
video_url = urljoin(url, material.get('url')) |
||||
if not video_url: |
||||
continue |
||||
entries.append({ |
||||
'_type': 'url_transparent', |
||||
'url': video_url, |
||||
'title': str_or_none(material.get('name')), |
||||
'id': str_or_none(material.get('id')), |
||||
'ie_key': PlatziIE.ie_key(), |
||||
'chapter': chapter_title, |
||||
'chapter_number': chapter_num, |
||||
'chapter_id': chapter_id, |
||||
}) |
||||
|
||||
course_id = compat_str(try_get(props, lambda x: x['course']['id'])) |
||||
course_title = try_get(props, lambda x: x['course']['name'], compat_str) |
||||
|
||||
return self.playlist_result(entries, course_id, course_title) |
Loading…
Reference in new issue