1
0
mirror of https://gitlab.com/ytdl-org/youtube-dl.git synced 2026-01-25 00:00:04 -05:00

Compare commits

...

52 Commits

Author SHA1 Message Date
Sergey M․
6d2679ee26 release 2016.08.22 2016-08-22 04:17:34 +07:00
Sergey M․
afbab5688e [ChangeLog] Actualize 2016-08-22 04:15:46 +07:00
Sergey M․
3d897cc791 [ivi] Fix episode number extraction 2016-08-22 03:34:27 +07:00
Sergey M․
cf143c4d97 [ivi] Add support for 720p and 1080p 2016-08-22 03:31:33 +07:00
Yen Chi Hsuan
ad120ae1c5 [extractor/common] Change the default m3u8 protocol in HTML5
Helper functions should have consistent default values
2016-08-22 02:26:07 +08:00
Remita Amine
d0fa172e5f [firsttv] keep a test videos with multiple formats 2016-08-21 19:13:43 +01:00
Yen Chi Hsuan
f97f9f71e5 Merge branch 'TRox1972-charlierose' 2016-08-22 02:11:43 +08:00
Yen Chi Hsuan
526656726b [charlierose] Simplify and improve 2016-08-22 02:06:47 +08:00
Remita Amine
9b8c554ea7 [firsttv] fix extraction(closes #9249) 2016-08-21 17:56:25 +01:00
Yen Chi Hsuan
d13bfc07b7 Merge branch 'charlierose' of https://github.com/TRox1972/youtube-dl into TRox1972-charlierose 2016-08-22 00:48:35 +08:00
Sergey M․
efe470e261 [twitch] Renew authentication 2016-08-21 22:45:50 +07:00
Sergey M․
e3f6b56909 [twitch] Refactor API calls 2016-08-21 22:09:29 +07:00
Sergey M․
b1e676fde8 [twitch] Modernize 2016-08-21 21:28:02 +07:00
Sergey M․
92d4cfa358 [kaltura] Fallback ext calculation on caption's format 2016-08-21 21:01:01 +07:00
Remita Amine
3d47ee0a9e [zingmp3] fix extraction and add support for video clips(closes #10041) 2016-08-21 14:09:48 +01:00
Yen Chi Hsuan
d164a0d41b [README.md] Add a format selection example using comma
Ref: #10399
2016-08-21 20:00:48 +08:00
Déstin Reed
db29af6d36 [charlierose] Add new extractor 2016-08-21 11:29:48 +02:00
Sergey M․
2c6acdfd2d [kaltura] Add test for #10279 2016-08-21 08:37:01 +07:00
Sergey M․
fddaa76a59 [kaltura] Assume ttml to be default subtitles' extension 2016-08-21 08:28:36 +07:00
Sergey M․
a809446750 [kaltura] Add subtitles support when entry_id is unknown beforehand (Closes #10279) 2016-08-21 08:28:36 +07:00
Sergey M․
d8f30a7e66 [kaltura] Remove unused code 2016-08-21 08:28:36 +07:00
Sergey M․
5b1d85754e [YoutubeDL] Autocalculate ext when ext is None 2016-08-21 08:28:36 +07:00
Remita Amine
e25586e471 [cultureunplugged] fix extraction(closes #10330) 2016-08-20 20:02:49 +01:00
Remita Amine
292a2301bf [cnn] add support for money.cnn.com videos(closes #2797) 2016-08-20 19:00:25 +01:00
Remita Amine
dabe15701b [cbs, cbsnews] fix extraction(fixes #10393) 2016-08-20 13:25:32 +01:00
Sergey M․
4245f55880 [dotsub] Replace test (Closes #10386) 2016-08-20 06:18:20 +07:00
Déstin Reed
5b9d187cc6 [imdb] Improve title extraction and make thumbnail non-fatal 2016-08-20 04:50:39 +07:00
Yen Chi Hsuan
39e1c4f08c [litv] Support 'promo' URLs (closes #10385) 2016-08-20 00:52:37 +08:00
Yen Chi Hsuan
19f35402c5 [snotr] Fix extraction (closes #10338) 2016-08-20 00:18:22 +08:00
Yen Chi Hsuan
70852b47ca [utils] Recognize units with full names in parse_filename
Reference: https://en.wikipedia.org/wiki/Template:Quantities_of_bytes
2016-08-20 00:17:26 +08:00
Yen Chi Hsuan
a9a3b4a081 [miomio] Adapt to the new API and update _TESTS
The test case is from #9680
2016-08-20 00:08:23 +08:00
Yen Chi Hsuan
ecc90093f9 [vuclip] Adapt to the new API and update _TEST 2016-08-19 23:56:09 +08:00
Yen Chi Hsuan
520251c093 [extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags 2016-08-19 23:53:47 +08:00
Yen Chi Hsuan
55af45fcab [radiobremen] Update _TEST (closes #10337) 2016-08-19 23:12:30 +08:00
Yen Chi Hsuan
b82232036a [n-tv.de] Fix extraction (closes #10331) 2016-08-19 20:39:28 +08:00
Yen Chi Hsuan
e4659b4547 [utils] Correct octal/hexadecimal number detection in js_to_json 2016-08-19 20:37:17 +08:00
Sergey M․
9e5751b9fe [globo:article] Relax _VALID_URL and video id regex (Closes #10379) 2016-08-19 01:13:45 +07:00
Sergey M․
bd1bcd3ea0 release 2016.08.19 2016-08-19 00:15:12 +07:00
Sergey M․
93a63b36f1 [ChangeLog] Actualize 2016-08-19 00:13:24 +07:00
Sergey M․
8b2dc4c328 [options] Remove output template description from --help
Same reasons as for --format
2016-08-18 23:59:13 +07:00
Sergey M․
850837b67a [porncom] Add extractor (Closes #2251, closes #10251) 2016-08-18 23:52:41 +07:00
Sergey M․
13585d7682 [utils] Recognize lowercase units in parse_filesize 2016-08-18 23:32:00 +07:00
Sergey M․
fd3ec986a4 [generic] Fix dbtv test (Closes #10364) 2016-08-18 21:35:41 +07:00
Sergey M․
b0d578ff7b [dbtv] Relax embed regex 2016-08-18 21:30:55 +07:00
Déstin Reed
b0c8f2e9c8 [DBTV:generic] Add support for embeds 2016-08-18 21:29:27 +07:00
Sergey M․
51815886a9 [vk:wallpost] Fix audio extraction 2016-08-18 06:14:05 +07:00
Sergey M․
08a42f9c74 [vk] Fix authentication on python3 2016-08-18 05:22:23 +07:00
Sergey M․
e15ad9ef09 [keezmovies] PEP 8 2016-08-18 04:39:31 +07:00
Sergey M․
4e9fee1015 [hgtvcom:show] Add extractor (Closes #10365) 2016-08-18 04:37:14 +07:00
Remita Amine
7273e5849b [discoverygo] extend _VALID_URL to support other networks 2016-08-17 11:03:09 +01:00
Sergey M․
b505e98784 [extremetube] Revert display_id 2016-08-17 07:02:13 +07:00
Sergey M․
92cd9fd565 [keezmovies] Make display_id optional 2016-08-17 07:01:32 +07:00
39 changed files with 768 additions and 386 deletions

View File

@@ -6,8 +6,8 @@
---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.17*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.17**
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.22*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated versions will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.22**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2016.08.17
[debug] youtube-dl version 2016.08.22
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}

View File

@@ -1,3 +1,45 @@
version 2016.08.22
Core
* Improve formats and subtitles extension auto calculation
+ Recognize full unit names in parse_filesize
+ Add support for m3u8 manifests in HTML5 multimedia tags
* Fix octal/hexadecimal number detection in js_to_json
Extractors
+ [ivi] Add support for 720p and 1080p
+ [charlierose] Add new extractor (#10382)
* [1tv] Fix extraction (#9249)
* [twitch] Renew authentication
* [kaltura] Improve subtitles extension calculation
+ [zingmp3] Add support for video clips
* [zingmp3] Fix extraction (#10041)
* [kaltura] Improve subtitles extraction (#10279)
* [cultureunplugged] Fix extraction (#10330)
+ [cnn] Add support for money.cnn.com (#2797)
* [cbsnews] Fix extraction (#10362)
* [cbs] Fix extraction (#10393)
+ [litv] Support 'promo' URLs (#10385)
* [snotr] Fix extraction (#10338)
* [n-tv.de] Fix extraction (#10331)
* [globo:article] Relax URL and video id regular expressions (#10379)
version 2016.08.19
Core
- Remove output template description from --help
* Recognize lowercase units in parse_filesize
Extractors
+ [porncom] Add extractor for porn.com (#2251, #10251)
+ [generic] Add support for DBTV embeds
* [vk:wallpost] Fix audio extraction for new site layout
* [vk] Fix authentication
+ [hgtvcom:show] Add extractor for hgtv.com shows (#10365)
+ [discoverygo] Add support for other GO network sites
version 2016.08.17
Core

View File

@@ -201,32 +201,8 @@ which means you can modify it, redistribute it or use it however you like.
-a, --batch-file FILE File containing URLs to download ('-' for
stdin)
--id Use only video ID in file name
-o, --output TEMPLATE Output filename template. Use %(title)s to
get the title, %(uploader)s for the
uploader name, %(uploader_id)s for the
uploader nickname if different,
%(autonumber)s to get an automatically
incremented number, %(ext)s for the
filename extension, %(format)s for the
format description (like "22 - 1280x720" or
"HD"), %(format_id)s for the unique id of
the format (like YouTube's itags: "137"),
%(upload_date)s for the upload date
(YYYYMMDD), %(extractor)s for the provider
(youtube, metacafe, etc), %(id)s for the
video id, %(playlist_title)s,
%(playlist_id)s, or %(playlist)s (=title if
present, ID otherwise) for the playlist the
video is in, %(playlist_index)s for the
position in the playlist. %(height)s and
%(width)s for the width and height of the
video format. %(resolution)s for a textual
description of the resolution of the video
format. %% for a literal percent. Use - to
output to stdout. Can also be used to
download to a different directory, for
example with -o '/my/downloads/%(uploader)s
/%(title)s-%(id)s.%(ext)s' .
-o, --output TEMPLATE Output filename template, see the "OUTPUT
TEMPLATE" section for all the info
--autonumber-size NUMBER Specify the number of digits in
%(autonumber)s when it is present in output
filename template or --auto-number option
@@ -669,7 +645,11 @@ $ youtube-dl -f 'best[filesize<50M]'
# Download best format available via direct link over HTTP/HTTPS protocol
$ youtube-dl -f '(bestvideo+bestaudio/best)[protocol^=http]'
# Download the best video format and the best audio format without merging them
$ youtube-dl -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s'
```
Note that in the last example, an output template is recommended as bestvideo and bestaudio may have the same file name.
# VIDEO SELECTION

View File

@@ -121,6 +121,7 @@
- **CDA**
- **CeskaTelevize**
- **channel9**: Channel 9
- **CharlieRose**
- **Chaturbate**
- **Chilloutzone**
- **chirbit**
@@ -279,6 +280,7 @@
- **Helsinki**: helsinki.fi
- **HentaiStigma**
- **HGTV**
- **hgtv.com:show**
- **HistoricFilms**
- **history:topic**: History.com Topic
- **hitbox**
@@ -523,6 +525,7 @@
- **podomatic**
- **Pokemon**
- **PolskieRadio**
- **PornCom**
- **PornHd**
- **PornHub**: PornHub and Thumbzilla
- **PornHubPlaylist**
@@ -891,5 +894,4 @@
- **Zapiks**
- **ZDF**
- **ZDFChannel**
- **zingmp3:album**: mp3.zing.vn albums
- **zingmp3:song**: mp3.zing.vn songs
- **zingmp3**: mp3.zing.vn

View File

@@ -712,6 +712,9 @@ class TestUtil(unittest.TestCase):
inp = '''{"foo":101}'''
self.assertEqual(js_to_json(inp), '''{"foo":101}''')
inp = '''{"duration": "00:01:07"}'''
self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''')
def test_js_to_json_edgecases(self):
on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
@@ -817,7 +820,10 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_filesize('2 MiB'), 2097152)
self.assertEqual(parse_filesize('5 GB'), 5000000000)
self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
self.assertEqual(parse_filesize('1.2tb'), 1200000000000)
self.assertEqual(parse_filesize('1,24 KB'), 1240)
self.assertEqual(parse_filesize('1,24 kb'), 1240)
self.assertEqual(parse_filesize('8.5 megabytes'), 8500000)
def test_parse_count(self):
self.assertEqual(parse_count(None), None)

View File

@@ -1299,7 +1299,7 @@ class YoutubeDL(object):
for subtitle_format in subtitle:
if subtitle_format.get('url'):
subtitle_format['url'] = sanitize_url(subtitle_format['url'])
if 'ext' not in subtitle_format:
if subtitle_format.get('ext') is None:
subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
if self.params.get('listsubtitles', False):
@@ -1354,7 +1354,7 @@ class YoutubeDL(object):
note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
)
# Automatically determine file extension if missing
if 'ext' not in format:
if format.get('ext') is None:
format['ext'] = determine_ext(format['url']).lower()
# Automatically determine protocol if missing (useful for format
# selection purposes)

View File

@@ -4,6 +4,7 @@ from .theplatform import ThePlatformFeedIE
from ..utils import (
int_or_none,
find_xpath_attr,
ExtractorError,
)
@@ -17,19 +18,6 @@ class CBSBaseIE(ThePlatformFeedIE):
}]
} if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
def _extract_video_info(self, filter_query, video_id):
return self._extract_feed_info(
'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: {
'series': entry.get('cbs$SeriesTitle'),
'season_number': int_or_none(entry.get('cbs$SeasonNumber')),
'episode': entry.get('cbs$EpisodeTitle'),
'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')),
}, {
'StreamPack': {
'manifest': 'm3u',
}
})
class CBSIE(CBSBaseIE):
_VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
@@ -38,7 +26,6 @@ class CBSIE(CBSBaseIE):
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
'display_id': 'connect-chat-feat-garth-brooks',
'ext': 'mp4',
'title': 'Connect Chat feat. Garth Brooks',
'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -47,7 +34,10 @@ class CBSIE(CBSBaseIE):
'upload_date': '20131127',
'uploader': 'CBSI-NEW',
},
'expected_warnings': ['Failed to download m3u8 information'],
'params': {
# m3u8 download
'skip_download': True,
},
'_skip': 'Blocked outside the US',
}, {
'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
@@ -56,8 +46,31 @@ class CBSIE(CBSBaseIE):
'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
'only_matching': True,
}]
TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
def _extract_video_info(self, guid):
path = 'dJ5BDC/media/guid/2198311517/' + guid
smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
formats, subtitles = self._extract_theplatform_smil(smil_url + '&manifest=m3u', guid)
for r in ('HLS&formats=M3U', 'RTMP', 'WIFI', '3G'):
try:
tp_formats, _ = self._extract_theplatform_smil(smil_url + '&assetTypes=' + r, guid, 'Downloading %s SMIL data' % r.split('&')[0])
formats.extend(tp_formats)
except ExtractorError:
continue
self._sort_formats(formats)
metadata = self._download_theplatform_metadata(path, guid)
info = self._parse_theplatform_metadata(metadata)
info.update({
'id': guid,
'formats': formats,
'subtitles': subtitles,
'series': metadata.get('cbs$SeriesTitle'),
'season_number': int_or_none(metadata.get('cbs$SeasonNumber')),
'episode': metadata.get('cbs$EpisodeTitle'),
'episode_number': int_or_none(metadata.get('cbs$EpisodeNumber')),
})
return info
def _real_extract(self, url):
content_id = self._match_id(url)
return self._extract_video_info('byGuid=%s' % content_id, content_id)
return self._extract_video_info(content_id)

View File

@@ -2,13 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from .cbs import CBSBaseIE
from .cbs import CBSIE
from ..utils import (
parse_duration,
)
class CBSNewsIE(CBSBaseIE):
class CBSNewsIE(CBSIE):
IE_DESC = 'CBS News'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'
@@ -35,7 +35,8 @@ class CBSNewsIE(CBSBaseIE):
'ext': 'mp4',
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
'upload_date': '19700101',
'upload_date': '20140404',
'timestamp': 1396650660,
'uploader': 'CBSI-NEW',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 205,
@@ -63,7 +64,7 @@ class CBSNewsIE(CBSBaseIE):
item = video_info['item'] if 'item' in video_info else video_info
guid = item['mpxRefId']
return self._extract_video_info('byGuid=%s' % guid, guid)
return self._extract_video_info(guid)
class CBSNewsLiveVideoIE(InfoExtractor):

View File

@@ -23,6 +23,9 @@ class CBSSportsIE(CBSBaseIE):
}
}]
def _extract_video_info(self, filter_query, video_id):
return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video_info('byId=%s' % video_id, video_id)

View File

@@ -0,0 +1,51 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import remove_end
class CharlieRoseIE(InfoExtractor):
    """Extractor for charlierose.com video pages and player pages."""

    # Matches both /videos/<id> and /video/player/<id>; the numeric id is captured.
    _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://charlierose.com/videos/27996',
        'md5': 'fda41d49e67d4ce7c2411fd2c4702e09',
        'info_dict': {
            'id': '27996',
            'ext': 'mp4',
            'title': 'Remembering Zaha Hadid',
            'thumbnail': 're:^https?://.*\.jpg\?\d+',
            'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.',
            'subtitles': {
                'en': [{
                    'ext': 'vtt',
                }],
            },
        },
    }, {
        'url': 'https://charlierose.com/videos/27996',
        'only_matching': True,
    }]

    # Template of the player page that is actually downloaded for every id.
    _PLAYER_BASE = 'https://charlierose.com/video/player/%s'

    def _real_extract(self, url):
        """Build the info dict from the first HTML5 media entry of the player page."""
        video_id = self._match_id(url)

        # Whatever URL form was given, the player page is the one fetched
        # and also serves as the base URL for resolving media sources.
        player_url = self._PLAYER_BASE % video_id
        webpage = self._download_webpage(player_url, video_id)

        title = remove_end(self._og_search_title(webpage), ' - Charlie Rose')

        media_entries = self._parse_html5_media_entries(
            player_url, webpage, video_id,
            m3u8_entry_protocol='m3u8_native')
        info_dict = media_entries[0]

        formats = info_dict['formats']
        self._sort_formats(formats)
        self._remove_duplicate_formats(formats)

        # Page-level metadata overrides anything set by the media entry.
        page_metadata = {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'description': self._og_search_description(webpage),
        }
        info_dict.update(page_metadata)

        return info_dict

View File

@@ -11,7 +11,7 @@ from ..utils import (
class CNNIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
_VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
(?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
_TESTS = [{
@@ -45,19 +45,46 @@ class CNNIE(InfoExtractor):
'description': 'md5:e7223a503315c9f150acac52e76de086',
'upload_date': '20141222',
}
}, {
'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
'info_dict': {
'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
'ext': 'mp4',
'title': '5 stunning stats about Netflix',
'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
'upload_date': '20160819',
}
}, {
'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
'only_matching': True,
}, {
'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
'only_matching': True,
}, {
'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
'only_matching': True,
}]
_CONFIG = {
# http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
'edition': {
'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
'media_src': 'http://pmd.cdn.turner.com/cnn/big',
},
# http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
'money': {
'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
'media_src': 'http://ht3.cdn.turner.com/money/big',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
path = mobj.group('path')
page_title = mobj.group('title')
info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path
sub_domain, path, page_title = re.match(self._VALID_URL, url).groups()
if sub_domain not in ('money', 'edition'):
sub_domain = 'edition'
config = self._CONFIG[sub_domain]
info_url = config['data_src'] % path
info = self._download_xml(info_url, page_title)
formats = []
@@ -66,7 +93,7 @@ class CNNIE(InfoExtractor):
(?:_(?P<bitrate>[0-9]+)k)?
''')
for f in info.findall('files/file'):
video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip())
video_url = config['media_src'] + f.text.strip()
fdct = {
'format_id': f.attrib['bitrate'],
'url': video_url,
@@ -146,7 +173,7 @@ class CNNBlogsIE(InfoExtractor):
class CNNArticleIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
_VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
_TEST = {
'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
'md5': '689034c2a3d9c6dc4aa72d65a81efd01',

View File

@@ -1695,7 +1695,7 @@ class InfoExtractor(object):
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
def _parse_html5_media_entries(self, base_url, webpage):
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
def absolute_url(video_url):
return compat_urlparse.urljoin(base_url, video_url)
@@ -1710,6 +1710,21 @@ class InfoExtractor(object):
return f
return {}
def _media_formats(src, cur_media_type):
full_url = absolute_url(src)
if determine_ext(full_url) == 'm3u8':
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
else:
is_plain_url = True
formats = [{
'url': full_url,
'vcodec': 'none' if cur_media_type == 'audio' else None,
}]
return is_plain_url, formats
entries = []
for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
media_info = {
@@ -1719,10 +1734,8 @@ class InfoExtractor(object):
media_attributes = extract_attributes(media_tag)
src = media_attributes.get('src')
if src:
media_info['formats'].append({
'url': absolute_url(src),
'vcodec': 'none' if media_type == 'audio' else None,
})
_, formats = _media_formats(src)
media_info['formats'].extend(formats)
media_info['thumbnail'] = media_attributes.get('poster')
if media_content:
for source_tag in re.findall(r'<source[^>]+>', media_content):
@@ -1730,12 +1743,13 @@ class InfoExtractor(object):
src = source_attributes.get('src')
if not src:
continue
f = parse_content_type(source_attributes.get('type'))
f.update({
'url': absolute_url(src),
'vcodec': 'none' if media_type == 'audio' else None,
})
media_info['formats'].append(f)
is_plain_url, formats = _media_formats(src, media_type)
if is_plain_url:
f = parse_content_type(source_attributes.get('type'))
f.update(formats[0])
media_info['formats'].append(f)
else:
media_info['formats'].extend(formats)
for track_tag in re.findall(r'<track[^>]+>', media_content):
track_attributes = extract_attributes(track_tag)
kind = track_attributes.get('kind')

View File

@@ -1,9 +1,13 @@
from __future__ import unicode_literals
import re
import time
from .common import InfoExtractor
from ..utils import int_or_none
from ..utils import (
int_or_none,
HEADRequest,
)
class CultureUnpluggedIE(InfoExtractor):
@@ -32,6 +36,9 @@ class CultureUnpluggedIE(InfoExtractor):
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
# request setClientTimezone.php to get the PHPSESSID cookie, which is needed to get valid JSON data in the next request
self._request_webpage(HEADRequest(
'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id)
movie_data = self._download_json(
'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id)

View File

@@ -38,6 +38,12 @@ class DBTVIE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/(?:lazy)?player/\d+.*?)\1',
webpage)]
def _real_extract(self, url):
video_id, display_id = re.match(self._VALID_URL, url).groups()

View File

@@ -11,7 +11,17 @@ from ..utils import (
class DiscoveryGoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?discoverygo\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_VALID_URL = r'''(?x)https?://(?:www\.)?(?:
discovery|
investigationdiscovery|
discoverylife|
animalplanet|
ahctv|
destinationamerica|
sciencechannel|
tlc|
velocitychannel
)go\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'''
_TEST = {
'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/',
'info_dict': {

View File

@@ -10,18 +10,18 @@ from ..utils import (
class DotsubIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)'
_TEST = {
'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
'md5': '0914d4d69605090f623b7ac329fea66e',
'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09',
'md5': '21c7ff600f545358134fea762a6d42b6',
'info_dict': {
'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27',
'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09',
'ext': 'flv',
'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary',
'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074',
'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
'duration': 3169,
'uploader': '4v4l0n42',
'timestamp': 1292248482.625,
'upload_date': '20101213',
'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever',
'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6',
'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p',
'duration': 198,
'uploader': 'liuxt',
'timestamp': 1385778501.104,
'upload_date': '20131130',
'view_count': int,
}
}

View File

@@ -134,6 +134,7 @@ from .ccc import CCCIE
from .cda import CDAIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .charlierose import CharlieRoseIE
from .chaturbate import ChaturbateIE
from .chilloutzone import ChilloutzoneIE
from .chirbit import (
@@ -324,7 +325,10 @@ from .heise import HeiseIE
from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .hgtv import HGTVIE
from .hgtv import (
HGTVIE,
HGTVComShowIE,
)
from .historicfilms import HistoricFilmsIE
from .hitbox import HitboxIE, HitboxLiveIE
from .hornbunny import HornBunnyIE
@@ -639,6 +643,7 @@ from .podomatic import PodomaticIE
from .pokemon import PokemonIE
from .polskieradio import PolskieRadioIE
from .porn91 import Porn91IE
from .porncom import PornComIE
from .pornhd import PornHdIE
from .pornhub import (
PornHubIE,
@@ -1110,7 +1115,4 @@ from .youtube import (
)
from .zapiks import ZapiksIE
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import (
ZingMp3SongIE,
ZingMp3AlbumIE,
)
from .zingmp3 import ZingMp3IE

View File

@@ -5,13 +5,12 @@ from .keezmovies import KeezMoviesIE
class ExtremeTubeIE(KeezMoviesIE):
_VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?:(?P<display_id>[^/]+)-)(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)'
_TESTS = [{
'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'md5': '1fb9228f5e3332ec8c057d6ac36f33e0',
'info_dict': {
'id': '652431',
'display_id': 'music-video-14-british-euro-brit-european-cumshots-swallow',
'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'ext': 'mp4',
'title': 'Music Video 14 british euro brit european cumshots swallow',
'uploader': 'unknown',

View File

@@ -2,44 +2,40 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_xpath
from ..compat import compat_urlparse
from ..utils import (
int_or_none,
qualities,
unified_strdate,
xpath_attr,
xpath_element,
xpath_text,
xpath_with_ns,
)
class FirstTVIE(InfoExtractor):
IE_NAME = '1tv'
IE_DESC = 'Первый канал'
_VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
# single format via video_materials.json API
'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930',
'md5': '82a2777648acae812d58b3f5bd42882b',
# single format
'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015',
'md5': 'a1b6b60d530ebcf8daacf4565762bbaf',
'info_dict': {
'id': '35930',
'id': '40049',
'ext': 'mp4',
'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015',
'description': 'md5:357933adeede13b202c7c21f91b871b2',
'description': 'md5:36a39c1d19618fec57d12efe212a8370',
'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
'upload_date': '20150212',
'duration': 2694,
},
}, {
# multiple formats via video_materials.json API
'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641',
# multiple formats
'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016',
'info_dict': {
'id': '113641',
'id': '364746',
'ext': 'mp4',
'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016',
'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2',
'description': 'md5:a242eea0031fd180a4497d52640a9572',
'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
'upload_date': '20160407',
'duration': 179,
@@ -48,84 +44,47 @@ class FirstTVIE(InfoExtractor):
'params': {
'skip_download': True,
},
}, {
# single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API
'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038',
'md5': '519d306c5b5669761fd8906c39dbee23',
'info_dict': {
'id': '47038',
'ext': 'mp4',
'title': '"Побег". Второй сезон. 3 серия',
'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b',
'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
'upload_date': '20120516',
'duration': 3080,
},
}, {
'url': 'http://www.1tv.ru/videoarchive/9967',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
display_id = self._match_id(url)
# Videos with multiple formats only available via this API
video = self._download_json(
'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id,
video_id, fatal=False)
description, thumbnail, upload_date, duration = [None] * 4
if video:
item = video[0]
title = item['title']
quality = qualities(('ld', 'sd', 'hd', ))
formats = [{
'url': f['src'],
'format_id': f.get('name'),
'quality': quality(f.get('name')),
} for f in item['mbr'] if f.get('src')]
thumbnail = item.get('poster')
else:
# Some videos are not available via video_materials.json
video = self._download_xml(
'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id,
video_id)
NS_MAP = {
'media': 'http://search.yahoo.com/mrss/',
}
item = xpath_element(video, './channel/item', fatal=True)
title = xpath_text(item, './title', fatal=True)
formats = [{
'url': content.attrib['url'],
} for content in item.findall(
compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')]
thumbnail = xpath_attr(
item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url')
webpage = self._download_webpage(url, display_id)
playlist_url = compat_urlparse.urljoin(url, self._search_regex(
r'data-playlist-url="([^"]+)', webpage, 'playlist url'))
item = self._download_json(playlist_url, display_id)[0]
video_id = item['id']
quality = qualities(('ld', 'sd', 'hd', ))
formats = []
for f in item.get('mbr', []):
src = f.get('src')
if not src:
continue
fname = f.get('name')
formats.append({
'url': src,
'format_id': fname,
'quality': quality(fname),
})
self._sort_formats(formats)
webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False)
if webpage:
title = self._html_search_regex(
(r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',
r"'title'\s*:\s*'([^']+)'"),
webpage, 'title', default=None) or title
description = self._html_search_regex(
r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>',
webpage, 'description', default=None) or self._html_search_meta(
'description', webpage, 'description')
thumbnail = thumbnail or self._og_search_thumbnail(webpage)
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'video duration', fatal=False))
upload_date = unified_strdate(self._html_search_meta(
'ya:ovs:upload_date', webpage, 'upload date', fatal=False))
title = self._html_search_regex(
(r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',
r"'title'\s*:\s*'([^']+)'"),
webpage, 'title', default=None) or item['title']
description = self._html_search_regex(
r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>',
webpage, 'description', default=None) or self._html_search_meta(
'description', webpage, 'description')
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'video duration', fatal=False))
upload_date = unified_strdate(self._html_search_meta(
'ya:ovs:upload_date', webpage, 'upload date', fatal=False))
return {
'id': video_id,
'thumbnail': thumbnail,
'thumbnail': item.get('poster') or self._og_search_thumbnail(webpage),
'title': title,
'description': description,
'upload_date': upload_date,

View File

@@ -73,6 +73,7 @@ from .eagleplatform import EaglePlatformIE
from .facebook import FacebookIE
from .soundcloud import SoundcloudIE
from .vbox7 import Vbox7IE
from .dbtv import DBTVIE
class GenericIE(InfoExtractor):
@@ -1386,6 +1387,15 @@ class GenericIE(InfoExtractor):
},
'add_ie': [Vbox7IE.ie_key()],
},
{
# DBTV embeds
'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/',
'info_dict': {
'id': '43254897',
'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
},
'playlist_mincount': 3,
},
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -2257,6 +2267,11 @@ class GenericIE(InfoExtractor):
if vbox7_url:
return self.url_result(vbox7_url, Vbox7IE.ie_key())
# Look for DBTV embeds
dbtv_urls = DBTVIE._extract_urls(webpage)
if dbtv_urls:
return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')

View File

@@ -396,12 +396,12 @@ class GloboIE(InfoExtractor):
class GloboArticleIE(InfoExtractor):
_VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)\.html'
_VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)(?:\.html)?'
_VIDEOID_REGEXES = [
r'\bdata-video-id=["\'](\d{7,})',
r'\bdata-player-videosids=["\'](\d{7,})',
r'\bvideosIDs\s*:\s*["\'](\d{7,})',
r'\bvideosIDs\s*:\s*["\']?(\d{7,})',
r'\bdata-id=["\'](\d{7,})',
r'<div[^>]+\bid=["\'](\d{7,})',
]
@@ -423,6 +423,9 @@ class GloboArticleIE(InfoExtractor):
}, {
'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html',
'only_matching': True,
}, {
'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271',
'only_matching': True,
}]
@classmethod

View File

@@ -46,3 +46,34 @@ class HGTVIE(InfoExtractor):
'episode_number': int_or_none(embed_vars.get('episode')),
'ie_key': 'ThePlatform',
}
class HGTVComShowIE(InfoExtractor):
    # Playlist extractor for hgtv.com show pages: every video of the first
    # channel in the page's embedded player config that carries a release URL
    # becomes one playlist entry.
    IE_NAME = 'hgtv.com:show'
    _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos',
        'info_dict': {
            'id': 'flip-or-flop-full-episodes-videos',
            'title': 'Flip or Flop Full Episodes',
        },
        'playlist_mincount': 15,
    }

    def _real_extract(self, url):
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        # The player config is a JSON blob inside a <script type="text/x-config">
        # element that follows the data-module="video" container.
        raw_config = self._search_regex(
            r'(?s)data-module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script',
            webpage, 'video config')
        channel = self._parse_json(raw_config, display_id)['channels'][0]

        # Videos without a release URL cannot be delegated, so they are dropped.
        entries = []
        for video in channel['videos']:
            release_url = video.get('releaseUrl')
            if release_url:
                entries.append(self.url_result(release_url))

        playlist_title = channel.get('title')
        playlist_description = channel.get('description')
        return self.playlist_result(
            entries, display_id, playlist_title, playlist_description)

View File

@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
mimetype2ext,
qualities,
remove_end,
)
@@ -19,7 +20,7 @@ class ImdbIE(InfoExtractor):
'info_dict': {
'id': '2524815897',
'ext': 'mp4',
'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
'title': 'Ice Age: Continental Drift Trailer (No. 2)',
'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
}
}, {
@@ -83,10 +84,10 @@ class ImdbIE(InfoExtractor):
return {
'id': video_id,
'title': self._og_search_title(webpage),
'title': remove_end(self._og_search_title(webpage), ' - IMDb'),
'formats': formats,
'description': descr,
'thumbnail': format_info['slate'],
'thumbnail': format_info.get('slate'),
}

View File

@@ -1,4 +1,4 @@
# encoding: utf-8
# coding: utf-8
from __future__ import unicode_literals
import re
@@ -8,7 +8,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
sanitized_Request,
qualities,
)
@@ -49,11 +49,27 @@ class IviIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
},
'skip': 'Only works from Russia',
},
{
# with MP4-HD720 format
'url': 'http://www.ivi.ru/watch/146500',
'md5': 'd63d35cdbfa1ea61a5eafec7cc523e1e',
'info_dict': {
'id': '146500',
'ext': 'mp4',
'title': 'Кукла',
'description': 'md5:ffca9372399976a2d260a407cc74cce6',
'duration': 5599,
'thumbnail': 're:^https?://.*\.jpg$',
},
'skip': 'Only works from Russia',
}
]
# Sorted by quality
_KNOWN_FORMATS = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ']
_KNOWN_FORMATS = (
'MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi',
'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080')
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -69,10 +85,9 @@ class IviIE(InfoExtractor):
]
}
request = sanitized_Request(
'http://api.digitalaccess.ru/api/json/', json.dumps(data))
video_json = self._download_json(
request, video_id, 'Downloading video JSON')
'http://api.digitalaccess.ru/api/json/', video_id,
'Downloading video JSON', data=json.dumps(data))
if 'error' in video_json:
error = video_json['error']
@@ -84,11 +99,13 @@ class IviIE(InfoExtractor):
result = video_json['result']
quality = qualities(self._KNOWN_FORMATS)
formats = [{
'url': x['url'],
'format_id': x['content_format'],
'preference': self._KNOWN_FORMATS.index(x['content_format']),
} for x in result['files'] if x['content_format'] in self._KNOWN_FORMATS]
'format_id': x.get('content_format'),
'quality': quality(x.get('content_format')),
} for x in result['files'] if x.get('url')]
self._sort_formats(formats)
@@ -115,7 +132,7 @@ class IviIE(InfoExtractor):
webpage, 'season number', default=None))
episode_number = int_or_none(self._search_regex(
r'<meta[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)',
r'[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)',
webpage, 'episode number', default=None))
description = self._og_search_description(webpage, default=None) or self._html_search_meta(

View File

@@ -36,6 +36,12 @@ class KalturaIE(InfoExtractor):
'''
_SERVICE_URL = 'http://cdnapi.kaltura.com'
_SERVICE_BASE = '/api_v3/index.php'
# See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php
_CAPTION_TYPES = {
1: 'srt',
2: 'ttml',
3: 'vtt',
}
_TESTS = [
{
'url': 'kaltura:269692:1_1jc2y3e4',
@@ -67,6 +73,27 @@ class KalturaIE(InfoExtractor):
# video with subtitles
'url': 'kaltura:111032:1_cw786r8q',
'only_matching': True,
},
{
# video with ttml subtitles (no fileExt)
'url': 'kaltura:1926081:0_l5ye1133',
'info_dict': {
'id': '0_l5ye1133',
'ext': 'mp4',
'title': 'What Can You Do With Python?',
'upload_date': '20160221',
'uploader_id': 'stork',
'thumbnail': 're:^https?://.*/thumbnail/.*',
'timestamp': int,
'subtitles': {
'en': [{
'ext': 'ttml',
}],
},
},
'params': {
'skip_download': True,
},
}
]
@@ -122,18 +149,6 @@ class KalturaIE(InfoExtractor):
return data
def _get_kaltura_signature(self, video_id, partner_id, service_url=None):
actions = [{
'apiVersion': '3.1',
'expiry': 86400,
'format': 1,
'service': 'session',
'action': 'startWidgetSession',
'widgetId': '_%s' % partner_id,
}]
return self._kaltura_api_call(
video_id, actions, service_url, note='Downloading Kaltura signature')['ks']
def _get_video_info(self, video_id, partner_id, service_url=None):
actions = [
{
@@ -208,6 +223,17 @@ class KalturaIE(InfoExtractor):
reference_id)['entryResult']
info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
entry_id = info['id']
# Unfortunately, data returned in kalturaIframePackageData lacks
# captions so we will try requesting the complete data using
# regular approach since we now know the entry_id
try:
_, info, flavor_assets, captions = self._get_video_info(
entry_id, partner_id)
except ExtractorError:
# Regular scenario failed but we already have everything
# extracted apart from captions and can process at least
# with this
pass
else:
raise ExtractorError('Invalid URL', expected=True)
ks = params.get('flashvars[ks]', [None])[0]
@@ -265,9 +291,12 @@ class KalturaIE(InfoExtractor):
# Continue if caption is not ready
if f.get('status') != 2:
continue
if not caption.get('id'):
continue
caption_format = int_or_none(caption.get('format'))
subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({
'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']),
'ext': caption.get('fileExt'),
'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml',
})
return {

View File

@@ -39,7 +39,9 @@ class KeezMoviesIE(InfoExtractor):
def _extract_info(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
display_id = (mobj.group('display_id')
if 'display_id' in mobj.groupdict()
else None) or mobj.group('id')
webpage = self._download_webpage(
url, display_id, headers={'Cookie': 'age_verified=1'})

View File

@@ -14,7 +14,7 @@ from ..utils import (
class LiTVIE(InfoExtractor):
_VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)'
_VALID_URL = r'https?://www\.litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)'
_URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s'
@@ -27,6 +27,7 @@ class LiTVIE(InfoExtractor):
'playlist_count': 50,
}, {
'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
'md5': '969e343d9244778cb29acec608e53640',
'info_dict': {
'id': 'VOD00041610',
'ext': 'mp4',
@@ -37,7 +38,16 @@ class LiTVIE(InfoExtractor):
},
'params': {
'noplaylist': True,
'skip_download': True, # m3u8 download
},
'skip': 'Georestricted to Taiwan',
}, {
'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&',
'md5': '88322ea132f848d6e3e18b32a832b918',
'info_dict': {
'id': 'VOD00044841',
'ext': 'mp4',
'title': '芈月傳第1集 霸星芈月降世楚國',
'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。',
},
'skip': 'Georestricted to Taiwan',
}]
@@ -92,13 +102,18 @@ class LiTVIE(InfoExtractor):
# endpoint gives the same result as the data embedded in the webpage.
# If georestricted, there are no embedded data, so an extra request is
# necessary to get the error code
if 'assetId' not in view_data:
view_data = self._download_json(
'https://www.litv.tv/vod/ajax/getProgramInfo', video_id,
query={'contentId': video_id},
headers={'Accept': 'application/json'})
video_data = self._parse_json(self._search_regex(
r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
webpage, 'video data', default='{}'), video_id)
if not video_data:
payload = {
'assetId': view_data['assetId'],
'watchDevices': vod_data['watchDevices'],
'watchDevices': view_data['watchDevices'],
'contentType': view_data['contentType'],
}
video_data = self._download_json(
@@ -115,7 +130,8 @@ class LiTVIE(InfoExtractor):
raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
formats = self._extract_m3u8_formats(
video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls')
video_data['fullpath'], video_id, ext='mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
for a_format in formats:
# LiTV HLS segments doesn't like compressions
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True

View File

@@ -25,10 +25,7 @@ class MioMioIE(InfoExtractor):
'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕',
'duration': 5923,
},
'params': {
# The server provides broken file
'skip_download': True,
}
'skip': 'Unable to load videos',
}, {
'url': 'http://www.miomio.tv/watch/cc184024/',
'info_dict': {
@@ -47,16 +44,12 @@ class MioMioIE(InfoExtractor):
'skip': 'Unable to load videos',
}, {
# new 'h5' player
'url': 'http://www.miomio.tv/watch/cc273295/',
'md5': '',
'url': 'http://www.miomio.tv/watch/cc273997/',
'md5': '0b27a4b4495055d826813f8c3a6b2070',
'info_dict': {
'id': '273295',
'id': '273997',
'ext': 'mp4',
'title': 'アウト×デラックス 20160526',
},
'params': {
# intermittent HTTP 500
'skip_download': True,
'title': 'マツコの知らない世界【劇的進化SPビニール傘冷凍食品2016】 1_2 - 16 05 31',
},
}]
@@ -116,7 +109,7 @@ class MioMioIE(InfoExtractor):
player_webpage = self._download_webpage(
player_url, video_id,
note='Downloading player webpage', headers={'Referer': url})
entries = self._parse_html5_media_entries(player_url, player_webpage)
entries = self._parse_html5_media_entries(player_url, player_webpage, video_id)
http_headers = {'Referer': player_url}
else:
http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}

View File

@@ -1,6 +1,8 @@
# encoding: utf-8
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
@@ -40,8 +42,8 @@ class NTVDeIE(InfoExtractor):
timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp'))
vdata = self._parse_json(self._search_regex(
r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);',
webpage, 'player data'),
video_id, transform_source=js_to_json)
webpage, 'player data'), video_id,
transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s)))
duration = parse_duration(vdata.get('duration'))
formats = []

View File

@@ -0,0 +1,89 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
int_or_none,
js_to_json,
parse_filesize,
str_to_int,
)
class PornComIE(InfoExtractor):
    # Extractor for porn.com video pages. Formats are taken from the inline
    # JavaScript player config when it can be found and parsed, otherwise from
    # the "MPEG4 <height>p" download links present in the page markup.
    _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339',
        'md5': '3f30ce76267533cd12ba999263156de7',
        'info_dict': {
            'id': '2603339',
            'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec',
            'ext': 'mp4',
            'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec',
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 551,
            'view_count': int,
            'age_limit': 18,
        },
    }, {
        'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # The slug part of the URL is optional; fall back to the numeric id.
        display_id = mobj.group('display_id') or video_id

        webpage = self._download_webpage(url, display_id)

        # default='{}' together with fatal=False means a missing or unparsable
        # config yields a falsy value, which selects the markup fallback below.
        config = self._parse_json(
            self._search_regex(
                r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=',
                webpage, 'config', default='{}'),
            display_id, transform_source=js_to_json, fatal=False)

        if config:
            title = config['title']
            formats = [{
                'url': stream['url'],
                'format_id': stream.get('id'),
                # Stream ids that start with "<digits>p" encode the height.
                'height': int_or_none(self._search_regex(
                    r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None))
            } for stream in config['streams'] if stream.get('url')]
            thumbnail = (compat_urlparse.urljoin(
                config['thumbCDN'], config['poster'])
                if config.get('thumbCDN') and config.get('poster') else None)
            duration = int_or_none(config.get('length'))
        else:
            title = self._search_regex(
                (r'<title>([^<]+)</title>', r'<h1[^>]*>([^<]+)</h1>'),
                webpage, 'title')
            formats = [{
                'url': compat_urlparse.urljoin(url, format_url),
                'format_id': '%sp' % height,
                'height': int(height),
                'filesize_approx': parse_filesize(filesize),
            } for format_url, height, filesize in re.findall(
                r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<',
                webpage)]
            thumbnail = None
            duration = None

        self._sort_formats(formats)

        # View count is optional metadata: a page without the views markup
        # must not abort an otherwise successful extraction, so the lookup is
        # non-fatal and str_to_int maps a missing match to None.
        view_count = str_to_int(self._search_regex(
            r'class=["\']views["\'][^>]*><p>([\d,.]+)', webpage,
            'view count', fatal=False))

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'formats': formats,
            'age_limit': 18,
        }

View File

@@ -13,15 +13,15 @@ class RadioBremenIE(InfoExtractor):
IE_NAME = 'radiobremen'
_TEST = {
'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720',
'url': 'http://www.radiobremen.de/mediathek/?id=141876',
'info_dict': {
'id': '114720',
'id': '141876',
'ext': 'mp4',
'duration': 1685,
'duration': 178,
'width': 512,
'title': 'buten un binnen vom 22. Dezember',
'title': 'Druck auf Patrick Öztürk',
'thumbnail': 're:https?://.*\.jpg$',
'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++',
'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.',
},
}

View File

@@ -5,9 +5,9 @@ import re
from .common import InfoExtractor
from ..utils import (
float_or_none,
str_to_int,
parse_duration,
parse_filesize,
str_to_int,
)
@@ -17,21 +17,24 @@ class SnotrIE(InfoExtractor):
'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks',
'info_dict': {
'id': '13708',
'ext': 'flv',
'ext': 'mp4',
'title': 'Drone flying through fireworks!',
'duration': 247,
'filesize_approx': 98566144,
'duration': 248,
'filesize_approx': 40700000,
'description': 'A drone flying through Fourth of July Fireworks',
}
'thumbnail': 're:^https?://.*\.jpg$',
},
'expected_warnings': ['description'],
}, {
'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10',
'info_dict': {
'id': '530',
'ext': 'flv',
'ext': 'mp4',
'title': 'David Letteman - George W. Bush Top 10',
'duration': 126,
'filesize_approx': 8912896,
'filesize_approx': 8500000,
'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!',
'thumbnail': 're:^https?://.*\.jpg$',
}
}]
@@ -43,26 +46,28 @@ class SnotrIE(InfoExtractor):
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
video_url = 'http://cdn.videos.snotr.com/%s.flv' % video_id
info_dict = self._parse_html5_media_entries(
url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0]
view_count = str_to_int(self._html_search_regex(
r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>',
r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)',
webpage, 'view count', fatal=False))
duration = parse_duration(self._html_search_regex(
r'<p>\n<strong>Length:</strong>\n\s*([0-9:]+).*?</p>',
r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)',
webpage, 'duration', fatal=False))
filesize_approx = float_or_none(self._html_search_regex(
r'<p>\n<strong>Filesize:</strong>\n\s*([0-9.]+)\s*megabyte</p>',
webpage, 'filesize', fatal=False), invscale=1024 * 1024)
filesize_approx = parse_filesize(self._html_search_regex(
r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)',
webpage, 'filesize', fatal=False))
return {
info_dict.update({
'id': video_id,
'description': description,
'title': title,
'url': video_url,
'view_count': view_count,
'duration': duration,
'filesize_approx': filesize_approx,
}
})
return info_dict

View File

@@ -7,6 +7,7 @@ import random
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_parse_qs,
compat_str,
compat_urllib_parse_urlencode,
@@ -14,13 +15,13 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
clean_html,
ExtractorError,
int_or_none,
js_to_json,
orderedSet,
parse_duration,
parse_iso8601,
sanitized_Request,
urlencode_postdata,
)
@@ -42,7 +43,7 @@ class TwitchBaseIE(InfoExtractor):
'%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
expected=True)
def _download_json(self, url, video_id, note='Downloading JSON metadata'):
def _call_api(self, path, item_id, note):
headers = {
'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2',
'X-Requested-With': 'XMLHttpRequest',
@@ -50,8 +51,8 @@ class TwitchBaseIE(InfoExtractor):
for cookie in self._downloader.cookiejar:
if cookie.name == 'api_token':
headers['Twitch-Api-Token'] = cookie.value
request = sanitized_Request(url, headers=headers)
response = super(TwitchBaseIE, self)._download_json(request, video_id, note)
response = self._download_json(
'%s/%s' % (self._API_BASE, path), item_id, note)
self._handle_error(response)
return response
@@ -63,9 +64,17 @@ class TwitchBaseIE(InfoExtractor):
if username is None:
return
def fail(message):
raise ExtractorError(
'Unable to login. Twitch said: %s' % message, expected=True)
login_page, handle = self._download_webpage_handle(
self._LOGIN_URL, None, 'Downloading login page')
# Some TOR nodes and public proxies are blocked completely
if 'blacklist_message' in login_page:
fail(clean_html(login_page))
login_form = self._hidden_inputs(login_page)
login_form.update({
@@ -82,21 +91,24 @@ class TwitchBaseIE(InfoExtractor):
if not post_url.startswith('http'):
post_url = compat_urlparse.urljoin(redirect_url, post_url)
request = sanitized_Request(
post_url, urlencode_postdata(login_form))
request.add_header('Referer', redirect_url)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
headers = {'Referer': redirect_url}
error_message = self._search_regex(
r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>',
response, 'error message', default=None)
if error_message:
raise ExtractorError(
'Unable to login. Twitch said: %s' % error_message, expected=True)
try:
response = self._download_json(
post_url, None, 'Logging in as %s' % username,
data=urlencode_postdata(login_form),
headers=headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
response = self._parse_json(
e.cause.read().decode('utf-8'), None)
fail(response['message'])
raise
if '>Reset your password<' in response:
self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit')
if response.get('redirect'):
self._download_webpage(
response['redirect'], None, 'Downloading login redirect page',
headers=headers)
def _prefer_source(self, formats):
try:
@@ -109,14 +121,14 @@ class TwitchBaseIE(InfoExtractor):
class TwitchItemBaseIE(TwitchBaseIE):
def _download_info(self, item, item_id):
return self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
return self._extract_info(self._call_api(
'kraken/videos/%s%s' % (item, item_id), item_id,
'Downloading %s info JSON' % self._ITEM_TYPE))
def _extract_media(self, item_id):
info = self._download_info(self._ITEM_SHORTCUT, item_id)
response = self._download_json(
'%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
response = self._call_api(
'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id,
'Downloading %s playlist JSON' % self._ITEM_TYPE)
entries = []
chunks = response['chunks']
@@ -246,8 +258,8 @@ class TwitchVodIE(TwitchItemBaseIE):
item_id = self._match_id(url)
info = self._download_info(self._ITEM_SHORTCUT, item_id)
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
access_token = self._call_api(
'api/vods/%s/access_token' % item_id, item_id,
'Downloading %s access token' % self._ITEM_TYPE)
formats = self._extract_m3u8_formats(
@@ -275,12 +287,12 @@ class TwitchVodIE(TwitchItemBaseIE):
class TwitchPlaylistBaseIE(TwitchBaseIE):
_PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
_PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d'
_PAGE_LIMIT = 100
def _extract_playlist(self, channel_id):
info = self._download_json(
'%s/kraken/channels/%s' % (self._API_BASE, channel_id),
info = self._call_api(
'kraken/channels/%s' % channel_id,
channel_id, 'Downloading channel info JSON')
channel_name = info.get('display_name') or info.get('name')
entries = []
@@ -289,8 +301,8 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):
broken_paging_detected = False
counter_override = None
for counter in itertools.count(1):
response = self._download_json(
self._PLAYLIST_URL % (channel_id, offset, limit),
response = self._call_api(
self._PLAYLIST_PATH % (channel_id, offset, limit),
channel_id,
'Downloading %s videos JSON page %s'
% (self._PLAYLIST_TYPE, counter_override or counter))
@@ -345,7 +357,7 @@ class TwitchProfileIE(TwitchPlaylistBaseIE):
class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
IE_NAME = 'twitch:past_broadcasts'
_VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
_PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
_PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcasts=true'
_PLAYLIST_TYPE = 'past broadcasts'
_TEST = {
@@ -389,8 +401,8 @@ class TwitchStreamIE(TwitchBaseIE):
def _real_extract(self, url):
channel_id = self._match_id(url)
stream = self._download_json(
'%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id,
stream = self._call_api(
'kraken/streams/%s' % channel_id, channel_id,
'Downloading stream JSON').get('stream')
# Fallback on profile extraction if stream is offline
@@ -405,8 +417,8 @@ class TwitchStreamIE(TwitchBaseIE):
# JSON and fallback to lowercase if it's not available.
channel_id = stream.get('channel', {}).get('name') or channel_id.lower()
access_token = self._download_json(
'%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
access_token = self._call_api(
'api/channels/%s/access_token' % channel_id, channel_id,
'Downloading channel access token')
query = {

View File

@@ -1,6 +1,7 @@
# encoding: utf-8
from __future__ import unicode_literals
import collections
import re
import json
import sys
@@ -16,7 +17,6 @@ from ..utils import (
get_element_by_class,
int_or_none,
orderedSet,
parse_duration,
remove_start,
str_to_int,
unescapeHTML,
@@ -52,8 +52,9 @@ class VKBaseIE(InfoExtractor):
# what actually happens.
# We will workaround this VK issue by resetting the remixlhk cookie to
# the first one manually.
cookies = url_handle.headers.get('Set-Cookie')
if cookies:
for header, cookies in url_handle.headers.items():
if header.lower() != 'set-cookie':
continue
if sys.version_info[0] >= 3:
cookies = cookies.encode('iso-8859-1')
cookies = cookies.decode('utf-8')
@@ -61,6 +62,7 @@ class VKBaseIE(InfoExtractor):
if remixlhk:
value, domain = remixlhk.groups()
self._set_cookie(domain, 'remixlhk', value)
break
login_page = self._download_webpage(
'https://login.vk.com/?act=login', None,
@@ -445,6 +447,9 @@ class VKWallPostIE(VKBaseIE):
'skip_download': True,
},
}],
'params': {
'usenetrc': True,
},
'skip': 'Requires vk account credentials',
}, {
# single YouTube embed, no leading -
@@ -454,6 +459,9 @@ class VKWallPostIE(VKBaseIE):
'title': 'Sergey Gorbunov - Wall post 85155021_6319',
},
'playlist_count': 1,
'params': {
'usenetrc': True,
},
'skip': 'Requires vk account credentials',
}, {
# wall page URL
@@ -481,37 +489,41 @@ class VKWallPostIE(VKBaseIE):
raise ExtractorError('VK said: %s' % error, expected=True)
description = clean_html(get_element_by_class('wall_post_text', webpage))
uploader = clean_html(get_element_by_class(
'fw_post_author', webpage)) or self._og_search_description(webpage)
uploader = clean_html(get_element_by_class('author', webpage))
thumbnail = self._og_search_thumbnail(webpage)
entries = []
for audio in re.finditer(r'''(?sx)
<input[^>]+
id=(?P<q1>["\'])audio_info(?P<id>\d+_\d+).*?(?P=q1)[^>]+
value=(?P<q2>["\'])(?P<url>http.+?)(?P=q2)
.+?
</table>''', webpage):
audio_html = audio.group(0)
audio_id = audio.group('id')
duration = parse_duration(get_element_by_class('duration', audio_html))
track = self._html_search_regex(
r'<span[^>]+id=["\']title%s[^>]*>([^<]+)' % audio_id,
audio_html, 'title', default=None)
artist = self._html_search_regex(
r'>([^<]+)</a></b>\s*&ndash', audio_html,
'artist', default=None)
entries.append({
'id': audio_id,
'url': audio.group('url'),
'title': '%s - %s' % (artist, track) if artist and track else audio_id,
'thumbnail': thumbnail,
'duration': duration,
'uploader': uploader,
'artist': artist,
'track': track,
})
audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage)
if audio_ids:
al_audio = self._download_webpage(
'https://vk.com/al_audio.php', post_id,
note='Downloading audio info', fatal=False,
data=urlencode_postdata({
'act': 'reload_audio',
'al': '1',
'ids': ','.join(audio_ids)
}))
if al_audio:
Audio = collections.namedtuple(
'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration'])
audios = self._parse_json(
self._search_regex(
r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'),
post_id, fatal=False, transform_source=unescapeHTML)
if isinstance(audios, list):
for audio in audios:
a = Audio._make(audio[:6])
entries.append({
'id': '%s_%s' % (a.user_id, a.id),
'url': a.url,
'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id,
'thumbnail': thumbnail,
'duration': a.duration,
'uploader': uploader,
'artist': a.artist,
'track': a.track,
})
for video in re.finditer(
r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):

View File

@@ -17,12 +17,12 @@ class VuClipIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html',
'url': 'http://m.vuclip.com/w?cid=1129900602&bu=8589892792&frm=w&z=34801&op=0&oc=843169247&section=recommend',
'info_dict': {
'id': '922692425',
'id': '1129900602',
'ext': '3gp',
'title': 'The Toy Soldiers - Hollywood Movie Trailer',
'duration': 177,
'title': 'Top 10 TV Convicts',
'duration': 733,
}
}
@@ -54,7 +54,7 @@ class VuClipIE(InfoExtractor):
'url': video_url,
}]
else:
formats = self._parse_html5_media_entries(url, webpage)[0]['formats']
formats = self._parse_html5_media_entries(url, webpage, video_id)[0]['formats']
title = remove_end(self._html_search_regex(
r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip(), ' - Video')

View File

@@ -4,13 +4,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import ExtractorError
from ..utils import (
ExtractorError,
int_or_none,
update_url_query,
)
class ZingMp3BaseInfoExtractor(InfoExtractor):
def _extract_item(self, item, fatal=True):
error_message = item.find('./errormessage').text
def _extract_item(self, item, page_type, fatal=True):
error_message = item.get('msg')
if error_message:
if not fatal:
return
@@ -18,25 +22,48 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
'%s returned error: %s' % (self.IE_NAME, error_message),
expected=True)
title = item.find('./title').text.strip()
source = item.find('./source').text
extension = item.attrib['type']
thumbnail = item.find('./backimage').text
formats = []
for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])):
if not source_url or source_url == 'require vip':
continue
if not re.match(r'https?://', source_url):
source_url = '//' + source_url
source_url = self._proto_relative_url(source_url, 'http:')
quality_num = int_or_none(quality)
f = {
'format_id': quality,
'url': source_url,
}
if page_type == 'video':
f.update({
'height': quality_num,
'ext': 'mp4',
})
else:
f.update({
'abr': quality_num,
'ext': 'mp3',
})
formats.append(f)
cover = item.get('cover')
return {
'title': title,
'url': source,
'ext': extension,
'thumbnail': thumbnail,
'title': (item.get('name') or item.get('title')).strip(),
'formats': formats,
'thumbnail': 'http:/' + cover if cover else None,
'artist': item.get('artist'),
}
def _extract_player_xml(self, player_xml_url, id, playlist_title=None):
player_xml = self._download_xml(player_xml_url, id, 'Downloading Player XML')
items = player_xml.findall('./item')
def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None):
player_json = self._download_json(player_json_url, id, 'Downloading Player JSON')
items = player_json['data']
if 'item' in items:
items = items['item']
if len(items) == 1:
# one single song
data = self._extract_item(items[0])
data = self._extract_item(items[0], page_type)
data['id'] = id
return data
@@ -45,7 +72,7 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
entries = []
for i, item in enumerate(items, 1):
entry = self._extract_item(item, fatal=False)
entry = self._extract_item(item, page_type, fatal=False)
if not entry:
continue
entry['id'] = '%s-%d' % (id, i)
@@ -59,8 +86,8 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
}
class ZingMp3SongIE(ZingMp3BaseInfoExtractor):
_VALID_URL = r'https?://mp3\.zing\.vn/bai-hat/(?P<slug>[^/]+)/(?P<song_id>\w+)\.html'
class ZingMp3IE(ZingMp3BaseInfoExtractor):
_VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html'
_TESTS = [{
'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
'md5': 'ead7ae13693b3205cbc89536a077daed',
@@ -70,51 +97,47 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor):
'ext': 'mp3',
'thumbnail': 're:^https?://.*\.jpg$',
},
}]
IE_NAME = 'zingmp3:song'
IE_DESC = 'mp3.zing.vn songs'
def _real_extract(self, url):
matched = re.match(self._VALID_URL, url)
slug = matched.group('slug')
song_id = matched.group('song_id')
webpage = self._download_webpage(
'http://mp3.zing.vn/bai-hat/%s/%s.html' % (slug, song_id), song_id)
player_xml_url = self._search_regex(
r'&amp;xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url')
return self._extract_player_xml(player_xml_url, song_id)
class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor):
_VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html'
_TESTS = [{
}, {
'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html',
'md5': '870295a9cd8045c0e15663565902618d',
'info_dict': {
'id': 'ZW6BAEA0',
'title': 'Let It Go (Frozen OST)',
'ext': 'mp4',
},
}, {
'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
'info_dict': {
'_type': 'playlist',
'id': 'ZWZBWDAF',
'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless',
'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless',
},
'playlist_count': 10,
'skip': 'removed at the request of the owner',
}, {
'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
'only_matching': True,
}]
IE_NAME = 'zingmp3:album'
IE_DESC = 'mp3.zing.vn albums'
IE_NAME = 'zingmp3'
IE_DESC = 'mp3.zing.vn'
def _real_extract(self, url):
matched = re.match(self._VALID_URL, url)
slug = matched.group('slug')
album_id = matched.group('album_id')
page_id = self._match_id(url)
webpage = self._download_webpage(
'http://mp3.zing.vn/album/%s/%s.html' % (slug, album_id), album_id)
player_xml_url = self._search_regex(
r'&amp;xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url')
webpage = self._download_webpage(url, page_id)
return self._extract_player_xml(
player_xml_url, album_id,
playlist_title=self._og_search_title(webpage))
player_json_url = self._search_regex([
r'data-xml="([^"]+)',
r'&amp;xmlURL=([^&]+)&'
], webpage, 'player xml url')
playlist_title = None
page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type')
if page_type == 'video':
player_json_url = update_url_query(player_json_url, {'format': 'json'})
else:
player_json_url = player_json_url.replace('/xml/', '/html5xml/')
if page_type == 'album':
playlist_title = self._og_search_title(webpage)
return self._extract_player_json(player_json_url, page_id, page_type, playlist_title)

View File

@@ -628,22 +628,7 @@ def parseOpts(overrideArguments=None):
filesystem.add_option(
'-o', '--output',
dest='outtmpl', metavar='TEMPLATE',
help=('Output filename template. Use %(title)s to get the title, '
'%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, '
'%(autonumber)s to get an automatically incremented number, '
'%(ext)s for the filename extension, '
'%(format)s for the format description (like "22 - 1280x720" or "HD"), '
'%(format_id)s for the unique id of the format (like YouTube\'s itags: "137"), '
'%(upload_date)s for the upload date (YYYYMMDD), '
'%(extractor)s for the provider (youtube, metacafe, etc), '
'%(id)s for the video id, '
'%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, '
'%(playlist_index)s for the position in the playlist. '
'%(height)s and %(width)s for the width and height of the video format. '
'%(resolution)s for a textual description of the resolution of the video format. '
'%% for a literal percent. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info'))
filesystem.add_option(
'--autonumber-size',
dest='autonumber_size', metavar='NUMBER',

View File

@@ -1504,38 +1504,63 @@ def parse_filesize(s):
_UNIT_TABLE = {
'B': 1,
'b': 1,
'bytes': 1,
'KiB': 1024,
'KB': 1000,
'kB': 1024,
'Kb': 1000,
'kb': 1000,
'kilobytes': 1000,
'kibibytes': 1024,
'MiB': 1024 ** 2,
'MB': 1000 ** 2,
'mB': 1024 ** 2,
'Mb': 1000 ** 2,
'mb': 1000 ** 2,
'megabytes': 1000 ** 2,
'mebibytes': 1024 ** 2,
'GiB': 1024 ** 3,
'GB': 1000 ** 3,
'gB': 1024 ** 3,
'Gb': 1000 ** 3,
'gb': 1000 ** 3,
'gigabytes': 1000 ** 3,
'gibibytes': 1024 ** 3,
'TiB': 1024 ** 4,
'TB': 1000 ** 4,
'tB': 1024 ** 4,
'Tb': 1000 ** 4,
'tb': 1000 ** 4,
'terabytes': 1000 ** 4,
'tebibytes': 1024 ** 4,
'PiB': 1024 ** 5,
'PB': 1000 ** 5,
'pB': 1024 ** 5,
'Pb': 1000 ** 5,
'pb': 1000 ** 5,
'petabytes': 1000 ** 5,
'pebibytes': 1024 ** 5,
'EiB': 1024 ** 6,
'EB': 1000 ** 6,
'eB': 1024 ** 6,
'Eb': 1000 ** 6,
'eb': 1000 ** 6,
'exabytes': 1000 ** 6,
'exbibytes': 1024 ** 6,
'ZiB': 1024 ** 7,
'ZB': 1000 ** 7,
'zB': 1024 ** 7,
'Zb': 1000 ** 7,
'zb': 1000 ** 7,
'zettabytes': 1000 ** 7,
'zebibytes': 1024 ** 7,
'YiB': 1024 ** 8,
'YB': 1000 ** 8,
'yB': 1024 ** 8,
'Yb': 1000 ** 8,
'yb': 1000 ** 8,
'yottabytes': 1000 ** 8,
'yobibytes': 1024 ** 8,
}
return lookup_unit_table(_UNIT_TABLE, s)
@@ -2030,14 +2055,14 @@ def js_to_json(code):
}.get(m.group(0), m.group(0)), v[1:-1])
INTEGER_TABLE = (
(r'^0[xX][0-9a-fA-F]+', 16),
(r'^0+[0-7]+', 8),
(r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
(r'^(0+[0-7]+)\s*:?$', 8),
)
for regex, base in INTEGER_TABLE:
im = re.match(regex, v)
if im:
i = int(im.group(0), base)
i = int(im.group(1), base)
return '"%d":' % i if v.endswith(':') else '%d' % i
return '"%s"' % v

View File

@@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = '2016.08.17'
__version__ = '2016.08.22'