common.py 180 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import base64
  4. import collections
  5. import hashlib
  6. import itertools
  7. import json
  8. import netrc
  9. import os
  10. import random
  11. import re
  12. import sys
  13. import time
  14. import math
  15. from ..compat import (
  16. compat_cookiejar_Cookie,
  17. compat_cookies_SimpleCookie,
  18. compat_etree_Element,
  19. compat_etree_fromstring,
  20. compat_expanduser,
  21. compat_getpass,
  22. compat_http_client,
  23. compat_os_name,
  24. compat_str,
  25. compat_urllib_error,
  26. compat_urllib_parse_unquote,
  27. compat_urllib_parse_urlencode,
  28. compat_urllib_request,
  29. compat_urlparse,
  30. compat_xml_parse_error,
  31. )
  32. from ..downloader import FileDownloader
  33. from ..downloader.f4m import (
  34. get_base_url,
  35. remove_encrypted_media,
  36. )
  37. from ..utils import (
  38. age_restricted,
  39. base_url,
  40. bug_reports_message,
  41. clean_html,
  42. compiled_regex_type,
  43. determine_ext,
  44. determine_protocol,
  45. dict_get,
  46. encode_data_uri,
  47. error_to_compat_str,
  48. extract_attributes,
  49. ExtractorError,
  50. fix_xml_ampersands,
  51. float_or_none,
  52. format_field,
  53. GeoRestrictedError,
  54. GeoUtils,
  55. int_or_none,
  56. join_nonempty,
  57. js_to_json,
  58. JSON_LD_RE,
  59. mimetype2ext,
  60. network_exceptions,
  61. NO_DEFAULT,
  62. orderedSet,
  63. parse_bitrate,
  64. parse_codecs,
  65. parse_duration,
  66. parse_iso8601,
  67. parse_m3u8_attributes,
  68. parse_resolution,
  69. RegexNotFoundError,
  70. sanitize_filename,
  71. sanitized_Request,
  72. str_or_none,
  73. str_to_int,
  74. strip_or_none,
  75. traverse_obj,
  76. try_get,
  77. unescapeHTML,
  78. UnsupportedError,
  79. unified_strdate,
  80. unified_timestamp,
  81. update_Request,
  82. update_url_query,
  83. url_basename,
  84. url_or_none,
  85. urljoin,
  86. variadic,
  87. xpath_element,
  88. xpath_text,
  89. xpath_with_ns,
  90. )
  91. class InfoExtractor(object):
  92. """Information Extractor class.
  93. Information extractors are the classes that, given a URL, extract
  94. information about the video (or videos) the URL refers to. This
  95. information includes the real video URL, the video title, author and
  96. others. The information is stored in a dictionary which is then
  97. passed to the YoutubeDL. The YoutubeDL processes this
  98. information possibly downloading the video to the file system, among
  99. other possible outcomes.
  100. The type field determines the type of the result.
  101. By far the most common value (and the default if _type is missing) is
  102. "video", which indicates a single video.
  103. For a video, the dictionaries must include the following fields:
  104. id: Video identifier.
  105. title: Video title, unescaped.
  106. Additionally, it must contain either a formats entry or a url one:
  107. formats: A list of dictionaries for each format available, ordered
  108. from worst to best quality.
  109. Potential fields:
  110. * url The mandatory URL representing the media:
  111. for plain file media - HTTP URL of this file,
  112. for RTMP - RTMP URL,
  113. for HLS - URL of the M3U8 media playlist,
  114. for HDS - URL of the F4M manifest,
  115. for DASH
  116. - HTTP URL to plain file media (in case of
  117. unfragmented media)
  118. - URL of the MPD manifest or base URL
  119. representing the media if MPD manifest
  120. is parsed from a string (in case of
  121. fragmented media)
  122. for MSS - URL of the ISM manifest.
  123. * manifest_url
  124. The URL of the manifest file in case of
  125. fragmented media:
  126. for HLS - URL of the M3U8 master playlist,
  127. for HDS - URL of the F4M manifest,
  128. for DASH - URL of the MPD manifest,
  129. for MSS - URL of the ISM manifest.
  130. * ext Will be calculated from URL if missing
  131. * format A human-readable description of the format
  132. ("mp4 container with h264/opus").
  133. Calculated from the format_id, width, height,
  134. and format_note fields if missing.
  135. * format_id A short description of the format
  136. ("mp4_h264_opus" or "19").
  137. Technically optional, but strongly recommended.
  138. * format_note Additional info about the format
  139. ("3D" or "DASH video")
  140. * width Width of the video, if known
  141. * height Height of the video, if known
  142. * resolution Textual description of width and height
  143. * dynamic_range The dynamic range of the video. One of:
  144. "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
  145. * tbr Average bitrate of audio and video in KBit/s
  146. * abr Average audio bitrate in KBit/s
  147. * acodec Name of the audio codec in use
  148. * asr Audio sampling rate in Hertz
  149. * vbr Average video bitrate in KBit/s
  150. * fps Frame rate
  151. * vcodec Name of the video codec in use
  152. * container Name of the container format
  153. * filesize The number of bytes, if known in advance
  154. * filesize_approx An estimate for the number of bytes
  155. * player_url SWF Player URL (used for rtmpdump).
  156. * protocol The protocol that will be used for the actual
  157. download, lower-case. One of "http", "https" or
  158. one of the protocols defined in downloader.PROTOCOL_MAP
  159. * fragment_base_url
  160. Base URL for fragments. Each fragment's path
  161. value (if present) will be relative to
  162. this URL.
  163. * fragments A list of fragments of a fragmented media.
  164. Each fragment entry must contain either an url
  165. or a path. If an url is present it should be
  166. considered by a client. Otherwise both path and
  167. fragment_base_url must be present. Here is
  168. the list of all potential fields:
  169. * "url" - fragment's URL
  170. * "path" - fragment's path relative to
  171. fragment_base_url
  172. * "duration" (optional, int or float)
  173. * "filesize" (optional, int)
  174. * is_from_start Is a live format that can be downloaded
  175. from the start. Boolean
  176. * preference Order number of this format. If this field is
  177. present and not None, the formats get sorted
  178. by this field, regardless of all other values.
  179. -1 for default (order by other properties),
  180. -2 or smaller for less than default.
  181. < -1000 to hide the format (if there is
  182. another one which is strictly better)
  183. * language Language code, e.g. "de" or "en-US".
  184. * language_preference Is this in the language mentioned in
  185. the URL?
  186. 10 if it's what the URL is about,
  187. -1 for default (don't know),
  188. -10 otherwise, other values reserved for now.
  189. * quality Order number of the video quality of this
  190. format, irrespective of the file format.
  191. -1 for default (order by other properties),
  192. -2 or smaller for less than default.
  193. * source_preference Order number for this video source
  194. (quality takes higher priority)
  195. -1 for default (order by other properties),
  196. -2 or smaller for less than default.
  197. * http_headers A dictionary of additional HTTP headers
  198. to add to the request.
  199. * stretched_ratio If given and not 1, indicates that the
  200. video's pixels are not square.
  201. width : height ratio as float.
  202. * no_resume The server does not support resuming the
  203. (HTTP or RTMP) download. Boolean.
  204. * has_drm The format has DRM and cannot be downloaded. Boolean
  205. * downloader_options A dictionary of downloader options as
  206. described in FileDownloader
  207. RTMP formats can also have the additional fields: page_url,
  208. app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
  209. rtmp_protocol, rtmp_real_time
  210. url: Final video URL.
  211. ext: Video filename extension.
  212. format: The video format, defaults to ext (used for --get-format)
  213. player_url: SWF Player URL (used for rtmpdump).
  214. The following fields are optional:
  215. direct: True if a direct video file was given (must only be set by GenericIE)
  216. alt_title: A secondary title of the video.
  217. display_id An alternative identifier for the video, not necessarily
  218. unique, but available before title. Typically, id is
  219. something like "4234987", title "Dancing naked mole rats",
  220. and display_id "dancing-naked-mole-rats"
  221. thumbnails: A list of dictionaries, with the following entries:
  222. * "id" (optional, string) - Thumbnail format ID
  223. * "url"
  224. * "preference" (optional, int) - quality of the image
  225. * "width" (optional, int)
  226. * "height" (optional, int)
  227. * "resolution" (optional, string "{width}x{height}",
  228. deprecated)
  229. * "filesize" (optional, int)
  230. * "http_headers" (dict) - HTTP headers for the request
  231. thumbnail: Full URL to a video thumbnail image.
  232. description: Full video description.
  233. uploader: Full name of the video uploader.
  234. license: License name the video is licensed under.
  235. creator: The creator of the video.
  236. timestamp: UNIX timestamp of the moment the video was uploaded
  237. upload_date: Video upload date (YYYYMMDD).
  238. If not explicitly set, calculated from timestamp
  239. release_timestamp: UNIX timestamp of the moment the video was released.
  240. If it is not clear whether to use timestamp or this, use the former
  241. release_date: The date (YYYYMMDD) when the video was released.
  242. If not explicitly set, calculated from release_timestamp
  243. modified_timestamp: UNIX timestamp of the moment the video was last modified.
  244. modified_date: The date (YYYYMMDD) when the video was last modified.
  245. If not explicitly set, calculated from modified_timestamp
  246. uploader_id: Nickname or id of the video uploader.
  247. uploader_url: Full URL to a personal webpage of the video uploader.
  248. channel: Full name of the channel the video is uploaded on.
  249. Note that channel fields may or may not repeat uploader
  250. fields. This depends on a particular extractor.
  251. channel_id: Id of the channel.
  252. channel_url: Full URL to a channel webpage.
  253. channel_follower_count: Number of followers of the channel.
  254. location: Physical location where the video was filmed.
  255. subtitles: The available subtitles as a dictionary in the format
  256. {tag: subformats}. "tag" is usually a language code, and
  257. "subformats" is a list sorted from lower to higher
  258. preference, each element is a dictionary with the "ext"
  259. entry and one of:
  260. * "data": The subtitles file contents
  261. * "url": A URL pointing to the subtitles file
  262. It can optionally also have:
  263. * "name": Name or description of the subtitles
  264. * "http_headers": A dictionary of additional HTTP headers
  265. to add to the request.
  266. "ext" will be calculated from URL if missing
  267. automatic_captions: Like 'subtitles'; contains automatically generated
  268. captions instead of normal subtitles
  269. duration: Length of the video in seconds, as an integer or float.
  270. view_count: How many users have watched the video on the platform.
  271. like_count: Number of positive ratings of the video
  272. dislike_count: Number of negative ratings of the video
  273. repost_count: Number of reposts of the video
  274. average_rating: Average rating given by users, the scale used depends on the webpage
  275. comment_count: Number of comments on the video
  276. comments: A list of comments, each with one or more of the following
  277. properties (all but one of text or html optional):
  278. * "author" - human-readable name of the comment author
  279. * "author_id" - user ID of the comment author
  280. * "author_thumbnail" - The thumbnail of the comment author
  281. * "id" - Comment ID
  282. * "html" - Comment as HTML
  283. * "text" - Plain text of the comment
  284. * "timestamp" - UNIX timestamp of comment
  285. * "parent" - ID of the comment this one is replying to.
  286. Set to "root" to indicate that this is a
  287. comment to the original video.
  288. * "like_count" - Number of positive ratings of the comment
  289. * "dislike_count" - Number of negative ratings of the comment
  290. * "is_favorited" - Whether the comment is marked as
  291. favorite by the video uploader
  292. * "author_is_uploader" - Whether the comment is made by
  293. the video uploader
  294. age_limit: Age restriction for the video, as an integer (years)
  295. webpage_url: The URL to the video webpage, if given to yt-dlp it
  296. should allow to get the same result again. (It will be set
  297. by YoutubeDL if it's missing)
  298. categories: A list of categories that the video falls in, for example
  299. ["Sports", "Berlin"]
  300. tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
  301. cast: A list of the video cast
  302. is_live: True, False, or None (=unknown). Whether this video is a
  303. live stream that goes on instead of a fixed-length video.
  304. was_live: True, False, or None (=unknown). Whether this video was
  305. originally a live stream.
  306. live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
  307. If absent, automatically set from is_live, was_live
  308. start_time: Time in seconds where the reproduction should start, as
  309. specified in the URL.
  310. end_time: Time in seconds where the reproduction should end, as
  311. specified in the URL.
  312. chapters: A list of dictionaries, with the following entries:
  313. * "start_time" - The start time of the chapter in seconds
  314. * "end_time" - The end time of the chapter in seconds
  315. * "title" (optional, string)
  316. playable_in_embed: Whether this video is allowed to play in embedded
  317. players on other sites. Can be True (=always allowed),
  318. False (=never allowed), None (=unknown), or a string
  319. specifying the criteria for embeddability (e.g. 'whitelist')
  320. availability: Under what condition the video is available. One of
  321. 'private', 'premium_only', 'subscriber_only', 'needs_auth',
  322. 'unlisted' or 'public'. Use 'InfoExtractor._availability'
  323. to set it
  324. __post_extractor: A function to be called just before the metadata is
  325. written to either disk, logger or console. The function
  326. must return a dict which will be added to the info_dict.
  327. This is useful for additional information that is
  328. time-consuming to extract. Note that the fields thus
  329. extracted will not be available to output template and
  330. match_filter. So, only "comments" and "comment_count" are
  331. currently allowed to be extracted via this method.
  332. The following fields should only be used when the video belongs to some logical
  333. chapter or section:
  334. chapter: Name or title of the chapter the video belongs to.
  335. chapter_number: Number of the chapter the video belongs to, as an integer.
  336. chapter_id: Id of the chapter the video belongs to, as a unicode string.
  337. The following fields should only be used when the video is an episode of some
  338. series, programme or podcast:
  339. series: Title of the series or programme the video episode belongs to.
  340. series_id: Id of the series or programme the video episode belongs to, as a unicode string.
  341. season: Title of the season the video episode belongs to.
  342. season_number: Number of the season the video episode belongs to, as an integer.
  343. season_id: Id of the season the video episode belongs to, as a unicode string.
  344. episode: Title of the video episode. Unlike mandatory video title field,
  345. this field should denote the exact title of the video episode
  346. without any kind of decoration.
  347. episode_number: Number of the video episode within a season, as an integer.
  348. episode_id: Id of the video episode, as a unicode string.
  349. The following fields should only be used when the media is a track or a part of
  350. a music album:
  351. track: Title of the track.
  352. track_number: Number of the track within an album or a disc, as an integer.
  353. track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
  354. as a unicode string.
  355. artist: Artist(s) of the track.
  356. genre: Genre(s) of the track.
  357. album: Title of the album the track belongs to.
  358. album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
  359. album_artist: List of all artists that appeared on the album (e.g.
  360. "Ash Borer / Fell Voices" or "Various Artists", useful for splits
  361. and compilations).
  362. disc_number: Number of the disc or other physical medium the track belongs to,
  363. as an integer.
  364. release_year: Year (YYYY) when the album was released.
  365. composer: Composer of the piece
  366. Unless mentioned otherwise, the fields should be Unicode strings.
  367. Unless mentioned otherwise, None is equivalent to absence of information.
  368. _type "playlist" indicates multiple videos.
  369. There must be a key "entries", which is a list, an iterable, or a PagedList
  370. object, each element of which is a valid dictionary by this specification.
  371. Additionally, playlists can have "id", "title", and any other relevant
  372. attributes with the same semantics as videos (see above).
  373. It can also have the following optional fields:
  374. playlist_count: The total number of videos in a playlist. If not given,
  375. YoutubeDL tries to calculate it from "entries"
  376. _type "multi_video" indicates that there are multiple videos that
  377. form a single show, for example, multiple acts of an opera or TV episode.
  378. It must have an entries key like a playlist and contain all the keys
  379. required for a video at the same time.
  380. _type "url" indicates that the video must be extracted from another
  381. location, possibly by a different extractor. Its only required key is:
  382. "url" - the next URL to extract.
  383. The key "ie_key" can be set to the class name (minus the trailing "IE",
  384. e.g. "Youtube") if the extractor class is known in advance.
  385. Additionally, the dictionary may have any properties of the resolved entity
  386. known in advance, for example "title" if the title of the referred video is
  387. known ahead of time.
  388. _type "url_transparent" entities have the same specification as "url", but
  389. indicate that the given additional information is more precise than the one
  390. associated with the resolved URL.
  391. This is useful when a site employs a video service that hosts the video and
  392. its technical metadata, but that video service does not embed a useful
  393. title, description etc.
  394. Subclasses of this should define a _VALID_URL regexp and re-define the
  395. _real_extract() and (optionally) _real_initialize() methods.
  396. Probably, they should also be added to the list of extractors.
  397. Subclasses may also override suitable() if necessary, but ensure the function
  398. signature is preserved and that this function imports everything it needs
  399. (except other extractors), so that lazy_extractors works correctly.
  400. To support username + password (or netrc) login, the extractor must define a
  401. _NETRC_MACHINE and re-define _perform_login(username, password) and
  402. (optionally) _initialize_pre_login() methods. The _perform_login method will
  403. be called between _initialize_pre_login and _real_initialize if credentials
  404. are passed by the user. In cases where it is necessary to have the login
  405. process as part of the extraction rather than initialization, _perform_login
  406. can be left undefined.
  407. _GEO_BYPASS attribute may be set to False in order to disable
  408. geo restriction bypass mechanisms for a particular extractor.
  409. Though it won't disable explicit geo restriction bypass based on
  410. country code provided with geo_bypass_country.
  411. _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
  412. countries for this extractor. One of these countries will be used by
  413. geo restriction bypass mechanism right away in order to bypass
  414. geo restriction, of course, if the mechanism is not disabled.
  415. _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
  416. IP blocks in CIDR notation for this extractor. One of these IP blocks
  417. will be used by geo restriction bypass mechanism similarly
  418. to _GEO_COUNTRIES.
  419. The _WORKING attribute should be set to False for broken IEs
  420. in order to warn the users and skip the tests.
  421. """
  # Class-level defaults for per-instance state; __init__() and
  # set_downloader() assign the instance-level values.
  _ready = False
  _downloader = None
  _x_forwarded_for_ip = None

  # Geo restriction bypass configuration (see the class docstring).
  _GEO_BYPASS = True
  _GEO_COUNTRIES = None
  _GEO_IP_BLOCKS = None

  # Set to False for broken IEs (see the class docstring).
  _WORKING = True
  # supports_login() reports True only when this is set to a truthy value.
  _NETRC_MACHINE = None
  IE_DESC = None

  # Hint sentences appended to login-related messages, keyed by the
  # authentication method being suggested to the user.
  _LOGIN_HINTS = {
      'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
      'cookies': (
          'Use --cookies-from-browser or --cookies for the authentication. '
          'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
      'password': 'Use --username and --password, or --netrc to provide account credentials',
  }
  438. def __init__(self, downloader=None):
  439. """Constructor. Receives an optional downloader (a YoutubeDL instance).
  440. If a downloader is not passed during initialization,
  441. it must be set using "set_downloader()" before "extract()" is called"""
  442. self._ready = False
  443. self._x_forwarded_for_ip = None
  444. self._printed_messages = set()
  445. self.set_downloader(downloader)
  446. @classmethod
  447. def _match_valid_url(cls, url):
  448. # This does not use has/getattr intentionally - we want to know whether
  449. # we have cached the regexp for *this* class, whereas getattr would also
  450. # match the superclass
  451. if '_VALID_URL_RE' not in cls.__dict__:
  452. if '_VALID_URL' not in cls.__dict__:
  453. cls._VALID_URL = cls._make_valid_url()
  454. cls._VALID_URL_RE = re.compile(cls._VALID_URL)
  455. return cls._VALID_URL_RE.match(url)
  @classmethod
  def suitable(cls, url):
      """Return True when this IE can handle the given URL, False otherwise."""
      # Keep this function self-contained (it may import whatever it needs,
      # except other extractors) so that lazy_extractors keeps working.
      match = cls._match_valid_url(url)
      return match is not None
  462. @classmethod
  463. def _match_id(cls, url):
  464. return cls._match_valid_url(url).group('id')
  @classmethod
  def get_temp_id(cls, url):
      """Best-effort id for url: the result of _match_id(), or None on failure."""
      try:
          temp_id = cls._match_id(url)
      except (IndexError, AttributeError):
          temp_id = None
      return temp_id
  @classmethod
  def working(cls):
      """Return the value of the _WORKING class attribute."""
      is_working = cls._WORKING
      return is_working
  @classmethod
  def supports_login(cls):
      """Return True when the class defines a truthy _NETRC_MACHINE."""
      return True if cls._NETRC_MACHINE else False
  def initialize(self):
      """Prepare the instance for extraction (geo bypass, authentication, etc).

      The geo bypass set-up runs on every call; the login and
      _real_initialize() steps run only until the instance is marked ready.
      """
      self._printed_messages = set()
      geo_context = {
          'countries': self._GEO_COUNTRIES,
          'ip_blocks': self._GEO_IP_BLOCKS,
      }
      self._initialize_geo_bypass(geo_context)
      if self._ready:
          return
      self._initialize_pre_login()
      if self.supports_login():
          username, password = self._get_login_info()
          if username:
              self._perform_login(username, password)
      elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
          # Credentials were supplied but this extractor cannot use them
          self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}')
      self._real_initialize()
      self._ready = True
  495. def _initialize_geo_bypass(self, geo_bypass_context):
  496. """
  497. Initialize geo restriction bypass mechanism.
  498. This method is used to initialize geo bypass mechanism based on faking
  499. X-Forwarded-For HTTP header. A random country from provided country list
  500. is selected and a random IP belonging to this country is generated. This
  501. IP will be passed as X-Forwarded-For HTTP header in all subsequent
  502. HTTP requests.
  503. This method will be used for initial geo bypass mechanism initialization
  504. during the instance initialization with _GEO_COUNTRIES and
  505. _GEO_IP_BLOCKS.
  506. You may also manually call it from extractor's code if geo bypass
  507. information is not available beforehand (e.g. obtained during
  508. extraction) or due to some other reason. In this case you should pass
  509. this information in geo bypass context passed as first argument. It may
  510. contain following fields:
  511. countries: List of geo unrestricted countries (similar
  512. to _GEO_COUNTRIES)
  513. ip_blocks: List of geo unrestricted IP blocks in CIDR notation
  514. (similar to _GEO_IP_BLOCKS)
  515. """
  516. if not self._x_forwarded_for_ip:
  517. # Geo bypass mechanism is explicitly disabled by user
  518. if not self.get_param('geo_bypass', True):
  519. return
  520. if not geo_bypass_context:
  521. geo_bypass_context = {}
  522. # Backward compatibility: previously _initialize_geo_bypass
  523. # expected a list of countries, some 3rd party code may still use
  524. # it this way
  525. if isinstance(geo_bypass_context, (list, tuple)):
  526. geo_bypass_context = {
  527. 'countries': geo_bypass_context,
  528. }
  529. # The whole point of geo bypass mechanism is to fake IP
  530. # as X-Forwarded-For HTTP header based on some IP block or
  531. # country code.
  532. # Path 1: bypassing based on IP block in CIDR notation
  533. # Explicit IP block specified by user, use it right away
  534. # regardless of whether extractor is geo bypassable or not
  535. ip_block = self.get_param('geo_bypass_ip_block', None)
  536. # Otherwise use random IP block from geo bypass context but only
  537. # if extractor is known as geo bypassable
  538. if not ip_block:
  539. ip_blocks = geo_bypass_context.get('ip_blocks')
  540. if self._GEO_BYPASS and ip_blocks:
  541. ip_block = random.choice(ip_blocks)
  542. if ip_block:
  543. self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
  544. self._downloader.write_debug(
  545. '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
  546. return
  547. # Path 2: bypassing based on country code
  548. # Explicit country code specified by user, use it right away
  549. # regardless of whether extractor is geo bypassable or not
  550. country = self.get_param('geo_bypass_country', None)
  551. # Otherwise use random country code from geo bypass context but
  552. # only if extractor is known as geo bypassable
  553. if not country:
  554. countries = geo_bypass_context.get('countries')
  555. if self._GEO_BYPASS and countries:
  556. country = random.choice(countries)
  557. if country:
  558. self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
  559. self._downloader.write_debug(
  560. 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
  def extract(self, url):
      """Extract information for the given URL.

      Returns whatever _real_extract() returned (None is passed through).
      When a fake X-Forwarded-For IP is in use it is recorded in the result
      under '__x_forwarded_for_ip'. The 'live_chat' subtitles entry is
      removed when the 'no-live-chat' compat option is set.

      Raises ExtractorError (or the same subclass that was originally
      raised) enriched with the video id, IE name and traceback;
      IncompleteRead, KeyError and StopIteration are wrapped into
      ExtractorError. UnsupportedError is re-raised untouched.
      """
      try:
          # At most two attempts: the second one only happens after a geo
          # restriction error for which a fake IP could be set up.
          for _ in range(2):
              try:
                  self.initialize()
                  self.write_debug('Extracting URL: %s' % url)
                  ie_result = self._real_extract(url)
                  if ie_result is None:
                      return None
                  if self._x_forwarded_for_ip:
                      ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                  subtitles = ie_result.get('subtitles')
                  if (subtitles and 'live_chat' in subtitles
                          and 'no-live-chat' in self.get_param('compat_opts', [])):
                      del subtitles['live_chat']
                  return ie_result
              except GeoRestrictedError as e:
                  if self.__maybe_fake_ip_and_retry(e.countries):
                      continue
                  raise
      except UnsupportedError:
          # Must precede the ExtractorError clause so that it is not rewrapped
          # NOTE(review): presumably UnsupportedError subclasses ExtractorError - confirm
          raise
      except ExtractorError as e:
          # Rebuild the error with the same type, filling in missing context
          kwargs = {
              'video_id': e.video_id or self.get_temp_id(url),
              'ie': self.IE_NAME,
              'tb': e.traceback or sys.exc_info()[2],
              'expected': e.expected,
              'cause': e.cause
          }
          if hasattr(e, 'countries'):
              kwargs['countries'] = e.countries
          raise type(e)(e.orig_msg, **kwargs)
      except compat_http_client.IncompleteRead as e:
          raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
      except (KeyError, StopIteration) as e:
          raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
  def __maybe_fake_ip_and_retry(self, countries):
      """Set up a fake X-Forwarded-For IP from one of the given countries.

      Returns True when a fake IP was put in place (the caller should retry
      the extraction) and False otherwise.
      """
      may_bypass = (
          not self.get_param('geo_bypass_country', None)
          and self._GEO_BYPASS
          and self.get_param('geo_bypass', True)
          and not self._x_forwarded_for_ip
          and countries)
      if not may_bypass:
          return False
      country_code = random.choice(countries)
      self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
      if not self._x_forwarded_for_ip:
          return False
      self.report_warning(
          'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
          % (self._x_forwarded_for_ip, country_code.upper()))
      return True
  def set_downloader(self, downloader):
      """Attach the given YoutubeDL instance to this IE as its downloader."""
      self._downloader = downloader
  616. def _initialize_pre_login(self):
  617. """ Intialization before login. Redefine in subclasses."""
  618. pass
  619. def _perform_login(self, username, password):
  620. """ Login with username and password. Redefine in subclasses."""
  621. pass
  622. def _real_initialize(self):
  623. """Real initialization process. Redefine in subclasses."""
  624. pass
  625. def _real_extract(self, url):
  626. """Real extraction process. Redefine in subclasses."""
  627. raise NotImplementedError('This method must be implemented by subclasses')
  @classmethod
  def ie_key(cls):
      """Return the key for this IE: the class name minus its last two characters."""
      class_name = cls.__name__
      return class_name[:-2]
  @property
  def IE_NAME(self):
      """Name of this IE: the instance's class name minus its last two characters."""
      class_name = type(self).__name__
      return compat_str(class_name[:-2])
  @staticmethod
  def __can_accept_status_code(err, expected_status):
      """Tell whether the failed status code of err is accepted by expected_status.

      expected_status may be None (nothing is accepted), a callable taking
      the status code (accepted only when it returns exactly True), or
      anything else, in which case it is passed through variadic() and
      tested for membership.
      """
      assert isinstance(err, compat_urllib_error.HTTPError)
      if expected_status is None:
          return False
      if callable(expected_status):
          verdict = expected_status(err.code)
          return verdict is True
      accepted_codes = variadic(expected_status)
      return err.code in accepted_codes
  def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
      """
      Return the response handle.

      See _download_webpage docstring for arguments specification.

      Returns False instead of a handle when the request failed and either
      errnote is False or fatal is false; raises ExtractorError when the
      request failed and fatal is true.
      """
      # Throttle every request but the very first one of the downloader
      if not self._downloader._first_webpage_request:
          sleep_interval = self.get_param('sleep_interval_requests') or 0
          if sleep_interval > 0:
              self.to_screen('Sleeping %s seconds ...' % sleep_interval)
              time.sleep(sleep_interval)
      else:
          self._downloader._first_webpage_request = False

      if note is None:
          self.report_download_webpage(video_id)
      elif note is not False:
          if video_id is None:
              self.to_screen('%s' % (note,))
          else:
              self.to_screen('%s: %s' % (video_id, note))

      # Some sites check X-Forwarded-For HTTP header in order to figure out
      # the origin of the client behind proxy. This allows bypassing geo
      # restriction by faking this header's value to IP that belongs to some
      # geo unrestricted country. We will do so once we encounter any
      # geo restriction error.
      if self._x_forwarded_for_ip and 'X-Forwarded-For' not in headers:
          # Build a new dict instead of writing into `headers`: it may be
          # the shared default argument or a dict owned by the caller, and
          # the fake IP must not leak into unrelated later requests.
          headers = {**headers, 'X-Forwarded-For': self._x_forwarded_for_ip}

      if isinstance(url_or_request, compat_urllib_request.Request):
          url_or_request = update_Request(
              url_or_request, data=data, headers=headers, query=query)
      else:
          if query:
              url_or_request = update_url_query(url_or_request, query)
          if data is not None or headers:
              url_or_request = sanitized_Request(url_or_request, data, headers)
      try:
          return self._downloader.urlopen(url_or_request)
      except network_exceptions as err:
          if isinstance(err, compat_urllib_error.HTTPError):
              if self.__can_accept_status_code(err, expected_status):
                  # Retain reference to error to prevent file object from
                  # being closed before it can be read. Works around the
                  # effects of <https://bugs.python.org/issue15002>
                  # introduced in Python 3.4.1.
                  err.fp._error = err
                  return err.fp

          if errnote is False:
              return False
          if errnote is None:
              errnote = 'Unable to download webpage'

          errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
          if fatal:
              raise ExtractorError(errmsg, cause=err)
          else:
              self.report_warning(errmsg)
              return False
  def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
      """
      Return a tuple (page content as string, URL handle).

      False is returned instead when the request failed non-fatally.
      See _download_webpage docstring for arguments specification.
      """
      # Strip hashes from the URL (#1038)
      if isinstance(url_or_request, (compat_str, str)):
          url_or_request, _, _ = url_or_request.partition('#')

      handle = self._request_webpage(
          url_or_request, video_id, note, errnote, fatal,
          data=data, headers=headers, query=query, expected_status=expected_status)
      if handle is False:
          assert not fatal
          return False
      page = self._webpage_read_content(
          handle, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
      return (page, handle)
  714. @staticmethod
  715. def _guess_encoding_from_content(content_type, webpage_bytes):
  716. m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
  717. if m:
  718. encoding = m.group(1)
  719. else:
  720. m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
  721. webpage_bytes[:1024])
  722. if m:
  723. encoding = m.group(1).decode('ascii')
  724. elif webpage_bytes.startswith(b'\xff\xfe'):
  725. encoding = 'utf-16'
  726. else:
  727. encoding = 'utf-8'
  728. return encoding
  729. def __check_blocked(self, content):
  730. first_block = content[:512]
  731. if ('<title>Access to this site is blocked</title>' in content
  732. and 'Websense' in first_block):
  733. msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
  734. blocked_iframe = self._html_search_regex(
  735. r'<iframe src="([^"]+)"', content,
  736. 'Websense information URL', default=None)
  737. if blocked_iframe:
  738. msg += ' Visit %s for more details' % blocked_iframe
  739. raise ExtractorError(msg, expected=True)
  740. if '<title>The URL you requested has been blocked</title>' in first_block:
  741. msg = (
  742. 'Access to this webpage has been blocked by Indian censorship. '
  743. 'Use a VPN or proxy server (with --proxy) to route around it.')
  744. block_msg = self._html_search_regex(
  745. r'</h1><p>(.*?)</p>',
  746. content, 'block message', default=None)
  747. if block_msg:
  748. msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
  749. raise ExtractorError(msg, expected=True)
  750. if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
  751. and 'blocklist.rkn.gov.ru' in content):
  752. raise ExtractorError(
  753. 'Access to this webpage has been blocked by decision of the Russian government. '
  754. 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
  755. expected=True)
  def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
      """Read the body of urlh and return it decoded as a string.

      prefix -- bytes prepended to the body before decoding (optional)
      encoding -- encoding to decode with; guessed from the Content-Type
                  header and the body when not given

      Depending on the 'dump_intermediate_pages' and 'write_pages' params
      the raw body is also printed base64-encoded and/or saved to a
      '.dump' file. Raises ExtractorError when the decoded page is
      recognized as a block page.
      """
      content_type = urlh.headers.get('Content-Type', '')
      webpage_bytes = urlh.read()
      if prefix is not None:
          webpage_bytes = prefix + webpage_bytes
      if not encoding:
          encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
      if self.get_param('dump_intermediate_pages', False):
          self.to_screen('Dumping request to ' + urlh.geturl())
          dump = base64.b64encode(webpage_bytes).decode('ascii')
          self._downloader.to_screen(dump)
      if self.get_param('write_pages', False):
          basen = '%s_%s' % (video_id, urlh.geturl())
          trim_length = self.get_param('trim_file_name') or 240
          if len(basen) > trim_length:
              # Over-long names are cut and suffixed with a hash of the full name
              h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
              basen = basen[:trim_length - len(h)] + h
          raw_filename = basen + '.dump'
          filename = sanitize_filename(raw_filename, restricted=True)
          self.to_screen('Saving request to ' + filename)
          # Working around MAX_PATH limitation on Windows (see
          # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
          if compat_os_name == 'nt':
              absfilepath = os.path.abspath(filename)
              if len(absfilepath) > 259:
                  filename = '\\\\?\\' + absfilepath
          with open(filename, 'wb') as outf:
              outf.write(webpage_bytes)

      try:
          content = webpage_bytes.decode(encoding, 'replace')
      except LookupError:
          # Unknown encoding name: fall back to UTF-8
          content = webpage_bytes.decode('utf-8', 'replace')

      self.__check_blocked(content)

      return content
  790. def _download_webpage(
  791. self, url_or_request, video_id, note=None, errnote=None,
  792. fatal=True, tries=1, timeout=5, encoding=None, data=None,
  793. headers={}, query={}, expected_status=None):
  794. """
  795. Return the data of the page as a string.
  796. Arguments:
  797. url_or_request -- plain text URL as a string or
  798. a compat_urllib_request.Requestobject
  799. video_id -- Video/playlist/item identifier (string)
  800. Keyword arguments:
  801. note -- note printed before downloading (string)
  802. errnote -- note printed in case of an error (string)
  803. fatal -- flag denoting whether error should be considered fatal,
  804. i.e. whether it should cause ExtractionError to be raised,
  805. otherwise a warning will be reported and extraction continued
  806. tries -- number of tries
  807. timeout -- sleep interval between tries
  808. encoding -- encoding for a page content decoding, guessed automatically
  809. when not explicitly specified
  810. data -- POST data (bytes)
  811. headers -- HTTP headers (dict)
  812. query -- URL query (dict)
  813. expected_status -- allows to accept failed HTTP requests (non 2xx
  814. status code) by explicitly specifying a set of accepted status
  815. codes. Can be any of the following entities:
  816. - an integer type specifying an exact failed status code to
  817. accept
  818. - a list or a tuple of integer types specifying a list of
  819. failed status codes to accept
  820. - a callable accepting an actual failed status code and
  821. returning True if it should be accepted
  822. Note that this argument does not affect success status codes (2xx)
  823. which are always accepted.
  824. """
  825. success = False
  826. try_count = 0
  827. while success is False:
  828. try:
  829. res = self._download_webpage_handle(
  830. url_or_request, video_id, note, errnote, fatal,
  831. encoding=encoding, data=data, headers=headers, query=query,
  832. expected_status=expected_status)
  833. success = True
  834. except compat_http_client.IncompleteRead as e:
  835. try_count += 1
  836. if try_count >= tries:
  837. raise e
  838. self._sleep(timeout, video_id)
  839. if res is False:
  840. return res
  841. else:
  842. content, _ = res
  843. return content
  844. def _download_xml_handle(
  845. self, url_or_request, video_id, note='Downloading XML',
  846. errnote='Unable to download XML', transform_source=None,
  847. fatal=True, encoding=None, data=None, headers={}, query={},
  848. expected_status=None):
  849. """
  850. Return a tuple (xml as an compat_etree_Element, URL handle).
  851. See _download_webpage docstring for arguments specification.
  852. """
  853. res = self._download_webpage_handle(
  854. url_or_request, video_id, note, errnote, fatal=fatal,
  855. encoding=encoding, data=data, headers=headers, query=query,
  856. expected_status=expected_status)
  857. if res is False:
  858. return res
  859. xml_string, urlh = res
  860. return self._parse_xml(
  861. xml_string, video_id, transform_source=transform_source,
  862. fatal=fatal), urlh
  863. def _download_xml(
  864. self, url_or_request, video_id,
  865. note='Downloading XML', errnote='Unable to download XML',
  866. transform_source=None, fatal=True, encoding=None,
  867. data=None, headers={}, query={}, expected_status=None):
  868. """
  869. Return the xml as an compat_etree_Element.
  870. See _download_webpage docstring for arguments specification.
  871. """
  872. res = self._download_xml_handle(
  873. url_or_request, video_id, note=note, errnote=errnote,
  874. transform_source=transform_source, fatal=fatal, encoding=encoding,
  875. data=data, headers=headers, query=query,
  876. expected_status=expected_status)
  877. return res if res is False else res[0]
  def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
      """Parse xml_string and return the resulting element.

      transform_source, when given, is applied to the string first. On a
      parse error an ExtractorError is raised if fatal is true; otherwise
      a warning is reported and None is returned.
      """
      if transform_source:
          xml_string = transform_source(xml_string)
      try:
          return compat_etree_fromstring(xml_string.encode('utf-8'))
      except compat_xml_parse_error as ve:
          errmsg = '%s: Failed to parse XML ' % video_id
          if not fatal:
              self.report_warning(errmsg + str(ve))
              return None
          raise ExtractorError(errmsg, cause=ve)
  889. def _download_json_handle(
  890. self, url_or_request, video_id, note='Downloading JSON metadata',
  891. errnote='Unable to download JSON metadata', transform_source=None,
  892. fatal=True, encoding=None, data=None, headers={}, query={},
  893. expected_status=None):
  894. """
  895. Return a tuple (JSON object, URL handle).
  896. See _download_webpage docstring for arguments specification.
  897. """
  898. res = self._download_webpage_handle(
  899. url_or_request, video_id, note, errnote, fatal=fatal,
  900. encoding=encoding, data=data, headers=headers, query=query,
  901. expected_status=expected_status)
  902. if res is False:
  903. return res
  904. json_string, urlh = res
  905. return self._parse_json(
  906. json_string, video_id, transform_source=transform_source,
  907. fatal=fatal), urlh
  908. def _download_json(
  909. self, url_or_request, video_id, note='Downloading JSON metadata',
  910. errnote='Unable to download JSON metadata', transform_source=None,
  911. fatal=True, encoding=None, data=None, headers={}, query={},
  912. expected_status=None):
  913. """
  914. Return the JSON object as a dict.
  915. See _download_webpage docstring for arguments specification.
  916. """
  917. res = self._download_json_handle(
  918. url_or_request, video_id, note=note, errnote=errnote,
  919. transform_source=transform_source, fatal=fatal, encoding=encoding,
  920. data=data, headers=headers, query=query,
  921. expected_status=expected_status)
  922. return res if res is False else res[0]
  923. def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
  924. if transform_source:
  925. json_string = transform_source(json_string)
  926. try:
  927. return json.loads(json_string, strict=False)
  928. except ValueError as ve:
  929. errmsg = '%s: Failed to parse JSON ' % video_id
  930. if fatal:
  931. raise ExtractorError(errmsg, cause=ve)
  932. else:
  933. self.report_warning(errmsg + str(ve))
  934. def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
  935. return self._parse_json(
  936. data[data.find('{'):data.rfind('}') + 1],
  937. video_id, transform_source, fatal)
  938. def _download_socket_json_handle(
  939. self, url_or_request, video_id, note='Polling socket',
  940. errnote='Unable to poll socket', transform_source=None,
  941. fatal=True, encoding=None, data=None, headers={}, query={},
  942. expected_status=None):
  943. """
  944. Return a tuple (JSON object, URL handle).
  945. See _download_webpage docstring for arguments specification.
  946. """
  947. res = self._download_webpage_handle(
  948. url_or_request, video_id, note, errnote, fatal=fatal,
  949. encoding=encoding, data=data, headers=headers, query=query,
  950. expected_status=expected_status)
  951. if res is False:
  952. return res
  953. webpage, urlh = res
  954. return self._parse_socket_response_as_json(
  955. webpage, video_id, transform_source=transform_source,
  956. fatal=fatal), urlh
  957. def _download_socket_json(
  958. self, url_or_request, video_id, note='Polling socket',
  959. errnote='Unable to poll socket', transform_source=None,
  960. fatal=True, encoding=None, data=None, headers={}, query={},
  961. expected_status=None):
  962. """
  963. Return the JSON object as a dict.
  964. See _download_webpage docstring for arguments specification.
  965. """
  966. res = self._download_socket_json_handle(
  967. url_or_request, video_id, note=note, errnote=errnote,
  968. transform_source=transform_source, fatal=fatal, encoding=encoding,
  969. data=data, headers=headers, query=query,
  970. expected_status=expected_status)
  971. return res if res is False else res[0]
  def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
      """Report a warning through the downloader, prefixed with '[ie_name]'.

      With only_once set, a message that was already reported by this
      instance is silently dropped.
      """
      idstr = format_field(video_id, template='%s: ')
      msg = f'[{self.IE_NAME}] {idstr}{msg}'
      if only_once:
          seen_key = f'WARNING: {msg}'
          if seen_key in self._printed_messages:
              return
          self._printed_messages.add(seen_key)
      self._downloader.report_warning(msg, *args, **kwargs)
  def to_screen(self, msg, *args, **kwargs):
      """Print msg to screen through the downloader, prefixing it with '[ie_name]'"""
      prefixed = '[%s] %s' % (self.IE_NAME, msg)
      self._downloader.to_screen(prefixed, *args, **kwargs)
  def write_debug(self, msg, *args, **kwargs):
      """Pass msg to the downloader's write_debug(), prefixing it with '[ie_name]'"""
      prefixed = '[%s] %s' % (self.IE_NAME, msg)
      self._downloader.write_debug(prefixed, *args, **kwargs)
  def get_param(self, name, default=None, *args, **kwargs):
      """Look up name in the downloader's params; default when there is no downloader."""
      downloader = self._downloader
      if not downloader:
          return default
      return downloader.params.get(name, default, *args, **kwargs)
  def report_drm(self, video_id, partial=False):
      """Report video_id as DRM protected via raise_no_formats() (expected error).

      The partial argument is accepted but not used.
      """
      message = 'This video is DRM protected'
      self.raise_no_formats(message, expected=True, video_id=video_id)
  def report_extraction(self, id_or_name):
      """Print the 'Extracting information' progress message for id_or_name."""
      message = '%s: Extracting information' % id_or_name
      self.to_screen(message)
  def report_download_webpage(self, video_id):
      """Print the 'Downloading webpage' progress message for video_id."""
      message = '%s: Downloading webpage' % video_id
      self.to_screen(message)
  def report_age_confirmation(self):
      """Print a message telling that age confirmation is being attempted."""
      message = 'Confirming age'
      self.to_screen(message)
  def report_login(self):
      """Print a message telling that a login is being attempted."""
      message = 'Logging in'
      self.to_screen(message)
  def raise_login_required(
          self, msg='This video is only available for registered users',
          metadata_available=False, method=NO_DEFAULT):
      """Raise an expected ExtractorError telling that a login is needed.

      msg -- base error message
      metadata_available -- when true and either 'ignore_no_formats_error'
          or 'wait_for_video' is set, only a warning is reported
      method -- key of _LOGIN_HINTS whose hint is appended to msg; None
          appends nothing; when omitted, 'any' or 'cookies' is chosen
          depending on supports_login()
      """
      if metadata_available:
          tolerate = (
              self.get_param('ignore_no_formats_error')
              or self.get_param('wait_for_video'))
          if tolerate:
              self.report_warning(msg)
              return
      if method is NO_DEFAULT:
          if self.supports_login():
              method = 'any'
          else:
              method = 'cookies'
      if method is not None:
          assert method in self._LOGIN_HINTS, 'Invalid login method'
          hint = self._LOGIN_HINTS[method]
          msg = '%s. %s' % (msg, hint)
      raise ExtractorError(msg, expected=True)
  def raise_geo_restricted(
          self, msg='This video is not available from your location due to geo restriction',
          countries=None, metadata_available=False):
      """Raise GeoRestrictedError carrying msg and countries.

      When metadata_available is true and either 'ignore_no_formats_error'
      or 'wait_for_video' is set, only a warning is reported instead.
      """
      if metadata_available:
          tolerate = (
              self.get_param('ignore_no_formats_error')
              or self.get_param('wait_for_video'))
          if tolerate:
              self.report_warning(msg)
              return
      raise GeoRestrictedError(msg, countries=countries)
  def raise_no_formats(self, msg, expected=False, video_id=None):
      """Raise an ExtractorError for a video without usable formats.

      msg may be a message or an ExtractorError instance (raised as is).
      When expected is true and either 'ignore_no_formats_error' or
      'wait_for_video' is set, only a warning is reported instead.
      """
      if expected:
          tolerate = (
              self.get_param('ignore_no_formats_error')
              or self.get_param('wait_for_video'))
          if tolerate:
              self.report_warning(msg, video_id)
              return
      if isinstance(msg, ExtractorError):
          raise msg
      raise ExtractorError(msg, expected=expected, video_id=video_id)
  1032. # Methods for following #608
  @staticmethod
  def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
      """Build a "url" (or "url_transparent") result pointing to a page to be processed.

      ie may be an ie_key string or an object with an ie_key() method.
      Extra keyword arguments are copied into the result.
      """
      if ie is not None:
          if isinstance(ie, str):
              kwargs['ie_key'] = ie
          else:
              kwargs['ie_key'] = ie.ie_key()
      if video_id is not None:
          kwargs['id'] = video_id
      if video_title is not None:
          kwargs['title'] = video_title
      result = dict(kwargs)
      result['_type'] = 'url_transparent' if url_transparent else 'url'
      result['url'] = url
      return result
  def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
      """Build a playlist result out of URL matches.

      Every match (mapped through getter when one is given) is
      de-duplicated with orderedSet(), passed through _proto_relative_url()
      and wrapped with url_result(); the entries are produced lazily.
      """
      candidates = map(getter, matches) if getter else matches
      unique_matches = orderedSet(candidates)
      entries = (
          self.url_result(self._proto_relative_url(item), ie, **(video_kwargs or {}))
          for item in unique_matches)
      return self.playlist_result(entries, playlist_id, playlist_title, **kwargs)
  @staticmethod
  def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
      """Build a "playlist" (or "multi_video") result around entries.

      Falsy ids and titles are left out; the description is left out only
      when it is None. Extra keyword arguments are copied into the result.
      """
      if playlist_id:
          kwargs['id'] = playlist_id
      if playlist_title:
          kwargs['title'] = playlist_title
      if playlist_description is not None:
          kwargs['description'] = playlist_description
      result = dict(kwargs)
      result['_type'] = 'multi_video' if multi_video else 'playlist'
      result['entries'] = entries
      return result
  def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
      """
      Perform a regex search on the given string, using a single or a list of
      patterns returning the first matching group.
      In case of failure return a default value or raise a WARNING or a
      RegexNotFoundError, depending on fatal, specifying the field name.

      pattern may be a single pattern (str or compiled regex) or an iterable
      of patterns which are tried in order until one matches.
      group selects what is returned on success: None returns the first
      group that is not None, a list/tuple returns a tuple with those groups,
      anything else is passed to the match object's group().
      """
      if isinstance(pattern, (str, compat_str, compiled_regex_type)):
          mobj = re.search(pattern, string, flags)
      else:
          # Start from "no match" so that an empty collection of patterns takes
          # the regular failure path instead of raising UnboundLocalError
          mobj = None
          for p in pattern:
              mobj = re.search(p, string, flags)
              if mobj:
                  break

      _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

      if mobj:
          if group is None:
              # return the first matching group
              return next(g for g in mobj.groups() if g is not None)
          elif isinstance(group, (list, tuple)):
              return tuple(mobj.group(g) for g in group)
          else:
              return mobj.group(group)
      elif default is not NO_DEFAULT:
          return default
      elif fatal:
          raise RegexNotFoundError('Unable to extract %s' % _name)
      else:
          self.report_warning('unable to extract %s' % _name + bug_reports_message())
          return None
  def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
      """
      Like _search_regex, but strips HTML tags and unescapes entities.

      Takes the same arguments as _search_regex. Falsy results (no match,
      empty string, falsy default) are returned unchanged. When several
      groups were requested the result is a tuple, and every truthy member
      is cleaned individually.
      """
      res = self._search_regex(pattern, string, name, default, fatal, flags, group)
      if isinstance(res, tuple):
          # _search_regex returns a tuple when group is a list/tuple;
          # clean_html only works on a single string
          return tuple(clean_html(item).strip() if item else item for item in res)
      if res:
          return clean_html(res).strip()
      return res
  1104. def _get_netrc_login_info(self, netrc_machine=None):
  1105. username = None
  1106. password = None
  1107. netrc_machine = netrc_machine or self._NETRC_MACHINE
  1108. if self.get_param('usenetrc', False):
  1109. try:
  1110. netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
  1111. if os.path.isdir(netrc_file):
  1112. netrc_file = os.path.join(netrc_file, '.netrc')
  1113. info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
  1114. if info is not None:
  1115. username = info[0]
  1116. password = info[2]
  1117. else:
  1118. raise netrc.NetrcParseError(
  1119. 'No authenticators for %s' % netrc_machine)
  1120. except (IOError, netrc.NetrcParseError) as err:
  1121. self.report_warning(
  1122. 'parsing .netrc: %s' % error_to_compat_str(err))
  1123. return username, password
  1124. def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
  1125. """
  1126. Get the login info as (username, password)
  1127. First look for the manually specified credentials using username_option
  1128. and password_option as keys in params dictionary. If no such credentials
  1129. available look in the netrc file using the netrc_machine or _NETRC_MACHINE
  1130. value.
  1131. If there's no info available, return (None, None)
  1132. """
  1133. # Attempt to use provided username and password or .netrc data
  1134. username = self.get_param(username_option)
  1135. if username is not None:
  1136. password = self.get_param(password_option)
  1137. else:
  1138. username, password = self._get_netrc_login_info(netrc_machine)
  1139. return username, password
  1140. def _get_tfa_info(self, note='two-factor verification code'):
  1141. """
  1142. Get the two-factor authentication info
  1143. TODO - asking the user will be required for sms/phone verify
  1144. currently just uses the command line option
  1145. If there's no info available, return None
  1146. """
  1147. tfa = self.get_param('twofactor')
  1148. if tfa is not None:
  1149. return tfa
  1150. return compat_getpass('Type %s and press [Return]: ' % note)
  1151. # Helper functions for extracting OpenGraph info
  1152. @staticmethod
  1153. def _og_regexes(prop):
  1154. content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
  1155. property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
  1156. % {'prop': re.escape(prop)})
  1157. template = r'<meta[^>]+?%s[^>]+?%s'
  1158. return [
  1159. template % (property_re, content_re),
  1160. template % (content_re, property_re),
  1161. ]
  1162. @staticmethod
  1163. def _meta_regex(prop):
  1164. return r'''(?isx)<meta
  1165. (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
  1166. [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
  def _og_search_property(self, prop, html, name=None, **kargs):
      """Search html for an OpenGraph property and return its unescaped content.

      prop may be a single property name or several alternatives. name is
      the label passed to _search_regex and defaults to 'OpenGraph ' followed
      by the first property. Remaining keyword arguments are forwarded to
      _search_regex. Returns None when _search_regex yields None.
      """
      props = variadic(prop)
      label = 'OpenGraph %s' % props[0] if name is None else name
      og_regexes = [regex for p in props for regex in self._og_regexes(p)]
      escaped = self._search_regex(og_regexes, html, label, flags=re.DOTALL, **kargs)
      return None if escaped is None else unescapeHTML(escaped)
  1178. def _og_search_thumbnail(self, html, **kargs):
  1179. return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
  1180. def _og_search_description(self, html, **kargs):
  1181. return self._og_search_property('description', html, fatal=False, **kargs)
  1182. def _og_search_title(self, html, **kargs):
  1183. kargs.setdefault('fatal', False)
  1184. return self._og_search_property('title', html, **kargs)
  1185. def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
  1186. regexes = self._og_regexes('video') + self._og_regexes('video:url')
  1187. if secure:
  1188. regexes = self._og_regexes('video:secure_url') + regexes
  1189. return self._html_search_regex(regexes, html, name, **kargs)
  1190. def _og_search_url(self, html, **kargs):
  1191. return self._og_search_property('url', html, **kargs)
  1192. def _html_extract_title(self, html, name, **kwargs):
  1193. return self._html_search_regex(
  1194. r'(?s)<title>(.*?)</title>', html, name, **kwargs)
  def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
      """Return the cleaned content of the <meta> tag identified by name.

      name may be a single identifier or several alternatives, each turned
      into a pattern with _meta_regex. display_name is the label used for
      reporting and defaults to the first name. Remaining keyword arguments
      are forwarded to _html_search_regex.
      """
      names = variadic(name)
      label = names[0] if display_name is None else display_name
      meta_regexes = [self._meta_regex(candidate) for candidate in names]
      return self._html_search_regex(
          meta_regexes, html, label, fatal=fatal, group='content', **kwargs)
  1202. def _dc_search_uploader(self, html):
  1203. return self._html_search_meta('dc.creator', html, 'uploader')
  1204. def _rta_search(self, html):
  1205. # See http://www.rtalabel.org/index.php?content=howtofaq#single
  1206. if re.search(r'(?ix)<meta\s+name="rating"\s+'
  1207. r' content="RTA-5042-1996-1400-1577-RTA"',
  1208. html):
  1209. return 18
  1210. return 0
  1211. def _media_rating_search(self, html):
  1212. # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
  1213. rating = self._html_search_meta('rating', html)
  1214. if not rating:
  1215. return None
  1216. RATING_TABLE = {
  1217. 'safe for kids': 0,
  1218. 'general': 8,
  1219. '14 years': 14,
  1220. 'mature': 17,
  1221. 'restricted': 19,
  1222. }
  1223. return RATING_TABLE.get(rating.lower())
  1224. def _family_friendly_search(self, html):
  1225. # See http://schema.org/VideoObject
  1226. family_friendly = self._html_search_meta(
  1227. 'isFamilyFriendly', html, default=None)
  1228. if not family_friendly:
  1229. return None
  1230. RATING_TABLE = {
  1231. '1': 0,
  1232. 'true': 0,
  1233. '0': 18,
  1234. 'false': 18,
  1235. }
  1236. return RATING_TABLE.get(family_friendly.lower())
  1237. def _twitter_search_player(self, html):
  1238. return self._html_search_meta('twitter:player', html,
  1239. 'twitter card player')
  def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
      """Find all JSON-LD snippets in html and extract metadata from them.

      Every JSON_LD_RE match is parsed; dicts and the members of lists/tuples
      are collected and handed to _json_ld together with expected_type.
      A truthy result is returned as is. Otherwise the 'default' keyword
      argument is returned when given; else a RegexNotFoundError is raised
      when fatal, or a warning is reported and {} returned.
      """
      matches = list(re.finditer(JSON_LD_RE, html))
      default = kwargs.get('default', NO_DEFAULT)
      # JSON-LD may be malformed and thus `fatal` should be respected.
      # At the same time a `default` implies `fatal=False`, just like it
      # does for _search_regex.
      fatal = default is NO_DEFAULT and kwargs.get('fatal', True)

      collected = []
      for match in matches:
          parsed = self._parse_json(match.group('json_ld'), video_id, fatal=fatal)
          if not parsed:
              continue
          if isinstance(parsed, dict):
              parsed = [parsed]
          elif not isinstance(parsed, (list, tuple)):
              continue
          collected.extend(parsed)

      info = collected
      if collected:
          info = self._json_ld(collected, video_id, fatal=fatal, expected_type=expected_type)
      if info:
          return info

      if default is not NO_DEFAULT:
          return default
      if fatal:
          raise RegexNotFoundError('Unable to extract JSON-LD')
      self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
      return {}
  def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
      """Extract an info dict from JSON-LD data.

      json_ld may be a JSON string (parsed with _parse_json, honouring
      fatal), a single dict, or a list/tuple of dicts. Anything else, as
      well as empty input, yields {}.
      When expected_type is given, only items whose '@type' equals it are
      considered. Keys whose extracted value is None are dropped from the
      returned dict.
      """
      if isinstance(json_ld, compat_str):
          json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
      if not json_ld:
          return {}
      info = {}
      if not isinstance(json_ld, (list, tuple, dict)):
          return info
      if isinstance(json_ld, dict):
          json_ld = [json_ld]

      # Last path component of an interaction type -> prefix of the
      # '<prefix>_count' key stored in info
      INTERACTION_TYPE_MAP = {
          'CommentAction': 'comment',
          'AgreeAction': 'like',
          'DisagreeAction': 'dislike',
          'LikeAction': 'like',
          'DislikeAction': 'dislike',
          'ListenAction': 'view',
          'WatchAction': 'view',
          'ViewAction': 'view',
      }

      def extract_interaction_type(e):
          # 'interactionType' is either the type itself or a dict carrying it as '@type'
          interaction_type = e.get('interactionType')
          if isinstance(interaction_type, dict):
              interaction_type = interaction_type.get('@type')
          return str_or_none(interaction_type)

      def extract_interaction_statistic(e):
          # Fill the '<kind>_count' keys of info from 'interactionStatistic';
          # a single dict is treated like a one-element list
          interaction_statistic = e.get('interactionStatistic')
          if isinstance(interaction_statistic, dict):
              interaction_statistic = [interaction_statistic]
          if not isinstance(interaction_statistic, list):
              return
          for is_e in interaction_statistic:
              if not isinstance(is_e, dict):
                  continue
              if is_e.get('@type') != 'InteractionCounter':
                  continue
              interaction_type = extract_interaction_type(is_e)
              if not interaction_type:
                  continue
              # For interaction count some sites provide string instead of
              # an integer (as per spec) with non digit characters (e.g. ",")
              # so extracting count with more relaxed str_to_int
              interaction_count = str_to_int(is_e.get('userInteractionCount'))
              if interaction_count is None:
                  continue
              # Only the part after the last '/' is looked up in the map
              count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
              if not count_kind:
                  continue
              count_key = '%s_count' % count_kind
              # The first counter found for a kind wins
              if info.get(count_key) is not None:
                  continue
              info[count_key] = interaction_count

      def extract_chapter_information(e):
          # Build chapters from the 'Clip' entries of 'hasPart'
          chapters = [{
              'title': part.get('name'),
              'start_time': part.get('startOffset'),
              'end_time': part.get('endOffset'),
          } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
          # Fill missing boundaries from the neighbouring chapters. As zip()
          # stops at the shortest input, the last chapter is not visited here
          for idx, (last_c, current_c, next_c) in enumerate(zip(
                  [{'end_time': 0}] + chapters, chapters, chapters[1:])):
              current_c['end_time'] = current_c['end_time'] or next_c['start_time']
              current_c['start_time'] = current_c['start_time'] or last_c['end_time']
              if None in current_c.values():
                  self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                  return
          if chapters:
              # info['duration'] has been set by extract_video_object before this is called
              chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
              info['chapters'] = chapters

      def extract_video_object(e):
          # Copy the fields of a VideoObject into info, overwriting earlier values
          assert e['@type'] == 'VideoObject'
          author = e.get('author')
          info.update({
              'url': url_or_none(e.get('contentUrl')),
              'title': unescapeHTML(e.get('name')),
              'description': unescapeHTML(e.get('description')),
              'thumbnails': [{'url': url_or_none(url)}
                             for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
              'duration': parse_duration(e.get('duration')),
              'timestamp': unified_timestamp(e.get('uploadDate')),
              # author can be an instance of 'Organization' or 'Person' types.
              # both types can have 'name' property(inherited from 'Thing' type). [1]
              # however some websites are using 'Text' type instead.
              # 1. https://schema.org/VideoObject
              'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
              'filesize': float_or_none(e.get('contentSize')),
              'tbr': int_or_none(e.get('bitrate')),
              'width': int_or_none(e.get('width')),
              'height': int_or_none(e.get('height')),
              'view_count': int_or_none(e.get('interactionCount')),
          })
          extract_interaction_statistic(e)
          extract_chapter_information(e)

      def traverse_json_ld(json_ld, at_top_level=True):
          for e in json_ld:
              # Top-level items without '@context' are ignored
              if at_top_level and '@context' not in e:
                  continue
              # A top-level item consisting only of '@context' and '@graph' is a
              # container: process its graph instead and stop scanning this level
              if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                  traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                  break
              item_type = e.get('@type')
              if expected_type is not None and expected_type != item_type:
                  continue
              rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
              if rating is not None:
                  info['average_rating'] = rating
              if item_type in ('TVEpisode', 'Episode'):
                  episode_name = unescapeHTML(e.get('name'))
                  info.update({
                      'episode': episode_name,
                      'episode_number': int_or_none(e.get('episodeNumber')),
                      'description': unescapeHTML(e.get('description')),
                  })
                  # The episode name only serves as title when none was found yet
                  if not info.get('title') and episode_name:
                      info['title'] = episode_name
                  part_of_season = e.get('partOfSeason')
                  if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                      info.update({
                          'season': unescapeHTML(part_of_season.get('name')),
                          'season_number': int_or_none(part_of_season.get('seasonNumber')),
                      })
                  part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                  if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                      info['series'] = unescapeHTML(part_of_series.get('name'))
              elif item_type == 'Movie':
                  info.update({
                      'title': unescapeHTML(e.get('name')),
                      'description': unescapeHTML(e.get('description')),
                      'duration': parse_duration(e.get('duration')),
                      'timestamp': unified_timestamp(e.get('dateCreated')),
                  })
              elif item_type in ('Article', 'NewsArticle'):
                  info.update({
                      'timestamp': parse_iso8601(e.get('datePublished')),
                      'title': unescapeHTML(e.get('headline')),
                      'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                  })
                  # Articles may carry their videos as a list; only the first one is used
                  if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
                      extract_video_object(e['video'][0])
              elif item_type == 'VideoObject':
                  extract_video_object(e)
                  # Without an expected type keep scanning the remaining
                  # items, otherwise the first matching item ends the scan
                  if expected_type is None:
                      continue
                  else:
                      break
              # Any other item may embed a single VideoObject under 'video'
              video = e.get('video')
              if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                  extract_video_object(video)
              if expected_type is None:
                  continue
              else:
                  break
      traverse_json_ld(json_ld)

      # Drop the fields for which nothing could be extracted
      return dict((k, v) for k, v in info.items() if v is not None)
  1421. def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
  1422. return self._parse_json(
  1423. self._search_regex(
  1424. r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
  1425. webpage, 'next.js data', fatal=fatal, **kw),
  1426. video_id, transform_source=transform_source, fatal=fatal)
  def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
      """Parse Nuxt.js metadata and return the first entry of its 'data' list.

      This works as long as the function that context_name invokes is a pure
      function: the function's parameter names are replaced by the values it
      is called with before the returned object literal is parsed as JSON.
      """
      # Not every website uses the default context name, as it can be changed:
      # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
      escaped_context = re.escape(context_name)
      patterns = (
          r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % escaped_context,
          r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % escaped_context,
      )
      js, arg_keys, arg_vals = self._search_regex(
          patterns, webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])

      # Undefined values are passed on as JSON null
      args = {
          key: 'null' if val in ('undefined', 'void 0') else val
          for key, val in zip(arg_keys.split(','), arg_vals.split(','))}

      return self._parse_json(js_to_json(js, args), video_id)['data'][0]
  @staticmethod
  def _hidden_inputs(html):
      """Collect the name/value pairs of hidden and submit <input> elements.

      HTML comments are removed before searching. An input is keyed by its
      'name' attribute, falling back to 'id'; inputs lacking both, or lacking
      a 'value' attribute, are skipped. When a key occurs more than once the
      last value wins.
      """
      html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
      hidden_inputs = {}
      for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
          attrs = extract_attributes(input_tag)
          # The matched tag is never empty, so it is the parsed attributes
          # that have to be checked
          if not attrs:
              continue
          if attrs.get('type') not in ('hidden', 'submit'):
              continue
          name = attrs.get('name') or attrs.get('id')
          value = attrs.get('value')
          if name and value is not None:
              hidden_inputs[name] = value
      return hidden_inputs
  1456. def _form_hidden_inputs(self, form_id, html):
  1457. form = self._search_regex(
  1458. r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
  1459. html, '%s form' % form_id, group='form')
  1460. return self._hidden_inputs(form)
  class FormatSort:
      """Computes sort keys for format dicts from a list of sort fields.

      A sort item is a string made of an optional '+' (reverse the order),
      a field name and optionally a separator followed by a limit: ':' makes
      the limit a cap, '~' prefers the values closest to it.

      NOTE: `settings` is a class-level dict which the methods below update
      in place, so what they store in it is shared by all instances.
      """

      # Syntax of a single sort item, see the class docstring
      regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

      # Sort items that are always appended after the user's and the extractor's
      default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
      # Alternative ordering; not referenced anywhere within this class
      ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                      'height', 'width', 'proto', 'vext', 'abr', 'aext',
                      'fps', 'fs_approx', 'source', 'id')

      # Per-field configuration. Keys missing from an entry get their default
      # filled in (and stored) by _get_field_setting
      settings = {
          'vcodec': {'type': 'ordered', 'regex': True,
                     'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
          'acodec': {'type': 'ordered', 'regex': True,
                     'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
          'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                  'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
          'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                    'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
          'vext': {'type': 'ordered', 'field': 'video_ext',
                   'order': ('mp4', 'webm', 'flv', '', 'none'),
                   'order_free': ('webm', 'mp4', 'flv', '', 'none')},
          'aext': {'type': 'ordered', 'field': 'audio_ext',
                   'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                   'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
          'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
          'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                         'field': ('vcodec', 'acodec'),
                         'function': lambda it: int(any(v != 'none' for v in it))},
          'ie_pref': {'priority': True, 'type': 'extractor'},
          'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
          'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
          'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
          'quality': {'convert': 'float', 'default': -1},
          'filesize': {'convert': 'bytes'},
          'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
          'id': {'convert': 'string', 'field': 'format_id'},
          'height': {'convert': 'float_none'},
          'width': {'convert': 'float_none'},
          'fps': {'convert': 'float_none'},
          'tbr': {'convert': 'float_none'},
          'vbr': {'convert': 'float_none'},
          'abr': {'convert': 'float_none'},
          'asr': {'convert': 'float_none'},
          'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

          'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
          'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
          'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
          'ext': {'type': 'combined', 'field': ('vext', 'aext')},
          'res': {'type': 'multiple', 'field': ('height', 'width'),
                  'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

          # For compatibility with youtube-dl
          'format_id': {'type': 'alias', 'field': 'id'},
          'preference': {'type': 'alias', 'field': 'ie_pref'},
          'language_preference': {'type': 'alias', 'field': 'lang'},
          'source_preference': {'type': 'alias', 'field': 'source'},
          'protocol': {'type': 'alias', 'field': 'proto'},
          'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

          # Deprecated
          'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
          'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
          'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
          'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
          'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
          'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
          'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
          'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
          'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
          'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
          'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
          'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
          'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
          'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
          'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
          'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
          'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
          'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
          'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
          'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
      }

      def __init__(self, ie, field_preference):
          """Set up the sort order for extractor ie.

          field_preference holds the sort items requested by the extractor;
          the user's items are read from the downloader's params.
          """
          self._order = []
          self.ydl = ie._downloader
          self.evaluate_params(self.ydl.params, field_preference)
          if ie.get_param('verbose'):
              self.print_verbose_info(self.ydl.write_debug)

      def _get_field_setting(self, field, key):
          """Return setting key of field, storing its default when it is not set.

          Fields without a settings entry are deprecated: asking for their
          'forced' or 'priority' setting simply returns False, any other key
          issues a deprecation warning and registers an empty entry for them.
          """
          if field not in self.settings:
              if key in ('forced', 'priority'):
                  return False
              self.ydl.deprecation_warning(
                  f'Using arbitrary fields ({field}) for format sorting is deprecated '
                  'and may be removed in a future version')
              self.settings[field] = {}
          propObj = self.settings[field]
          if key not in propObj:
              type = propObj.get('type')
              if key == 'field':
                  default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
              elif key == 'convert':
                  default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
              else:
                  default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
              propObj[key] = default
          return propObj[key]

      def _resolve_field_value(self, field, value, convertNone=False):
          """Convert value (a string or None) according to the field's 'convert' setting.

          None is returned unchanged unless convertNone is set; any other
          value is lower-cased before it is converted.
          """
          if value is None:
              if not convertNone:
                  return None
          else:
              value = value.lower()
          conversion = self._get_field_setting(field, 'convert')
          if conversion == 'ignore':
              return None
          if conversion == 'string':
              return value
          elif conversion == 'float_none':
              return float_or_none(value)
          elif conversion == 'bytes':
              return FileDownloader.parse_bytes(value)
          elif conversion == 'order':
              # The result is the distance from the end of the order list, so
              # entries that come earlier in the list give larger numbers
              order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
              use_regex = self._get_field_setting(field, 'regex')
              list_length = len(order_list)
              # Values not found in the list are ranked at the position of ''
              empty_pos = order_list.index('') if '' in order_list else list_length + 1
              if use_regex and value is not None:
                  for i, regex in enumerate(order_list):
                      if regex and re.match(regex, value):
                          return list_length - i
                  return list_length - empty_pos  # not in list
              else:  # not regex or value = None
                  return list_length - (order_list.index(value) if value in order_list else empty_pos)
          else:
              if value.isnumeric():
                  return float(value)
              else:
                  # Not a number: from now on handle this field as a string
                  self.settings[field]['convert'] = 'string'
                  return value

      def evaluate_params(self, params, sort_extractor):
          """Build the sort order (self._order) and store the per-field sort data.

          The order is: forced default fields, priority default fields
          (left out when params['format_sort_force'] is set), the user's
          items (params['format_sort']), the extractor's items and finally
          all default items. Only the first occurrence of a field counts.
          """
          self._use_free_order = params.get('prefer_free_formats', False)
          self._sort_user = params.get('format_sort', [])
          self._sort_extractor = sort_extractor

          def add_item(field, reverse, closest, limit_text):
              field = field.lower()
              if field in self._order:
                  return
              self._order.append(field)
              limit = self._resolve_field_value(field, limit_text)
              data = {
                  'reverse': reverse,
                  'closest': False if limit is None else closest,
                  'limit_text': limit_text,
                  'limit': limit}
              if field in self.settings:
                  self.settings[field].update(data)
              else:
                  self.settings[field] = data

          sort_list = (
              tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
              + (tuple() if params.get('format_sort_force', False)
                  else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
              + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

          for item in sort_list:
              match = re.match(self.regex, item)
              if match is None:
                  raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
              field = match.group('field')
              if field is None:
                  continue
              if self._get_field_setting(field, 'type') == 'alias':
                  alias, field = field, self._get_field_setting(field, 'field')
                  if self._get_field_setting(alias, 'deprecated'):
                      self.ydl.deprecation_warning(
                          f'Format sorting alias {alias} is deprecated '
                          f'and may be removed in a future version. Please use {field} instead')
              reverse = match.group('reverse') is not None
              closest = match.group('separator') == '~'
              limit_text = match.group('limit')

              has_limit = limit_text is not None
              has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
              has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

              # A combined field expands to its member fields, each getting
              # either its own ':'-separated limit or the shared one
              fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
              limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
              limit_count = len(limits)
              for (i, f) in enumerate(fields):
                  add_item(f, reverse, closest,
                           limits[i] if i < limit_count
                           else limits[0] if has_limit and not has_multiple_limits
                           else None)

      def print_verbose_info(self, write_debug):
          """Pass the user's, the extractor's and the resulting sort order to write_debug."""
          if self._sort_user:
              write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
          if self._sort_extractor:
              write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
          write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
              '+' if self._get_field_setting(field, 'reverse') else '', field,
              '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                            self._get_field_setting(field, 'limit_text'),
                            self._get_field_setting(field, 'limit'))
              if self._get_field_setting(field, 'limit_text') is not None else '')
              for field in self._order if self._get_field_setting(field, 'visible')]))

      def _calculate_field_preference_from_value(self, format, field, type, value):
          """Turn the value of a field into a tuple that can be used as a sort key."""
          reverse = self._get_field_setting(field, 'reverse')
          closest = self._get_field_setting(field, 'closest')
          limit = self._get_field_setting(field, 'limit')

          if type == 'extractor':
              maximum = self._get_field_setting(field, 'max')
              if value is None or (maximum is not None and value >= maximum):
                  value = -1
          elif type == 'boolean':
              in_list = self._get_field_setting(field, 'in_list')
              not_in_list = self._get_field_setting(field, 'not_in_list')
              value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
          elif type == 'ordered':
              value = self._resolve_field_value(field, value, True)

          # try to convert to number
          val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
          is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
          if is_num:
              value = val_num

          return ((-10, 0) if value is None
                  else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                  else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                  else (0, value, 0) if not reverse and (limit is None or value <= limit)
                  else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                  else (-1, value, 0))

      def _calculate_field_preference(self, format, field):
          """Read the value of field from format and return its sort key tuple."""
          type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
          get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
          if type == 'multiple':
              type = 'field'  # Only 'field' is allowed in multiple for now
              actual_fields = self._get_field_setting(field, 'field')

              value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
          else:
              value = get_value(field)
          return self._calculate_field_preference_from_value(format, field, type, value)

      def calculate_preference(self, format):
          """Return the sort key of format: one tuple per field of the sort order.

          Missing 'protocol', 'ext', 'video_ext', 'audio_ext' and bitrate
          entries are filled in on the format dict itself.
          """
          # Determine missing protocol
          if not format.get('protocol'):
              format['protocol'] = determine_protocol(format)

          # Determine missing ext
          if not format.get('ext') and 'url' in format:
              format['ext'] = determine_ext(format['url'])
          if format.get('vcodec') == 'none':
              format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
              format['video_ext'] = 'none'
          else:
              format['video_ext'] = format['ext']
              format['audio_ext'] = 'none'
          # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
          #    format['preference'] = -1000

          # Determine missing bitrates
          if format.get('tbr') is None:
              if format.get('vbr') is not None and format.get('abr') is not None:
                  format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
          else:
              # A bitrate may be present in the dict but set to None; `or 0`
              # treats that like a missing key instead of failing on `tbr - None`
              if format.get('vcodec') != 'none' and format.get('vbr') is None:
                  format['vbr'] = format.get('tbr') - (format.get('abr') or 0)
              if format.get('acodec') != 'none' and format.get('abr') is None:
                  format['abr'] = format.get('tbr') - (format.get('vbr') or 0)

          return tuple(self._calculate_field_preference(format, field) for field in self._order)
  1720. def _sort_formats(self, formats, field_preference=[]):
  1721. if not formats:
  1722. return
  1723. format_sort = self.FormatSort(self, field_preference)
  1724. formats.sort(key=lambda f: format_sort.calculate_preference(f))
  1725. def _check_formats(self, formats, video_id):
  1726. if formats:
  1727. formats[:] = filter(
  1728. lambda f: self._is_valid_url(
  1729. f['url'], video_id,
  1730. item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
  1731. formats)
  1732. @staticmethod
  1733. def _remove_duplicate_formats(formats):
  1734. format_urls = set()
  1735. unique_formats = []
  1736. for f in formats:
  1737. if f['url'] not in format_urls:
  1738. format_urls.add(f['url'])
  1739. unique_formats.append(f)
  1740. formats[:] = unique_formats
  1741. def _is_valid_url(self, url, video_id, item='video', headers={}):
  1742. url = self._proto_relative_url(url, scheme='http:')
  1743. # For now assume non HTTP(S) URLs always valid
  1744. if not (url.startswith('http://') or url.startswith('https://')):
  1745. return True
  1746. try:
  1747. self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
  1748. return True
  1749. except ExtractorError as e:
  1750. self.to_screen(
  1751. '%s: %s URL is invalid, skipping: %s'
  1752. % (video_id, item, error_to_compat_str(e.cause)))
  1753. return False
  1754. def http_scheme(self):
  1755. """ Either "http:" or "https:", depending on the user's preferences """
  1756. return (
  1757. 'http:'
  1758. if self.get_param('prefer_insecure', False)
  1759. else 'https:')
  1760. def _proto_relative_url(self, url, scheme=None):
  1761. if url is None:
  1762. return url
  1763. if url.startswith('//'):
  1764. if scheme is None:
  1765. scheme = self.http_scheme()
  1766. return scheme + url
  1767. else:
  1768. return url
  1769. def _sleep(self, timeout, video_id, msg_template=None):
  1770. if msg_template is None:
  1771. msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
  1772. msg = msg_template % {'video_id': video_id, 'timeout': timeout}
  1773. self.to_screen(msg)
  1774. time.sleep(timeout)
  1775. def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
  1776. transform_source=lambda s: fix_xml_ampersands(s).strip(),
  1777. fatal=True, m3u8_id=None, data=None, headers={}, query={}):
  1778. manifest = self._download_xml(
  1779. manifest_url, video_id, 'Downloading f4m manifest',
  1780. 'Unable to download f4m manifest',
  1781. # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
  1782. # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
  1783. transform_source=transform_source,
  1784. fatal=fatal, data=data, headers=headers, query=query)
  1785. if manifest is False:
  1786. return []
  1787. return self._parse_f4m_formats(
  1788. manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
  1789. transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
  1790. def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
  1791. transform_source=lambda s: fix_xml_ampersands(s).strip(),
  1792. fatal=True, m3u8_id=None):
  1793. if not isinstance(manifest, compat_etree_Element) and not fatal:
  1794. return []
  1795. # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
  1796. akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
  1797. if akamai_pv is not None and ';' in akamai_pv.text:
  1798. playerVerificationChallenge = akamai_pv.text.split(';')[0]
  1799. if playerVerificationChallenge.strip() != '':
  1800. return []
  1801. formats = []
  1802. manifest_version = '1.0'
  1803. media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
  1804. if not media_nodes:
  1805. manifest_version = '2.0'
  1806. media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
  1807. # Remove unsupported DRM protected media from final formats
  1808. # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
  1809. media_nodes = remove_encrypted_media(media_nodes)
  1810. if not media_nodes:
  1811. return formats
  1812. manifest_base_url = get_base_url(manifest)
  1813. bootstrap_info = xpath_element(
  1814. manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
  1815. 'bootstrap info', default=None)
  1816. vcodec = None
  1817. mime_type = xpath_text(
  1818. manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
  1819. 'base URL', default=None)
  1820. if mime_type and mime_type.startswith('audio/'):
  1821. vcodec = 'none'
  1822. for i, media_el in enumerate(media_nodes):
  1823. tbr = int_or_none(media_el.attrib.get('bitrate'))
  1824. width = int_or_none(media_el.attrib.get('width'))
  1825. height = int_or_none(media_el.attrib.get('height'))
  1826. format_id = join_nonempty(f4m_id, tbr or i)
  1827. # If <bootstrapInfo> is present, the specified f4m is a
  1828. # stream-level manifest, and only set-level manifests may refer to
  1829. # external resources. See section 11.4 and section 4 of F4M spec
  1830. if bootstrap_info is None:
  1831. media_url = None
  1832. # @href is introduced in 2.0, see section 11.6 of F4M spec
  1833. if manifest_version == '2.0':
  1834. media_url = media_el.attrib.get('href')
  1835. if media_url is None:
  1836. media_url = media_el.attrib.get('url')
  1837. if not media_url:
  1838. continue
  1839. manifest_url = (
  1840. media_url if media_url.startswith('http://') or media_url.startswith('https://')
  1841. else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
  1842. # If media_url is itself a f4m manifest do the recursive extraction
  1843. # since bitrates in parent manifest (this one) and media_url manifest
  1844. # may differ leading to inability to resolve the format by requested
  1845. # bitrate in f4m downloader
  1846. ext = determine_ext(manifest_url)
  1847. if ext == 'f4m':
  1848. f4m_formats = self._extract_f4m_formats(
  1849. manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
  1850. transform_source=transform_source, fatal=fatal)
  1851. # Sometimes stream-level manifest contains single media entry that
  1852. # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
  1853. # At the same time parent's media entry in set-level manifest may
  1854. # contain it. We will copy it from parent in such cases.
  1855. if len(f4m_formats) == 1:
  1856. f = f4m_formats[0]
  1857. f.update({
  1858. 'tbr': f.get('tbr') or tbr,
  1859. 'width': f.get('width') or width,
  1860. 'height': f.get('height') or height,
  1861. 'format_id': f.get('format_id') if not tbr else format_id,
  1862. 'vcodec': vcodec,
  1863. })
  1864. formats.extend(f4m_formats)
  1865. continue
  1866. elif ext == 'm3u8':
  1867. formats.extend(self._extract_m3u8_formats(
  1868. manifest_url, video_id, 'mp4', preference=preference,
  1869. quality=quality, m3u8_id=m3u8_id, fatal=fatal))
  1870. continue
  1871. formats.append({
  1872. 'format_id': format_id,
  1873. 'url': manifest_url,
  1874. 'manifest_url': manifest_url,
  1875. 'ext': 'flv' if bootstrap_info is not None else None,
  1876. 'protocol': 'f4m',
  1877. 'tbr': tbr,
  1878. 'width': width,
  1879. 'height': height,
  1880. 'vcodec': vcodec,
  1881. 'preference': preference,
  1882. 'quality': quality,
  1883. })
  1884. return formats
  1885. def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
  1886. return {
  1887. 'format_id': join_nonempty(m3u8_id, 'meta'),
  1888. 'url': m3u8_url,
  1889. 'ext': ext,
  1890. 'protocol': 'm3u8',
  1891. 'preference': preference - 100 if preference else -100,
  1892. 'quality': quality,
  1893. 'resolution': 'multiple',
  1894. 'format_note': 'Quality selection URL',
  1895. }
  1896. def _report_ignoring_subs(self, name):
  1897. self.report_warning(bug_reports_message(
  1898. f'Ignoring subtitle tracks found in the {name} manifest; '
  1899. 'if any subtitle tracks are missing,'
  1900. ), only_once=True)
  1901. def _extract_m3u8_formats(self, *args, **kwargs):
  1902. fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
  1903. if subs:
  1904. self._report_ignoring_subs('HLS')
  1905. return fmts
  1906. def _extract_m3u8_formats_and_subtitles(
  1907. self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
  1908. preference=None, quality=None, m3u8_id=None, note=None,
  1909. errnote=None, fatal=True, live=False, data=None, headers={},
  1910. query={}):
  1911. res = self._download_webpage_handle(
  1912. m3u8_url, video_id,
  1913. note='Downloading m3u8 information' if note is None else note,
  1914. errnote='Failed to download m3u8 information' if errnote is None else errnote,
  1915. fatal=fatal, data=data, headers=headers, query=query)
  1916. if res is False:
  1917. return [], {}
  1918. m3u8_doc, urlh = res
  1919. m3u8_url = urlh.geturl()
  1920. return self._parse_m3u8_formats_and_subtitles(
  1921. m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
  1922. preference=preference, quality=quality, m3u8_id=m3u8_id,
  1923. note=note, errnote=errnote, fatal=fatal, live=live, data=data,
  1924. headers=headers, query=query, video_id=video_id)
  1925. def _parse_m3u8_formats_and_subtitles(
  1926. self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
  1927. preference=None, quality=None, m3u8_id=None, live=False, note=None,
  1928. errnote=None, fatal=True, data=None, headers={}, query={},
  1929. video_id=None):
  1930. formats, subtitles = [], {}
  1931. has_drm = re.search('|'.join([
  1932. r'#EXT-X-FAXS-CM:', # Adobe Flash Access
  1933. r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
  1934. ]), m3u8_doc)
  1935. def format_url(url):
  1936. return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
  1937. if self.get_param('hls_split_discontinuity', False):
  1938. def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
  1939. if not m3u8_doc:
  1940. if not manifest_url:
  1941. return []
  1942. m3u8_doc = self._download_webpage(
  1943. manifest_url, video_id, fatal=fatal, data=data, headers=headers,
  1944. note=False, errnote='Failed to download m3u8 playlist information')
  1945. if m3u8_doc is False:
  1946. return []
  1947. return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
  1948. else:
  1949. def _extract_m3u8_playlist_indices(*args, **kwargs):
  1950. return [None]
  1951. # References:
  1952. # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
  1953. # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
  1954. # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
  1955. # We should try extracting formats only from master playlists [1, 4.3.4],
  1956. # i.e. playlists that describe available qualities. On the other hand
  1957. # media playlists [1, 4.3.3] should be returned as is since they contain
  1958. # just the media without qualities renditions.
  1959. # Fortunately, master playlist can be easily distinguished from media
  1960. # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
  1961. # master playlist tags MUST NOT appear in a media playlist and vice versa.
  1962. # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
  1963. # media playlist and MUST NOT appear in master playlist thus we can
  1964. # clearly detect media playlist with this criterion.
  1965. if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
  1966. formats = [{
  1967. 'format_id': join_nonempty(m3u8_id, idx),
  1968. 'format_index': idx,
  1969. 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
  1970. 'ext': ext,
  1971. 'protocol': entry_protocol,
  1972. 'preference': preference,
  1973. 'quality': quality,
  1974. 'has_drm': has_drm,
  1975. } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
  1976. return formats, subtitles
  1977. groups = {}
  1978. last_stream_inf = {}
  1979. def extract_media(x_media_line):
  1980. media = parse_m3u8_attributes(x_media_line)
  1981. # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
  1982. media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
  1983. if not (media_type and group_id and name):
  1984. return
  1985. groups.setdefault(group_id, []).append(media)
  1986. # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
  1987. if media_type == 'SUBTITLES':
  1988. # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
  1989. # EXT-X-MEDIA tag if the media type is SUBTITLES.
  1990. # However, lack of URI has been spotted in the wild.
  1991. # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
  1992. if not media.get('URI'):
  1993. return
  1994. url = format_url(media['URI'])
  1995. sub_info = {
  1996. 'url': url,
  1997. 'ext': determine_ext(url),
  1998. }
  1999. if sub_info['ext'] == 'm3u8':
  2000. # Per RFC 8216 §3.1, the only possible subtitle format m3u8
  2001. # files may contain is WebVTT:
  2002. # <https://tools.ietf.org/html/rfc8216#section-3.1>
  2003. sub_info['ext'] = 'vtt'
  2004. sub_info['protocol'] = 'm3u8_native'
  2005. lang = media.get('LANGUAGE') or 'und'
  2006. subtitles.setdefault(lang, []).append(sub_info)
  2007. if media_type not in ('VIDEO', 'AUDIO'):
  2008. return
  2009. media_url = media.get('URI')
  2010. if media_url:
  2011. manifest_url = format_url(media_url)
  2012. formats.extend({
  2013. 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
  2014. 'format_note': name,
  2015. 'format_index': idx,
  2016. 'url': manifest_url,
  2017. 'manifest_url': m3u8_url,
  2018. 'language': media.get('LANGUAGE'),
  2019. 'ext': ext,
  2020. 'protocol': entry_protocol,
  2021. 'preference': preference,
  2022. 'quality': quality,
  2023. 'vcodec': 'none' if media_type == 'AUDIO' else None,
  2024. } for idx in _extract_m3u8_playlist_indices(manifest_url))
  2025. def build_stream_name():
  2026. # Despite specification does not mention NAME attribute for
  2027. # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
  2028. # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
  2029. # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
  2030. stream_name = last_stream_inf.get('NAME')
  2031. if stream_name:
  2032. return stream_name
  2033. # If there is no NAME in EXT-X-STREAM-INF it will be obtained
  2034. # from corresponding rendition group
  2035. stream_group_id = last_stream_inf.get('VIDEO')
  2036. if not stream_group_id:
  2037. return
  2038. stream_group = groups.get(stream_group_id)
  2039. if not stream_group:
  2040. return stream_group_id
  2041. rendition = stream_group[0]
  2042. return rendition.get('NAME') or stream_group_id
  2043. # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
  2044. # chance to detect video only formats when EXT-X-STREAM-INF tags
  2045. # precede EXT-X-MEDIA tags in HLS manifest such as [3].
  2046. for line in m3u8_doc.splitlines():
  2047. if line.startswith('#EXT-X-MEDIA:'):
  2048. extract_media(line)
  2049. for line in m3u8_doc.splitlines():
  2050. if line.startswith('#EXT-X-STREAM-INF:'):
  2051. last_stream_inf = parse_m3u8_attributes(line)
  2052. elif line.startswith('#') or not line.strip():
  2053. continue
  2054. else:
  2055. tbr = float_or_none(
  2056. last_stream_inf.get('AVERAGE-BANDWIDTH')
  2057. or last_stream_inf.get('BANDWIDTH'), scale=1000)
  2058. manifest_url = format_url(line.strip())
  2059. for idx in _extract_m3u8_playlist_indices(manifest_url):
  2060. format_id = [m3u8_id, None, idx]
  2061. # Bandwidth of live streams may differ over time thus making
  2062. # format_id unpredictable. So it's better to keep provided
  2063. # format_id intact.
  2064. if not live:
  2065. stream_name = build_stream_name()
  2066. format_id[1] = stream_name or '%d' % (tbr or len(formats))
  2067. f = {
  2068. 'format_id': join_nonempty(*format_id),
  2069. 'format_index': idx,
  2070. 'url': manifest_url,
  2071. 'manifest_url': m3u8_url,
  2072. 'tbr': tbr,
  2073. 'ext': ext,
  2074. 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
  2075. 'protocol': entry_protocol,
  2076. 'preference': preference,
  2077. 'quality': quality,
  2078. }
  2079. resolution = last_stream_inf.get('RESOLUTION')
  2080. if resolution:
  2081. mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
  2082. if mobj:
  2083. f['width'] = int(mobj.group('width'))
  2084. f['height'] = int(mobj.group('height'))
  2085. # Unified Streaming Platform
  2086. mobj = re.search(
  2087. r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
  2088. if mobj:
  2089. abr, vbr = mobj.groups()
  2090. abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
  2091. f.update({
  2092. 'vbr': vbr,
  2093. 'abr': abr,
  2094. })
  2095. codecs = parse_codecs(last_stream_inf.get('CODECS'))
  2096. f.update(codecs)
  2097. audio_group_id = last_stream_inf.get('AUDIO')
  2098. # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
  2099. # references a rendition group MUST have a CODECS attribute.
  2100. # However, this is not always respected, for example, [2]
  2101. # contains EXT-X-STREAM-INF tag which references AUDIO
  2102. # rendition group but does not have CODECS and despite
  2103. # referencing an audio group it represents a complete
  2104. # (with audio and video) format. So, for such cases we will
  2105. # ignore references to rendition groups and treat them
  2106. # as complete formats.
  2107. if audio_group_id and codecs and f.get('vcodec') != 'none':
  2108. audio_group = groups.get(audio_group_id)
  2109. if audio_group and audio_group[0].get('URI'):
  2110. # TODO: update acodec for audio only formats with
  2111. # the same GROUP-ID
  2112. f['acodec'] = 'none'
  2113. if not f.get('ext'):
  2114. f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
  2115. formats.append(f)
  2116. # for DailyMotion
  2117. progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
  2118. if progressive_uri:
  2119. http_f = f.copy()
  2120. del http_f['manifest_url']
  2121. http_f.update({
  2122. 'format_id': f['format_id'].replace('hls-', 'http-'),
  2123. 'protocol': 'http',
  2124. 'url': progressive_uri,
  2125. })
  2126. formats.append(http_f)
  2127. last_stream_inf = {}
  2128. return formats, subtitles
  2129. def _extract_m3u8_vod_duration(
  2130. self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
  2131. m3u8_vod = self._download_webpage(
  2132. m3u8_vod_url, video_id,
  2133. note='Downloading m3u8 VOD manifest' if note is None else note,
  2134. errnote='Failed to download VOD manifest' if errnote is None else errnote,
  2135. fatal=False, data=data, headers=headers, query=query)
  2136. return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
  2137. def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
  2138. if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
  2139. return None
  2140. return int(sum(
  2141. float(line[len('#EXTINF:'):].split(',')[0])
  2142. for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
  2143. @staticmethod
  2144. def _xpath_ns(path, namespace=None):
  2145. if not namespace:
  2146. return path
  2147. out = []
  2148. for c in path.split('/'):
  2149. if not c or c == '.':
  2150. out.append(c)
  2151. else:
  2152. out.append('{%s}%s' % (namespace, c))
  2153. return '/'.join(out)
  2154. def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
  2155. smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
  2156. if smil is False:
  2157. assert not fatal
  2158. return [], {}
  2159. namespace = self._parse_smil_namespace(smil)
  2160. fmts = self._parse_smil_formats(
  2161. smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
  2162. subs = self._parse_smil_subtitles(
  2163. smil, namespace=namespace)
  2164. return fmts, subs
  2165. def _extract_smil_formats(self, *args, **kwargs):
  2166. fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
  2167. if subs:
  2168. self._report_ignoring_subs('SMIL')
  2169. return fmts
  2170. def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
  2171. smil = self._download_smil(smil_url, video_id, fatal=fatal)
  2172. if smil is False:
  2173. return {}
  2174. return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
  2175. def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
  2176. return self._download_xml(
  2177. smil_url, video_id, 'Downloading SMIL file',
  2178. 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
  2179. def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
  2180. namespace = self._parse_smil_namespace(smil)
  2181. formats = self._parse_smil_formats(
  2182. smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
  2183. subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
  2184. video_id = os.path.splitext(url_basename(smil_url))[0]
  2185. title = None
  2186. description = None
  2187. upload_date = None
  2188. for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
  2189. name = meta.attrib.get('name')
  2190. content = meta.attrib.get('content')
  2191. if not name or not content:
  2192. continue
  2193. if not title and name == 'title':
  2194. title = content
  2195. elif not description and name in ('description', 'abstract'):
  2196. description = content
  2197. elif not upload_date and name == 'date':
  2198. upload_date = unified_strdate(content)
  2199. thumbnails = [{
  2200. 'id': image.get('type'),
  2201. 'url': image.get('src'),
  2202. 'width': int_or_none(image.get('width')),
  2203. 'height': int_or_none(image.get('height')),
  2204. } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
  2205. return {
  2206. 'id': video_id,
  2207. 'title': title or video_id,
  2208. 'description': description,
  2209. 'upload_date': upload_date,
  2210. 'thumbnails': thumbnails,
  2211. 'formats': formats,
  2212. 'subtitles': subtitles,
  2213. }
  2214. def _parse_smil_namespace(self, smil):
  2215. return self._search_regex(
  2216. r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
  2217. def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
  2218. base = smil_url
  2219. for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
  2220. b = meta.get('base') or meta.get('httpBase')
  2221. if b:
  2222. base = b
  2223. break
  2224. formats = []
  2225. rtmp_count = 0
  2226. http_count = 0
  2227. m3u8_count = 0
  2228. imgs_count = 0
  2229. srcs = set()
  2230. media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
  2231. for medium in media:
  2232. src = medium.get('src')
  2233. if not src or src in srcs:
  2234. continue
  2235. srcs.add(src)
  2236. bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
  2237. filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
  2238. width = int_or_none(medium.get('width'))
  2239. height = int_or_none(medium.get('height'))
  2240. proto = medium.get('proto')
  2241. ext = medium.get('ext')
  2242. src_ext = determine_ext(src)
  2243. streamer = medium.get('streamer') or base
  2244. if proto == 'rtmp' or streamer.startswith('rtmp'):
  2245. rtmp_count += 1
  2246. formats.append({
  2247. 'url': streamer,
  2248. 'play_path': src,
  2249. 'ext': 'flv',
  2250. 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
  2251. 'tbr': bitrate,
  2252. 'filesize': filesize,
  2253. 'width': width,
  2254. 'height': height,
  2255. })
  2256. if transform_rtmp_url:
  2257. streamer, src = transform_rtmp_url(streamer, src)
  2258. formats[-1].update({
  2259. 'url': streamer,
  2260. 'play_path': src,
  2261. })
  2262. continue
  2263. src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
  2264. src_url = src_url.strip()
  2265. if proto == 'm3u8' or src_ext == 'm3u8':
  2266. m3u8_formats = self._extract_m3u8_formats(
  2267. src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
  2268. if len(m3u8_formats) == 1:
  2269. m3u8_count += 1
  2270. m3u8_formats[0].update({
  2271. 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
  2272. 'tbr': bitrate,
  2273. 'width': width,
  2274. 'height': height,
  2275. })
  2276. formats.extend(m3u8_formats)
  2277. elif src_ext == 'f4m':
  2278. f4m_url = src_url
  2279. if not f4m_params:
  2280. f4m_params = {
  2281. 'hdcore': '3.2.0',
  2282. 'plugin': 'flowplayer-3.2.0.1',
  2283. }
  2284. f4m_url += '&' if '?' in f4m_url else '?'
  2285. f4m_url += compat_urllib_parse_urlencode(f4m_params)
  2286. formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
  2287. elif src_ext == 'mpd':
  2288. formats.extend(self._extract_mpd_formats(
  2289. src_url, video_id, mpd_id='dash', fatal=False))
  2290. elif re.search(r'\.ism/[Mm]anifest', src_url):
  2291. formats.extend(self._extract_ism_formats(
  2292. src_url, video_id, ism_id='mss', fatal=False))
  2293. elif src_url.startswith('http') and self._is_valid_url(src, video_id):
  2294. http_count += 1
  2295. formats.append({
  2296. 'url': src_url,
  2297. 'ext': ext or src_ext or 'flv',
  2298. 'format_id': 'http-%d' % (bitrate or http_count),
  2299. 'tbr': bitrate,
  2300. 'filesize': filesize,
  2301. 'width': width,
  2302. 'height': height,
  2303. })
  2304. for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
  2305. src = medium.get('src')
  2306. if not src or src in srcs:
  2307. continue
  2308. srcs.add(src)
  2309. imgs_count += 1
  2310. formats.append({
  2311. 'format_id': 'imagestream-%d' % (imgs_count),
  2312. 'url': src,
  2313. 'ext': mimetype2ext(medium.get('type')),
  2314. 'acodec': 'none',
  2315. 'vcodec': 'none',
  2316. 'width': int_or_none(medium.get('width')),
  2317. 'height': int_or_none(medium.get('height')),
  2318. 'format_note': 'SMIL storyboards',
  2319. })
  2320. return formats
  2321. def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
  2322. urls = []
  2323. subtitles = {}
  2324. for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
  2325. src = textstream.get('src')
  2326. if not src or src in urls:
  2327. continue
  2328. urls.append(src)
  2329. ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
  2330. lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
  2331. subtitles.setdefault(lang, []).append({
  2332. 'url': src,
  2333. 'ext': ext,
  2334. })
  2335. return subtitles
  2336. def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
  2337. xspf = self._download_xml(
  2338. xspf_url, playlist_id, 'Downloading xpsf playlist',
  2339. 'Unable to download xspf manifest', fatal=fatal)
  2340. if xspf is False:
  2341. return []
  2342. return self._parse_xspf(
  2343. xspf, playlist_id, xspf_url=xspf_url,
  2344. xspf_base_url=base_url(xspf_url))
  2345. def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
  2346. NS_MAP = {
  2347. 'xspf': 'http://xspf.org/ns/0/',
  2348. 's1': 'http://static.streamone.nl/player/ns/0',
  2349. }
  2350. entries = []
  2351. for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
  2352. title = xpath_text(
  2353. track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
  2354. description = xpath_text(
  2355. track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
  2356. thumbnail = xpath_text(
  2357. track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
  2358. duration = float_or_none(
  2359. xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
  2360. formats = []
  2361. for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
  2362. format_url = urljoin(xspf_base_url, location.text)
  2363. if not format_url:
  2364. continue
  2365. formats.append({
  2366. 'url': format_url,
  2367. 'manifest_url': xspf_url,
  2368. 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
  2369. 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
  2370. 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
  2371. })
  2372. self._sort_formats(formats)
  2373. entries.append({
  2374. 'id': playlist_id,
  2375. 'title': title,
  2376. 'description': description,
  2377. 'thumbnail': thumbnail,
  2378. 'duration': duration,
  2379. 'formats': formats,
  2380. })
  2381. return entries
  2382. def _extract_mpd_formats(self, *args, **kwargs):
  2383. fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
  2384. if subs:
  2385. self._report_ignoring_subs('DASH')
  2386. return fmts
  2387. def _extract_mpd_formats_and_subtitles(
  2388. self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
  2389. fatal=True, data=None, headers={}, query={}):
  2390. res = self._download_xml_handle(
  2391. mpd_url, video_id,
  2392. note='Downloading MPD manifest' if note is None else note,
  2393. errnote='Failed to download MPD manifest' if errnote is None else errnote,
  2394. fatal=fatal, data=data, headers=headers, query=query)
  2395. if res is False:
  2396. return [], {}
  2397. mpd_doc, urlh = res
  2398. if mpd_doc is None:
  2399. return [], {}
  2400. mpd_base_url = base_url(urlh.geturl())
  2401. return self._parse_mpd_formats_and_subtitles(
  2402. mpd_doc, mpd_id, mpd_base_url, mpd_url)
  2403. def _parse_mpd_formats(self, *args, **kwargs):
  2404. fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
  2405. if subs:
  2406. self._report_ignoring_subs('DASH')
  2407. return fmts
  def _parse_mpd_formats_and_subtitles(
          self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
      """
      Parse formats from MPD manifest.

      Walks every Period / AdaptationSet / Representation of the parsed
      manifest element `mpd_doc` and builds one format (or subtitle) dict
      per representation.

      @param mpd_doc       parsed MPD root element
      @param mpd_id        optional prefix for the generated format ids
      @param mpd_base_url  base URL used to resolve relative BaseURL values
      @param mpd_url       URL of the manifest, stored as 'manifest_url'
      @returns             (formats, subtitles) where subtitles is a dict of
                           lists keyed by language ('und' when unknown).
                           ([], {}) for a dynamic manifest when the
                           'dynamic_mpd' param is disabled.

      References:
       1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
          http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
       2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
      """
      if not self.get_param('dynamic_mpd', True):
          if mpd_doc.get('type') == 'dynamic':
              return [], {}

      namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

      def _add_ns(path):
          return self._xpath_ns(path, namespace)

      def is_drm_protected(element):
          return element.find(_add_ns('ContentProtection')) is not None

      def extract_multisegment_info(element, ms_parent_info):
          # Child levels inherit the parent's segment info and override it
          ms_info = ms_parent_info.copy()

          # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
          # common attributes and elements.  We will only extract relevant
          # for us.
          def extract_common(source):
              segment_timeline = source.find(_add_ns('SegmentTimeline'))
              if segment_timeline is not None:
                  s_e = segment_timeline.findall(_add_ns('S'))
                  if s_e:
                      ms_info['total_number'] = 0
                      ms_info['s'] = []
                      for s in s_e:
                          r = int(s.get('r', 0))
                          ms_info['total_number'] += 1 + r
                          ms_info['s'].append({
                              't': int(s.get('t', 0)),
                              # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                              'd': int(s.attrib['d']),
                              'r': r,
                          })
              start_number = source.get('startNumber')
              if start_number:
                  ms_info['start_number'] = int(start_number)
              timescale = source.get('timescale')
              if timescale:
                  ms_info['timescale'] = int(timescale)
              segment_duration = source.get('duration')
              if segment_duration:
                  ms_info['segment_duration'] = float(segment_duration)

          def extract_Initialization(source):
              initialization = source.find(_add_ns('Initialization'))
              if initialization is not None:
                  ms_info['initialization_url'] = initialization.attrib['sourceURL']

          segment_list = element.find(_add_ns('SegmentList'))
          if segment_list is not None:
              extract_common(segment_list)
              extract_Initialization(segment_list)
              segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
              if segment_urls_e:
                  ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
          else:
              segment_template = element.find(_add_ns('SegmentTemplate'))
              if segment_template is not None:
                  extract_common(segment_template)
                  media = segment_template.get('media')
                  if media:
                      ms_info['media'] = media
                  initialization = segment_template.get('initialization')
                  if initialization:
                      ms_info['initialization'] = initialization
                  else:
                      extract_Initialization(segment_template)
          return ms_info

      mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
      formats, subtitles = [], {}
      # Counts how many formats share the same URL so that each one gets a
      # distinct 'manifest_stream_number'
      stream_numbers = collections.defaultdict(int)
      for period in mpd_doc.findall(_add_ns('Period')):
          period_duration = parse_duration(period.get('duration')) or mpd_duration
          period_ms_info = extract_multisegment_info(period, {
              'start_number': 1,
              'timescale': 1,
          })
          for adaptation_set in period.findall(_add_ns('AdaptationSet')):
              adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
              for representation in adaptation_set.findall(_add_ns('Representation')):
                  # Representation attributes take precedence over the
                  # AdaptationSet ones
                  representation_attrib = adaptation_set.attrib.copy()
                  representation_attrib.update(representation.attrib)
                  # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                  mime_type = representation_attrib['mimeType']
                  content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                  codecs = parse_codecs(representation_attrib.get('codecs', ''))
                  if content_type not in ('video', 'audio', 'text'):
                      if mime_type == 'image/jpeg':
                          content_type = mime_type
                      elif codecs['vcodec'] != 'none':
                          content_type = 'video'
                      elif codecs['acodec'] != 'none':
                          content_type = 'audio'
                      elif codecs.get('tcodec', 'none') != 'none':
                          content_type = 'text'
                      elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                          content_type = 'text'
                      else:
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                          continue

                  # Prepend BaseURL values from the innermost element outwards
                  # until an absolute URL has been assembled
                  base_url = ''
                  for element in (representation, adaptation_set, period, mpd_doc):
                      base_url_e = element.find(_add_ns('BaseURL'))
                      if base_url_e is not None:
                          base_url = base_url_e.text + base_url
                          if re.match(r'^https?://', base_url):
                              break
                  if mpd_base_url and base_url.startswith('/'):
                      base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
                  elif mpd_base_url and not re.match(r'^https?://', base_url):
                      if not mpd_base_url.endswith('/'):
                          mpd_base_url += '/'
                      base_url = mpd_base_url + base_url
                  representation_id = representation_attrib.get('id')
                  lang = representation_attrib.get('lang')
                  url_el = representation.find(_add_ns('BaseURL'))
                  filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                  bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                  if representation_id is not None:
                      format_id = representation_id
                  else:
                      format_id = content_type
                  if mpd_id:
                      format_id = mpd_id + '-' + format_id
                  if content_type in ('video', 'audio'):
                      f = {
                          'format_id': format_id,
                          'manifest_url': mpd_url,
                          'ext': mimetype2ext(mime_type),
                          'width': int_or_none(representation_attrib.get('width')),
                          'height': int_or_none(representation_attrib.get('height')),
                          'tbr': float_or_none(bandwidth, 1000),
                          'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                          'fps': int_or_none(representation_attrib.get('frameRate')),
                          'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                          'format_note': 'DASH %s' % content_type,
                          'filesize': filesize,
                          'container': mimetype2ext(mime_type) + '_dash',
                          **codecs
                      }
                  elif content_type == 'text':
                      f = {
                          'ext': mimetype2ext(mime_type),
                          'manifest_url': mpd_url,
                          'filesize': filesize,
                      }
                  elif content_type == 'image/jpeg':
                      # See test case in VikiIE
                      # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                      f = {
                          'format_id': format_id,
                          'ext': 'mhtml',
                          'manifest_url': mpd_url,
                          'format_note': 'DASH storyboards (jpeg)',
                          'acodec': 'none',
                          'vcodec': 'none',
                      }
                  if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                      f['has_drm'] = True
                  representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                  def prepare_template(template_name, identifiers):
                      tmpl = representation_ms_info[template_name]
                      # First of, % characters outside $...$ templates
                      # must be escaped by doubling for proper processing
                      # by % operator string formatting used further (see
                      # https://github.com/ytdl-org/youtube-dl/issues/16867).
                      t = ''
                      in_template = False
                      for c in tmpl:
                          t += c
                          if c == '$':
                              in_template = not in_template
                          elif c == '%' and not in_template:
                              t += c
                      # Next, $...$ templates are translated to their
                      # %(...) counterparts to be used with % operator
                      if representation_id is not None:
                          # The ID is inserted after the escaping pass above, so
                          # its own % characters must be doubled here or they
                          # would be taken for conversion specifiers later
                          t = t.replace('$RepresentationID$', representation_id.replace('%', '%%'))
                      t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                      t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                      # "$$" is the escape sequence for a literal "$".
                      # str.replace returns a new string, so the result has to
                      # be used rather than discarded
                      return t.replace('$$', '$')

                  # @initialization is a regular template like @media one
                  # so it should be handled just the same way (see
                  # https://github.com/ytdl-org/youtube-dl/issues/11605)
                  if 'initialization' in representation_ms_info:
                      initialization_template = prepare_template(
                          'initialization',
                          # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                          # $Time$ shall not be included for @initialization thus
                          # only $Bandwidth$ remains
                          ('Bandwidth', ))
                      representation_ms_info['initialization_url'] = initialization_template % {
                          'Bandwidth': bandwidth,
                      }

                  def location_key(location):
                      return 'url' if re.match(r'^https?://', location) else 'path'

                  if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                      media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                      media_location_key = location_key(media_template)

                      # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                      # can't be used at the same time
                      if '%(Number' in media_template and 's' not in representation_ms_info:
                          segment_duration = None
                          if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                              segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                              representation_ms_info['total_number'] = int(math.ceil(
                                  float_or_none(period_duration, segment_duration, default=0)))
                          representation_ms_info['fragments'] = [{
                              media_location_key: media_template % {
                                  'Number': segment_number,
                                  'Bandwidth': bandwidth,
                              },
                              'duration': segment_duration,
                          } for segment_number in range(
                              representation_ms_info['start_number'],
                              representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                      else:
                          # $Number*$ or $Time$ in media template with S list available
                          # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                          # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                          representation_ms_info['fragments'] = []
                          segment_time = 0
                          segment_d = None
                          segment_number = representation_ms_info['start_number']

                          def add_segment_url():
                              # Reads the current segment_time / segment_d /
                              # segment_number of the enclosing loop
                              segment_url = media_template % {
                                  'Time': segment_time,
                                  'Bandwidth': bandwidth,
                                  'Number': segment_number,
                              }
                              representation_ms_info['fragments'].append({
                                  media_location_key: segment_url,
                                  'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                              })

                          for s in representation_ms_info['s']:
                              segment_time = s.get('t') or segment_time
                              segment_d = s['d']
                              add_segment_url()
                              segment_number += 1
                              for r in range(s.get('r', 0)):
                                  segment_time += segment_d
                                  add_segment_url()
                                  segment_number += 1
                              segment_time += segment_d
                  elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                      # No media template
                      # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                      # or any YouTube dashsegments video
                      fragments = []
                      segment_index = 0
                      timescale = representation_ms_info['timescale']
                      for s in representation_ms_info['s']:
                          duration = float_or_none(s['d'], timescale)
                          for r in range(s.get('r', 0) + 1):
                              segment_uri = representation_ms_info['segment_urls'][segment_index]
                              fragments.append({
                                  location_key(segment_uri): segment_uri,
                                  'duration': duration,
                              })
                              segment_index += 1
                      representation_ms_info['fragments'] = fragments
                  elif 'segment_urls' in representation_ms_info:
                      # Segment URLs with no SegmentTimeline
                      # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                      # https://github.com/ytdl-org/youtube-dl/pull/14844
                      fragments = []
                      segment_duration = float_or_none(
                          representation_ms_info['segment_duration'],
                          representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                      for segment_url in representation_ms_info['segment_urls']:
                          fragment = {
                              location_key(segment_url): segment_url,
                          }
                          if segment_duration:
                              fragment['duration'] = segment_duration
                          fragments.append(fragment)
                      representation_ms_info['fragments'] = fragments
                  # If there is a fragments key available then we correctly recognized fragmented media.
                  # Otherwise we will assume unfragmented media with direct access. Technically, such
                  # assumption is not necessarily correct since we may simply have no support for
                  # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                  if 'fragments' in representation_ms_info:
                      f.update({
                          # NB: mpd_url may be empty when MPD manifest is parsed from a string
                          'url': mpd_url or base_url,
                          'fragment_base_url': base_url,
                          'fragments': [],
                          'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                      })
                      if 'initialization_url' in representation_ms_info:
                          initialization_url = representation_ms_info['initialization_url']
                          if not f.get('url'):
                              f['url'] = initialization_url
                          f['fragments'].append({location_key(initialization_url): initialization_url})
                      f['fragments'].extend(representation_ms_info['fragments'])
                      if not period_duration:
                          period_duration = try_get(
                              representation_ms_info,
                              lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                  else:
                      # Assuming direct URL to unfragmented media.
                      f['url'] = base_url
                  if content_type in ('video', 'audio', 'image/jpeg'):
                      f['manifest_stream_number'] = stream_numbers[f['url']]
                      stream_numbers[f['url']] += 1
                      formats.append(f)
                  elif content_type == 'text':
                      subtitles.setdefault(lang or 'und', []).append(f)

      return formats, subtitles
  def _extract_ism_formats(self, *args, **kwargs):
      """
      Formats-only variant of _extract_ism_formats_and_subtitles.

      All arguments are forwarded unchanged. When the manifest also yields
      subtitles they are dropped and that fact is reported for 'ISM'.
      """
      formats, subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
      if not subtitles:
          return formats
      self._report_ignoring_subs('ISM')
      return formats
  def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
      """
      Download an ISM manifest and parse it into formats and subtitles.

      Returns a (formats, subtitles) tuple; ([], {}) is returned when the
      download yields False or the parsed document is None.
      """
      if note is None:
          note = 'Downloading ISM manifest'
      if errnote is None:
          errnote = 'Failed to download ISM manifest'
      handle = self._download_xml_handle(
          ism_url, video_id, note=note, errnote=errnote,
          fatal=fatal, data=data, headers=headers, query=query)
      if handle is False:
          return [], {}
      manifest, response = handle
      if manifest is None:
          return [], {}
      # The final (post-redirect) URL is what fragment URLs are resolved against
      return self._parse_ism_formats_and_subtitles(manifest, response.geturl(), ism_id)
  def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
      """
      Parse formats from ISM manifest.

      @param ism_doc  parsed manifest root element
      @param ism_url  URL of the manifest; fragment URLs are joined to it
      @param ism_id   optional prefix for the generated format ids
      @returns        (formats, subtitles); ([], {}) when the manifest has
                      IsLive="TRUE"

      References:
       1. [MS-SSTR]: Smooth Streaming Protocol,
          https://msdn.microsoft.com/en-us/library/ff469518.aspx
      """
      if ism_doc.get('IsLive') == 'TRUE':
          return [], {}

      duration = int(ism_doc.attrib['Duration'])
      timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

      formats = []
      subtitles = {}
      for stream in ism_doc.findall('StreamIndex'):
          stream_type = stream.get('Type')
          if stream_type not in ('video', 'audio', 'text'):
              continue
          url_pattern = stream.attrib['Url']
          stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
          stream_name = stream.get('Name')
          stream_language = stream.get('Language', 'und')
          # The fragment elements are the same for every track of the stream
          stream_fragments = stream.findall('c')
          for track in stream.findall('QualityLevel'):
              fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
              # TODO: add support for WVC1 and WMAP
              if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                  self.report_warning('%s is not a supported codec' % fourcc)
                  continue
              tbr = int(track.attrib['Bitrate']) // 1000
              # [1] does not mention Width and Height attributes. However,
              # they're often present while MaxWidth and MaxHeight are
              # missing, so should be used as fallbacks
              width = int_or_none(track.get('MaxWidth') or track.get('Width'))
              height = int_or_none(track.get('MaxHeight') or track.get('Height'))
              sampling_rate = int_or_none(track.get('SamplingRate'))

              track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
              track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

              fragments = []
              fragment_ctx = {
                  'time': 0,
              }
              for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                  fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                  fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                  fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                  if not fragment_ctx['duration']:
                      # Without an explicit duration, derive it from the start
                      # time of the following fragment. The lookup must go
                      # through the list of fragments, not the current element.
                      try:
                          next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                      except (IndexError, KeyError):
                          # Last fragment, or the next one carries no start time
                          next_fragment_time = duration
                      fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                  for _ in range(fragment_repeat):
                      fragments.append({
                          'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                          'duration': fragment_ctx['duration'] / stream_timescale,
                      })
                      fragment_ctx['time'] += fragment_ctx['duration']

              if stream_type == 'text':
                  subtitles.setdefault(stream_language, []).append({
                      'ext': 'ismt',
                      'protocol': 'ism',
                      'url': ism_url,
                      'manifest_url': ism_url,
                      'fragments': fragments,
                      '_download_params': {
                          'stream_type': stream_type,
                          'duration': duration,
                          'timescale': stream_timescale,
                          'fourcc': fourcc,
                          'language': stream_language,
                          'codec_private_data': track.get('CodecPrivateData'),
                      }
                  })
              elif stream_type in ('video', 'audio'):
                  formats.append({
                      'format_id': join_nonempty(ism_id, stream_name, tbr),
                      'url': ism_url,
                      'manifest_url': ism_url,
                      'ext': 'ismv' if stream_type == 'video' else 'isma',
                      'width': width,
                      'height': height,
                      'tbr': tbr,
                      'asr': sampling_rate,
                      'vcodec': 'none' if stream_type == 'audio' else fourcc,
                      'acodec': 'none' if stream_type == 'video' else fourcc,
                      'protocol': 'ism',
                      'fragments': fragments,
                      'has_drm': ism_doc.find('Protection') is not None,
                      '_download_params': {
                          'stream_type': stream_type,
                          'duration': duration,
                          'timescale': stream_timescale,
                          'width': width or 0,
                          'height': height or 0,
                          'fourcc': fourcc,
                          'language': stream_language,
                          'codec_private_data': track.get('CodecPrivateData'),
                          'sampling_rate': sampling_rate,
                          'channels': int_or_none(track.get('Channels', 2)),
                          'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                          'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                      },
                  })
      return formats, subtitles
  def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
      """
      Collect media entries from <video>/<audio> style tags in `webpage`.

      Each entry is a dict with 'formats', 'subtitles' and 'thumbnail';
      tags that yield neither formats nor subtitles are left out. Every
      format gets a Referer header set to `base_url`.
      """
      def absolute_url(item_url):
          return urljoin(base_url, item_url)

      def parse_content_type(content_type):
          # Split a `type` attribute into codec fields plus 'ext'
          if not content_type:
              return {}
          match = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
          if not match:
              return {}
          mimetype, codecs = match.groups()
          info = parse_codecs(codecs)
          info['ext'] = mimetype2ext(mimetype)
          return info

      def _media_formats(src, cur_media_type, type_info={}):
          # Returns (is_plain_url, formats); manifests are expanded into
          # their formats, anything else becomes a single direct format
          full_url = absolute_url(src)
          ext = type_info.get('ext') or determine_ext(full_url)
          if ext == 'm3u8':
              return False, self._extract_m3u8_formats(
                  full_url, video_id, ext='mp4',
                  entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                  preference=preference, quality=quality, fatal=False)
          if ext == 'mpd':
              return False, self._extract_mpd_formats(
                  full_url, video_id, mpd_id=mpd_id, fatal=False)
          return True, [{
              'url': full_url,
              'vcodec': 'none' if cur_media_type == 'audio' else None,
          }]

      entries = []
      # amp-video and amp-audio are very similar to their HTML5 counterparts
      # so we will include them right here (see
      # https://www.ampproject.org/docs/reference/components/amp-video)
      # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
      _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
      # Self-closing tags have no content, so pad them with an empty one
      self_closing_tags = re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)
      media_tags = [found + ('',) for found in self_closing_tags]
      media_tags += re.findall(
          # We only allow video|audio followed by a whitespace or '>'.
          # Allowing more characters may end up in significant slow down (see
          # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
          # http://www.porntrex.com/maps/videositemap.xml).
          r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage)
      for media_tag, _, media_type, media_content in media_tags:
          tag_formats = []
          tag_subtitles = {}
          media_info = {
              'formats': tag_formats,
              'subtitles': tag_subtitles,
          }
          media_attributes = extract_attributes(media_tag)
          src = strip_or_none(media_attributes.get('src'))
          if src:
              tag_formats.extend(_media_formats(src, media_type)[1])
          media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
          if media_content:
              for source_tag in re.findall(r'<source[^>]+>', media_content):
                  s_attr = extract_attributes(source_tag)
                  # data-video-src and data-src are non standard but seen
                  # several times in the wild
                  src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                  if not src:
                      continue
                  f = parse_content_type(s_attr.get('type'))
                  is_plain_url, formats = _media_formats(src, media_type, f)
                  if not is_plain_url:
                      tag_formats.extend(formats)
                      continue
                  # width, height, res, label and title attributes are
                  # all not standard but seen several times in the wild
                  labels = [
                      s_attr.get(lbl)
                      for lbl in ('label', 'title')
                      if str_or_none(s_attr.get(lbl))
                  ]
                  width = int_or_none(s_attr.get('width'))
                  height = (int_or_none(s_attr.get('height'))
                            or int_or_none(s_attr.get('res')))
                  if not width or not height:
                      for lbl in labels:
                          resolution = parse_resolution(lbl)
                          if not resolution:
                              continue
                          width = width or resolution.get('width')
                          height = height or resolution.get('height')
                  # First label that parses to a bitrate wins
                  tbr = next((rate for rate in map(parse_bitrate, labels) if rate), None)
                  f.update({
                      'width': width,
                      'height': height,
                      'tbr': tbr,
                      'format_id': s_attr.get('label') or s_attr.get('title'),
                  })
                  f.update(formats[0])
                  tag_formats.append(f)
              for track_tag in re.findall(r'<track[^>]+>', media_content):
                  track_attributes = extract_attributes(track_tag)
                  kind = track_attributes.get('kind')
                  # Tracks without a kind are treated like subtitles
                  if kind and kind not in ('subtitles', 'captions'):
                      continue
                  src = strip_or_none(track_attributes.get('src'))
                  if not src:
                      continue
                  lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                  tag_subtitles.setdefault(lang, []).append({
                      'url': absolute_url(src),
                  })
          for f in tag_formats:
              f.setdefault('http_headers', {})['Referer'] = base_url
          if tag_formats or tag_subtitles:
              entries.append(media_info)
      return entries
  def _extract_akamai_formats(self, *args, **kwargs):
      """
      Formats-only variant of _extract_akamai_formats_and_subtitles.

      All arguments are forwarded unchanged. When subtitles are also found
      they are dropped and that fact is reported for 'akamai'.
      """
      formats, subtitles = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
      if not subtitles:
          return formats
      self._report_ignoring_subs('akamai')
      return formats
  def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
      """
      Extract HDS, HLS and (when possible) direct HTTP formats for an
      Akamai manifest URL.

      @param manifest_url  URL of either the f4m or the m3u8 manifest; the
                           other one is derived from it
      @param video_id      id passed on to the manifest extractors
      @param hosts         optional mapping with 'hds', 'hls' and 'http'
                           keys naming replacement hosts per protocol
      @returns             (formats, subtitles)
      """
      signed = 'hdnea=' in manifest_url
      if not signed:
          # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
          manifest_url = re.sub(
              r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
              '', manifest_url).strip('?')

      formats = []
      subtitles = {}

      hdcore_sign = 'hdcore=3.7.0'
      f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
      hds_host = hosts.get('hds')
      if hds_host:
          f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
      if 'hdcore=' not in f4m_url:
          f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
      f4m_formats = self._extract_f4m_formats(
          f4m_url, video_id, f4m_id='hds', fatal=False)
      for entry in f4m_formats:
          entry.update({'extra_param_to_segment_url': hdcore_sign})
      formats.extend(f4m_formats)

      m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
      hls_host = hosts.get('hls')
      if hls_host:
          m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
      m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
          m3u8_url, video_id, 'mp4', 'm3u8_native',
          m3u8_id='hls', fatal=False)
      formats.extend(m3u8_formats)
      subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

      http_host = hosts.get('http')
      if http_host and m3u8_formats and not signed:
          REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
          # Direct HTTP URLs can only be derived from a multi-bitrate
          # ".csmil" URL; for any other URL just skip this step instead of
          # failing on a missing match
          csmil_mobj = re.match(REPL_REGEX, m3u8_url)
          qualities = csmil_mobj.group(2).split(',') if csmil_mobj else []
          qualities_length = len(qualities)
          # One extra HLS format is tolerated (e.g. an audio-only rendition)
          if qualities and len(m3u8_formats) in (qualities_length, qualities_length + 1):
              i = 0
              for f in m3u8_formats:
                  if f['vcodec'] != 'none':
                      for protocol in ('http', 'https'):
                          http_f = f.copy()
                          # Direct downloads do not belong to a manifest
                          http_f.pop('manifest_url', None)
                          http_url = re.sub(
                              REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                          http_f.update({
                              'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                              'url': http_url,
                              'protocol': protocol,
                          })
                          formats.append(http_f)
                      i += 1

      return formats, subtitles
  def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
      """
      Build formats for a Wowza stream from any one of its URLs.

      The manifest file name is stripped from `url`, then the HLS, HDS and
      DASH manifests are tried (non-fatally) unless 'm3u8', 'f4m' or 'dash'
      is listed in `skip_protocols`. SMIL URLs additionally yield rtmp/rtsp
      pairs from jwplayer.smil; other URLs get plain rtmp and rtsp formats.
      """
      query = compat_urlparse.urlparse(url).query
      url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
      parts = re.search(
          r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
      url_base = parts.group('url')
      # Keep the "s" of a secure source scheme, i.e. http or https
      http_base_url = '%s%s:%s' % ('http', parts.group('s') or '', url_base)
      formats = []

      def manifest_url(manifest):
          target = '%s/%s' % (http_base_url, manifest)
          return target + '?%s' % query if query else target

      if 'm3u8' not in skip_protocols:
          formats += self._extract_m3u8_formats(
              manifest_url('playlist.m3u8'), video_id, 'mp4',
              m3u8_entry_protocol, m3u8_id='hls', fatal=False)
      if 'f4m' not in skip_protocols:
          formats += self._extract_f4m_formats(
              manifest_url('manifest.f4m'),
              video_id, f4m_id='hds', fatal=False)
      if 'dash' not in skip_protocols:
          formats += self._extract_mpd_formats(
              manifest_url('manifest.mpd'),
              video_id, mpd_id='dash', fatal=False)

      if not re.search(r'(?:/smil:|\.smil)', url_base):
          for protocol in ('rtmp', 'rtsp'):
              if protocol in skip_protocols:
                  continue
              formats.append({
                  'url': '%s:%s' % (protocol, url_base),
                  'format_id': protocol,
                  'protocol': protocol,
              })
          return formats

      if 'smil' not in skip_protocols:
          rtmp_formats = self._extract_smil_formats(
              manifest_url('jwplayer.smil'),
              video_id, fatal=False)
          for rtmp_format in rtmp_formats:
              # Each rtmp format gets an rtsp twin whose URL has the play
              # path appended
              rtsp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
              rtsp_format = rtmp_format.copy()
              del rtsp_format['play_path']
              del rtsp_format['ext']
              rtsp_format.update({
                  'url': rtsp_url.replace('rtmp://', 'rtsp://'),
                  'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                  'protocol': 'rtsp',
              })
              formats.append(rtmp_format)
              formats.append(rtsp_format)
      return formats
  def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
      """
      Locate the options passed to jwplayer(...).setup(...) in `webpage`.

      Returns the parsed options when they form a dict. Returns None when
      no setup call is found, when parsing raises ExtractorError, or when
      the parsed value is not a dict.
      """
      found = re.search(
          r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
          webpage)
      if not found:
          return None
      try:
          options = self._parse_json(
              found.group('options'), video_id=video_id,
              transform_source=transform_source)
      except ExtractorError:
          return None
      return options if isinstance(options, dict) else None
  def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
      """
      Find the JWPlayer setup data in `webpage` and parse it.

      Extra positional and keyword arguments are handed on to
      _parse_jwplayer_data, whose result is returned.
      """
      setup_data = self._find_jwplayer_data(
          webpage, video_id, transform_source=js_to_json)
      return self._parse_jwplayer_data(setup_data, video_id, *args, **kwargs)
  def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                           m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
      """
      Turn JWPlayer setup data into an info dict or a playlist result.

      A single playlist item is returned as its own entry; several items
      are wrapped with self.playlist_result. With `require_title` set, a
      playlist item without 'title' raises KeyError.
      """
      # JWPlayer backward compatibility: flattened playlists
      # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
      if 'playlist' not in jwplayer_data:
          jwplayer_data = {'playlist': [jwplayer_data]}

      # JWPlayer backward compatibility: single playlist item
      # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
      if not isinstance(jwplayer_data['playlist'], list):
          jwplayer_data['playlist'] = [jwplayer_data['playlist']]

      entries = []
      for video_data in jwplayer_data['playlist']:
          # JWPlayer backward compatibility: flattened sources
          # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
          if 'sources' not in video_data:
              video_data['sources'] = [video_data]

          this_video_id = video_id or video_data['mediaid']

          formats = self._parse_jwplayer_formats(
              video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
              mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

          subtitles = {}
          tracks = video_data.get('tracks')
          if tracks and isinstance(tracks, list):
              for track in tracks:
                  if not isinstance(track, dict):
                      continue
                  track_kind = track.get('kind')
                  # Only caption/subtitle tracks with a string kind qualify
                  if (not track_kind or not isinstance(track_kind, compat_str)
                          or track_kind.lower() not in ('captions', 'subtitles')):
                      continue
                  track_url = urljoin(base_url, track.get('file'))
                  if not track_url:
                      continue
                  # Tracks without a label are filed under 'en'
                  subtitles.setdefault(track.get('label') or 'en', []).append({
                      'url': self._proto_relative_url(track_url)
                  })

          raw_title = video_data['title'] if require_title else video_data.get('title')
          entry = {
              'id': this_video_id,
              'title': unescapeHTML(raw_title),
              'description': clean_html(video_data.get('description')),
              'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
              'timestamp': int_or_none(video_data.get('pubdate')),
              'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
              'subtitles': subtitles,
          }
          # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
          is_youtube_embed = len(formats) == 1 and re.search(
              r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
          if is_youtube_embed:
              entry.update({
                  '_type': 'url_transparent',
                  'url': formats[0]['url'],
              })
          else:
              self._sort_formats(formats)
              entry['formats'] = formats
          entries.append(entry)
      if len(entries) == 1:
          return entries[0]
      return self.playlist_result(entries)
  def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
      """
      Convert a JWPlayer 'sources' list into a list of format dicts.

      Non-dict sources, sources without a usable 'file' URL and repeated
      URLs are skipped. HLS, DASH and SMIL sources are expanded through the
      corresponding self._extract_*_formats helper (non-fatal); audio
      sources become a single format with 'vcodec' set to 'none'; anything
      else becomes a plain format, with extra handling for rtmp URLs.

      @param jwplayer_sources_data  Iterable of source entries
      @param video_id     Forwarded to the manifest extraction helpers
      @param m3u8_id      Forwarded to self._extract_m3u8_formats
      @param mpd_id       Forwarded to self._extract_mpd_formats
      @param rtmp_params  Optional dict merged into every rtmp format
      @param base_url     Base against which source URLs are resolved
      @returns List of format dicts
      """
      seen_urls = set()  # set membership keeps de-duplication O(1) per source
      formats = []
      for source in jwplayer_sources_data:
          if not isinstance(source, dict):
              continue
          source_url = urljoin(
              base_url, self._proto_relative_url(source.get('file')))
          if not source_url or source_url in seen_urls:
              continue
          seen_urls.add(source_url)
          source_type = source.get('type') or ''
          ext = mimetype2ext(source_type) or determine_ext(source_url)
          if source_type == 'hls' or ext == 'm3u8':
              formats.extend(self._extract_m3u8_formats(
                  source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                  m3u8_id=m3u8_id, fatal=False))
          elif source_type == 'dash' or ext == 'mpd':
              formats.extend(self._extract_mpd_formats(
                  source_url, video_id, mpd_id=mpd_id, fatal=False))
          elif ext == 'smil':
              formats.extend(self._extract_smil_formats(
                  source_url, video_id, fatal=False))
          # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
          elif source_type.startswith('audio') or ext in (
                  'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
              formats.append({
                  'url': source_url,
                  'vcodec': 'none',
                  'ext': ext,
              })
          else:
              height = int_or_none(source.get('height'))
              if height is None:
                  # Often no height is provided but there is a label in
                  # format like "1080p", "720p SD", or 1080.
                  height = int_or_none(self._search_regex(
                      r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                      'height', default=None))
              a_format = {
                  'url': source_url,
                  'width': int_or_none(source.get('width')),
                  'height': height,
                  'tbr': int_or_none(source.get('bitrate')),
                  'ext': ext,
              }
              if source_url.startswith('rtmp'):
                  a_format['ext'] = 'flv'
                  # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                  # of jwplayer.flash.swf
                  # maxsplit is passed by keyword: the positional form is
                  # deprecated as of Python 3.13
                  rtmp_url_parts = re.split(
                      r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                  if len(rtmp_url_parts) == 3:
                      rtmp_url, prefix, play_path = rtmp_url_parts
                      a_format.update({
                          'url': rtmp_url,
                          'play_path': prefix + play_path,
                      })
                  if rtmp_params:
                      a_format.update(rtmp_params)
              formats.append(a_format)
      return formats
  3210. def _live_title(self, name):
  3211. self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
  3212. return name
  def _int(self, v, name, fatal=False, **kwargs):
      """
      Convert v with int_or_none, reporting a failed conversion.

      @param v       Value to convert
      @param name    Description of the value, used in the failure message
      @param fatal   When true a failed conversion raises ExtractorError,
                     otherwise a warning is reported
      @param kwargs  Forwarded to int_or_none
      @returns The converted value, or None if the conversion failed and
               fatal is false
      """
      parsed = int_or_none(v, **kwargs)
      if parsed is not None:
          return parsed
      msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
      if fatal:
          raise ExtractorError(msg)
      self.report_warning(msg)
      return None
  def _float(self, v, name, fatal=False, **kwargs):
      """
      Convert v with float_or_none, reporting a failed conversion.

      @param v       Value to convert
      @param name    Description of the value, used in the failure message
      @param fatal   When true a failed conversion raises ExtractorError,
                     otherwise a warning is reported
      @param kwargs  Forwarded to float_or_none
      @returns The converted value, or None if the conversion failed and
               fatal is false
      """
      parsed = float_or_none(v, **kwargs)
      if parsed is not None:
          return parsed
      msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
      if fatal:
          raise ExtractorError(msg)
      self.report_warning(msg)
      return None
  def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                  path='/', secure=False, discard=False, rest=None, **kwargs):
      """
      Create a cookie and store it in the downloader's cookiejar.

      @param domain       Cookie domain; a leading '.' sets the
                          "domain initial dot" flag
      @param name         Cookie name
      @param value        Cookie value
      @param expire_time  Expiry passed to the cookie constructor
      @param port         Port; the "port specified" flag is set when not None
      @param path         Cookie path
      @param secure       Secure flag
      @param discard      Discard flag
      @param rest         Mapping of non-standard attributes (default: empty)
      @param kwargs       Accepted but not used by this method
      """
      # A fresh dict per call instead of a shared mutable default argument
      if rest is None:
          rest = {}
      # NOTE(review): positional order presumably mirrors
      # http.cookiejar.Cookie(version, name, value, port, port_specified,
      # domain, domain_specified, domain_initial_dot, path, path_specified,
      # secure, expires, discard, comment, comment_url, rest) - confirm
      # against compat_cookiejar_Cookie
      cookie = compat_cookiejar_Cookie(
          0, name, value, port, port is not None, domain, True,
          domain.startswith('.'), path, True, secure, expire_time,
          discard, None, None, rest)
      self._downloader.cookiejar.set_cookie(cookie)
  def _get_cookies(self, url):
      """ Return a compat_cookies_SimpleCookie with the cookies for the url """
      request = sanitized_Request(url)
      jar = self._downloader.cookiejar
      # Let the cookiejar fill in the Cookie header it would send for this URL
      jar.add_cookie_header(request)
      cookie_header = request.get_header('Cookie')
      return compat_cookies_SimpleCookie(cookie_header)
  3243. def _apply_first_set_cookie_header(self, url_handle, cookie):
  3244. """
  3245. Apply first Set-Cookie header instead of the last. Experimental.
  3246. Some sites (e.g. [1-3]) may serve two cookies under the same name
  3247. in Set-Cookie header and expect the first (old) one to be set rather
  3248. than second (new). However, as of RFC6265 the newer one cookie
  3249. should be set into cookie store what actually happens.
  3250. We will workaround this issue by resetting the cookie to
  3251. the first one manually.
  3252. 1. https://new.vk.com/
  3253. 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
  3254. 3. https://learning.oreilly.com/
  3255. """
  3256. for header, cookies in url_handle.headers.items():
  3257. if header.lower() != 'set-cookie':
  3258. continue
  3259. if sys.version_info[0] >= 3:
  3260. cookies = cookies.encode('iso-8859-1')
  3261. cookies = cookies.decode('utf-8')
  3262. cookie_value = re.search(
  3263. r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
  3264. if cookie_value:
  3265. value, domain = cookie_value.groups()
  3266. self._set_cookie(domain, cookie, value)
  3267. break
  def get_testcases(self, include_onlymatching=False):
      """
      Yield the test case dicts declared on the extractor.

      Test cases come from the _TEST attribute (a single dict) or, when that
      is missing or falsy, from _TESTS. Every yielded dict gets a 'name' key
      holding the class name minus its last two characters (the 'IE'
      suffix). Test cases flagged 'only_matching' are skipped unless
      include_onlymatching is true.
      """
      class_name = type(self).__name__
      single = getattr(self, '_TEST', None)
      if single:
          assert not hasattr(self, '_TESTS'), \
              '%s has _TEST and _TESTS' % class_name
          cases = [single]
      else:
          cases = getattr(self, '_TESTS', [])

      short_name = class_name[:-len('IE')]
      for case in cases:
          if not include_onlymatching and case.get('only_matching', False):
              continue
          case['name'] = short_name
          yield case
  def is_suitable(self, age_limit):
      """ Test whether the extractor is generally suitable for the given
      age limit (i.e. pornographic sites are not, all others usually are)

      Returns True as soon as one test case is not age restricted, and also
      when the extractor has no test cases at all. """
      saw_restricted = False
      for case in self.get_testcases(include_onlymatching=False):
          # For playlist tests the first playlist entry is examined
          nested = case.get('playlist', [])
          if nested:
              case = nested[0]
          declared_limit = case.get('info_dict', {}).get('age_limit')
          if not age_restricted(declared_limit, age_limit):
              return True
          saw_restricted = True
      return not saw_restricted
  def extract_subtitles(self, *args, **kwargs):
      """ Return self._get_subtitles(*args, **kwargs) if the 'writesubtitles'
      or 'listsubtitles' param is set, otherwise an empty dict """
      wanted = (
          self.get_param('writesubtitles', False)
          or self.get_param('listsubtitles'))
      if not wanted:
          return {}
      return self._get_subtitles(*args, **kwargs)
  3299. def _get_subtitles(self, *args, **kwargs):
  3300. raise NotImplementedError('This method must be implemented by subclasses')
  def extract_comments(self, *args, **kwargs):
      """
      Return a callable that collects comments, or None if the 'getcomments'
      param is not set.

      self._get_comments(*args, **kwargs) is called right away; the returned
      callable then drains it with next(). It returns a dict with the
      collected 'comments' and a 'comment_count', which is None unless the
      iterator was exhausted normally. A KeyboardInterrupt stops collection
      and keeps what was gathered; any other exception is re-raised unless
      the 'ignoreerrors' param is exactly True, in which case it is reported
      through the downloader.
      """
      if not self.get_param('getcomments'):
          return None
      comment_iter = self._get_comments(*args, **kwargs)

      def extractor():
          collected = []
          finished = False
          try:
              while True:
                  collected.append(next(comment_iter))
          except StopIteration:
              finished = True
          except KeyboardInterrupt:
              self.to_screen('Interrupted by user')
          except Exception as err:
              if self.get_param('ignoreerrors') is not True:
                  raise
              self._downloader.report_error(err)
          comment_count = len(collected)
          self.to_screen(f'Extracted {comment_count} comments')
          return {
              'comments': collected,
              'comment_count': comment_count if finished else None,
          }
      return extractor
  3326. def _get_comments(self, *args, **kwargs):
  3327. raise NotImplementedError('This method must be implemented by subclasses')
  3328. @staticmethod
  3329. def _merge_subtitle_items(subtitle_list1, subtitle_list2):
  3330. """ Merge subtitle items for one language. Items with duplicated URLs/data
  3331. will be dropped. """
  3332. list1_data = set([item.get('url') or item['data'] for item in subtitle_list1])
  3333. ret = list(subtitle_list1)
  3334. ret.extend([item for item in subtitle_list2 if (item.get('url') or item['data']) not in list1_data])
  3335. return ret
  @classmethod
  def _merge_subtitles(cls, *dicts, target=None):
      """ Merge subtitle dictionaries, language by language.

      The dicts are merged in order into target (a new dict if target is
      None) using cls._merge_subtitle_items; target is modified in place
      and returned. """
      merged = {} if target is None else target
      for subtitles in dicts:
          for lang, items in subtitles.items():
              existing = merged.get(lang, [])
              merged[lang] = cls._merge_subtitle_items(existing, items)
      return merged
  def extract_automatic_captions(self, *args, **kwargs):
      """ Return self._get_automatic_captions(*args, **kwargs) if the
      'writeautomaticsub' or 'listsubtitles' param is set, otherwise an
      empty dict """
      wanted = (
          self.get_param('writeautomaticsub', False)
          or self.get_param('listsubtitles'))
      if not wanted:
          return {}
      return self._get_automatic_captions(*args, **kwargs)
  3350. def _get_automatic_captions(self, *args, **kwargs):
  3351. raise NotImplementedError('This method must be implemented by subclasses')
  def mark_watched(self, *args, **kwargs):
      """ Call self._mark_watched(*args, **kwargs) if the 'mark_watched'
      param is set and either login credentials are available (on an
      extractor that supports login) or the 'cookiefile' or
      'cookiesfrombrowser' param is set """
      if not self.get_param('mark_watched', False):
          return
      has_login = self.supports_login() and self._get_login_info()[0] is not None
      if has_login or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser'):
          self._mark_watched(*args, **kwargs)
  3358. def _mark_watched(self, *args, **kwargs):
  3359. raise NotImplementedError('This method must be implemented by subclasses')
  def geo_verification_headers(self):
      """ Return a headers dict carrying the 'geo_verification_proxy' param
      under 'Ytdl-request-proxy', or an empty dict when it is not set """
      proxy = self.get_param('geo_verification_proxy')
      if not proxy:
          return {}
      return {'Ytdl-request-proxy': proxy}
  def _generic_id(self, url):
      """ Derive an id from the url: its last '/'-separated component
      (trailing slashes ignored) without extension, URL-unquoted """
      last_component = url.rstrip('/').rpartition('/')[2]
      stem = os.path.splitext(last_component)[0]
      return compat_urllib_parse_unquote(stem)
  def _generic_title(self, url):
      """ Derive a title from the url: url_basename(url) without its
      extension, URL-unquoted """
      stem = os.path.splitext(url_basename(url))[0]
      return compat_urllib_parse_unquote(stem)
  3370. @staticmethod
  3371. def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
  3372. all_known = all(map(
  3373. lambda x: x is not None,
  3374. (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
  3375. return (
  3376. 'private' if is_private
  3377. else 'premium_only' if needs_premium
  3378. else 'subscriber_only' if needs_subscription
  3379. else 'needs_auth' if needs_auth
  3380. else 'unlisted' if is_unlisted
  3381. else 'public' if all_known
  3382. else None)
  def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
      '''
      @returns            A list of values for the extractor argument given by "key"
                          or "default" if no such key is present
      @param default      The default value to return when the key is not present (default: [])
      @param ie_key       Extractor key to look the argument up under
                          (default: self.ie_key()); it is lower-cased for the lookup
      @param casesense    When false, the values are converted to lower case
      '''
      extractor_name = (ie_key or self.ie_key()).lower()
      values = traverse_obj(
          self._downloader.params, ('extractor_args', extractor_name, key))
      if values is None:
          return [] if default is NO_DEFAULT else default
      if casesense:
          return list(values)
      return [value.lower() for value in values]
  3395. def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
  3396. if not playlist_id or not video_id:
  3397. return not video_id
  3398. no_playlist = (smuggled_data or {}).get('force_noplaylist')
  3399. if no_playlist is not None:
  3400. return not no_playlist
  3401. video_id = '' if video_id is True else f' {video_id}'
  3402. playlist_id = '' if playlist_id is True else f' {playlist_id}'
  3403. if self.get_param('noplaylist'):
  3404. self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
  3405. return False
  3406. self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
  3407. return True
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.
      They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
      Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
      """

      _MAX_RESULTS = float('inf')

      @classmethod
      def _make_valid_url(cls):
          """ Return the URL regex: _SEARCH_KEY, an optional prefix (a
          positive number or 'all'), a colon and the query """
          template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
          return template % cls._SEARCH_KEY

      def _real_extract(self, query):
          """ Parse the prefix of the search URL and fetch that many results:
          1 for an empty prefix, _MAX_RESULTS for 'all', otherwise the given
          number capped (with a warning) at _MAX_RESULTS """
          prefix, query = self._match_valid_url(query).group('prefix', 'query')
          if prefix == '':
              return self._get_n_results(query, 1)
          if prefix == 'all':
              return self._get_n_results(query, self._MAX_RESULTS)

          n = int(prefix)
          if n <= 0:
              raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
          if n > self._MAX_RESULTS:
              self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
              n = self._MAX_RESULTS
          return self._get_n_results(query, n)

      def _get_n_results(self, query, n):
          """Get a specified number of results for a query.
          Either this function or _search_results must be overridden by subclasses """
          # islice takes None, not infinity, to mean "no upper bound"
          limit = None if n == float('inf') else n
          results = itertools.islice(self._search_results(query), 0, limit)
          return self.playlist_result(results, query, query)

      def _search_results(self, query):
          """Returns an iterator of search results"""
          reason = 'This method must be implemented by subclasses'
          raise NotImplementedError(reason)

      @property
      def SEARCH_KEY(self):
          """ Read-only access to _SEARCH_KEY """
          return self._SEARCH_KEY