_utils.py 182 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572
  1. import base64
  2. import binascii
  3. import calendar
  4. import codecs
  5. import collections
  6. import collections.abc
  7. import contextlib
  8. import datetime as dt
  9. import email.header
  10. import email.utils
  11. import errno
  12. import hashlib
  13. import hmac
  14. import html.entities
  15. import html.parser
  16. import inspect
  17. import io
  18. import itertools
  19. import json
  20. import locale
  21. import math
  22. import mimetypes
  23. import netrc
  24. import operator
  25. import os
  26. import platform
  27. import random
  28. import re
  29. import shlex
  30. import socket
  31. import ssl
  32. import struct
  33. import subprocess
  34. import sys
  35. import tempfile
  36. import time
  37. import traceback
  38. import types
  39. import unicodedata
  40. import urllib.error
  41. import urllib.parse
  42. import urllib.request
  43. import xml.etree.ElementTree
  44. from . import traversal
  45. from ..compat import functools # isort: split
  46. from ..compat import (
  47. compat_etree_fromstring,
  48. compat_expanduser,
  49. compat_HTMLParseError,
  50. compat_os_name,
  51. )
  52. from ..dependencies import xattr
  53. __name__ = __name__.rsplit('.', 1)[0] # noqa: A001: Pretend to be the parent module
  54. # This is not clearly defined otherwise
  55. compiled_regex_type = type(re.compile(''))
  56. class NO_DEFAULT:
  57. pass
  58. def IDENTITY(x):
  59. return x
  60. ENGLISH_MONTH_NAMES = [
  61. 'January', 'February', 'March', 'April', 'May', 'June',
  62. 'July', 'August', 'September', 'October', 'November', 'December']
  63. MONTH_NAMES = {
  64. 'en': ENGLISH_MONTH_NAMES,
  65. 'fr': [
  66. 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
  67. 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
  68. # these follow the genitive grammatical case (dopełniacz)
  69. # some websites might be using nominative, which will require another month list
  70. # https://en.wikibooks.org/wiki/Polish/Noun_cases
  71. 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
  72. 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
  73. }
  74. # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
  75. TIMEZONE_NAMES = {
  76. 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
  77. 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
  78. 'EST': -5, 'EDT': -4, # Eastern
  79. 'CST': -6, 'CDT': -5, # Central
  80. 'MST': -7, 'MDT': -6, # Mountain
  81. 'PST': -8, 'PDT': -7, # Pacific
  82. }
  83. # needed for sanitizing filenames in restricted mode
  84. ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
  85. itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
  86. 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
  87. DATE_FORMATS = (
  88. '%d %B %Y',
  89. '%d %b %Y',
  90. '%B %d %Y',
  91. '%B %dst %Y',
  92. '%B %dnd %Y',
  93. '%B %drd %Y',
  94. '%B %dth %Y',
  95. '%b %d %Y',
  96. '%b %dst %Y',
  97. '%b %dnd %Y',
  98. '%b %drd %Y',
  99. '%b %dth %Y',
  100. '%b %dst %Y %I:%M',
  101. '%b %dnd %Y %I:%M',
  102. '%b %drd %Y %I:%M',
  103. '%b %dth %Y %I:%M',
  104. '%Y %m %d',
  105. '%Y-%m-%d',
  106. '%Y.%m.%d.',
  107. '%Y/%m/%d',
  108. '%Y/%m/%d %H:%M',
  109. '%Y/%m/%d %H:%M:%S',
  110. '%Y%m%d%H%M',
  111. '%Y%m%d%H%M%S',
  112. '%Y%m%d',
  113. '%Y-%m-%d %H:%M',
  114. '%Y-%m-%d %H:%M:%S',
  115. '%Y-%m-%d %H:%M:%S.%f',
  116. '%Y-%m-%d %H:%M:%S:%f',
  117. '%d.%m.%Y %H:%M',
  118. '%d.%m.%Y %H.%M',
  119. '%Y-%m-%dT%H:%M:%SZ',
  120. '%Y-%m-%dT%H:%M:%S.%fZ',
  121. '%Y-%m-%dT%H:%M:%S.%f0Z',
  122. '%Y-%m-%dT%H:%M:%S',
  123. '%Y-%m-%dT%H:%M:%S.%f',
  124. '%Y-%m-%dT%H:%M',
  125. '%b %d %Y at %H:%M',
  126. '%b %d %Y at %H:%M:%S',
  127. '%B %d %Y at %H:%M',
  128. '%B %d %Y at %H:%M:%S',
  129. '%H:%M %d-%b-%Y',
  130. )
  131. DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
  132. DATE_FORMATS_DAY_FIRST.extend([
  133. '%d-%m-%Y',
  134. '%d.%m.%Y',
  135. '%d.%m.%y',
  136. '%d/%m/%Y',
  137. '%d/%m/%y',
  138. '%d/%m/%Y %H:%M:%S',
  139. '%d-%m-%Y %H:%M',
  140. '%H:%M %d/%m/%Y',
  141. ])
  142. DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
  143. DATE_FORMATS_MONTH_FIRST.extend([
  144. '%m-%d-%Y',
  145. '%m.%d.%Y',
  146. '%m/%d/%Y',
  147. '%m/%d/%y',
  148. '%m/%d/%Y %H:%M:%S',
  149. ])
  150. PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
  151. JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
  152. NUMBER_RE = r'\d+(?:\.\d+)?'
  153. @functools.cache
  154. def preferredencoding():
  155. """Get preferred encoding.
  156. Returns the best encoding scheme for the system, based on
  157. locale.getpreferredencoding() and some further tweaks.
  158. """
  159. try:
  160. pref = locale.getpreferredencoding()
  161. 'TEST'.encode(pref)
  162. except Exception:
  163. pref = 'UTF-8'
  164. return pref
  165. def write_json_file(obj, fn):
  166. """ Encode obj as JSON and write it to fn, atomically if possible """
  167. tf = tempfile.NamedTemporaryFile(
  168. prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
  169. suffix='.tmp', delete=False, mode='w', encoding='utf-8')
  170. try:
  171. with tf:
  172. json.dump(obj, tf, ensure_ascii=False)
  173. if sys.platform == 'win32':
  174. # Need to remove existing file on Windows, else os.rename raises
  175. # WindowsError or FileExistsError.
  176. with contextlib.suppress(OSError):
  177. os.unlink(fn)
  178. with contextlib.suppress(OSError):
  179. mask = os.umask(0)
  180. os.umask(mask)
  181. os.chmod(tf.name, 0o666 & ~mask)
  182. os.rename(tf.name, fn)
  183. except Exception:
  184. with contextlib.suppress(OSError):
  185. os.remove(tf.name)
  186. raise
  187. def find_xpath_attr(node, xpath, key, val=None):
  188. """ Find the xpath xpath[@key=val] """
  189. assert re.match(r'^[a-zA-Z_-]+$', key)
  190. expr = xpath + (f'[@{key}]' if val is None else f"[@{key}='{val}']")
  191. return node.find(expr)
  192. # On python2.6 the xml.etree.ElementTree.Element methods don't support
  193. # the namespace parameter
  194. def xpath_with_ns(path, ns_map):
  195. components = [c.split(':') for c in path.split('/')]
  196. replaced = []
  197. for c in components:
  198. if len(c) == 1:
  199. replaced.append(c[0])
  200. else:
  201. ns, tag = c
  202. replaced.append(f'{{{ns_map[ns]}}}{tag}')
  203. return '/'.join(replaced)
  204. def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  205. def _find_xpath(xpath):
  206. return node.find(xpath)
  207. if isinstance(xpath, str):
  208. n = _find_xpath(xpath)
  209. else:
  210. for xp in xpath:
  211. n = _find_xpath(xp)
  212. if n is not None:
  213. break
  214. if n is None:
  215. if default is not NO_DEFAULT:
  216. return default
  217. elif fatal:
  218. name = xpath if name is None else name
  219. raise ExtractorError(f'Could not find XML element {name}')
  220. else:
  221. return None
  222. return n
  223. def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  224. n = xpath_element(node, xpath, name, fatal=fatal, default=default)
  225. if n is None or n == default:
  226. return n
  227. if n.text is None:
  228. if default is not NO_DEFAULT:
  229. return default
  230. elif fatal:
  231. name = xpath if name is None else name
  232. raise ExtractorError(f'Could not find XML element\'s text {name}')
  233. else:
  234. return None
  235. return n.text
  236. def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
  237. n = find_xpath_attr(node, xpath, key)
  238. if n is None:
  239. if default is not NO_DEFAULT:
  240. return default
  241. elif fatal:
  242. name = f'{xpath}[@{key}]' if name is None else name
  243. raise ExtractorError(f'Could not find XML attribute {name}')
  244. else:
  245. return None
  246. return n.attrib[key]
  247. def get_element_by_id(id, html, **kwargs):
  248. """Return the content of the tag with the specified ID in the passed HTML document"""
  249. return get_element_by_attribute('id', id, html, **kwargs)
  250. def get_element_html_by_id(id, html, **kwargs):
  251. """Return the html of the tag with the specified ID in the passed HTML document"""
  252. return get_element_html_by_attribute('id', id, html, **kwargs)
  253. def get_element_by_class(class_name, html):
  254. """Return the content of the first tag with the specified class in the passed HTML document"""
  255. retval = get_elements_by_class(class_name, html)
  256. return retval[0] if retval else None
  257. def get_element_html_by_class(class_name, html):
  258. """Return the html of the first tag with the specified class in the passed HTML document"""
  259. retval = get_elements_html_by_class(class_name, html)
  260. return retval[0] if retval else None
  261. def get_element_by_attribute(attribute, value, html, **kwargs):
  262. retval = get_elements_by_attribute(attribute, value, html, **kwargs)
  263. return retval[0] if retval else None
  264. def get_element_html_by_attribute(attribute, value, html, **kargs):
  265. retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
  266. return retval[0] if retval else None
  267. def get_elements_by_class(class_name, html, **kargs):
  268. """Return the content of all tags with the specified class in the passed HTML document as a list"""
  269. return get_elements_by_attribute(
  270. 'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
  271. html, escape_value=False)
  272. def get_elements_html_by_class(class_name, html):
  273. """Return the html of all tags with the specified class in the passed HTML document as a list"""
  274. return get_elements_html_by_attribute(
  275. 'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
  276. html, escape_value=False)
  277. def get_elements_by_attribute(*args, **kwargs):
  278. """Return the content of the tag with the specified attribute in the passed HTML document"""
  279. return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  280. def get_elements_html_by_attribute(*args, **kwargs):
  281. """Return the html of the tag with the specified attribute in the passed HTML document"""
  282. return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  283. def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
  284. """
  285. Return the text (content) and the html (whole) of the tag with the specified
  286. attribute in the passed HTML document
  287. """
  288. if not value:
  289. return
  290. quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
  291. value = re.escape(value) if escape_value else value
  292. partial_element_re = rf'''(?x)
  293. <(?P<tag>{tag})
  294. (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
  295. \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
  296. '''
  297. for m in re.finditer(partial_element_re, html):
  298. content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
  299. yield (
  300. unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
  301. whole,
  302. )
  303. class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
  304. """
  305. HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
  306. closing tag for the first opening tag it has encountered, and can be used
  307. as a context manager
  308. """
  309. class HTMLBreakOnClosingTagException(Exception):
  310. pass
  311. def __init__(self):
  312. self.tagstack = collections.deque()
  313. html.parser.HTMLParser.__init__(self)
  314. def __enter__(self):
  315. return self
  316. def __exit__(self, *_):
  317. self.close()
  318. def close(self):
  319. # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
  320. # so data remains buffered; we no longer have any interest in it, thus
  321. # override this method to discard it
  322. pass
  323. def handle_starttag(self, tag, _):
  324. self.tagstack.append(tag)
  325. def handle_endtag(self, tag):
  326. if not self.tagstack:
  327. raise compat_HTMLParseError('no tags in the stack')
  328. while self.tagstack:
  329. inner_tag = self.tagstack.pop()
  330. if inner_tag == tag:
  331. break
  332. else:
  333. raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
  334. if not self.tagstack:
  335. raise self.HTMLBreakOnClosingTagException
  336. # XXX: This should be far less strict
  337. def get_element_text_and_html_by_tag(tag, html):
  338. """
  339. For the first element with the specified tag in the passed HTML document
  340. return its' content (text) and the whole element (html)
  341. """
  342. def find_or_raise(haystack, needle, exc):
  343. try:
  344. return haystack.index(needle)
  345. except ValueError:
  346. raise exc
  347. closing_tag = f'</{tag}>'
  348. whole_start = find_or_raise(
  349. html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
  350. content_start = find_or_raise(
  351. html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
  352. content_start += whole_start + 1
  353. with HTMLBreakOnClosingTagParser() as parser:
  354. parser.feed(html[whole_start:content_start])
  355. if not parser.tagstack or parser.tagstack[0] != tag:
  356. raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
  357. offset = content_start
  358. while offset < len(html):
  359. next_closing_tag_start = find_or_raise(
  360. html[offset:], closing_tag,
  361. compat_HTMLParseError(f'closing {tag} tag not found'))
  362. next_closing_tag_end = next_closing_tag_start + len(closing_tag)
  363. try:
  364. parser.feed(html[offset:offset + next_closing_tag_end])
  365. offset += next_closing_tag_end
  366. except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
  367. return html[content_start:offset + next_closing_tag_start], \
  368. html[whole_start:offset + next_closing_tag_end]
  369. raise compat_HTMLParseError('unexpected end of html')
  370. class HTMLAttributeParser(html.parser.HTMLParser):
  371. """Trivial HTML parser to gather the attributes for a single element"""
  372. def __init__(self):
  373. self.attrs = {}
  374. html.parser.HTMLParser.__init__(self)
  375. def handle_starttag(self, tag, attrs):
  376. self.attrs = dict(attrs)
  377. raise compat_HTMLParseError('done')
  378. class HTMLListAttrsParser(html.parser.HTMLParser):
  379. """HTML parser to gather the attributes for the elements of a list"""
  380. def __init__(self):
  381. html.parser.HTMLParser.__init__(self)
  382. self.items = []
  383. self._level = 0
  384. def handle_starttag(self, tag, attrs):
  385. if tag == 'li' and self._level == 0:
  386. self.items.append(dict(attrs))
  387. self._level += 1
  388. def handle_endtag(self, tag):
  389. self._level -= 1
  390. def extract_attributes(html_element):
  391. """Given a string for an HTML element such as
  392. <el
  393. a="foo" B="bar" c="&98;az" d=boz
  394. empty= noval entity="&amp;"
  395. sq='"' dq="'"
  396. >
  397. Decode and return a dictionary of attributes.
  398. {
  399. 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
  400. 'empty': '', 'noval': None, 'entity': '&',
  401. 'sq': '"', 'dq': '\''
  402. }.
  403. """
  404. parser = HTMLAttributeParser()
  405. with contextlib.suppress(compat_HTMLParseError):
  406. parser.feed(html_element)
  407. parser.close()
  408. return parser.attrs
  409. def parse_list(webpage):
  410. """Given a string for an series of HTML <li> elements,
  411. return a dictionary of their attributes"""
  412. parser = HTMLListAttrsParser()
  413. parser.feed(webpage)
  414. parser.close()
  415. return parser.items
  416. def clean_html(html):
  417. """Clean an HTML snippet into a readable string"""
  418. if html is None: # Convenience for sanitizing descriptions etc.
  419. return html
  420. html = re.sub(r'\s+', ' ', html)
  421. html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
  422. html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
  423. # Strip html tags
  424. html = re.sub('<.*?>', '', html)
  425. # Replace html entities
  426. html = unescapeHTML(html)
  427. return html.strip()
  428. class LenientJSONDecoder(json.JSONDecoder):
  429. # TODO: Write tests
  430. def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
  431. self.transform_source, self.ignore_extra = transform_source, ignore_extra
  432. self._close_attempts = 2 * close_objects
  433. super().__init__(*args, **kwargs)
  434. @staticmethod
  435. def _close_object(err):
  436. doc = err.doc[:err.pos]
  437. # We need to add comma first to get the correct error message
  438. if err.msg.startswith('Expecting \',\''):
  439. return doc + ','
  440. elif not doc.endswith(','):
  441. return
  442. if err.msg.startswith('Expecting property name'):
  443. return doc[:-1] + '}'
  444. elif err.msg.startswith('Expecting value'):
  445. return doc[:-1] + ']'
  446. def decode(self, s):
  447. if self.transform_source:
  448. s = self.transform_source(s)
  449. for attempt in range(self._close_attempts + 1):
  450. try:
  451. if self.ignore_extra:
  452. return self.raw_decode(s.lstrip())[0]
  453. return super().decode(s)
  454. except json.JSONDecodeError as e:
  455. if e.pos is None:
  456. raise
  457. elif attempt < self._close_attempts:
  458. s = self._close_object(e)
  459. if s is not None:
  460. continue
  461. raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
  462. assert False, 'Too many attempts to decode JSON'
  463. def sanitize_open(filename, open_mode):
  464. """Try to open the given filename, and slightly tweak it if this fails.
  465. Attempts to open the given filename. If this fails, it tries to change
  466. the filename slightly, step by step, until it's either able to open it
  467. or it fails and raises a final exception, like the standard open()
  468. function.
  469. It returns the tuple (stream, definitive_file_name).
  470. """
  471. if filename == '-':
  472. if sys.platform == 'win32':
  473. import msvcrt
  474. # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
  475. with contextlib.suppress(io.UnsupportedOperation):
  476. msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
  477. return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
  478. for attempt in range(2):
  479. try:
  480. try:
  481. if sys.platform == 'win32':
  482. # FIXME: An exclusive lock also locks the file from being read.
  483. # Since windows locks are mandatory, don't lock the file on windows (for now).
  484. # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
  485. raise LockingUnsupportedError
  486. stream = locked_file(filename, open_mode, block=False).__enter__()
  487. except OSError:
  488. stream = open(filename, open_mode)
  489. return stream, filename
  490. except OSError as err:
  491. if attempt or err.errno in (errno.EACCES,):
  492. raise
  493. old_filename, filename = filename, sanitize_path(filename)
  494. if old_filename == filename:
  495. raise
  496. def timeconvert(timestr):
  497. """Convert RFC 2822 defined time string into system timestamp"""
  498. timestamp = None
  499. timetuple = email.utils.parsedate_tz(timestr)
  500. if timetuple is not None:
  501. timestamp = email.utils.mktime_tz(timetuple)
  502. return timestamp
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' only when the input is ''
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Map a single character to its replacement. Substitute characters are
        # prefixed with '\0' so that the post-processing below can tell them
        # apart from characters that were already present in the input.
        # NOTE: the order of the branches matters; the first matching rule wins
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and control characters are dropped entirely
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Unicode categories starting with C or M are removed, anything else becomes '_'
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the NUL markers; never return an empty name
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim them from both ends
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        # Replace a leading '-' with '_'
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # Strip leading dots
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
  553. def sanitize_path(s, force=False):
  554. """Sanitizes and normalizes path on Windows"""
  555. # XXX: this handles drive relative paths (c:sth) incorrectly
  556. if sys.platform == 'win32':
  557. force = False
  558. drive_or_unc, _ = os.path.splitdrive(s)
  559. elif force:
  560. drive_or_unc = ''
  561. else:
  562. return s
  563. norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
  564. if drive_or_unc:
  565. norm_path.pop(0)
  566. sanitized_path = [
  567. path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
  568. for path_part in norm_path]
  569. if drive_or_unc:
  570. sanitized_path.insert(0, drive_or_unc + os.path.sep)
  571. elif force and s and s[0] == os.path.sep:
  572. sanitized_path.insert(0, os.path.sep)
  573. # TODO: Fix behavioral differences <3.12
  574. # The workaround using `normpath` only superficially passes tests
  575. # Ref: https://github.com/python/cpython/pull/100351
  576. return os.path.normpath(os.path.join(*sanitized_path))
  577. def sanitize_url(url, *, scheme='http'):
  578. # Prepend protocol-less URLs with `http:` scheme in order to mitigate
  579. # the number of unwanted failures due to missing protocol
  580. if url is None:
  581. return
  582. elif url.startswith('//'):
  583. return f'{scheme}:{url}'
  584. # Fix some common typos seen so far
  585. COMMON_TYPOS = (
  586. # https://github.com/ytdl-org/youtube-dl/issues/15649
  587. (r'^httpss://', r'https://'),
  588. # https://bx1.be/lives/direct-tv/
  589. (r'^rmtp([es]?)://', r'rtmp\1://'),
  590. )
  591. for mistake, fixup in COMMON_TYPOS:
  592. if re.match(mistake, url):
  593. return re.sub(mistake, fixup, url)
  594. return url
  595. def extract_basic_auth(url):
  596. parts = urllib.parse.urlsplit(url)
  597. if parts.username is None:
  598. return url, None
  599. url = urllib.parse.urlunsplit(parts._replace(netloc=(
  600. parts.hostname if parts.port is None
  601. else f'{parts.hostname}:{parts.port}')))
  602. auth_payload = base64.b64encode(
  603. ('{}:{}'.format(parts.username, parts.password or '')).encode())
  604. return url, f'Basic {auth_payload.decode()}'
  605. def expand_path(s):
  606. """Expand shell variables and ~"""
  607. return os.path.expandvars(compat_expanduser(s))
  608. def orderedSet(iterable, *, lazy=False):
  609. """Remove all duplicates from the input iterable"""
  610. def _iter():
  611. seen = [] # Do not use set since the items can be unhashable
  612. for x in iterable:
  613. if x not in seen:
  614. seen.append(x)
  615. yield x
  616. return _iter() if lazy else list(_iter())
  617. def _htmlentity_transform(entity_with_semicolon):
  618. """Transforms an HTML entity to a character."""
  619. entity = entity_with_semicolon[:-1]
  620. # Known non-numeric HTML entity
  621. if entity in html.entities.name2codepoint:
  622. return chr(html.entities.name2codepoint[entity])
  623. # TODO: HTML5 allows entities without a semicolon.
  624. # E.g. '&Eacuteric' should be decoded as 'Éric'.
  625. if entity_with_semicolon in html.entities.html5:
  626. return html.entities.html5[entity_with_semicolon]
  627. mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
  628. if mobj is not None:
  629. numstr = mobj.group(1)
  630. if numstr.startswith('x'):
  631. base = 16
  632. numstr = f'0{numstr}'
  633. else:
  634. base = 10
  635. # See https://github.com/ytdl-org/youtube-dl/issues/7518
  636. with contextlib.suppress(ValueError):
  637. return chr(int(numstr, base))
  638. # Unknown entity in name, return its literal representation
  639. return f'&{entity};'
  640. def unescapeHTML(s):
  641. if s is None:
  642. return None
  643. assert isinstance(s, str)
  644. return re.sub(
  645. r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  646. def escapeHTML(text):
  647. return (
  648. text
  649. .replace('&', '&amp;')
  650. .replace('<', '&lt;')
  651. .replace('>', '&gt;')
  652. .replace('"', '&quot;')
  653. .replace("'", '&#39;')
  654. )
  655. class netrc_from_content(netrc.netrc):
  656. def __init__(self, content):
  657. self.hosts, self.macros = {}, {}
  658. with io.StringIO(content) as stream:
  659. self._parse('-', stream, False)
class Popen(subprocess.Popen):
    """
    subprocess.Popen wrapper which adjusts the child environment, provides
    text-mode defaults, builds the `cmd.exe` command line itself for shell
    commands on Windows and adds the `communicate_or_kill`/`run` helpers.
    """

    # Startup info shared by all instances: on Windows the STARTF_USESHOWWINDOW
    # flag is set; elsewhere no startup info is passed
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        `env` is modified in place. Nothing is done unless `sys._MEIPASS` exists.
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved under `<key>_ORIG`, or unset the key if there is none
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        """
        Same as subprocess.Popen, except that:
        - `env` defaults to a copy of os.environ and is passed through _fix_pyinstaller_ld_path
        - `text=True` defaults the encoding to utf-8 with errors='replace'
        - with `shell=True` on Windows (and no explicit `executable`), the command
          line for cmd.exe is assembled here and the process is started with shell=False
        """
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Truthy if any of the options selecting text mode was given; consulted by run()
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            if not isinstance(args, str):
                args = shell_quote(args, shell=True)
            shell = False
            # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
            env['='] = '"^\n\n"'
            args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        """Return the absolute path of the command interpreter, taken from %ComSpec%
        or else %SystemRoot%\\System32\\cmd.exe; raise FileNotFoundError if that path is not absolute"""
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process (and wait for it) if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless `timeout` is 0, also wait for it with that timeout"""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion and return (stdout, stderr, returncode).

        Falsy stdout/stderr (e.g. None) are replaced by an empty str in
        text mode and by empty bytes otherwise.
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
  721. def encodeArgument(s):
  722. # Legacy code that uses byte strings
  723. # Uncomment the following line after fixing all post processors
  724. # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
  725. return s if isinstance(s, str) else s.decode('ascii')
  726. _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
  727. def timetuple_from_msec(msec):
  728. secs, msec = divmod(msec, 1000)
  729. mins, secs = divmod(secs, 60)
  730. hrs, mins = divmod(mins, 60)
  731. return _timetuple(hrs, mins, secs, msec)
  732. def formatSeconds(secs, delim=':', msec=False):
  733. time = timetuple_from_msec(secs * 1000)
  734. if time.hours:
  735. ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
  736. elif time.minutes:
  737. ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
  738. else:
  739. ret = '%d' % time.seconds
  740. return '%s.%03d' % (ret, time.milliseconds) if msec else ret
  741. def bug_reports_message(before=';'):
  742. from ..update import REPOSITORY
  743. msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
  744. 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
  745. before = before.rstrip()
  746. if not before or before.endswith(('.', '!', '?')):
  747. msg = msg[0].title() + msg[1:]
  748. return (before + ' ' if before else '') + msg
  749. class YoutubeDLError(Exception):
  750. """Base exception for YoutubeDL errors."""
  751. msg = None
  752. def __init__(self, msg=None):
  753. if msg is not None:
  754. self.msg = msg
  755. elif self.msg is None:
  756. self.msg = type(self).__name__
  757. super().__init__(self.msg)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the formatted message.
        """
        from ..networking.exceptions import network_exceptions
        # An error raised while a network exception is being handled is always "expected"
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When wrapping another ExtractorError, keep the exc_info recorded by that one
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: '[ie] video_id: orig_msg (caused by ...)', followed by the
        # bug report notice unless the error is expected
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and/or traceback of `cause` joined by a newline, or None"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once `msg` has been set, changing any other attribute re-generates
        # the message so that `msg` and `args` stay in sync with the fields
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
  795. class UnsupportedError(ExtractorError):
  796. def __init__(self, url):
  797. super().__init__(
  798. f'Unsupported URL: {url}', expected=True)
  799. self.url = url
  800. class RegexNotFoundError(ExtractorError):
  801. """Error when a regex didn't match"""
  802. pass
  803. class GeoRestrictedError(ExtractorError):
  804. """Geographic restriction Error exception.
  805. This exception may be thrown when a video is not available from your
  806. geographic location due to geographic restrictions imposed by a website.
  807. """
  808. def __init__(self, msg, countries=None, **kwargs):
  809. kwargs['expected'] = True
  810. super().__init__(msg, **kwargs)
  811. self.countries = countries
  812. class UserNotLive(ExtractorError):
  813. """Error when a channel/user is not live"""
  814. def __init__(self, msg=None, **kwargs):
  815. kwargs['expected'] = True
  816. super().__init__(msg or 'The channel is not currently live', **kwargs)
  817. class DownloadError(YoutubeDLError):
  818. """Download Error exception.
  819. This exception may be thrown by FileDownloader objects if they are not
  820. configured to continue on errors. They will contain the appropriate
  821. error message.
  822. """
  823. def __init__(self, msg, exc_info=None):
  824. """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
  825. super().__init__(msg)
  826. self.exc_info = exc_info
  827. class EntryNotInPlaylist(YoutubeDLError):
  828. """Entry not in playlist exception.
  829. This exception will be thrown by YoutubeDL when a requested entry
  830. is not found in the playlist info_dict
  831. """
  832. msg = 'Entry not found in info'
  833. class SameFileError(YoutubeDLError):
  834. """Same File exception.
  835. This exception will be thrown by FileDownloader objects if they detect
  836. multiple files would have to be downloaded to the same file on disk.
  837. """
  838. msg = 'Fixed output name but more than one file to download'
  839. def __init__(self, filename=None):
  840. if filename is not None:
  841. self.msg += f': {filename}'
  842. super().__init__(self.msg)
  843. class PostProcessingError(YoutubeDLError):
  844. """Post Processing exception.
  845. This exception may be raised by PostProcessor's .run() method to
  846. indicate an error in the postprocessing task.
  847. """
  848. class DownloadCancelled(YoutubeDLError):
  849. """ Exception raised when the download queue should be interrupted """
  850. msg = 'The download was cancelled'
  851. class ExistingVideoReached(DownloadCancelled):
  852. """ --break-on-existing triggered """
  853. msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
  854. class RejectedVideoReached(DownloadCancelled):
  855. """ --break-match-filter triggered """
  856. msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
  857. class MaxDownloadsReached(DownloadCancelled):
  858. """ --max-downloads limit has been reached. """
  859. msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
  860. class ReExtractInfo(YoutubeDLError):
  861. """ Video info needs to be re-extracted. """
  862. def __init__(self, msg, expected=False):
  863. super().__init__(msg)
  864. self.expected = expected
  865. class ThrottledDownload(ReExtractInfo):
  866. """ Download speed below --throttled-rate. """
  867. msg = 'The download speed is below throttle limit'
  868. def __init__(self):
  869. super().__init__(self.msg, expected=False)
  870. class UnavailableVideoError(YoutubeDLError):
  871. """Unavailable Format exception.
  872. This exception will be thrown when a video is requested
  873. in a format that is not available for that video.
  874. """
  875. msg = 'Unable to download video'
  876. def __init__(self, err=None):
  877. if err is not None:
  878. self.msg += f': {err}'
  879. super().__init__(self.msg)
  880. class ContentTooShortError(YoutubeDLError):
  881. """Content Too Short exception.
  882. This exception may be raised by FileDownloader objects when a file they
  883. download is too small for what the server announced first, indicating
  884. the connection was probably interrupted.
  885. """
  886. def __init__(self, downloaded, expected):
  887. super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
  888. # Both in bytes
  889. self.downloaded = downloaded
  890. self.expected = expected
  891. class XAttrMetadataError(YoutubeDLError):
  892. def __init__(self, code=None, msg='Unknown error'):
  893. super().__init__(msg)
  894. self.code = code
  895. self.msg = msg
  896. # Parsing code and msg
  897. if (self.code in (errno.ENOSPC, errno.EDQUOT)
  898. or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
  899. self.reason = 'NO_SPACE'
  900. elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
  901. self.reason = 'VALUE_TOO_LONG'
  902. else:
  903. self.reason = 'NOT_SUPPORTED'
  904. class XAttrUnavailableError(YoutubeDLError):
  905. pass
  906. def is_path_like(f):
  907. return isinstance(f, (str, bytes, os.PathLike))
def extract_timezone(date_str, default=None):
    """
    Split a trailing timezone off a date string.

    @param date_str  The date string to examine
    @param default   Timezone to return when none is found. A falsy value
                     (including None) yields a zero timedelta, unless it is
                     NO_DEFAULT, in which case None is returned
    @returns         Tuple (timezone, date_str): the UTC offset as a timedelta
                     (or None, see above) and the date string with the
                     recognized timezone suffix removed
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    timezone = None

    if not m:
        # No numeric offset or 'Z': look for an upper-case abbreviation following a
        # time and resolve it through TIMEZONE_NAMES; unknown names are left in place
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
            timezone = dt.timedelta(hours=timezone)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        # A bare 'Z' has no sign group; the timezone then stays None and is
        # resolved by the default handling below
        if m.group('sign'):
            sign = 1 if m.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))

    if timezone is None and default is not NO_DEFAULT:
        timezone = default or dt.timedelta()

    return timezone, date_str
  937. def parse_iso8601(date_str, delimiter='T', timezone=None):
  938. """ Return a UNIX timestamp from the given date """
  939. if date_str is None:
  940. return None
  941. date_str = re.sub(r'\.[0-9]+', '', date_str)
  942. timezone, date_str = extract_timezone(date_str, timezone)
  943. with contextlib.suppress(ValueError, TypeError):
  944. date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
  945. dt_ = dt.datetime.strptime(date_str, date_format) - timezone
  946. return calendar.timegm(dt_.timetuple())
  947. def date_formats(day_first=True):
  948. return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
  949. def unified_strdate(date_str, day_first=True):
  950. """Return a string with the date in the format YYYYMMDD"""
  951. if date_str is None:
  952. return None
  953. upload_date = None
  954. # Replace commas
  955. date_str = date_str.replace(',', ' ')
  956. # Remove AM/PM + timezone
  957. date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  958. _, date_str = extract_timezone(date_str)
  959. for expression in date_formats(day_first):
  960. with contextlib.suppress(ValueError):
  961. upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d')
  962. if upload_date is None:
  963. timetuple = email.utils.parsedate_tz(date_str)
  964. if timetuple:
  965. with contextlib.suppress(ValueError):
  966. upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
  967. if upload_date is not None:
  968. return str(upload_date)
  969. def unified_timestamp(date_str, day_first=True):
  970. if not isinstance(date_str, str):
  971. return None
  972. date_str = re.sub(r'\s+', ' ', re.sub(
  973. r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
  974. pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
  975. timezone, date_str = extract_timezone(date_str)
  976. # Remove AM/PM + timezone
  977. date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  978. # Remove unrecognized timezones from ISO 8601 alike timestamps
  979. m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
  980. if m:
  981. date_str = date_str[:-len(m.group('tz'))]
  982. # Python only supports microseconds, so remove nanoseconds
  983. m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
  984. if m:
  985. date_str = m.group(1)
  986. for expression in date_formats(day_first):
  987. with contextlib.suppress(ValueError):
  988. dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
  989. return calendar.timegm(dt_.timetuple())
  990. timetuple = email.utils.parsedate_tz(date_str)
  991. if timetuple:
  992. return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
  993. def determine_ext(url, default_ext='unknown_video'):
  994. if url is None or '.' not in url:
  995. return default_ext
  996. guess = url.partition('?')[0].rpartition('.')[2]
  997. if re.match(r'^[A-Za-z0-9]+$', guess):
  998. return guess
  999. # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
  1000. elif guess.rstrip('/') in KNOWN_EXTENSIONS:
  1001. return guess.rstrip('/')
  1002. else:
  1003. return default_ext
  1004. def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
  1005. return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
  1006. def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
  1007. R"""
  1008. Return a datetime object from a string.
  1009. Supported format:
  1010. (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
  1011. @param format strftime format of DATE
  1012. @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
  1013. auto: round to the unit provided in date_str (if applicable).
  1014. """
  1015. auto_precision = False
  1016. if precision == 'auto':
  1017. auto_precision = True
  1018. precision = 'microsecond'
  1019. today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
  1020. if date_str in ('now', 'today'):
  1021. return today
  1022. if date_str == 'yesterday':
  1023. return today - dt.timedelta(days=1)
  1024. match = re.match(
  1025. r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
  1026. date_str)
  1027. if match is not None:
  1028. start_time = datetime_from_str(match.group('start'), precision, format)
  1029. time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
  1030. unit = match.group('unit')
  1031. if unit == 'month' or unit == 'year':
  1032. new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
  1033. unit = 'day'
  1034. else:
  1035. if unit == 'week':
  1036. unit = 'day'
  1037. time *= 7
  1038. delta = dt.timedelta(**{unit + 's': time})
  1039. new_date = start_time + delta
  1040. if auto_precision:
  1041. return datetime_round(new_date, unit)
  1042. return new_date
  1043. return datetime_round(dt.datetime.strptime(date_str, format), precision)
  1044. def date_from_str(date_str, format='%Y%m%d', strict=False):
  1045. R"""
  1046. Return a date object from a string using datetime_from_str
  1047. @param strict Restrict allowed patterns to "YYYYMMDD" and
  1048. (now|today|yesterday)(-\d+(day|week|month|year)s?)?
  1049. """
  1050. if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
  1051. raise ValueError(f'Invalid date format "{date_str}"')
  1052. return datetime_from_str(date_str, precision='microsecond', format=format).date()
  1053. def datetime_add_months(dt_, months):
  1054. """Increment/Decrement a datetime object by months."""
  1055. month = dt_.month + months - 1
  1056. year = dt_.year + month // 12
  1057. month = month % 12 + 1
  1058. day = min(dt_.day, calendar.monthrange(year, month)[1])
  1059. return dt_.replace(year, month, day)
  1060. def datetime_round(dt_, precision='day'):
  1061. """
  1062. Round a datetime object's time to a specific precision
  1063. """
  1064. if precision == 'microsecond':
  1065. return dt_
  1066. unit_seconds = {
  1067. 'day': 86400,
  1068. 'hour': 3600,
  1069. 'minute': 60,
  1070. 'second': 1,
  1071. }
  1072. roundto = lambda x, n: ((x + n / 2) // n) * n
  1073. timestamp = roundto(calendar.timegm(dt_.timetuple()), unit_seconds[precision])
  1074. return dt.datetime.fromtimestamp(timestamp, dt.timezone.utc)
  1075. def hyphenate_date(date_str):
  1076. """
  1077. Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
  1078. match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
  1079. if match is not None:
  1080. return '-'.join(match.groups())
  1081. else:
  1082. return date_str
  1083. class DateRange:
  1084. """Represents a time interval between two dates"""
  1085. def __init__(self, start=None, end=None):
  1086. """start and end must be strings in the format accepted by date"""
  1087. if start is not None:
  1088. self.start = date_from_str(start, strict=True)
  1089. else:
  1090. self.start = dt.datetime.min.date()
  1091. if end is not None:
  1092. self.end = date_from_str(end, strict=True)
  1093. else:
  1094. self.end = dt.datetime.max.date()
  1095. if self.start > self.end:
  1096. raise ValueError(f'Date range: "{self}" , the start date must be before the end date')
  1097. @classmethod
  1098. def day(cls, day):
  1099. """Returns a range that only contains the given day"""
  1100. return cls(day, day)
  1101. def __contains__(self, date):
  1102. """Check if the date is in the range"""
  1103. if not isinstance(date, dt.date):
  1104. date = date_from_str(date)
  1105. return self.start <= date <= self.end
  1106. def __repr__(self):
  1107. return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
  1108. def __str__(self):
  1109. return f'{self.start} to {self.end}'
  1110. def __eq__(self, other):
  1111. return (isinstance(other, DateRange)
  1112. and self.start == other.start and self.end == other.end)
  1113. @functools.cache
  1114. def system_identifier():
  1115. python_implementation = platform.python_implementation()
  1116. if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
  1117. python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
  1118. libc_ver = []
  1119. with contextlib.suppress(OSError): # We may not have access to the executable
  1120. libc_ver = platform.libc_ver()
  1121. return 'Python {} ({} {} {}) - {} ({}{})'.format(
  1122. platform.python_version(),
  1123. python_implementation,
  1124. platform.machine(),
  1125. platform.architecture()[0],
  1126. platform.platform(),
  1127. ssl.OPENSSL_VERSION,
  1128. format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
  1129. )
  1130. @functools.cache
  1131. def get_windows_version():
  1132. """ Get Windows version. returns () if it's not running on Windows """
  1133. if compat_os_name == 'nt':
  1134. return version_tuple(platform.win32_ver()[1])
  1135. else:
  1136. return ()
  1137. def write_string(s, out=None, encoding=None):
  1138. assert isinstance(s, str)
  1139. out = out or sys.stderr
  1140. # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
  1141. if not out:
  1142. return
  1143. if compat_os_name == 'nt' and supports_terminal_sequences(out):
  1144. s = re.sub(r'([\r\n]+)', r' \1', s)
  1145. enc, buffer = None, out
  1146. # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
  1147. if 'b' in (getattr(out, 'mode', None) or ''):
  1148. enc = encoding or preferredencoding()
  1149. elif hasattr(out, 'buffer'):
  1150. buffer = out.buffer
  1151. enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
  1152. buffer.write(s.encode(enc, 'ignore') if enc else s)
  1153. out.flush()
  1154. # TODO: Use global logger
  1155. def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
  1156. from .. import _IN_CLI
  1157. if _IN_CLI:
  1158. if msg in deprecation_warning._cache:
  1159. return
  1160. deprecation_warning._cache.add(msg)
  1161. if printer:
  1162. return printer(f'{msg}{bug_reports_message()}', **kwargs)
  1163. return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
  1164. else:
  1165. import warnings
  1166. warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
  1167. deprecation_warning._cache = set()
  1168. def bytes_to_intlist(bs):
  1169. if not bs:
  1170. return []
  1171. if isinstance(bs[0], int): # Python 3
  1172. return list(bs)
  1173. else:
  1174. return [ord(c) for c in bs]
  1175. def intlist_to_bytes(xs):
  1176. if not xs:
  1177. return b''
  1178. return struct.pack('%dB' % len(xs), *xs)
  1179. class LockingUnsupportedError(OSError):
  1180. msg = 'File locking is not supported'
  1181. def __init__(self):
  1182. super().__init__(self.msg)
  1183. # Cross-platform file locking
  1184. if sys.platform == 'win32':
  1185. import ctypes
  1186. import ctypes.wintypes
  1187. import msvcrt
  1188. class OVERLAPPED(ctypes.Structure):
  1189. _fields_ = [
  1190. ('Internal', ctypes.wintypes.LPVOID),
  1191. ('InternalHigh', ctypes.wintypes.LPVOID),
  1192. ('Offset', ctypes.wintypes.DWORD),
  1193. ('OffsetHigh', ctypes.wintypes.DWORD),
  1194. ('hEvent', ctypes.wintypes.HANDLE),
  1195. ]
  1196. kernel32 = ctypes.WinDLL('kernel32')
  1197. LockFileEx = kernel32.LockFileEx
  1198. LockFileEx.argtypes = [
  1199. ctypes.wintypes.HANDLE, # hFile
  1200. ctypes.wintypes.DWORD, # dwFlags
  1201. ctypes.wintypes.DWORD, # dwReserved
  1202. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1203. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1204. ctypes.POINTER(OVERLAPPED), # Overlapped
  1205. ]
  1206. LockFileEx.restype = ctypes.wintypes.BOOL
  1207. UnlockFileEx = kernel32.UnlockFileEx
  1208. UnlockFileEx.argtypes = [
  1209. ctypes.wintypes.HANDLE, # hFile
  1210. ctypes.wintypes.DWORD, # dwReserved
  1211. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1212. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1213. ctypes.POINTER(OVERLAPPED), # Overlapped
  1214. ]
  1215. UnlockFileEx.restype = ctypes.wintypes.BOOL
  1216. whole_low = 0xffffffff
  1217. whole_high = 0x7fffffff
  1218. def _lock_file(f, exclusive, block):
  1219. overlapped = OVERLAPPED()
  1220. overlapped.Offset = 0
  1221. overlapped.OffsetHigh = 0
  1222. overlapped.hEvent = 0
  1223. f._lock_file_overlapped_p = ctypes.pointer(overlapped)
  1224. if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
  1225. (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
  1226. 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1227. # NB: No argument form of "ctypes.FormatError" does not work on PyPy
  1228. raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
  1229. def _unlock_file(f):
  1230. assert f._lock_file_overlapped_p
  1231. handle = msvcrt.get_osfhandle(f.fileno())
  1232. if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1233. raise OSError(f'Unlocking file failed: {ctypes.FormatError()!r}')
  1234. else:
  1235. try:
  1236. import fcntl
  1237. def _lock_file(f, exclusive, block):
  1238. flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
  1239. if not block:
  1240. flags |= fcntl.LOCK_NB
  1241. try:
  1242. fcntl.flock(f, flags)
  1243. except BlockingIOError:
  1244. raise
  1245. except OSError: # AOSP does not have flock()
  1246. fcntl.lockf(f, flags)
  1247. def _unlock_file(f):
  1248. with contextlib.suppress(OSError):
  1249. return fcntl.flock(f, fcntl.LOCK_UN)
  1250. with contextlib.suppress(OSError):
  1251. return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
  1252. return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
  1253. except ImportError:
  1254. def _lock_file(f, exclusive, block):
  1255. raise LockingUnsupportedError
  1256. def _unlock_file(f):
  1257. raise LockingUnsupportedError
class locked_file:
    """File wrapper that holds a lock on the file while used as a context manager.

    The file is opened in `__init__`, locked in `__enter__` (shared lock for
    read modes, exclusive otherwise) and unlocked and closed in `__exit__`.
    Unknown attributes are forwarded to the underlying file object.
    """
    # True while this object holds the lock
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode      one of r, rb, a, ab, w, wb (anything else raises NotImplementedError)
        @param block     passed to the locking backend: whether to wait for the lock
        @param encoding  passed to os.fdopen
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # OR together the os.open() flags; flags missing on this platform count as 0
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock; for 'w' modes, truncate the file afterwards."""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            # Do not leak the file descriptor if the lock cannot be taken
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncation is deferred until the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; `locked` is reset even if unlocking fails."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        """Unlock and close the file; the file is closed even if unlocking raises."""
        try:
            self.unlock()
        finally:
            self.f.close()

    # File-like aliases for the context manager protocol
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Only called for attributes not found on this object: delegate to the file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
  1312. @functools.cache
  1313. def get_filesystem_encoding():
  1314. encoding = sys.getfilesystemencoding()
  1315. return encoding if encoding is not None else 'utf-8'
# Translation tables applied by `shell_quote` to arguments that need quoting on Windows.
# Used when `shell` is false: escape double quotes with a backslash
_WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
# Used when `shell` is true
_CMD_QUOTE_TRANS = str.maketrans({
    # Keep quotes balanced by replacing them with `""` instead of `\\"`
    '"': '""',
    # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
    # `=` should be unique since variables containing `=` cannot be set using cmd
    '\n': '%=%',
    '\r': '%=%',
    # Use zero length variable replacement so `%` doesn't get expanded
    # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
    '%': '%%cd:~,%',
})
  1328. def shell_quote(args, *, shell=False):
  1329. args = list(variadic(args))
  1330. if compat_os_name != 'nt':
  1331. return shlex.join(args)
  1332. trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS
  1333. return ' '.join(
  1334. s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII)
  1335. else re.sub(r'(\\+)("|$)', r'\1\1\2', s).translate(trans).join('""')
  1336. for s in args)
  1337. def smuggle_url(url, data):
  1338. """ Pass additional data in a URL for internal use. """
  1339. url, idata = unsmuggle_url(url, {})
  1340. data.update(idata)
  1341. sdata = urllib.parse.urlencode(
  1342. {'__youtubedl_smuggle': json.dumps(data)})
  1343. return url + '#' + sdata
  1344. def unsmuggle_url(smug_url, default=None):
  1345. if '#__youtubedl_smuggle' not in smug_url:
  1346. return smug_url, default
  1347. url, _, sdata = smug_url.rpartition('#')
  1348. jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
  1349. data = json.loads(jsond)
  1350. return url, data
  1351. def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
  1352. """ Formats numbers with decimal sufixes like K, M, etc """
  1353. num, factor = float_or_none(num), float(factor)
  1354. if num is None or num < 0:
  1355. return None
  1356. POSSIBLE_SUFFIXES = 'kMGTPEZY'
  1357. exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
  1358. suffix = ['', *POSSIBLE_SUFFIXES][exponent]
  1359. if factor == 1024:
  1360. suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
  1361. converted = num / (factor ** exponent)
  1362. return fmt % (converted, suffix)
  1363. def format_bytes(bytes):
  1364. return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
  1365. def lookup_unit_table(unit_table, s, strict=False):
  1366. num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
  1367. units_re = '|'.join(re.escape(u) for u in unit_table)
  1368. m = (re.fullmatch if strict else re.match)(
  1369. rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
  1370. if not m:
  1371. return None
  1372. num = float(m.group('num').replace(',', '.'))
  1373. mult = unit_table[m.group('unit')]
  1374. return round(num * mult)
  1375. def parse_bytes(s):
  1376. """Parse a string indicating a byte quantity into an integer"""
  1377. return lookup_unit_table(
  1378. {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
  1379. s.upper(), strict=True)
  1380. def parse_filesize(s):
  1381. if s is None:
  1382. return None
  1383. # The lower-case forms are of course incorrect and unofficial,
  1384. # but we support those too
  1385. _UNIT_TABLE = {
  1386. 'B': 1,
  1387. 'b': 1,
  1388. 'bytes': 1,
  1389. 'KiB': 1024,
  1390. 'KB': 1000,
  1391. 'kB': 1024,
  1392. 'Kb': 1000,
  1393. 'kb': 1000,
  1394. 'kilobytes': 1000,
  1395. 'kibibytes': 1024,
  1396. 'MiB': 1024 ** 2,
  1397. 'MB': 1000 ** 2,
  1398. 'mB': 1024 ** 2,
  1399. 'Mb': 1000 ** 2,
  1400. 'mb': 1000 ** 2,
  1401. 'megabytes': 1000 ** 2,
  1402. 'mebibytes': 1024 ** 2,
  1403. 'GiB': 1024 ** 3,
  1404. 'GB': 1000 ** 3,
  1405. 'gB': 1024 ** 3,
  1406. 'Gb': 1000 ** 3,
  1407. 'gb': 1000 ** 3,
  1408. 'gigabytes': 1000 ** 3,
  1409. 'gibibytes': 1024 ** 3,
  1410. 'TiB': 1024 ** 4,
  1411. 'TB': 1000 ** 4,
  1412. 'tB': 1024 ** 4,
  1413. 'Tb': 1000 ** 4,
  1414. 'tb': 1000 ** 4,
  1415. 'terabytes': 1000 ** 4,
  1416. 'tebibytes': 1024 ** 4,
  1417. 'PiB': 1024 ** 5,
  1418. 'PB': 1000 ** 5,
  1419. 'pB': 1024 ** 5,
  1420. 'Pb': 1000 ** 5,
  1421. 'pb': 1000 ** 5,
  1422. 'petabytes': 1000 ** 5,
  1423. 'pebibytes': 1024 ** 5,
  1424. 'EiB': 1024 ** 6,
  1425. 'EB': 1000 ** 6,
  1426. 'eB': 1024 ** 6,
  1427. 'Eb': 1000 ** 6,
  1428. 'eb': 1000 ** 6,
  1429. 'exabytes': 1000 ** 6,
  1430. 'exbibytes': 1024 ** 6,
  1431. 'ZiB': 1024 ** 7,
  1432. 'ZB': 1000 ** 7,
  1433. 'zB': 1024 ** 7,
  1434. 'Zb': 1000 ** 7,
  1435. 'zb': 1000 ** 7,
  1436. 'zettabytes': 1000 ** 7,
  1437. 'zebibytes': 1024 ** 7,
  1438. 'YiB': 1024 ** 8,
  1439. 'YB': 1000 ** 8,
  1440. 'yB': 1024 ** 8,
  1441. 'Yb': 1000 ** 8,
  1442. 'yb': 1000 ** 8,
  1443. 'yottabytes': 1000 ** 8,
  1444. 'yobibytes': 1024 ** 8,
  1445. }
  1446. return lookup_unit_table(_UNIT_TABLE, s)
  1447. def parse_count(s):
  1448. if s is None:
  1449. return None
  1450. s = re.sub(r'^[^\d]+\s', '', s).strip()
  1451. if re.match(r'^[\d,.]+$', s):
  1452. return str_to_int(s)
  1453. _UNIT_TABLE = {
  1454. 'k': 1000,
  1455. 'K': 1000,
  1456. 'm': 1000 ** 2,
  1457. 'M': 1000 ** 2,
  1458. 'kk': 1000 ** 2,
  1459. 'KK': 1000 ** 2,
  1460. 'b': 1000 ** 3,
  1461. 'B': 1000 ** 3,
  1462. }
  1463. ret = lookup_unit_table(_UNIT_TABLE, s)
  1464. if ret is not None:
  1465. return ret
  1466. mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
  1467. if mobj:
  1468. return str_to_int(mobj.group(1))
  1469. def parse_resolution(s, *, lenient=False):
  1470. if s is None:
  1471. return {}
  1472. if lenient:
  1473. mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
  1474. else:
  1475. mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
  1476. if mobj:
  1477. return {
  1478. 'width': int(mobj.group('w')),
  1479. 'height': int(mobj.group('h')),
  1480. }
  1481. mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
  1482. if mobj:
  1483. return {'height': int(mobj.group(1))}
  1484. mobj = re.search(r'\b([48])[kK]\b', s)
  1485. if mobj:
  1486. return {'height': int(mobj.group(1)) * 540}
  1487. return {}
  1488. def parse_bitrate(s):
  1489. if not isinstance(s, str):
  1490. return
  1491. mobj = re.search(r'\b(\d+)\s*kbps', s)
  1492. if mobj:
  1493. return int(mobj.group(1))
  1494. def month_by_name(name, lang='en'):
  1495. """ Return the number of a month by (locale-independently) English name """
  1496. month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
  1497. try:
  1498. return month_names.index(name) + 1
  1499. except ValueError:
  1500. return None
  1501. def month_by_abbreviation(abbrev):
  1502. """ Return the number of a month by (locale-independently) English
  1503. abbreviations """
  1504. try:
  1505. return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
  1506. except ValueError:
  1507. return None
  1508. def fix_xml_ampersands(xml_str):
  1509. """Replace all the '&' by '&amp;' in XML"""
  1510. return re.sub(
  1511. r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
  1512. '&amp;',
  1513. xml_str)
  1514. def setproctitle(title):
  1515. assert isinstance(title, str)
  1516. # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
  1517. try:
  1518. import ctypes
  1519. except ImportError:
  1520. return
  1521. try:
  1522. libc = ctypes.cdll.LoadLibrary('libc.so.6')
  1523. except OSError:
  1524. return
  1525. except TypeError:
  1526. # LoadLibrary in Windows Python 2.7.13 only expects
  1527. # a bytestring, but since unicode_literals turns
  1528. # every string into a unicode string, it fails.
  1529. return
  1530. title_bytes = title.encode()
  1531. buf = ctypes.create_string_buffer(len(title_bytes))
  1532. buf.value = title_bytes
  1533. try:
  1534. # PR_SET_NAME = 15 Ref: /usr/include/linux/prctl.h
  1535. libc.prctl(15, buf, 0, 0, 0)
  1536. except AttributeError:
  1537. return # Strange libc, just skip this
  1538. def remove_start(s, start):
  1539. return s[len(start):] if s is not None and s.startswith(start) else s
  1540. def remove_end(s, end):
  1541. return s[:-len(end)] if s is not None and s.endswith(end) else s
  1542. def remove_quotes(s):
  1543. if s is None or len(s) < 2:
  1544. return s
  1545. for quote in ('"', "'"):
  1546. if s[0] == quote and s[-1] == quote:
  1547. return s[1:-1]
  1548. return s
  1549. def get_domain(url):
  1550. """
  1551. This implementation is inconsistent, but is kept for compatibility.
  1552. Use this only for "webpage_url_domain"
  1553. """
  1554. return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
  1555. def url_basename(url):
  1556. path = urllib.parse.urlparse(url).path
  1557. return path.strip('/').split('/')[-1]
  1558. def base_url(url):
  1559. return re.match(r'https?://[^?#]+/', url).group()
  1560. def urljoin(base, path):
  1561. if isinstance(path, bytes):
  1562. path = path.decode()
  1563. if not isinstance(path, str) or not path:
  1564. return None
  1565. if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
  1566. return path
  1567. if isinstance(base, bytes):
  1568. base = base.decode()
  1569. if not isinstance(base, str) or not re.match(
  1570. r'^(?:https?:)?//', base):
  1571. return None
  1572. return urllib.parse.urljoin(base, path)
  1573. def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
  1574. if get_attr and v is not None:
  1575. v = getattr(v, get_attr, None)
  1576. try:
  1577. return int(v) * invscale // scale
  1578. except (ValueError, TypeError, OverflowError):
  1579. return default
  1580. def str_or_none(v, default=None):
  1581. return default if v is None else str(v)
  1582. def str_to_int(int_str):
  1583. """ A more relaxed version of int_or_none """
  1584. if isinstance(int_str, int):
  1585. return int_str
  1586. elif isinstance(int_str, str):
  1587. int_str = re.sub(r'[,\.\+]', '', int_str)
  1588. return int_or_none(int_str)
  1589. def float_or_none(v, scale=1, invscale=1, default=None):
  1590. if v is None:
  1591. return default
  1592. try:
  1593. return float(v) * invscale / scale
  1594. except (ValueError, TypeError):
  1595. return default
  1596. def bool_or_none(v, default=None):
  1597. return v if isinstance(v, bool) else default
  1598. def strip_or_none(v, default=None):
  1599. return v.strip() if isinstance(v, str) else default
  1600. def url_or_none(url):
  1601. if not url or not isinstance(url, str):
  1602. return None
  1603. url = url.strip()
  1604. return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
  1605. def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
  1606. datetime_object = None
  1607. try:
  1608. if isinstance(timestamp, (int, float)): # unix timestamp
  1609. # Using naive datetime here can break timestamp() in Windows
  1610. # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
  1611. # Also, dt.datetime.fromtimestamp breaks for negative timestamps
  1612. # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
  1613. datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
  1614. + dt.timedelta(seconds=timestamp))
  1615. elif isinstance(timestamp, str): # assume YYYYMMDD
  1616. datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
  1617. date_format = re.sub( # Support %s on windows
  1618. r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
  1619. return datetime_object.strftime(date_format)
  1620. except (ValueError, TypeError, AttributeError):
  1621. return default
  1622. def parse_duration(s):
  1623. if not isinstance(s, str):
  1624. return None
  1625. s = s.strip()
  1626. if not s:
  1627. return None
  1628. days, hours, mins, secs, ms = [None] * 5
  1629. m = re.match(r'''(?x)
  1630. (?P<before_secs>
  1631. (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
  1632. (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
  1633. (?P<ms>[.:][0-9]+)?Z?$
  1634. ''', s)
  1635. if m:
  1636. days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
  1637. else:
  1638. m = re.match(
  1639. r'''(?ix)(?:P?
  1640. (?:
  1641. [0-9]+\s*y(?:ears?)?,?\s*
  1642. )?
  1643. (?:
  1644. [0-9]+\s*m(?:onths?)?,?\s*
  1645. )?
  1646. (?:
  1647. [0-9]+\s*w(?:eeks?)?,?\s*
  1648. )?
  1649. (?:
  1650. (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
  1651. )?
  1652. T)?
  1653. (?:
  1654. (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
  1655. )?
  1656. (?:
  1657. (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
  1658. )?
  1659. (?:
  1660. (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
  1661. )?Z?$''', s)
  1662. if m:
  1663. days, hours, mins, secs, ms = m.groups()
  1664. else:
  1665. m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
  1666. if m:
  1667. hours, mins = m.groups()
  1668. else:
  1669. return None
  1670. if ms:
  1671. ms = ms.replace(':', '.')
  1672. return sum(float(part or 0) * mult for part, mult in (
  1673. (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
  1674. def _change_extension(prepend, filename, ext, expected_real_ext=None):
  1675. name, real_ext = os.path.splitext(filename)
  1676. if not expected_real_ext or real_ext[1:] == expected_real_ext:
  1677. filename = name
  1678. if prepend and real_ext:
  1679. _UnsafeExtensionError.sanitize_extension(ext, prepend=True)
  1680. return f'{filename}.{ext}{real_ext}'
  1681. return f'{filename}.{_UnsafeExtensionError.sanitize_extension(ext)}'
  1682. prepend_extension = functools.partial(_change_extension, True)
  1683. replace_extension = functools.partial(_change_extension, False)
  1684. def check_executable(exe, args=[]):
  1685. """ Checks if the given binary is installed somewhere in PATH, and returns its name.
  1686. args can be a list of arguments for a short output (like -version) """
  1687. try:
  1688. Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  1689. except OSError:
  1690. return False
  1691. return exe
  1692. def _get_exe_version_output(exe, args):
  1693. try:
  1694. # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
  1695. # SIGTTOU if yt-dlp is run in the background.
  1696. # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
  1697. stdout, _, ret = Popen.run([encodeArgument(exe), *args], text=True,
  1698. stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  1699. if ret:
  1700. return None
  1701. except OSError:
  1702. return False
  1703. return stdout
  1704. def detect_exe_version(output, version_re=None, unrecognized='present'):
  1705. assert isinstance(output, str)
  1706. if version_re is None:
  1707. version_re = r'version\s+([-0-9._a-zA-Z]+)'
  1708. m = re.search(version_re, output)
  1709. if m:
  1710. return m.group(1)
  1711. else:
  1712. return unrecognized
  1713. def get_exe_version(exe, args=['--version'],
  1714. version_re=None, unrecognized=('present', 'broken')):
  1715. """ Returns the version of the specified executable,
  1716. or False if the executable is not present """
  1717. unrecognized = variadic(unrecognized)
  1718. assert len(unrecognized) in (1, 2)
  1719. out = _get_exe_version_output(exe, args)
  1720. if out is None:
  1721. return unrecognized[-1]
  1722. return out and detect_exe_version(out, version_re, unrecognized[0])
  1723. def frange(start=0, stop=None, step=1):
  1724. """Float range"""
  1725. if stop is None:
  1726. start, stop = 0, start
  1727. sign = [-1, 1][step > 0] if step else 0
  1728. while sign * start < sign * stop:
  1729. yield start
  1730. start += step
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only as needed and are kept in a cache.
    """

    class IndexError(IndexError):  # noqa: A001
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        """
        @param iterable  source of the items
        @param reverse   present the items in reverse order
        @param _cache    list of already consumed items (used by __reversed__/__copy__)
        """
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Cached items first, then pull (and cache) the rest one at a time
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Consume the rest of the iterable; returns the cache (in source order)."""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map an index of the reversed view onto the cache: 0 -> -1, 1 -> -2, ...
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            # step of 0 marks a plain index, so neither open-ended check below applies
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of items missing from the cache to reach the largest requested index
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only a single item needs to be evaluated to know whether the list is empty
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # Requires consuming the entire iterable
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object is given the same iterable and the same cache list
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The copy is given the same iterable and the same cache list
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
  1805. class PagedList:
  1806. class IndexError(IndexError): # noqa: A001
  1807. pass
  1808. def __len__(self):
  1809. # This is only useful for tests
  1810. return len(self.getslice())
  1811. def __init__(self, pagefunc, pagesize, use_cache=True):
  1812. self._pagefunc = pagefunc
  1813. self._pagesize = pagesize
  1814. self._pagecount = float('inf')
  1815. self._use_cache = use_cache
  1816. self._cache = {}
  1817. def getpage(self, pagenum):
  1818. page_results = self._cache.get(pagenum)
  1819. if page_results is None:
  1820. page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
  1821. if self._use_cache:
  1822. self._cache[pagenum] = page_results
  1823. return page_results
  1824. def getslice(self, start=0, end=None):
  1825. return list(self._getslice(start, end))
  1826. def _getslice(self, start, end):
  1827. raise NotImplementedError('This method must be implemented by subclasses')
  1828. def __getitem__(self, idx):
  1829. assert self._use_cache, 'Indexing PagedList requires cache'
  1830. if not isinstance(idx, int) or idx < 0:
  1831. raise TypeError('indices must be non-negative integers')
  1832. entries = self.getslice(idx, idx + 1)
  1833. if not entries:
  1834. raise self.IndexError
  1835. return entries[0]
  1836. def __bool__(self):
  1837. return bool(self.getslice(0, 1))
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Generator over the items in [start, end); `end` may be None for "no upper bound".
        # Pages are requested one at a time, starting with the page that contains `start`.
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute index of the first item on this page and on the following page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset inside this page at which to start; non-zero only on the page holding `start`
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset inside this page at which to stop; None unless `end` falls within this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the previous page as the last one before re-raising;
                # getpage() returns [] for page numbers above _pagecount
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
  1872. class InAdvancePagedList(PagedList):
  1873. """PagedList with total number of pages known in advance"""
  1874. def __init__(self, pagefunc, pagecount, pagesize):
  1875. PagedList.__init__(self, pagefunc, pagesize, True)
  1876. self._pagecount = pagecount
  1877. def _getslice(self, start, end):
  1878. start_page = start // self._pagesize
  1879. end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
  1880. skip_elems = start - start_page * self._pagesize
  1881. only_more = None if end is None else end - start
  1882. for pagenum in range(start_page, end_page):
  1883. page_results = self.getpage(pagenum)
  1884. if skip_elems:
  1885. page_results = page_results[skip_elems:]
  1886. skip_elems = None
  1887. if only_more is not None:
  1888. if len(page_results) < only_more:
  1889. only_more -= len(page_results)
  1890. else:
  1891. yield from page_results[:only_more]
  1892. break
  1893. yield from page_results
class PlaylistEntries:
    """Indexable view over the ``entries`` of a playlist info dict.

    Indexing with an int or a slice (``self[...]``) is 1-based and yields
    ``(index, entry)`` pairs lazily; see ``__getitem__``.
    """

    # Sentinel stored in _entries for positions that were not in 'requested_entries'
    MissingEntry = object()
    # Becomes True when entries is a list, or once a lookup ran past the end
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing ``params``, ``report_warning`` and ``_match_entry``
        @param info_dict  dict read for the keys 'entries' and 'requested_entries'
        @raises EntryNotInPlaylist if info_dict has no 'entries'
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a list indexed by position: requested_entries holds the 1-based
            # positions of the given entries; every other slot keeps the sentinel
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment: "START", "START:END", "START-END" or
    # "START:END:STEP"; END may also be "inf"/"infinite"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for every comma-separated segment of *string*.

        A segment with a range part yields ``slice(start, end, step)``; a plain
        number yields ``int``.

        @raises ValueError for an empty segment, a segment that does not match
                           PLAYLIST_ITEMS_RE, or a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield ``(index, entry)`` for the items selected by the ydl params.

        Reads 'playlist_items', 'playliststart' and 'playlistend'; the latter two
        are only used (as a "start:end" spec) when 'playlist_items' is not given.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop producing items altogether
                    return

    def get_full_count(self):
        """Return the total number of entries when it is known, else None (implicitly)."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page the page count equals the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function mapping a 0-based index to an entry.

        The returned function raises ``self.IndexError`` when the index is past
        the end; the list variant raises EntryNotInPlaylist for a missing slot.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    # For an incomplete list an out-of-range index is reported as a
                    # missing entry (below) rather than as the end of the list
                    if not self.is_incomplete:
                        raise self.IndexError
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # The lookup is routed through ydl's _handle_extraction_exceptions wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError
        return get_entry

    def __getitem__(self, idx):
        """Generator of ``(1-based index, entry)`` for an int or slice *idx*.

        An int is treated as ``slice(idx, idx)``; both slice bounds are 1-based
        and the stop bound is inclusive. Negative bounds are resolved with
        ``len(self)``.
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the bound one position further in the direction of travel so it is inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards nothing further can exist; going backwards keep
                # walking down towards valid indices
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over every entry
        return len(tuple(self[:]))

    class IndexError(IndexError):  # noqa: A001
        # Raised by the function from _getter when an index is past the end
        pass
  2013. def uppercase_escape(s):
  2014. unicode_escape = codecs.getdecoder('unicode_escape')
  2015. return re.sub(
  2016. r'\\U[0-9a-fA-F]{8}',
  2017. lambda m: unicode_escape(m.group(0))[0],
  2018. s)
  2019. def lowercase_escape(s):
  2020. unicode_escape = codecs.getdecoder('unicode_escape')
  2021. return re.sub(
  2022. r'\\u[0-9a-fA-F]{4}',
  2023. lambda m: unicode_escape(m.group(0))[0],
  2024. s)
  2025. def parse_qs(url, **kwargs):
  2026. return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
  2027. def read_batch_urls(batch_fd):
  2028. def fixup(url):
  2029. if not isinstance(url, str):
  2030. url = url.decode('utf-8', 'replace')
  2031. BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
  2032. for bom in BOM_UTF8:
  2033. if url.startswith(bom):
  2034. url = url[len(bom):]
  2035. url = url.lstrip()
  2036. if not url or url.startswith(('#', ';', ']')):
  2037. return False
  2038. # "#" cannot be stripped out since it is part of the URI
  2039. # However, it can be safely stripped out if following a whitespace
  2040. return re.split(r'\s#', url, maxsplit=1)[0].rstrip()
  2041. with contextlib.closing(batch_fd) as fd:
  2042. return [url for url in map(fixup, fd) if url]
  2043. def urlencode_postdata(*args, **kargs):
  2044. return urllib.parse.urlencode(*args, **kargs).encode('ascii')
  2045. def update_url(url, *, query_update=None, **kwargs):
  2046. """Replace URL components specified by kwargs
  2047. @param url str or parse url tuple
  2048. @param query_update update query
  2049. @returns str
  2050. """
  2051. if isinstance(url, str):
  2052. if not kwargs and not query_update:
  2053. return url
  2054. else:
  2055. url = urllib.parse.urlparse(url)
  2056. if query_update:
  2057. assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
  2058. kwargs['query'] = urllib.parse.urlencode({
  2059. **urllib.parse.parse_qs(url.query),
  2060. **query_update,
  2061. }, True)
  2062. return urllib.parse.urlunparse(url._replace(**kwargs))
  2063. def update_url_query(url, query):
  2064. return update_url(url, query_update=query)
  2065. def _multipart_encode_impl(data, boundary):
  2066. content_type = f'multipart/form-data; boundary={boundary}'
  2067. out = b''
  2068. for k, v in data.items():
  2069. out += b'--' + boundary.encode('ascii') + b'\r\n'
  2070. if isinstance(k, str):
  2071. k = k.encode()
  2072. if isinstance(v, str):
  2073. v = v.encode()
  2074. # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
  2075. # suggests sending UTF-8 directly. Firefox sends UTF-8, too
  2076. content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
  2077. if boundary.encode('ascii') in content:
  2078. raise ValueError('Boundary overlaps with data')
  2079. out += content
  2080. out += b'--' + boundary.encode('ascii') + b'--\r\n'
  2081. return out, content_type
  2082. def multipart_encode(data, boundary=None):
  2083. """
  2084. Encode a dict to RFC 7578-compliant form-data
  2085. data:
  2086. A dict where keys and values can be either Unicode or bytes-like
  2087. objects.
  2088. boundary:
  2089. If specified a Unicode object, it's used as the boundary. Otherwise
  2090. a random boundary is generated.
  2091. Reference: https://tools.ietf.org/html/rfc7578
  2092. """
  2093. has_specified_boundary = boundary is not None
  2094. while True:
  2095. if boundary is None:
  2096. boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
  2097. try:
  2098. out, content_type = _multipart_encode_impl(data, boundary)
  2099. break
  2100. except ValueError:
  2101. if has_specified_boundary:
  2102. raise
  2103. boundary = None
  2104. return out, content_type
  2105. def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
  2106. if blocked_types is NO_DEFAULT:
  2107. blocked_types = (str, bytes, collections.abc.Mapping)
  2108. return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
  2109. def variadic(x, allowed_types=NO_DEFAULT):
  2110. if not isinstance(allowed_types, (tuple, type)):
  2111. deprecation_warning('allowed_types should be a tuple or a type')
  2112. allowed_types = tuple(allowed_types)
  2113. return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
  2114. def try_call(*funcs, expected_type=None, args=[], kwargs={}):
  2115. for f in funcs:
  2116. try:
  2117. val = f(*args, **kwargs)
  2118. except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
  2119. pass
  2120. else:
  2121. if expected_type is None or isinstance(val, expected_type):
  2122. return val
  2123. def try_get(src, getter, expected_type=None):
  2124. return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
  2125. def filter_dict(dct, cndn=lambda _, v: v is not None):
  2126. return {k: v for k, v in dct.items() if cndn(k, v)}
  2127. def merge_dicts(*dicts):
  2128. merged = {}
  2129. for a_dict in dicts:
  2130. for k, v in a_dict.items():
  2131. if (v is not None and k not in merged
  2132. or isinstance(v, str) and merged[k] == ''):
  2133. merged[k] = v
  2134. return merged
  2135. def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
  2136. return string if isinstance(string, str) else str(string, encoding, errors)
# Rating label -> age limit; looked up by parse_age_limit() after upper-casing its input
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# "TV-..." rating label -> age limit. parse_age_limit() builds a regex from the
# part of each key after "TV-", in the order the keys are listed here
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
  2152. def parse_age_limit(s):
  2153. # isinstance(False, int) is True. So type() must be used instead
  2154. if type(s) is int: # noqa: E721
  2155. return s if 0 <= s <= 21 else None
  2156. elif not isinstance(s, str):
  2157. return None
  2158. m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
  2159. if m:
  2160. return int(m.group('age'))
  2161. s = s.upper()
  2162. if s in US_RATINGS:
  2163. return US_RATINGS[s]
  2164. m = re.match(r'^TV[_-]?({})$'.format('|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)), s)
  2165. if m:
  2166. return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
  2167. return None
  2168. def strip_jsonp(code):
  2169. return re.sub(
  2170. r'''(?sx)^
  2171. (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
  2172. (?:\s*&&\s*(?P=func_name))?
  2173. \s*\(\s*(?P<callback_data>.*)\);?
  2174. \s*?(?://[^\n]*)*$''',
  2175. r'\g<callback_data>', code)
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite a JavaScript literal/expression string into JSON text.

    @param code    JavaScript source text
    @param vars    dict of name -> replacement text substituted for bare identifiers
    @param strict  if True, an identifier that is neither a known keyword nor in
                   `vars` raises ValueError instead of being turned into a JSON string
    @returns       str
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character: a quoted run with backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # /* block */ comments, or // comments up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for hexadecimal and leading-zero octal integers,
    # optionally followed by ':' (i.e. used as an object key)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Called for each bare '"' (group 1) or backslash escape (group 2) inside a string
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Keep escapes JSON understands, turn \x into \u00, drop escaped
        # newlines, and unescape everything else to the plain character
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...} of a backtick string recursively
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            # A JSON string result is inserted as its decoded text
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token matched by the final regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith(('/*', '//', '!')) or v == ',':
            # Comments, '!' runs and the matched (trailing) commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Re-quote with double quotes; template placeholders are only expanded in backtick strings
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Object keys must be strings in JSON
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                # In non-strict mode a value that is not valid JSON is emitted as a
                # JSON string; in strict mode the value is returned without checking
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> JSON object built from the converted pair list
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Lenient-only rewrites: keep the string argument of `new Date("...")`,
        # stringify other `new X(...)` expressions, reduce parseInt(...) to its
        # digits, and replace an immediately-invoked function by its quoted argument
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Tokenise: strings, comments, trailing commas, `void 0`, identifiers,
    # hex/octal integers, integer keys and '!' runs are each passed to fix_kv
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
  2243. def qualities(quality_ids):
  2244. """ Get a numeric quality value out of a list of possible values """
  2245. def q(qid):
  2246. try:
  2247. return quality_ids.index(qid)
  2248. except ValueError:
  2249. return -1
  2250. return q
# Names of the stages at which a postprocessor can be run
# NOTE(review): the meaning of each stage is defined by the code consuming this tuple — confirm there
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output templates per template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; the value is None or a string
# NOTE(review): the string presumably is the filename suffix used for that type — confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template for a verbose regex; `{0}` is filled with the pattern for the mapping
# key and `{1}` with the pattern for the conversion type (via str.format)
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The printf-style conversion type characters
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
  2285. def limit_length(s, length):
  2286. """ Add ellipses to overly long strings """
  2287. if s is None:
  2288. return None
  2289. ELLIPSES = '...'
  2290. if len(s) > length:
  2291. return s[:length - len(ELLIPSES)] + ELLIPSES
  2292. return s
  2293. def version_tuple(v):
  2294. return tuple(int(e) for e in re.split(r'[-.]', v))
  2295. def is_outdated_version(version, limit, assume_new=True):
  2296. if not version:
  2297. return not assume_new
  2298. try:
  2299. return version_tuple(version) < version_tuple(limit)
  2300. except ValueError:
  2301. return not assume_new
  2302. def ytdl_is_updateable():
  2303. """ Returns if yt-dlp can be updated with -U """
  2304. from ..update import is_non_updateable
  2305. return not is_non_updateable()
  2306. def args_to_str(args):
  2307. # Get a short string representation for a subprocess command
  2308. return shell_quote(args)
  2309. def error_to_str(err):
  2310. return f'{type(err).__name__}: {err}'
  2311. def mimetype2ext(mt, default=NO_DEFAULT):
  2312. if not isinstance(mt, str):
  2313. if default is not NO_DEFAULT:
  2314. return default
  2315. return None
  2316. MAP = {
  2317. # video
  2318. '3gpp': '3gp',
  2319. 'mp2t': 'ts',
  2320. 'mp4': 'mp4',
  2321. 'mpeg': 'mpeg',
  2322. 'mpegurl': 'm3u8',
  2323. 'quicktime': 'mov',
  2324. 'webm': 'webm',
  2325. 'vp9': 'vp9',
  2326. 'video/ogg': 'ogv',
  2327. 'x-flv': 'flv',
  2328. 'x-m4v': 'm4v',
  2329. 'x-matroska': 'mkv',
  2330. 'x-mng': 'mng',
  2331. 'x-mp4-fragmented': 'mp4',
  2332. 'x-ms-asf': 'asf',
  2333. 'x-ms-wmv': 'wmv',
  2334. 'x-msvideo': 'avi',
  2335. # application (streaming playlists)
  2336. 'dash+xml': 'mpd',
  2337. 'f4m+xml': 'f4m',
  2338. 'hds+xml': 'f4m',
  2339. 'vnd.apple.mpegurl': 'm3u8',
  2340. 'vnd.ms-sstr+xml': 'ism',
  2341. 'x-mpegurl': 'm3u8',
  2342. # audio
  2343. 'audio/mp4': 'm4a',
  2344. # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
  2345. # Using .mp3 as it's the most popular one
  2346. 'audio/mpeg': 'mp3',
  2347. 'audio/webm': 'webm',
  2348. 'audio/x-matroska': 'mka',
  2349. 'audio/x-mpegurl': 'm3u',
  2350. 'midi': 'mid',
  2351. 'ogg': 'ogg',
  2352. 'wav': 'wav',
  2353. 'wave': 'wav',
  2354. 'x-aac': 'aac',
  2355. 'x-flac': 'flac',
  2356. 'x-m4a': 'm4a',
  2357. 'x-realaudio': 'ra',
  2358. 'x-wav': 'wav',
  2359. # image
  2360. 'avif': 'avif',
  2361. 'bmp': 'bmp',
  2362. 'gif': 'gif',
  2363. 'jpeg': 'jpg',
  2364. 'png': 'png',
  2365. 'svg+xml': 'svg',
  2366. 'tiff': 'tif',
  2367. 'vnd.wap.wbmp': 'wbmp',
  2368. 'webp': 'webp',
  2369. 'x-icon': 'ico',
  2370. 'x-jng': 'jng',
  2371. 'x-ms-bmp': 'bmp',
  2372. # caption
  2373. 'filmstrip+json': 'fs',
  2374. 'smptett+xml': 'tt',
  2375. 'ttaf+xml': 'dfxp',
  2376. 'ttml+xml': 'ttml',
  2377. 'x-ms-sami': 'sami',
  2378. # misc
  2379. 'gzip': 'gz',
  2380. 'json': 'json',
  2381. 'xml': 'xml',
  2382. 'zip': 'zip',
  2383. }
  2384. mimetype = mt.partition(';')[0].strip().lower()
  2385. _, _, subtype = mimetype.rpartition('/')
  2386. ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
  2387. if ext:
  2388. return ext
  2389. elif default is not NO_DEFAULT:
  2390. return default
  2391. return subtype.replace('+', '.')
  2392. def ext2mimetype(ext_or_url):
  2393. if not ext_or_url:
  2394. return None
  2395. if '.' not in ext_or_url:
  2396. ext_or_url = f'file.{ext_or_url}'
  2397. return mimetypes.guess_type(ext_or_url)[0]
  2398. def parse_codecs(codecs_str):
  2399. # http://tools.ietf.org/html/rfc6381
  2400. if not codecs_str:
  2401. return {}
  2402. split_codecs = list(filter(None, map(
  2403. str.strip, codecs_str.strip().strip(',').split(','))))
  2404. vcodec, acodec, scodec, hdr = None, None, None, None
  2405. for full_codec in split_codecs:
  2406. parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
  2407. if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
  2408. 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
  2409. if vcodec:
  2410. continue
  2411. vcodec = full_codec
  2412. if parts[0] in ('dvh1', 'dvhe'):
  2413. hdr = 'DV'
  2414. elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
  2415. hdr = 'HDR10'
  2416. elif parts[:2] == ['vp9', '2']:
  2417. hdr = 'HDR10'
  2418. elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
  2419. 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
  2420. acodec = acodec or full_codec
  2421. elif parts[0] in ('stpp', 'wvtt'):
  2422. scodec = scodec or full_codec
  2423. else:
  2424. write_string(f'WARNING: Unknown codec {full_codec}\n')
  2425. if vcodec or acodec or scodec:
  2426. return {
  2427. 'vcodec': vcodec or 'none',
  2428. 'acodec': acodec or 'none',
  2429. 'dynamic_range': hdr,
  2430. **({'scodec': scodec} if scodec is not None else {}),
  2431. }
  2432. elif len(split_codecs) == 2:
  2433. return {
  2434. 'vcodec': split_codecs[0],
  2435. 'acodec': split_codecs[1],
  2436. }
  2437. return {}
  2438. def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
  2439. assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
  2440. allow_mkv = not preferences or 'mkv' in preferences
  2441. if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
  2442. return 'mkv' # TODO: any other format allows this?
  2443. # TODO: All codecs supported by parse_codecs isn't handled here
  2444. COMPATIBLE_CODECS = {
  2445. 'mp4': {
  2446. 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
  2447. 'h264', 'aacl', 'ec-3', # Set in ISM
  2448. },
  2449. 'webm': {
  2450. 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
  2451. 'vp9x', 'vp8x', # in the webm spec
  2452. },
  2453. }
  2454. sanitize_codec = functools.partial(
  2455. try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
  2456. vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
  2457. for ext in preferences or COMPATIBLE_CODECS.keys():
  2458. codec_set = COMPATIBLE_CODECS.get(ext, set())
  2459. if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
  2460. return ext
  2461. COMPATIBLE_EXTS = (
  2462. {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
  2463. {'webm', 'weba'},
  2464. )
  2465. for ext in preferences or vexts:
  2466. current_exts = {ext, *vexts, *aexts}
  2467. if ext == 'mkv' or current_exts == {ext} or any(
  2468. ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
  2469. return ext
  2470. return 'mkv' if allow_mkv else preferences[-1]
  2471. def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
  2472. getheader = url_handle.headers.get
  2473. cd = getheader('Content-Disposition')
  2474. if cd:
  2475. m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
  2476. if m:
  2477. e = determine_ext(m.group('filename'), default_ext=None)
  2478. if e:
  2479. return e
  2480. meta_ext = getheader('x-amz-meta-name')
  2481. if meta_ext:
  2482. e = meta_ext.rpartition('.')[2]
  2483. if e:
  2484. return e
  2485. return mimetype2ext(getheader('Content-Type'), default=default)
  2486. def encode_data_uri(data, mime_type):
  2487. return 'data:{};base64,{}'.format(mime_type, base64.b64encode(data).decode('ascii'))
  2488. def age_restricted(content_limit, age_limit):
  2489. """ Returns True iff the content should be blocked """
  2490. if age_limit is None: # No limit set
  2491. return False
  2492. if content_limit is None:
  2493. return False # Content available for everyone
  2494. return age_limit < content_limit
# List of known byte-order-marks (BOM)
# Each item is (BOM bytes, codec name). The UTF-32-LE mark starts with the same
# two bytes as the UTF-16-LE mark, so it is listed before it for prefix matching
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
  2503. def is_html(first_bytes):
  2504. """ Detect whether a file contains HTML by examining its first bytes. """
  2505. encoding = 'utf-8'
  2506. for bom, enc in BOMS:
  2507. while first_bytes.startswith(bom):
  2508. encoding, first_bytes = enc, first_bytes[len(bom):]
  2509. return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
  2510. def determine_protocol(info_dict):
  2511. protocol = info_dict.get('protocol')
  2512. if protocol is not None:
  2513. return protocol
  2514. url = sanitize_url(info_dict['url'])
  2515. if url.startswith('rtmp'):
  2516. return 'rtmp'
  2517. elif url.startswith('mms'):
  2518. return 'mms'
  2519. elif url.startswith('rtsp'):
  2520. return 'rtsp'
  2521. ext = determine_ext(url)
  2522. if ext == 'm3u8':
  2523. return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
  2524. elif ext == 'f4m':
  2525. return 'f4m'
  2526. return urllib.parse.urlparse(url).scheme
  2527. def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
  2528. """ Render a list of rows, each as a list of values.
  2529. Text after a \t will be right aligned """
  2530. def width(string):
  2531. return len(remove_terminal_sequences(string).replace('\t', ''))
  2532. def get_max_lens(table):
  2533. return [max(width(str(v)) for v in col) for col in zip(*table)]
  2534. def filter_using_list(row, filter_array):
  2535. return [col for take, col in itertools.zip_longest(filter_array, row, fillvalue=True) if take]
  2536. max_lens = get_max_lens(data) if hide_empty else []
  2537. header_row = filter_using_list(header_row, max_lens)
  2538. data = [filter_using_list(row, max_lens) for row in data]
  2539. table = [header_row, *data]
  2540. max_lens = get_max_lens(table)
  2541. extra_gap += 1
  2542. if delim:
  2543. table = [header_row, [delim * (ml + extra_gap) for ml in max_lens], *data]
  2544. table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
  2545. for row in table:
  2546. for pos, text in enumerate(map(str, row)):
  2547. if '\t' in text:
  2548. row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
  2549. else:
  2550. row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
  2551. return '\n'.join(''.join(row).rstrip() for row in table)
  2552. def _match_one(filter_part, dct, incomplete):
  2553. # TODO: Generalize code with YoutubeDL._build_format_filter
  2554. STRING_OPERATORS = {
  2555. '*=': operator.contains,
  2556. '^=': lambda attr, value: attr.startswith(value),
  2557. '$=': lambda attr, value: attr.endswith(value),
  2558. '~=': lambda attr, value: re.search(value, attr),
  2559. }
  2560. COMPARISON_OPERATORS = {
  2561. **STRING_OPERATORS,
  2562. '<=': operator.le, # "<=" must be defined above "<"
  2563. '<': operator.lt,
  2564. '>=': operator.ge,
  2565. '>': operator.gt,
  2566. '=': operator.eq,
  2567. }
  2568. if isinstance(incomplete, bool):
  2569. is_incomplete = lambda _: incomplete
  2570. else:
  2571. is_incomplete = lambda k: k in incomplete
  2572. operator_rex = re.compile(r'''(?x)
  2573. (?P<key>[a-z_]+)
  2574. \s*(?P<negation>!\s*)?(?P<op>{})(?P<none_inclusive>\s*\?)?\s*
  2575. (?:
  2576. (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
  2577. (?P<strval>.+?)
  2578. )
  2579. '''.format('|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))))
  2580. m = operator_rex.fullmatch(filter_part.strip())
  2581. if m:
  2582. m = m.groupdict()
  2583. unnegated_op = COMPARISON_OPERATORS[m['op']]
  2584. if m['negation']:
  2585. op = lambda attr, value: not unnegated_op(attr, value)
  2586. else:
  2587. op = unnegated_op
  2588. comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
  2589. if m['quote']:
  2590. comparison_value = comparison_value.replace(r'\{}'.format(m['quote']), m['quote'])
  2591. actual_value = dct.get(m['key'])
  2592. numeric_comparison = None
  2593. if isinstance(actual_value, (int, float)):
  2594. # If the original field is a string and matching comparisonvalue is
  2595. # a number we should respect the origin of the original field
  2596. # and process comparison value as a string (see
  2597. # https://github.com/ytdl-org/youtube-dl/issues/11082)
  2598. try:
  2599. numeric_comparison = int(comparison_value)
  2600. except ValueError:
  2601. numeric_comparison = parse_filesize(comparison_value)
  2602. if numeric_comparison is None:
  2603. numeric_comparison = parse_filesize(f'{comparison_value}B')
  2604. if numeric_comparison is None:
  2605. numeric_comparison = parse_duration(comparison_value)
  2606. if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
  2607. raise ValueError('Operator {} only supports string values!'.format(m['op']))
  2608. if actual_value is None:
  2609. return is_incomplete(m['key']) or m['none_inclusive']
  2610. return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
  2611. UNARY_OPERATORS = {
  2612. '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
  2613. '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
  2614. }
  2615. operator_rex = re.compile(r'''(?x)
  2616. (?P<op>{})\s*(?P<key>[a-z_]+)
  2617. '''.format('|'.join(map(re.escape, UNARY_OPERATORS.keys()))))
  2618. m = operator_rex.fullmatch(filter_part.strip())
  2619. if m:
  2620. op = UNARY_OPERATORS[m.group('op')]
  2621. actual_value = dct.get(m.group('key'))
  2622. if is_incomplete(m.group('key')) and actual_value is None:
  2623. return True
  2624. return op(actual_value)
  2625. raise ValueError(f'Invalid filter part {filter_part!r}')
  2626. def match_str(filter_str, dct, incomplete=False):
  2627. """ Filter a dictionary with a simple string syntax.
  2628. @returns Whether the filter passes
  2629. @param incomplete Set of keys that is expected to be missing from dct.
  2630. Can be True/False to indicate all/none of the keys may be missing.
  2631. All conditions on incomplete keys pass if the key is missing
  2632. """
  2633. return all(
  2634. _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
  2635. for filter_part in re.split(r'(?<!\\)&', filter_str))
  2636. def match_filter_func(filters, breaking_filters=None):
  2637. if not filters and not breaking_filters:
  2638. return None
  2639. repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'
  2640. breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
  2641. filters = set(variadic(filters or []))
  2642. interactive = '-' in filters
  2643. if interactive:
  2644. filters.remove('-')
  2645. @function_with_repr.set_repr(repr_)
  2646. def _match_func(info_dict, incomplete=False):
  2647. ret = breaking_filters(info_dict, incomplete)
  2648. if ret is not None:
  2649. raise RejectedVideoReached(ret)
  2650. if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
  2651. return NO_DEFAULT if interactive and not incomplete else None
  2652. else:
  2653. video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
  2654. filter_str = ') | ('.join(map(str.strip, filters))
  2655. return f'{video_title} does not pass filter ({filter_str}), skipping ..'
  2656. return _match_func
  2657. class download_range_func:
  2658. def __init__(self, chapters, ranges, from_info=False):
  2659. self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
  2660. def __call__(self, info_dict, ydl):
  2661. warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
  2662. else 'Cannot match chapters since chapter information is unavailable')
  2663. for regex in self.chapters or []:
  2664. for i, chapter in enumerate(info_dict.get('chapters') or []):
  2665. if re.search(regex, chapter['title']):
  2666. warning = None
  2667. yield {**chapter, 'index': i}
  2668. if self.chapters and warning:
  2669. ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
  2670. for start, end in self.ranges or []:
  2671. yield {
  2672. 'start_time': self._handle_negative_timestamp(start, info_dict),
  2673. 'end_time': self._handle_negative_timestamp(end, info_dict),
  2674. }
  2675. if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
  2676. yield {
  2677. 'start_time': info_dict.get('start_time') or 0,
  2678. 'end_time': info_dict.get('end_time') or float('inf'),
  2679. }
  2680. elif not self.ranges and not self.chapters:
  2681. yield {}
  2682. @staticmethod
  2683. def _handle_negative_timestamp(time, info):
  2684. return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
  2685. def __eq__(self, other):
  2686. return (isinstance(other, download_range_func)
  2687. and self.chapters == other.chapters and self.ranges == other.ranges)
  2688. def __repr__(self):
  2689. return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
  2690. def parse_dfxp_time_expr(time_expr):
  2691. if not time_expr:
  2692. return
  2693. mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
  2694. if mobj:
  2695. return float(mobj.group('time_offset'))
  2696. mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
  2697. if mobj:
  2698. return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
  2699. def srt_subtitles_timecode(seconds):
  2700. return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
  2701. def ass_subtitles_timecode(seconds):
  2702. time = timetuple_from_msec(seconds * 1000)
  2703. return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
  2704. def dfxp2srt(dfxp_data):
  2705. """
  2706. @param dfxp_data A bytes-like object containing DFXP data
  2707. @returns A unicode object containing converted SRT data
  2708. """
  2709. LEGACY_NAMESPACES = (
  2710. (b'http://www.w3.org/ns/ttml', [
  2711. b'http://www.w3.org/2004/11/ttaf1',
  2712. b'http://www.w3.org/2006/04/ttaf1',
  2713. b'http://www.w3.org/2006/10/ttaf1',
  2714. ]),
  2715. (b'http://www.w3.org/ns/ttml#styling', [
  2716. b'http://www.w3.org/ns/ttml#style',
  2717. ]),
  2718. )
  2719. SUPPORTED_STYLING = [
  2720. 'color',
  2721. 'fontFamily',
  2722. 'fontSize',
  2723. 'fontStyle',
  2724. 'fontWeight',
  2725. 'textDecoration',
  2726. ]
  2727. _x = functools.partial(xpath_with_ns, ns_map={
  2728. 'xml': 'http://www.w3.org/XML/1998/namespace',
  2729. 'ttml': 'http://www.w3.org/ns/ttml',
  2730. 'tts': 'http://www.w3.org/ns/ttml#styling',
  2731. })
  2732. styles = {}
  2733. default_style = {}
  2734. class TTMLPElementParser:
  2735. _out = ''
  2736. _unclosed_elements = []
  2737. _applied_styles = []
  2738. def start(self, tag, attrib):
  2739. if tag in (_x('ttml:br'), 'br'):
  2740. self._out += '\n'
  2741. else:
  2742. unclosed_elements = []
  2743. style = {}
  2744. element_style_id = attrib.get('style')
  2745. if default_style:
  2746. style.update(default_style)
  2747. if element_style_id:
  2748. style.update(styles.get(element_style_id, {}))
  2749. for prop in SUPPORTED_STYLING:
  2750. prop_val = attrib.get(_x('tts:' + prop))
  2751. if prop_val:
  2752. style[prop] = prop_val
  2753. if style:
  2754. font = ''
  2755. for k, v in sorted(style.items()):
  2756. if self._applied_styles and self._applied_styles[-1].get(k) == v:
  2757. continue
  2758. if k == 'color':
  2759. font += f' color="{v}"'
  2760. elif k == 'fontSize':
  2761. font += f' size="{v}"'
  2762. elif k == 'fontFamily':
  2763. font += f' face="{v}"'
  2764. elif k == 'fontWeight' and v == 'bold':
  2765. self._out += '<b>'
  2766. unclosed_elements.append('b')
  2767. elif k == 'fontStyle' and v == 'italic':
  2768. self._out += '<i>'
  2769. unclosed_elements.append('i')
  2770. elif k == 'textDecoration' and v == 'underline':
  2771. self._out += '<u>'
  2772. unclosed_elements.append('u')
  2773. if font:
  2774. self._out += '<font' + font + '>'
  2775. unclosed_elements.append('font')
  2776. applied_style = {}
  2777. if self._applied_styles:
  2778. applied_style.update(self._applied_styles[-1])
  2779. applied_style.update(style)
  2780. self._applied_styles.append(applied_style)
  2781. self._unclosed_elements.append(unclosed_elements)
  2782. def end(self, tag):
  2783. if tag not in (_x('ttml:br'), 'br'):
  2784. unclosed_elements = self._unclosed_elements.pop()
  2785. for element in reversed(unclosed_elements):
  2786. self._out += f'</{element}>'
  2787. if unclosed_elements and self._applied_styles:
  2788. self._applied_styles.pop()
  2789. def data(self, data):
  2790. self._out += data
  2791. def close(self):
  2792. return self._out.strip()
  2793. # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
  2794. # This will not trigger false positives since only UTF-8 text is being replaced
  2795. dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
  2796. def parse_node(node):
  2797. target = TTMLPElementParser()
  2798. parser = xml.etree.ElementTree.XMLParser(target=target)
  2799. parser.feed(xml.etree.ElementTree.tostring(node))
  2800. return parser.close()
  2801. for k, v in LEGACY_NAMESPACES:
  2802. for ns in v:
  2803. dfxp_data = dfxp_data.replace(ns, k)
  2804. dfxp = compat_etree_fromstring(dfxp_data)
  2805. out = []
  2806. paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
  2807. if not paras:
  2808. raise ValueError('Invalid dfxp/TTML subtitle')
  2809. repeat = False
  2810. while True:
  2811. for style in dfxp.findall(_x('.//ttml:style')):
  2812. style_id = style.get('id') or style.get(_x('xml:id'))
  2813. if not style_id:
  2814. continue
  2815. parent_style_id = style.get('style')
  2816. if parent_style_id:
  2817. if parent_style_id not in styles:
  2818. repeat = True
  2819. continue
  2820. styles[style_id] = styles[parent_style_id].copy()
  2821. for prop in SUPPORTED_STYLING:
  2822. prop_val = style.get(_x('tts:' + prop))
  2823. if prop_val:
  2824. styles.setdefault(style_id, {})[prop] = prop_val
  2825. if repeat:
  2826. repeat = False
  2827. else:
  2828. break
  2829. for p in ('body', 'div'):
  2830. ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
  2831. if ele is None:
  2832. continue
  2833. style = styles.get(ele.get('style'))
  2834. if not style:
  2835. continue
  2836. default_style.update(style)
  2837. for para, index in zip(paras, itertools.count(1)):
  2838. begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
  2839. end_time = parse_dfxp_time_expr(para.attrib.get('end'))
  2840. dur = parse_dfxp_time_expr(para.attrib.get('dur'))
  2841. if begin_time is None:
  2842. continue
  2843. if not end_time:
  2844. if not dur:
  2845. continue
  2846. end_time = begin_time + dur
  2847. out.append('%d\n%s --> %s\n%s\n\n' % (
  2848. index,
  2849. srt_subtitles_timecode(begin_time),
  2850. srt_subtitles_timecode(end_time),
  2851. parse_node(para)))
  2852. return ''.join(out)
  2853. def cli_option(params, command_option, param, separator=None):
  2854. param = params.get(param)
  2855. return ([] if param is None
  2856. else [command_option, str(param)] if separator is None
  2857. else [f'{command_option}{separator}{param}'])
  2858. def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
  2859. param = params.get(param)
  2860. assert param in (True, False, None)
  2861. return cli_option({True: true_value, False: false_value}, command_option, param, separator)
  2862. def cli_valueless_option(params, command_option, param, expected_value=True):
  2863. return [command_option] if params.get(param) == expected_value else []
  2864. def cli_configuration_args(argdict, keys, default=[], use_compat=True):
  2865. if isinstance(argdict, (list, tuple)): # for backward compatibility
  2866. if use_compat:
  2867. return argdict
  2868. else:
  2869. argdict = None
  2870. if argdict is None:
  2871. return default
  2872. assert isinstance(argdict, dict)
  2873. assert isinstance(keys, (list, tuple))
  2874. for key_list in keys:
  2875. arg_list = list(filter(
  2876. lambda x: x is not None,
  2877. [argdict.get(key.lower()) for key in variadic(key_list)]))
  2878. if arg_list:
  2879. return [arg for args in arg_list for arg in args]
  2880. return default
  2881. def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
  2882. main_key, exe = main_key.lower(), exe.lower()
  2883. root_key = exe if main_key == exe else f'{main_key}+{exe}'
  2884. keys = [f'{root_key}{k}' for k in (keys or [''])]
  2885. if root_key in keys:
  2886. if main_key != exe:
  2887. keys.append((main_key, exe))
  2888. keys.append('default')
  2889. else:
  2890. use_compat = False
  2891. return cli_configuration_args(argdict, keys, default, use_compat)
  2892. class ISO639Utils:
  2893. # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
  2894. _lang_map = {
  2895. 'aa': 'aar',
  2896. 'ab': 'abk',
  2897. 'ae': 'ave',
  2898. 'af': 'afr',
  2899. 'ak': 'aka',
  2900. 'am': 'amh',
  2901. 'an': 'arg',
  2902. 'ar': 'ara',
  2903. 'as': 'asm',
  2904. 'av': 'ava',
  2905. 'ay': 'aym',
  2906. 'az': 'aze',
  2907. 'ba': 'bak',
  2908. 'be': 'bel',
  2909. 'bg': 'bul',
  2910. 'bh': 'bih',
  2911. 'bi': 'bis',
  2912. 'bm': 'bam',
  2913. 'bn': 'ben',
  2914. 'bo': 'bod',
  2915. 'br': 'bre',
  2916. 'bs': 'bos',
  2917. 'ca': 'cat',
  2918. 'ce': 'che',
  2919. 'ch': 'cha',
  2920. 'co': 'cos',
  2921. 'cr': 'cre',
  2922. 'cs': 'ces',
  2923. 'cu': 'chu',
  2924. 'cv': 'chv',
  2925. 'cy': 'cym',
  2926. 'da': 'dan',
  2927. 'de': 'deu',
  2928. 'dv': 'div',
  2929. 'dz': 'dzo',
  2930. 'ee': 'ewe',
  2931. 'el': 'ell',
  2932. 'en': 'eng',
  2933. 'eo': 'epo',
  2934. 'es': 'spa',
  2935. 'et': 'est',
  2936. 'eu': 'eus',
  2937. 'fa': 'fas',
  2938. 'ff': 'ful',
  2939. 'fi': 'fin',
  2940. 'fj': 'fij',
  2941. 'fo': 'fao',
  2942. 'fr': 'fra',
  2943. 'fy': 'fry',
  2944. 'ga': 'gle',
  2945. 'gd': 'gla',
  2946. 'gl': 'glg',
  2947. 'gn': 'grn',
  2948. 'gu': 'guj',
  2949. 'gv': 'glv',
  2950. 'ha': 'hau',
  2951. 'he': 'heb',
  2952. 'iw': 'heb', # Replaced by he in 1989 revision
  2953. 'hi': 'hin',
  2954. 'ho': 'hmo',
  2955. 'hr': 'hrv',
  2956. 'ht': 'hat',
  2957. 'hu': 'hun',
  2958. 'hy': 'hye',
  2959. 'hz': 'her',
  2960. 'ia': 'ina',
  2961. 'id': 'ind',
  2962. 'in': 'ind', # Replaced by id in 1989 revision
  2963. 'ie': 'ile',
  2964. 'ig': 'ibo',
  2965. 'ii': 'iii',
  2966. 'ik': 'ipk',
  2967. 'io': 'ido',
  2968. 'is': 'isl',
  2969. 'it': 'ita',
  2970. 'iu': 'iku',
  2971. 'ja': 'jpn',
  2972. 'jv': 'jav',
  2973. 'ka': 'kat',
  2974. 'kg': 'kon',
  2975. 'ki': 'kik',
  2976. 'kj': 'kua',
  2977. 'kk': 'kaz',
  2978. 'kl': 'kal',
  2979. 'km': 'khm',
  2980. 'kn': 'kan',
  2981. 'ko': 'kor',
  2982. 'kr': 'kau',
  2983. 'ks': 'kas',
  2984. 'ku': 'kur',
  2985. 'kv': 'kom',
  2986. 'kw': 'cor',
  2987. 'ky': 'kir',
  2988. 'la': 'lat',
  2989. 'lb': 'ltz',
  2990. 'lg': 'lug',
  2991. 'li': 'lim',
  2992. 'ln': 'lin',
  2993. 'lo': 'lao',
  2994. 'lt': 'lit',
  2995. 'lu': 'lub',
  2996. 'lv': 'lav',
  2997. 'mg': 'mlg',
  2998. 'mh': 'mah',
  2999. 'mi': 'mri',
  3000. 'mk': 'mkd',
  3001. 'ml': 'mal',
  3002. 'mn': 'mon',
  3003. 'mr': 'mar',
  3004. 'ms': 'msa',
  3005. 'mt': 'mlt',
  3006. 'my': 'mya',
  3007. 'na': 'nau',
  3008. 'nb': 'nob',
  3009. 'nd': 'nde',
  3010. 'ne': 'nep',
  3011. 'ng': 'ndo',
  3012. 'nl': 'nld',
  3013. 'nn': 'nno',
  3014. 'no': 'nor',
  3015. 'nr': 'nbl',
  3016. 'nv': 'nav',
  3017. 'ny': 'nya',
  3018. 'oc': 'oci',
  3019. 'oj': 'oji',
  3020. 'om': 'orm',
  3021. 'or': 'ori',
  3022. 'os': 'oss',
  3023. 'pa': 'pan',
  3024. 'pe': 'per',
  3025. 'pi': 'pli',
  3026. 'pl': 'pol',
  3027. 'ps': 'pus',
  3028. 'pt': 'por',
  3029. 'qu': 'que',
  3030. 'rm': 'roh',
  3031. 'rn': 'run',
  3032. 'ro': 'ron',
  3033. 'ru': 'rus',
  3034. 'rw': 'kin',
  3035. 'sa': 'san',
  3036. 'sc': 'srd',
  3037. 'sd': 'snd',
  3038. 'se': 'sme',
  3039. 'sg': 'sag',
  3040. 'si': 'sin',
  3041. 'sk': 'slk',
  3042. 'sl': 'slv',
  3043. 'sm': 'smo',
  3044. 'sn': 'sna',
  3045. 'so': 'som',
  3046. 'sq': 'sqi',
  3047. 'sr': 'srp',
  3048. 'ss': 'ssw',
  3049. 'st': 'sot',
  3050. 'su': 'sun',
  3051. 'sv': 'swe',
  3052. 'sw': 'swa',
  3053. 'ta': 'tam',
  3054. 'te': 'tel',
  3055. 'tg': 'tgk',
  3056. 'th': 'tha',
  3057. 'ti': 'tir',
  3058. 'tk': 'tuk',
  3059. 'tl': 'tgl',
  3060. 'tn': 'tsn',
  3061. 'to': 'ton',
  3062. 'tr': 'tur',
  3063. 'ts': 'tso',
  3064. 'tt': 'tat',
  3065. 'tw': 'twi',
  3066. 'ty': 'tah',
  3067. 'ug': 'uig',
  3068. 'uk': 'ukr',
  3069. 'ur': 'urd',
  3070. 'uz': 'uzb',
  3071. 've': 'ven',
  3072. 'vi': 'vie',
  3073. 'vo': 'vol',
  3074. 'wa': 'wln',
  3075. 'wo': 'wol',
  3076. 'xh': 'xho',
  3077. 'yi': 'yid',
  3078. 'ji': 'yid', # Replaced by yi in 1989 revision
  3079. 'yo': 'yor',
  3080. 'za': 'zha',
  3081. 'zh': 'zho',
  3082. 'zu': 'zul',
  3083. }
  3084. @classmethod
  3085. def short2long(cls, code):
  3086. """Convert language code from ISO 639-1 to ISO 639-2/T"""
  3087. return cls._lang_map.get(code[:2])
  3088. @classmethod
  3089. def long2short(cls, code):
  3090. """Convert language code from ISO 639-2/T to ISO 639-1"""
  3091. for short_name, long_name in cls._lang_map.items():
  3092. if long_name == code:
  3093. return short_name
  3094. class ISO3166Utils:
  3095. # From http://data.okfn.org/data/core/country-list
  3096. _country_map = {
  3097. 'AF': 'Afghanistan',
  3098. 'AX': 'Åland Islands',
  3099. 'AL': 'Albania',
  3100. 'DZ': 'Algeria',
  3101. 'AS': 'American Samoa',
  3102. 'AD': 'Andorra',
  3103. 'AO': 'Angola',
  3104. 'AI': 'Anguilla',
  3105. 'AQ': 'Antarctica',
  3106. 'AG': 'Antigua and Barbuda',
  3107. 'AR': 'Argentina',
  3108. 'AM': 'Armenia',
  3109. 'AW': 'Aruba',
  3110. 'AU': 'Australia',
  3111. 'AT': 'Austria',
  3112. 'AZ': 'Azerbaijan',
  3113. 'BS': 'Bahamas',
  3114. 'BH': 'Bahrain',
  3115. 'BD': 'Bangladesh',
  3116. 'BB': 'Barbados',
  3117. 'BY': 'Belarus',
  3118. 'BE': 'Belgium',
  3119. 'BZ': 'Belize',
  3120. 'BJ': 'Benin',
  3121. 'BM': 'Bermuda',
  3122. 'BT': 'Bhutan',
  3123. 'BO': 'Bolivia, Plurinational State of',
  3124. 'BQ': 'Bonaire, Sint Eustatius and Saba',
  3125. 'BA': 'Bosnia and Herzegovina',
  3126. 'BW': 'Botswana',
  3127. 'BV': 'Bouvet Island',
  3128. 'BR': 'Brazil',
  3129. 'IO': 'British Indian Ocean Territory',
  3130. 'BN': 'Brunei Darussalam',
  3131. 'BG': 'Bulgaria',
  3132. 'BF': 'Burkina Faso',
  3133. 'BI': 'Burundi',
  3134. 'KH': 'Cambodia',
  3135. 'CM': 'Cameroon',
  3136. 'CA': 'Canada',
  3137. 'CV': 'Cape Verde',
  3138. 'KY': 'Cayman Islands',
  3139. 'CF': 'Central African Republic',
  3140. 'TD': 'Chad',
  3141. 'CL': 'Chile',
  3142. 'CN': 'China',
  3143. 'CX': 'Christmas Island',
  3144. 'CC': 'Cocos (Keeling) Islands',
  3145. 'CO': 'Colombia',
  3146. 'KM': 'Comoros',
  3147. 'CG': 'Congo',
  3148. 'CD': 'Congo, the Democratic Republic of the',
  3149. 'CK': 'Cook Islands',
  3150. 'CR': 'Costa Rica',
  3151. 'CI': 'Côte d\'Ivoire',
  3152. 'HR': 'Croatia',
  3153. 'CU': 'Cuba',
  3154. 'CW': 'Curaçao',
  3155. 'CY': 'Cyprus',
  3156. 'CZ': 'Czech Republic',
  3157. 'DK': 'Denmark',
  3158. 'DJ': 'Djibouti',
  3159. 'DM': 'Dominica',
  3160. 'DO': 'Dominican Republic',
  3161. 'EC': 'Ecuador',
  3162. 'EG': 'Egypt',
  3163. 'SV': 'El Salvador',
  3164. 'GQ': 'Equatorial Guinea',
  3165. 'ER': 'Eritrea',
  3166. 'EE': 'Estonia',
  3167. 'ET': 'Ethiopia',
  3168. 'FK': 'Falkland Islands (Malvinas)',
  3169. 'FO': 'Faroe Islands',
  3170. 'FJ': 'Fiji',
  3171. 'FI': 'Finland',
  3172. 'FR': 'France',
  3173. 'GF': 'French Guiana',
  3174. 'PF': 'French Polynesia',
  3175. 'TF': 'French Southern Territories',
  3176. 'GA': 'Gabon',
  3177. 'GM': 'Gambia',
  3178. 'GE': 'Georgia',
  3179. 'DE': 'Germany',
  3180. 'GH': 'Ghana',
  3181. 'GI': 'Gibraltar',
  3182. 'GR': 'Greece',
  3183. 'GL': 'Greenland',
  3184. 'GD': 'Grenada',
  3185. 'GP': 'Guadeloupe',
  3186. 'GU': 'Guam',
  3187. 'GT': 'Guatemala',
  3188. 'GG': 'Guernsey',
  3189. 'GN': 'Guinea',
  3190. 'GW': 'Guinea-Bissau',
  3191. 'GY': 'Guyana',
  3192. 'HT': 'Haiti',
  3193. 'HM': 'Heard Island and McDonald Islands',
  3194. 'VA': 'Holy See (Vatican City State)',
  3195. 'HN': 'Honduras',
  3196. 'HK': 'Hong Kong',
  3197. 'HU': 'Hungary',
  3198. 'IS': 'Iceland',
  3199. 'IN': 'India',
  3200. 'ID': 'Indonesia',
  3201. 'IR': 'Iran, Islamic Republic of',
  3202. 'IQ': 'Iraq',
  3203. 'IE': 'Ireland',
  3204. 'IM': 'Isle of Man',
  3205. 'IL': 'Israel',
  3206. 'IT': 'Italy',
  3207. 'JM': 'Jamaica',
  3208. 'JP': 'Japan',
  3209. 'JE': 'Jersey',
  3210. 'JO': 'Jordan',
  3211. 'KZ': 'Kazakhstan',
  3212. 'KE': 'Kenya',
  3213. 'KI': 'Kiribati',
  3214. 'KP': 'Korea, Democratic People\'s Republic of',
  3215. 'KR': 'Korea, Republic of',
  3216. 'KW': 'Kuwait',
  3217. 'KG': 'Kyrgyzstan',
  3218. 'LA': 'Lao People\'s Democratic Republic',
  3219. 'LV': 'Latvia',
  3220. 'LB': 'Lebanon',
  3221. 'LS': 'Lesotho',
  3222. 'LR': 'Liberia',
  3223. 'LY': 'Libya',
  3224. 'LI': 'Liechtenstein',
  3225. 'LT': 'Lithuania',
  3226. 'LU': 'Luxembourg',
  3227. 'MO': 'Macao',
  3228. 'MK': 'Macedonia, the Former Yugoslav Republic of',
  3229. 'MG': 'Madagascar',
  3230. 'MW': 'Malawi',
  3231. 'MY': 'Malaysia',
  3232. 'MV': 'Maldives',
  3233. 'ML': 'Mali',
  3234. 'MT': 'Malta',
  3235. 'MH': 'Marshall Islands',
  3236. 'MQ': 'Martinique',
  3237. 'MR': 'Mauritania',
  3238. 'MU': 'Mauritius',
  3239. 'YT': 'Mayotte',
  3240. 'MX': 'Mexico',
  3241. 'FM': 'Micronesia, Federated States of',
  3242. 'MD': 'Moldova, Republic of',
  3243. 'MC': 'Monaco',
  3244. 'MN': 'Mongolia',
  3245. 'ME': 'Montenegro',
  3246. 'MS': 'Montserrat',
  3247. 'MA': 'Morocco',
  3248. 'MZ': 'Mozambique',
  3249. 'MM': 'Myanmar',
  3250. 'NA': 'Namibia',
  3251. 'NR': 'Nauru',
  3252. 'NP': 'Nepal',
  3253. 'NL': 'Netherlands',
  3254. 'NC': 'New Caledonia',
  3255. 'NZ': 'New Zealand',
  3256. 'NI': 'Nicaragua',
  3257. 'NE': 'Niger',
  3258. 'NG': 'Nigeria',
  3259. 'NU': 'Niue',
  3260. 'NF': 'Norfolk Island',
  3261. 'MP': 'Northern Mariana Islands',
  3262. 'NO': 'Norway',
  3263. 'OM': 'Oman',
  3264. 'PK': 'Pakistan',
  3265. 'PW': 'Palau',
  3266. 'PS': 'Palestine, State of',
  3267. 'PA': 'Panama',
  3268. 'PG': 'Papua New Guinea',
  3269. 'PY': 'Paraguay',
  3270. 'PE': 'Peru',
  3271. 'PH': 'Philippines',
  3272. 'PN': 'Pitcairn',
  3273. 'PL': 'Poland',
  3274. 'PT': 'Portugal',
  3275. 'PR': 'Puerto Rico',
  3276. 'QA': 'Qatar',
  3277. 'RE': 'Réunion',
  3278. 'RO': 'Romania',
  3279. 'RU': 'Russian Federation',
  3280. 'RW': 'Rwanda',
  3281. 'BL': 'Saint Barthélemy',
  3282. 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
  3283. 'KN': 'Saint Kitts and Nevis',
  3284. 'LC': 'Saint Lucia',
  3285. 'MF': 'Saint Martin (French part)',
  3286. 'PM': 'Saint Pierre and Miquelon',
  3287. 'VC': 'Saint Vincent and the Grenadines',
  3288. 'WS': 'Samoa',
  3289. 'SM': 'San Marino',
  3290. 'ST': 'Sao Tome and Principe',
  3291. 'SA': 'Saudi Arabia',
  3292. 'SN': 'Senegal',
  3293. 'RS': 'Serbia',
  3294. 'SC': 'Seychelles',
  3295. 'SL': 'Sierra Leone',
  3296. 'SG': 'Singapore',
  3297. 'SX': 'Sint Maarten (Dutch part)',
  3298. 'SK': 'Slovakia',
  3299. 'SI': 'Slovenia',
  3300. 'SB': 'Solomon Islands',
  3301. 'SO': 'Somalia',
  3302. 'ZA': 'South Africa',
  3303. 'GS': 'South Georgia and the South Sandwich Islands',
  3304. 'SS': 'South Sudan',
  3305. 'ES': 'Spain',
  3306. 'LK': 'Sri Lanka',
  3307. 'SD': 'Sudan',
  3308. 'SR': 'Suriname',
  3309. 'SJ': 'Svalbard and Jan Mayen',
  3310. 'SZ': 'Swaziland',
  3311. 'SE': 'Sweden',
  3312. 'CH': 'Switzerland',
  3313. 'SY': 'Syrian Arab Republic',
  3314. 'TW': 'Taiwan, Province of China',
  3315. 'TJ': 'Tajikistan',
  3316. 'TZ': 'Tanzania, United Republic of',
  3317. 'TH': 'Thailand',
  3318. 'TL': 'Timor-Leste',
  3319. 'TG': 'Togo',
  3320. 'TK': 'Tokelau',
  3321. 'TO': 'Tonga',
  3322. 'TT': 'Trinidad and Tobago',
  3323. 'TN': 'Tunisia',
  3324. 'TR': 'Turkey',
  3325. 'TM': 'Turkmenistan',
  3326. 'TC': 'Turks and Caicos Islands',
  3327. 'TV': 'Tuvalu',
  3328. 'UG': 'Uganda',
  3329. 'UA': 'Ukraine',
  3330. 'AE': 'United Arab Emirates',
  3331. 'GB': 'United Kingdom',
  3332. 'US': 'United States',
  3333. 'UM': 'United States Minor Outlying Islands',
  3334. 'UY': 'Uruguay',
  3335. 'UZ': 'Uzbekistan',
  3336. 'VU': 'Vanuatu',
  3337. 'VE': 'Venezuela, Bolivarian Republic of',
  3338. 'VN': 'Viet Nam',
  3339. 'VG': 'Virgin Islands, British',
  3340. 'VI': 'Virgin Islands, U.S.',
  3341. 'WF': 'Wallis and Futuna',
  3342. 'EH': 'Western Sahara',
  3343. 'YE': 'Yemen',
  3344. 'ZM': 'Zambia',
  3345. 'ZW': 'Zimbabwe',
  3346. # Not ISO 3166 codes, but used for IP blocks
  3347. 'AP': 'Asia/Pacific Region',
  3348. 'EU': 'Europe',
  3349. }
  3350. @classmethod
  3351. def short2full(cls, code):
  3352. """Convert an ISO 3166-2 country code to the corresponding full name"""
  3353. return cls._country_map.get(code.upper())
  3354. class GeoUtils:
  3355. # Major IPv4 address blocks per country
  3356. _country_ip_map = {
  3357. 'AD': '46.172.224.0/19',
  3358. 'AE': '94.200.0.0/13',
  3359. 'AF': '149.54.0.0/17',
  3360. 'AG': '209.59.64.0/18',
  3361. 'AI': '204.14.248.0/21',
  3362. 'AL': '46.99.0.0/16',
  3363. 'AM': '46.70.0.0/15',
  3364. 'AO': '105.168.0.0/13',
  3365. 'AP': '182.50.184.0/21',
  3366. 'AQ': '23.154.160.0/24',
  3367. 'AR': '181.0.0.0/12',
  3368. 'AS': '202.70.112.0/20',
  3369. 'AT': '77.116.0.0/14',
  3370. 'AU': '1.128.0.0/11',
  3371. 'AW': '181.41.0.0/18',
  3372. 'AX': '185.217.4.0/22',
  3373. 'AZ': '5.197.0.0/16',
  3374. 'BA': '31.176.128.0/17',
  3375. 'BB': '65.48.128.0/17',
  3376. 'BD': '114.130.0.0/16',
  3377. 'BE': '57.0.0.0/8',
  3378. 'BF': '102.178.0.0/15',
  3379. 'BG': '95.42.0.0/15',
  3380. 'BH': '37.131.0.0/17',
  3381. 'BI': '154.117.192.0/18',
  3382. 'BJ': '137.255.0.0/16',
  3383. 'BL': '185.212.72.0/23',
  3384. 'BM': '196.12.64.0/18',
  3385. 'BN': '156.31.0.0/16',
  3386. 'BO': '161.56.0.0/16',
  3387. 'BQ': '161.0.80.0/20',
  3388. 'BR': '191.128.0.0/12',
  3389. 'BS': '24.51.64.0/18',
  3390. 'BT': '119.2.96.0/19',
  3391. 'BW': '168.167.0.0/16',
  3392. 'BY': '178.120.0.0/13',
  3393. 'BZ': '179.42.192.0/18',
  3394. 'CA': '99.224.0.0/11',
  3395. 'CD': '41.243.0.0/16',
  3396. 'CF': '197.242.176.0/21',
  3397. 'CG': '160.113.0.0/16',
  3398. 'CH': '85.0.0.0/13',
  3399. 'CI': '102.136.0.0/14',
  3400. 'CK': '202.65.32.0/19',
  3401. 'CL': '152.172.0.0/14',
  3402. 'CM': '102.244.0.0/14',
  3403. 'CN': '36.128.0.0/10',
  3404. 'CO': '181.240.0.0/12',
  3405. 'CR': '201.192.0.0/12',
  3406. 'CU': '152.206.0.0/15',
  3407. 'CV': '165.90.96.0/19',
  3408. 'CW': '190.88.128.0/17',
  3409. 'CY': '31.153.0.0/16',
  3410. 'CZ': '88.100.0.0/14',
  3411. 'DE': '53.0.0.0/8',
  3412. 'DJ': '197.241.0.0/17',
  3413. 'DK': '87.48.0.0/12',
  3414. 'DM': '192.243.48.0/20',
  3415. 'DO': '152.166.0.0/15',
  3416. 'DZ': '41.96.0.0/12',
  3417. 'EC': '186.68.0.0/15',
  3418. 'EE': '90.190.0.0/15',
  3419. 'EG': '156.160.0.0/11',
  3420. 'ER': '196.200.96.0/20',
  3421. 'ES': '88.0.0.0/11',
  3422. 'ET': '196.188.0.0/14',
  3423. 'EU': '2.16.0.0/13',
  3424. 'FI': '91.152.0.0/13',
  3425. 'FJ': '144.120.0.0/16',
  3426. 'FK': '80.73.208.0/21',
  3427. 'FM': '119.252.112.0/20',
  3428. 'FO': '88.85.32.0/19',
  3429. 'FR': '90.0.0.0/9',
  3430. 'GA': '41.158.0.0/15',
  3431. 'GB': '25.0.0.0/8',
  3432. 'GD': '74.122.88.0/21',
  3433. 'GE': '31.146.0.0/16',
  3434. 'GF': '161.22.64.0/18',
  3435. 'GG': '62.68.160.0/19',
  3436. 'GH': '154.160.0.0/12',
  3437. 'GI': '95.164.0.0/16',
  3438. 'GL': '88.83.0.0/19',
  3439. 'GM': '160.182.0.0/15',
  3440. 'GN': '197.149.192.0/18',
  3441. 'GP': '104.250.0.0/19',
  3442. 'GQ': '105.235.224.0/20',
  3443. 'GR': '94.64.0.0/13',
  3444. 'GT': '168.234.0.0/16',
  3445. 'GU': '168.123.0.0/16',
  3446. 'GW': '197.214.80.0/20',
  3447. 'GY': '181.41.64.0/18',
  3448. 'HK': '113.252.0.0/14',
  3449. 'HN': '181.210.0.0/16',
  3450. 'HR': '93.136.0.0/13',
  3451. 'HT': '148.102.128.0/17',
  3452. 'HU': '84.0.0.0/14',
  3453. 'ID': '39.192.0.0/10',
  3454. 'IE': '87.32.0.0/12',
  3455. 'IL': '79.176.0.0/13',
  3456. 'IM': '5.62.80.0/20',
  3457. 'IN': '117.192.0.0/10',
  3458. 'IO': '203.83.48.0/21',
  3459. 'IQ': '37.236.0.0/14',
  3460. 'IR': '2.176.0.0/12',
  3461. 'IS': '82.221.0.0/16',
  3462. 'IT': '79.0.0.0/10',
  3463. 'JE': '87.244.64.0/18',
  3464. 'JM': '72.27.0.0/17',
  3465. 'JO': '176.29.0.0/16',
  3466. 'JP': '133.0.0.0/8',
  3467. 'KE': '105.48.0.0/12',
  3468. 'KG': '158.181.128.0/17',
  3469. 'KH': '36.37.128.0/17',
  3470. 'KI': '103.25.140.0/22',
  3471. 'KM': '197.255.224.0/20',
  3472. 'KN': '198.167.192.0/19',
  3473. 'KP': '175.45.176.0/22',
  3474. 'KR': '175.192.0.0/10',
  3475. 'KW': '37.36.0.0/14',
  3476. 'KY': '64.96.0.0/15',
  3477. 'KZ': '2.72.0.0/13',
  3478. 'LA': '115.84.64.0/18',
  3479. 'LB': '178.135.0.0/16',
  3480. 'LC': '24.92.144.0/20',
  3481. 'LI': '82.117.0.0/19',
  3482. 'LK': '112.134.0.0/15',
  3483. 'LR': '102.183.0.0/16',
  3484. 'LS': '129.232.0.0/17',
  3485. 'LT': '78.56.0.0/13',
  3486. 'LU': '188.42.0.0/16',
  3487. 'LV': '46.109.0.0/16',
  3488. 'LY': '41.252.0.0/14',
  3489. 'MA': '105.128.0.0/11',
  3490. 'MC': '88.209.64.0/18',
  3491. 'MD': '37.246.0.0/16',
  3492. 'ME': '178.175.0.0/17',
  3493. 'MF': '74.112.232.0/21',
  3494. 'MG': '154.126.0.0/17',
  3495. 'MH': '117.103.88.0/21',
  3496. 'MK': '77.28.0.0/15',
  3497. 'ML': '154.118.128.0/18',
  3498. 'MM': '37.111.0.0/17',
  3499. 'MN': '49.0.128.0/17',
  3500. 'MO': '60.246.0.0/16',
  3501. 'MP': '202.88.64.0/20',
  3502. 'MQ': '109.203.224.0/19',
  3503. 'MR': '41.188.64.0/18',
  3504. 'MS': '208.90.112.0/22',
  3505. 'MT': '46.11.0.0/16',
  3506. 'MU': '105.16.0.0/12',
  3507. 'MV': '27.114.128.0/18',
  3508. 'MW': '102.70.0.0/15',
  3509. 'MX': '187.192.0.0/11',
  3510. 'MY': '175.136.0.0/13',
  3511. 'MZ': '197.218.0.0/15',
  3512. 'NA': '41.182.0.0/16',
  3513. 'NC': '101.101.0.0/18',
  3514. 'NE': '197.214.0.0/18',
  3515. 'NF': '203.17.240.0/22',
  3516. 'NG': '105.112.0.0/12',
  3517. 'NI': '186.76.0.0/15',
  3518. 'NL': '145.96.0.0/11',
  3519. 'NO': '84.208.0.0/13',
  3520. 'NP': '36.252.0.0/15',
  3521. 'NR': '203.98.224.0/19',
  3522. 'NU': '49.156.48.0/22',
  3523. 'NZ': '49.224.0.0/14',
  3524. 'OM': '5.36.0.0/15',
  3525. 'PA': '186.72.0.0/15',
  3526. 'PE': '186.160.0.0/14',
  3527. 'PF': '123.50.64.0/18',
  3528. 'PG': '124.240.192.0/19',
  3529. 'PH': '49.144.0.0/13',
  3530. 'PK': '39.32.0.0/11',
  3531. 'PL': '83.0.0.0/11',
  3532. 'PM': '70.36.0.0/20',
  3533. 'PR': '66.50.0.0/16',
  3534. 'PS': '188.161.0.0/16',
  3535. 'PT': '85.240.0.0/13',
  3536. 'PW': '202.124.224.0/20',
  3537. 'PY': '181.120.0.0/14',
  3538. 'QA': '37.210.0.0/15',
  3539. 'RE': '102.35.0.0/16',
  3540. 'RO': '79.112.0.0/13',
  3541. 'RS': '93.86.0.0/15',
  3542. 'RU': '5.136.0.0/13',
  3543. 'RW': '41.186.0.0/16',
  3544. 'SA': '188.48.0.0/13',
  3545. 'SB': '202.1.160.0/19',
  3546. 'SC': '154.192.0.0/11',
  3547. 'SD': '102.120.0.0/13',
  3548. 'SE': '78.64.0.0/12',
  3549. 'SG': '8.128.0.0/10',
  3550. 'SI': '188.196.0.0/14',
  3551. 'SK': '78.98.0.0/15',
  3552. 'SL': '102.143.0.0/17',
  3553. 'SM': '89.186.32.0/19',
  3554. 'SN': '41.82.0.0/15',
  3555. 'SO': '154.115.192.0/18',
  3556. 'SR': '186.179.128.0/17',
  3557. 'SS': '105.235.208.0/21',
  3558. 'ST': '197.159.160.0/19',
  3559. 'SV': '168.243.0.0/16',
  3560. 'SX': '190.102.0.0/20',
  3561. 'SY': '5.0.0.0/16',
  3562. 'SZ': '41.84.224.0/19',
  3563. 'TC': '65.255.48.0/20',
  3564. 'TD': '154.68.128.0/19',
  3565. 'TG': '196.168.0.0/14',
  3566. 'TH': '171.96.0.0/13',
  3567. 'TJ': '85.9.128.0/18',
  3568. 'TK': '27.96.24.0/21',
  3569. 'TL': '180.189.160.0/20',
  3570. 'TM': '95.85.96.0/19',
  3571. 'TN': '197.0.0.0/11',
  3572. 'TO': '175.176.144.0/21',
  3573. 'TR': '78.160.0.0/11',
  3574. 'TT': '186.44.0.0/15',
  3575. 'TV': '202.2.96.0/19',
  3576. 'TW': '120.96.0.0/11',
  3577. 'TZ': '156.156.0.0/14',
  3578. 'UA': '37.52.0.0/14',
  3579. 'UG': '102.80.0.0/13',
  3580. 'US': '6.0.0.0/8',
  3581. 'UY': '167.56.0.0/13',
  3582. 'UZ': '84.54.64.0/18',
  3583. 'VA': '212.77.0.0/19',
  3584. 'VC': '207.191.240.0/21',
  3585. 'VE': '186.88.0.0/13',
  3586. 'VG': '66.81.192.0/20',
  3587. 'VI': '146.226.0.0/16',
  3588. 'VN': '14.160.0.0/11',
  3589. 'VU': '202.80.32.0/20',
  3590. 'WF': '117.20.32.0/21',
  3591. 'WS': '202.4.32.0/19',
  3592. 'YE': '134.35.0.0/16',
  3593. 'YT': '41.242.116.0/22',
  3594. 'ZA': '41.0.0.0/11',
  3595. 'ZM': '102.144.0.0/13',
  3596. 'ZW': '102.177.192.0/18',
  3597. }
  3598. @classmethod
  3599. def random_ipv4(cls, code_or_block):
  3600. if len(code_or_block) == 2:
  3601. block = cls._country_ip_map.get(code_or_block.upper())
  3602. if not block:
  3603. return None
  3604. else:
  3605. block = code_or_block
  3606. addr, preflen = block.split('/')
  3607. addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
  3608. addr_max = addr_min | (0xffffffff >> int(preflen))
  3609. return str(socket.inet_ntoa(
  3610. struct.pack('!L', random.randint(addr_min, addr_max))))
  3611. # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
  3612. # released into Public Domain
  3613. # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
  3614. def long_to_bytes(n, blocksize=0):
  3615. """long_to_bytes(n:long, blocksize:int) : string
  3616. Convert a long integer to a byte string.
  3617. If optional blocksize is given and greater than zero, pad the front of the
  3618. byte string with binary zeros so that the length is a multiple of
  3619. blocksize.
  3620. """
  3621. # after much testing, this algorithm was deemed to be the fastest
  3622. s = b''
  3623. n = int(n)
  3624. while n > 0:
  3625. s = struct.pack('>I', n & 0xffffffff) + s
  3626. n = n >> 32
  3627. # strip off leading zeros
  3628. for i in range(len(s)):
  3629. if s[i] != b'\000'[0]:
  3630. break
  3631. else:
  3632. # only happens when n == 0
  3633. s = b'\000'
  3634. i = 0
  3635. s = s[i:]
  3636. # add back some pad bytes. this could be done more efficiently w.r.t. the
  3637. # de-padding being done above, but sigh...
  3638. if blocksize > 0 and len(s) % blocksize:
  3639. s = (blocksize - len(s) % blocksize) * b'\000' + s
  3640. return s
  3641. def bytes_to_long(s):
  3642. """bytes_to_long(string) : long
  3643. Convert a byte string to a long integer.
  3644. This is (essentially) the inverse of long_to_bytes().
  3645. """
  3646. acc = 0
  3647. length = len(s)
  3648. if length % 4:
  3649. extra = (4 - length % 4)
  3650. s = b'\000' * extra + s
  3651. length = length + extra
  3652. for i in range(0, length, 4):
  3653. acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
  3654. return acc
  3655. def ohdave_rsa_encrypt(data, exponent, modulus):
  3656. """
  3657. Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
  3658. Input:
  3659. data: data to encrypt, bytes-like object
  3660. exponent, modulus: parameter e and N of RSA algorithm, both integer
  3661. Output: hex string of encrypted data
  3662. Limitation: supports one block encryption only
  3663. """
  3664. payload = int(binascii.hexlify(data[::-1]), 16)
  3665. encrypted = pow(payload, exponent, modulus)
  3666. return f'{encrypted:x}'
  3667. def pkcs1pad(data, length):
  3668. """
  3669. Padding input data with PKCS#1 scheme
  3670. @param {int[]} data input data
  3671. @param {int} length target length
  3672. @returns {int[]} padded data
  3673. """
  3674. if len(data) > length - 11:
  3675. raise ValueError('Input data too long for PKCS#1 padding')
  3676. pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
  3677. return [0, 2, *pseudo_random, 0, *data]
  3678. def _base_n_table(n, table):
  3679. if not table and not n:
  3680. raise ValueError('Either table or n must be specified')
  3681. table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
  3682. if n and n != len(table):
  3683. raise ValueError(f'base {n} exceeds table length {len(table)}')
  3684. return table
  3685. def encode_base_n(num, n=None, table=None):
  3686. """Convert given int to a base-n string"""
  3687. table = _base_n_table(n, table)
  3688. if not num:
  3689. return table[0]
  3690. result, base = '', len(table)
  3691. while num:
  3692. result = table[num % base] + result
  3693. num = num // base
  3694. return result
  3695. def decode_base_n(string, n=None, table=None):
  3696. """Convert given base-n string to int"""
  3697. table = {char: index for index, char in enumerate(_base_n_table(n, table))}
  3698. result, base = 0, len(table)
  3699. for char in string:
  3700. result = result * base + table[char]
  3701. return result
  3702. def decode_packed_codes(code):
  3703. mobj = re.search(PACKED_CODES_RE, code)
  3704. obfuscated_code, base, count, symbols = mobj.groups()
  3705. base = int(base)
  3706. count = int(count)
  3707. symbols = symbols.split('|')
  3708. symbol_table = {}
  3709. while count:
  3710. count -= 1
  3711. base_n_count = encode_base_n(count, base)
  3712. symbol_table[base_n_count] = symbols[count] or base_n_count
  3713. return re.sub(
  3714. r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
  3715. obfuscated_code)
  3716. def caesar(s, alphabet, shift):
  3717. if shift == 0:
  3718. return s
  3719. l = len(alphabet)
  3720. return ''.join(
  3721. alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
  3722. for c in s)
  3723. def rot47(s):
  3724. return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
  3725. def parse_m3u8_attributes(attrib):
  3726. info = {}
  3727. for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
  3728. if val.startswith('"'):
  3729. val = val[1:-1]
  3730. info[key] = val
  3731. return info
  3732. def urshift(val, n):
  3733. return val >> n if val >= 0 else (val + 0x100000000) >> n
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file `path` to `value`.

    On Windows the value is written to the `path:key` stream. Elsewhere,
    `os.setxattr` or the `xattr` module is used when usable, and the
    `setfattr`/`xattr` executables otherwise.

    @param value  the attribute value as bytes (it is written to a binary
                  file on Windows and decoded before being passed to an
                  executable)
    @raises XAttrMetadataError     if writing the attribute fails
    @raises XAttrUnavailableError  if no module or executable is available
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file name from the stream name below
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # (an older pyxattr leaves setxattr unset, falling through to Method 2)
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command line argument, so it must be text
    value = value.decode()
    try:
        # The two tools take different flags: `xattr -w KEY VALUE FILE`
        # versus `setfattr -n KEY -v VALUE FILE`
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
  3779. def random_birthday(year_field, month_field, day_field):
  3780. start_date = dt.date(1950, 1, 1)
  3781. end_date = dt.date(1995, 12, 31)
  3782. offset = random.randint(0, (end_date - start_date).days)
  3783. random_date = start_date + dt.timedelta(offset)
  3784. return {
  3785. year_field: str(random_date.year),
  3786. month_field: str(random_date.month),
  3787. day_field: str(random_date.day),
  3788. }
  3789. def find_available_port(interface=''):
  3790. try:
  3791. with socket.socket() as sock:
  3792. sock.bind((interface, 0))
  3793. return sock.getsockname()[1]
  3794. except OSError:
  3795. return None
# Templates for internet shortcut files, which are plain text files.
# Each one is a %-format string that takes its values from a mapping:
# all of them use `url`, the desktop entry additionally uses `filename`.

# INI-style `[InternetShortcut]` file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (XML) holding the URL
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `[Desktop Entry]` file of type Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup of the templates above by shortcut type name
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
  3824. def iri_to_uri(iri):
  3825. """
  3826. Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
  3827. The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
  3828. """
  3829. iri_parts = urllib.parse.urlparse(iri)
  3830. if '[' in iri_parts.netloc:
  3831. raise ValueError('IPv6 URIs are not, yet, supported.')
  3832. # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
  3833. # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
  3834. net_location = ''
  3835. if iri_parts.username:
  3836. net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
  3837. if iri_parts.password is not None:
  3838. net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
  3839. net_location += '@'
  3840. net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
  3841. # The 'idna' encoding produces ASCII text.
  3842. if iri_parts.port is not None and iri_parts.port != 80:
  3843. net_location += ':' + str(iri_parts.port)
  3844. return urllib.parse.urlunparse(
  3845. (iri_parts.scheme,
  3846. net_location,
  3847. urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
  3848. # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
  3849. urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
  3850. # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
  3851. urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
  3852. urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
  3853. # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
  3854. def to_high_limit_path(path):
  3855. if sys.platform in ['win32', 'cygwin']:
  3856. # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
  3857. return '\\\\?\\' + os.path.abspath(path)
  3858. return path
  3859. def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
  3860. val = traversal.traverse_obj(obj, *variadic(field))
  3861. if not val if ignore is NO_DEFAULT else val in variadic(ignore):
  3862. return default
  3863. return template % func(val)
  3864. def clean_podcast_url(url):
  3865. url = re.sub(r'''(?x)
  3866. (?:
  3867. (?:
  3868. chtbl\.com/track|
  3869. media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
  3870. play\.podtrac\.com|
  3871. chrt\.fm/track|
  3872. mgln\.ai/e
  3873. )(?:/[^/.]+)?|
  3874. (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
  3875. flex\.acast\.com|
  3876. pd(?:
  3877. cn\.co| # https://podcorn.com/analytics-prefix/
  3878. st\.fm # https://podsights.com/docs/
  3879. )/e|
  3880. [0-9]\.gum\.fm|
  3881. pscrb\.fm/rss/p
  3882. )/''', '', url)
  3883. return re.sub(r'^\w+://(\w+://)', r'\1', url)
  3884. _HEX_TABLE = '0123456789abcdef'
  3885. def random_uuidv4():
  3886. return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
  3887. def make_dir(path, to_screen=None):
  3888. try:
  3889. dn = os.path.dirname(path)
  3890. if dn:
  3891. os.makedirs(dn, exist_ok=True)
  3892. return True
  3893. except OSError as err:
  3894. if callable(to_screen) is not None:
  3895. to_screen(f'unable to create directory {err}')
  3896. return False
  3897. def get_executable_path():
  3898. from ..update import _get_variant_and_executable_path
  3899. return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
  3900. def get_user_config_dirs(package_name):
  3901. # .config (e.g. ~/.config/package_name)
  3902. xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
  3903. yield os.path.join(xdg_config_home, package_name)
  3904. # appdata (%APPDATA%/package_name)
  3905. appdata_dir = os.getenv('appdata')
  3906. if appdata_dir:
  3907. yield os.path.join(appdata_dir, package_name)
  3908. # home (~/.package_name)
  3909. yield os.path.join(compat_expanduser('~'), f'.{package_name}')
  3910. def get_system_config_dirs(package_name):
  3911. # /etc/package_name
  3912. yield os.path.join('/etc', package_name)
  3913. def time_seconds(**kwargs):
  3914. """
  3915. Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
  3916. """
  3917. return time.time() + dt.timedelta(**kwargs).total_seconds()
  3918. # create a JSON Web Signature (jws) with HS256 algorithm
  3919. # the resulting format is in JWS Compact Serialization
  3920. # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
  3921. # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
  3922. def jwt_encode_hs256(payload_data, key, headers={}):
  3923. header_data = {
  3924. 'alg': 'HS256',
  3925. 'typ': 'JWT',
  3926. }
  3927. if headers:
  3928. header_data.update(headers)
  3929. header_b64 = base64.b64encode(json.dumps(header_data).encode())
  3930. payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
  3931. h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
  3932. signature_b64 = base64.b64encode(h.digest())
  3933. return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
  3934. # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
  3935. def jwt_decode_hs256(jwt):
  3936. header_b64, payload_b64, signature_b64 = jwt.split('.')
  3937. # add trailing ='s that may have been stripped, superfluous ='s are ignored
  3938. return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
# Whether virtual terminal processing has been enabled for the Windows
# console: None when not running on Windows, otherwise False until
# windows_enable_vt_mode() succeeds and sets it to True
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
  3940. @functools.cache
  3941. def supports_terminal_sequences(stream):
  3942. if compat_os_name == 'nt':
  3943. if not WINDOWS_VT_MODE:
  3944. return False
  3945. elif not os.getenv('TERM'):
  3946. return False
  3947. try:
  3948. return stream.isatty()
  3949. except BaseException:
  3950. return False
def windows_enable_vt_mode():
    """Turn on virtual terminal processing for the Windows console.

    Does nothing on Windows versions older than 10.0.10586. On success,
    WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences() is cleared.

    Raises an Exception if the console mode cannot be read or changed.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported here rather than at module level
    # (presumably because they are only importable on Windows)
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output itself rather than relying on stdout
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add the flag on top of the mode bits that are already set
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Earlier answers were computed while VT mode was still disabled
    supports_terminal_sequences.cache_clear()
  3976. _terminal_sequences_re = re.compile('\033\\[[^m]+m')
  3977. def remove_terminal_sequences(string):
  3978. return _terminal_sequences_re.sub('', string)
  3979. def number_of_digits(number):
  3980. return len('%d' % number)
  3981. def join_nonempty(*values, delim='-', from_dict=None):
  3982. if from_dict is not None:
  3983. values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
  3984. return delim.join(map(str, filter(None, values)))
  3985. def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
  3986. """
  3987. Find the largest format dimensions in terms of video width and, for each thumbnail:
  3988. * Modify the URL: Match the width with the provided regex and replace with the former width
  3989. * Update dimensions
  3990. This function is useful with video services that scale the provided thumbnails on demand
  3991. """
  3992. _keys = ('width', 'height')
  3993. max_dimensions = max(
  3994. (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
  3995. default=(0, 0))
  3996. if not max_dimensions[0]:
  3997. return thumbnails
  3998. return [
  3999. merge_dicts(
  4000. {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
  4001. dict(zip(_keys, max_dimensions)), thumbnail)
  4002. for thumbnail in thumbnails
  4003. ]
  4004. def parse_http_range(range):
  4005. """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
  4006. if not range:
  4007. return None, None, None
  4008. crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
  4009. if not crg:
  4010. return None, None, None
  4011. return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
  4012. def read_stdin(what):
  4013. if what:
  4014. eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
  4015. write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
  4016. return sys.stdin
  4017. def determine_file_encoding(data):
  4018. """
  4019. Detect the text encoding used
  4020. @returns (encoding, bytes to skip)
  4021. """
  4022. # BOM marks are given priority over declarations
  4023. for bom, enc in BOMS:
  4024. if data.startswith(bom):
  4025. return enc, len(bom)
  4026. # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
  4027. # We ignore the endianness to get a good enough match
  4028. data = data.replace(b'\0', b'')
  4029. mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
  4030. return mobj.group(1).decode() if mobj else None, 0
class Config:
    """A set of command line arguments together with the configurations
    they pull in through their config locations.

    Configurations form a tree: every config location found while loading
    is loaded into a child Config, which is appended to `configs`.
    """
    # Arguments of this configuration itself (not those of its children)
    own_args = None
    # Set to `own_args` once they went through the parser in load_configs()
    parsed_args = None
    # File the arguments were read from, if any
    filename = None
    # Guards init() against being used on an already loaded instance
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser  the option parser; this class uses its
                       `parse_known_args`, `parse_args` and `error` methods
        @param label   name of this configuration, used by __str__
        """
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set the arguments (and their source file) and load them.

        @returns  what load_configs() returns
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse `own_args` and load every config location they name.

        @returns  False if `filename` has already been loaded (nothing is
                  parsed then), True otherwise
        """
        directory = ''
        if self.filename:
            # Resolve the path so that one file is not loaded twice
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' stands for STDIN, which is read at most once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Locations are joined onto the directory of the current file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Describe this configuration and, nested below it with a '| '
        prefix, its children. Login information is hidden."""
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a configuration file and split it into arguments.

        @param default  returned as is when the file cannot be opened
        @returns        the list of arguments; `#` comments are dropped
        @raises ValueError  if the contents cannot be decoded or split
        """
        # NOTE(review): `default=[]` is one shared list object that is handed
        # out to callers unchanged; confirm no caller mutates the result
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            # Only the first 512 bytes are inspected to find the encoding
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of the argument list `opts` in which the values of
        the login related options are replaced with 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the `--option=value` form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the `--option value` form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Load `args` (forwarded to init()) into a new child configuration
        and keep it unless init() reports it as already loaded."""
        config = type(self)(self.parser, label)
        # Shared, so that a file is loaded only once in the whole tree
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All arguments of the tree: those of the children (last added
        first), followed by the own parsed arguments."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Run the parser's `parse_known_args` on `all_args`."""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Run the parser's `parse_args` on `all_args`."""
        return self.parser.parse_args(self.all_args)
  4125. def merge_headers(*dicts):
  4126. """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
  4127. return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
  4128. def cached_method(f):
  4129. """Cache a method"""
  4130. signature = inspect.signature(f)
  4131. @functools.wraps(f)
  4132. def wrapper(self, *args, **kwargs):
  4133. bound_args = signature.bind(self, *args, **kwargs)
  4134. bound_args.apply_defaults()
  4135. key = tuple(bound_args.arguments.values())[1:]
  4136. cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
  4137. if key not in cache:
  4138. cache[key] = f(self, *args, **kwargs)
  4139. return cache[key]
  4140. return wrapper
  4141. class classproperty:
  4142. """property access for class methods with optional caching"""
  4143. def __new__(cls, func=None, *args, **kwargs):
  4144. if not func:
  4145. return functools.partial(cls, *args, **kwargs)
  4146. return super().__new__(cls)
  4147. def __init__(self, func, *, cache=False):
  4148. functools.update_wrapper(self, func)
  4149. self.func = func
  4150. self._cache = {} if cache else None
  4151. def __get__(self, _, cls):
  4152. if self._cache is None:
  4153. return self.func(cls)
  4154. elif cls not in self._cache:
  4155. self._cache[cls] = self.func(cls)
  4156. return self._cache[cls]
  4157. class function_with_repr:
  4158. def __init__(self, func, repr_=None):
  4159. functools.update_wrapper(self, func)
  4160. self.func, self.__repr = func, repr_
  4161. def __call__(self, *args, **kwargs):
  4162. return self.func(*args, **kwargs)
  4163. @classmethod
  4164. def set_repr(cls, repr_):
  4165. return functools.partial(cls, repr_=repr_)
  4166. def __repr__(self):
  4167. if self.__repr:
  4168. return self.__repr
  4169. return f'{self.func.__module__}.{self.func.__qualname__}'
  4170. class Namespace(types.SimpleNamespace):
  4171. """Immutable namespace"""
  4172. def __iter__(self):
  4173. return iter(self.__dict__.values())
  4174. @property
  4175. def items_(self):
  4176. return self.__dict__.items()
# File extensions (without the leading dot), grouped by kind of media
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video`/`audio` are made supersets of `common_video`/`common_audio`
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions in one tuple
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
class _UnsafeExtensionError(Exception):
    """
    Mitigation exception for uncommon/malicious file extensions
    This should be caught in YoutubeDL.py alongside a warning

    Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j
    """
    # Allowlist against which sanitize_extension() checks the (lower-cased)
    # last extension
    ALLOWED_EXTENSIONS = frozenset([
        # internal
        'description',
        'json',
        'meta',
        'orig',
        'part',
        'temp',
        'uncut',
        'unknown_video',
        'ytdl',

        # video
        *MEDIA_EXTENSIONS.video,
        'avif',
        'ismv',
        'm2ts',
        'm4s',
        'mng',
        'mpeg',
        'qt',
        'swf',
        'ts',
        'vp9',
        'wvm',

        # audio
        *MEDIA_EXTENSIONS.audio,
        'isma',
        'mid',
        'mpga',
        'ra',

        # image
        *MEDIA_EXTENSIONS.thumbnails,
        'bmp',
        'gif',
        'heic',
        'ico',
        'jng',
        'jpeg',
        'jxl',
        'svg',
        'tif',
        'wbmp',

        # subtitle
        *MEDIA_EXTENSIONS.subtitles,
        'dfxp',
        'fs',
        'ismt',
        'sami',
        'scc',
        'ssa',
        'tt',
        'ttml',

        # others
        *MEDIA_EXTENSIONS.manifests,
        *MEDIA_EXTENSIONS.storyboards,
        'desktop',
        'ism',
        'm3u',
        'sbv',
        'url',
        'webloc',
        'xml',
    ])

    def __init__(self, extension, /):
        """@param extension  the rejected extension, kept in `self.extension`"""
        super().__init__(f'unsafe file extension: {extension!r}')
        self.extension = extension

    @classmethod
    def sanitize_extension(cls, extension, /, *, prepend=False):
        """Return `extension` if it is safe to use, raise otherwise.

        @param extension  the extension to check; None is passed through
        @param prepend    if true, only path separators are rejected and the
                          allowlist is not consulted
        @returns          the extension, with a final `bin` part turned into
                          `unknown_video` (which then replaces the whole
                          extension)
        @raises _UnsafeExtensionError  if the extension contains a path
                          separator, or if its last dot-separated part is
                          not in ALLOWED_EXTENSIONS
        """
        if extension is None:
            return None

        # An extension must never be able to change the directory
        if '/' in extension or '\\' in extension:
            raise cls(extension)

        if not prepend:
            # Only the part after the last dot is checked
            _, _, last = extension.rpartition('.')
            if last == 'bin':
                extension = last = 'unknown_video'
            if last.lower() not in cls.ALLOWED_EXTENSIONS:
                raise cls(extension)

        return extension
class RetryManager:
    """Iterable that yields itself once per attempt for as long as the
    previous attempt reported an error and retries are left.

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # `attempt` counts the attempts started so far. `_error` holds the error
    # of the current attempt; NO_DEFAULT marks "no error reported", while the
    # initial None lets the first attempt through in _should_retry()
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        """
        @param _retries         number of retries allowed; a falsy value
                                counts as 0
        @param _error_callback  called as `(error, attempt, retries, **kwargs)`
                                after every attempt that reported an error
        @param kwargs           extra keyword arguments for the callback
        """
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        """Whether another attempt is to be made."""
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        """The error reported for the current attempt, or None."""
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset before each attempt: if the loop body does not set an
            # error, _should_retry() ends the iteration afterwards
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e           the error that occurred
        @param count       number of the attempt that failed
        @param retries     number of retries allowed
        @param sleep_func  delay before the next attempt: either a number or
                           a callable taking the keyword argument `n`
        @param info        callable used to announce the delay
        @param warn        callable used to announce the retry
        @param error       optional callable invoked once the retries are
                           used up; without it, `e` is raised instead
        @param suffix      optional text added to the retry message
        """
        if count > retries:
            # Out of retries
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
  4321. def make_archive_id(ie, video_id):
  4322. ie_key = ie if isinstance(ie, str) else ie.ie_key()
  4323. return f'{ie_key.lower()} {video_id}'
  4324. def truncate_string(s, left, right=0):
  4325. assert left > 3 and right >= 0
  4326. if s is None or len(s) <= left + right:
  4327. return s
  4328. return f'{s[:left - 3]}...{s[-right:] if right else ""}'
  4329. def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
  4330. assert 'all' in alias_dict, '"all" alias is required'
  4331. requested = list(start or [])
  4332. for val in options:
  4333. discard = val.startswith('-')
  4334. if discard:
  4335. val = val[1:]
  4336. if val in alias_dict:
  4337. val = alias_dict[val] if not discard else [
  4338. i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
  4339. # NB: Do not allow regex in aliases for performance
  4340. requested = orderedSet_from_options(val, alias_dict, start=requested)
  4341. continue
  4342. current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
  4343. else [val] if val in alias_dict['all'] else None)
  4344. if current is None:
  4345. raise ValueError(val)
  4346. if discard:
  4347. for item in current:
  4348. while item in requested:
  4349. requested.remove(item)
  4350. else:
  4351. requested.extend(current)
  4352. return orderedSet(requested)
  4353. # TODO: Rewrite
  4354. class FormatSorter:
  4355. regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
  4356. default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
  4357. 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
  4358. 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
  4359. ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
  4360. 'height', 'width', 'proto', 'vext', 'abr', 'aext',
  4361. 'fps', 'fs_approx', 'source', 'id')
  4362. settings = {
  4363. 'vcodec': {'type': 'ordered', 'regex': True,
  4364. 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
  4365. 'acodec': {'type': 'ordered', 'regex': True,
  4366. 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
  4367. 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
  4368. 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
  4369. 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
  4370. 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
  4371. 'vext': {'type': 'ordered', 'field': 'video_ext',
  4372. 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
  4373. 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
  4374. 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
  4375. 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
  4376. 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
  4377. 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
  4378. 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
  4379. 'field': ('vcodec', 'acodec'),
  4380. 'function': lambda it: int(any(v != 'none' for v in it))},
  4381. 'ie_pref': {'priority': True, 'type': 'extractor'},
  4382. 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
  4383. 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
  4384. 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
  4385. 'quality': {'convert': 'float', 'default': -1},
  4386. 'filesize': {'convert': 'bytes'},
  4387. 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
  4388. 'id': {'convert': 'string', 'field': 'format_id'},
  4389. 'height': {'convert': 'float_none'},
  4390. 'width': {'convert': 'float_none'},
  4391. 'fps': {'convert': 'float_none'},
  4392. 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
  4393. 'tbr': {'convert': 'float_none'},
  4394. 'vbr': {'convert': 'float_none'},
  4395. 'abr': {'convert': 'float_none'},
  4396. 'asr': {'convert': 'float_none'},
  4397. 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
  4398. 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
  4399. 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
  4400. 'function': lambda it: next(filter(None, it), None)},
  4401. 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
  4402. 'function': lambda it: next(filter(None, it), None)},
  4403. 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
  4404. 'res': {'type': 'multiple', 'field': ('height', 'width'),
  4405. 'function': lambda it: min(filter(None, it), default=0)},
  4406. # Actual field names
  4407. 'format_id': {'type': 'alias', 'field': 'id'},
  4408. 'preference': {'type': 'alias', 'field': 'ie_pref'},
  4409. 'language_preference': {'type': 'alias', 'field': 'lang'},
  4410. 'source_preference': {'type': 'alias', 'field': 'source'},
  4411. 'protocol': {'type': 'alias', 'field': 'proto'},
  4412. 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
  4413. 'audio_channels': {'type': 'alias', 'field': 'channels'},
  4414. # Deprecated
  4415. 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
  4416. 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
  4417. 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
  4418. 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
  4419. 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
  4420. 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
  4421. 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
  4422. 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
  4423. 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
  4424. 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
  4425. 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
  4426. 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
  4427. 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
  4428. 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
  4429. 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  4430. 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  4431. 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  4432. 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  4433. 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  4434. 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  4435. }
  4436. def __init__(self, ydl, field_preference):
  4437. self.ydl = ydl
  4438. self._order = []
  4439. self.evaluate_params(self.ydl.params, field_preference)
  4440. if ydl.params.get('verbose'):
  4441. self.print_verbose_info(self.ydl.write_debug)
  4442. def _get_field_setting(self, field, key):
  4443. if field not in self.settings:
  4444. if key in ('forced', 'priority'):
  4445. return False
  4446. self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
  4447. 'deprecated and may be removed in a future version')
  4448. self.settings[field] = {}
  4449. prop_obj = self.settings[field]
  4450. if key not in prop_obj:
  4451. type_ = prop_obj.get('type')
  4452. if key == 'field':
  4453. default = 'preference' if type_ == 'extractor' else (field,) if type_ in ('combined', 'multiple') else field
  4454. elif key == 'convert':
  4455. default = 'order' if type_ == 'ordered' else 'float_string' if field else 'ignore'
  4456. else:
  4457. default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key)
  4458. prop_obj[key] = default
  4459. return prop_obj[key]
  4460. def _resolve_field_value(self, field, value, convert_none=False):
  4461. if value is None:
  4462. if not convert_none:
  4463. return None
  4464. else:
  4465. value = value.lower()
  4466. conversion = self._get_field_setting(field, 'convert')
  4467. if conversion == 'ignore':
  4468. return None
  4469. if conversion == 'string':
  4470. return value
  4471. elif conversion == 'float_none':
  4472. return float_or_none(value)
  4473. elif conversion == 'bytes':
  4474. return parse_bytes(value)
  4475. elif conversion == 'order':
  4476. order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
  4477. use_regex = self._get_field_setting(field, 'regex')
  4478. list_length = len(order_list)
  4479. empty_pos = order_list.index('') if '' in order_list else list_length + 1
  4480. if use_regex and value is not None:
  4481. for i, regex in enumerate(order_list):
  4482. if regex and re.match(regex, value):
  4483. return list_length - i
  4484. return list_length - empty_pos # not in list
  4485. else: # not regex or value = None
  4486. return list_length - (order_list.index(value) if value in order_list else empty_pos)
  4487. else:
  4488. if value.isnumeric():
  4489. return float(value)
  4490. else:
  4491. self.settings[field]['convert'] = 'string'
  4492. return value
  4493. def evaluate_params(self, params, sort_extractor):
  4494. self._use_free_order = params.get('prefer_free_formats', False)
  4495. self._sort_user = params.get('format_sort', [])
  4496. self._sort_extractor = sort_extractor
  4497. def add_item(field, reverse, closest, limit_text):
  4498. field = field.lower()
  4499. if field in self._order:
  4500. return
  4501. self._order.append(field)
  4502. limit = self._resolve_field_value(field, limit_text)
  4503. data = {
  4504. 'reverse': reverse,
  4505. 'closest': False if limit is None else closest,
  4506. 'limit_text': limit_text,
  4507. 'limit': limit}
  4508. if field in self.settings:
  4509. self.settings[field].update(data)
  4510. else:
  4511. self.settings[field] = data
  4512. sort_list = (
  4513. tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
  4514. + (tuple() if params.get('format_sort_force', False)
  4515. else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
  4516. + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
  4517. for item in sort_list:
  4518. match = re.match(self.regex, item)
  4519. if match is None:
  4520. raise ExtractorError(f'Invalid format sort string "{item}" given by extractor')
  4521. field = match.group('field')
  4522. if field is None:
  4523. continue
  4524. if self._get_field_setting(field, 'type') == 'alias':
  4525. alias, field = field, self._get_field_setting(field, 'field')
  4526. if self._get_field_setting(alias, 'deprecated'):
  4527. self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
  4528. f'be removed in a future version. Please use {field} instead')
  4529. reverse = match.group('reverse') is not None
  4530. closest = match.group('separator') == '~'
  4531. limit_text = match.group('limit')
  4532. has_limit = limit_text is not None
  4533. has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
  4534. has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
  4535. fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
  4536. limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
  4537. limit_count = len(limits)
  4538. for (i, f) in enumerate(fields):
  4539. add_item(f, reverse, closest,
  4540. limits[i] if i < limit_count
  4541. else limits[0] if has_limit and not has_multiple_limits
  4542. else None)
  4543. def print_verbose_info(self, write_debug):
  4544. if self._sort_user:
  4545. write_debug('Sort order given by user: {}'.format(', '.join(self._sort_user)))
  4546. if self._sort_extractor:
  4547. write_debug('Sort order given by extractor: {}'.format(', '.join(self._sort_extractor)))
  4548. write_debug('Formats sorted by: {}'.format(', '.join(['{}{}{}'.format(
  4549. '+' if self._get_field_setting(field, 'reverse') else '', field,
  4550. '{}{}({})'.format('~' if self._get_field_setting(field, 'closest') else ':',
  4551. self._get_field_setting(field, 'limit_text'),
  4552. self._get_field_setting(field, 'limit'))
  4553. if self._get_field_setting(field, 'limit_text') is not None else '')
  4554. for field in self._order if self._get_field_setting(field, 'visible')])))
  4555. def _calculate_field_preference_from_value(self, format_, field, type_, value):
  4556. reverse = self._get_field_setting(field, 'reverse')
  4557. closest = self._get_field_setting(field, 'closest')
  4558. limit = self._get_field_setting(field, 'limit')
  4559. if type_ == 'extractor':
  4560. maximum = self._get_field_setting(field, 'max')
  4561. if value is None or (maximum is not None and value >= maximum):
  4562. value = -1
  4563. elif type_ == 'boolean':
  4564. in_list = self._get_field_setting(field, 'in_list')
  4565. not_in_list = self._get_field_setting(field, 'not_in_list')
  4566. value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
  4567. elif type_ == 'ordered':
  4568. value = self._resolve_field_value(field, value, True)
  4569. # try to convert to number
  4570. val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
  4571. is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
  4572. if is_num:
  4573. value = val_num
  4574. return ((-10, 0) if value is None
  4575. else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
  4576. else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
  4577. else (0, value, 0) if not reverse and (limit is None or value <= limit)
  4578. else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
  4579. else (-1, value, 0))
  4580. def _calculate_field_preference(self, format_, field):
  4581. type_ = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
  4582. get_value = lambda f: format_.get(self._get_field_setting(f, 'field'))
  4583. if type_ == 'multiple':
  4584. type_ = 'field' # Only 'field' is allowed in multiple for now
  4585. actual_fields = self._get_field_setting(field, 'field')
  4586. value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
  4587. else:
  4588. value = get_value(field)
  4589. return self._calculate_field_preference_from_value(format_, field, type_, value)
  4590. def calculate_preference(self, format):
  4591. # Determine missing protocol
  4592. if not format.get('protocol'):
  4593. format['protocol'] = determine_protocol(format)
  4594. # Determine missing ext
  4595. if not format.get('ext') and 'url' in format:
  4596. format['ext'] = determine_ext(format['url'])
  4597. if format.get('vcodec') == 'none':
  4598. format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
  4599. format['video_ext'] = 'none'
  4600. else:
  4601. format['video_ext'] = format['ext']
  4602. format['audio_ext'] = 'none'
  4603. # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
  4604. # format['preference'] = -1000
  4605. if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
  4606. # HEVC-over-FLV is out-of-spec by FLV's original spec
  4607. # ref. https://trac.ffmpeg.org/ticket/6389
  4608. # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
  4609. format['preference'] = -100
  4610. # Determine missing bitrates
  4611. if format.get('vcodec') == 'none':
  4612. format['vbr'] = 0
  4613. if format.get('acodec') == 'none':
  4614. format['abr'] = 0
  4615. if not format.get('vbr') and format.get('vcodec') != 'none':
  4616. format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
  4617. if not format.get('abr') and format.get('acodec') != 'none':
  4618. format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
  4619. if not format.get('tbr'):
  4620. format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
  4621. return tuple(self._calculate_field_preference(format, field) for field in self._order)
  4622. def filesize_from_tbr(tbr, duration):
  4623. """
  4624. @param tbr: Total bitrate in kbps (1000 bits/sec)
  4625. @param duration: Duration in seconds
  4626. @returns Filesize in bytes
  4627. """
  4628. if tbr is None or duration is None:
  4629. return None
  4630. return int(duration * tbr * (1000 / 8))
  4631. # XXX: Temporary
  4632. class _YDLLogger:
  4633. def __init__(self, ydl=None):
  4634. self._ydl = ydl
  4635. def debug(self, message):
  4636. if self._ydl:
  4637. self._ydl.write_debug(message)
  4638. def info(self, message):
  4639. if self._ydl:
  4640. self._ydl.to_screen(message)
  4641. def warning(self, message, *, once=False):
  4642. if self._ydl:
  4643. self._ydl.report_warning(message, once)
  4644. def error(self, message, *, is_error=True):
  4645. if self._ydl:
  4646. self._ydl.report_error(message, is_error=is_error)
  4647. def stdout(self, message):
  4648. if self._ydl:
  4649. self._ydl.to_stdout(message)
  4650. def stderr(self, message):
  4651. if self._ydl:
  4652. self._ydl.to_stderr(message)