_utils.py 184 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
7527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658
  1. import base64
  2. import binascii
  3. import calendar
  4. import codecs
  5. import collections
  6. import collections.abc
  7. import contextlib
  8. import datetime as dt
  9. import email.header
  10. import email.utils
  11. import errno
  12. import functools
  13. import hashlib
  14. import hmac
  15. import html.entities
  16. import html.parser
  17. import inspect
  18. import io
  19. import itertools
  20. import json
  21. import locale
  22. import math
  23. import mimetypes
  24. import netrc
  25. import operator
  26. import os
  27. import platform
  28. import random
  29. import re
  30. import shlex
  31. import socket
  32. import ssl
  33. import struct
  34. import subprocess
  35. import sys
  36. import tempfile
  37. import time
  38. import traceback
  39. import types
  40. import unicodedata
  41. import urllib.error
  42. import urllib.parse
  43. import urllib.request
  44. import xml.etree.ElementTree
  45. from . import traversal
  46. from ..compat import (
  47. compat_etree_fromstring,
  48. compat_expanduser,
  49. compat_HTMLParseError,
  50. )
  51. from ..dependencies import xattr
# Rebind this module's ``__name__`` to its parent package's dotted name by
# dropping the last component (everything after the final '.').
__name__ = __name__.rsplit('.', 1)[0]  # noqa: A001: Pretend to be the parent module
  53. class NO_DEFAULT:
  54. pass
  55. def IDENTITY(x):
  56. return x
# Full English month names, January first (index 0) through December (index 11).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code; every list has 12 entries in calendar order,
# so the index of a name plus one is the month number.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
# Timezone abbreviation -> offset from UTC in whole hours (negative = behind UTC).
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7,  # Pacific
}
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The two sequences are
# zipped positionally, so their order must stay in lockstep; multi-character
# replacements ('AE', 'OE', 'TH', 'ss', ...) are passed as one-element lists
# so that itertools.chain yields them as a single item.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
# Date/time layouts written with strptime-style ``%`` directives. These are the
# formats that are unambiguous about day/month order; the ambiguous numeric
# ones are added by DATE_FORMATS_DAY_FIRST / DATE_FORMATS_MONTH_FIRST.
# NOTE(review): presumably tried in order by the date parsing helpers elsewhere
# in this file - confirm before reordering entries.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)
# DATE_FORMATS plus the numeric layouts that read the first field as the day.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# DATE_FORMATS plus the numeric layouts that read the first field as the month.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
# Matches text of the form ``}('<payload>',<int>,<int>,'<words>'.split('|')``
# and captures the payload, the two integers and the '|'-joined word list.
# NOTE(review): looks like the argument list of packed/obfuscated JavaScript - confirm.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type is application/ld+json (optionally
# quoted) and captures its JSON object or array body as the group ``json_ld``.
# Case-insensitive, and '.' also matches newlines.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
# Unsigned integer or decimal number (digits, optionally followed by '.digits').
NUMBER_RE = r'\d+(?:\.\d+)?'
  150. @functools.cache
  151. def preferredencoding():
  152. """Get preferred encoding.
  153. Returns the best encoding scheme for the system, based on
  154. locale.getpreferredencoding() and some further tweaks.
  155. """
  156. try:
  157. pref = locale.getpreferredencoding()
  158. 'TEST'.encode(pref)
  159. except Exception:
  160. pref = 'UTF-8'
  161. return pref
  162. def write_json_file(obj, fn):
  163. """ Encode obj as JSON and write it to fn, atomically if possible """
  164. tf = tempfile.NamedTemporaryFile(
  165. prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
  166. suffix='.tmp', delete=False, mode='w', encoding='utf-8')
  167. try:
  168. with tf:
  169. json.dump(obj, tf, ensure_ascii=False)
  170. if sys.platform == 'win32':
  171. # Need to remove existing file on Windows, else os.rename raises
  172. # WindowsError or FileExistsError.
  173. with contextlib.suppress(OSError):
  174. os.unlink(fn)
  175. with contextlib.suppress(OSError):
  176. mask = os.umask(0)
  177. os.umask(mask)
  178. os.chmod(tf.name, 0o666 & ~mask)
  179. os.rename(tf.name, fn)
  180. except Exception:
  181. with contextlib.suppress(OSError):
  182. os.remove(tf.name)
  183. raise
  184. def partial_application(func):
  185. sig = inspect.signature(func)
  186. required_args = [
  187. param.name for param in sig.parameters.values()
  188. if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
  189. if param.default is inspect.Parameter.empty
  190. ]
  191. @functools.wraps(func)
  192. def wrapped(*args, **kwargs):
  193. if set(required_args[len(args):]).difference(kwargs):
  194. return functools.partial(func, *args, **kwargs)
  195. return func(*args, **kwargs)
  196. return wrapped
  197. def find_xpath_attr(node, xpath, key, val=None):
  198. """ Find the xpath xpath[@key=val] """
  199. assert re.match(r'^[a-zA-Z_-]+$', key)
  200. expr = xpath + (f'[@{key}]' if val is None else f"[@{key}='{val}']")
  201. return node.find(expr)
  202. # On python2.6 the xml.etree.ElementTree.Element methods don't support
  203. # the namespace parameter
  204. def xpath_with_ns(path, ns_map):
  205. components = [c.split(':') for c in path.split('/')]
  206. replaced = []
  207. for c in components:
  208. if len(c) == 1:
  209. replaced.append(c[0])
  210. else:
  211. ns, tag = c
  212. replaced.append(f'{{{ns_map[ns]}}}{tag}')
  213. return '/'.join(replaced)
  214. def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  215. def _find_xpath(xpath):
  216. return node.find(xpath)
  217. if isinstance(xpath, str):
  218. n = _find_xpath(xpath)
  219. else:
  220. for xp in xpath:
  221. n = _find_xpath(xp)
  222. if n is not None:
  223. break
  224. if n is None:
  225. if default is not NO_DEFAULT:
  226. return default
  227. elif fatal:
  228. name = xpath if name is None else name
  229. raise ExtractorError(f'Could not find XML element {name}')
  230. else:
  231. return None
  232. return n
  233. def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  234. n = xpath_element(node, xpath, name, fatal=fatal, default=default)
  235. if n is None or n == default:
  236. return n
  237. if n.text is None:
  238. if default is not NO_DEFAULT:
  239. return default
  240. elif fatal:
  241. name = xpath if name is None else name
  242. raise ExtractorError(f'Could not find XML element\'s text {name}')
  243. else:
  244. return None
  245. return n.text
  246. def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
  247. n = find_xpath_attr(node, xpath, key)
  248. if n is None:
  249. if default is not NO_DEFAULT:
  250. return default
  251. elif fatal:
  252. name = f'{xpath}[@{key}]' if name is None else name
  253. raise ExtractorError(f'Could not find XML attribute {name}')
  254. else:
  255. return None
  256. return n.attrib[key]
  257. def get_element_by_id(id, html, **kwargs):
  258. """Return the content of the tag with the specified ID in the passed HTML document"""
  259. return get_element_by_attribute('id', id, html, **kwargs)
  260. def get_element_html_by_id(id, html, **kwargs):
  261. """Return the html of the tag with the specified ID in the passed HTML document"""
  262. return get_element_html_by_attribute('id', id, html, **kwargs)
  263. def get_element_by_class(class_name, html):
  264. """Return the content of the first tag with the specified class in the passed HTML document"""
  265. retval = get_elements_by_class(class_name, html)
  266. return retval[0] if retval else None
  267. def get_element_html_by_class(class_name, html):
  268. """Return the html of the first tag with the specified class in the passed HTML document"""
  269. retval = get_elements_html_by_class(class_name, html)
  270. return retval[0] if retval else None
  271. def get_element_by_attribute(attribute, value, html, **kwargs):
  272. retval = get_elements_by_attribute(attribute, value, html, **kwargs)
  273. return retval[0] if retval else None
  274. def get_element_html_by_attribute(attribute, value, html, **kargs):
  275. retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
  276. return retval[0] if retval else None
  277. def get_elements_by_class(class_name, html, **kargs):
  278. """Return the content of all tags with the specified class in the passed HTML document as a list"""
  279. return get_elements_by_attribute(
  280. 'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
  281. html, escape_value=False)
  282. def get_elements_html_by_class(class_name, html):
  283. """Return the html of all tags with the specified class in the passed HTML document as a list"""
  284. return get_elements_html_by_attribute(
  285. 'class', rf'[^\'"]*(?<=[\'"\s]){re.escape(class_name)}(?=[\'"\s])[^\'"]*',
  286. html, escape_value=False)
  287. def get_elements_by_attribute(*args, **kwargs):
  288. """Return the content of the tag with the specified attribute in the passed HTML document"""
  289. return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  290. def get_elements_html_by_attribute(*args, **kwargs):
  291. """Return the html of the tag with the specified attribute in the passed HTML document"""
  292. return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator: it yields one ``(content, whole)`` tuple for every
    opening tag matched in the document, and yields nothing for a falsy value.

    @param attribute     Attribute name; it is regex-escaped before use
    @param value         Attribute value to look for
    @param html          The HTML document to search
    @param tag           Regex fragment that the tag name has to match
    @param escape_value  If False, `value` is inserted into the pattern as a
                         regex fragment instead of being escaped
    """
    if not value:
        return

    # The surrounding quotes are optional ('?') unless the value starts with
    # whitespace or one of the characters " ' ` = < >
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches only the beginning of the element: the tag name, any preceding
    # attributes, and the wanted attribute with its value
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
        (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
        \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
    '''

    for m in re.finditer(partial_element_re, html):
        # The full extent of the element is determined from where the match starts
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of matching quotes wrapping the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole,
        )
  313. class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
  314. """
  315. HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
  316. closing tag for the first opening tag it has encountered, and can be used
  317. as a context manager
  318. """
  319. class HTMLBreakOnClosingTagException(Exception):
  320. pass
  321. def __init__(self):
  322. self.tagstack = collections.deque()
  323. html.parser.HTMLParser.__init__(self)
  324. def __enter__(self):
  325. return self
  326. def __exit__(self, *_):
  327. self.close()
  328. def close(self):
  329. # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
  330. # so data remains buffered; we no longer have any interest in it, thus
  331. # override this method to discard it
  332. pass
  333. def handle_starttag(self, tag, _):
  334. self.tagstack.append(tag)
  335. def handle_endtag(self, tag):
  336. if not self.tagstack:
  337. raise compat_HTMLParseError('no tags in the stack')
  338. while self.tagstack:
  339. inner_tag = self.tagstack.pop()
  340. if inner_tag == tag:
  341. break
  342. else:
  343. raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
  344. if not self.tagstack:
  345. raise self.HTMLBreakOnClosingTagException
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @param tag   Tag name to look for
    @param html  The HTML document to search
    @returns     Tuple ``(content, whole)``
    @raises compat_HTMLParseError  if the opening tag, its terminating '>' or
                                   a matching closing tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given (pre-built) exception on failure
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Convert to an absolute index pointing just past the '>' of the opening tag
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed just the opening tag so that it ends up at the bottom of the tag stack
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document up to each successive closing tag until the parser
        # reports that the stack of open tags has been emptied
        while offset < len(html):
            # Indices are relative to `offset`
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # `offset` was not advanced for this chunk, so the relative
                # indices are still valid against it
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
  380. class HTMLAttributeParser(html.parser.HTMLParser):
  381. """Trivial HTML parser to gather the attributes for a single element"""
  382. def __init__(self):
  383. self.attrs = {}
  384. html.parser.HTMLParser.__init__(self)
  385. def handle_starttag(self, tag, attrs):
  386. self.attrs = dict(attrs)
  387. raise compat_HTMLParseError('done')
  388. class HTMLListAttrsParser(html.parser.HTMLParser):
  389. """HTML parser to gather the attributes for the elements of a list"""
  390. def __init__(self):
  391. html.parser.HTMLParser.__init__(self)
  392. self.items = []
  393. self._level = 0
  394. def handle_starttag(self, tag, attrs):
  395. if tag == 'li' and self._level == 0:
  396. self.items.append(dict(attrs))
  397. self._level += 1
  398. def handle_endtag(self, tag):
  399. self._level -= 1
  400. def extract_attributes(html_element):
  401. """Given a string for an HTML element such as
  402. <el
  403. a="foo" B="bar" c="&98;az" d=boz
  404. empty= noval entity="&amp;"
  405. sq='"' dq="'"
  406. >
  407. Decode and return a dictionary of attributes.
  408. {
  409. 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
  410. 'empty': '', 'noval': None, 'entity': '&',
  411. 'sq': '"', 'dq': '\''
  412. }.
  413. """
  414. parser = HTMLAttributeParser()
  415. with contextlib.suppress(compat_HTMLParseError):
  416. parser.feed(html_element)
  417. parser.close()
  418. return parser.attrs
  419. def parse_list(webpage):
  420. """Given a string for an series of HTML <li> elements,
  421. return a dictionary of their attributes"""
  422. parser = HTMLListAttrsParser()
  423. parser.feed(webpage)
  424. parser.close()
  425. return parser.items
  426. def clean_html(html):
  427. """Clean an HTML snippet into a readable string"""
  428. if html is None: # Convenience for sanitizing descriptions etc.
  429. return html
  430. html = re.sub(r'\s+', ' ', html)
  431. html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
  432. html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
  433. # Strip html tags
  434. html = re.sub('<.*?>', '', html)
  435. # Replace html entities
  436. html = unescapeHTML(html)
  437. return html.strip()
  438. class LenientJSONDecoder(json.JSONDecoder):
  439. # TODO: Write tests
  440. def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
  441. self.transform_source, self.ignore_extra = transform_source, ignore_extra
  442. self._close_attempts = 2 * close_objects
  443. super().__init__(*args, **kwargs)
  444. @staticmethod
  445. def _close_object(err):
  446. doc = err.doc[:err.pos]
  447. # We need to add comma first to get the correct error message
  448. if err.msg.startswith('Expecting \',\''):
  449. return doc + ','
  450. elif not doc.endswith(','):
  451. return
  452. if err.msg.startswith('Expecting property name'):
  453. return doc[:-1] + '}'
  454. elif err.msg.startswith('Expecting value'):
  455. return doc[:-1] + ']'
  456. def decode(self, s):
  457. if self.transform_source:
  458. s = self.transform_source(s)
  459. for attempt in range(self._close_attempts + 1):
  460. try:
  461. if self.ignore_extra:
  462. return self.raw_decode(s.lstrip())[0]
  463. return super().decode(s)
  464. except json.JSONDecodeError as e:
  465. if e.pos is None:
  466. raise
  467. elif attempt < self._close_attempts:
  468. s = self._close_object(e)
  469. if s is not None:
  470. continue
  471. raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
  472. assert False, 'Too many attempts to decode JSON'
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output instead of a file.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # At most two tries: the filename as given, then its sanitized form
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably LockingUnsupportedError is an OSError
                    # subclass so that the handler below falls back to open() - confirm
                    raise LockingUnsupportedError
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to opening the file without a lock
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second try, or right away on a permission error
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, so retrying would fail the same way
                raise
  506. def timeconvert(timestr):
  507. """Convert RFC 2822 defined time string into system timestamp"""
  508. timestamp = None
  509. timetuple = email.utils.parsedate_tz(timestr)
  510. if timetuple is not None:
  511. timestamp = email.utils.mktime_tz(timetuple)
  512. return timestamp
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' only for an empty input
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Map a single character to its replacement. Replacements marked with a
        # leading '\0' are "substitute chars" that get post-processed below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and ASCII control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Unicode categories C* (other) and M* (mark) are dropped, anything else is substituted
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers; fall back to '_' if nothing is left
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim them from both ends
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
  563. def _sanitize_path_parts(parts):
  564. sanitized_parts = []
  565. for part in parts:
  566. if not part or part == '.':
  567. continue
  568. elif part == '..':
  569. if sanitized_parts and sanitized_parts[-1] != '..':
  570. sanitized_parts.pop()
  571. sanitized_parts.append('..')
  572. continue
  573. # Replace invalid segments with `#`
  574. # - trailing dots and spaces (`asdf...` => `asdf..#`)
  575. # - invalid chars (`<>` => `##`)
  576. sanitized_part = re.sub(r'[/<>:"\|\\?\*]|[\s.]$', '#', part)
  577. sanitized_parts.append(sanitized_part)
  578. return sanitized_parts
  579. def sanitize_path(s, force=False):
  580. """Sanitizes and normalizes path on Windows"""
  581. if sys.platform != 'win32':
  582. if not force:
  583. return s
  584. root = '/' if s.startswith('/') else ''
  585. return root + '/'.join(_sanitize_path_parts(s.split('/')))
  586. normed = s.replace('/', '\\')
  587. if normed.startswith('\\\\'):
  588. # UNC path (`\\SERVER\SHARE`) or device path (`\\.`, `\\?`)
  589. parts = normed.split('\\')
  590. root = '\\'.join(parts[:4]) + '\\'
  591. parts = parts[4:]
  592. elif normed[1:2] == ':':
  593. # absolute path or drive relative path
  594. offset = 3 if normed[2:3] == '\\' else 2
  595. root = normed[:offset]
  596. parts = normed[offset:].split('\\')
  597. else:
  598. # relative/drive root relative path
  599. root = '\\' if normed[:1] == '\\' else ''
  600. parts = normed.split('\\')
  601. return root + '\\'.join(_sanitize_path_parts(parts))
  602. def sanitize_url(url, *, scheme='http'):
  603. # Prepend protocol-less URLs with `http:` scheme in order to mitigate
  604. # the number of unwanted failures due to missing protocol
  605. if url is None:
  606. return
  607. elif url.startswith('//'):
  608. return f'{scheme}:{url}'
  609. # Fix some common typos seen so far
  610. COMMON_TYPOS = (
  611. # https://github.com/ytdl-org/youtube-dl/issues/15649
  612. (r'^httpss://', r'https://'),
  613. # https://bx1.be/lives/direct-tv/
  614. (r'^rmtp([es]?)://', r'rtmp\1://'),
  615. )
  616. for mistake, fixup in COMMON_TYPOS:
  617. if re.match(mistake, url):
  618. return re.sub(mistake, fixup, url)
  619. return url
  620. def extract_basic_auth(url):
  621. parts = urllib.parse.urlsplit(url)
  622. if parts.username is None:
  623. return url, None
  624. url = urllib.parse.urlunsplit(parts._replace(netloc=(
  625. parts.hostname if parts.port is None
  626. else f'{parts.hostname}:{parts.port}')))
  627. auth_payload = base64.b64encode(
  628. ('{}:{}'.format(parts.username, parts.password or '')).encode())
  629. return url, f'Basic {auth_payload.decode()}'
  630. def expand_path(s):
  631. """Expand shell variables and ~"""
  632. return os.path.expandvars(compat_expanduser(s))
  633. def orderedSet(iterable, *, lazy=False):
  634. """Remove all duplicates from the input iterable"""
  635. def _iter():
  636. seen = [] # Do not use set since the items can be unhashable
  637. for x in iterable:
  638. if x not in seen:
  639. seen.append(x)
  640. yield x
  641. return _iter() if lazy else list(_iter())
  642. def _htmlentity_transform(entity_with_semicolon):
  643. """Transforms an HTML entity to a character."""
  644. entity = entity_with_semicolon[:-1]
  645. # Known non-numeric HTML entity
  646. if entity in html.entities.name2codepoint:
  647. return chr(html.entities.name2codepoint[entity])
  648. # TODO: HTML5 allows entities without a semicolon.
  649. # E.g. '&Eacuteric' should be decoded as 'Éric'.
  650. if entity_with_semicolon in html.entities.html5:
  651. return html.entities.html5[entity_with_semicolon]
  652. mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
  653. if mobj is not None:
  654. numstr = mobj.group(1)
  655. if numstr.startswith('x'):
  656. base = 16
  657. numstr = f'0{numstr}'
  658. else:
  659. base = 10
  660. # See https://github.com/ytdl-org/youtube-dl/issues/7518
  661. with contextlib.suppress(ValueError):
  662. return chr(int(numstr, base))
  663. # Unknown entity in name, return its literal representation
  664. return f'&{entity};'
  665. def unescapeHTML(s):
  666. if s is None:
  667. return None
  668. assert isinstance(s, str)
  669. return re.sub(
  670. r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  671. def escapeHTML(text):
  672. return (
  673. text
  674. .replace('&', '&amp;')
  675. .replace('<', '&lt;')
  676. .replace('>', '&gt;')
  677. .replace('"', '&quot;')
  678. .replace("'", '&#39;')
  679. )
  680. class netrc_from_content(netrc.netrc):
  681. def __init__(self, content):
  682. self.hosts, self.macros = {}, {}
  683. with io.StringIO(content) as stream:
  684. self._parse('-', stream, False)
class Popen(subprocess.Popen):
    """
    subprocess.Popen wrapper that adjusts the environment for PyInstaller
    bundles, defaults text mode to utf-8 and builds the `cmd.exe` command line
    itself for shell=True on Windows
    """
    # Passed as `startupinfo` to every process; only set on Windows
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_issues(env):
        """Adjust `env` in place when running from a PyInstaller bundle; no-op otherwise"""
        if not hasattr(sys, '_MEIPASS'):
            return

        # Force spawning independent subprocesses for exes bundled with PyInstaller>=6.10
        # Ref: https://pyinstaller.org/en/v6.10.0/CHANGES.html#incompatible-changes
        #      https://github.com/yt-dlp/yt-dlp/issues/11259
        env['PYINSTALLER_RESET_ENVIRONMENT'] = '1'

        # Restore LD_LIBRARY_PATH when using PyInstaller
        # Ref: https://pyinstaller.org/en/v6.10.0/runtime-information.html#ld-library-path-libpath-considerations
        #      https://github.com/yt-dlp/yt-dlp/issues/4573
        def _fix(key):
            # Restore `key` from `<key>_ORIG`, or remove it if there is no such backup
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        """
        @param env    Environment for the child; defaults to a copy of os.environ.
                      NOTE: a mapping passed in by the caller is modified in place
        @param text   If True, decode output as utf-8 with errors='replace'
                      unless `encoding`/`errors` are given explicitly
        @param shell  Run through the shell. On Windows (when no `executable`
                      is given) the `cmd.exe` command line is built here and
                      shell=False is passed on to subprocess
        Remaining arguments are passed on to subprocess.Popen
        """
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_issues(env)

        # Remembered so that run() knows whether to default to '' or b''
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and os.name == 'nt' and kwargs.get('executable') is None:
            if not isinstance(args, str):
                args = shell_quote(args, shell=True)
            shell = False
            # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
            env['='] = '"^\n\n"'
            args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        """Return the absolute path of the command interpreter, from %ComSpec% or %SystemRoot%

        Raises FileNotFoundError if neither yields an absolute path.
        """
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless `timeout` is 0, also wait(timeout=timeout) for it"""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """
        Run a process to completion

        @param timeout  Passed to communicate()
        Other arguments are passed to the constructor
        @returns  Tuple (stdout, stderr, returncode); empty str/bytes
                  (depending on text mode) are substituted for falsy output
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
  749. def encodeArgument(s):
  750. # Legacy code that uses byte strings
  751. # Uncomment the following line after fixing all post processors
  752. # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
  753. return s if isinstance(s, str) else s.decode('ascii')
  754. _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
  755. def timetuple_from_msec(msec):
  756. secs, msec = divmod(msec, 1000)
  757. mins, secs = divmod(secs, 60)
  758. hrs, mins = divmod(mins, 60)
  759. return _timetuple(hrs, mins, secs, msec)
  760. def formatSeconds(secs, delim=':', msec=False):
  761. time = timetuple_from_msec(secs * 1000)
  762. if time.hours:
  763. ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
  764. elif time.minutes:
  765. ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
  766. else:
  767. ret = '%d' % time.seconds
  768. return '%s.%03d' % (ret, time.milliseconds) if msec else ret
  769. def bug_reports_message(before=';'):
  770. from ..update import REPOSITORY
  771. msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
  772. 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
  773. before = before.rstrip()
  774. if not before or before.endswith(('.', '!', '?')):
  775. msg = msg[0].title() + msg[1:]
  776. return (before + ' ' if before else '') + msg
  777. class YoutubeDLError(Exception):
  778. """Base exception for YoutubeDL errors."""
  779. msg = None
  780. def __init__(self, msg=None):
  781. if msg is not None:
  782. self.msg = msg
  783. elif self.msg is None:
  784. self.msg = type(self).__name__
  785. super().__init__(self.msg)
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause, video_id and ie, if given, are included in the final message.
        """
        from ..networking.exceptions import network_exceptions
        # An error raised while a network exception is being handled is always treated as expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # While `msg` is still unset (None), __setattr__ does not rebuild the message
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Take over the exc_info recorded by the ExtractorError currently being handled
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # The full message: "[ie] video_id: orig_msg (caused by ...)" plus the
        # bug report request for unexpected errors
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted `traceback` and/or `cause` traceback joined by a newline, or None"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once the message has been built, keep `msg` and `args` in sync with
        # any attribute that is changed afterwards
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
  823. class UnsupportedError(ExtractorError):
  824. def __init__(self, url):
  825. super().__init__(
  826. f'Unsupported URL: {url}', expected=True)
  827. self.url = url
  828. class RegexNotFoundError(ExtractorError):
  829. """Error when a regex didn't match"""
  830. pass
  831. class GeoRestrictedError(ExtractorError):
  832. """Geographic restriction Error exception.
  833. This exception may be thrown when a video is not available from your
  834. geographic location due to geographic restrictions imposed by a website.
  835. """
  836. def __init__(self, msg, countries=None, **kwargs):
  837. kwargs['expected'] = True
  838. super().__init__(msg, **kwargs)
  839. self.countries = countries
  840. class UserNotLive(ExtractorError):
  841. """Error when a channel/user is not live"""
  842. def __init__(self, msg=None, **kwargs):
  843. kwargs['expected'] = True
  844. super().__init__(msg or 'The channel is not currently live', **kwargs)
  845. class DownloadError(YoutubeDLError):
  846. """Download Error exception.
  847. This exception may be thrown by FileDownloader objects if they are not
  848. configured to continue on errors. They will contain the appropriate
  849. error message.
  850. """
  851. def __init__(self, msg, exc_info=None):
  852. """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
  853. super().__init__(msg)
  854. self.exc_info = exc_info
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Message used when the exception is created without an explicit one
    msg = 'Entry not found in info'
  861. class SameFileError(YoutubeDLError):
  862. """Same File exception.
  863. This exception will be thrown by FileDownloader objects if they detect
  864. multiple files would have to be downloaded to the same file on disk.
  865. """
  866. msg = 'Fixed output name but more than one file to download'
  867. def __init__(self, filename=None):
  868. if filename is not None:
  869. self.msg += f': {filename}'
  870. super().__init__(self.msg)
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    It adds no behaviour of its own to YoutubeDLError.
    """
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Message used when the exception is created without an explicit one
    msg = 'The download was cancelled'
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Message used when the exception is created without an explicit one
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    # Message used when the exception is created without an explicit one
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Message used when the exception is created without an explicit one
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
  888. class ReExtractInfo(YoutubeDLError):
  889. """ Video info needs to be re-extracted. """
  890. def __init__(self, msg, expected=False):
  891. super().__init__(msg)
  892. self.expected = expected
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Takes no arguments: always uses the class message and expected=False
        super().__init__(self.msg, expected=False)
  898. class UnavailableVideoError(YoutubeDLError):
  899. """Unavailable Format exception.
  900. This exception will be thrown when a video is requested
  901. in a format that is not available for that video.
  902. """
  903. msg = 'Unable to download video'
  904. def __init__(self, err=None):
  905. if err is not None:
  906. self.msg += f': {err}'
  907. super().__init__(self.msg)
  908. class ContentTooShortError(YoutubeDLError):
  909. """Content Too Short exception.
  910. This exception may be raised by FileDownloader objects when a file they
  911. download is too small for what the server announced first, indicating
  912. the connection was probably interrupted.
  913. """
  914. def __init__(self, downloaded, expected):
  915. super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
  916. # Both in bytes
  917. self.downloaded = downloaded
  918. self.expected = expected
  919. class XAttrMetadataError(YoutubeDLError):
  920. def __init__(self, code=None, msg='Unknown error'):
  921. super().__init__(msg)
  922. self.code = code
  923. self.msg = msg
  924. # Parsing code and msg
  925. if (self.code in (errno.ENOSPC, errno.EDQUOT)
  926. or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
  927. self.reason = 'NO_SPACE'
  928. elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
  929. self.reason = 'VALUE_TOO_LONG'
  930. else:
  931. self.reason = 'NOT_SUPPORTED'
  932. class XAttrUnavailableError(YoutubeDLError):
  933. pass
  934. def is_path_like(f):
  935. return isinstance(f, (str, bytes, os.PathLike))
def extract_timezone(date_str, default=None):
    """
    Split a trailing timezone off a date string

    @param date_str  The date string
    @param default   Offset to use if no timezone is found. A falsy value
                     means a zero offset; with NO_DEFAULT, None is returned
                     as the offset instead
    @returns         Tuple (timezone, date_str): the UTC offset as a
                     dt.timedelta (or None, see above) and the date string
                     with the recognised timezone removed
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    timezone = None

    if not m:
        # No numeric offset: look for a timezone abbreviation following a time
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        # `m and ...` evaluates to None if there is no match
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            # Only a known abbreviation is removed from the string
            date_str = date_str[:-len(m.group('tz'))]
            # NOTE(review): TIMEZONE_NAMES values are used as hours here
            timezone = dt.timedelta(hours=timezone)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        # No sign is captured for a bare `Z`; the default below then applies
        if m.group('sign'):
            sign = 1 if m.group('sign') == '+' else -1
            timezone = dt.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))

    if timezone is None and default is not NO_DEFAULT:
        timezone = default or dt.timedelta()

    return timezone, date_str
  965. @partial_application
  966. def parse_iso8601(date_str, delimiter='T', timezone=None):
  967. """ Return a UNIX timestamp from the given date """
  968. if date_str is None:
  969. return None
  970. date_str = re.sub(r'\.[0-9]+', '', date_str)
  971. timezone, date_str = extract_timezone(date_str, timezone)
  972. with contextlib.suppress(ValueError, TypeError):
  973. date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
  974. dt_ = dt.datetime.strptime(date_str, date_format) - timezone
  975. return calendar.timegm(dt_.timetuple())
  976. def date_formats(day_first=True):
  977. return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    @param day_first  Selects the list of formats to try, see date_formats()
    Returns None if `date_str` is None or cannot be parsed.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    # Only the stripped string is needed; the offset is discarded
    _, date_str = extract_timezone(date_str)

    # Every format is tried; if several match, the last one wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to the email (RFC 2822) date parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a date string in one of the known formats

    @param day_first  Selects the list of formats to try, see date_formats()
    Returns None if `date_str` is not a str or cannot be parsed.
    """
    if not isinstance(date_str, str):
        return None

    # Remove commas, pipes and weekday names, then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    # Hours to add if the string contains "PM"
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
            return calendar.timegm(dt_.timetuple())

    # Fall back to the email (RFC 2822) date parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
  1022. @partial_application
  1023. def determine_ext(url, default_ext='unknown_video'):
  1024. if url is None or '.' not in url:
  1025. return default_ext
  1026. guess = url.partition('?')[0].rpartition('.')[2]
  1027. if re.match(r'^[A-Za-z0-9]+$', guess):
  1028. return guess
  1029. # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
  1030. elif guess.rstrip('/') in KNOWN_EXTENSIONS:
  1031. return guess.rstrip('/')
  1032. else:
  1033. return default_ext
  1034. def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
  1035. return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
  1036. def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
  1037. R"""
  1038. Return a datetime object from a string.
  1039. Supported format:
  1040. (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
  1041. @param format strftime format of DATE
  1042. @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
  1043. auto: round to the unit provided in date_str (if applicable).
  1044. """
  1045. auto_precision = False
  1046. if precision == 'auto':
  1047. auto_precision = True
  1048. precision = 'microsecond'
  1049. today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
  1050. if date_str in ('now', 'today'):
  1051. return today
  1052. if date_str == 'yesterday':
  1053. return today - dt.timedelta(days=1)
  1054. match = re.match(
  1055. r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
  1056. date_str)
  1057. if match is not None:
  1058. start_time = datetime_from_str(match.group('start'), precision, format)
  1059. time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
  1060. unit = match.group('unit')
  1061. if unit == 'month' or unit == 'year':
  1062. new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
  1063. unit = 'day'
  1064. else:
  1065. if unit == 'week':
  1066. unit = 'day'
  1067. time *= 7
  1068. delta = dt.timedelta(**{unit + 's': time})
  1069. new_date = start_time + delta
  1070. if auto_precision:
  1071. return datetime_round(new_date, unit)
  1072. return new_date
  1073. return datetime_round(dt.datetime.strptime(date_str, format), precision)
  1074. def date_from_str(date_str, format='%Y%m%d', strict=False):
  1075. R"""
  1076. Return a date object from a string using datetime_from_str
  1077. @param strict Restrict allowed patterns to "YYYYMMDD" and
  1078. (now|today|yesterday)(-\d+(day|week|month|year)s?)?
  1079. """
  1080. if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
  1081. raise ValueError(f'Invalid date format "{date_str}"')
  1082. return datetime_from_str(date_str, precision='microsecond', format=format).date()
  1083. def datetime_add_months(dt_, months):
  1084. """Increment/Decrement a datetime object by months."""
  1085. month = dt_.month + months - 1
  1086. year = dt_.year + month // 12
  1087. month = month % 12 + 1
  1088. day = min(dt_.day, calendar.monthrange(year, month)[1])
  1089. return dt_.replace(year, month, day)
  1090. def datetime_round(dt_, precision='day'):
  1091. """
  1092. Round a datetime object's time to a specific precision
  1093. """
  1094. if precision == 'microsecond':
  1095. return dt_
  1096. unit_seconds = {
  1097. 'day': 86400,
  1098. 'hour': 3600,
  1099. 'minute': 60,
  1100. 'second': 1,
  1101. }
  1102. roundto = lambda x, n: ((x + n / 2) // n) * n
  1103. timestamp = roundto(calendar.timegm(dt_.timetuple()), unit_seconds[precision])
  1104. return dt.datetime.fromtimestamp(timestamp, dt.timezone.utc)
  1105. def hyphenate_date(date_str):
  1106. """
  1107. Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
  1108. match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
  1109. if match is not None:
  1110. return '-'.join(match.groups())
  1111. else:
  1112. return date_str
  1113. class DateRange:
  1114. """Represents a time interval between two dates"""
  1115. def __init__(self, start=None, end=None):
  1116. """start and end must be strings in the format accepted by date"""
  1117. if start is not None:
  1118. self.start = date_from_str(start, strict=True)
  1119. else:
  1120. self.start = dt.datetime.min.date()
  1121. if end is not None:
  1122. self.end = date_from_str(end, strict=True)
  1123. else:
  1124. self.end = dt.datetime.max.date()
  1125. if self.start > self.end:
  1126. raise ValueError(f'Date range: "{self}" , the start date must be before the end date')
  1127. @classmethod
  1128. def day(cls, day):
  1129. """Returns a range that only contains the given day"""
  1130. return cls(day, day)
  1131. def __contains__(self, date):
  1132. """Check if the date is in the range"""
  1133. if not isinstance(date, dt.date):
  1134. date = date_from_str(date)
  1135. return self.start <= date <= self.end
  1136. def __repr__(self):
  1137. return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
  1138. def __str__(self):
  1139. return f'{self.start} to {self.end}'
  1140. def __eq__(self, other):
  1141. return (isinstance(other, DateRange)
  1142. and self.start == other.start and self.end == other.end)
  1143. @functools.cache
  1144. def system_identifier():
  1145. python_implementation = platform.python_implementation()
  1146. if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
  1147. python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
  1148. libc_ver = []
  1149. with contextlib.suppress(OSError): # We may not have access to the executable
  1150. libc_ver = platform.libc_ver()
  1151. return 'Python {} ({} {} {}) - {} ({}{})'.format(
  1152. platform.python_version(),
  1153. python_implementation,
  1154. platform.machine(),
  1155. platform.architecture()[0],
  1156. platform.platform(),
  1157. ssl.OPENSSL_VERSION,
  1158. format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
  1159. )
  1160. @functools.cache
  1161. def get_windows_version():
  1162. """ Get Windows version. returns () if it's not running on Windows """
  1163. if os.name == 'nt':
  1164. return version_tuple(platform.win32_ver()[1])
  1165. else:
  1166. return ()
  1167. def write_string(s, out=None, encoding=None):
  1168. assert isinstance(s, str)
  1169. out = out or sys.stderr
  1170. # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
  1171. if not out:
  1172. return
  1173. if os.name == 'nt' and supports_terminal_sequences(out):
  1174. s = re.sub(r'([\r\n]+)', r' \1', s)
  1175. enc, buffer = None, out
  1176. # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
  1177. if 'b' in (getattr(out, 'mode', None) or ''):
  1178. enc = encoding or preferredencoding()
  1179. elif hasattr(out, 'buffer'):
  1180. buffer = out.buffer
  1181. enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
  1182. buffer.write(s.encode(enc, 'ignore') if enc else s)
  1183. out.flush()
  1184. # TODO: Use global logger
  1185. def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
  1186. from .. import _IN_CLI
  1187. if _IN_CLI:
  1188. if msg in deprecation_warning._cache:
  1189. return
  1190. deprecation_warning._cache.add(msg)
  1191. if printer:
  1192. return printer(f'{msg}{bug_reports_message()}', **kwargs)
  1193. return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
  1194. else:
  1195. import warnings
  1196. warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
  1197. deprecation_warning._cache = set()
  1198. class LockingUnsupportedError(OSError):
  1199. msg = 'File locking is not supported'
  1200. def __init__(self):
  1201. super().__init__(self.msg)
  1202. # Cross-platform file locking
  1203. if sys.platform == 'win32':
  1204. import ctypes
  1205. import ctypes.wintypes
  1206. import msvcrt
  1207. class OVERLAPPED(ctypes.Structure):
  1208. _fields_ = [
  1209. ('Internal', ctypes.wintypes.LPVOID),
  1210. ('InternalHigh', ctypes.wintypes.LPVOID),
  1211. ('Offset', ctypes.wintypes.DWORD),
  1212. ('OffsetHigh', ctypes.wintypes.DWORD),
  1213. ('hEvent', ctypes.wintypes.HANDLE),
  1214. ]
  1215. kernel32 = ctypes.WinDLL('kernel32')
  1216. LockFileEx = kernel32.LockFileEx
  1217. LockFileEx.argtypes = [
  1218. ctypes.wintypes.HANDLE, # hFile
  1219. ctypes.wintypes.DWORD, # dwFlags
  1220. ctypes.wintypes.DWORD, # dwReserved
  1221. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1222. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1223. ctypes.POINTER(OVERLAPPED), # Overlapped
  1224. ]
  1225. LockFileEx.restype = ctypes.wintypes.BOOL
  1226. UnlockFileEx = kernel32.UnlockFileEx
  1227. UnlockFileEx.argtypes = [
  1228. ctypes.wintypes.HANDLE, # hFile
  1229. ctypes.wintypes.DWORD, # dwReserved
  1230. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1231. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1232. ctypes.POINTER(OVERLAPPED), # Overlapped
  1233. ]
  1234. UnlockFileEx.restype = ctypes.wintypes.BOOL
  1235. whole_low = 0xffffffff
  1236. whole_high = 0x7fffffff
  1237. def _lock_file(f, exclusive, block):
  1238. overlapped = OVERLAPPED()
  1239. overlapped.Offset = 0
  1240. overlapped.OffsetHigh = 0
  1241. overlapped.hEvent = 0
  1242. f._lock_file_overlapped_p = ctypes.pointer(overlapped)
  1243. if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
  1244. (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
  1245. 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1246. # NB: No argument form of "ctypes.FormatError" does not work on PyPy
  1247. raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
  1248. def _unlock_file(f):
  1249. assert f._lock_file_overlapped_p
  1250. handle = msvcrt.get_osfhandle(f.fileno())
  1251. if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1252. raise OSError(f'Unlocking file failed: {ctypes.FormatError()!r}')
  1253. else:
  1254. try:
  1255. import fcntl
  1256. def _lock_file(f, exclusive, block):
  1257. flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
  1258. if not block:
  1259. flags |= fcntl.LOCK_NB
  1260. try:
  1261. fcntl.flock(f, flags)
  1262. except BlockingIOError:
  1263. raise
  1264. except OSError: # AOSP does not have flock()
  1265. fcntl.lockf(f, flags)
  1266. def _unlock_file(f):
  1267. with contextlib.suppress(OSError):
  1268. return fcntl.flock(f, fcntl.LOCK_UN)
  1269. with contextlib.suppress(OSError):
  1270. return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
  1271. return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
  1272. except ImportError:
  1273. def _lock_file(f, exclusive, block):
  1274. raise LockingUnsupportedError
  1275. def _unlock_file(f):
  1276. raise LockingUnsupportedError
  1277. class locked_file:
  1278. locked = False
  1279. def __init__(self, filename, mode, block=True, encoding=None):
  1280. if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
  1281. raise NotImplementedError(mode)
  1282. self.mode, self.block = mode, block
  1283. writable = any(f in mode for f in 'wax+')
  1284. readable = any(f in mode for f in 'r+')
  1285. flags = functools.reduce(operator.ior, (
  1286. getattr(os, 'O_CLOEXEC', 0), # UNIX only
  1287. getattr(os, 'O_BINARY', 0), # Windows only
  1288. getattr(os, 'O_NOINHERIT', 0), # Windows only
  1289. os.O_CREAT if writable else 0, # O_TRUNC only after locking
  1290. os.O_APPEND if 'a' in mode else 0,
  1291. os.O_EXCL if 'x' in mode else 0,
  1292. os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
  1293. ))
  1294. self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
  1295. def __enter__(self):
  1296. exclusive = 'r' not in self.mode
  1297. try:
  1298. _lock_file(self.f, exclusive, self.block)
  1299. self.locked = True
  1300. except OSError:
  1301. self.f.close()
  1302. raise
  1303. if 'w' in self.mode:
  1304. try:
  1305. self.f.truncate()
  1306. except OSError as e:
  1307. if e.errno not in (
  1308. errno.ESPIPE, # Illegal seek - expected for FIFO
  1309. errno.EINVAL, # Invalid argument - expected for /dev/null
  1310. ):
  1311. raise
  1312. return self
  1313. def unlock(self):
  1314. if not self.locked:
  1315. return
  1316. try:
  1317. _unlock_file(self.f)
  1318. finally:
  1319. self.locked = False
  1320. def __exit__(self, *_):
  1321. try:
  1322. self.unlock()
  1323. finally:
  1324. self.f.close()
  1325. open = __enter__
  1326. close = __exit__
  1327. def __getattr__(self, attr):
  1328. return getattr(self.f, attr)
  1329. def __iter__(self):
  1330. return iter(self.f)
  1331. @functools.cache
  1332. def get_filesystem_encoding():
  1333. encoding = sys.getfilesystemencoding()
  1334. return encoding if encoding is not None else 'utf-8'
  1335. _WINDOWS_QUOTE_TRANS = str.maketrans({'"': R'\"'})
  1336. _CMD_QUOTE_TRANS = str.maketrans({
  1337. # Keep quotes balanced by replacing them with `""` instead of `\\"`
  1338. '"': '""',
  1339. # These require an env-variable `=` containing `"^\n\n"` (set in `utils.Popen`)
  1340. # `=` should be unique since variables containing `=` cannot be set using cmd
  1341. '\n': '%=%',
  1342. '\r': '%=%',
  1343. # Use zero length variable replacement so `%` doesn't get expanded
  1344. # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
  1345. '%': '%%cd:~,%',
  1346. })
  1347. def shell_quote(args, *, shell=False):
  1348. args = list(variadic(args))
  1349. if os.name != 'nt':
  1350. return shlex.join(args)
  1351. trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS
  1352. return ' '.join(
  1353. s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII)
  1354. else re.sub(r'(\\+)("|$)', r'\1\1\2', s).translate(trans).join('""')
  1355. for s in args)
  1356. def smuggle_url(url, data):
  1357. """ Pass additional data in a URL for internal use. """
  1358. url, idata = unsmuggle_url(url, {})
  1359. data.update(idata)
  1360. sdata = urllib.parse.urlencode(
  1361. {'__youtubedl_smuggle': json.dumps(data)})
  1362. return url + '#' + sdata
  1363. def unsmuggle_url(smug_url, default=None):
  1364. if '#__youtubedl_smuggle' not in smug_url:
  1365. return smug_url, default
  1366. url, _, sdata = smug_url.rpartition('#')
  1367. jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
  1368. data = json.loads(jsond)
  1369. return url, data
  1370. def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
  1371. """ Formats numbers with decimal sufixes like K, M, etc """
  1372. num, factor = float_or_none(num), float(factor)
  1373. if num is None or num < 0:
  1374. return None
  1375. POSSIBLE_SUFFIXES = 'kMGTPEZY'
  1376. exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
  1377. suffix = ['', *POSSIBLE_SUFFIXES][exponent]
  1378. if factor == 1024:
  1379. suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
  1380. converted = num / (factor ** exponent)
  1381. return fmt % (converted, suffix)
  1382. def format_bytes(bytes):
  1383. return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
  1384. def lookup_unit_table(unit_table, s, strict=False):
  1385. num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
  1386. units_re = '|'.join(re.escape(u) for u in unit_table)
  1387. m = (re.fullmatch if strict else re.match)(
  1388. rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
  1389. if not m:
  1390. return None
  1391. num = float(m.group('num').replace(',', '.'))
  1392. mult = unit_table[m.group('unit')]
  1393. return round(num * mult)
  1394. def parse_bytes(s):
  1395. """Parse a string indicating a byte quantity into an integer"""
  1396. return lookup_unit_table(
  1397. {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
  1398. s.upper(), strict=True)
  1399. def parse_filesize(s):
  1400. if s is None:
  1401. return None
  1402. # The lower-case forms are of course incorrect and unofficial,
  1403. # but we support those too
  1404. _UNIT_TABLE = {
  1405. 'B': 1,
  1406. 'b': 1,
  1407. 'bytes': 1,
  1408. 'KiB': 1024,
  1409. 'KB': 1000,
  1410. 'kB': 1024,
  1411. 'Kb': 1000,
  1412. 'kb': 1000,
  1413. 'kilobytes': 1000,
  1414. 'kibibytes': 1024,
  1415. 'MiB': 1024 ** 2,
  1416. 'MB': 1000 ** 2,
  1417. 'mB': 1024 ** 2,
  1418. 'Mb': 1000 ** 2,
  1419. 'mb': 1000 ** 2,
  1420. 'megabytes': 1000 ** 2,
  1421. 'mebibytes': 1024 ** 2,
  1422. 'GiB': 1024 ** 3,
  1423. 'GB': 1000 ** 3,
  1424. 'gB': 1024 ** 3,
  1425. 'Gb': 1000 ** 3,
  1426. 'gb': 1000 ** 3,
  1427. 'gigabytes': 1000 ** 3,
  1428. 'gibibytes': 1024 ** 3,
  1429. 'TiB': 1024 ** 4,
  1430. 'TB': 1000 ** 4,
  1431. 'tB': 1024 ** 4,
  1432. 'Tb': 1000 ** 4,
  1433. 'tb': 1000 ** 4,
  1434. 'terabytes': 1000 ** 4,
  1435. 'tebibytes': 1024 ** 4,
  1436. 'PiB': 1024 ** 5,
  1437. 'PB': 1000 ** 5,
  1438. 'pB': 1024 ** 5,
  1439. 'Pb': 1000 ** 5,
  1440. 'pb': 1000 ** 5,
  1441. 'petabytes': 1000 ** 5,
  1442. 'pebibytes': 1024 ** 5,
  1443. 'EiB': 1024 ** 6,
  1444. 'EB': 1000 ** 6,
  1445. 'eB': 1024 ** 6,
  1446. 'Eb': 1000 ** 6,
  1447. 'eb': 1000 ** 6,
  1448. 'exabytes': 1000 ** 6,
  1449. 'exbibytes': 1024 ** 6,
  1450. 'ZiB': 1024 ** 7,
  1451. 'ZB': 1000 ** 7,
  1452. 'zB': 1024 ** 7,
  1453. 'Zb': 1000 ** 7,
  1454. 'zb': 1000 ** 7,
  1455. 'zettabytes': 1000 ** 7,
  1456. 'zebibytes': 1024 ** 7,
  1457. 'YiB': 1024 ** 8,
  1458. 'YB': 1000 ** 8,
  1459. 'yB': 1024 ** 8,
  1460. 'Yb': 1000 ** 8,
  1461. 'yb': 1000 ** 8,
  1462. 'yottabytes': 1000 ** 8,
  1463. 'yobibytes': 1024 ** 8,
  1464. }
  1465. return lookup_unit_table(_UNIT_TABLE, s)
  1466. def parse_count(s):
  1467. if s is None:
  1468. return None
  1469. s = re.sub(r'^[^\d]+\s', '', s).strip()
  1470. if re.match(r'^[\d,.]+$', s):
  1471. return str_to_int(s)
  1472. _UNIT_TABLE = {
  1473. 'k': 1000,
  1474. 'K': 1000,
  1475. 'm': 1000 ** 2,
  1476. 'M': 1000 ** 2,
  1477. 'kk': 1000 ** 2,
  1478. 'KK': 1000 ** 2,
  1479. 'b': 1000 ** 3,
  1480. 'B': 1000 ** 3,
  1481. }
  1482. ret = lookup_unit_table(_UNIT_TABLE, s)
  1483. if ret is not None:
  1484. return ret
  1485. mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
  1486. if mobj:
  1487. return str_to_int(mobj.group(1))
  1488. def parse_resolution(s, *, lenient=False):
  1489. if s is None:
  1490. return {}
  1491. if lenient:
  1492. mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
  1493. else:
  1494. mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
  1495. if mobj:
  1496. return {
  1497. 'width': int(mobj.group('w')),
  1498. 'height': int(mobj.group('h')),
  1499. }
  1500. mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
  1501. if mobj:
  1502. return {'height': int(mobj.group(1))}
  1503. mobj = re.search(r'\b([48])[kK]\b', s)
  1504. if mobj:
  1505. return {'height': int(mobj.group(1)) * 540}
  1506. return {}
  1507. def parse_bitrate(s):
  1508. if not isinstance(s, str):
  1509. return
  1510. mobj = re.search(r'\b(\d+)\s*kbps', s)
  1511. if mobj:
  1512. return int(mobj.group(1))
  1513. def month_by_name(name, lang='en'):
  1514. """ Return the number of a month by (locale-independently) English name """
  1515. month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
  1516. try:
  1517. return month_names.index(name) + 1
  1518. except ValueError:
  1519. return None
  1520. def month_by_abbreviation(abbrev):
  1521. """ Return the number of a month by (locale-independently) English
  1522. abbreviations """
  1523. try:
  1524. return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
  1525. except ValueError:
  1526. return None
  1527. def fix_xml_ampersands(xml_str):
  1528. """Replace all the '&' by '&amp;' in XML"""
  1529. return re.sub(
  1530. r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
  1531. '&amp;',
  1532. xml_str)
  1533. def setproctitle(title):
  1534. assert isinstance(title, str)
  1535. # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
  1536. try:
  1537. import ctypes
  1538. except ImportError:
  1539. return
  1540. try:
  1541. libc = ctypes.cdll.LoadLibrary('libc.so.6')
  1542. except OSError:
  1543. return
  1544. except TypeError:
  1545. # LoadLibrary in Windows Python 2.7.13 only expects
  1546. # a bytestring, but since unicode_literals turns
  1547. # every string into a unicode string, it fails.
  1548. return
  1549. title_bytes = title.encode()
  1550. buf = ctypes.create_string_buffer(len(title_bytes))
  1551. buf.value = title_bytes
  1552. try:
  1553. # PR_SET_NAME = 15 Ref: /usr/include/linux/prctl.h
  1554. libc.prctl(15, buf, 0, 0, 0)
  1555. except AttributeError:
  1556. return # Strange libc, just skip this
  1557. def remove_start(s, start):
  1558. return s[len(start):] if s is not None and s.startswith(start) else s
  1559. def remove_end(s, end):
  1560. return s[:-len(end)] if s is not None and end and s.endswith(end) else s
  1561. def remove_quotes(s):
  1562. if s is None or len(s) < 2:
  1563. return s
  1564. for quote in ('"', "'"):
  1565. if s[0] == quote and s[-1] == quote:
  1566. return s[1:-1]
  1567. return s
  1568. def get_domain(url):
  1569. """
  1570. This implementation is inconsistent, but is kept for compatibility.
  1571. Use this only for "webpage_url_domain"
  1572. """
  1573. return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
  1574. def url_basename(url):
  1575. path = urllib.parse.urlparse(url).path
  1576. return path.strip('/').split('/')[-1]
  1577. def base_url(url):
  1578. return re.match(r'https?://[^?#]+/', url).group()
  1579. @partial_application
  1580. def urljoin(base, path):
  1581. if isinstance(path, bytes):
  1582. path = path.decode()
  1583. if not isinstance(path, str) or not path:
  1584. return None
  1585. if re.match(r'(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
  1586. return path
  1587. if isinstance(base, bytes):
  1588. base = base.decode()
  1589. if not isinstance(base, str) or not re.match(
  1590. r'^(?:https?:)?//', base):
  1591. return None
  1592. return urllib.parse.urljoin(base, path)
  1593. @partial_application
  1594. def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None):
  1595. if get_attr and v is not None:
  1596. v = getattr(v, get_attr, None)
  1597. if invscale == 1 and scale < 1:
  1598. invscale = int(1 / scale)
  1599. scale = 1
  1600. try:
  1601. return (int(v) if base is None else int(v, base=base)) * invscale // scale
  1602. except (ValueError, TypeError, OverflowError):
  1603. return default
  1604. def str_or_none(v, default=None):
  1605. return default if v is None else str(v)
  1606. def str_to_int(int_str):
  1607. """ A more relaxed version of int_or_none """
  1608. if isinstance(int_str, int):
  1609. return int_str
  1610. elif isinstance(int_str, str):
  1611. int_str = re.sub(r'[,\.\+]', '', int_str)
  1612. return int_or_none(int_str)
  1613. @partial_application
  1614. def float_or_none(v, scale=1, invscale=1, default=None):
  1615. if v is None:
  1616. return default
  1617. if invscale == 1 and scale < 1:
  1618. invscale = int(1 / scale)
  1619. scale = 1
  1620. try:
  1621. return float(v) * invscale / scale
  1622. except (ValueError, TypeError):
  1623. return default
  1624. def bool_or_none(v, default=None):
  1625. return v if isinstance(v, bool) else default
  1626. def strip_or_none(v, default=None):
  1627. return v.strip() if isinstance(v, str) else default
  1628. def url_or_none(url):
  1629. if not url or not isinstance(url, str):
  1630. return None
  1631. url = url.strip()
  1632. return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
  1633. def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
  1634. datetime_object = None
  1635. try:
  1636. if isinstance(timestamp, (int, float)): # unix timestamp
  1637. # Using naive datetime here can break timestamp() in Windows
  1638. # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
  1639. # Also, dt.datetime.fromtimestamp breaks for negative timestamps
  1640. # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
  1641. datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
  1642. + dt.timedelta(seconds=timestamp))
  1643. elif isinstance(timestamp, str): # assume YYYYMMDD
  1644. datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
  1645. date_format = re.sub( # Support %s on windows
  1646. r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
  1647. return datetime_object.strftime(date_format)
  1648. except (ValueError, TypeError, AttributeError):
  1649. return default
  1650. def parse_duration(s):
  1651. if not isinstance(s, str):
  1652. return None
  1653. s = s.strip()
  1654. if not s:
  1655. return None
  1656. days, hours, mins, secs, ms = [None] * 5
  1657. m = re.match(r'''(?x)
  1658. (?P<before_secs>
  1659. (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
  1660. (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
  1661. (?P<ms>[.:][0-9]+)?Z?$
  1662. ''', s)
  1663. if m:
  1664. days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
  1665. else:
  1666. m = re.match(
  1667. r'''(?ix)(?:P?
  1668. (?:
  1669. [0-9]+\s*y(?:ears?)?,?\s*
  1670. )?
  1671. (?:
  1672. [0-9]+\s*m(?:onths?)?,?\s*
  1673. )?
  1674. (?:
  1675. [0-9]+\s*w(?:eeks?)?,?\s*
  1676. )?
  1677. (?:
  1678. (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
  1679. )?
  1680. T)?
  1681. (?:
  1682. (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
  1683. )?
  1684. (?:
  1685. (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
  1686. )?
  1687. (?:
  1688. (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
  1689. )?Z?$''', s)
  1690. if m:
  1691. days, hours, mins, secs, ms = m.groups()
  1692. else:
  1693. m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
  1694. if m:
  1695. hours, mins = m.groups()
  1696. else:
  1697. return None
  1698. if ms:
  1699. ms = ms.replace(':', '.')
  1700. return sum(float(part or 0) * mult for part, mult in (
  1701. (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
  1702. def _change_extension(prepend, filename, ext, expected_real_ext=None):
  1703. name, real_ext = os.path.splitext(filename)
  1704. if not expected_real_ext or real_ext[1:] == expected_real_ext:
  1705. filename = name
  1706. if prepend and real_ext:
  1707. _UnsafeExtensionError.sanitize_extension(ext, prepend=True)
  1708. return f'{filename}.{ext}{real_ext}'
  1709. return f'{filename}.{_UnsafeExtensionError.sanitize_extension(ext)}'
  1710. prepend_extension = functools.partial(_change_extension, True)
  1711. replace_extension = functools.partial(_change_extension, False)
  1712. def check_executable(exe, args=[]):
  1713. """ Checks if the given binary is installed somewhere in PATH, and returns its name.
  1714. args can be a list of arguments for a short output (like -version) """
  1715. try:
  1716. Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  1717. except OSError:
  1718. return False
  1719. return exe
  1720. def _get_exe_version_output(exe, args):
  1721. try:
  1722. # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
  1723. # SIGTTOU if yt-dlp is run in the background.
  1724. # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
  1725. stdout, _, ret = Popen.run([encodeArgument(exe), *args], text=True,
  1726. stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  1727. if ret:
  1728. return None
  1729. except OSError:
  1730. return False
  1731. return stdout
  1732. def detect_exe_version(output, version_re=None, unrecognized='present'):
  1733. assert isinstance(output, str)
  1734. if version_re is None:
  1735. version_re = r'version\s+([-0-9._a-zA-Z]+)'
  1736. m = re.search(version_re, output)
  1737. if m:
  1738. return m.group(1)
  1739. else:
  1740. return unrecognized
  1741. def get_exe_version(exe, args=['--version'],
  1742. version_re=None, unrecognized=('present', 'broken')):
  1743. """ Returns the version of the specified executable,
  1744. or False if the executable is not present """
  1745. unrecognized = variadic(unrecognized)
  1746. assert len(unrecognized) in (1, 2)
  1747. out = _get_exe_version_output(exe, args)
  1748. if out is None:
  1749. return unrecognized[-1]
  1750. return out and detect_exe_version(out, version_re, unrecognized[0])
  1751. def frange(start=0, stop=None, step=1):
  1752. """Float range"""
  1753. if stop is None:
  1754. start, stop = 0, start
  1755. sign = [-1, 1][step > 0] if step else 0
  1756. while sign * start < sign * stop:
  1757. yield start
  1758. start += step
  1759. class LazyList(collections.abc.Sequence):
  1760. """Lazy immutable list from an iterable
  1761. Note that slices of a LazyList are lists and not LazyList"""
  1762. class IndexError(IndexError): # noqa: A001
  1763. pass
  1764. def __init__(self, iterable, *, reverse=False, _cache=None):
  1765. self._iterable = iter(iterable)
  1766. self._cache = [] if _cache is None else _cache
  1767. self._reversed = reverse
  1768. def __iter__(self):
  1769. if self._reversed:
  1770. # We need to consume the entire iterable to iterate in reverse
  1771. yield from self.exhaust()
  1772. return
  1773. yield from self._cache
  1774. for item in self._iterable:
  1775. self._cache.append(item)
  1776. yield item
  1777. def _exhaust(self):
  1778. self._cache.extend(self._iterable)
  1779. self._iterable = [] # Discard the emptied iterable to make it pickle-able
  1780. return self._cache
  1781. def exhaust(self):
  1782. """Evaluate the entire iterable"""
  1783. return self._exhaust()[::-1 if self._reversed else 1]
  1784. @staticmethod
  1785. def _reverse_index(x):
  1786. return None if x is None else ~x
  1787. def __getitem__(self, idx):
  1788. if isinstance(idx, slice):
  1789. if self._reversed:
  1790. idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
  1791. start, stop, step = idx.start, idx.stop, idx.step or 1
  1792. elif isinstance(idx, int):
  1793. if self._reversed:
  1794. idx = self._reverse_index(idx)
  1795. start, stop, step = idx, idx, 0
  1796. else:
  1797. raise TypeError('indices must be integers or slices')
  1798. if ((start or 0) < 0 or (stop or 0) < 0
  1799. or (start is None and step < 0)
  1800. or (stop is None and step > 0)):
  1801. # We need to consume the entire iterable to be able to slice from the end
  1802. # Obviously, never use this with infinite iterables
  1803. self._exhaust()
  1804. try:
  1805. return self._cache[idx]
  1806. except IndexError as e:
  1807. raise self.IndexError(e) from e
  1808. n = max(start or 0, stop or 0) - len(self._cache) + 1
  1809. if n > 0:
  1810. self._cache.extend(itertools.islice(self._iterable, n))
  1811. try:
  1812. return self._cache[idx]
  1813. except IndexError as e:
  1814. raise self.IndexError(e) from e
  1815. def __bool__(self):
  1816. try:
  1817. self[-1] if self._reversed else self[0]
  1818. except self.IndexError:
  1819. return False
  1820. return True
  1821. def __len__(self):
  1822. self._exhaust()
  1823. return len(self._cache)
  1824. def __reversed__(self):
  1825. return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
  1826. def __copy__(self):
  1827. return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
  1828. def __repr__(self):
  1829. # repr and str should mimic a list. So we exhaust the iterable
  1830. return repr(self.exhaust())
  1831. def __str__(self):
  1832. return repr(self.exhaust())
  1833. class PagedList:
  1834. class IndexError(IndexError): # noqa: A001
  1835. pass
  1836. def __len__(self):
  1837. # This is only useful for tests
  1838. return len(self.getslice())
  1839. def __init__(self, pagefunc, pagesize, use_cache=True):
  1840. self._pagefunc = pagefunc
  1841. self._pagesize = pagesize
  1842. self._pagecount = float('inf')
  1843. self._use_cache = use_cache
  1844. self._cache = {}
  1845. def getpage(self, pagenum):
  1846. page_results = self._cache.get(pagenum)
  1847. if page_results is None:
  1848. page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
  1849. if self._use_cache:
  1850. self._cache[pagenum] = page_results
  1851. return page_results
  1852. def getslice(self, start=0, end=None):
  1853. return list(self._getslice(start, end))
  1854. def _getslice(self, start, end):
  1855. raise NotImplementedError('This method must be implemented by subclasses')
  1856. def __getitem__(self, idx):
  1857. assert self._use_cache, 'Indexing PagedList requires cache'
  1858. if not isinstance(idx, int) or idx < 0:
  1859. raise TypeError('indices must be non-negative integers')
  1860. entries = self.getslice(idx, idx + 1)
  1861. if not entries:
  1862. raise self.IndexError
  1863. return entries[0]
  1864. def __bool__(self):
  1865. return bool(self.getslice(0, 1))
  1866. class OnDemandPagedList(PagedList):
  1867. """Download pages until a page with less than maximum results"""
  1868. def _getslice(self, start, end):
  1869. for pagenum in itertools.count(start // self._pagesize):
  1870. firstid = pagenum * self._pagesize
  1871. nextfirstid = pagenum * self._pagesize + self._pagesize
  1872. if start >= nextfirstid:
  1873. continue
  1874. startv = (
  1875. start % self._pagesize
  1876. if firstid <= start < nextfirstid
  1877. else 0)
  1878. endv = (
  1879. ((end - 1) % self._pagesize) + 1
  1880. if (end is not None and firstid <= end <= nextfirstid)
  1881. else None)
  1882. try:
  1883. page_results = self.getpage(pagenum)
  1884. except Exception:
  1885. self._pagecount = pagenum - 1
  1886. raise
  1887. if startv != 0 or endv is not None:
  1888. page_results = page_results[startv:endv]
  1889. yield from page_results
  1890. # A little optimization - if current page is not "full", ie. does
  1891. # not contain page_size videos then we can assume that this page
  1892. # is the last one - there are no more ids on further pages -
  1893. # i.e. no need to query again.
  1894. if len(page_results) + startv < self._pagesize:
  1895. break
  1896. # If we got the whole page, but the next page is not interesting,
  1897. # break out early as well
  1898. if end == nextfirstid:
  1899. break
  1900. class InAdvancePagedList(PagedList):
  1901. """PagedList with total number of pages known in advance"""
  1902. def __init__(self, pagefunc, pagecount, pagesize):
  1903. PagedList.__init__(self, pagefunc, pagesize, True)
  1904. self._pagecount = pagecount
  1905. def _getslice(self, start, end):
  1906. start_page = start // self._pagesize
  1907. end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
  1908. skip_elems = start - start_page * self._pagesize
  1909. only_more = None if end is None else end - start
  1910. for pagenum in range(start_page, end_page):
  1911. page_results = self.getpage(pagenum)
  1912. if skip_elems:
  1913. page_results = page_results[skip_elems:]
  1914. skip_elems = None
  1915. if only_more is not None:
  1916. if len(page_results) < only_more:
  1917. only_more -= len(page_results)
  1918. else:
  1919. yield from page_results[:only_more]
  1920. break
  1921. yield from page_results
class PlaylistEntries:
    """Indexable view over the ``entries`` of a playlist info dict.

    Indexing is 1-based and yields ``(playlist_index, entry)`` pairs.
    """
    # Placeholder stored for positions whose entry is not available
    MissingEntry = object()
    # Set to True once the underlying entries are known to be fully evaluated
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl        object providing ``params``, ``report_warning`` and ``_match_entry``
        @param info_dict  dict with ``entries`` and optionally ``requested_entries``
        @raises EntryNotInPlaylist  if ``info_dict`` has no ``entries``
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a sparse list: each given entry goes to its 1-based requested position
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment: "START", "START:END", "START-END" or "START:END:STEP"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for every comma-separated segment of ``string``.

        @raises ValueError  on an empty segment, an unparsable segment or a zero step
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield ``(index, entry)`` for the items selected by the ``ydl`` params.

        ``playlist_items`` takes precedence; otherwise the selection is built
        from ``playliststart`` and ``playlistend``.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    # Stop yielding any further items
                    return

    def get_full_count(self):
        """Return the total number of entries when it is known, else None."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Return a function mapping a 0-based index to its entry.

        The returned function raises ``self.IndexError`` when the index is out
        of range; for plain lists it raises ``EntryNotInPlaylist`` when the
        position holds ``MissingEntry``.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # NOTE(review): presumably wraps the lookup with ydl's extraction error handling - confirm
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError
        return get_entry

    def __getitem__(self, idx):
        """Yield ``(index, entry)`` pairs for a 1-based int or slice ``idx``.

        Unlike regular slices, the stop value is inclusive.
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the bound one step further in the direction of travel (inclusive stop)
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over every entry
        return len(tuple(self[:]))

    class IndexError(IndexError):  # noqa: A001
        pass
  2041. def uppercase_escape(s):
  2042. unicode_escape = codecs.getdecoder('unicode_escape')
  2043. return re.sub(
  2044. r'\\U[0-9a-fA-F]{8}',
  2045. lambda m: unicode_escape(m.group(0))[0],
  2046. s)
  2047. def lowercase_escape(s):
  2048. unicode_escape = codecs.getdecoder('unicode_escape')
  2049. return re.sub(
  2050. r'\\u[0-9a-fA-F]{4}',
  2051. lambda m: unicode_escape(m.group(0))[0],
  2052. s)
  2053. def parse_qs(url, **kwargs):
  2054. return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
  2055. def read_batch_urls(batch_fd):
  2056. def fixup(url):
  2057. if not isinstance(url, str):
  2058. url = url.decode('utf-8', 'replace')
  2059. BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
  2060. for bom in BOM_UTF8:
  2061. if url.startswith(bom):
  2062. url = url[len(bom):]
  2063. url = url.lstrip()
  2064. if not url or url.startswith(('#', ';', ']')):
  2065. return False
  2066. # "#" cannot be stripped out since it is part of the URI
  2067. # However, it can be safely stripped out if following a whitespace
  2068. return re.split(r'\s#', url, maxsplit=1)[0].rstrip()
  2069. with contextlib.closing(batch_fd) as fd:
  2070. return [url for url in map(fixup, fd) if url]
  2071. def urlencode_postdata(*args, **kargs):
  2072. return urllib.parse.urlencode(*args, **kargs).encode('ascii')
  2073. @partial_application
  2074. def update_url(url, *, query_update=None, **kwargs):
  2075. """Replace URL components specified by kwargs
  2076. @param url str or parse url tuple
  2077. @param query_update update query
  2078. @returns str
  2079. """
  2080. if isinstance(url, str):
  2081. if not kwargs and not query_update:
  2082. return url
  2083. else:
  2084. url = urllib.parse.urlparse(url)
  2085. if query_update:
  2086. assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
  2087. kwargs['query'] = urllib.parse.urlencode({
  2088. **urllib.parse.parse_qs(url.query),
  2089. **query_update,
  2090. }, True)
  2091. return urllib.parse.urlunparse(url._replace(**kwargs))
  2092. @partial_application
  2093. def update_url_query(url, query):
  2094. return update_url(url, query_update=query)
  2095. def _multipart_encode_impl(data, boundary):
  2096. content_type = f'multipart/form-data; boundary={boundary}'
  2097. out = b''
  2098. for k, v in data.items():
  2099. out += b'--' + boundary.encode('ascii') + b'\r\n'
  2100. if isinstance(k, str):
  2101. k = k.encode()
  2102. if isinstance(v, str):
  2103. v = v.encode()
  2104. # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
  2105. # suggests sending UTF-8 directly. Firefox sends UTF-8, too
  2106. content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
  2107. if boundary.encode('ascii') in content:
  2108. raise ValueError('Boundary overlaps with data')
  2109. out += content
  2110. out += b'--' + boundary.encode('ascii') + b'--\r\n'
  2111. return out, content_type
  2112. def multipart_encode(data, boundary=None):
  2113. """
  2114. Encode a dict to RFC 7578-compliant form-data
  2115. data:
  2116. A dict where keys and values can be either Unicode or bytes-like
  2117. objects.
  2118. boundary:
  2119. If specified a Unicode object, it's used as the boundary. Otherwise
  2120. a random boundary is generated.
  2121. Reference: https://tools.ietf.org/html/rfc7578
  2122. """
  2123. has_specified_boundary = boundary is not None
  2124. while True:
  2125. if boundary is None:
  2126. boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
  2127. try:
  2128. out, content_type = _multipart_encode_impl(data, boundary)
  2129. break
  2130. except ValueError:
  2131. if has_specified_boundary:
  2132. raise
  2133. boundary = None
  2134. return out, content_type
  2135. def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
  2136. if blocked_types is NO_DEFAULT:
  2137. blocked_types = (str, bytes, collections.abc.Mapping)
  2138. return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
  2139. def variadic(x, allowed_types=NO_DEFAULT):
  2140. if not isinstance(allowed_types, (tuple, type)):
  2141. deprecation_warning('allowed_types should be a tuple or a type')
  2142. allowed_types = tuple(allowed_types)
  2143. return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
  2144. def try_call(*funcs, expected_type=None, args=[], kwargs={}):
  2145. for f in funcs:
  2146. try:
  2147. val = f(*args, **kwargs)
  2148. except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
  2149. pass
  2150. else:
  2151. if expected_type is None or isinstance(val, expected_type):
  2152. return val
  2153. def try_get(src, getter, expected_type=None):
  2154. return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
  2155. def filter_dict(dct, cndn=lambda _, v: v is not None):
  2156. return {k: v for k, v in dct.items() if cndn(k, v)}
  2157. def merge_dicts(*dicts):
  2158. merged = {}
  2159. for a_dict in dicts:
  2160. for k, v in a_dict.items():
  2161. if ((v is not None and k not in merged)
  2162. or (isinstance(v, str) and merged[k] == '')):
  2163. merged[k] = v
  2164. return merged
  2165. def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
  2166. return string if isinstance(string, str) else str(string, encoding, errors)
# Age limit returned by parse_age_limit for each (upper-cased) US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
# Age limit returned by parse_age_limit for each "TV-*" label.
# parse_age_limit builds its regex from the part of each key after "TV-"
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
  2182. def parse_age_limit(s):
  2183. # isinstance(False, int) is True. So type() must be used instead
  2184. if type(s) is int: # noqa: E721
  2185. return s if 0 <= s <= 21 else None
  2186. elif not isinstance(s, str):
  2187. return None
  2188. m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
  2189. if m:
  2190. return int(m.group('age'))
  2191. s = s.upper()
  2192. if s in US_RATINGS:
  2193. return US_RATINGS[s]
  2194. m = re.match(r'^TV[_-]?({})$'.format('|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)), s)
  2195. if m:
  2196. return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
  2197. return None
  2198. def strip_jsonp(code):
  2199. return re.sub(
  2200. r'''(?sx)^
  2201. (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
  2202. (?:\s*&&\s*(?P=func_name))?
  2203. \s*\(\s*(?P<callback_data>.*)\);?
  2204. \s*?(?://[^\n]*)*$''',
  2205. r'\g<callback_data>', code)
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite a JavaScript object/array literal so that it can be parsed as JSON.

    @param code    JavaScript source text
    @param vars    dict of var, val pairs to substitute; it is only read, never modified
    @param strict  if true, an identifier that is not in ``vars`` raises
                   ValueError instead of being turned into a string, and the
                   "new ..."/parseInt/function-call rewrites are skipped
    @returns       the rewritten text
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character, allowing backslash escapes inside
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for hexadecimal and octal integer literals
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Group 1 is a bare double quote, group 2 the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...} of a template string recursively
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token matched by the final regex
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith(('/*', '//', '!')) or v == ',':
            # Comments, negations and trailing commas are dropped
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A literal used as an object key is emitted as a quoted key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The value is not valid JSON itself, so encode it as a JSON string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Turn the argument of "new Map([...])" into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # NOTE(review): nesting under this "if" was reconstructed from text that had lost its indentation - confirm
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
  2273. def qualities(quality_ids):
  2274. """ Get a numeric quality value out of a list of possible values """
  2275. def q(qid):
  2276. try:
  2277. return quality_ids.index(qid)
  2278. except ValueError:
  2279. return -1
  2280. return q
# Recognized stage names (presumably the points at which a postprocessor can run - verify against users)
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types, each mapped to a string or None
# NOTE(review): the strings look like file name suffixes for that type - confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template for a verbose regex matching one %-format specifier: {0} is filled with
# the pattern for the mapping key and {1} with the pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Characters accepted as %-format conversion types
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
  2315. def limit_length(s, length):
  2316. """ Add ellipses to overly long strings """
  2317. if s is None:
  2318. return None
  2319. ELLIPSES = '...'
  2320. if len(s) > length:
  2321. return s[:length - len(ELLIPSES)] + ELLIPSES
  2322. return s
  2323. def version_tuple(v):
  2324. return tuple(int(e) for e in re.split(r'[-.]', v))
  2325. def is_outdated_version(version, limit, assume_new=True):
  2326. if not version:
  2327. return not assume_new
  2328. try:
  2329. return version_tuple(version) < version_tuple(limit)
  2330. except ValueError:
  2331. return not assume_new
  2332. def ytdl_is_updateable():
  2333. """ Returns if yt-dlp can be updated with -U """
  2334. from ..update import is_non_updateable
  2335. return not is_non_updateable()
  2336. def args_to_str(args):
  2337. # Get a short string representation for a subprocess command
  2338. return shell_quote(args)
  2339. def error_to_str(err):
  2340. return f'{type(err).__name__}: {err}'
  2341. @partial_application
  2342. def mimetype2ext(mt, default=NO_DEFAULT):
  2343. if not isinstance(mt, str):
  2344. if default is not NO_DEFAULT:
  2345. return default
  2346. return None
  2347. MAP = {
  2348. # video
  2349. '3gpp': '3gp',
  2350. 'mp2t': 'ts',
  2351. 'mp4': 'mp4',
  2352. 'mpeg': 'mpeg',
  2353. 'mpegurl': 'm3u8',
  2354. 'quicktime': 'mov',
  2355. 'webm': 'webm',
  2356. 'vp9': 'vp9',
  2357. 'video/ogg': 'ogv',
  2358. 'x-flv': 'flv',
  2359. 'x-m4v': 'm4v',
  2360. 'x-matroska': 'mkv',
  2361. 'x-mng': 'mng',
  2362. 'x-mp4-fragmented': 'mp4',
  2363. 'x-ms-asf': 'asf',
  2364. 'x-ms-wmv': 'wmv',
  2365. 'x-msvideo': 'avi',
  2366. # application (streaming playlists)
  2367. 'dash+xml': 'mpd',
  2368. 'f4m+xml': 'f4m',
  2369. 'hds+xml': 'f4m',
  2370. 'vnd.apple.mpegurl': 'm3u8',
  2371. 'vnd.ms-sstr+xml': 'ism',
  2372. 'x-mpegurl': 'm3u8',
  2373. # audio
  2374. 'audio/mp4': 'm4a',
  2375. # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
  2376. # Using .mp3 as it's the most popular one
  2377. 'audio/mpeg': 'mp3',
  2378. 'audio/webm': 'webm',
  2379. 'audio/x-matroska': 'mka',
  2380. 'audio/x-mpegurl': 'm3u',
  2381. 'aacp': 'aac',
  2382. 'midi': 'mid',
  2383. 'ogg': 'ogg',
  2384. 'wav': 'wav',
  2385. 'wave': 'wav',
  2386. 'x-aac': 'aac',
  2387. 'x-flac': 'flac',
  2388. 'x-m4a': 'm4a',
  2389. 'x-realaudio': 'ra',
  2390. 'x-wav': 'wav',
  2391. # image
  2392. 'avif': 'avif',
  2393. 'bmp': 'bmp',
  2394. 'gif': 'gif',
  2395. 'jpeg': 'jpg',
  2396. 'png': 'png',
  2397. 'svg+xml': 'svg',
  2398. 'tiff': 'tif',
  2399. 'vnd.wap.wbmp': 'wbmp',
  2400. 'webp': 'webp',
  2401. 'x-icon': 'ico',
  2402. 'x-jng': 'jng',
  2403. 'x-ms-bmp': 'bmp',
  2404. # caption
  2405. 'filmstrip+json': 'fs',
  2406. 'smptett+xml': 'tt',
  2407. 'ttaf+xml': 'dfxp',
  2408. 'ttml+xml': 'ttml',
  2409. 'x-ms-sami': 'sami',
  2410. # misc
  2411. 'gzip': 'gz',
  2412. 'json': 'json',
  2413. 'xml': 'xml',
  2414. 'zip': 'zip',
  2415. }
  2416. mimetype = mt.partition(';')[0].strip().lower()
  2417. _, _, subtype = mimetype.rpartition('/')
  2418. ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
  2419. if ext:
  2420. return ext
  2421. elif default is not NO_DEFAULT:
  2422. return default
  2423. return subtype.replace('+', '.')
  2424. def ext2mimetype(ext_or_url):
  2425. if not ext_or_url:
  2426. return None
  2427. if '.' not in ext_or_url:
  2428. ext_or_url = f'file.{ext_or_url}'
  2429. return mimetypes.guess_type(ext_or_url)[0]
  2430. def parse_codecs(codecs_str):
  2431. # http://tools.ietf.org/html/rfc6381
  2432. if not codecs_str:
  2433. return {}
  2434. split_codecs = list(filter(None, map(
  2435. str.strip, codecs_str.strip().strip(',').split(','))))
  2436. vcodec, acodec, scodec, hdr = None, None, None, None
  2437. for full_codec in split_codecs:
  2438. full_codec = re.sub(r'^([^.]+)', lambda m: m.group(1).lower(), full_codec)
  2439. parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
  2440. if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
  2441. 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
  2442. if vcodec:
  2443. continue
  2444. vcodec = full_codec
  2445. if parts[0] in ('dvh1', 'dvhe'):
  2446. hdr = 'DV'
  2447. elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
  2448. hdr = 'HDR10'
  2449. elif parts[:2] == ['vp9', '2']:
  2450. hdr = 'HDR10'
  2451. elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
  2452. 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
  2453. acodec = acodec or full_codec
  2454. elif parts[0] in ('stpp', 'wvtt'):
  2455. scodec = scodec or full_codec
  2456. else:
  2457. write_string(f'WARNING: Unknown codec {full_codec}\n')
  2458. if vcodec or acodec or scodec:
  2459. return {
  2460. 'vcodec': vcodec or 'none',
  2461. 'acodec': acodec or 'none',
  2462. 'dynamic_range': hdr,
  2463. **({'scodec': scodec} if scodec is not None else {}),
  2464. }
  2465. elif len(split_codecs) == 2:
  2466. return {
  2467. 'vcodec': split_codecs[0],
  2468. 'acodec': split_codecs[1],
  2469. }
  2470. return {}
  2471. def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
  2472. assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
  2473. allow_mkv = not preferences or 'mkv' in preferences
  2474. if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
  2475. return 'mkv' # TODO: any other format allows this?
  2476. # TODO: All codecs supported by parse_codecs isn't handled here
  2477. COMPATIBLE_CODECS = {
  2478. 'mp4': {
  2479. 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
  2480. 'h264', 'aacl', 'ec-3', # Set in ISM
  2481. },
  2482. 'webm': {
  2483. 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
  2484. 'vp9x', 'vp8x', # in the webm spec
  2485. },
  2486. }
  2487. sanitize_codec = functools.partial(
  2488. try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
  2489. vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
  2490. for ext in preferences or COMPATIBLE_CODECS.keys():
  2491. codec_set = COMPATIBLE_CODECS.get(ext, set())
  2492. if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
  2493. return ext
  2494. COMPATIBLE_EXTS = (
  2495. {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
  2496. {'webm', 'weba'},
  2497. )
  2498. for ext in preferences or vexts:
  2499. current_exts = {ext, *vexts, *aexts}
  2500. if ext == 'mkv' or current_exts == {ext} or any(
  2501. ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
  2502. return ext
  2503. return 'mkv' if allow_mkv else preferences[-1]
  2504. def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
  2505. getheader = url_handle.headers.get
  2506. cd = getheader('Content-Disposition')
  2507. if cd:
  2508. m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
  2509. if m:
  2510. e = determine_ext(m.group('filename'), default_ext=None)
  2511. if e:
  2512. return e
  2513. meta_ext = getheader('x-amz-meta-name')
  2514. if meta_ext:
  2515. e = meta_ext.rpartition('.')[2]
  2516. if e:
  2517. return e
  2518. return mimetype2ext(getheader('Content-Type'), default=default)
  2519. def encode_data_uri(data, mime_type):
  2520. return 'data:{};base64,{}'.format(mime_type, base64.b64encode(data).decode('ascii'))
  2521. def age_restricted(content_limit, age_limit):
  2522. """ Returns True iff the content should be blocked """
  2523. if age_limit is None: # No limit set
  2524. return False
  2525. if content_limit is None:
  2526. return False # Content available for everyone
  2527. return age_limit < content_limit
# List of known byte-order-marks (BOM)
# Each entry is (BOM bytes, name of the encoding it indicates)
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
  2536. def is_html(first_bytes):
  2537. """ Detect whether a file contains HTML by examining its first bytes. """
  2538. encoding = 'utf-8'
  2539. for bom, enc in BOMS:
  2540. while first_bytes.startswith(bom):
  2541. encoding, first_bytes = enc, first_bytes[len(bom):]
  2542. return re.match(r'\s*<', first_bytes.decode(encoding, 'replace'))
  2543. def determine_protocol(info_dict):
  2544. protocol = info_dict.get('protocol')
  2545. if protocol is not None:
  2546. return protocol
  2547. url = sanitize_url(info_dict['url'])
  2548. if url.startswith('rtmp'):
  2549. return 'rtmp'
  2550. elif url.startswith('mms'):
  2551. return 'mms'
  2552. elif url.startswith('rtsp'):
  2553. return 'rtsp'
  2554. ext = determine_ext(url)
  2555. if ext == 'm3u8':
  2556. return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
  2557. elif ext == 'f4m':
  2558. return 'f4m'
  2559. return urllib.parse.urlparse(url).scheme
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row  list of column headers
    @param data        list of rows, each a list of values
    @param delim       string repeated to form a separator row under the
                       header; no separator row when false
    @param extra_gap   extra spaces between columns (one space is always added)
    @param hide_empty  drop the columns in which every data value has zero width
    @returns           the table as a single string with one line per row
    """
    def width(string):
        # Printed width: terminal sequences and the alignment tab take no space
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filter_array):
        # Keep the columns whose filter entry is truthy; columns without an entry are kept
        return [col for take, col in itertools.zip_longest(filter_array, row, fillvalue=True) if take]

    # With hide_empty, the maximum data widths double as the column filter (0 means drop)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row, *data]
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens], *data]
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        # The rows were rebuilt as new lists above, so they are padded in place here
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Expand the tab so that the text after it ends at the column's right edge
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    return '\n'.join(''.join(row).rstrip() for row in table)
  2585. def _match_one(filter_part, dct, incomplete):
  2586. # TODO: Generalize code with YoutubeDL._build_format_filter
  2587. STRING_OPERATORS = {
  2588. '*=': operator.contains,
  2589. '^=': lambda attr, value: attr.startswith(value),
  2590. '$=': lambda attr, value: attr.endswith(value),
  2591. '~=': lambda attr, value: re.search(value, attr),
  2592. }
  2593. COMPARISON_OPERATORS = {
  2594. **STRING_OPERATORS,
  2595. '<=': operator.le, # "<=" must be defined above "<"
  2596. '<': operator.lt,
  2597. '>=': operator.ge,
  2598. '>': operator.gt,
  2599. '=': operator.eq,
  2600. }
  2601. if isinstance(incomplete, bool):
  2602. is_incomplete = lambda _: incomplete
  2603. else:
  2604. is_incomplete = lambda k: k in incomplete
  2605. operator_rex = re.compile(r'''(?x)
  2606. (?P<key>[a-z_]+)
  2607. \s*(?P<negation>!\s*)?(?P<op>{})(?P<none_inclusive>\s*\?)?\s*
  2608. (?:
  2609. (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
  2610. (?P<strval>.+?)
  2611. )
  2612. '''.format('|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))))
  2613. m = operator_rex.fullmatch(filter_part.strip())
  2614. if m:
  2615. m = m.groupdict()
  2616. unnegated_op = COMPARISON_OPERATORS[m['op']]
  2617. if m['negation']:
  2618. op = lambda attr, value: not unnegated_op(attr, value)
  2619. else:
  2620. op = unnegated_op
  2621. comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
  2622. if m['quote']:
  2623. comparison_value = comparison_value.replace(r'\{}'.format(m['quote']), m['quote'])
  2624. actual_value = dct.get(m['key'])
  2625. numeric_comparison = None
  2626. if isinstance(actual_value, (int, float)):
  2627. # If the original field is a string and matching comparisonvalue is
  2628. # a number we should respect the origin of the original field
  2629. # and process comparison value as a string (see
  2630. # https://github.com/ytdl-org/youtube-dl/issues/11082)
  2631. try:
  2632. numeric_comparison = int(comparison_value)
  2633. except ValueError:
  2634. numeric_comparison = parse_filesize(comparison_value)
  2635. if numeric_comparison is None:
  2636. numeric_comparison = parse_filesize(f'{comparison_value}B')
  2637. if numeric_comparison is None:
  2638. numeric_comparison = parse_duration(comparison_value)
  2639. if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
  2640. raise ValueError('Operator {} only supports string values!'.format(m['op']))
  2641. if actual_value is None:
  2642. return is_incomplete(m['key']) or m['none_inclusive']
  2643. return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
  2644. UNARY_OPERATORS = {
  2645. '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
  2646. '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
  2647. }
  2648. operator_rex = re.compile(r'''(?x)
  2649. (?P<op>{})\s*(?P<key>[a-z_]+)
  2650. '''.format('|'.join(map(re.escape, UNARY_OPERATORS.keys()))))
  2651. m = operator_rex.fullmatch(filter_part.strip())
  2652. if m:
  2653. op = UNARY_OPERATORS[m.group('op')]
  2654. actual_value = dct.get(m.group('key'))
  2655. if is_incomplete(m.group('key')) and actual_value is None:
  2656. return True
  2657. return op(actual_value)
  2658. raise ValueError(f'Invalid filter part {filter_part!r}')
  2659. def match_str(filter_str, dct, incomplete=False):
  2660. """ Filter a dictionary with a simple string syntax.
  2661. @returns Whether the filter passes
  2662. @param incomplete Set of keys that is expected to be missing from dct.
  2663. Can be True/False to indicate all/none of the keys may be missing.
  2664. All conditions on incomplete keys pass if the key is missing
  2665. """
  2666. return all(
  2667. _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
  2668. for filter_part in re.split(r'(?<!\\)&', filter_str))
  2669. def match_filter_func(filters, breaking_filters=None):
  2670. if not filters and not breaking_filters:
  2671. return None
  2672. repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'
  2673. breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
  2674. filters = set(variadic(filters or []))
  2675. interactive = '-' in filters
  2676. if interactive:
  2677. filters.remove('-')
  2678. @function_with_repr.set_repr(repr_)
  2679. def _match_func(info_dict, incomplete=False):
  2680. ret = breaking_filters(info_dict, incomplete)
  2681. if ret is not None:
  2682. raise RejectedVideoReached(ret)
  2683. if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
  2684. return NO_DEFAULT if interactive and not incomplete else None
  2685. else:
  2686. video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
  2687. filter_str = ') | ('.join(map(str.strip, filters))
  2688. return f'{video_title} does not pass filter ({filter_str}), skipping ..'
  2689. return _match_func
  2690. class download_range_func:
  2691. def __init__(self, chapters, ranges, from_info=False):
  2692. self.chapters, self.ranges, self.from_info = chapters, ranges, from_info
  2693. def __call__(self, info_dict, ydl):
  2694. warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
  2695. else 'Cannot match chapters since chapter information is unavailable')
  2696. for regex in self.chapters or []:
  2697. for i, chapter in enumerate(info_dict.get('chapters') or []):
  2698. if re.search(regex, chapter['title']):
  2699. warning = None
  2700. yield {**chapter, 'index': i}
  2701. if self.chapters and warning:
  2702. ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
  2703. for start, end in self.ranges or []:
  2704. yield {
  2705. 'start_time': self._handle_negative_timestamp(start, info_dict),
  2706. 'end_time': self._handle_negative_timestamp(end, info_dict),
  2707. }
  2708. if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
  2709. yield {
  2710. 'start_time': info_dict.get('start_time') or 0,
  2711. 'end_time': info_dict.get('end_time') or float('inf'),
  2712. }
  2713. elif not self.ranges and not self.chapters:
  2714. yield {}
  2715. @staticmethod
  2716. def _handle_negative_timestamp(time, info):
  2717. return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time
  2718. def __eq__(self, other):
  2719. return (isinstance(other, download_range_func)
  2720. and self.chapters == other.chapters and self.ranges == other.ranges)
  2721. def __repr__(self):
  2722. return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
  2723. def parse_dfxp_time_expr(time_expr):
  2724. if not time_expr:
  2725. return
  2726. mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
  2727. if mobj:
  2728. return float(mobj.group('time_offset'))
  2729. mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
  2730. if mobj:
  2731. return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
  2732. def srt_subtitles_timecode(seconds):
  2733. return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
  2734. def ass_subtitles_timecode(seconds):
  2735. time = timetuple_from_msec(seconds * 1000)
  2736. return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
  2737. def dfxp2srt(dfxp_data):
  2738. """
  2739. @param dfxp_data A bytes-like object containing DFXP data
  2740. @returns A unicode object containing converted SRT data
  2741. """
  2742. LEGACY_NAMESPACES = (
  2743. (b'http://www.w3.org/ns/ttml', [
  2744. b'http://www.w3.org/2004/11/ttaf1',
  2745. b'http://www.w3.org/2006/04/ttaf1',
  2746. b'http://www.w3.org/2006/10/ttaf1',
  2747. ]),
  2748. (b'http://www.w3.org/ns/ttml#styling', [
  2749. b'http://www.w3.org/ns/ttml#style',
  2750. ]),
  2751. )
  2752. SUPPORTED_STYLING = [
  2753. 'color',
  2754. 'fontFamily',
  2755. 'fontSize',
  2756. 'fontStyle',
  2757. 'fontWeight',
  2758. 'textDecoration',
  2759. ]
  2760. _x = functools.partial(xpath_with_ns, ns_map={
  2761. 'xml': 'http://www.w3.org/XML/1998/namespace',
  2762. 'ttml': 'http://www.w3.org/ns/ttml',
  2763. 'tts': 'http://www.w3.org/ns/ttml#styling',
  2764. })
  2765. styles = {}
  2766. default_style = {}
  2767. class TTMLPElementParser:
  2768. _out = ''
  2769. _unclosed_elements = []
  2770. _applied_styles = []
  2771. def start(self, tag, attrib):
  2772. if tag in (_x('ttml:br'), 'br'):
  2773. self._out += '\n'
  2774. else:
  2775. unclosed_elements = []
  2776. style = {}
  2777. element_style_id = attrib.get('style')
  2778. if default_style:
  2779. style.update(default_style)
  2780. if element_style_id:
  2781. style.update(styles.get(element_style_id, {}))
  2782. for prop in SUPPORTED_STYLING:
  2783. prop_val = attrib.get(_x('tts:' + prop))
  2784. if prop_val:
  2785. style[prop] = prop_val
  2786. if style:
  2787. font = ''
  2788. for k, v in sorted(style.items()):
  2789. if self._applied_styles and self._applied_styles[-1].get(k) == v:
  2790. continue
  2791. if k == 'color':
  2792. font += f' color="{v}"'
  2793. elif k == 'fontSize':
  2794. font += f' size="{v}"'
  2795. elif k == 'fontFamily':
  2796. font += f' face="{v}"'
  2797. elif k == 'fontWeight' and v == 'bold':
  2798. self._out += '<b>'
  2799. unclosed_elements.append('b')
  2800. elif k == 'fontStyle' and v == 'italic':
  2801. self._out += '<i>'
  2802. unclosed_elements.append('i')
  2803. elif k == 'textDecoration' and v == 'underline':
  2804. self._out += '<u>'
  2805. unclosed_elements.append('u')
  2806. if font:
  2807. self._out += '<font' + font + '>'
  2808. unclosed_elements.append('font')
  2809. applied_style = {}
  2810. if self._applied_styles:
  2811. applied_style.update(self._applied_styles[-1])
  2812. applied_style.update(style)
  2813. self._applied_styles.append(applied_style)
  2814. self._unclosed_elements.append(unclosed_elements)
  2815. def end(self, tag):
  2816. if tag not in (_x('ttml:br'), 'br'):
  2817. unclosed_elements = self._unclosed_elements.pop()
  2818. for element in reversed(unclosed_elements):
  2819. self._out += f'</{element}>'
  2820. if unclosed_elements and self._applied_styles:
  2821. self._applied_styles.pop()
  2822. def data(self, data):
  2823. self._out += data
  2824. def close(self):
  2825. return self._out.strip()
  2826. # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
  2827. # This will not trigger false positives since only UTF-8 text is being replaced
  2828. dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
  2829. def parse_node(node):
  2830. target = TTMLPElementParser()
  2831. parser = xml.etree.ElementTree.XMLParser(target=target)
  2832. parser.feed(xml.etree.ElementTree.tostring(node))
  2833. return parser.close()
  2834. for k, v in LEGACY_NAMESPACES:
  2835. for ns in v:
  2836. dfxp_data = dfxp_data.replace(ns, k)
  2837. dfxp = compat_etree_fromstring(dfxp_data)
  2838. out = []
  2839. paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
  2840. if not paras:
  2841. raise ValueError('Invalid dfxp/TTML subtitle')
  2842. repeat = False
  2843. while True:
  2844. for style in dfxp.findall(_x('.//ttml:style')):
  2845. style_id = style.get('id') or style.get(_x('xml:id'))
  2846. if not style_id:
  2847. continue
  2848. parent_style_id = style.get('style')
  2849. if parent_style_id:
  2850. if parent_style_id not in styles:
  2851. repeat = True
  2852. continue
  2853. styles[style_id] = styles[parent_style_id].copy()
  2854. for prop in SUPPORTED_STYLING:
  2855. prop_val = style.get(_x('tts:' + prop))
  2856. if prop_val:
  2857. styles.setdefault(style_id, {})[prop] = prop_val
  2858. if repeat:
  2859. repeat = False
  2860. else:
  2861. break
  2862. for p in ('body', 'div'):
  2863. ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
  2864. if ele is None:
  2865. continue
  2866. style = styles.get(ele.get('style'))
  2867. if not style:
  2868. continue
  2869. default_style.update(style)
  2870. for para, index in zip(paras, itertools.count(1)):
  2871. begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
  2872. end_time = parse_dfxp_time_expr(para.attrib.get('end'))
  2873. dur = parse_dfxp_time_expr(para.attrib.get('dur'))
  2874. if begin_time is None:
  2875. continue
  2876. if not end_time:
  2877. if not dur:
  2878. continue
  2879. end_time = begin_time + dur
  2880. out.append('%d\n%s --> %s\n%s\n\n' % (
  2881. index,
  2882. srt_subtitles_timecode(begin_time),
  2883. srt_subtitles_timecode(end_time),
  2884. parse_node(para)))
  2885. return ''.join(out)
  2886. def cli_option(params, command_option, param, separator=None):
  2887. param = params.get(param)
  2888. return ([] if param is None
  2889. else [command_option, str(param)] if separator is None
  2890. else [f'{command_option}{separator}{param}'])
  2891. def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
  2892. param = params.get(param)
  2893. assert param in (True, False, None)
  2894. return cli_option({True: true_value, False: false_value}, command_option, param, separator)
  2895. def cli_valueless_option(params, command_option, param, expected_value=True):
  2896. return [command_option] if params.get(param) == expected_value else []
  2897. def cli_configuration_args(argdict, keys, default=[], use_compat=True):
  2898. if isinstance(argdict, (list, tuple)): # for backward compatibility
  2899. if use_compat:
  2900. return argdict
  2901. else:
  2902. argdict = None
  2903. if argdict is None:
  2904. return default
  2905. assert isinstance(argdict, dict)
  2906. assert isinstance(keys, (list, tuple))
  2907. for key_list in keys:
  2908. arg_list = list(filter(
  2909. lambda x: x is not None,
  2910. [argdict.get(key.lower()) for key in variadic(key_list)]))
  2911. if arg_list:
  2912. return [arg for args in arg_list for arg in args]
  2913. return default
  2914. def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
  2915. main_key, exe = main_key.lower(), exe.lower()
  2916. root_key = exe if main_key == exe else f'{main_key}+{exe}'
  2917. keys = [f'{root_key}{k}' for k in (keys or [''])]
  2918. if root_key in keys:
  2919. if main_key != exe:
  2920. keys.append((main_key, exe))
  2921. keys.append('default')
  2922. else:
  2923. use_compat = False
  2924. return cli_configuration_args(argdict, keys, default, use_compat)
  2925. class ISO639Utils:
  2926. # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
  2927. _lang_map = {
  2928. 'aa': 'aar',
  2929. 'ab': 'abk',
  2930. 'ae': 'ave',
  2931. 'af': 'afr',
  2932. 'ak': 'aka',
  2933. 'am': 'amh',
  2934. 'an': 'arg',
  2935. 'ar': 'ara',
  2936. 'as': 'asm',
  2937. 'av': 'ava',
  2938. 'ay': 'aym',
  2939. 'az': 'aze',
  2940. 'ba': 'bak',
  2941. 'be': 'bel',
  2942. 'bg': 'bul',
  2943. 'bh': 'bih',
  2944. 'bi': 'bis',
  2945. 'bm': 'bam',
  2946. 'bn': 'ben',
  2947. 'bo': 'bod',
  2948. 'br': 'bre',
  2949. 'bs': 'bos',
  2950. 'ca': 'cat',
  2951. 'ce': 'che',
  2952. 'ch': 'cha',
  2953. 'co': 'cos',
  2954. 'cr': 'cre',
  2955. 'cs': 'ces',
  2956. 'cu': 'chu',
  2957. 'cv': 'chv',
  2958. 'cy': 'cym',
  2959. 'da': 'dan',
  2960. 'de': 'deu',
  2961. 'dv': 'div',
  2962. 'dz': 'dzo',
  2963. 'ee': 'ewe',
  2964. 'el': 'ell',
  2965. 'en': 'eng',
  2966. 'eo': 'epo',
  2967. 'es': 'spa',
  2968. 'et': 'est',
  2969. 'eu': 'eus',
  2970. 'fa': 'fas',
  2971. 'ff': 'ful',
  2972. 'fi': 'fin',
  2973. 'fj': 'fij',
  2974. 'fo': 'fao',
  2975. 'fr': 'fra',
  2976. 'fy': 'fry',
  2977. 'ga': 'gle',
  2978. 'gd': 'gla',
  2979. 'gl': 'glg',
  2980. 'gn': 'grn',
  2981. 'gu': 'guj',
  2982. 'gv': 'glv',
  2983. 'ha': 'hau',
  2984. 'he': 'heb',
  2985. 'iw': 'heb', # Replaced by he in 1989 revision
  2986. 'hi': 'hin',
  2987. 'ho': 'hmo',
  2988. 'hr': 'hrv',
  2989. 'ht': 'hat',
  2990. 'hu': 'hun',
  2991. 'hy': 'hye',
  2992. 'hz': 'her',
  2993. 'ia': 'ina',
  2994. 'id': 'ind',
  2995. 'in': 'ind', # Replaced by id in 1989 revision
  2996. 'ie': 'ile',
  2997. 'ig': 'ibo',
  2998. 'ii': 'iii',
  2999. 'ik': 'ipk',
  3000. 'io': 'ido',
  3001. 'is': 'isl',
  3002. 'it': 'ita',
  3003. 'iu': 'iku',
  3004. 'ja': 'jpn',
  3005. 'jv': 'jav',
  3006. 'ka': 'kat',
  3007. 'kg': 'kon',
  3008. 'ki': 'kik',
  3009. 'kj': 'kua',
  3010. 'kk': 'kaz',
  3011. 'kl': 'kal',
  3012. 'km': 'khm',
  3013. 'kn': 'kan',
  3014. 'ko': 'kor',
  3015. 'kr': 'kau',
  3016. 'ks': 'kas',
  3017. 'ku': 'kur',
  3018. 'kv': 'kom',
  3019. 'kw': 'cor',
  3020. 'ky': 'kir',
  3021. 'la': 'lat',
  3022. 'lb': 'ltz',
  3023. 'lg': 'lug',
  3024. 'li': 'lim',
  3025. 'ln': 'lin',
  3026. 'lo': 'lao',
  3027. 'lt': 'lit',
  3028. 'lu': 'lub',
  3029. 'lv': 'lav',
  3030. 'mg': 'mlg',
  3031. 'mh': 'mah',
  3032. 'mi': 'mri',
  3033. 'mk': 'mkd',
  3034. 'ml': 'mal',
  3035. 'mn': 'mon',
  3036. 'mr': 'mar',
  3037. 'ms': 'msa',
  3038. 'mt': 'mlt',
  3039. 'my': 'mya',
  3040. 'na': 'nau',
  3041. 'nb': 'nob',
  3042. 'nd': 'nde',
  3043. 'ne': 'nep',
  3044. 'ng': 'ndo',
  3045. 'nl': 'nld',
  3046. 'nn': 'nno',
  3047. 'no': 'nor',
  3048. 'nr': 'nbl',
  3049. 'nv': 'nav',
  3050. 'ny': 'nya',
  3051. 'oc': 'oci',
  3052. 'oj': 'oji',
  3053. 'om': 'orm',
  3054. 'or': 'ori',
  3055. 'os': 'oss',
  3056. 'pa': 'pan',
  3057. 'pe': 'per',
  3058. 'pi': 'pli',
  3059. 'pl': 'pol',
  3060. 'ps': 'pus',
  3061. 'pt': 'por',
  3062. 'qu': 'que',
  3063. 'rm': 'roh',
  3064. 'rn': 'run',
  3065. 'ro': 'ron',
  3066. 'ru': 'rus',
  3067. 'rw': 'kin',
  3068. 'sa': 'san',
  3069. 'sc': 'srd',
  3070. 'sd': 'snd',
  3071. 'se': 'sme',
  3072. 'sg': 'sag',
  3073. 'si': 'sin',
  3074. 'sk': 'slk',
  3075. 'sl': 'slv',
  3076. 'sm': 'smo',
  3077. 'sn': 'sna',
  3078. 'so': 'som',
  3079. 'sq': 'sqi',
  3080. 'sr': 'srp',
  3081. 'ss': 'ssw',
  3082. 'st': 'sot',
  3083. 'su': 'sun',
  3084. 'sv': 'swe',
  3085. 'sw': 'swa',
  3086. 'ta': 'tam',
  3087. 'te': 'tel',
  3088. 'tg': 'tgk',
  3089. 'th': 'tha',
  3090. 'ti': 'tir',
  3091. 'tk': 'tuk',
  3092. 'tl': 'tgl',
  3093. 'tn': 'tsn',
  3094. 'to': 'ton',
  3095. 'tr': 'tur',
  3096. 'ts': 'tso',
  3097. 'tt': 'tat',
  3098. 'tw': 'twi',
  3099. 'ty': 'tah',
  3100. 'ug': 'uig',
  3101. 'uk': 'ukr',
  3102. 'ur': 'urd',
  3103. 'uz': 'uzb',
  3104. 've': 'ven',
  3105. 'vi': 'vie',
  3106. 'vo': 'vol',
  3107. 'wa': 'wln',
  3108. 'wo': 'wol',
  3109. 'xh': 'xho',
  3110. 'yi': 'yid',
  3111. 'ji': 'yid', # Replaced by yi in 1989 revision
  3112. 'yo': 'yor',
  3113. 'za': 'zha',
  3114. 'zh': 'zho',
  3115. 'zu': 'zul',
  3116. }
  3117. @classmethod
  3118. def short2long(cls, code):
  3119. """Convert language code from ISO 639-1 to ISO 639-2/T"""
  3120. return cls._lang_map.get(code[:2])
  3121. @classmethod
  3122. def long2short(cls, code):
  3123. """Convert language code from ISO 639-2/T to ISO 639-1"""
  3124. for short_name, long_name in cls._lang_map.items():
  3125. if long_name == code:
  3126. return short_name
  3127. class ISO3166Utils:
  3128. # From http://data.okfn.org/data/core/country-list
  3129. _country_map = {
  3130. 'AF': 'Afghanistan',
  3131. 'AX': 'Åland Islands',
  3132. 'AL': 'Albania',
  3133. 'DZ': 'Algeria',
  3134. 'AS': 'American Samoa',
  3135. 'AD': 'Andorra',
  3136. 'AO': 'Angola',
  3137. 'AI': 'Anguilla',
  3138. 'AQ': 'Antarctica',
  3139. 'AG': 'Antigua and Barbuda',
  3140. 'AR': 'Argentina',
  3141. 'AM': 'Armenia',
  3142. 'AW': 'Aruba',
  3143. 'AU': 'Australia',
  3144. 'AT': 'Austria',
  3145. 'AZ': 'Azerbaijan',
  3146. 'BS': 'Bahamas',
  3147. 'BH': 'Bahrain',
  3148. 'BD': 'Bangladesh',
  3149. 'BB': 'Barbados',
  3150. 'BY': 'Belarus',
  3151. 'BE': 'Belgium',
  3152. 'BZ': 'Belize',
  3153. 'BJ': 'Benin',
  3154. 'BM': 'Bermuda',
  3155. 'BT': 'Bhutan',
  3156. 'BO': 'Bolivia, Plurinational State of',
  3157. 'BQ': 'Bonaire, Sint Eustatius and Saba',
  3158. 'BA': 'Bosnia and Herzegovina',
  3159. 'BW': 'Botswana',
  3160. 'BV': 'Bouvet Island',
  3161. 'BR': 'Brazil',
  3162. 'IO': 'British Indian Ocean Territory',
  3163. 'BN': 'Brunei Darussalam',
  3164. 'BG': 'Bulgaria',
  3165. 'BF': 'Burkina Faso',
  3166. 'BI': 'Burundi',
  3167. 'KH': 'Cambodia',
  3168. 'CM': 'Cameroon',
  3169. 'CA': 'Canada',
  3170. 'CV': 'Cape Verde',
  3171. 'KY': 'Cayman Islands',
  3172. 'CF': 'Central African Republic',
  3173. 'TD': 'Chad',
  3174. 'CL': 'Chile',
  3175. 'CN': 'China',
  3176. 'CX': 'Christmas Island',
  3177. 'CC': 'Cocos (Keeling) Islands',
  3178. 'CO': 'Colombia',
  3179. 'KM': 'Comoros',
  3180. 'CG': 'Congo',
  3181. 'CD': 'Congo, the Democratic Republic of the',
  3182. 'CK': 'Cook Islands',
  3183. 'CR': 'Costa Rica',
  3184. 'CI': 'Côte d\'Ivoire',
  3185. 'HR': 'Croatia',
  3186. 'CU': 'Cuba',
  3187. 'CW': 'Curaçao',
  3188. 'CY': 'Cyprus',
  3189. 'CZ': 'Czech Republic',
  3190. 'DK': 'Denmark',
  3191. 'DJ': 'Djibouti',
  3192. 'DM': 'Dominica',
  3193. 'DO': 'Dominican Republic',
  3194. 'EC': 'Ecuador',
  3195. 'EG': 'Egypt',
  3196. 'SV': 'El Salvador',
  3197. 'GQ': 'Equatorial Guinea',
  3198. 'ER': 'Eritrea',
  3199. 'EE': 'Estonia',
  3200. 'ET': 'Ethiopia',
  3201. 'FK': 'Falkland Islands (Malvinas)',
  3202. 'FO': 'Faroe Islands',
  3203. 'FJ': 'Fiji',
  3204. 'FI': 'Finland',
  3205. 'FR': 'France',
  3206. 'GF': 'French Guiana',
  3207. 'PF': 'French Polynesia',
  3208. 'TF': 'French Southern Territories',
  3209. 'GA': 'Gabon',
  3210. 'GM': 'Gambia',
  3211. 'GE': 'Georgia',
  3212. 'DE': 'Germany',
  3213. 'GH': 'Ghana',
  3214. 'GI': 'Gibraltar',
  3215. 'GR': 'Greece',
  3216. 'GL': 'Greenland',
  3217. 'GD': 'Grenada',
  3218. 'GP': 'Guadeloupe',
  3219. 'GU': 'Guam',
  3220. 'GT': 'Guatemala',
  3221. 'GG': 'Guernsey',
  3222. 'GN': 'Guinea',
  3223. 'GW': 'Guinea-Bissau',
  3224. 'GY': 'Guyana',
  3225. 'HT': 'Haiti',
  3226. 'HM': 'Heard Island and McDonald Islands',
  3227. 'VA': 'Holy See (Vatican City State)',
  3228. 'HN': 'Honduras',
  3229. 'HK': 'Hong Kong',
  3230. 'HU': 'Hungary',
  3231. 'IS': 'Iceland',
  3232. 'IN': 'India',
  3233. 'ID': 'Indonesia',
  3234. 'IR': 'Iran, Islamic Republic of',
  3235. 'IQ': 'Iraq',
  3236. 'IE': 'Ireland',
  3237. 'IM': 'Isle of Man',
  3238. 'IL': 'Israel',
  3239. 'IT': 'Italy',
  3240. 'JM': 'Jamaica',
  3241. 'JP': 'Japan',
  3242. 'JE': 'Jersey',
  3243. 'JO': 'Jordan',
  3244. 'KZ': 'Kazakhstan',
  3245. 'KE': 'Kenya',
  3246. 'KI': 'Kiribati',
  3247. 'KP': 'Korea, Democratic People\'s Republic of',
  3248. 'KR': 'Korea, Republic of',
  3249. 'KW': 'Kuwait',
  3250. 'KG': 'Kyrgyzstan',
  3251. 'LA': 'Lao People\'s Democratic Republic',
  3252. 'LV': 'Latvia',
  3253. 'LB': 'Lebanon',
  3254. 'LS': 'Lesotho',
  3255. 'LR': 'Liberia',
  3256. 'LY': 'Libya',
  3257. 'LI': 'Liechtenstein',
  3258. 'LT': 'Lithuania',
  3259. 'LU': 'Luxembourg',
  3260. 'MO': 'Macao',
  3261. 'MK': 'Macedonia, the Former Yugoslav Republic of',
  3262. 'MG': 'Madagascar',
  3263. 'MW': 'Malawi',
  3264. 'MY': 'Malaysia',
  3265. 'MV': 'Maldives',
  3266. 'ML': 'Mali',
  3267. 'MT': 'Malta',
  3268. 'MH': 'Marshall Islands',
  3269. 'MQ': 'Martinique',
  3270. 'MR': 'Mauritania',
  3271. 'MU': 'Mauritius',
  3272. 'YT': 'Mayotte',
  3273. 'MX': 'Mexico',
  3274. 'FM': 'Micronesia, Federated States of',
  3275. 'MD': 'Moldova, Republic of',
  3276. 'MC': 'Monaco',
  3277. 'MN': 'Mongolia',
  3278. 'ME': 'Montenegro',
  3279. 'MS': 'Montserrat',
  3280. 'MA': 'Morocco',
  3281. 'MZ': 'Mozambique',
  3282. 'MM': 'Myanmar',
  3283. 'NA': 'Namibia',
  3284. 'NR': 'Nauru',
  3285. 'NP': 'Nepal',
  3286. 'NL': 'Netherlands',
  3287. 'NC': 'New Caledonia',
  3288. 'NZ': 'New Zealand',
  3289. 'NI': 'Nicaragua',
  3290. 'NE': 'Niger',
  3291. 'NG': 'Nigeria',
  3292. 'NU': 'Niue',
  3293. 'NF': 'Norfolk Island',
  3294. 'MP': 'Northern Mariana Islands',
  3295. 'NO': 'Norway',
  3296. 'OM': 'Oman',
  3297. 'PK': 'Pakistan',
  3298. 'PW': 'Palau',
  3299. 'PS': 'Palestine, State of',
  3300. 'PA': 'Panama',
  3301. 'PG': 'Papua New Guinea',
  3302. 'PY': 'Paraguay',
  3303. 'PE': 'Peru',
  3304. 'PH': 'Philippines',
  3305. 'PN': 'Pitcairn',
  3306. 'PL': 'Poland',
  3307. 'PT': 'Portugal',
  3308. 'PR': 'Puerto Rico',
  3309. 'QA': 'Qatar',
  3310. 'RE': 'Réunion',
  3311. 'RO': 'Romania',
  3312. 'RU': 'Russian Federation',
  3313. 'RW': 'Rwanda',
  3314. 'BL': 'Saint Barthélemy',
  3315. 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
  3316. 'KN': 'Saint Kitts and Nevis',
  3317. 'LC': 'Saint Lucia',
  3318. 'MF': 'Saint Martin (French part)',
  3319. 'PM': 'Saint Pierre and Miquelon',
  3320. 'VC': 'Saint Vincent and the Grenadines',
  3321. 'WS': 'Samoa',
  3322. 'SM': 'San Marino',
  3323. 'ST': 'Sao Tome and Principe',
  3324. 'SA': 'Saudi Arabia',
  3325. 'SN': 'Senegal',
  3326. 'RS': 'Serbia',
  3327. 'SC': 'Seychelles',
  3328. 'SL': 'Sierra Leone',
  3329. 'SG': 'Singapore',
  3330. 'SX': 'Sint Maarten (Dutch part)',
  3331. 'SK': 'Slovakia',
  3332. 'SI': 'Slovenia',
  3333. 'SB': 'Solomon Islands',
  3334. 'SO': 'Somalia',
  3335. 'ZA': 'South Africa',
  3336. 'GS': 'South Georgia and the South Sandwich Islands',
  3337. 'SS': 'South Sudan',
  3338. 'ES': 'Spain',
  3339. 'LK': 'Sri Lanka',
  3340. 'SD': 'Sudan',
  3341. 'SR': 'Suriname',
  3342. 'SJ': 'Svalbard and Jan Mayen',
  3343. 'SZ': 'Swaziland',
  3344. 'SE': 'Sweden',
  3345. 'CH': 'Switzerland',
  3346. 'SY': 'Syrian Arab Republic',
  3347. 'TW': 'Taiwan, Province of China',
  3348. 'TJ': 'Tajikistan',
  3349. 'TZ': 'Tanzania, United Republic of',
  3350. 'TH': 'Thailand',
  3351. 'TL': 'Timor-Leste',
  3352. 'TG': 'Togo',
  3353. 'TK': 'Tokelau',
  3354. 'TO': 'Tonga',
  3355. 'TT': 'Trinidad and Tobago',
  3356. 'TN': 'Tunisia',
  3357. 'TR': 'Turkey',
  3358. 'TM': 'Turkmenistan',
  3359. 'TC': 'Turks and Caicos Islands',
  3360. 'TV': 'Tuvalu',
  3361. 'UG': 'Uganda',
  3362. 'UA': 'Ukraine',
  3363. 'AE': 'United Arab Emirates',
  3364. 'GB': 'United Kingdom',
  3365. 'US': 'United States',
  3366. 'UM': 'United States Minor Outlying Islands',
  3367. 'UY': 'Uruguay',
  3368. 'UZ': 'Uzbekistan',
  3369. 'VU': 'Vanuatu',
  3370. 'VE': 'Venezuela, Bolivarian Republic of',
  3371. 'VN': 'Viet Nam',
  3372. 'VG': 'Virgin Islands, British',
  3373. 'VI': 'Virgin Islands, U.S.',
  3374. 'WF': 'Wallis and Futuna',
  3375. 'EH': 'Western Sahara',
  3376. 'YE': 'Yemen',
  3377. 'ZM': 'Zambia',
  3378. 'ZW': 'Zimbabwe',
  3379. # Not ISO 3166 codes, but used for IP blocks
  3380. 'AP': 'Asia/Pacific Region',
  3381. 'EU': 'Europe',
  3382. }
  3383. @classmethod
  3384. def short2full(cls, code):
  3385. """Convert an ISO 3166-2 country code to the corresponding full name"""
  3386. return cls._country_map.get(code.upper())
  3387. class GeoUtils:
  3388. # Major IPv4 address blocks per country
  3389. _country_ip_map = {
  3390. 'AD': '46.172.224.0/19',
  3391. 'AE': '94.200.0.0/13',
  3392. 'AF': '149.54.0.0/17',
  3393. 'AG': '209.59.64.0/18',
  3394. 'AI': '204.14.248.0/21',
  3395. 'AL': '46.99.0.0/16',
  3396. 'AM': '46.70.0.0/15',
  3397. 'AO': '105.168.0.0/13',
  3398. 'AP': '182.50.184.0/21',
  3399. 'AQ': '23.154.160.0/24',
  3400. 'AR': '181.0.0.0/12',
  3401. 'AS': '202.70.112.0/20',
  3402. 'AT': '77.116.0.0/14',
  3403. 'AU': '1.128.0.0/11',
  3404. 'AW': '181.41.0.0/18',
  3405. 'AX': '185.217.4.0/22',
  3406. 'AZ': '5.197.0.0/16',
  3407. 'BA': '31.176.128.0/17',
  3408. 'BB': '65.48.128.0/17',
  3409. 'BD': '114.130.0.0/16',
  3410. 'BE': '57.0.0.0/8',
  3411. 'BF': '102.178.0.0/15',
  3412. 'BG': '95.42.0.0/15',
  3413. 'BH': '37.131.0.0/17',
  3414. 'BI': '154.117.192.0/18',
  3415. 'BJ': '137.255.0.0/16',
  3416. 'BL': '185.212.72.0/23',
  3417. 'BM': '196.12.64.0/18',
  3418. 'BN': '156.31.0.0/16',
  3419. 'BO': '161.56.0.0/16',
  3420. 'BQ': '161.0.80.0/20',
  3421. 'BR': '191.128.0.0/12',
  3422. 'BS': '24.51.64.0/18',
  3423. 'BT': '119.2.96.0/19',
  3424. 'BW': '168.167.0.0/16',
  3425. 'BY': '178.120.0.0/13',
  3426. 'BZ': '179.42.192.0/18',
  3427. 'CA': '99.224.0.0/11',
  3428. 'CD': '41.243.0.0/16',
  3429. 'CF': '197.242.176.0/21',
  3430. 'CG': '160.113.0.0/16',
  3431. 'CH': '85.0.0.0/13',
  3432. 'CI': '102.136.0.0/14',
  3433. 'CK': '202.65.32.0/19',
  3434. 'CL': '152.172.0.0/14',
  3435. 'CM': '102.244.0.0/14',
  3436. 'CN': '36.128.0.0/10',
  3437. 'CO': '181.240.0.0/12',
  3438. 'CR': '201.192.0.0/12',
  3439. 'CU': '152.206.0.0/15',
  3440. 'CV': '165.90.96.0/19',
  3441. 'CW': '190.88.128.0/17',
  3442. 'CY': '31.153.0.0/16',
  3443. 'CZ': '88.100.0.0/14',
  3444. 'DE': '53.0.0.0/8',
  3445. 'DJ': '197.241.0.0/17',
  3446. 'DK': '87.48.0.0/12',
  3447. 'DM': '192.243.48.0/20',
  3448. 'DO': '152.166.0.0/15',
  3449. 'DZ': '41.96.0.0/12',
  3450. 'EC': '186.68.0.0/15',
  3451. 'EE': '90.190.0.0/15',
  3452. 'EG': '156.160.0.0/11',
  3453. 'ER': '196.200.96.0/20',
  3454. 'ES': '88.0.0.0/11',
  3455. 'ET': '196.188.0.0/14',
  3456. 'EU': '2.16.0.0/13',
  3457. 'FI': '91.152.0.0/13',
  3458. 'FJ': '144.120.0.0/16',
  3459. 'FK': '80.73.208.0/21',
  3460. 'FM': '119.252.112.0/20',
  3461. 'FO': '88.85.32.0/19',
  3462. 'FR': '90.0.0.0/9',
  3463. 'GA': '41.158.0.0/15',
  3464. 'GB': '25.0.0.0/8',
  3465. 'GD': '74.122.88.0/21',
  3466. 'GE': '31.146.0.0/16',
  3467. 'GF': '161.22.64.0/18',
  3468. 'GG': '62.68.160.0/19',
  3469. 'GH': '154.160.0.0/12',
  3470. 'GI': '95.164.0.0/16',
  3471. 'GL': '88.83.0.0/19',
  3472. 'GM': '160.182.0.0/15',
  3473. 'GN': '197.149.192.0/18',
  3474. 'GP': '104.250.0.0/19',
  3475. 'GQ': '105.235.224.0/20',
  3476. 'GR': '94.64.0.0/13',
  3477. 'GT': '168.234.0.0/16',
  3478. 'GU': '168.123.0.0/16',
  3479. 'GW': '197.214.80.0/20',
  3480. 'GY': '181.41.64.0/18',
  3481. 'HK': '113.252.0.0/14',
  3482. 'HN': '181.210.0.0/16',
  3483. 'HR': '93.136.0.0/13',
  3484. 'HT': '148.102.128.0/17',
  3485. 'HU': '84.0.0.0/14',
  3486. 'ID': '39.192.0.0/10',
  3487. 'IE': '87.32.0.0/12',
  3488. 'IL': '79.176.0.0/13',
  3489. 'IM': '5.62.80.0/20',
  3490. 'IN': '117.192.0.0/10',
  3491. 'IO': '203.83.48.0/21',
  3492. 'IQ': '37.236.0.0/14',
  3493. 'IR': '2.176.0.0/12',
  3494. 'IS': '82.221.0.0/16',
  3495. 'IT': '79.0.0.0/10',
  3496. 'JE': '87.244.64.0/18',
  3497. 'JM': '72.27.0.0/17',
  3498. 'JO': '176.29.0.0/16',
  3499. 'JP': '133.0.0.0/8',
  3500. 'KE': '105.48.0.0/12',
  3501. 'KG': '158.181.128.0/17',
  3502. 'KH': '36.37.128.0/17',
  3503. 'KI': '103.25.140.0/22',
  3504. 'KM': '197.255.224.0/20',
  3505. 'KN': '198.167.192.0/19',
  3506. 'KP': '175.45.176.0/22',
  3507. 'KR': '175.192.0.0/10',
  3508. 'KW': '37.36.0.0/14',
  3509. 'KY': '64.96.0.0/15',
  3510. 'KZ': '2.72.0.0/13',
  3511. 'LA': '115.84.64.0/18',
  3512. 'LB': '178.135.0.0/16',
  3513. 'LC': '24.92.144.0/20',
  3514. 'LI': '82.117.0.0/19',
  3515. 'LK': '112.134.0.0/15',
  3516. 'LR': '102.183.0.0/16',
  3517. 'LS': '129.232.0.0/17',
  3518. 'LT': '78.56.0.0/13',
  3519. 'LU': '188.42.0.0/16',
  3520. 'LV': '46.109.0.0/16',
  3521. 'LY': '41.252.0.0/14',
  3522. 'MA': '105.128.0.0/11',
  3523. 'MC': '88.209.64.0/18',
  3524. 'MD': '37.246.0.0/16',
  3525. 'ME': '178.175.0.0/17',
  3526. 'MF': '74.112.232.0/21',
  3527. 'MG': '154.126.0.0/17',
  3528. 'MH': '117.103.88.0/21',
  3529. 'MK': '77.28.0.0/15',
  3530. 'ML': '154.118.128.0/18',
  3531. 'MM': '37.111.0.0/17',
  3532. 'MN': '49.0.128.0/17',
  3533. 'MO': '60.246.0.0/16',
  3534. 'MP': '202.88.64.0/20',
  3535. 'MQ': '109.203.224.0/19',
  3536. 'MR': '41.188.64.0/18',
  3537. 'MS': '208.90.112.0/22',
  3538. 'MT': '46.11.0.0/16',
  3539. 'MU': '105.16.0.0/12',
  3540. 'MV': '27.114.128.0/18',
  3541. 'MW': '102.70.0.0/15',
  3542. 'MX': '187.192.0.0/11',
  3543. 'MY': '175.136.0.0/13',
  3544. 'MZ': '197.218.0.0/15',
  3545. 'NA': '41.182.0.0/16',
  3546. 'NC': '101.101.0.0/18',
  3547. 'NE': '197.214.0.0/18',
  3548. 'NF': '203.17.240.0/22',
  3549. 'NG': '105.112.0.0/12',
  3550. 'NI': '186.76.0.0/15',
  3551. 'NL': '145.96.0.0/11',
  3552. 'NO': '84.208.0.0/13',
  3553. 'NP': '36.252.0.0/15',
  3554. 'NR': '203.98.224.0/19',
  3555. 'NU': '49.156.48.0/22',
  3556. 'NZ': '49.224.0.0/14',
  3557. 'OM': '5.36.0.0/15',
  3558. 'PA': '186.72.0.0/15',
  3559. 'PE': '186.160.0.0/14',
  3560. 'PF': '123.50.64.0/18',
  3561. 'PG': '124.240.192.0/19',
  3562. 'PH': '49.144.0.0/13',
  3563. 'PK': '39.32.0.0/11',
  3564. 'PL': '83.0.0.0/11',
  3565. 'PM': '70.36.0.0/20',
  3566. 'PR': '66.50.0.0/16',
  3567. 'PS': '188.161.0.0/16',
  3568. 'PT': '85.240.0.0/13',
  3569. 'PW': '202.124.224.0/20',
  3570. 'PY': '181.120.0.0/14',
  3571. 'QA': '37.210.0.0/15',
  3572. 'RE': '102.35.0.0/16',
  3573. 'RO': '79.112.0.0/13',
  3574. 'RS': '93.86.0.0/15',
  3575. 'RU': '5.136.0.0/13',
  3576. 'RW': '41.186.0.0/16',
  3577. 'SA': '188.48.0.0/13',
  3578. 'SB': '202.1.160.0/19',
  3579. 'SC': '154.192.0.0/11',
  3580. 'SD': '102.120.0.0/13',
  3581. 'SE': '78.64.0.0/12',
  3582. 'SG': '8.128.0.0/10',
  3583. 'SI': '188.196.0.0/14',
  3584. 'SK': '78.98.0.0/15',
  3585. 'SL': '102.143.0.0/17',
  3586. 'SM': '89.186.32.0/19',
  3587. 'SN': '41.82.0.0/15',
  3588. 'SO': '154.115.192.0/18',
  3589. 'SR': '186.179.128.0/17',
  3590. 'SS': '105.235.208.0/21',
  3591. 'ST': '197.159.160.0/19',
  3592. 'SV': '168.243.0.0/16',
  3593. 'SX': '190.102.0.0/20',
  3594. 'SY': '5.0.0.0/16',
  3595. 'SZ': '41.84.224.0/19',
  3596. 'TC': '65.255.48.0/20',
  3597. 'TD': '154.68.128.0/19',
  3598. 'TG': '196.168.0.0/14',
  3599. 'TH': '171.96.0.0/13',
  3600. 'TJ': '85.9.128.0/18',
  3601. 'TK': '27.96.24.0/21',
  3602. 'TL': '180.189.160.0/20',
  3603. 'TM': '95.85.96.0/19',
  3604. 'TN': '197.0.0.0/11',
  3605. 'TO': '175.176.144.0/21',
  3606. 'TR': '78.160.0.0/11',
  3607. 'TT': '186.44.0.0/15',
  3608. 'TV': '202.2.96.0/19',
  3609. 'TW': '120.96.0.0/11',
  3610. 'TZ': '156.156.0.0/14',
  3611. 'UA': '37.52.0.0/14',
  3612. 'UG': '102.80.0.0/13',
  3613. 'US': '6.0.0.0/8',
  3614. 'UY': '167.56.0.0/13',
  3615. 'UZ': '84.54.64.0/18',
  3616. 'VA': '212.77.0.0/19',
  3617. 'VC': '207.191.240.0/21',
  3618. 'VE': '186.88.0.0/13',
  3619. 'VG': '66.81.192.0/20',
  3620. 'VI': '146.226.0.0/16',
  3621. 'VN': '14.160.0.0/11',
  3622. 'VU': '202.80.32.0/20',
  3623. 'WF': '117.20.32.0/21',
  3624. 'WS': '202.4.32.0/19',
  3625. 'YE': '134.35.0.0/16',
  3626. 'YT': '41.242.116.0/22',
  3627. 'ZA': '41.0.0.0/11',
  3628. 'ZM': '102.144.0.0/13',
  3629. 'ZW': '102.177.192.0/18',
  3630. }
  3631. @classmethod
  3632. def random_ipv4(cls, code_or_block):
  3633. if len(code_or_block) == 2:
  3634. block = cls._country_ip_map.get(code_or_block.upper())
  3635. if not block:
  3636. return None
  3637. else:
  3638. block = code_or_block
  3639. addr, preflen = block.split('/')
  3640. addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
  3641. addr_max = addr_min | (0xffffffff >> int(preflen))
  3642. return str(socket.inet_ntoa(
  3643. struct.pack('!L', random.randint(addr_min, addr_max))))
  3644. # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
  3645. # released into Public Domain
  3646. # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
  3647. def long_to_bytes(n, blocksize=0):
  3648. """long_to_bytes(n:long, blocksize:int) : string
  3649. Convert a long integer to a byte string.
  3650. If optional blocksize is given and greater than zero, pad the front of the
  3651. byte string with binary zeros so that the length is a multiple of
  3652. blocksize.
  3653. """
  3654. # after much testing, this algorithm was deemed to be the fastest
  3655. s = b''
  3656. n = int(n)
  3657. while n > 0:
  3658. s = struct.pack('>I', n & 0xffffffff) + s
  3659. n = n >> 32
  3660. # strip off leading zeros
  3661. for i in range(len(s)):
  3662. if s[i] != b'\000'[0]:
  3663. break
  3664. else:
  3665. # only happens when n == 0
  3666. s = b'\000'
  3667. i = 0
  3668. s = s[i:]
  3669. # add back some pad bytes. this could be done more efficiently w.r.t. the
  3670. # de-padding being done above, but sigh...
  3671. if blocksize > 0 and len(s) % blocksize:
  3672. s = (blocksize - len(s) % blocksize) * b'\000' + s
  3673. return s
  3674. def bytes_to_long(s):
  3675. """bytes_to_long(string) : long
  3676. Convert a byte string to a long integer.
  3677. This is (essentially) the inverse of long_to_bytes().
  3678. """
  3679. acc = 0
  3680. length = len(s)
  3681. if length % 4:
  3682. extra = (4 - length % 4)
  3683. s = b'\000' * extra + s
  3684. length = length + extra
  3685. for i in range(0, length, 4):
  3686. acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
  3687. return acc
  3688. def ohdave_rsa_encrypt(data, exponent, modulus):
  3689. """
  3690. Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
  3691. Input:
  3692. data: data to encrypt, bytes-like object
  3693. exponent, modulus: parameter e and N of RSA algorithm, both integer
  3694. Output: hex string of encrypted data
  3695. Limitation: supports one block encryption only
  3696. """
  3697. payload = int(binascii.hexlify(data[::-1]), 16)
  3698. encrypted = pow(payload, exponent, modulus)
  3699. return f'{encrypted:x}'
  3700. def pkcs1pad(data, length):
  3701. """
  3702. Padding input data with PKCS#1 scheme
  3703. @param {int[]} data input data
  3704. @param {int} length target length
  3705. @returns {int[]} padded data
  3706. """
  3707. if len(data) > length - 11:
  3708. raise ValueError('Input data too long for PKCS#1 padding')
  3709. pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
  3710. return [0, 2, *pseudo_random, 0, *data]
  3711. def _base_n_table(n, table):
  3712. if not table and not n:
  3713. raise ValueError('Either table or n must be specified')
  3714. table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
  3715. if n and n != len(table):
  3716. raise ValueError(f'base {n} exceeds table length {len(table)}')
  3717. return table
  3718. def encode_base_n(num, n=None, table=None):
  3719. """Convert given int to a base-n string"""
  3720. table = _base_n_table(n, table)
  3721. if not num:
  3722. return table[0]
  3723. result, base = '', len(table)
  3724. while num:
  3725. result = table[num % base] + result
  3726. num = num // base
  3727. return result
  3728. def decode_base_n(string, n=None, table=None):
  3729. """Convert given base-n string to int"""
  3730. table = {char: index for index, char in enumerate(_base_n_table(n, table))}
  3731. result, base = 0, len(table)
  3732. for char in string:
  3733. result = result * base + table[char]
  3734. return result
  3735. def decode_packed_codes(code):
  3736. mobj = re.search(PACKED_CODES_RE, code)
  3737. obfuscated_code, base, count, symbols = mobj.groups()
  3738. base = int(base)
  3739. count = int(count)
  3740. symbols = symbols.split('|')
  3741. symbol_table = {}
  3742. while count:
  3743. count -= 1
  3744. base_n_count = encode_base_n(count, base)
  3745. symbol_table[base_n_count] = symbols[count] or base_n_count
  3746. return re.sub(
  3747. r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
  3748. obfuscated_code)
  3749. def caesar(s, alphabet, shift):
  3750. if shift == 0:
  3751. return s
  3752. l = len(alphabet)
  3753. return ''.join(
  3754. alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
  3755. for c in s)
  3756. def rot47(s):
  3757. return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
  3758. def parse_m3u8_attributes(attrib):
  3759. info = {}
  3760. for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
  3761. if val.startswith('"'):
  3762. val = val[1:-1]
  3763. info[key] = val
  3764. return info
  3765. def urshift(val, n):
  3766. return val >> n if val >= 0 else (val + 0x100000000) >> n
def write_xattr(path, key, value):
    """
    Store value as the extended attribute key of the file at path

    The first available mechanism is used:
        * on Windows (os.name == 'nt'): an NTFS Alternate Data Stream "path:key"
        * os.setxattr, or the imported xattr module
        * the "setfattr" or "xattr" executables

    @param path     path of the file to tag
    @param key      attribute name
    @param value    attribute value; written as-is by the first two mechanisms
                    and .decode()d before being passed to an executable
    @raises XAttrMetadataError      when writing fails (OSError or a nonzero
                                    exit status of the executable)
    @raises XAttrUnavailableError   when no mechanism is available
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if os.name == 'nt':
        # ':' separates the file name from the stream name
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command line argument, so it has to be text
    value = value.decode()
    try:
        # The two tools take their arguments differently:
        #   xattr -w KEY VALUE PATH    vs.    setfattr -n KEY -v VALUE PATH
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
  3812. def random_birthday(year_field, month_field, day_field):
  3813. start_date = dt.date(1950, 1, 1)
  3814. end_date = dt.date(1995, 12, 31)
  3815. offset = random.randint(0, (end_date - start_date).days)
  3816. random_date = start_date + dt.timedelta(offset)
  3817. return {
  3818. year_field: str(random_date.year),
  3819. month_field: str(random_date.month),
  3820. day_field: str(random_date.day),
  3821. }
  3822. def find_available_port(interface=''):
  3823. try:
  3824. with socket.socket() as sock:
  3825. sock.bind((interface, 0))
  3826. return sock.getsockname()[1]
  3827. except OSError:
  3828. return None
# Templates for internet shortcut files, which are plain text files.
# They are %-format strings taking a mapping: all of them use the "url" key,
# the desktop entry template additionally uses "filename".

# INI-style "[InternetShortcut]" file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# Apple property list (XML) with a single "URL" key
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# "[Desktop Entry]" file of type "Link"
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup of the templates above by link type name
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
  3857. def iri_to_uri(iri):
  3858. """
  3859. Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
  3860. The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
  3861. """
  3862. iri_parts = urllib.parse.urlparse(iri)
  3863. if '[' in iri_parts.netloc:
  3864. raise ValueError('IPv6 URIs are not, yet, supported.')
  3865. # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
  3866. # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
  3867. net_location = ''
  3868. if iri_parts.username:
  3869. net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
  3870. if iri_parts.password is not None:
  3871. net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
  3872. net_location += '@'
  3873. net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
  3874. # The 'idna' encoding produces ASCII text.
  3875. if iri_parts.port is not None and iri_parts.port != 80:
  3876. net_location += ':' + str(iri_parts.port)
  3877. return urllib.parse.urlunparse(
  3878. (iri_parts.scheme,
  3879. net_location,
  3880. urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
  3881. # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
  3882. urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
  3883. # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
  3884. urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
  3885. urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
  3886. # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
  3887. def to_high_limit_path(path):
  3888. if sys.platform in ['win32', 'cygwin']:
  3889. # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
  3890. return '\\\\?\\' + os.path.abspath(path)
  3891. return path
  3892. @partial_application
  3893. def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
  3894. val = traversal.traverse_obj(obj, *variadic(field))
  3895. if not val if ignore is NO_DEFAULT else val in variadic(ignore):
  3896. return default
  3897. return template % func(val)
  3898. def clean_podcast_url(url):
  3899. url = re.sub(r'''(?x)
  3900. (?:
  3901. (?:
  3902. chtbl\.com/track|
  3903. media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
  3904. play\.podtrac\.com|
  3905. chrt\.fm/track|
  3906. mgln\.ai/e
  3907. )(?:/[^/.]+)?|
  3908. (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
  3909. flex\.acast\.com|
  3910. pd(?:
  3911. cn\.co| # https://podcorn.com/analytics-prefix/
  3912. st\.fm # https://podsights.com/docs/
  3913. )/e|
  3914. [0-9]\.gum\.fm|
  3915. pscrb\.fm/rss/p
  3916. )/''', '', url)
  3917. return re.sub(r'^\w+://(\w+://)', r'\1', url)
  3918. _HEX_TABLE = '0123456789abcdef'
  3919. def random_uuidv4():
  3920. return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
  3921. def make_dir(path, to_screen=None):
  3922. try:
  3923. dn = os.path.dirname(path)
  3924. if dn:
  3925. os.makedirs(dn, exist_ok=True)
  3926. return True
  3927. except OSError as err:
  3928. if callable(to_screen) is not None:
  3929. to_screen(f'unable to create directory {err}')
  3930. return False
  3931. def get_executable_path():
  3932. from ..update import _get_variant_and_executable_path
  3933. return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
  3934. def get_user_config_dirs(package_name):
  3935. # .config (e.g. ~/.config/package_name)
  3936. xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
  3937. yield os.path.join(xdg_config_home, package_name)
  3938. # appdata (%APPDATA%/package_name)
  3939. appdata_dir = os.getenv('appdata')
  3940. if appdata_dir:
  3941. yield os.path.join(appdata_dir, package_name)
  3942. # home (~/.package_name)
  3943. yield os.path.join(compat_expanduser('~'), f'.{package_name}')
  3944. def get_system_config_dirs(package_name):
  3945. # /etc/package_name
  3946. yield os.path.join('/etc', package_name)
  3947. def time_seconds(**kwargs):
  3948. """
  3949. Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
  3950. """
  3951. return time.time() + dt.timedelta(**kwargs).total_seconds()
  3952. # create a JSON Web Signature (jws) with HS256 algorithm
  3953. # the resulting format is in JWS Compact Serialization
  3954. # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
  3955. # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
  3956. def jwt_encode_hs256(payload_data, key, headers={}):
  3957. header_data = {
  3958. 'alg': 'HS256',
  3959. 'typ': 'JWT',
  3960. }
  3961. if headers:
  3962. header_data.update(headers)
  3963. header_b64 = base64.b64encode(json.dumps(header_data).encode())
  3964. payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
  3965. h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
  3966. signature_b64 = base64.b64encode(h.digest())
  3967. return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
  3968. # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
  3969. def jwt_decode_hs256(jwt):
  3970. header_b64, payload_b64, signature_b64 = jwt.split('.')
  3971. # add trailing ='s that may have been stripped, superfluous ='s are ignored
  3972. return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
  3973. WINDOWS_VT_MODE = False if os.name == 'nt' else None
  3974. @functools.cache
  3975. def supports_terminal_sequences(stream):
  3976. if os.name == 'nt':
  3977. if not WINDOWS_VT_MODE:
  3978. return False
  3979. elif not os.getenv('TERM'):
  3980. return False
  3981. try:
  3982. return stream.isatty()
  3983. except BaseException:
  3984. return False
def windows_enable_vt_mode():
    """
    Turn on ENABLE_VIRTUAL_TERMINAL_PROCESSING for the console output

    Does nothing when get_windows_version() is below (10, 0, 10586).
    On success WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences() is cleared.
    Raises Exception if GetConsoleMode or SetConsoleMode reports failure.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported lazily; msvcrt is only available on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add the flag to the current mode instead of replacing the mode
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Results cached while the mode was still disabled are stale now
    supports_terminal_sequences.cache_clear()
  4010. _terminal_sequences_re = re.compile('\033\\[[^m]+m')
  4011. def remove_terminal_sequences(string):
  4012. return _terminal_sequences_re.sub('', string)
  4013. def number_of_digits(number):
  4014. return len('%d' % number)
  4015. def join_nonempty(*values, delim='-', from_dict=None):
  4016. if from_dict is not None:
  4017. values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
  4018. return delim.join(map(str, filter(None, values)))
  4019. def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
  4020. """
  4021. Find the largest format dimensions in terms of video width and, for each thumbnail:
  4022. * Modify the URL: Match the width with the provided regex and replace with the former width
  4023. * Update dimensions
  4024. This function is useful with video services that scale the provided thumbnails on demand
  4025. """
  4026. _keys = ('width', 'height')
  4027. max_dimensions = max(
  4028. (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
  4029. default=(0, 0))
  4030. if not max_dimensions[0]:
  4031. return thumbnails
  4032. return [
  4033. merge_dicts(
  4034. {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
  4035. dict(zip(_keys, max_dimensions)), thumbnail)
  4036. for thumbnail in thumbnails
  4037. ]
  4038. def parse_http_range(range):
  4039. """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
  4040. if not range:
  4041. return None, None, None
  4042. crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
  4043. if not crg:
  4044. return None, None, None
  4045. return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
  4046. def read_stdin(what):
  4047. if what:
  4048. eof = 'Ctrl+Z' if os.name == 'nt' else 'Ctrl+D'
  4049. write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
  4050. return sys.stdin
  4051. def determine_file_encoding(data):
  4052. """
  4053. Detect the text encoding used
  4054. @returns (encoding, bytes to skip)
  4055. """
  4056. # BOM marks are given priority over declarations
  4057. for bom, enc in BOMS:
  4058. if data.startswith(bom):
  4059. return enc, len(bom)
  4060. # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
  4061. # We ignore the endianness to get a good enough match
  4062. data = data.replace(b'\0', b'')
  4063. mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
  4064. return mobj.group(1).decode() if mobj else None, 0
  4065. class Config:
  4066. own_args = None
  4067. parsed_args = None
  4068. filename = None
  4069. __initialized = False
  4070. # Internal only, do not use! Hack to enable --plugin-dirs
  4071. # TODO(coletdjnz): remove when plugin globals system is implemented
  4072. _plugin_dirs = None
  4073. def __init__(self, parser, label=None):
  4074. self.parser, self.label = parser, label
  4075. self._loaded_paths, self.configs = set(), []
  4076. def init(self, args=None, filename=None):
  4077. assert not self.__initialized
  4078. self.own_args, self.filename = args, filename
  4079. return self.load_configs()
  4080. def load_configs(self):
  4081. directory = ''
  4082. if self.filename:
  4083. location = os.path.realpath(self.filename)
  4084. directory = os.path.dirname(location)
  4085. if location in self._loaded_paths:
  4086. return False
  4087. self._loaded_paths.add(location)
  4088. self.__initialized = True
  4089. opts, _ = self.parser.parse_known_args(self.own_args)
  4090. self.parsed_args = self.own_args
  4091. for location in opts.config_locations or []:
  4092. if location == '-':
  4093. if location in self._loaded_paths:
  4094. continue
  4095. self._loaded_paths.add(location)
  4096. self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
  4097. continue
  4098. location = os.path.join(directory, expand_path(location))
  4099. if os.path.isdir(location):
  4100. location = os.path.join(location, 'yt-dlp.conf')
  4101. if not os.path.exists(location):
  4102. self.parser.error(f'config location {location} does not exist')
  4103. self.append_config(self.read_file(location), location)
  4104. return True
  4105. def __str__(self):
  4106. label = join_nonempty(
  4107. self.label, 'config', f'"{self.filename}"' if self.filename else '',
  4108. delim=' ')
  4109. return join_nonempty(
  4110. self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
  4111. *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
  4112. delim='\n')
  4113. @staticmethod
  4114. def read_file(filename, default=[]):
  4115. try:
  4116. optionf = open(filename, 'rb')
  4117. except OSError:
  4118. return default # silently skip if file is not present
  4119. try:
  4120. enc, skip = determine_file_encoding(optionf.read(512))
  4121. optionf.seek(skip, io.SEEK_SET)
  4122. except OSError:
  4123. enc = None # silently skip read errors
  4124. try:
  4125. # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
  4126. contents = optionf.read().decode(enc or preferredencoding())
  4127. res = shlex.split(contents, comments=True)
  4128. except Exception as err:
  4129. raise ValueError(f'Unable to parse "{filename}": {err}')
  4130. finally:
  4131. optionf.close()
  4132. return res
  4133. @staticmethod
  4134. def hide_login_info(opts):
  4135. PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
  4136. eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
  4137. def _scrub_eq(o):
  4138. m = eqre.match(o)
  4139. if m:
  4140. return m.group('key') + '=PRIVATE'
  4141. else:
  4142. return o
  4143. opts = list(map(_scrub_eq, opts))
  4144. for idx, opt in enumerate(opts):
  4145. if opt in PRIVATE_OPTS and idx + 1 < len(opts):
  4146. opts[idx + 1] = 'PRIVATE'
  4147. return opts
  4148. def append_config(self, *args, label=None):
  4149. config = type(self)(self.parser, label)
  4150. config._loaded_paths = self._loaded_paths
  4151. if config.init(*args):
  4152. self.configs.append(config)
  4153. @property
  4154. def all_args(self):
  4155. for config in reversed(self.configs):
  4156. yield from config.all_args
  4157. yield from self.parsed_args or []
  4158. def parse_known_args(self, **kwargs):
  4159. return self.parser.parse_known_args(self.all_args, **kwargs)
  4160. def parse_args(self):
  4161. return self.parser.parse_args(self.all_args)
  4162. def merge_headers(*dicts):
  4163. """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
  4164. return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
  4165. def cached_method(f):
  4166. """Cache a method"""
  4167. signature = inspect.signature(f)
  4168. @functools.wraps(f)
  4169. def wrapper(self, *args, **kwargs):
  4170. bound_args = signature.bind(self, *args, **kwargs)
  4171. bound_args.apply_defaults()
  4172. key = tuple(bound_args.arguments.values())[1:]
  4173. cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
  4174. if key not in cache:
  4175. cache[key] = f(self, *args, **kwargs)
  4176. return cache[key]
  4177. return wrapper
  4178. class classproperty:
  4179. """property access for class methods with optional caching"""
  4180. def __new__(cls, func=None, *args, **kwargs):
  4181. if not func:
  4182. return functools.partial(cls, *args, **kwargs)
  4183. return super().__new__(cls)
  4184. def __init__(self, func, *, cache=False):
  4185. functools.update_wrapper(self, func)
  4186. self.func = func
  4187. self._cache = {} if cache else None
  4188. def __get__(self, _, cls):
  4189. if self._cache is None:
  4190. return self.func(cls)
  4191. elif cls not in self._cache:
  4192. self._cache[cls] = self.func(cls)
  4193. return self._cache[cls]
  4194. class function_with_repr:
  4195. def __init__(self, func, repr_=None):
  4196. functools.update_wrapper(self, func)
  4197. self.func, self.__repr = func, repr_
  4198. def __call__(self, *args, **kwargs):
  4199. return self.func(*args, **kwargs)
  4200. @classmethod
  4201. def set_repr(cls, repr_):
  4202. return functools.partial(cls, repr_=repr_)
  4203. def __repr__(self):
  4204. if self.__repr:
  4205. return self.__repr
  4206. return f'{self.func.__module__}.{self.func.__qualname__}'
  4207. class Namespace(types.SimpleNamespace):
  4208. """Immutable namespace"""
  4209. def __iter__(self):
  4210. return iter(self.__dict__.values())
  4211. @property
  4212. def items_(self):
  4213. return self.__dict__.items()
# File extensions grouped by media type
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` end up containing their `common_*` counterparts as well
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions (thumbnails, storyboards and subtitles are not included)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
  4227. class _UnsafeExtensionError(Exception):
  4228. """
  4229. Mitigation exception for uncommon/malicious file extensions
  4230. This should be caught in YoutubeDL.py alongside a warning
  4231. Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j
  4232. """
  4233. ALLOWED_EXTENSIONS = frozenset([
  4234. # internal
  4235. 'description',
  4236. 'json',
  4237. 'meta',
  4238. 'orig',
  4239. 'part',
  4240. 'temp',
  4241. 'uncut',
  4242. 'unknown_video',
  4243. 'ytdl',
  4244. # video
  4245. *MEDIA_EXTENSIONS.video,
  4246. 'asx',
  4247. 'ismv',
  4248. 'm2t',
  4249. 'm2ts',
  4250. 'm2v',
  4251. 'm4s',
  4252. 'mng',
  4253. 'mp2v',
  4254. 'mp4v',
  4255. 'mpe',
  4256. 'mpeg',
  4257. 'mpeg1',
  4258. 'mpeg2',
  4259. 'mpeg4',
  4260. 'mxf',
  4261. 'ogm',
  4262. 'qt',
  4263. 'rm',
  4264. 'swf',
  4265. 'ts',
  4266. 'vid',
  4267. 'vob',
  4268. 'vp9',
  4269. # audio
  4270. *MEDIA_EXTENSIONS.audio,
  4271. '3ga',
  4272. 'ac3',
  4273. 'adts',
  4274. 'aif',
  4275. 'au',
  4276. 'dts',
  4277. 'isma',
  4278. 'it',
  4279. 'mid',
  4280. 'mod',
  4281. 'mpga',
  4282. 'mp1',
  4283. 'mp2',
  4284. 'mp4a',
  4285. 'mpa',
  4286. 'ra',
  4287. 'shn',
  4288. 'xm',
  4289. # image
  4290. *MEDIA_EXTENSIONS.thumbnails,
  4291. 'avif',
  4292. 'bmp',
  4293. 'gif',
  4294. 'heic',
  4295. 'ico',
  4296. 'image',
  4297. 'jfif',
  4298. 'jng',
  4299. 'jpe',
  4300. 'jpeg',
  4301. 'jxl',
  4302. 'svg',
  4303. 'tif',
  4304. 'tiff',
  4305. 'wbmp',
  4306. # subtitle
  4307. *MEDIA_EXTENSIONS.subtitles,
  4308. 'dfxp',
  4309. 'fs',
  4310. 'ismt',
  4311. 'json3',
  4312. 'sami',
  4313. 'scc',
  4314. 'srv1',
  4315. 'srv2',
  4316. 'srv3',
  4317. 'ssa',
  4318. 'tt',
  4319. 'ttml',
  4320. 'xml',
  4321. # others
  4322. *MEDIA_EXTENSIONS.manifests,
  4323. *MEDIA_EXTENSIONS.storyboards,
  4324. 'desktop',
  4325. 'ism',
  4326. 'm3u',
  4327. 'sbv',
  4328. 'url',
  4329. 'webloc',
  4330. ])
  4331. def __init__(self, extension, /):
  4332. super().__init__(f'unsafe file extension: {extension!r}')
  4333. self.extension = extension
  4334. @classmethod
  4335. def sanitize_extension(cls, extension, /, *, prepend=False):
  4336. if extension is None:
  4337. return None
  4338. if '/' in extension or '\\' in extension:
  4339. raise cls(extension)
  4340. if not prepend:
  4341. _, _, last = extension.rpartition('.')
  4342. if last == 'bin':
  4343. extension = last = 'unknown_video'
  4344. if last.lower() not in cls.ALLOWED_EXTENSIONS:
  4345. raise cls(extension)
  4346. return extension
class RetryManager:
    """Iterable that yields itself once per attempt for as long as an error is reported

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    The loop ends after an attempt that did not set `retry.error`, or once
    `attempt` exceeds `retries`. After every attempt that set an error,
    the error callback is called with (error, attempt, retries).
    """
    # NO_DEFAULT as _error means "no error reported during the current attempt"
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # Extra keyword arguments are passed on to the callback on every call
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset the marker; the loop body sets `error` again if the attempt fails
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        @param e            the error of the failed attempt
        @param count        number of the failed attempt
        @param retries      maximum number of retries
        @param sleep_func   delay before the next attempt: either a value or a
                            callable that is called with n=<count - 1>
        @param info         callable used to report the delay
        @param warn         callable used to report the retry
        @param error        optional callable used to report giving up;
                            without it `e` is raised instead
        @param suffix       optional text appended to "Retrying"
        """
        if count > retries:
            # Out of retries
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
  4393. @partial_application
  4394. def make_archive_id(ie, video_id):
  4395. ie_key = ie if isinstance(ie, str) else ie.ie_key()
  4396. return f'{ie_key.lower()} {video_id}'
  4397. @partial_application
  4398. def truncate_string(s, left, right=0):
  4399. assert left > 3 and right >= 0
  4400. if s is None or len(s) <= left + right:
  4401. return s
  4402. return f'{s[:left - 3]}...{s[-right:] if right else ""}'
  4403. def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
  4404. assert 'all' in alias_dict, '"all" alias is required'
  4405. requested = list(start or [])
  4406. for val in options:
  4407. discard = val.startswith('-')
  4408. if discard:
  4409. val = val[1:]
  4410. if val in alias_dict:
  4411. val = alias_dict[val] if not discard else [
  4412. i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
  4413. # NB: Do not allow regex in aliases for performance
  4414. requested = orderedSet_from_options(val, alias_dict, start=requested)
  4415. continue
  4416. current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
  4417. else [val] if val in alias_dict['all'] else None)
  4418. if current is None:
  4419. raise ValueError(val)
  4420. if discard:
  4421. for item in current:
  4422. while item in requested:
  4423. requested.remove(item)
  4424. else:
  4425. requested.extend(current)
  4426. return orderedSet(requested)
  4427. # TODO: Rewrite
  4428. class FormatSorter:
  4429. regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
  4430. default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
  4431. 'res', 'fps', 'hdr:12', 'vcodec', 'channels', 'acodec',
  4432. 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
  4433. _prefer_vp9_sort = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
  4434. 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
  4435. 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')
  4436. ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
  4437. 'height', 'width', 'proto', 'vext', 'abr', 'aext',
  4438. 'fps', 'fs_approx', 'source', 'id')
  4439. settings = {
  4440. 'vcodec': {'type': 'ordered', 'regex': True,
  4441. 'order': ['av0?1', 'vp0?9.0?2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
  4442. 'acodec': {'type': 'ordered', 'regex': True,
  4443. 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
  4444. 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
  4445. 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
  4446. 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
  4447. 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
  4448. 'vext': {'type': 'ordered', 'field': 'video_ext',
  4449. 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
  4450. 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
  4451. 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
  4452. 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
  4453. 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
  4454. 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
  4455. 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
  4456. 'field': ('vcodec', 'acodec'),
  4457. 'function': lambda it: int(any(v != 'none' for v in it))},
  4458. 'ie_pref': {'priority': True, 'type': 'extractor'},
  4459. 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
  4460. 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
  4461. 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
  4462. 'quality': {'convert': 'float', 'default': -1},
  4463. 'filesize': {'convert': 'bytes'},
  4464. 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
  4465. 'id': {'convert': 'string', 'field': 'format_id'},
  4466. 'height': {'convert': 'float_none'},
  4467. 'width': {'convert': 'float_none'},
  4468. 'fps': {'convert': 'float_none'},
  4469. 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
  4470. 'tbr': {'convert': 'float_none'},
  4471. 'vbr': {'convert': 'float_none'},
  4472. 'abr': {'convert': 'float_none'},
  4473. 'asr': {'convert': 'float_none'},
  4474. 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
  4475. 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
  4476. 'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
  4477. 'function': lambda it: next(filter(None, it), None)},
  4478. 'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
  4479. 'function': lambda it: next(filter(None, it), None)},
  4480. 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
  4481. 'res': {'type': 'multiple', 'field': ('height', 'width'),
  4482. 'function': lambda it: min(filter(None, it), default=0)},
  4483. # Actual field names
  4484. 'format_id': {'type': 'alias', 'field': 'id'},
  4485. 'preference': {'type': 'alias', 'field': 'ie_pref'},
  4486. 'language_preference': {'type': 'alias', 'field': 'lang'},
  4487. 'source_preference': {'type': 'alias', 'field': 'source'},
  4488. 'protocol': {'type': 'alias', 'field': 'proto'},
  4489. 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
  4490. 'audio_channels': {'type': 'alias', 'field': 'channels'},
  4491. # Deprecated
  4492. 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
  4493. 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
  4494. 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
  4495. 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
  4496. 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
  4497. 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
  4498. 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
  4499. 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
  4500. 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
  4501. 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
  4502. 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
  4503. 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
  4504. 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
  4505. 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
  4506. 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  4507. 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  4508. 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  4509. 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  4510. 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  4511. 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  4512. }
  4513. def __init__(self, ydl, field_preference):
  4514. self.ydl = ydl
  4515. self._order = []
  4516. self.evaluate_params(self.ydl.params, field_preference)
  4517. if ydl.params.get('verbose'):
  4518. self.print_verbose_info(self.ydl.write_debug)
  4519. def _get_field_setting(self, field, key):
  4520. if field not in self.settings:
  4521. if key in ('forced', 'priority'):
  4522. return False
  4523. self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
  4524. 'deprecated and may be removed in a future version')
  4525. self.settings[field] = {}
  4526. prop_obj = self.settings[field]
  4527. if key not in prop_obj:
  4528. type_ = prop_obj.get('type')
  4529. if key == 'field':
  4530. default = 'preference' if type_ == 'extractor' else (field,) if type_ in ('combined', 'multiple') else field
  4531. elif key == 'convert':
  4532. default = 'order' if type_ == 'ordered' else 'float_string' if field else 'ignore'
  4533. else:
  4534. default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key)
  4535. prop_obj[key] = default
  4536. return prop_obj[key]
  4537. def _resolve_field_value(self, field, value, convert_none=False):
  4538. if value is None:
  4539. if not convert_none:
  4540. return None
  4541. else:
  4542. value = value.lower()
  4543. conversion = self._get_field_setting(field, 'convert')
  4544. if conversion == 'ignore':
  4545. return None
  4546. if conversion == 'string':
  4547. return value
  4548. elif conversion == 'float_none':
  4549. return float_or_none(value)
  4550. elif conversion == 'bytes':
  4551. return parse_bytes(value)
  4552. elif conversion == 'order':
  4553. order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
  4554. use_regex = self._get_field_setting(field, 'regex')
  4555. list_length = len(order_list)
  4556. empty_pos = order_list.index('') if '' in order_list else list_length + 1
  4557. if use_regex and value is not None:
  4558. for i, regex in enumerate(order_list):
  4559. if regex and re.match(regex, value):
  4560. return list_length - i
  4561. return list_length - empty_pos # not in list
  4562. else: # not regex or value = None
  4563. return list_length - (order_list.index(value) if value in order_list else empty_pos)
  4564. else:
  4565. if value.isnumeric():
  4566. return float(value)
  4567. else:
  4568. self.settings[field]['convert'] = 'string'
  4569. return value
  4570. def evaluate_params(self, params, sort_extractor):
  4571. self._use_free_order = params.get('prefer_free_formats', False)
  4572. self._sort_user = params.get('format_sort', [])
  4573. self._sort_extractor = sort_extractor
  4574. def add_item(field, reverse, closest, limit_text):
  4575. field = field.lower()
  4576. if field in self._order:
  4577. return
  4578. self._order.append(field)
  4579. limit = self._resolve_field_value(field, limit_text)
  4580. data = {
  4581. 'reverse': reverse,
  4582. 'closest': False if limit is None else closest,
  4583. 'limit_text': limit_text,
  4584. 'limit': limit}
  4585. if field in self.settings:
  4586. self.settings[field].update(data)
  4587. else:
  4588. self.settings[field] = data
  4589. sort_list = (
  4590. tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
  4591. + (tuple() if params.get('format_sort_force', False)
  4592. else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
  4593. + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
  4594. for item in sort_list:
  4595. match = re.match(self.regex, item)
  4596. if match is None:
  4597. raise ExtractorError(f'Invalid format sort string "{item}" given by extractor')
  4598. field = match.group('field')
  4599. if field is None:
  4600. continue
  4601. if self._get_field_setting(field, 'type') == 'alias':
  4602. alias, field = field, self._get_field_setting(field, 'field')
  4603. if self._get_field_setting(alias, 'deprecated'):
  4604. self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
  4605. f'be removed in a future version. Please use {field} instead')
  4606. reverse = match.group('reverse') is not None
  4607. closest = match.group('separator') == '~'
  4608. limit_text = match.group('limit')
  4609. has_limit = limit_text is not None
  4610. has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
  4611. has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
  4612. fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
  4613. limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
  4614. limit_count = len(limits)
  4615. for (i, f) in enumerate(fields):
  4616. add_item(f, reverse, closest,
  4617. limits[i] if i < limit_count
  4618. else limits[0] if has_limit and not has_multiple_limits
  4619. else None)
  4620. def print_verbose_info(self, write_debug):
  4621. if self._sort_user:
  4622. write_debug('Sort order given by user: {}'.format(', '.join(self._sort_user)))
  4623. if self._sort_extractor:
  4624. write_debug('Sort order given by extractor: {}'.format(', '.join(self._sort_extractor)))
  4625. write_debug('Formats sorted by: {}'.format(', '.join(['{}{}{}'.format(
  4626. '+' if self._get_field_setting(field, 'reverse') else '', field,
  4627. '{}{}({})'.format('~' if self._get_field_setting(field, 'closest') else ':',
  4628. self._get_field_setting(field, 'limit_text'),
  4629. self._get_field_setting(field, 'limit'))
  4630. if self._get_field_setting(field, 'limit_text') is not None else '')
  4631. for field in self._order if self._get_field_setting(field, 'visible')])))
  4632. def _calculate_field_preference_from_value(self, format_, field, type_, value):
  4633. reverse = self._get_field_setting(field, 'reverse')
  4634. closest = self._get_field_setting(field, 'closest')
  4635. limit = self._get_field_setting(field, 'limit')
  4636. if type_ == 'extractor':
  4637. maximum = self._get_field_setting(field, 'max')
  4638. if value is None or (maximum is not None and value >= maximum):
  4639. value = -1
  4640. elif type_ == 'boolean':
  4641. in_list = self._get_field_setting(field, 'in_list')
  4642. not_in_list = self._get_field_setting(field, 'not_in_list')
  4643. value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
  4644. elif type_ == 'ordered':
  4645. value = self._resolve_field_value(field, value, True)
  4646. # try to convert to number
  4647. val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
  4648. is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
  4649. if is_num:
  4650. value = val_num
  4651. return ((-10, 0) if value is None
  4652. else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
  4653. else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
  4654. else (0, value, 0) if not reverse and (limit is None or value <= limit)
  4655. else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
  4656. else (-1, value, 0))
  4657. def _calculate_field_preference(self, format_, field):
  4658. type_ = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
  4659. get_value = lambda f: format_.get(self._get_field_setting(f, 'field'))
  4660. if type_ == 'multiple':
  4661. type_ = 'field' # Only 'field' is allowed in multiple for now
  4662. actual_fields = self._get_field_setting(field, 'field')
  4663. value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
  4664. else:
  4665. value = get_value(field)
  4666. return self._calculate_field_preference_from_value(format_, field, type_, value)
  4667. @staticmethod
  4668. def _fill_sorting_fields(format):
  4669. # Determine missing protocol
  4670. if not format.get('protocol'):
  4671. format['protocol'] = determine_protocol(format)
  4672. # Determine missing ext
  4673. if not format.get('ext') and 'url' in format:
  4674. format['ext'] = determine_ext(format['url']).lower()
  4675. if format.get('vcodec') == 'none':
  4676. format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
  4677. format['video_ext'] = 'none'
  4678. else:
  4679. format['video_ext'] = format['ext']
  4680. format['audio_ext'] = 'none'
  4681. # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
  4682. # format['preference'] = -1000
  4683. if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
  4684. # HEVC-over-FLV is out-of-spec by FLV's original spec
  4685. # ref. https://trac.ffmpeg.org/ticket/6389
  4686. # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
  4687. format['preference'] = -100
  4688. # Determine missing bitrates
  4689. if format.get('vcodec') == 'none':
  4690. format['vbr'] = 0
  4691. if format.get('acodec') == 'none':
  4692. format['abr'] = 0
  4693. if not format.get('vbr') and format.get('vcodec') != 'none':
  4694. format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
  4695. if not format.get('abr') and format.get('acodec') != 'none':
  4696. format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
  4697. if not format.get('tbr'):
  4698. format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None
  4699. def calculate_preference(self, format):
  4700. self._fill_sorting_fields(format)
  4701. return tuple(self._calculate_field_preference(format, field) for field in self._order)
  4702. def filesize_from_tbr(tbr, duration):
  4703. """
  4704. @param tbr: Total bitrate in kbps (1000 bits/sec)
  4705. @param duration: Duration in seconds
  4706. @returns Filesize in bytes
  4707. """
  4708. if tbr is None or duration is None:
  4709. return None
  4710. return int(duration * tbr * (1000 / 8))
  4711. # XXX: Temporary
  4712. class _YDLLogger:
  4713. def __init__(self, ydl=None):
  4714. self._ydl = ydl
  4715. def debug(self, message):
  4716. if self._ydl:
  4717. self._ydl.write_debug(message)
  4718. def info(self, message):
  4719. if self._ydl:
  4720. self._ydl.to_screen(message)
  4721. def warning(self, message, *, once=False):
  4722. if self._ydl:
  4723. self._ydl.report_warning(message, once)
  4724. def error(self, message, *, is_error=True):
  4725. if self._ydl:
  4726. self._ydl.report_error(message, is_error=is_error)
  4727. def stdout(self, message):
  4728. if self._ydl:
  4729. self._ydl.to_stdout(message)
  4730. def stderr(self, message):
  4731. if self._ydl:
  4732. self._ydl.to_stderr(message)