_utils.py 199 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
757785779578057815782578357845785578657875788578957905791579257935794579557965797579857995800580158025803580458055806580758085809581058115812581358145815581658175818581958205821582258235824582558265827582858295830583158325833583458355836583758385839584058415842584358445845584658475848584958505851585258535854585558565857585858595860586158625863586458655866586758685869587058715872587358745875587658775878587958805881588258835884588558865887588858895890589158925893589458955896589758985899590059015902590359045905590659075908590959105911591259135914591559165917591859195920592159225923592459255926592759285929593059315932593359345935593659375938593959405941594259435944594559465947594859495950595159525953595459555956595759585959
  1. import asyncio
  2. import atexit
  3. import base64
  4. import binascii
  5. import calendar
  6. import codecs
  7. import collections
  8. import collections.abc
  9. import contextlib
  10. import datetime
  11. import email.header
  12. import email.utils
  13. import errno
  14. import gzip
  15. import hashlib
  16. import hmac
  17. import html.entities
  18. import html.parser
  19. import http.client
  20. import http.cookiejar
  21. import inspect
  22. import io
  23. import itertools
  24. import json
  25. import locale
  26. import math
  27. import mimetypes
  28. import operator
  29. import os
  30. import platform
  31. import random
  32. import re
  33. import shlex
  34. import socket
  35. import ssl
  36. import struct
  37. import subprocess
  38. import sys
  39. import tempfile
  40. import time
  41. import traceback
  42. import types
  43. import unicodedata
  44. import urllib.error
  45. import urllib.parse
  46. import urllib.request
  47. import xml.etree.ElementTree
  48. import zlib
  49. from . import traversal
  50. from ..compat import functools # isort: split
  51. from ..compat import (
  52. compat_etree_fromstring,
  53. compat_expanduser,
  54. compat_HTMLParseError,
  55. compat_os_name,
  56. compat_shlex_quote,
  57. )
  58. from ..dependencies import brotli, certifi, websockets, xattr
  59. from ..socks import ProxyType, sockssocket
# Drop the last dotted component of this module's name so that the module
# reports itself under the name of its parent.
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# Type of a compiled regular expression, obtained by compiling an empty
# pattern. This is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))
  63. def random_user_agent():
  64. _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  65. _CHROME_VERSIONS = (
  66. '90.0.4430.212',
  67. '90.0.4430.24',
  68. '90.0.4430.70',
  69. '90.0.4430.72',
  70. '90.0.4430.85',
  71. '90.0.4430.93',
  72. '91.0.4472.101',
  73. '91.0.4472.106',
  74. '91.0.4472.114',
  75. '91.0.4472.124',
  76. '91.0.4472.164',
  77. '91.0.4472.19',
  78. '91.0.4472.77',
  79. '92.0.4515.107',
  80. '92.0.4515.115',
  81. '92.0.4515.131',
  82. '92.0.4515.159',
  83. '92.0.4515.43',
  84. '93.0.4556.0',
  85. '93.0.4577.15',
  86. '93.0.4577.63',
  87. '93.0.4577.82',
  88. '94.0.4606.41',
  89. '94.0.4606.54',
  90. '94.0.4606.61',
  91. '94.0.4606.71',
  92. '94.0.4606.81',
  93. '94.0.4606.85',
  94. '95.0.4638.17',
  95. '95.0.4638.50',
  96. '95.0.4638.54',
  97. '95.0.4638.69',
  98. '95.0.4638.74',
  99. '96.0.4664.18',
  100. '96.0.4664.45',
  101. '96.0.4664.55',
  102. '96.0.4664.93',
  103. '97.0.4692.20',
  104. )
  105. return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
# Encoding names supported by this module. 'br' is appended only when the
# optional `brotli` dependency imported above is truthy.
# NOTE(review): presumably these are HTTP content encodings - confirm against the consumers.
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default header values. random_user_agent() is called once, while this
# dict literal is evaluated, so the User-Agent is chosen at import time.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}

# Fixed User-Agent strings, keyed by browser name.
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
class NO_DEFAULT:
    """Sentinel used as a parameter default to mean "no default was supplied".

    The class object itself is the marker; code compares against it with
    ``is`` so that None stays usable as a real default value.
    """
    pass
def IDENTITY(x):
    """Return x unchanged."""
    return x
# English month names in calendar order, January first.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language, keyed by a two-letter language code. Every list
# has twelve entries in calendar order, so index + 1 is the month number.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to an integer offset.
# NOTE(review): the values read as whole hours relative to UTC - confirm the
# unit against the code that consumes this table.
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
# needed for sanitizing filenames in restricted mode
# Maps each accented character of the first string to its ASCII replacement.
# The pairing is positional: zip() walks the accented string and the chained
# replacements in lockstep. Iterating a plain string yields one character at
# a time, so multi-character replacements ('AE', 'OE', 'TH', 'ss', ...) are
# wrapped in lists to keep them as single items.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
  151. DATE_FORMATS = (
  152. '%d %B %Y',
  153. '%d %b %Y',
  154. '%B %d %Y',
  155. '%B %dst %Y',
  156. '%B %dnd %Y',
  157. '%B %drd %Y',
  158. '%B %dth %Y',
  159. '%b %d %Y',
  160. '%b %dst %Y',
  161. '%b %dnd %Y',
  162. '%b %drd %Y',
  163. '%b %dth %Y',
  164. '%b %dst %Y %I:%M',
  165. '%b %dnd %Y %I:%M',
  166. '%b %drd %Y %I:%M',
  167. '%b %dth %Y %I:%M',
  168. '%Y %m %d',
  169. '%Y-%m-%d',
  170. '%Y.%m.%d.',
  171. '%Y/%m/%d',
  172. '%Y/%m/%d %H:%M',
  173. '%Y/%m/%d %H:%M:%S',
  174. '%Y%m%d%H%M',
  175. '%Y%m%d%H%M%S',
  176. '%Y%m%d',
  177. '%Y-%m-%d %H:%M',
  178. '%Y-%m-%d %H:%M:%S',
  179. '%Y-%m-%d %H:%M:%S.%f',
  180. '%Y-%m-%d %H:%M:%S:%f',
  181. '%d.%m.%Y %H:%M',
  182. '%d.%m.%Y %H.%M',
  183. '%Y-%m-%dT%H:%M:%SZ',
  184. '%Y-%m-%dT%H:%M:%S.%fZ',
  185. '%Y-%m-%dT%H:%M:%S.%f0Z',
  186. '%Y-%m-%dT%H:%M:%S',
  187. '%Y-%m-%dT%H:%M:%S.%f',
  188. '%Y-%m-%dT%H:%M',
  189. '%b %d %Y at %H:%M',
  190. '%b %d %Y at %H:%M:%S',
  191. '%B %d %Y at %H:%M',
  192. '%B %d %Y at %H:%M:%S',
  193. '%H:%M %d-%b-%Y',
  194. )
  195. DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
  196. DATE_FORMATS_DAY_FIRST.extend([
  197. '%d-%m-%Y',
  198. '%d.%m.%Y',
  199. '%d.%m.%y',
  200. '%d/%m/%Y',
  201. '%d/%m/%y',
  202. '%d/%m/%Y %H:%M:%S',
  203. '%d-%m-%Y %H:%M',
  204. ])
  205. DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
  206. DATE_FORMATS_MONTH_FIRST.extend([
  207. '%m-%d-%Y',
  208. '%m.%d.%Y',
  209. '%m/%d/%Y',
  210. '%m/%d/%y',
  211. '%m/%d/%Y %H:%M:%S',
  212. ])
# Matches `}('<payload>',<int>,<int>,'<words>'.split('|')` and captures the
# four arguments as groups 1-4.
# NOTE(review): presumably the tail of packer-style obfuscated JavaScript - confirm against callers.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script> element whose type attribute is application/ld+json
# (optionally quoted) and captures its body - a JSON object or array - in the
# named group `json_ld`. Case-insensitive; `.` also matches newlines.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
# Unsigned integer or decimal number, e.g. `12` or `12.5`.
NUMBER_RE = r'\d+(?:\.\d+)?'
  216. @functools.cache
  217. def preferredencoding():
  218. """Get preferred encoding.
  219. Returns the best encoding scheme for the system, based on
  220. locale.getpreferredencoding() and some further tweaks.
  221. """
  222. try:
  223. pref = locale.getpreferredencoding()
  224. 'TEST'.encode(pref)
  225. except Exception:
  226. pref = 'UTF-8'
  227. return pref
  228. def write_json_file(obj, fn):
  229. """ Encode obj as JSON and write it to fn, atomically if possible """
  230. tf = tempfile.NamedTemporaryFile(
  231. prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
  232. suffix='.tmp', delete=False, mode='w', encoding='utf-8')
  233. try:
  234. with tf:
  235. json.dump(obj, tf, ensure_ascii=False)
  236. if sys.platform == 'win32':
  237. # Need to remove existing file on Windows, else os.rename raises
  238. # WindowsError or FileExistsError.
  239. with contextlib.suppress(OSError):
  240. os.unlink(fn)
  241. with contextlib.suppress(OSError):
  242. mask = os.umask(0)
  243. os.umask(mask)
  244. os.chmod(tf.name, 0o666 & ~mask)
  245. os.rename(tf.name, fn)
  246. except Exception:
  247. with contextlib.suppress(OSError):
  248. os.remove(tf.name)
  249. raise
  250. def find_xpath_attr(node, xpath, key, val=None):
  251. """ Find the xpath xpath[@key=val] """
  252. assert re.match(r'^[a-zA-Z_-]+$', key)
  253. expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
  254. return node.find(expr)
  255. # On python2.6 the xml.etree.ElementTree.Element methods don't support
  256. # the namespace parameter
  257. def xpath_with_ns(path, ns_map):
  258. components = [c.split(':') for c in path.split('/')]
  259. replaced = []
  260. for c in components:
  261. if len(c) == 1:
  262. replaced.append(c[0])
  263. else:
  264. ns, tag = c
  265. replaced.append('{%s}%s' % (ns_map[ns], tag))
  266. return '/'.join(replaced)
  267. def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  268. def _find_xpath(xpath):
  269. return node.find(xpath)
  270. if isinstance(xpath, str):
  271. n = _find_xpath(xpath)
  272. else:
  273. for xp in xpath:
  274. n = _find_xpath(xp)
  275. if n is not None:
  276. break
  277. if n is None:
  278. if default is not NO_DEFAULT:
  279. return default
  280. elif fatal:
  281. name = xpath if name is None else name
  282. raise ExtractorError('Could not find XML element %s' % name)
  283. else:
  284. return None
  285. return n
  286. def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
  287. n = xpath_element(node, xpath, name, fatal=fatal, default=default)
  288. if n is None or n == default:
  289. return n
  290. if n.text is None:
  291. if default is not NO_DEFAULT:
  292. return default
  293. elif fatal:
  294. name = xpath if name is None else name
  295. raise ExtractorError('Could not find XML element\'s text %s' % name)
  296. else:
  297. return None
  298. return n.text
  299. def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
  300. n = find_xpath_attr(node, xpath, key)
  301. if n is None:
  302. if default is not NO_DEFAULT:
  303. return default
  304. elif fatal:
  305. name = f'{xpath}[@{key}]' if name is None else name
  306. raise ExtractorError('Could not find XML attribute %s' % name)
  307. else:
  308. return None
  309. return n.attrib[key]
  310. def get_element_by_id(id, html, **kwargs):
  311. """Return the content of the tag with the specified ID in the passed HTML document"""
  312. return get_element_by_attribute('id', id, html, **kwargs)
  313. def get_element_html_by_id(id, html, **kwargs):
  314. """Return the html of the tag with the specified ID in the passed HTML document"""
  315. return get_element_html_by_attribute('id', id, html, **kwargs)
  316. def get_element_by_class(class_name, html):
  317. """Return the content of the first tag with the specified class in the passed HTML document"""
  318. retval = get_elements_by_class(class_name, html)
  319. return retval[0] if retval else None
  320. def get_element_html_by_class(class_name, html):
  321. """Return the html of the first tag with the specified class in the passed HTML document"""
  322. retval = get_elements_html_by_class(class_name, html)
  323. return retval[0] if retval else None
  324. def get_element_by_attribute(attribute, value, html, **kwargs):
  325. retval = get_elements_by_attribute(attribute, value, html, **kwargs)
  326. return retval[0] if retval else None
  327. def get_element_html_by_attribute(attribute, value, html, **kargs):
  328. retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
  329. return retval[0] if retval else None
  330. def get_elements_by_class(class_name, html, **kargs):
  331. """Return the content of all tags with the specified class in the passed HTML document as a list"""
  332. return get_elements_by_attribute(
  333. 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
  334. html, escape_value=False)
  335. def get_elements_html_by_class(class_name, html):
  336. """Return the html of all tags with the specified class in the passed HTML document as a list"""
  337. return get_elements_html_by_attribute(
  338. 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
  339. html, escape_value=False)
  340. def get_elements_by_attribute(*args, **kwargs):
  341. """Return the content of the tag with the specified attribute in the passed HTML document"""
  342. return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  343. def get_elements_html_by_attribute(*args, **kwargs):
  344. """Return the html of the tag with the specified attribute in the passed HTML document"""
  345. return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  346. def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
  347. """
  348. Return the text (content) and the html (whole) of the tag with the specified
  349. attribute in the passed HTML document
  350. """
  351. if not value:
  352. return
  353. quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
  354. value = re.escape(value) if escape_value else value
  355. partial_element_re = rf'''(?x)
  356. <(?P<tag>{tag})
  357. (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
  358. \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
  359. '''
  360. for m in re.finditer(partial_element_re, html):
  361. content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
  362. yield (
  363. unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
  364. whole
  365. )
  366. class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
  367. """
  368. HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
  369. closing tag for the first opening tag it has encountered, and can be used
  370. as a context manager
  371. """
  372. class HTMLBreakOnClosingTagException(Exception):
  373. pass
  374. def __init__(self):
  375. self.tagstack = collections.deque()
  376. html.parser.HTMLParser.__init__(self)
  377. def __enter__(self):
  378. return self
  379. def __exit__(self, *_):
  380. self.close()
  381. def close(self):
  382. # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
  383. # so data remains buffered; we no longer have any interest in it, thus
  384. # override this method to discard it
  385. pass
  386. def handle_starttag(self, tag, _):
  387. self.tagstack.append(tag)
  388. def handle_endtag(self, tag):
  389. if not self.tagstack:
  390. raise compat_HTMLParseError('no tags in the stack')
  391. while self.tagstack:
  392. inner_tag = self.tagstack.pop()
  393. if inner_tag == tag:
  394. break
  395. else:
  396. raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
  397. if not self.tagstack:
  398. raise self.HTMLBreakOnClosingTagException()
  399. # XXX: This should be far less strict
  400. def get_element_text_and_html_by_tag(tag, html):
  401. """
  402. For the first element with the specified tag in the passed HTML document
  403. return its' content (text) and the whole element (html)
  404. """
  405. def find_or_raise(haystack, needle, exc):
  406. try:
  407. return haystack.index(needle)
  408. except ValueError:
  409. raise exc
  410. closing_tag = f'</{tag}>'
  411. whole_start = find_or_raise(
  412. html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
  413. content_start = find_or_raise(
  414. html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
  415. content_start += whole_start + 1
  416. with HTMLBreakOnClosingTagParser() as parser:
  417. parser.feed(html[whole_start:content_start])
  418. if not parser.tagstack or parser.tagstack[0] != tag:
  419. raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
  420. offset = content_start
  421. while offset < len(html):
  422. next_closing_tag_start = find_or_raise(
  423. html[offset:], closing_tag,
  424. compat_HTMLParseError(f'closing {tag} tag not found'))
  425. next_closing_tag_end = next_closing_tag_start + len(closing_tag)
  426. try:
  427. parser.feed(html[offset:offset + next_closing_tag_end])
  428. offset += next_closing_tag_end
  429. except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
  430. return html[content_start:offset + next_closing_tag_start], \
  431. html[whole_start:offset + next_closing_tag_end]
  432. raise compat_HTMLParseError('unexpected end of html')
  433. class HTMLAttributeParser(html.parser.HTMLParser):
  434. """Trivial HTML parser to gather the attributes for a single element"""
  435. def __init__(self):
  436. self.attrs = {}
  437. html.parser.HTMLParser.__init__(self)
  438. def handle_starttag(self, tag, attrs):
  439. self.attrs = dict(attrs)
  440. raise compat_HTMLParseError('done')
  441. class HTMLListAttrsParser(html.parser.HTMLParser):
  442. """HTML parser to gather the attributes for the elements of a list"""
  443. def __init__(self):
  444. html.parser.HTMLParser.__init__(self)
  445. self.items = []
  446. self._level = 0
  447. def handle_starttag(self, tag, attrs):
  448. if tag == 'li' and self._level == 0:
  449. self.items.append(dict(attrs))
  450. self._level += 1
  451. def handle_endtag(self, tag):
  452. self._level -= 1
  453. def extract_attributes(html_element):
  454. """Given a string for an HTML element such as
  455. <el
  456. a="foo" B="bar" c="&98;az" d=boz
  457. empty= noval entity="&amp;"
  458. sq='"' dq="'"
  459. >
  460. Decode and return a dictionary of attributes.
  461. {
  462. 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
  463. 'empty': '', 'noval': None, 'entity': '&',
  464. 'sq': '"', 'dq': '\''
  465. }.
  466. """
  467. parser = HTMLAttributeParser()
  468. with contextlib.suppress(compat_HTMLParseError):
  469. parser.feed(html_element)
  470. parser.close()
  471. return parser.attrs
  472. def parse_list(webpage):
  473. """Given a string for an series of HTML <li> elements,
  474. return a dictionary of their attributes"""
  475. parser = HTMLListAttrsParser()
  476. parser.feed(webpage)
  477. parser.close()
  478. return parser.items
  479. def clean_html(html):
  480. """Clean an HTML snippet into a readable string"""
  481. if html is None: # Convenience for sanitizing descriptions etc.
  482. return html
  483. html = re.sub(r'\s+', ' ', html)
  484. html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
  485. html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
  486. # Strip html tags
  487. html = re.sub('<.*?>', '', html)
  488. # Replace html entities
  489. html = unescapeHTML(html)
  490. return html.strip()
  491. class LenientJSONDecoder(json.JSONDecoder):
  492. # TODO: Write tests
  493. def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
  494. self.transform_source, self.ignore_extra = transform_source, ignore_extra
  495. self._close_attempts = 2 * close_objects
  496. super().__init__(*args, **kwargs)
  497. @staticmethod
  498. def _close_object(err):
  499. doc = err.doc[:err.pos]
  500. # We need to add comma first to get the correct error message
  501. if err.msg.startswith('Expecting \',\''):
  502. return doc + ','
  503. elif not doc.endswith(','):
  504. return
  505. if err.msg.startswith('Expecting property name'):
  506. return doc[:-1] + '}'
  507. elif err.msg.startswith('Expecting value'):
  508. return doc[:-1] + ']'
  509. def decode(self, s):
  510. if self.transform_source:
  511. s = self.transform_source(s)
  512. for attempt in range(self._close_attempts + 1):
  513. try:
  514. if self.ignore_extra:
  515. return self.raw_decode(s.lstrip())[0]
  516. return super().decode(s)
  517. except json.JSONDecodeError as e:
  518. if e.pos is None:
  519. raise
  520. elif attempt < self._close_attempts:
  521. s = self._close_object(e)
  522. if s is not None:
  523. continue
  524. raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
  525. assert False, 'Too many attempts to decode JSON'
  526. def sanitize_open(filename, open_mode):
  527. """Try to open the given filename, and slightly tweak it if this fails.
  528. Attempts to open the given filename. If this fails, it tries to change
  529. the filename slightly, step by step, until it's either able to open it
  530. or it fails and raises a final exception, like the standard open()
  531. function.
  532. It returns the tuple (stream, definitive_file_name).
  533. """
  534. if filename == '-':
  535. if sys.platform == 'win32':
  536. import msvcrt
  537. # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
  538. with contextlib.suppress(io.UnsupportedOperation):
  539. msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
  540. return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
  541. for attempt in range(2):
  542. try:
  543. try:
  544. if sys.platform == 'win32':
  545. # FIXME: An exclusive lock also locks the file from being read.
  546. # Since windows locks are mandatory, don't lock the file on windows (for now).
  547. # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
  548. raise LockingUnsupportedError()
  549. stream = locked_file(filename, open_mode, block=False).__enter__()
  550. except OSError:
  551. stream = open(filename, open_mode)
  552. return stream, filename
  553. except OSError as err:
  554. if attempt or err.errno in (errno.EACCES,):
  555. raise
  556. old_filename, filename = filename, sanitize_path(filename)
  557. if old_filename == filename:
  558. raise
  559. def timeconvert(timestr):
  560. """Convert RFC 2822 defined time string into system timestamp"""
  561. timestamp = None
  562. timetuple = email.utils.parsedate_tz(timestr)
  563. if timetuple is not None:
  564. timestamp = email.utils.mktime_tz(timetuple)
  565. return timestamp
  566. def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
  567. """Sanitizes a string so it could be used as part of a filename.
  568. @param restricted Use a stricter subset of allowed characters
  569. @param is_id Whether this is an ID that should be kept unchanged if possible.
  570. If unset, yt-dlp's new sanitization rules are in effect
  571. """
  572. if s == '':
  573. return ''
  574. def replace_insane(char):
  575. if restricted and char in ACCENT_CHARS:
  576. return ACCENT_CHARS[char]
  577. elif not restricted and char == '\n':
  578. return '\0 '
  579. elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
  580. # Replace with their full-width unicode counterparts
  581. return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
  582. elif char == '?' or ord(char) < 32 or ord(char) == 127:
  583. return ''
  584. elif char == '"':
  585. return '' if restricted else '\''
  586. elif char == ':':
  587. return '\0_\0-' if restricted else '\0 \0-'
  588. elif char in '\\/|*<>':
  589. return '\0_'
  590. if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
  591. return '\0_'
  592. return char
  593. # Replace look-alike Unicode glyphs
  594. if restricted and (is_id is NO_DEFAULT or not is_id):
  595. s = unicodedata.normalize('NFKC', s)
  596. s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
  597. result = ''.join(map(replace_insane, s))
  598. if is_id is NO_DEFAULT:
  599. result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
  600. STRIP_RE = r'(?:\0.|[ _-])*'
  601. result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
  602. result = result.replace('\0', '') or '_'
  603. if not is_id:
  604. while '__' in result:
  605. result = result.replace('__', '_')
  606. result = result.strip('_')
  607. # Common case of "Foreign band name - English song title"
  608. if restricted and result.startswith('-_'):
  609. result = result[2:]
  610. if result.startswith('-'):
  611. result = '_' + result[len('-'):]
  612. result = result.lstrip('.')
  613. if not result:
  614. result = '_'
  615. return result
  616. def sanitize_path(s, force=False):
  617. """Sanitizes and normalizes path on Windows"""
  618. if sys.platform == 'win32':
  619. force = False
  620. drive_or_unc, _ = os.path.splitdrive(s)
  621. elif force:
  622. drive_or_unc = ''
  623. else:
  624. return s
  625. norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
  626. if drive_or_unc:
  627. norm_path.pop(0)
  628. sanitized_path = [
  629. path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
  630. for path_part in norm_path]
  631. if drive_or_unc:
  632. sanitized_path.insert(0, drive_or_unc + os.path.sep)
  633. elif force and s and s[0] == os.path.sep:
  634. sanitized_path.insert(0, os.path.sep)
  635. return os.path.join(*sanitized_path)
  636. def sanitize_url(url, *, scheme='http'):
  637. # Prepend protocol-less URLs with `http:` scheme in order to mitigate
  638. # the number of unwanted failures due to missing protocol
  639. if url is None:
  640. return
  641. elif url.startswith('//'):
  642. return f'{scheme}:{url}'
  643. # Fix some common typos seen so far
  644. COMMON_TYPOS = (
  645. # https://github.com/ytdl-org/youtube-dl/issues/15649
  646. (r'^httpss://', r'https://'),
  647. # https://bx1.be/lives/direct-tv/
  648. (r'^rmtp([es]?)://', r'rtmp\1://'),
  649. )
  650. for mistake, fixup in COMMON_TYPOS:
  651. if re.match(mistake, url):
  652. return re.sub(mistake, fixup, url)
  653. return url
  654. def extract_basic_auth(url):
  655. parts = urllib.parse.urlsplit(url)
  656. if parts.username is None:
  657. return url, None
  658. url = urllib.parse.urlunsplit(parts._replace(netloc=(
  659. parts.hostname if parts.port is None
  660. else '%s:%d' % (parts.hostname, parts.port))))
  661. auth_payload = base64.b64encode(
  662. ('%s:%s' % (parts.username, parts.password or '')).encode())
  663. return url, f'Basic {auth_payload.decode()}'
  664. def sanitized_Request(url, *args, **kwargs):
  665. url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
  666. if auth_header is not None:
  667. headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
  668. headers['Authorization'] = auth_header
  669. return urllib.request.Request(url, *args, **kwargs)
  670. def expand_path(s):
  671. """Expand shell variables and ~"""
  672. return os.path.expandvars(compat_expanduser(s))
  673. def orderedSet(iterable, *, lazy=False):
  674. """Remove all duplicates from the input iterable"""
  675. def _iter():
  676. seen = [] # Do not use set since the items can be unhashable
  677. for x in iterable:
  678. if x not in seen:
  679. seen.append(x)
  680. yield x
  681. return _iter() if lazy else list(_iter())
  682. def _htmlentity_transform(entity_with_semicolon):
  683. """Transforms an HTML entity to a character."""
  684. entity = entity_with_semicolon[:-1]
  685. # Known non-numeric HTML entity
  686. if entity in html.entities.name2codepoint:
  687. return chr(html.entities.name2codepoint[entity])
  688. # TODO: HTML5 allows entities without a semicolon.
  689. # E.g. '&Eacuteric' should be decoded as 'Éric'.
  690. if entity_with_semicolon in html.entities.html5:
  691. return html.entities.html5[entity_with_semicolon]
  692. mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
  693. if mobj is not None:
  694. numstr = mobj.group(1)
  695. if numstr.startswith('x'):
  696. base = 16
  697. numstr = '0%s' % numstr
  698. else:
  699. base = 10
  700. # See https://github.com/ytdl-org/youtube-dl/issues/7518
  701. with contextlib.suppress(ValueError):
  702. return chr(int(numstr, base))
  703. # Unknown entity in name, return its literal representation
  704. return '&%s;' % entity
  705. def unescapeHTML(s):
  706. if s is None:
  707. return None
  708. assert isinstance(s, str)
  709. return re.sub(
  710. r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  711. def escapeHTML(text):
  712. return (
  713. text
  714. .replace('&', '&amp;')
  715. .replace('<', '&lt;')
  716. .replace('>', '&gt;')
  717. .replace('"', '&quot;')
  718. .replace("'", '&#39;')
  719. )
  720. def process_communicate_or_kill(p, *args, **kwargs):
  721. deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
  722. f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
  723. return Popen.communicate_or_kill(p, *args, **kwargs)
  724. class Popen(subprocess.Popen):
  725. if sys.platform == 'win32':
  726. _startupinfo = subprocess.STARTUPINFO()
  727. _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
  728. else:
  729. _startupinfo = None
  730. @staticmethod
  731. def _fix_pyinstaller_ld_path(env):
  732. """Restore LD_LIBRARY_PATH when using PyInstaller
  733. Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
  734. https://github.com/yt-dlp/yt-dlp/issues/4573
  735. """
  736. if not hasattr(sys, '_MEIPASS'):
  737. return
  738. def _fix(key):
  739. orig = env.get(f'{key}_ORIG')
  740. if orig is None:
  741. env.pop(key, None)
  742. else:
  743. env[key] = orig
  744. _fix('LD_LIBRARY_PATH') # Linux
  745. _fix('DYLD_LIBRARY_PATH') # macOS
  746. def __init__(self, *args, env=None, text=False, **kwargs):
  747. if env is None:
  748. env = os.environ.copy()
  749. self._fix_pyinstaller_ld_path(env)
  750. self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
  751. if text is True:
  752. kwargs['universal_newlines'] = True # For 3.6 compatibility
  753. kwargs.setdefault('encoding', 'utf-8')
  754. kwargs.setdefault('errors', 'replace')
  755. super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
  756. def communicate_or_kill(self, *args, **kwargs):
  757. try:
  758. return self.communicate(*args, **kwargs)
  759. except BaseException: # Including KeyboardInterrupt
  760. self.kill(timeout=None)
  761. raise
  762. def kill(self, *, timeout=0):
  763. super().kill()
  764. if timeout != 0:
  765. self.wait(timeout=timeout)
  766. @classmethod
  767. def run(cls, *args, timeout=None, **kwargs):
  768. with cls(*args, **kwargs) as proc:
  769. default = '' if proc.__text_mode else b''
  770. stdout, stderr = proc.communicate_or_kill(timeout=timeout)
  771. return stdout or default, stderr or default, proc.returncode
  772. def encodeArgument(s):
  773. # Legacy code that uses byte strings
  774. # Uncomment the following line after fixing all post processors
  775. # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
  776. return s if isinstance(s, str) else s.decode('ascii')
  777. _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
  778. def timetuple_from_msec(msec):
  779. secs, msec = divmod(msec, 1000)
  780. mins, secs = divmod(secs, 60)
  781. hrs, mins = divmod(mins, 60)
  782. return _timetuple(hrs, mins, secs, msec)
  783. def formatSeconds(secs, delim=':', msec=False):
  784. time = timetuple_from_msec(secs * 1000)
  785. if time.hours:
  786. ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
  787. elif time.minutes:
  788. ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
  789. else:
  790. ret = '%d' % time.seconds
  791. return '%s.%03d' % (ret, time.milliseconds) if msec else ret
  792. def _ssl_load_windows_store_certs(ssl_context, storename):
  793. # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
  794. try:
  795. certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
  796. if encoding == 'x509_asn' and (
  797. trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
  798. except PermissionError:
  799. return
  800. for cert in certs:
  801. with contextlib.suppress(ssl.SSLError):
  802. ssl_context.load_verify_locations(cadata=cert)
  803. def make_HTTPS_handler(params, **kwargs):
  804. opts_check_certificate = not params.get('nocheckcertificate')
  805. context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
  806. context.check_hostname = opts_check_certificate
  807. if params.get('legacyserverconnect'):
  808. context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
  809. # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
  810. context.set_ciphers('DEFAULT')
  811. elif (
  812. sys.version_info < (3, 10)
  813. and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
  814. and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
  815. ):
  816. # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
  817. # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
  818. # in some situations [2][3].
  819. # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
  820. # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
  821. # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
  822. # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
  823. # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
  824. # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
  825. # 4. https://peps.python.org/pep-0644/
  826. # 5. https://peps.python.org/pep-0644/#libressl-support
  827. # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
  828. context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
  829. context.minimum_version = ssl.TLSVersion.TLSv1_2
  830. context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
  831. if opts_check_certificate:
  832. if certifi and 'no-certifi' not in params.get('compat_opts', []):
  833. context.load_verify_locations(cafile=certifi.where())
  834. else:
  835. try:
  836. context.load_default_certs()
  837. # Work around the issue in load_default_certs when there are bad certificates. See:
  838. # https://github.com/yt-dlp/yt-dlp/issues/1060,
  839. # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
  840. except ssl.SSLError:
  841. # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
  842. if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
  843. for storename in ('CA', 'ROOT'):
  844. _ssl_load_windows_store_certs(context, storename)
  845. context.set_default_verify_paths()
  846. client_certfile = params.get('client_certificate')
  847. if client_certfile:
  848. try:
  849. context.load_cert_chain(
  850. client_certfile, keyfile=params.get('client_certificate_key'),
  851. password=params.get('client_certificate_password'))
  852. except ssl.SSLError:
  853. raise YoutubeDLError('Unable to load client certificate')
  854. # Some servers may reject requests if ALPN extension is not sent. See:
  855. # https://github.com/python/cpython/issues/85140
  856. # https://github.com/yt-dlp/yt-dlp/issues/3878
  857. with contextlib.suppress(NotImplementedError):
  858. context.set_alpn_protocols(['http/1.1'])
  859. return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
  860. def bug_reports_message(before=';'):
  861. from ..update import REPOSITORY
  862. msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
  863. 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
  864. before = before.rstrip()
  865. if not before or before.endswith(('.', '!', '?')):
  866. msg = msg[0].title() + msg[1:]
  867. return (before + ' ' if before else '') + msg
  868. class YoutubeDLError(Exception):
  869. """Base exception for YoutubeDL errors."""
  870. msg = None
  871. def __init__(self, msg=None):
  872. if msg is not None:
  873. self.msg = msg
  874. elif self.msg is None:
  875. self.msg = type(self).__name__
  876. super().__init__(self.msg)
  877. network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
  878. if hasattr(ssl, 'CertificateError'):
  879. network_exceptions.append(ssl.CertificateError)
  880. network_exceptions = tuple(network_exceptions)
  881. class ExtractorError(YoutubeDLError):
  882. """Error during info extraction."""
  883. def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
  884. """ tb, if given, is the original traceback (so that it can be printed out).
  885. If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
  886. """
  887. if sys.exc_info()[0] in network_exceptions:
  888. expected = True
  889. self.orig_msg = str(msg)
  890. self.traceback = tb
  891. self.expected = expected
  892. self.cause = cause
  893. self.video_id = video_id
  894. self.ie = ie
  895. self.exc_info = sys.exc_info() # preserve original exception
  896. if isinstance(self.exc_info[1], ExtractorError):
  897. self.exc_info = self.exc_info[1].exc_info
  898. super().__init__(self.__msg)
  899. @property
  900. def __msg(self):
  901. return ''.join((
  902. format_field(self.ie, None, '[%s] '),
  903. format_field(self.video_id, None, '%s: '),
  904. self.orig_msg,
  905. format_field(self.cause, None, ' (caused by %r)'),
  906. '' if self.expected else bug_reports_message()))
  907. def format_traceback(self):
  908. return join_nonempty(
  909. self.traceback and ''.join(traceback.format_tb(self.traceback)),
  910. self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
  911. delim='\n') or None
  912. def __setattr__(self, name, value):
  913. super().__setattr__(name, value)
  914. if getattr(self, 'msg', None) and name not in ('msg', 'args'):
  915. self.msg = self.__msg or type(self).__name__
  916. self.args = (self.msg, ) # Cannot be property
  917. class UnsupportedError(ExtractorError):
  918. def __init__(self, url):
  919. super().__init__(
  920. 'Unsupported URL: %s' % url, expected=True)
  921. self.url = url
  922. class RegexNotFoundError(ExtractorError):
  923. """Error when a regex didn't match"""
  924. pass
  925. class GeoRestrictedError(ExtractorError):
  926. """Geographic restriction Error exception.
  927. This exception may be thrown when a video is not available from your
  928. geographic location due to geographic restrictions imposed by a website.
  929. """
  930. def __init__(self, msg, countries=None, **kwargs):
  931. kwargs['expected'] = True
  932. super().__init__(msg, **kwargs)
  933. self.countries = countries
  934. class UserNotLive(ExtractorError):
  935. """Error when a channel/user is not live"""
  936. def __init__(self, msg=None, **kwargs):
  937. kwargs['expected'] = True
  938. super().__init__(msg or 'The channel is not currently live', **kwargs)
  939. class DownloadError(YoutubeDLError):
  940. """Download Error exception.
  941. This exception may be thrown by FileDownloader objects if they are not
  942. configured to continue on errors. They will contain the appropriate
  943. error message.
  944. """
  945. def __init__(self, msg, exc_info=None):
  946. """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
  947. super().__init__(msg)
  948. self.exc_info = exc_info
  949. class EntryNotInPlaylist(YoutubeDLError):
  950. """Entry not in playlist exception.
  951. This exception will be thrown by YoutubeDL when a requested entry
  952. is not found in the playlist info_dict
  953. """
  954. msg = 'Entry not found in info'
  955. class SameFileError(YoutubeDLError):
  956. """Same File exception.
  957. This exception will be thrown by FileDownloader objects if they detect
  958. multiple files would have to be downloaded to the same file on disk.
  959. """
  960. msg = 'Fixed output name but more than one file to download'
  961. def __init__(self, filename=None):
  962. if filename is not None:
  963. self.msg += f': {filename}'
  964. super().__init__(self.msg)
  965. class PostProcessingError(YoutubeDLError):
  966. """Post Processing exception.
  967. This exception may be raised by PostProcessor's .run() method to
  968. indicate an error in the postprocessing task.
  969. """
  970. class DownloadCancelled(YoutubeDLError):
  971. """ Exception raised when the download queue should be interrupted """
  972. msg = 'The download was cancelled'
  973. class ExistingVideoReached(DownloadCancelled):
  974. """ --break-on-existing triggered """
  975. msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
  976. class RejectedVideoReached(DownloadCancelled):
  977. """ --break-match-filter triggered """
  978. msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
  979. class MaxDownloadsReached(DownloadCancelled):
  980. """ --max-downloads limit has been reached. """
  981. msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
  982. class ReExtractInfo(YoutubeDLError):
  983. """ Video info needs to be re-extracted. """
  984. def __init__(self, msg, expected=False):
  985. super().__init__(msg)
  986. self.expected = expected
  987. class ThrottledDownload(ReExtractInfo):
  988. """ Download speed below --throttled-rate. """
  989. msg = 'The download speed is below throttle limit'
  990. def __init__(self):
  991. super().__init__(self.msg, expected=False)
  992. class UnavailableVideoError(YoutubeDLError):
  993. """Unavailable Format exception.
  994. This exception will be thrown when a video is requested
  995. in a format that is not available for that video.
  996. """
  997. msg = 'Unable to download video'
  998. def __init__(self, err=None):
  999. if err is not None:
  1000. self.msg += f': {err}'
  1001. super().__init__(self.msg)
  1002. class ContentTooShortError(YoutubeDLError):
  1003. """Content Too Short exception.
  1004. This exception may be raised by FileDownloader objects when a file they
  1005. download is too small for what the server announced first, indicating
  1006. the connection was probably interrupted.
  1007. """
  1008. def __init__(self, downloaded, expected):
  1009. super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
  1010. # Both in bytes
  1011. self.downloaded = downloaded
  1012. self.expected = expected
  1013. class XAttrMetadataError(YoutubeDLError):
  1014. def __init__(self, code=None, msg='Unknown error'):
  1015. super().__init__(msg)
  1016. self.code = code
  1017. self.msg = msg
  1018. # Parsing code and msg
  1019. if (self.code in (errno.ENOSPC, errno.EDQUOT)
  1020. or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
  1021. self.reason = 'NO_SPACE'
  1022. elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
  1023. self.reason = 'VALUE_TOO_LONG'
  1024. else:
  1025. self.reason = 'NOT_SUPPORTED'
  1026. class XAttrUnavailableError(YoutubeDLError):
  1027. pass
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and, if configured, bind it to a source address.

    @param ydl_handler  Handler whose `_params` mapping is read for 'source_address'
    @param http_class   Connection class to instantiate with *args/**kwargs
    @param is_https     Not used in this function body
    @returns            The connection object

    When 'source_address' is set, the connection gets `source_address = (address, 0)`
    and, if it has a `_create_connection` attribute, a replacement that only
    connects to remote addresses of the same IP family as the source address.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NB: the `source_address` parameter shadows the outer variable; it is
            # indexed below, so it is expected to be a (host, port) pair
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source host selects IPv4; anything else is treated as IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try every candidate in turn, remembering the last failure
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        # Port 0 is passed alongside the configured source host
        hc.source_address = (source_address, 0)

    return hc
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store `params`; remaining arguments go to urllib's HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, going through a SOCKS proxy if 'Ytdl-socks-proxy' is set on it."""
        conn_class = http.client.HTTPConnection

        # The internal proxy header is consumed here and removed from the request
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data: raw stream first, zlib-wrapped as fallback."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; empty input is returned unchanged."""
        if not data:
            return data
        # Inside the method body `brotli` resolves to the module-level name, not this method
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress gzip data, tolerating up to 1023 bytes of trailing junk."""
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                # No amount of trimming helped; report the first failure
                raise original_oserror

    def http_request(self, req):
        """Escape the request URL and add default and Accept-encoding headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Headers already present on the request take precedence over the defaults
        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode compressed bodies and percent-encode redirect Location headers."""
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            # Each step decodes the previous step's output, or the raw body on the first step
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Wrap the decoded body in a new response that keeps the original metadata
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
  1177. def make_socks_conn_class(base_class, socks_proxy):
  1178. assert issubclass(base_class, (
  1179. http.client.HTTPConnection, http.client.HTTPSConnection))
  1180. url_components = urllib.parse.urlparse(socks_proxy)
  1181. if url_components.scheme.lower() == 'socks5':
  1182. socks_type = ProxyType.SOCKS5
  1183. elif url_components.scheme.lower() in ('socks', 'socks4'):
  1184. socks_type = ProxyType.SOCKS4
  1185. elif url_components.scheme.lower() == 'socks4a':
  1186. socks_type = ProxyType.SOCKS4A
  1187. def unquote_if_non_empty(s):
  1188. if not s:
  1189. return s
  1190. return urllib.parse.unquote_plus(s)
  1191. proxy_args = (
  1192. socks_type,
  1193. url_components.hostname, url_components.port or 1080,
  1194. True, # Remote DNS
  1195. unquote_if_non_empty(url_components.username),
  1196. unquote_if_non_empty(url_components.password),
  1197. )
  1198. class SocksConnection(base_class):
  1199. def connect(self):
  1200. self.sock = sockssocket()
  1201. self.sock.setproxy(*proxy_args)
  1202. if isinstance(self.timeout, (int, float)):
  1203. self.sock.settimeout(self.timeout)
  1204. self.sock.connect((self.host, self.port))
  1205. if isinstance(self, http.client.HTTPSConnection):
  1206. if hasattr(self, '_context'): # Python > 2.6
  1207. self.sock = self._context.wrap_socket(
  1208. self.sock, server_hostname=self.host)
  1209. else:
  1210. self.sock = ssl.wrap_socket(self.sock)
  1211. return SocksConnection
  1212. class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
  1213. def __init__(self, params, https_conn_class=None, *args, **kwargs):
  1214. urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
  1215. self._https_conn_class = https_conn_class or http.client.HTTPSConnection
  1216. self._params = params
  1217. def https_open(self, req):
  1218. kwargs = {}
  1219. conn_class = self._https_conn_class
  1220. if hasattr(self, '_context'): # python > 2.6
  1221. kwargs['context'] = self._context
  1222. if hasattr(self, '_check_hostname'): # python 3.x
  1223. kwargs['check_hostname'] = self._check_hostname
  1224. socks_proxy = req.headers.get('Ytdl-socks-proxy')
  1225. if socks_proxy:
  1226. conn_class = make_socks_conn_class(conn_class, socks_proxy)
  1227. del req.headers['Ytdl-socks-proxy']
  1228. try:
  1229. return self.do_open(
  1230. functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
  1231. except urllib.error.URLError as e:
  1232. if (isinstance(e.reason, ssl.SSLError)
  1233. and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
  1234. raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
  1235. raise
  1236. def is_path_like(f):
  1237. return isinstance(f, (str, bytes, os.PathLike))
  1238. class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
  1239. def __init__(self, cookiejar=None):
  1240. urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
  1241. def http_response(self, request, response):
  1242. return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
  1243. https_request = urllib.request.HTTPCookieProcessor.http_request
  1244. https_response = http_response
  1245. class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
  1246. """YoutubeDL redirect handler
  1247. The code is based on HTTPRedirectHandler implementation from CPython [1].
  1248. This redirect handler fixes and improves the logic to better align with RFC7261
  1249. and what browsers tend to do [2][3]
  1250. 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
  1251. 2. https://datatracker.ietf.org/doc/html/rfc7231
  1252. 3. https://github.com/python/cpython/issues/91306
  1253. """
  1254. http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
  1255. def redirect_request(self, req, fp, code, msg, headers, newurl):
  1256. if code not in (301, 302, 303, 307, 308):
  1257. raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
  1258. new_method = req.get_method()
  1259. new_data = req.data
  1260. remove_headers = []
  1261. # A 303 must either use GET or HEAD for subsequent request
  1262. # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
  1263. if code == 303 and req.get_method() != 'HEAD':
  1264. new_method = 'GET'
  1265. # 301 and 302 redirects are commonly turned into a GET from a POST
  1266. # for subsequent requests by browsers, so we'll do the same.
  1267. # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
  1268. # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
  1269. elif code in (301, 302) and req.get_method() == 'POST':
  1270. new_method = 'GET'
  1271. # only remove payload if method changed (e.g. POST to GET)
  1272. if new_method != req.get_method():
  1273. new_data = None
  1274. remove_headers.extend(['Content-Length', 'Content-Type'])
  1275. new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}
  1276. return urllib.request.Request(
  1277. newurl, headers=new_headers, origin_req_host=req.origin_req_host,
  1278. unverifiable=True, method=new_method, data=new_data)
  1279. def extract_timezone(date_str):
  1280. m = re.search(
  1281. r'''(?x)
  1282. ^.{8,}? # >=8 char non-TZ prefix, if present
  1283. (?P<tz>Z| # just the UTC Z, or
  1284. (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
  1285. (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
  1286. [ ]? # optional space
  1287. (?P<sign>\+|-) # +/-
  1288. (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
  1289. $)
  1290. ''', date_str)
  1291. if not m:
  1292. m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
  1293. timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
  1294. if timezone is not None:
  1295. date_str = date_str[:-len(m.group('tz'))]
  1296. timezone = datetime.timedelta(hours=timezone or 0)
  1297. else:
  1298. date_str = date_str[:-len(m.group('tz'))]
  1299. if not m.group('sign'):
  1300. timezone = datetime.timedelta()
  1301. else:
  1302. sign = 1 if m.group('sign') == '+' else -1
  1303. timezone = datetime.timedelta(
  1304. hours=sign * int(m.group('hours')),
  1305. minutes=sign * int(m.group('minutes')))
  1306. return timezone, date_str
  1307. def parse_iso8601(date_str, delimiter='T', timezone=None):
  1308. """ Return a UNIX timestamp from the given date """
  1309. if date_str is None:
  1310. return None
  1311. date_str = re.sub(r'\.[0-9]+', '', date_str)
  1312. if timezone is None:
  1313. timezone, date_str = extract_timezone(date_str)
  1314. with contextlib.suppress(ValueError):
  1315. date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
  1316. dt = datetime.datetime.strptime(date_str, date_format) - timezone
  1317. return calendar.timegm(dt.timetuple())
  1318. def date_formats(day_first=True):
  1319. return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
  1320. def unified_strdate(date_str, day_first=True):
  1321. """Return a string with the date in the format YYYYMMDD"""
  1322. if date_str is None:
  1323. return None
  1324. upload_date = None
  1325. # Replace commas
  1326. date_str = date_str.replace(',', ' ')
  1327. # Remove AM/PM + timezone
  1328. date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
  1329. _, date_str = extract_timezone(date_str)
  1330. for expression in date_formats(day_first):
  1331. with contextlib.suppress(ValueError):
  1332. upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
  1333. if upload_date is None:
  1334. timetuple = email.utils.parsedate_tz(date_str)
  1335. if timetuple:
  1336. with contextlib.suppress(ValueError):
  1337. upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
  1338. if upload_date is not None:
  1339. return str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date string to a UNIX timestamp.

    Returns None when `date_str` is None or cannot be parsed by any of the
    formats from date_formats(day_first) nor by email.utils.parsedate_tz.
    """
    if date_str is None:
        return None

    # Drop commas, pipes and weekday names, then collapse runs of whitespace
    # NOTE(review): the weekday alternation has no 'sun' - confirm whether that is intended
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Any "PM" (case-insensitive) in the string shifts the result by 12 hours
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to the RFC 2822 style parser; this path returns a float
    # because of timezone.total_seconds()
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
  1364. def determine_ext(url, default_ext='unknown_video'):
  1365. if url is None or '.' not in url:
  1366. return default_ext
  1367. guess = url.partition('?')[0].rpartition('.')[2]
  1368. if re.match(r'^[A-Za-z0-9]+$', guess):
  1369. return guess
  1370. # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
  1371. elif guess.rstrip('/') in KNOWN_EXTENSIONS:
  1372. return guess.rstrip('/')
  1373. else:
  1374. return default_ext
  1375. def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
  1376. return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
  1377. def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
  1378. R"""
  1379. Return a datetime object from a string.
  1380. Supported format:
  1381. (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
  1382. @param format strftime format of DATE
  1383. @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
  1384. auto: round to the unit provided in date_str (if applicable).
  1385. """
  1386. auto_precision = False
  1387. if precision == 'auto':
  1388. auto_precision = True
  1389. precision = 'microsecond'
  1390. today = datetime_round(datetime.datetime.utcnow(), precision)
  1391. if date_str in ('now', 'today'):
  1392. return today
  1393. if date_str == 'yesterday':
  1394. return today - datetime.timedelta(days=1)
  1395. match = re.match(
  1396. r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
  1397. date_str)
  1398. if match is not None:
  1399. start_time = datetime_from_str(match.group('start'), precision, format)
  1400. time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
  1401. unit = match.group('unit')
  1402. if unit == 'month' or unit == 'year':
  1403. new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
  1404. unit = 'day'
  1405. else:
  1406. if unit == 'week':
  1407. unit = 'day'
  1408. time *= 7
  1409. delta = datetime.timedelta(**{unit + 's': time})
  1410. new_date = start_time + delta
  1411. if auto_precision:
  1412. return datetime_round(new_date, unit)
  1413. return new_date
  1414. return datetime_round(datetime.datetime.strptime(date_str, format), precision)
  1415. def date_from_str(date_str, format='%Y%m%d', strict=False):
  1416. R"""
  1417. Return a date object from a string using datetime_from_str
  1418. @param strict Restrict allowed patterns to "YYYYMMDD" and
  1419. (now|today|yesterday)(-\d+(day|week|month|year)s?)?
  1420. """
  1421. if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
  1422. raise ValueError(f'Invalid date format "{date_str}"')
  1423. return datetime_from_str(date_str, precision='microsecond', format=format).date()
  1424. def datetime_add_months(dt, months):
  1425. """Increment/Decrement a datetime object by months."""
  1426. month = dt.month + months - 1
  1427. year = dt.year + month // 12
  1428. month = month % 12 + 1
  1429. day = min(dt.day, calendar.monthrange(year, month)[1])
  1430. return dt.replace(year, month, day)
  1431. def datetime_round(dt, precision='day'):
  1432. """
  1433. Round a datetime object's time to a specific precision
  1434. """
  1435. if precision == 'microsecond':
  1436. return dt
  1437. unit_seconds = {
  1438. 'day': 86400,
  1439. 'hour': 3600,
  1440. 'minute': 60,
  1441. 'second': 1,
  1442. }
  1443. roundto = lambda x, n: ((x + n / 2) // n) * n
  1444. timestamp = calendar.timegm(dt.timetuple())
  1445. return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
  1446. def hyphenate_date(date_str):
  1447. """
  1448. Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
  1449. match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
  1450. if match is not None:
  1451. return '-'.join(match.groups())
  1452. else:
  1453. return date_str
  1454. class DateRange:
  1455. """Represents a time interval between two dates"""
  1456. def __init__(self, start=None, end=None):
  1457. """start and end must be strings in the format accepted by date"""
  1458. if start is not None:
  1459. self.start = date_from_str(start, strict=True)
  1460. else:
  1461. self.start = datetime.datetime.min.date()
  1462. if end is not None:
  1463. self.end = date_from_str(end, strict=True)
  1464. else:
  1465. self.end = datetime.datetime.max.date()
  1466. if self.start > self.end:
  1467. raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
  1468. @classmethod
  1469. def day(cls, day):
  1470. """Returns a range that only contains the given day"""
  1471. return cls(day, day)
  1472. def __contains__(self, date):
  1473. """Check if the date is in the range"""
  1474. if not isinstance(date, datetime.date):
  1475. date = date_from_str(date)
  1476. return self.start <= date <= self.end
  1477. def __repr__(self):
  1478. return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
  1479. def __eq__(self, other):
  1480. return (isinstance(other, DateRange)
  1481. and self.start == other.start and self.end == other.end)
  1482. @functools.cache
  1483. def system_identifier():
  1484. python_implementation = platform.python_implementation()
  1485. if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
  1486. python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
  1487. libc_ver = []
  1488. with contextlib.suppress(OSError): # We may not have access to the executable
  1489. libc_ver = platform.libc_ver()
  1490. return 'Python %s (%s %s %s) - %s (%s%s)' % (
  1491. platform.python_version(),
  1492. python_implementation,
  1493. platform.machine(),
  1494. platform.architecture()[0],
  1495. platform.platform(),
  1496. ssl.OPENSSL_VERSION,
  1497. format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
  1498. )
  1499. @functools.cache
  1500. def get_windows_version():
  1501. ''' Get Windows version. returns () if it's not running on Windows '''
  1502. if compat_os_name == 'nt':
  1503. return version_tuple(platform.win32_ver()[1])
  1504. else:
  1505. return ()
  1506. def write_string(s, out=None, encoding=None):
  1507. assert isinstance(s, str)
  1508. out = out or sys.stderr
  1509. # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
  1510. if not out:
  1511. return
  1512. if compat_os_name == 'nt' and supports_terminal_sequences(out):
  1513. s = re.sub(r'([\r\n]+)', r' \1', s)
  1514. enc, buffer = None, out
  1515. if 'b' in getattr(out, 'mode', ''):
  1516. enc = encoding or preferredencoding()
  1517. elif hasattr(out, 'buffer'):
  1518. buffer = out.buffer
  1519. enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
  1520. buffer.write(s.encode(enc, 'ignore') if enc else s)
  1521. out.flush()
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation message.

    When the package-level `_IN_CLI` flag is set, each distinct `msg` is
    reported only once: through `printer` if given, otherwise via
    write_string; **kwargs are forwarded to whichever is used. Otherwise a
    DeprecationWarning is issued through the `warnings` module.
    """
    from .. import _IN_CLI
    if _IN_CLI:
        # De-duplicate: identical messages are only shown once per process
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        # The +3 offset is meant to skip wrapper frames - NOTE(review): confirm against callers
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


# Messages already reported in CLI mode
deprecation_warning._cache = set()
  1535. def bytes_to_intlist(bs):
  1536. if not bs:
  1537. return []
  1538. if isinstance(bs[0], int): # Python 3
  1539. return list(bs)
  1540. else:
  1541. return [ord(c) for c in bs]
  1542. def intlist_to_bytes(xs):
  1543. if not xs:
  1544. return b''
  1545. return struct.pack('%dB' % len(xs), *xs)
  1546. class LockingUnsupportedError(OSError):
  1547. msg = 'File locking is not supported'
  1548. def __init__(self):
  1549. super().__init__(self.msg)
  1550. # Cross-platform file locking
  1551. if sys.platform == 'win32':
  1552. import ctypes
  1553. import ctypes.wintypes
  1554. import msvcrt
  1555. class OVERLAPPED(ctypes.Structure):
  1556. _fields_ = [
  1557. ('Internal', ctypes.wintypes.LPVOID),
  1558. ('InternalHigh', ctypes.wintypes.LPVOID),
  1559. ('Offset', ctypes.wintypes.DWORD),
  1560. ('OffsetHigh', ctypes.wintypes.DWORD),
  1561. ('hEvent', ctypes.wintypes.HANDLE),
  1562. ]
  1563. kernel32 = ctypes.WinDLL('kernel32')
  1564. LockFileEx = kernel32.LockFileEx
  1565. LockFileEx.argtypes = [
  1566. ctypes.wintypes.HANDLE, # hFile
  1567. ctypes.wintypes.DWORD, # dwFlags
  1568. ctypes.wintypes.DWORD, # dwReserved
  1569. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1570. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1571. ctypes.POINTER(OVERLAPPED) # Overlapped
  1572. ]
  1573. LockFileEx.restype = ctypes.wintypes.BOOL
  1574. UnlockFileEx = kernel32.UnlockFileEx
  1575. UnlockFileEx.argtypes = [
  1576. ctypes.wintypes.HANDLE, # hFile
  1577. ctypes.wintypes.DWORD, # dwReserved
  1578. ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
  1579. ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
  1580. ctypes.POINTER(OVERLAPPED) # Overlapped
  1581. ]
  1582. UnlockFileEx.restype = ctypes.wintypes.BOOL
  1583. whole_low = 0xffffffff
  1584. whole_high = 0x7fffffff
  1585. def _lock_file(f, exclusive, block):
  1586. overlapped = OVERLAPPED()
  1587. overlapped.Offset = 0
  1588. overlapped.OffsetHigh = 0
  1589. overlapped.hEvent = 0
  1590. f._lock_file_overlapped_p = ctypes.pointer(overlapped)
  1591. if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
  1592. (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
  1593. 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1594. # NB: No argument form of "ctypes.FormatError" does not work on PyPy
  1595. raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
  1596. def _unlock_file(f):
  1597. assert f._lock_file_overlapped_p
  1598. handle = msvcrt.get_osfhandle(f.fileno())
  1599. if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
  1600. raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
  1601. else:
  1602. try:
  1603. import fcntl
  1604. def _lock_file(f, exclusive, block):
  1605. flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
  1606. if not block:
  1607. flags |= fcntl.LOCK_NB
  1608. try:
  1609. fcntl.flock(f, flags)
  1610. except BlockingIOError:
  1611. raise
  1612. except OSError: # AOSP does not have flock()
  1613. fcntl.lockf(f, flags)
  1614. def _unlock_file(f):
  1615. with contextlib.suppress(OSError):
  1616. return fcntl.flock(f, fcntl.LOCK_UN)
  1617. with contextlib.suppress(OSError):
  1618. return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
  1619. return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
  1620. except ImportError:
  1621. def _lock_file(f, exclusive, block):
  1622. raise LockingUnsupportedError()
  1623. def _unlock_file(f):
  1624. raise LockingUnsupportedError()
  1625. class locked_file:
  1626. locked = False
  1627. def __init__(self, filename, mode, block=True, encoding=None):
  1628. if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
  1629. raise NotImplementedError(mode)
  1630. self.mode, self.block = mode, block
  1631. writable = any(f in mode for f in 'wax+')
  1632. readable = any(f in mode for f in 'r+')
  1633. flags = functools.reduce(operator.ior, (
  1634. getattr(os, 'O_CLOEXEC', 0), # UNIX only
  1635. getattr(os, 'O_BINARY', 0), # Windows only
  1636. getattr(os, 'O_NOINHERIT', 0), # Windows only
  1637. os.O_CREAT if writable else 0, # O_TRUNC only after locking
  1638. os.O_APPEND if 'a' in mode else 0,
  1639. os.O_EXCL if 'x' in mode else 0,
  1640. os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
  1641. ))
  1642. self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
  1643. def __enter__(self):
  1644. exclusive = 'r' not in self.mode
  1645. try:
  1646. _lock_file(self.f, exclusive, self.block)
  1647. self.locked = True
  1648. except OSError:
  1649. self.f.close()
  1650. raise
  1651. if 'w' in self.mode:
  1652. try:
  1653. self.f.truncate()
  1654. except OSError as e:
  1655. if e.errno not in (
  1656. errno.ESPIPE, # Illegal seek - expected for FIFO
  1657. errno.EINVAL, # Invalid argument - expected for /dev/null
  1658. ):
  1659. raise
  1660. return self
  1661. def unlock(self):
  1662. if not self.locked:
  1663. return
  1664. try:
  1665. _unlock_file(self.f)
  1666. finally:
  1667. self.locked = False
  1668. def __exit__(self, *_):
  1669. try:
  1670. self.unlock()
  1671. finally:
  1672. self.f.close()
  1673. open = __enter__
  1674. close = __exit__
  1675. def __getattr__(self, attr):
  1676. return getattr(self.f, attr)
  1677. def __iter__(self):
  1678. return iter(self.f)
  1679. @functools.cache
  1680. def get_filesystem_encoding():
  1681. encoding = sys.getfilesystemencoding()
  1682. return encoding if encoding is not None else 'utf-8'
  1683. def shell_quote(args):
  1684. quoted_args = []
  1685. encoding = get_filesystem_encoding()
  1686. for a in args:
  1687. if isinstance(a, bytes):
  1688. # We may get a filename encoded with 'encodeFilename'
  1689. a = a.decode(encoding)
  1690. quoted_args.append(compat_shlex_quote(a))
  1691. return ' '.join(quoted_args)
  1692. def smuggle_url(url, data):
  1693. """ Pass additional data in a URL for internal use. """
  1694. url, idata = unsmuggle_url(url, {})
  1695. data.update(idata)
  1696. sdata = urllib.parse.urlencode(
  1697. {'__youtubedl_smuggle': json.dumps(data)})
  1698. return url + '#' + sdata
  1699. def unsmuggle_url(smug_url, default=None):
  1700. if '#__youtubedl_smuggle' not in smug_url:
  1701. return smug_url, default
  1702. url, _, sdata = smug_url.rpartition('#')
  1703. jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
  1704. data = json.loads(jsond)
  1705. return url, data
  1706. def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
  1707. """ Formats numbers with decimal sufixes like K, M, etc """
  1708. num, factor = float_or_none(num), float(factor)
  1709. if num is None or num < 0:
  1710. return None
  1711. POSSIBLE_SUFFIXES = 'kMGTPEZY'
  1712. exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
  1713. suffix = ['', *POSSIBLE_SUFFIXES][exponent]
  1714. if factor == 1024:
  1715. suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
  1716. converted = num / (factor ** exponent)
  1717. return fmt % (converted, suffix)
  1718. def format_bytes(bytes):
  1719. return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
  1720. def lookup_unit_table(unit_table, s, strict=False):
  1721. num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
  1722. units_re = '|'.join(re.escape(u) for u in unit_table)
  1723. m = (re.fullmatch if strict else re.match)(
  1724. rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
  1725. if not m:
  1726. return None
  1727. num = float(m.group('num').replace(',', '.'))
  1728. mult = unit_table[m.group('unit')]
  1729. return round(num * mult)
  1730. def parse_bytes(s):
  1731. """Parse a string indicating a byte quantity into an integer"""
  1732. return lookup_unit_table(
  1733. {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
  1734. s.upper(), strict=True)
  1735. def parse_filesize(s):
  1736. if s is None:
  1737. return None
  1738. # The lower-case forms are of course incorrect and unofficial,
  1739. # but we support those too
  1740. _UNIT_TABLE = {
  1741. 'B': 1,
  1742. 'b': 1,
  1743. 'bytes': 1,
  1744. 'KiB': 1024,
  1745. 'KB': 1000,
  1746. 'kB': 1024,
  1747. 'Kb': 1000,
  1748. 'kb': 1000,
  1749. 'kilobytes': 1000,
  1750. 'kibibytes': 1024,
  1751. 'MiB': 1024 ** 2,
  1752. 'MB': 1000 ** 2,
  1753. 'mB': 1024 ** 2,
  1754. 'Mb': 1000 ** 2,
  1755. 'mb': 1000 ** 2,
  1756. 'megabytes': 1000 ** 2,
  1757. 'mebibytes': 1024 ** 2,
  1758. 'GiB': 1024 ** 3,
  1759. 'GB': 1000 ** 3,
  1760. 'gB': 1024 ** 3,
  1761. 'Gb': 1000 ** 3,
  1762. 'gb': 1000 ** 3,
  1763. 'gigabytes': 1000 ** 3,
  1764. 'gibibytes': 1024 ** 3,
  1765. 'TiB': 1024 ** 4,
  1766. 'TB': 1000 ** 4,
  1767. 'tB': 1024 ** 4,
  1768. 'Tb': 1000 ** 4,
  1769. 'tb': 1000 ** 4,
  1770. 'terabytes': 1000 ** 4,
  1771. 'tebibytes': 1024 ** 4,
  1772. 'PiB': 1024 ** 5,
  1773. 'PB': 1000 ** 5,
  1774. 'pB': 1024 ** 5,
  1775. 'Pb': 1000 ** 5,
  1776. 'pb': 1000 ** 5,
  1777. 'petabytes': 1000 ** 5,
  1778. 'pebibytes': 1024 ** 5,
  1779. 'EiB': 1024 ** 6,
  1780. 'EB': 1000 ** 6,
  1781. 'eB': 1024 ** 6,
  1782. 'Eb': 1000 ** 6,
  1783. 'eb': 1000 ** 6,
  1784. 'exabytes': 1000 ** 6,
  1785. 'exbibytes': 1024 ** 6,
  1786. 'ZiB': 1024 ** 7,
  1787. 'ZB': 1000 ** 7,
  1788. 'zB': 1024 ** 7,
  1789. 'Zb': 1000 ** 7,
  1790. 'zb': 1000 ** 7,
  1791. 'zettabytes': 1000 ** 7,
  1792. 'zebibytes': 1024 ** 7,
  1793. 'YiB': 1024 ** 8,
  1794. 'YB': 1000 ** 8,
  1795. 'yB': 1024 ** 8,
  1796. 'Yb': 1000 ** 8,
  1797. 'yb': 1000 ** 8,
  1798. 'yottabytes': 1000 ** 8,
  1799. 'yobibytes': 1024 ** 8,
  1800. }
  1801. return lookup_unit_table(_UNIT_TABLE, s)
  1802. def parse_count(s):
  1803. if s is None:
  1804. return None
  1805. s = re.sub(r'^[^\d]+\s', '', s).strip()
  1806. if re.match(r'^[\d,.]+$', s):
  1807. return str_to_int(s)
  1808. _UNIT_TABLE = {
  1809. 'k': 1000,
  1810. 'K': 1000,
  1811. 'm': 1000 ** 2,
  1812. 'M': 1000 ** 2,
  1813. 'kk': 1000 ** 2,
  1814. 'KK': 1000 ** 2,
  1815. 'b': 1000 ** 3,
  1816. 'B': 1000 ** 3,
  1817. }
  1818. ret = lookup_unit_table(_UNIT_TABLE, s)
  1819. if ret is not None:
  1820. return ret
  1821. mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
  1822. if mobj:
  1823. return str_to_int(mobj.group(1))
  1824. def parse_resolution(s, *, lenient=False):
  1825. if s is None:
  1826. return {}
  1827. if lenient:
  1828. mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
  1829. else:
  1830. mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
  1831. if mobj:
  1832. return {
  1833. 'width': int(mobj.group('w')),
  1834. 'height': int(mobj.group('h')),
  1835. }
  1836. mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
  1837. if mobj:
  1838. return {'height': int(mobj.group(1))}
  1839. mobj = re.search(r'\b([48])[kK]\b', s)
  1840. if mobj:
  1841. return {'height': int(mobj.group(1)) * 540}
  1842. return {}
  1843. def parse_bitrate(s):
  1844. if not isinstance(s, str):
  1845. return
  1846. mobj = re.search(r'\b(\d+)\s*kbps', s)
  1847. if mobj:
  1848. return int(mobj.group(1))
  1849. def month_by_name(name, lang='en'):
  1850. """ Return the number of a month by (locale-independently) English name """
  1851. month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
  1852. try:
  1853. return month_names.index(name) + 1
  1854. except ValueError:
  1855. return None
  1856. def month_by_abbreviation(abbrev):
  1857. """ Return the number of a month by (locale-independently) English
  1858. abbreviations """
  1859. try:
  1860. return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
  1861. except ValueError:
  1862. return None
  1863. def fix_xml_ampersands(xml_str):
  1864. """Replace all the '&' by '&amp;' in XML"""
  1865. return re.sub(
  1866. r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
  1867. '&amp;',
  1868. xml_str)
  1869. def setproctitle(title):
  1870. assert isinstance(title, str)
  1871. # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
  1872. try:
  1873. import ctypes
  1874. except ImportError:
  1875. return
  1876. try:
  1877. libc = ctypes.cdll.LoadLibrary('libc.so.6')
  1878. except OSError:
  1879. return
  1880. except TypeError:
  1881. # LoadLibrary in Windows Python 2.7.13 only expects
  1882. # a bytestring, but since unicode_literals turns
  1883. # every string into a unicode string, it fails.
  1884. return
  1885. title_bytes = title.encode()
  1886. buf = ctypes.create_string_buffer(len(title_bytes))
  1887. buf.value = title_bytes
  1888. try:
  1889. libc.prctl(15, buf, 0, 0, 0)
  1890. except AttributeError:
  1891. return # Strange libc, just skip this
  1892. def remove_start(s, start):
  1893. return s[len(start):] if s is not None and s.startswith(start) else s
  1894. def remove_end(s, end):
  1895. return s[:-len(end)] if s is not None and s.endswith(end) else s
  1896. def remove_quotes(s):
  1897. if s is None or len(s) < 2:
  1898. return s
  1899. for quote in ('"', "'", ):
  1900. if s[0] == quote and s[-1] == quote:
  1901. return s[1:-1]
  1902. return s
  1903. def get_domain(url):
  1904. """
  1905. This implementation is inconsistent, but is kept for compatibility.
  1906. Use this only for "webpage_url_domain"
  1907. """
  1908. return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
  1909. def url_basename(url):
  1910. path = urllib.parse.urlparse(url).path
  1911. return path.strip('/').split('/')[-1]
  1912. def base_url(url):
  1913. return re.match(r'https?://[^?#]+/', url).group()
  1914. def urljoin(base, path):
  1915. if isinstance(path, bytes):
  1916. path = path.decode()
  1917. if not isinstance(path, str) or not path:
  1918. return None
  1919. if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
  1920. return path
  1921. if isinstance(base, bytes):
  1922. base = base.decode()
  1923. if not isinstance(base, str) or not re.match(
  1924. r'^(?:https?:)?//', base):
  1925. return None
  1926. return urllib.parse.urljoin(base, path)
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always reported as PUT."""

    def get_method(self):
        return 'PUT'
  1933. def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
  1934. if get_attr and v is not None:
  1935. v = getattr(v, get_attr, None)
  1936. try:
  1937. return int(v) * invscale // scale
  1938. except (ValueError, TypeError, OverflowError):
  1939. return default
  1940. def str_or_none(v, default=None):
  1941. return default if v is None else str(v)
  1942. def str_to_int(int_str):
  1943. """ A more relaxed version of int_or_none """
  1944. if isinstance(int_str, int):
  1945. return int_str
  1946. elif isinstance(int_str, str):
  1947. int_str = re.sub(r'[,\.\+]', '', int_str)
  1948. return int_or_none(int_str)
  1949. def float_or_none(v, scale=1, invscale=1, default=None):
  1950. if v is None:
  1951. return default
  1952. try:
  1953. return float(v) * invscale / scale
  1954. except (ValueError, TypeError):
  1955. return default
  1956. def bool_or_none(v, default=None):
  1957. return v if isinstance(v, bool) else default
  1958. def strip_or_none(v, default=None):
  1959. return v.strip() if isinstance(v, str) else default
  1960. def url_or_none(url):
  1961. if not url or not isinstance(url, str):
  1962. return None
  1963. url = url.strip()
  1964. return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
  1965. def request_to_url(req):
  1966. if isinstance(req, urllib.request.Request):
  1967. return req.get_full_url()
  1968. else:
  1969. return req
  1970. def strftime_or_none(timestamp, date_format, default=None):
  1971. datetime_object = None
  1972. try:
  1973. if isinstance(timestamp, (int, float)): # unix timestamp
  1974. # Using naive datetime here can break timestamp() in Windows
  1975. # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
  1976. datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
  1977. elif isinstance(timestamp, str): # assume YYYYMMDD
  1978. datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
  1979. date_format = re.sub( # Support %s on windows
  1980. r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
  1981. return datetime_object.strftime(date_format)
  1982. except (ValueError, TypeError, AttributeError):
  1983. return default
  1984. def parse_duration(s):
  1985. if not isinstance(s, str):
  1986. return None
  1987. s = s.strip()
  1988. if not s:
  1989. return None
  1990. days, hours, mins, secs, ms = [None] * 5
  1991. m = re.match(r'''(?x)
  1992. (?P<before_secs>
  1993. (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
  1994. (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
  1995. (?P<ms>[.:][0-9]+)?Z?$
  1996. ''', s)
  1997. if m:
  1998. days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
  1999. else:
  2000. m = re.match(
  2001. r'''(?ix)(?:P?
  2002. (?:
  2003. [0-9]+\s*y(?:ears?)?,?\s*
  2004. )?
  2005. (?:
  2006. [0-9]+\s*m(?:onths?)?,?\s*
  2007. )?
  2008. (?:
  2009. [0-9]+\s*w(?:eeks?)?,?\s*
  2010. )?
  2011. (?:
  2012. (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
  2013. )?
  2014. T)?
  2015. (?:
  2016. (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
  2017. )?
  2018. (?:
  2019. (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
  2020. )?
  2021. (?:
  2022. (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
  2023. )?Z?$''', s)
  2024. if m:
  2025. days, hours, mins, secs, ms = m.groups()
  2026. else:
  2027. m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
  2028. if m:
  2029. hours, mins = m.groups()
  2030. else:
  2031. return None
  2032. if ms:
  2033. ms = ms.replace(':', '.')
  2034. return sum(float(part or 0) * mult for part, mult in (
  2035. (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
  2036. def prepend_extension(filename, ext, expected_real_ext=None):
  2037. name, real_ext = os.path.splitext(filename)
  2038. return (
  2039. f'{name}.{ext}{real_ext}'
  2040. if not expected_real_ext or real_ext[1:] == expected_real_ext
  2041. else f'{filename}.{ext}')
  2042. def replace_extension(filename, ext, expected_real_ext=None):
  2043. name, real_ext = os.path.splitext(filename)
  2044. return '{}.{}'.format(
  2045. name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
  2046. ext)
  2047. def check_executable(exe, args=[]):
  2048. """ Checks if the given binary is installed somewhere in PATH, and returns its name.
  2049. args can be a list of arguments for a short output (like -version) """
  2050. try:
  2051. Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  2052. except OSError:
  2053. return False
  2054. return exe
  2055. def _get_exe_version_output(exe, args):
  2056. try:
  2057. # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
  2058. # SIGTTOU if yt-dlp is run in the background.
  2059. # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
  2060. stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
  2061. stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  2062. if ret:
  2063. return None
  2064. except OSError:
  2065. return False
  2066. return stdout
  2067. def detect_exe_version(output, version_re=None, unrecognized='present'):
  2068. assert isinstance(output, str)
  2069. if version_re is None:
  2070. version_re = r'version\s+([-0-9._a-zA-Z]+)'
  2071. m = re.search(version_re, output)
  2072. if m:
  2073. return m.group(1)
  2074. else:
  2075. return unrecognized
  2076. def get_exe_version(exe, args=['--version'],
  2077. version_re=None, unrecognized=('present', 'broken')):
  2078. """ Returns the version of the specified executable,
  2079. or False if the executable is not present """
  2080. unrecognized = variadic(unrecognized)
  2081. assert len(unrecognized) in (1, 2)
  2082. out = _get_exe_version_output(exe, args)
  2083. if out is None:
  2084. return unrecognized[-1]
  2085. return out and detect_exe_version(out, version_re, unrecognized[0])
  2086. def frange(start=0, stop=None, step=1):
  2087. """Float range"""
  2088. if stop is None:
  2089. start, stop = 0, start
  2090. sign = [-1, 1][step > 0] if step else 0
  2091. while sign * start < sign * stop:
  2092. yield start
  2093. start += step
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Subclass of the builtin IndexError; raised by __getitem__
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # The source is consumed on demand; items already read live in _cache
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # First replay what is cached, then continue reading the source,
        # caching each new item as it is yielded
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Read the rest of the source into the cache and return the cache (forward order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        # Returns a copy of the cache, reversed if this list is reversed
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x maps index i to -(i + 1), i.e. the same position counted from the other end
        return None if x is None else ~x

    def __getitem__(self, idx):
        """Return the item (int index) or list of items (slice) at `idx`.

        Raises TypeError for other index types and LazyList.IndexError when
        the underlying list lookup raises IndexError.
        """
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed so the cache covers the largest requested index
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Truthiness is decided by trying to fetch a single item
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known after reading the whole source
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object is built from this object's iterable and cache list
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The copy is built from this object's iterable and cache list
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
  2168. class PagedList:
  2169. class IndexError(IndexError):
  2170. pass
  2171. def __len__(self):
  2172. # This is only useful for tests
  2173. return len(self.getslice())
  2174. def __init__(self, pagefunc, pagesize, use_cache=True):
  2175. self._pagefunc = pagefunc
  2176. self._pagesize = pagesize
  2177. self._pagecount = float('inf')
  2178. self._use_cache = use_cache
  2179. self._cache = {}
  2180. def getpage(self, pagenum):
  2181. page_results = self._cache.get(pagenum)
  2182. if page_results is None:
  2183. page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
  2184. if self._use_cache:
  2185. self._cache[pagenum] = page_results
  2186. return page_results
  2187. def getslice(self, start=0, end=None):
  2188. return list(self._getslice(start, end))
  2189. def _getslice(self, start, end):
  2190. raise NotImplementedError('This method must be implemented by subclasses')
  2191. def __getitem__(self, idx):
  2192. assert self._use_cache, 'Indexing PagedList requires cache'
  2193. if not isinstance(idx, int) or idx < 0:
  2194. raise TypeError('indices must be non-negative integers')
  2195. entries = self.getslice(idx, idx + 1)
  2196. if not entries:
  2197. raise self.IndexError()
  2198. return entries[0]
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Yield the items in [start, end), fetching pages one by one
        for pagenum in itertools.count(start // self._pagesize):
            # Item indices covered by this page are [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets within this page; endv is None when the slice runs past this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the previous page as the last one before re-raising
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # Caching is always enabled for this subclass
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        # Yield the items in [start, end), never requesting pages >= _pagecount
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Items to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Items still to be yielded; None means no upper bound
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page completes the requested slice
                    yield from page_results[:only_more]
                    break
            yield from page_results
class PlaylistEntries:
    """Access to the entries of a playlist info dict through 1-based indices and slices"""

    # Sentinel marking positions for which no entry is available
    MissingEntry = object()
    # Set once the end of the entries has been reached (always true for plain lists)
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each entry at the 1-based position named in 'requested_entries';
            # all other positions hold MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # One comma-separated segment: "START", or "START:END:STEP" / "START-END"
    # with every part optional
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for each comma-separated segment of `string`.

        Raises ValueError for empty or malformed segments and for a zero step.
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (index, entry) pairs selected by the playlist options in ydl.params"""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        # Returns None (implicitly) when the count cannot be determined here
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Builds, once per instance, the function that fetches the entry at a 0-based index
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate (1-based index, entry) pairs for a 1-based int index or slice"""
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Move the bound one step further so that the stop position itself is included
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts the entries by iterating over all of them
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
  2374. def uppercase_escape(s):
  2375. unicode_escape = codecs.getdecoder('unicode_escape')
  2376. return re.sub(
  2377. r'\\U[0-9a-fA-F]{8}',
  2378. lambda m: unicode_escape(m.group(0))[0],
  2379. s)
  2380. def lowercase_escape(s):
  2381. unicode_escape = codecs.getdecoder('unicode_escape')
  2382. return re.sub(
  2383. r'\\u[0-9a-fA-F]{4}',
  2384. lambda m: unicode_escape(m.group(0))[0],
  2385. s)
  2386. def escape_rfc3986(s):
  2387. """Escape non-ASCII characters as suggested by RFC 3986"""
  2388. return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
  2389. def escape_url(url):
  2390. """Escape URL as suggested by RFC 3986"""
  2391. url_parsed = urllib.parse.urlparse(url)
  2392. return url_parsed._replace(
  2393. netloc=url_parsed.netloc.encode('idna').decode('ascii'),
  2394. path=escape_rfc3986(url_parsed.path),
  2395. params=escape_rfc3986(url_parsed.params),
  2396. query=escape_rfc3986(url_parsed.query),
  2397. fragment=escape_rfc3986(url_parsed.fragment)
  2398. ).geturl()
  2399. def parse_qs(url, **kwargs):
  2400. return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
  2401. def read_batch_urls(batch_fd):
  2402. def fixup(url):
  2403. if not isinstance(url, str):
  2404. url = url.decode('utf-8', 'replace')
  2405. BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
  2406. for bom in BOM_UTF8:
  2407. if url.startswith(bom):
  2408. url = url[len(bom):]
  2409. url = url.lstrip()
  2410. if not url or url.startswith(('#', ';', ']')):
  2411. return False
  2412. # "#" cannot be stripped out since it is part of the URI
  2413. # However, it can be safely stripped out if following a whitespace
  2414. return re.split(r'\s#', url, 1)[0].rstrip()
  2415. with contextlib.closing(batch_fd) as fd:
  2416. return [url for url in map(fixup, fd) if url]
  2417. def urlencode_postdata(*args, **kargs):
  2418. return urllib.parse.urlencode(*args, **kargs).encode('ascii')
  2419. def update_url(url, *, query_update=None, **kwargs):
  2420. """Replace URL components specified by kwargs
  2421. @param url str or parse url tuple
  2422. @param query_update update query
  2423. @returns str
  2424. """
  2425. if isinstance(url, str):
  2426. if not kwargs and not query_update:
  2427. return url
  2428. else:
  2429. url = urllib.parse.urlparse(url)
  2430. if query_update:
  2431. assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
  2432. kwargs['query'] = urllib.parse.urlencode({
  2433. **urllib.parse.parse_qs(url.query),
  2434. **query_update
  2435. }, True)
  2436. return urllib.parse.urlunparse(url._replace(**kwargs))
  2437. def update_url_query(url, query):
  2438. return update_url(url, query_update=query)
  2439. def update_Request(req, url=None, data=None, headers=None, query=None):
  2440. req_headers = req.headers.copy()
  2441. req_headers.update(headers or {})
  2442. req_data = data or req.data
  2443. req_url = update_url_query(url or req.get_full_url(), query)
  2444. req_get_method = req.get_method()
  2445. if req_get_method == 'HEAD':
  2446. req_type = HEADRequest
  2447. elif req_get_method == 'PUT':
  2448. req_type = PUTRequest
  2449. else:
  2450. req_type = urllib.request.Request
  2451. new_req = req_type(
  2452. req_url, data=req_data, headers=req_headers,
  2453. origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
  2454. if hasattr(req, 'timeout'):
  2455. new_req.timeout = req.timeout
  2456. return new_req
  2457. def _multipart_encode_impl(data, boundary):
  2458. content_type = 'multipart/form-data; boundary=%s' % boundary
  2459. out = b''
  2460. for k, v in data.items():
  2461. out += b'--' + boundary.encode('ascii') + b'\r\n'
  2462. if isinstance(k, str):
  2463. k = k.encode()
  2464. if isinstance(v, str):
  2465. v = v.encode()
  2466. # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
  2467. # suggests sending UTF-8 directly. Firefox sends UTF-8, too
  2468. content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
  2469. if boundary.encode('ascii') in content:
  2470. raise ValueError('Boundary overlaps with data')
  2471. out += content
  2472. out += b'--' + boundary.encode('ascii') + b'--\r\n'
  2473. return out, content_type
  2474. def multipart_encode(data, boundary=None):
  2475. '''
  2476. Encode a dict to RFC 7578-compliant form-data
  2477. data:
  2478. A dict where keys and values can be either Unicode or bytes-like
  2479. objects.
  2480. boundary:
  2481. If specified a Unicode object, it's used as the boundary. Otherwise
  2482. a random boundary is generated.
  2483. Reference: https://tools.ietf.org/html/rfc7578
  2484. '''
  2485. has_specified_boundary = boundary is not None
  2486. while True:
  2487. if boundary is None:
  2488. boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
  2489. try:
  2490. out, content_type = _multipart_encode_impl(data, boundary)
  2491. break
  2492. except ValueError:
  2493. if has_specified_boundary:
  2494. raise
  2495. boundary = None
  2496. return out, content_type
  2497. def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
  2498. if blocked_types is NO_DEFAULT:
  2499. blocked_types = (str, bytes, collections.abc.Mapping)
  2500. return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
  2501. def variadic(x, allowed_types=NO_DEFAULT):
  2502. if not isinstance(allowed_types, (tuple, type)):
  2503. deprecation_warning('allowed_types should be a tuple or a type')
  2504. allowed_types = tuple(allowed_types)
  2505. return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
  2506. def try_call(*funcs, expected_type=None, args=[], kwargs={}):
  2507. for f in funcs:
  2508. try:
  2509. val = f(*args, **kwargs)
  2510. except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
  2511. pass
  2512. else:
  2513. if expected_type is None or isinstance(val, expected_type):
  2514. return val
  2515. def try_get(src, getter, expected_type=None):
  2516. return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
  2517. def filter_dict(dct, cndn=lambda _, v: v is not None):
  2518. return {k: v for k, v in dct.items() if cndn(k, v)}
  2519. def merge_dicts(*dicts):
  2520. merged = {}
  2521. for a_dict in dicts:
  2522. for k, v in a_dict.items():
  2523. if (v is not None and k not in merged
  2524. or isinstance(v, str) and merged[k] == ''):
  2525. merged[k] = v
  2526. return merged
  2527. def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
  2528. return string if isinstance(string, str) else str(string, encoding, errors)
  2529. US_RATINGS = {
  2530. 'G': 0,
  2531. 'PG': 10,
  2532. 'PG-13': 13,
  2533. 'R': 16,
  2534. 'NC': 18,
  2535. }
  2536. TV_PARENTAL_GUIDELINES = {
  2537. 'TV-Y': 0,
  2538. 'TV-Y7': 7,
  2539. 'TV-G': 0,
  2540. 'TV-PG': 0,
  2541. 'TV-14': 14,
  2542. 'TV-MA': 17,
  2543. }
  2544. def parse_age_limit(s):
  2545. # isinstance(False, int) is True. So type() must be used instead
  2546. if type(s) is int: # noqa: E721
  2547. return s if 0 <= s <= 21 else None
  2548. elif not isinstance(s, str):
  2549. return None
  2550. m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
  2551. if m:
  2552. return int(m.group('age'))
  2553. s = s.upper()
  2554. if s in US_RATINGS:
  2555. return US_RATINGS[s]
  2556. m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
  2557. if m:
  2558. return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
  2559. return None
  2560. def strip_jsonp(code):
  2561. return re.sub(
  2562. r'''(?sx)^
  2563. (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
  2564. (?:\s*&&\s*(?P=func_name))?
  2565. \s*\(\s*(?P<callback_data>.*)\);?
  2566. \s*?(?://[^\n]*)*$''',
  2567. r'\g<callback_data>', code)
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript source text into text meant to be loadable as JSON

    @param code    The JavaScript text to convert
    @param vars    A dict of var, val pairs to substitute for identifiers
    @param strict  If false, a few JavaScript-only constructs are simplified
                   beforehand and unknown identifiers are emitted as strings;
                   if true, an unknown identifier raises ValueError
    @returns       The rewritten text
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character: a quoted run that may contain backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # A /* ... */ comment, or a // comment up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace optionally surrounding a single comment
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) for integer literals that have to be converted to decimal
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Group 1 is a bare double quote, group 2 the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Keep (or add) the backslash for the passthrough characters, turn
        # "\x" into "\u00", drop an escaped newline, and unescape anything else
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...}; a result that is a JSON
        # string is unwrapped so that its content is spliced in without quotes
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Rewrites one token matched by the final regex below
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of "!" and commas matched before a closing bracket are dropped
            return ''

        if v[0] in STRING_QUOTES:
            # Strings are re-quoted with double quotes; template strings get
            # their ${...} parts substituted first
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A literal followed by ":" is an object key and has to be quoted
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The substitution is not valid JSON by itself, so it is encoded
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # "new Map([[k, v], ...])" becomes a JSON object.
        # Note that strict is not forwarded to the nested call
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # new Date("...") is reduced to its quoted argument
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        # Any other "new X(...)" expression is kept verbatim as a JSON string
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # parseInt(<non-digits>123<non-digits>) is reduced to the digits
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # (function(...){...})("literal") is reduced to the quoted argument
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
  2634. def qualities(quality_ids):
  2635. """ Get a numeric quality value out of a list of possible values """
  2636. def q(qid):
  2637. try:
  2638. return quality_ids.index(qid)
  2639. except ValueError:
  2640. return -1
  2641. return q
# NOTE(review): presumably the points in the pipeline at which a postprocessor
# can be scheduled to run - confirm against the code consuming this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output templates keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# NOTE(review): every template type maps to either a string or None; the
# strings look like filename suffixes for that type - confirm against users
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# The template below has two placeholders: {0} is the pattern for the mapping
# key and {1} the pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
'''


# Conversion type characters of printf-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
  2676. def limit_length(s, length):
  2677. """ Add ellipses to overly long strings """
  2678. if s is None:
  2679. return None
  2680. ELLIPSES = '...'
  2681. if len(s) > length:
  2682. return s[:length - len(ELLIPSES)] + ELLIPSES
  2683. return s
  2684. def version_tuple(v):
  2685. return tuple(int(e) for e in re.split(r'[-.]', v))
  2686. def is_outdated_version(version, limit, assume_new=True):
  2687. if not version:
  2688. return not assume_new
  2689. try:
  2690. return version_tuple(version) < version_tuple(limit)
  2691. except ValueError:
  2692. return not assume_new
  2693. def ytdl_is_updateable():
  2694. """ Returns if yt-dlp can be updated with -U """
  2695. from ..update import is_non_updateable
  2696. return not is_non_updateable()
  2697. def args_to_str(args):
  2698. # Get a short string representation for a subprocess command
  2699. return ' '.join(compat_shlex_quote(a) for a in args)
  2700. def error_to_str(err):
  2701. return f'{type(err).__name__}: {err}'
  2702. def mimetype2ext(mt, default=NO_DEFAULT):
  2703. if not isinstance(mt, str):
  2704. if default is not NO_DEFAULT:
  2705. return default
  2706. return None
  2707. MAP = {
  2708. # video
  2709. '3gpp': '3gp',
  2710. 'mp2t': 'ts',
  2711. 'mp4': 'mp4',
  2712. 'mpeg': 'mpeg',
  2713. 'mpegurl': 'm3u8',
  2714. 'quicktime': 'mov',
  2715. 'webm': 'webm',
  2716. 'vp9': 'vp9',
  2717. 'x-flv': 'flv',
  2718. 'x-m4v': 'm4v',
  2719. 'x-matroska': 'mkv',
  2720. 'x-mng': 'mng',
  2721. 'x-mp4-fragmented': 'mp4',
  2722. 'x-ms-asf': 'asf',
  2723. 'x-ms-wmv': 'wmv',
  2724. 'x-msvideo': 'avi',
  2725. # application (streaming playlists)
  2726. 'dash+xml': 'mpd',
  2727. 'f4m+xml': 'f4m',
  2728. 'hds+xml': 'f4m',
  2729. 'vnd.apple.mpegurl': 'm3u8',
  2730. 'vnd.ms-sstr+xml': 'ism',
  2731. 'x-mpegurl': 'm3u8',
  2732. # audio
  2733. 'audio/mp4': 'm4a',
  2734. # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
  2735. # Using .mp3 as it's the most popular one
  2736. 'audio/mpeg': 'mp3',
  2737. 'audio/webm': 'webm',
  2738. 'audio/x-matroska': 'mka',
  2739. 'audio/x-mpegurl': 'm3u',
  2740. 'midi': 'mid',
  2741. 'ogg': 'ogg',
  2742. 'wav': 'wav',
  2743. 'wave': 'wav',
  2744. 'x-aac': 'aac',
  2745. 'x-flac': 'flac',
  2746. 'x-m4a': 'm4a',
  2747. 'x-realaudio': 'ra',
  2748. 'x-wav': 'wav',
  2749. # image
  2750. 'avif': 'avif',
  2751. 'bmp': 'bmp',
  2752. 'gif': 'gif',
  2753. 'jpeg': 'jpg',
  2754. 'png': 'png',
  2755. 'svg+xml': 'svg',
  2756. 'tiff': 'tif',
  2757. 'vnd.wap.wbmp': 'wbmp',
  2758. 'webp': 'webp',
  2759. 'x-icon': 'ico',
  2760. 'x-jng': 'jng',
  2761. 'x-ms-bmp': 'bmp',
  2762. # caption
  2763. 'filmstrip+json': 'fs',
  2764. 'smptett+xml': 'tt',
  2765. 'ttaf+xml': 'dfxp',
  2766. 'ttml+xml': 'ttml',
  2767. 'x-ms-sami': 'sami',
  2768. # misc
  2769. 'gzip': 'gz',
  2770. 'json': 'json',
  2771. 'xml': 'xml',
  2772. 'zip': 'zip',
  2773. }
  2774. mimetype = mt.partition(';')[0].strip().lower()
  2775. _, _, subtype = mimetype.rpartition('/')
  2776. ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
  2777. if ext:
  2778. return ext
  2779. elif default is not NO_DEFAULT:
  2780. return default
  2781. return subtype.replace('+', '.')
  2782. def ext2mimetype(ext_or_url):
  2783. if not ext_or_url:
  2784. return None
  2785. if '.' not in ext_or_url:
  2786. ext_or_url = f'file.{ext_or_url}'
  2787. return mimetypes.guess_type(ext_or_url)[0]
  2788. def parse_codecs(codecs_str):
  2789. # http://tools.ietf.org/html/rfc6381
  2790. if not codecs_str:
  2791. return {}
  2792. split_codecs = list(filter(None, map(
  2793. str.strip, codecs_str.strip().strip(',').split(','))))
  2794. vcodec, acodec, scodec, hdr = None, None, None, None
  2795. for full_codec in split_codecs:
  2796. parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
  2797. if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
  2798. 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
  2799. if vcodec:
  2800. continue
  2801. vcodec = full_codec
  2802. if parts[0] in ('dvh1', 'dvhe'):
  2803. hdr = 'DV'
  2804. elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
  2805. hdr = 'HDR10'
  2806. elif parts[:2] == ['vp9', '2']:
  2807. hdr = 'HDR10'
  2808. elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
  2809. 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
  2810. acodec = acodec or full_codec
  2811. elif parts[0] in ('stpp', 'wvtt'):
  2812. scodec = scodec or full_codec
  2813. else:
  2814. write_string(f'WARNING: Unknown codec {full_codec}\n')
  2815. if vcodec or acodec or scodec:
  2816. return {
  2817. 'vcodec': vcodec or 'none',
  2818. 'acodec': acodec or 'none',
  2819. 'dynamic_range': hdr,
  2820. **({'scodec': scodec} if scodec is not None else {}),
  2821. }
  2822. elif len(split_codecs) == 2:
  2823. return {
  2824. 'vcodec': split_codecs[0],
  2825. 'acodec': split_codecs[1],
  2826. }
  2827. return {}
  2828. def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
  2829. assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
  2830. allow_mkv = not preferences or 'mkv' in preferences
  2831. if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
  2832. return 'mkv' # TODO: any other format allows this?
  2833. # TODO: All codecs supported by parse_codecs isn't handled here
  2834. COMPATIBLE_CODECS = {
  2835. 'mp4': {
  2836. 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
  2837. 'h264', 'aacl', 'ec-3', # Set in ISM
  2838. },
  2839. 'webm': {
  2840. 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
  2841. 'vp9x', 'vp8x', # in the webm spec
  2842. },
  2843. }
  2844. sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
  2845. vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
  2846. for ext in preferences or COMPATIBLE_CODECS.keys():
  2847. codec_set = COMPATIBLE_CODECS.get(ext, set())
  2848. if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
  2849. return ext
  2850. COMPATIBLE_EXTS = (
  2851. {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
  2852. {'webm', 'weba'},
  2853. )
  2854. for ext in preferences or vexts:
  2855. current_exts = {ext, *vexts, *aexts}
  2856. if ext == 'mkv' or current_exts == {ext} or any(
  2857. ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
  2858. return ext
  2859. return 'mkv' if allow_mkv else preferences[-1]
  2860. def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
  2861. getheader = url_handle.headers.get
  2862. cd = getheader('Content-Disposition')
  2863. if cd:
  2864. m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
  2865. if m:
  2866. e = determine_ext(m.group('filename'), default_ext=None)
  2867. if e:
  2868. return e
  2869. meta_ext = getheader('x-amz-meta-name')
  2870. if meta_ext:
  2871. e = meta_ext.rpartition('.')[2]
  2872. if e:
  2873. return e
  2874. return mimetype2ext(getheader('Content-Type'), default=default)
  2875. def encode_data_uri(data, mime_type):
  2876. return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
  2877. def age_restricted(content_limit, age_limit):
  2878. """ Returns True iff the content should be blocked """
  2879. if age_limit is None: # No limit set
  2880. return False
  2881. if content_limit is None:
  2882. return False # Content available for everyone
  2883. return age_limit < content_limit
  2884. # List of known byte-order-marks (BOM)
  2885. BOMS = [
  2886. (b'\xef\xbb\xbf', 'utf-8'),
  2887. (b'\x00\x00\xfe\xff', 'utf-32-be'),
  2888. (b'\xff\xfe\x00\x00', 'utf-32-le'),
  2889. (b'\xff\xfe', 'utf-16-le'),
  2890. (b'\xfe\xff', 'utf-16-be'),
  2891. ]
  2892. def is_html(first_bytes):
  2893. """ Detect whether a file contains HTML by examining its first bytes. """
  2894. encoding = 'utf-8'
  2895. for bom, enc in BOMS:
  2896. while first_bytes.startswith(bom):
  2897. encoding, first_bytes = enc, first_bytes[len(bom):]
  2898. return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
  2899. def determine_protocol(info_dict):
  2900. protocol = info_dict.get('protocol')
  2901. if protocol is not None:
  2902. return protocol
  2903. url = sanitize_url(info_dict['url'])
  2904. if url.startswith('rtmp'):
  2905. return 'rtmp'
  2906. elif url.startswith('mms'):
  2907. return 'mms'
  2908. elif url.startswith('rtsp'):
  2909. return 'rtsp'
  2910. ext = determine_ext(url)
  2911. if ext == 'm3u8':
  2912. return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
  2913. elif ext == 'f4m':
  2914. return 'f4m'
  2915. return urllib.parse.urlparse(url).scheme
  2916. def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
  2917. """ Render a list of rows, each as a list of values.
  2918. Text after a \t will be right aligned """
  2919. def width(string):
  2920. return len(remove_terminal_sequences(string).replace('\t', ''))
  2921. def get_max_lens(table):
  2922. return [max(width(str(v)) for v in col) for col in zip(*table)]
  2923. def filter_using_list(row, filterArray):
  2924. return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
  2925. max_lens = get_max_lens(data) if hide_empty else []
  2926. header_row = filter_using_list(header_row, max_lens)
  2927. data = [filter_using_list(row, max_lens) for row in data]
  2928. table = [header_row] + data
  2929. max_lens = get_max_lens(table)
  2930. extra_gap += 1
  2931. if delim:
  2932. table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
  2933. table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
  2934. for row in table:
  2935. for pos, text in enumerate(map(str, row)):
  2936. if '\t' in text:
  2937. row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
  2938. else:
  2939. row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
  2940. ret = '\n'.join(''.join(row).rstrip() for row in table)
  2941. return ret
  2942. def _match_one(filter_part, dct, incomplete):
  2943. # TODO: Generalize code with YoutubeDL._build_format_filter
  2944. STRING_OPERATORS = {
  2945. '*=': operator.contains,
  2946. '^=': lambda attr, value: attr.startswith(value),
  2947. '$=': lambda attr, value: attr.endswith(value),
  2948. '~=': lambda attr, value: re.search(value, attr),
  2949. }
  2950. COMPARISON_OPERATORS = {
  2951. **STRING_OPERATORS,
  2952. '<=': operator.le, # "<=" must be defined above "<"
  2953. '<': operator.lt,
  2954. '>=': operator.ge,
  2955. '>': operator.gt,
  2956. '=': operator.eq,
  2957. }
  2958. if isinstance(incomplete, bool):
  2959. is_incomplete = lambda _: incomplete
  2960. else:
  2961. is_incomplete = lambda k: k in incomplete
  2962. operator_rex = re.compile(r'''(?x)
  2963. (?P<key>[a-z_]+)
  2964. \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
  2965. (?:
  2966. (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
  2967. (?P<strval>.+?)
  2968. )
  2969. ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
  2970. m = operator_rex.fullmatch(filter_part.strip())
  2971. if m:
  2972. m = m.groupdict()
  2973. unnegated_op = COMPARISON_OPERATORS[m['op']]
  2974. if m['negation']:
  2975. op = lambda attr, value: not unnegated_op(attr, value)
  2976. else:
  2977. op = unnegated_op
  2978. comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
  2979. if m['quote']:
  2980. comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
  2981. actual_value = dct.get(m['key'])
  2982. numeric_comparison = None
  2983. if isinstance(actual_value, (int, float)):
  2984. # If the original field is a string and matching comparisonvalue is
  2985. # a number we should respect the origin of the original field
  2986. # and process comparison value as a string (see
  2987. # https://github.com/ytdl-org/youtube-dl/issues/11082)
  2988. try:
  2989. numeric_comparison = int(comparison_value)
  2990. except ValueError:
  2991. numeric_comparison = parse_filesize(comparison_value)
  2992. if numeric_comparison is None:
  2993. numeric_comparison = parse_filesize(f'{comparison_value}B')
  2994. if numeric_comparison is None:
  2995. numeric_comparison = parse_duration(comparison_value)
  2996. if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
  2997. raise ValueError('Operator %s only supports string values!' % m['op'])
  2998. if actual_value is None:
  2999. return is_incomplete(m['key']) or m['none_inclusive']
  3000. return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
  3001. UNARY_OPERATORS = {
  3002. '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
  3003. '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
  3004. }
  3005. operator_rex = re.compile(r'''(?x)
  3006. (?P<op>%s)\s*(?P<key>[a-z_]+)
  3007. ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
  3008. m = operator_rex.fullmatch(filter_part.strip())
  3009. if m:
  3010. op = UNARY_OPERATORS[m.group('op')]
  3011. actual_value = dct.get(m.group('key'))
  3012. if is_incomplete(m.group('key')) and actual_value is None:
  3013. return True
  3014. return op(actual_value)
  3015. raise ValueError('Invalid filter part %r' % filter_part)
  3016. def match_str(filter_str, dct, incomplete=False):
  3017. """ Filter a dictionary with a simple string syntax.
  3018. @returns Whether the filter passes
  3019. @param incomplete Set of keys that is expected to be missing from dct.
  3020. Can be True/False to indicate all/none of the keys may be missing.
  3021. All conditions on incomplete keys pass if the key is missing
  3022. """
  3023. return all(
  3024. _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
  3025. for filter_part in re.split(r'(?<!\\)&', filter_str))
  3026. def match_filter_func(filters, breaking_filters=None):
  3027. if not filters and not breaking_filters:
  3028. return None
  3029. breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
  3030. filters = set(variadic(filters or []))
  3031. interactive = '-' in filters
  3032. if interactive:
  3033. filters.remove('-')
  3034. def _match_func(info_dict, incomplete=False):
  3035. ret = breaking_filters(info_dict, incomplete)
  3036. if ret is not None:
  3037. raise RejectedVideoReached(ret)
  3038. if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
  3039. return NO_DEFAULT if interactive and not incomplete else None
  3040. else:
  3041. video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
  3042. filter_str = ') | ('.join(map(str.strip, filters))
  3043. return f'{video_title} does not pass filter ({filter_str}), skipping ..'
  3044. return _match_func
  3045. class download_range_func:
  3046. def __init__(self, chapters, ranges):
  3047. self.chapters, self.ranges = chapters, ranges
  3048. def __call__(self, info_dict, ydl):
  3049. if not self.ranges and not self.chapters:
  3050. yield {}
  3051. warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
  3052. else 'Cannot match chapters since chapter information is unavailable')
  3053. for regex in self.chapters or []:
  3054. for i, chapter in enumerate(info_dict.get('chapters') or []):
  3055. if re.search(regex, chapter['title']):
  3056. warning = None
  3057. yield {**chapter, 'index': i}
  3058. if self.chapters and warning:
  3059. ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
  3060. yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
  3061. def __eq__(self, other):
  3062. return (isinstance(other, download_range_func)
  3063. and self.chapters == other.chapters and self.ranges == other.ranges)
  3064. def __repr__(self):
  3065. return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
  3066. def parse_dfxp_time_expr(time_expr):
  3067. if not time_expr:
  3068. return
  3069. mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
  3070. if mobj:
  3071. return float(mobj.group('time_offset'))
  3072. mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
  3073. if mobj:
  3074. return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
  3075. def srt_subtitles_timecode(seconds):
  3076. return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
  3077. def ass_subtitles_timecode(seconds):
  3078. time = timetuple_from_msec(seconds * 1000)
  3079. return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
  3080. def dfxp2srt(dfxp_data):
  3081. '''
  3082. @param dfxp_data A bytes-like object containing DFXP data
  3083. @returns A unicode object containing converted SRT data
  3084. '''
  3085. LEGACY_NAMESPACES = (
  3086. (b'http://www.w3.org/ns/ttml', [
  3087. b'http://www.w3.org/2004/11/ttaf1',
  3088. b'http://www.w3.org/2006/04/ttaf1',
  3089. b'http://www.w3.org/2006/10/ttaf1',
  3090. ]),
  3091. (b'http://www.w3.org/ns/ttml#styling', [
  3092. b'http://www.w3.org/ns/ttml#style',
  3093. ]),
  3094. )
  3095. SUPPORTED_STYLING = [
  3096. 'color',
  3097. 'fontFamily',
  3098. 'fontSize',
  3099. 'fontStyle',
  3100. 'fontWeight',
  3101. 'textDecoration'
  3102. ]
  3103. _x = functools.partial(xpath_with_ns, ns_map={
  3104. 'xml': 'http://www.w3.org/XML/1998/namespace',
  3105. 'ttml': 'http://www.w3.org/ns/ttml',
  3106. 'tts': 'http://www.w3.org/ns/ttml#styling',
  3107. })
  3108. styles = {}
  3109. default_style = {}
  3110. class TTMLPElementParser:
  3111. _out = ''
  3112. _unclosed_elements = []
  3113. _applied_styles = []
  3114. def start(self, tag, attrib):
  3115. if tag in (_x('ttml:br'), 'br'):
  3116. self._out += '\n'
  3117. else:
  3118. unclosed_elements = []
  3119. style = {}
  3120. element_style_id = attrib.get('style')
  3121. if default_style:
  3122. style.update(default_style)
  3123. if element_style_id:
  3124. style.update(styles.get(element_style_id, {}))
  3125. for prop in SUPPORTED_STYLING:
  3126. prop_val = attrib.get(_x('tts:' + prop))
  3127. if prop_val:
  3128. style[prop] = prop_val
  3129. if style:
  3130. font = ''
  3131. for k, v in sorted(style.items()):
  3132. if self._applied_styles and self._applied_styles[-1].get(k) == v:
  3133. continue
  3134. if k == 'color':
  3135. font += ' color="%s"' % v
  3136. elif k == 'fontSize':
  3137. font += ' size="%s"' % v
  3138. elif k == 'fontFamily':
  3139. font += ' face="%s"' % v
  3140. elif k == 'fontWeight' and v == 'bold':
  3141. self._out += '<b>'
  3142. unclosed_elements.append('b')
  3143. elif k == 'fontStyle' and v == 'italic':
  3144. self._out += '<i>'
  3145. unclosed_elements.append('i')
  3146. elif k == 'textDecoration' and v == 'underline':
  3147. self._out += '<u>'
  3148. unclosed_elements.append('u')
  3149. if font:
  3150. self._out += '<font' + font + '>'
  3151. unclosed_elements.append('font')
  3152. applied_style = {}
  3153. if self._applied_styles:
  3154. applied_style.update(self._applied_styles[-1])
  3155. applied_style.update(style)
  3156. self._applied_styles.append(applied_style)
  3157. self._unclosed_elements.append(unclosed_elements)
  3158. def end(self, tag):
  3159. if tag not in (_x('ttml:br'), 'br'):
  3160. unclosed_elements = self._unclosed_elements.pop()
  3161. for element in reversed(unclosed_elements):
  3162. self._out += '</%s>' % element
  3163. if unclosed_elements and self._applied_styles:
  3164. self._applied_styles.pop()
  3165. def data(self, data):
  3166. self._out += data
  3167. def close(self):
  3168. return self._out.strip()
  3169. # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
  3170. # This will not trigger false positives since only UTF-8 text is being replaced
  3171. dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
  3172. def parse_node(node):
  3173. target = TTMLPElementParser()
  3174. parser = xml.etree.ElementTree.XMLParser(target=target)
  3175. parser.feed(xml.etree.ElementTree.tostring(node))
  3176. return parser.close()
  3177. for k, v in LEGACY_NAMESPACES:
  3178. for ns in v:
  3179. dfxp_data = dfxp_data.replace(ns, k)
  3180. dfxp = compat_etree_fromstring(dfxp_data)
  3181. out = []
  3182. paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
  3183. if not paras:
  3184. raise ValueError('Invalid dfxp/TTML subtitle')
  3185. repeat = False
  3186. while True:
  3187. for style in dfxp.findall(_x('.//ttml:style')):
  3188. style_id = style.get('id') or style.get(_x('xml:id'))
  3189. if not style_id:
  3190. continue
  3191. parent_style_id = style.get('style')
  3192. if parent_style_id:
  3193. if parent_style_id not in styles:
  3194. repeat = True
  3195. continue
  3196. styles[style_id] = styles[parent_style_id].copy()
  3197. for prop in SUPPORTED_STYLING:
  3198. prop_val = style.get(_x('tts:' + prop))
  3199. if prop_val:
  3200. styles.setdefault(style_id, {})[prop] = prop_val
  3201. if repeat:
  3202. repeat = False
  3203. else:
  3204. break
  3205. for p in ('body', 'div'):
  3206. ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
  3207. if ele is None:
  3208. continue
  3209. style = styles.get(ele.get('style'))
  3210. if not style:
  3211. continue
  3212. default_style.update(style)
  3213. for para, index in zip(paras, itertools.count(1)):
  3214. begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
  3215. end_time = parse_dfxp_time_expr(para.attrib.get('end'))
  3216. dur = parse_dfxp_time_expr(para.attrib.get('dur'))
  3217. if begin_time is None:
  3218. continue
  3219. if not end_time:
  3220. if not dur:
  3221. continue
  3222. end_time = begin_time + dur
  3223. out.append('%d\n%s --> %s\n%s\n\n' % (
  3224. index,
  3225. srt_subtitles_timecode(begin_time),
  3226. srt_subtitles_timecode(end_time),
  3227. parse_node(para)))
  3228. return ''.join(out)
  3229. def cli_option(params, command_option, param, separator=None):
  3230. param = params.get(param)
  3231. return ([] if param is None
  3232. else [command_option, str(param)] if separator is None
  3233. else [f'{command_option}{separator}{param}'])
  3234. def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
  3235. param = params.get(param)
  3236. assert param in (True, False, None)
  3237. return cli_option({True: true_value, False: false_value}, command_option, param, separator)
  3238. def cli_valueless_option(params, command_option, param, expected_value=True):
  3239. return [command_option] if params.get(param) == expected_value else []
  3240. def cli_configuration_args(argdict, keys, default=[], use_compat=True):
  3241. if isinstance(argdict, (list, tuple)): # for backward compatibility
  3242. if use_compat:
  3243. return argdict
  3244. else:
  3245. argdict = None
  3246. if argdict is None:
  3247. return default
  3248. assert isinstance(argdict, dict)
  3249. assert isinstance(keys, (list, tuple))
  3250. for key_list in keys:
  3251. arg_list = list(filter(
  3252. lambda x: x is not None,
  3253. [argdict.get(key.lower()) for key in variadic(key_list)]))
  3254. if arg_list:
  3255. return [arg for args in arg_list for arg in args]
  3256. return default
  3257. def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
  3258. main_key, exe = main_key.lower(), exe.lower()
  3259. root_key = exe if main_key == exe else f'{main_key}+{exe}'
  3260. keys = [f'{root_key}{k}' for k in (keys or [''])]
  3261. if root_key in keys:
  3262. if main_key != exe:
  3263. keys.append((main_key, exe))
  3264. keys.append('default')
  3265. else:
  3266. use_compat = False
  3267. return cli_configuration_args(argdict, keys, default, use_compat)
  3268. class ISO639Utils:
  3269. # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
  3270. _lang_map = {
  3271. 'aa': 'aar',
  3272. 'ab': 'abk',
  3273. 'ae': 'ave',
  3274. 'af': 'afr',
  3275. 'ak': 'aka',
  3276. 'am': 'amh',
  3277. 'an': 'arg',
  3278. 'ar': 'ara',
  3279. 'as': 'asm',
  3280. 'av': 'ava',
  3281. 'ay': 'aym',
  3282. 'az': 'aze',
  3283. 'ba': 'bak',
  3284. 'be': 'bel',
  3285. 'bg': 'bul',
  3286. 'bh': 'bih',
  3287. 'bi': 'bis',
  3288. 'bm': 'bam',
  3289. 'bn': 'ben',
  3290. 'bo': 'bod',
  3291. 'br': 'bre',
  3292. 'bs': 'bos',
  3293. 'ca': 'cat',
  3294. 'ce': 'che',
  3295. 'ch': 'cha',
  3296. 'co': 'cos',
  3297. 'cr': 'cre',
  3298. 'cs': 'ces',
  3299. 'cu': 'chu',
  3300. 'cv': 'chv',
  3301. 'cy': 'cym',
  3302. 'da': 'dan',
  3303. 'de': 'deu',
  3304. 'dv': 'div',
  3305. 'dz': 'dzo',
  3306. 'ee': 'ewe',
  3307. 'el': 'ell',
  3308. 'en': 'eng',
  3309. 'eo': 'epo',
  3310. 'es': 'spa',
  3311. 'et': 'est',
  3312. 'eu': 'eus',
  3313. 'fa': 'fas',
  3314. 'ff': 'ful',
  3315. 'fi': 'fin',
  3316. 'fj': 'fij',
  3317. 'fo': 'fao',
  3318. 'fr': 'fra',
  3319. 'fy': 'fry',
  3320. 'ga': 'gle',
  3321. 'gd': 'gla',
  3322. 'gl': 'glg',
  3323. 'gn': 'grn',
  3324. 'gu': 'guj',
  3325. 'gv': 'glv',
  3326. 'ha': 'hau',
  3327. 'he': 'heb',
  3328. 'iw': 'heb', # Replaced by he in 1989 revision
  3329. 'hi': 'hin',
  3330. 'ho': 'hmo',
  3331. 'hr': 'hrv',
  3332. 'ht': 'hat',
  3333. 'hu': 'hun',
  3334. 'hy': 'hye',
  3335. 'hz': 'her',
  3336. 'ia': 'ina',
  3337. 'id': 'ind',
  3338. 'in': 'ind', # Replaced by id in 1989 revision
  3339. 'ie': 'ile',
  3340. 'ig': 'ibo',
  3341. 'ii': 'iii',
  3342. 'ik': 'ipk',
  3343. 'io': 'ido',
  3344. 'is': 'isl',
  3345. 'it': 'ita',
  3346. 'iu': 'iku',
  3347. 'ja': 'jpn',
  3348. 'jv': 'jav',
  3349. 'ka': 'kat',
  3350. 'kg': 'kon',
  3351. 'ki': 'kik',
  3352. 'kj': 'kua',
  3353. 'kk': 'kaz',
  3354. 'kl': 'kal',
  3355. 'km': 'khm',
  3356. 'kn': 'kan',
  3357. 'ko': 'kor',
  3358. 'kr': 'kau',
  3359. 'ks': 'kas',
  3360. 'ku': 'kur',
  3361. 'kv': 'kom',
  3362. 'kw': 'cor',
  3363. 'ky': 'kir',
  3364. 'la': 'lat',
  3365. 'lb': 'ltz',
  3366. 'lg': 'lug',
  3367. 'li': 'lim',
  3368. 'ln': 'lin',
  3369. 'lo': 'lao',
  3370. 'lt': 'lit',
  3371. 'lu': 'lub',
  3372. 'lv': 'lav',
  3373. 'mg': 'mlg',
  3374. 'mh': 'mah',
  3375. 'mi': 'mri',
  3376. 'mk': 'mkd',
  3377. 'ml': 'mal',
  3378. 'mn': 'mon',
  3379. 'mr': 'mar',
  3380. 'ms': 'msa',
  3381. 'mt': 'mlt',
  3382. 'my': 'mya',
  3383. 'na': 'nau',
  3384. 'nb': 'nob',
  3385. 'nd': 'nde',
  3386. 'ne': 'nep',
  3387. 'ng': 'ndo',
  3388. 'nl': 'nld',
  3389. 'nn': 'nno',
  3390. 'no': 'nor',
  3391. 'nr': 'nbl',
  3392. 'nv': 'nav',
  3393. 'ny': 'nya',
  3394. 'oc': 'oci',
  3395. 'oj': 'oji',
  3396. 'om': 'orm',
  3397. 'or': 'ori',
  3398. 'os': 'oss',
  3399. 'pa': 'pan',
  3400. 'pi': 'pli',
  3401. 'pl': 'pol',
  3402. 'ps': 'pus',
  3403. 'pt': 'por',
  3404. 'qu': 'que',
  3405. 'rm': 'roh',
  3406. 'rn': 'run',
  3407. 'ro': 'ron',
  3408. 'ru': 'rus',
  3409. 'rw': 'kin',
  3410. 'sa': 'san',
  3411. 'sc': 'srd',
  3412. 'sd': 'snd',
  3413. 'se': 'sme',
  3414. 'sg': 'sag',
  3415. 'si': 'sin',
  3416. 'sk': 'slk',
  3417. 'sl': 'slv',
  3418. 'sm': 'smo',
  3419. 'sn': 'sna',
  3420. 'so': 'som',
  3421. 'sq': 'sqi',
  3422. 'sr': 'srp',
  3423. 'ss': 'ssw',
  3424. 'st': 'sot',
  3425. 'su': 'sun',
  3426. 'sv': 'swe',
  3427. 'sw': 'swa',
  3428. 'ta': 'tam',
  3429. 'te': 'tel',
  3430. 'tg': 'tgk',
  3431. 'th': 'tha',
  3432. 'ti': 'tir',
  3433. 'tk': 'tuk',
  3434. 'tl': 'tgl',
  3435. 'tn': 'tsn',
  3436. 'to': 'ton',
  3437. 'tr': 'tur',
  3438. 'ts': 'tso',
  3439. 'tt': 'tat',
  3440. 'tw': 'twi',
  3441. 'ty': 'tah',
  3442. 'ug': 'uig',
  3443. 'uk': 'ukr',
  3444. 'ur': 'urd',
  3445. 'uz': 'uzb',
  3446. 've': 'ven',
  3447. 'vi': 'vie',
  3448. 'vo': 'vol',
  3449. 'wa': 'wln',
  3450. 'wo': 'wol',
  3451. 'xh': 'xho',
  3452. 'yi': 'yid',
  3453. 'ji': 'yid', # Replaced by yi in 1989 revision
  3454. 'yo': 'yor',
  3455. 'za': 'zha',
  3456. 'zh': 'zho',
  3457. 'zu': 'zul',
  3458. }
  3459. @classmethod
  3460. def short2long(cls, code):
  3461. """Convert language code from ISO 639-1 to ISO 639-2/T"""
  3462. return cls._lang_map.get(code[:2])
  3463. @classmethod
  3464. def long2short(cls, code):
  3465. """Convert language code from ISO 639-2/T to ISO 639-1"""
  3466. for short_name, long_name in cls._lang_map.items():
  3467. if long_name == code:
  3468. return short_name
  3469. class ISO3166Utils:
  3470. # From http://data.okfn.org/data/core/country-list
  3471. _country_map = {
  3472. 'AF': 'Afghanistan',
  3473. 'AX': 'Åland Islands',
  3474. 'AL': 'Albania',
  3475. 'DZ': 'Algeria',
  3476. 'AS': 'American Samoa',
  3477. 'AD': 'Andorra',
  3478. 'AO': 'Angola',
  3479. 'AI': 'Anguilla',
  3480. 'AQ': 'Antarctica',
  3481. 'AG': 'Antigua and Barbuda',
  3482. 'AR': 'Argentina',
  3483. 'AM': 'Armenia',
  3484. 'AW': 'Aruba',
  3485. 'AU': 'Australia',
  3486. 'AT': 'Austria',
  3487. 'AZ': 'Azerbaijan',
  3488. 'BS': 'Bahamas',
  3489. 'BH': 'Bahrain',
  3490. 'BD': 'Bangladesh',
  3491. 'BB': 'Barbados',
  3492. 'BY': 'Belarus',
  3493. 'BE': 'Belgium',
  3494. 'BZ': 'Belize',
  3495. 'BJ': 'Benin',
  3496. 'BM': 'Bermuda',
  3497. 'BT': 'Bhutan',
  3498. 'BO': 'Bolivia, Plurinational State of',
  3499. 'BQ': 'Bonaire, Sint Eustatius and Saba',
  3500. 'BA': 'Bosnia and Herzegovina',
  3501. 'BW': 'Botswana',
  3502. 'BV': 'Bouvet Island',
  3503. 'BR': 'Brazil',
  3504. 'IO': 'British Indian Ocean Territory',
  3505. 'BN': 'Brunei Darussalam',
  3506. 'BG': 'Bulgaria',
  3507. 'BF': 'Burkina Faso',
  3508. 'BI': 'Burundi',
  3509. 'KH': 'Cambodia',
  3510. 'CM': 'Cameroon',
  3511. 'CA': 'Canada',
  3512. 'CV': 'Cape Verde',
  3513. 'KY': 'Cayman Islands',
  3514. 'CF': 'Central African Republic',
  3515. 'TD': 'Chad',
  3516. 'CL': 'Chile',
  3517. 'CN': 'China',
  3518. 'CX': 'Christmas Island',
  3519. 'CC': 'Cocos (Keeling) Islands',
  3520. 'CO': 'Colombia',
  3521. 'KM': 'Comoros',
  3522. 'CG': 'Congo',
  3523. 'CD': 'Congo, the Democratic Republic of the',
  3524. 'CK': 'Cook Islands',
  3525. 'CR': 'Costa Rica',
  3526. 'CI': 'Côte d\'Ivoire',
  3527. 'HR': 'Croatia',
  3528. 'CU': 'Cuba',
  3529. 'CW': 'Curaçao',
  3530. 'CY': 'Cyprus',
  3531. 'CZ': 'Czech Republic',
  3532. 'DK': 'Denmark',
  3533. 'DJ': 'Djibouti',
  3534. 'DM': 'Dominica',
  3535. 'DO': 'Dominican Republic',
  3536. 'EC': 'Ecuador',
  3537. 'EG': 'Egypt',
  3538. 'SV': 'El Salvador',
  3539. 'GQ': 'Equatorial Guinea',
  3540. 'ER': 'Eritrea',
  3541. 'EE': 'Estonia',
  3542. 'ET': 'Ethiopia',
  3543. 'FK': 'Falkland Islands (Malvinas)',
  3544. 'FO': 'Faroe Islands',
  3545. 'FJ': 'Fiji',
  3546. 'FI': 'Finland',
  3547. 'FR': 'France',
  3548. 'GF': 'French Guiana',
  3549. 'PF': 'French Polynesia',
  3550. 'TF': 'French Southern Territories',
  3551. 'GA': 'Gabon',
  3552. 'GM': 'Gambia',
  3553. 'GE': 'Georgia',
  3554. 'DE': 'Germany',
  3555. 'GH': 'Ghana',
  3556. 'GI': 'Gibraltar',
  3557. 'GR': 'Greece',
  3558. 'GL': 'Greenland',
  3559. 'GD': 'Grenada',
  3560. 'GP': 'Guadeloupe',
  3561. 'GU': 'Guam',
  3562. 'GT': 'Guatemala',
  3563. 'GG': 'Guernsey',
  3564. 'GN': 'Guinea',
  3565. 'GW': 'Guinea-Bissau',
  3566. 'GY': 'Guyana',
  3567. 'HT': 'Haiti',
  3568. 'HM': 'Heard Island and McDonald Islands',
  3569. 'VA': 'Holy See (Vatican City State)',
  3570. 'HN': 'Honduras',
  3571. 'HK': 'Hong Kong',
  3572. 'HU': 'Hungary',
  3573. 'IS': 'Iceland',
  3574. 'IN': 'India',
  3575. 'ID': 'Indonesia',
  3576. 'IR': 'Iran, Islamic Republic of',
  3577. 'IQ': 'Iraq',
  3578. 'IE': 'Ireland',
  3579. 'IM': 'Isle of Man',
  3580. 'IL': 'Israel',
  3581. 'IT': 'Italy',
  3582. 'JM': 'Jamaica',
  3583. 'JP': 'Japan',
  3584. 'JE': 'Jersey',
  3585. 'JO': 'Jordan',
  3586. 'KZ': 'Kazakhstan',
  3587. 'KE': 'Kenya',
  3588. 'KI': 'Kiribati',
  3589. 'KP': 'Korea, Democratic People\'s Republic of',
  3590. 'KR': 'Korea, Republic of',
  3591. 'KW': 'Kuwait',
  3592. 'KG': 'Kyrgyzstan',
  3593. 'LA': 'Lao People\'s Democratic Republic',
  3594. 'LV': 'Latvia',
  3595. 'LB': 'Lebanon',
  3596. 'LS': 'Lesotho',
  3597. 'LR': 'Liberia',
  3598. 'LY': 'Libya',
  3599. 'LI': 'Liechtenstein',
  3600. 'LT': 'Lithuania',
  3601. 'LU': 'Luxembourg',
  3602. 'MO': 'Macao',
  3603. 'MK': 'Macedonia, the Former Yugoslav Republic of',
  3604. 'MG': 'Madagascar',
  3605. 'MW': 'Malawi',
  3606. 'MY': 'Malaysia',
  3607. 'MV': 'Maldives',
  3608. 'ML': 'Mali',
  3609. 'MT': 'Malta',
  3610. 'MH': 'Marshall Islands',
  3611. 'MQ': 'Martinique',
  3612. 'MR': 'Mauritania',
  3613. 'MU': 'Mauritius',
  3614. 'YT': 'Mayotte',
  3615. 'MX': 'Mexico',
  3616. 'FM': 'Micronesia, Federated States of',
  3617. 'MD': 'Moldova, Republic of',
  3618. 'MC': 'Monaco',
  3619. 'MN': 'Mongolia',
  3620. 'ME': 'Montenegro',
  3621. 'MS': 'Montserrat',
  3622. 'MA': 'Morocco',
  3623. 'MZ': 'Mozambique',
  3624. 'MM': 'Myanmar',
  3625. 'NA': 'Namibia',
  3626. 'NR': 'Nauru',
  3627. 'NP': 'Nepal',
  3628. 'NL': 'Netherlands',
  3629. 'NC': 'New Caledonia',
  3630. 'NZ': 'New Zealand',
  3631. 'NI': 'Nicaragua',
  3632. 'NE': 'Niger',
  3633. 'NG': 'Nigeria',
  3634. 'NU': 'Niue',
  3635. 'NF': 'Norfolk Island',
  3636. 'MP': 'Northern Mariana Islands',
  3637. 'NO': 'Norway',
  3638. 'OM': 'Oman',
  3639. 'PK': 'Pakistan',
  3640. 'PW': 'Palau',
  3641. 'PS': 'Palestine, State of',
  3642. 'PA': 'Panama',
  3643. 'PG': 'Papua New Guinea',
  3644. 'PY': 'Paraguay',
  3645. 'PE': 'Peru',
  3646. 'PH': 'Philippines',
  3647. 'PN': 'Pitcairn',
  3648. 'PL': 'Poland',
  3649. 'PT': 'Portugal',
  3650. 'PR': 'Puerto Rico',
  3651. 'QA': 'Qatar',
  3652. 'RE': 'Réunion',
  3653. 'RO': 'Romania',
  3654. 'RU': 'Russian Federation',
  3655. 'RW': 'Rwanda',
  3656. 'BL': 'Saint Barthélemy',
  3657. 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
  3658. 'KN': 'Saint Kitts and Nevis',
  3659. 'LC': 'Saint Lucia',
  3660. 'MF': 'Saint Martin (French part)',
  3661. 'PM': 'Saint Pierre and Miquelon',
  3662. 'VC': 'Saint Vincent and the Grenadines',
  3663. 'WS': 'Samoa',
  3664. 'SM': 'San Marino',
  3665. 'ST': 'Sao Tome and Principe',
  3666. 'SA': 'Saudi Arabia',
  3667. 'SN': 'Senegal',
  3668. 'RS': 'Serbia',
  3669. 'SC': 'Seychelles',
  3670. 'SL': 'Sierra Leone',
  3671. 'SG': 'Singapore',
  3672. 'SX': 'Sint Maarten (Dutch part)',
  3673. 'SK': 'Slovakia',
  3674. 'SI': 'Slovenia',
  3675. 'SB': 'Solomon Islands',
  3676. 'SO': 'Somalia',
  3677. 'ZA': 'South Africa',
  3678. 'GS': 'South Georgia and the South Sandwich Islands',
  3679. 'SS': 'South Sudan',
  3680. 'ES': 'Spain',
  3681. 'LK': 'Sri Lanka',
  3682. 'SD': 'Sudan',
  3683. 'SR': 'Suriname',
  3684. 'SJ': 'Svalbard and Jan Mayen',
  3685. 'SZ': 'Swaziland',
  3686. 'SE': 'Sweden',
  3687. 'CH': 'Switzerland',
  3688. 'SY': 'Syrian Arab Republic',
  3689. 'TW': 'Taiwan, Province of China',
  3690. 'TJ': 'Tajikistan',
  3691. 'TZ': 'Tanzania, United Republic of',
  3692. 'TH': 'Thailand',
  3693. 'TL': 'Timor-Leste',
  3694. 'TG': 'Togo',
  3695. 'TK': 'Tokelau',
  3696. 'TO': 'Tonga',
  3697. 'TT': 'Trinidad and Tobago',
  3698. 'TN': 'Tunisia',
  3699. 'TR': 'Turkey',
  3700. 'TM': 'Turkmenistan',
  3701. 'TC': 'Turks and Caicos Islands',
  3702. 'TV': 'Tuvalu',
  3703. 'UG': 'Uganda',
  3704. 'UA': 'Ukraine',
  3705. 'AE': 'United Arab Emirates',
  3706. 'GB': 'United Kingdom',
  3707. 'US': 'United States',
  3708. 'UM': 'United States Minor Outlying Islands',
  3709. 'UY': 'Uruguay',
  3710. 'UZ': 'Uzbekistan',
  3711. 'VU': 'Vanuatu',
  3712. 'VE': 'Venezuela, Bolivarian Republic of',
  3713. 'VN': 'Viet Nam',
  3714. 'VG': 'Virgin Islands, British',
  3715. 'VI': 'Virgin Islands, U.S.',
  3716. 'WF': 'Wallis and Futuna',
  3717. 'EH': 'Western Sahara',
  3718. 'YE': 'Yemen',
  3719. 'ZM': 'Zambia',
  3720. 'ZW': 'Zimbabwe',
  3721. # Not ISO 3166 codes, but used for IP blocks
  3722. 'AP': 'Asia/Pacific Region',
  3723. 'EU': 'Europe',
  3724. }
  3725. @classmethod
  3726. def short2full(cls, code):
  3727. """Convert an ISO 3166-2 country code to the corresponding full name"""
  3728. return cls._country_map.get(code.upper())
  3729. class GeoUtils:
  3730. # Major IPv4 address blocks per country
  3731. _country_ip_map = {
  3732. 'AD': '46.172.224.0/19',
  3733. 'AE': '94.200.0.0/13',
  3734. 'AF': '149.54.0.0/17',
  3735. 'AG': '209.59.64.0/18',
  3736. 'AI': '204.14.248.0/21',
  3737. 'AL': '46.99.0.0/16',
  3738. 'AM': '46.70.0.0/15',
  3739. 'AO': '105.168.0.0/13',
  3740. 'AP': '182.50.184.0/21',
  3741. 'AQ': '23.154.160.0/24',
  3742. 'AR': '181.0.0.0/12',
  3743. 'AS': '202.70.112.0/20',
  3744. 'AT': '77.116.0.0/14',
  3745. 'AU': '1.128.0.0/11',
  3746. 'AW': '181.41.0.0/18',
  3747. 'AX': '185.217.4.0/22',
  3748. 'AZ': '5.197.0.0/16',
  3749. 'BA': '31.176.128.0/17',
  3750. 'BB': '65.48.128.0/17',
  3751. 'BD': '114.130.0.0/16',
  3752. 'BE': '57.0.0.0/8',
  3753. 'BF': '102.178.0.0/15',
  3754. 'BG': '95.42.0.0/15',
  3755. 'BH': '37.131.0.0/17',
  3756. 'BI': '154.117.192.0/18',
  3757. 'BJ': '137.255.0.0/16',
  3758. 'BL': '185.212.72.0/23',
  3759. 'BM': '196.12.64.0/18',
  3760. 'BN': '156.31.0.0/16',
  3761. 'BO': '161.56.0.0/16',
  3762. 'BQ': '161.0.80.0/20',
  3763. 'BR': '191.128.0.0/12',
  3764. 'BS': '24.51.64.0/18',
  3765. 'BT': '119.2.96.0/19',
  3766. 'BW': '168.167.0.0/16',
  3767. 'BY': '178.120.0.0/13',
  3768. 'BZ': '179.42.192.0/18',
  3769. 'CA': '99.224.0.0/11',
  3770. 'CD': '41.243.0.0/16',
  3771. 'CF': '197.242.176.0/21',
  3772. 'CG': '160.113.0.0/16',
  3773. 'CH': '85.0.0.0/13',
  3774. 'CI': '102.136.0.0/14',
  3775. 'CK': '202.65.32.0/19',
  3776. 'CL': '152.172.0.0/14',
  3777. 'CM': '102.244.0.0/14',
  3778. 'CN': '36.128.0.0/10',
  3779. 'CO': '181.240.0.0/12',
  3780. 'CR': '201.192.0.0/12',
  3781. 'CU': '152.206.0.0/15',
  3782. 'CV': '165.90.96.0/19',
  3783. 'CW': '190.88.128.0/17',
  3784. 'CY': '31.153.0.0/16',
  3785. 'CZ': '88.100.0.0/14',
  3786. 'DE': '53.0.0.0/8',
  3787. 'DJ': '197.241.0.0/17',
  3788. 'DK': '87.48.0.0/12',
  3789. 'DM': '192.243.48.0/20',
  3790. 'DO': '152.166.0.0/15',
  3791. 'DZ': '41.96.0.0/12',
  3792. 'EC': '186.68.0.0/15',
  3793. 'EE': '90.190.0.0/15',
  3794. 'EG': '156.160.0.0/11',
  3795. 'ER': '196.200.96.0/20',
  3796. 'ES': '88.0.0.0/11',
  3797. 'ET': '196.188.0.0/14',
  3798. 'EU': '2.16.0.0/13',
  3799. 'FI': '91.152.0.0/13',
  3800. 'FJ': '144.120.0.0/16',
  3801. 'FK': '80.73.208.0/21',
  3802. 'FM': '119.252.112.0/20',
  3803. 'FO': '88.85.32.0/19',
  3804. 'FR': '90.0.0.0/9',
  3805. 'GA': '41.158.0.0/15',
  3806. 'GB': '25.0.0.0/8',
  3807. 'GD': '74.122.88.0/21',
  3808. 'GE': '31.146.0.0/16',
  3809. 'GF': '161.22.64.0/18',
  3810. 'GG': '62.68.160.0/19',
  3811. 'GH': '154.160.0.0/12',
  3812. 'GI': '95.164.0.0/16',
  3813. 'GL': '88.83.0.0/19',
  3814. 'GM': '160.182.0.0/15',
  3815. 'GN': '197.149.192.0/18',
  3816. 'GP': '104.250.0.0/19',
  3817. 'GQ': '105.235.224.0/20',
  3818. 'GR': '94.64.0.0/13',
  3819. 'GT': '168.234.0.0/16',
  3820. 'GU': '168.123.0.0/16',
  3821. 'GW': '197.214.80.0/20',
  3822. 'GY': '181.41.64.0/18',
  3823. 'HK': '113.252.0.0/14',
  3824. 'HN': '181.210.0.0/16',
  3825. 'HR': '93.136.0.0/13',
  3826. 'HT': '148.102.128.0/17',
  3827. 'HU': '84.0.0.0/14',
  3828. 'ID': '39.192.0.0/10',
  3829. 'IE': '87.32.0.0/12',
  3830. 'IL': '79.176.0.0/13',
  3831. 'IM': '5.62.80.0/20',
  3832. 'IN': '117.192.0.0/10',
  3833. 'IO': '203.83.48.0/21',
  3834. 'IQ': '37.236.0.0/14',
  3835. 'IR': '2.176.0.0/12',
  3836. 'IS': '82.221.0.0/16',
  3837. 'IT': '79.0.0.0/10',
  3838. 'JE': '87.244.64.0/18',
  3839. 'JM': '72.27.0.0/17',
  3840. 'JO': '176.29.0.0/16',
  3841. 'JP': '133.0.0.0/8',
  3842. 'KE': '105.48.0.0/12',
  3843. 'KG': '158.181.128.0/17',
  3844. 'KH': '36.37.128.0/17',
  3845. 'KI': '103.25.140.0/22',
  3846. 'KM': '197.255.224.0/20',
  3847. 'KN': '198.167.192.0/19',
  3848. 'KP': '175.45.176.0/22',
  3849. 'KR': '175.192.0.0/10',
  3850. 'KW': '37.36.0.0/14',
  3851. 'KY': '64.96.0.0/15',
  3852. 'KZ': '2.72.0.0/13',
  3853. 'LA': '115.84.64.0/18',
  3854. 'LB': '178.135.0.0/16',
  3855. 'LC': '24.92.144.0/20',
  3856. 'LI': '82.117.0.0/19',
  3857. 'LK': '112.134.0.0/15',
  3858. 'LR': '102.183.0.0/16',
  3859. 'LS': '129.232.0.0/17',
  3860. 'LT': '78.56.0.0/13',
  3861. 'LU': '188.42.0.0/16',
  3862. 'LV': '46.109.0.0/16',
  3863. 'LY': '41.252.0.0/14',
  3864. 'MA': '105.128.0.0/11',
  3865. 'MC': '88.209.64.0/18',
  3866. 'MD': '37.246.0.0/16',
  3867. 'ME': '178.175.0.0/17',
  3868. 'MF': '74.112.232.0/21',
  3869. 'MG': '154.126.0.0/17',
  3870. 'MH': '117.103.88.0/21',
  3871. 'MK': '77.28.0.0/15',
  3872. 'ML': '154.118.128.0/18',
  3873. 'MM': '37.111.0.0/17',
  3874. 'MN': '49.0.128.0/17',
  3875. 'MO': '60.246.0.0/16',
  3876. 'MP': '202.88.64.0/20',
  3877. 'MQ': '109.203.224.0/19',
  3878. 'MR': '41.188.64.0/18',
  3879. 'MS': '208.90.112.0/22',
  3880. 'MT': '46.11.0.0/16',
  3881. 'MU': '105.16.0.0/12',
  3882. 'MV': '27.114.128.0/18',
  3883. 'MW': '102.70.0.0/15',
  3884. 'MX': '187.192.0.0/11',
  3885. 'MY': '175.136.0.0/13',
  3886. 'MZ': '197.218.0.0/15',
  3887. 'NA': '41.182.0.0/16',
  3888. 'NC': '101.101.0.0/18',
  3889. 'NE': '197.214.0.0/18',
  3890. 'NF': '203.17.240.0/22',
  3891. 'NG': '105.112.0.0/12',
  3892. 'NI': '186.76.0.0/15',
  3893. 'NL': '145.96.0.0/11',
  3894. 'NO': '84.208.0.0/13',
  3895. 'NP': '36.252.0.0/15',
  3896. 'NR': '203.98.224.0/19',
  3897. 'NU': '49.156.48.0/22',
  3898. 'NZ': '49.224.0.0/14',
  3899. 'OM': '5.36.0.0/15',
  3900. 'PA': '186.72.0.0/15',
  3901. 'PE': '186.160.0.0/14',
  3902. 'PF': '123.50.64.0/18',
  3903. 'PG': '124.240.192.0/19',
  3904. 'PH': '49.144.0.0/13',
  3905. 'PK': '39.32.0.0/11',
  3906. 'PL': '83.0.0.0/11',
  3907. 'PM': '70.36.0.0/20',
  3908. 'PR': '66.50.0.0/16',
  3909. 'PS': '188.161.0.0/16',
  3910. 'PT': '85.240.0.0/13',
  3911. 'PW': '202.124.224.0/20',
  3912. 'PY': '181.120.0.0/14',
  3913. 'QA': '37.210.0.0/15',
  3914. 'RE': '102.35.0.0/16',
  3915. 'RO': '79.112.0.0/13',
  3916. 'RS': '93.86.0.0/15',
  3917. 'RU': '5.136.0.0/13',
  3918. 'RW': '41.186.0.0/16',
  3919. 'SA': '188.48.0.0/13',
  3920. 'SB': '202.1.160.0/19',
  3921. 'SC': '154.192.0.0/11',
  3922. 'SD': '102.120.0.0/13',
  3923. 'SE': '78.64.0.0/12',
  3924. 'SG': '8.128.0.0/10',
  3925. 'SI': '188.196.0.0/14',
  3926. 'SK': '78.98.0.0/15',
  3927. 'SL': '102.143.0.0/17',
  3928. 'SM': '89.186.32.0/19',
  3929. 'SN': '41.82.0.0/15',
  3930. 'SO': '154.115.192.0/18',
  3931. 'SR': '186.179.128.0/17',
  3932. 'SS': '105.235.208.0/21',
  3933. 'ST': '197.159.160.0/19',
  3934. 'SV': '168.243.0.0/16',
  3935. 'SX': '190.102.0.0/20',
  3936. 'SY': '5.0.0.0/16',
  3937. 'SZ': '41.84.224.0/19',
  3938. 'TC': '65.255.48.0/20',
  3939. 'TD': '154.68.128.0/19',
  3940. 'TG': '196.168.0.0/14',
  3941. 'TH': '171.96.0.0/13',
  3942. 'TJ': '85.9.128.0/18',
  3943. 'TK': '27.96.24.0/21',
  3944. 'TL': '180.189.160.0/20',
  3945. 'TM': '95.85.96.0/19',
  3946. 'TN': '197.0.0.0/11',
  3947. 'TO': '175.176.144.0/21',
  3948. 'TR': '78.160.0.0/11',
  3949. 'TT': '186.44.0.0/15',
  3950. 'TV': '202.2.96.0/19',
  3951. 'TW': '120.96.0.0/11',
  3952. 'TZ': '156.156.0.0/14',
  3953. 'UA': '37.52.0.0/14',
  3954. 'UG': '102.80.0.0/13',
  3955. 'US': '6.0.0.0/8',
  3956. 'UY': '167.56.0.0/13',
  3957. 'UZ': '84.54.64.0/18',
  3958. 'VA': '212.77.0.0/19',
  3959. 'VC': '207.191.240.0/21',
  3960. 'VE': '186.88.0.0/13',
  3961. 'VG': '66.81.192.0/20',
  3962. 'VI': '146.226.0.0/16',
  3963. 'VN': '14.160.0.0/11',
  3964. 'VU': '202.80.32.0/20',
  3965. 'WF': '117.20.32.0/21',
  3966. 'WS': '202.4.32.0/19',
  3967. 'YE': '134.35.0.0/16',
  3968. 'YT': '41.242.116.0/22',
  3969. 'ZA': '41.0.0.0/11',
  3970. 'ZM': '102.144.0.0/13',
  3971. 'ZW': '102.177.192.0/18',
  3972. }
  3973. @classmethod
  3974. def random_ipv4(cls, code_or_block):
  3975. if len(code_or_block) == 2:
  3976. block = cls._country_ip_map.get(code_or_block.upper())
  3977. if not block:
  3978. return None
  3979. else:
  3980. block = code_or_block
  3981. addr, preflen = block.split('/')
  3982. addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
  3983. addr_max = addr_min | (0xffffffff >> int(preflen))
  3984. return str(socket.inet_ntoa(
  3985. struct.pack('!L', random.randint(addr_min, addr_max))))
  3986. class PerRequestProxyHandler(urllib.request.ProxyHandler):
  3987. def __init__(self, proxies=None):
  3988. # Set default handlers
  3989. for type in ('http', 'https'):
  3990. setattr(self, '%s_open' % type,
  3991. lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
  3992. meth(r, proxy, type))
  3993. urllib.request.ProxyHandler.__init__(self, proxies)
  3994. def proxy_open(self, req, proxy, type):
  3995. req_proxy = req.headers.get('Ytdl-request-proxy')
  3996. if req_proxy is not None:
  3997. proxy = req_proxy
  3998. del req.headers['Ytdl-request-proxy']
  3999. if proxy == '__noproxy__':
  4000. return None # No Proxy
  4001. if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
  4002. req.add_header('Ytdl-socks-proxy', proxy)
  4003. # yt-dlp's http/https handlers do wrapping the socket with socks
  4004. return None
  4005. return urllib.request.ProxyHandler.proxy_open(
  4006. self, req, proxy, type)
  4007. # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
  4008. # released into Public Domain
  4009. # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
  4010. def long_to_bytes(n, blocksize=0):
  4011. """long_to_bytes(n:long, blocksize:int) : string
  4012. Convert a long integer to a byte string.
  4013. If optional blocksize is given and greater than zero, pad the front of the
  4014. byte string with binary zeros so that the length is a multiple of
  4015. blocksize.
  4016. """
  4017. # after much testing, this algorithm was deemed to be the fastest
  4018. s = b''
  4019. n = int(n)
  4020. while n > 0:
  4021. s = struct.pack('>I', n & 0xffffffff) + s
  4022. n = n >> 32
  4023. # strip off leading zeros
  4024. for i in range(len(s)):
  4025. if s[i] != b'\000'[0]:
  4026. break
  4027. else:
  4028. # only happens when n == 0
  4029. s = b'\000'
  4030. i = 0
  4031. s = s[i:]
  4032. # add back some pad bytes. this could be done more efficiently w.r.t. the
  4033. # de-padding being done above, but sigh...
  4034. if blocksize > 0 and len(s) % blocksize:
  4035. s = (blocksize - len(s) % blocksize) * b'\000' + s
  4036. return s
  4037. def bytes_to_long(s):
  4038. """bytes_to_long(string) : long
  4039. Convert a byte string to a long integer.
  4040. This is (essentially) the inverse of long_to_bytes().
  4041. """
  4042. acc = 0
  4043. length = len(s)
  4044. if length % 4:
  4045. extra = (4 - length % 4)
  4046. s = b'\000' * extra + s
  4047. length = length + extra
  4048. for i in range(0, length, 4):
  4049. acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
  4050. return acc
  4051. def ohdave_rsa_encrypt(data, exponent, modulus):
  4052. '''
  4053. Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
  4054. Input:
  4055. data: data to encrypt, bytes-like object
  4056. exponent, modulus: parameter e and N of RSA algorithm, both integer
  4057. Output: hex string of encrypted data
  4058. Limitation: supports one block encryption only
  4059. '''
  4060. payload = int(binascii.hexlify(data[::-1]), 16)
  4061. encrypted = pow(payload, exponent, modulus)
  4062. return '%x' % encrypted
  4063. def pkcs1pad(data, length):
  4064. """
  4065. Padding input data with PKCS#1 scheme
  4066. @param {int[]} data input data
  4067. @param {int} length target length
  4068. @returns {int[]} padded data
  4069. """
  4070. if len(data) > length - 11:
  4071. raise ValueError('Input data too long for PKCS#1 padding')
  4072. pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
  4073. return [0, 2] + pseudo_random + [0] + data
  4074. def _base_n_table(n, table):
  4075. if not table and not n:
  4076. raise ValueError('Either table or n must be specified')
  4077. table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
  4078. if n and n != len(table):
  4079. raise ValueError(f'base {n} exceeds table length {len(table)}')
  4080. return table
  4081. def encode_base_n(num, n=None, table=None):
  4082. """Convert given int to a base-n string"""
  4083. table = _base_n_table(n, table)
  4084. if not num:
  4085. return table[0]
  4086. result, base = '', len(table)
  4087. while num:
  4088. result = table[num % base] + result
  4089. num = num // base
  4090. return result
  4091. def decode_base_n(string, n=None, table=None):
  4092. """Convert given base-n string to int"""
  4093. table = {char: index for index, char in enumerate(_base_n_table(n, table))}
  4094. result, base = 0, len(table)
  4095. for char in string:
  4096. result = result * base + table[char]
  4097. return result
  4098. def decode_packed_codes(code):
  4099. mobj = re.search(PACKED_CODES_RE, code)
  4100. obfuscated_code, base, count, symbols = mobj.groups()
  4101. base = int(base)
  4102. count = int(count)
  4103. symbols = symbols.split('|')
  4104. symbol_table = {}
  4105. while count:
  4106. count -= 1
  4107. base_n_count = encode_base_n(count, base)
  4108. symbol_table[base_n_count] = symbols[count] or base_n_count
  4109. return re.sub(
  4110. r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
  4111. obfuscated_code)
  4112. def caesar(s, alphabet, shift):
  4113. if shift == 0:
  4114. return s
  4115. l = len(alphabet)
  4116. return ''.join(
  4117. alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
  4118. for c in s)
  4119. def rot47(s):
  4120. return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
  4121. def parse_m3u8_attributes(attrib):
  4122. info = {}
  4123. for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
  4124. if val.startswith('"'):
  4125. val = val[1:-1]
  4126. info[key] = val
  4127. return info
  4128. def urshift(val, n):
  4129. return val >> n if val >= 0 else (val + 0x100000000) >> n
def write_xattr(path, key, value):
    """Store `value` under the extended attribute `key` of the file at `path`.

    Three mechanisms are tried in order:
      1. On Windows (`compat_os_name == 'nt'`), an NTFS alternate data stream.
      2. The `xattr` module object, if it is usable.
      3. The `setfattr` or `xattr` command-line tool.

    @param path     path of the file to tag
    @param key      attribute name
    @param value    attribute value as bytes (it is written in binary mode,
                    or decoded before being passed to a command-line tool)
    @raises XAttrMetadataError      if writing the attribute fails
    @raises XAttrUnavailableError   if no mechanism for writing is available
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file name from the stream name, so it cannot be part of the key
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # An older pyxattr leaves `setxattr` unset, so Method 2 is used instead
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command-line argument, so it has to be text
    value = value.decode()
    try:
        # `xattr` takes `-w key value path`; `setfattr` takes `-n key -v value path`
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
  4173. def random_birthday(year_field, month_field, day_field):
  4174. start_date = datetime.date(1950, 1, 1)
  4175. end_date = datetime.date(1995, 12, 31)
  4176. offset = random.randint(0, (end_date - start_date).days)
  4177. random_date = start_date + datetime.timedelta(offset)
  4178. return {
  4179. year_field: str(random_date.year),
  4180. month_field: str(random_date.month),
  4181. day_field: str(random_date.day),
  4182. }
  4183. def find_available_port(interface=''):
  4184. try:
  4185. with socket.socket() as sock:
  4186. sock.bind((interface, 0))
  4187. return sock.getsockname()[1]
  4188. except OSError:
  4189. return None
# Templates for internet shortcut files, which are plain text files.
# Each one is a %-format string that is filled in from a mapping.

# `.url` shortcut; expects the key `url`
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# `.webloc` shortcut (an XML property list); expects the key `url`
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `.desktop` shortcut; expects the keys `filename` and `url`
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a shortcut type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
  4218. def iri_to_uri(iri):
  4219. """
  4220. Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
  4221. The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
  4222. """
  4223. iri_parts = urllib.parse.urlparse(iri)
  4224. if '[' in iri_parts.netloc:
  4225. raise ValueError('IPv6 URIs are not, yet, supported.')
  4226. # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
  4227. # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
  4228. net_location = ''
  4229. if iri_parts.username:
  4230. net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
  4231. if iri_parts.password is not None:
  4232. net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
  4233. net_location += '@'
  4234. net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
  4235. # The 'idna' encoding produces ASCII text.
  4236. if iri_parts.port is not None and iri_parts.port != 80:
  4237. net_location += ':' + str(iri_parts.port)
  4238. return urllib.parse.urlunparse(
  4239. (iri_parts.scheme,
  4240. net_location,
  4241. urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
  4242. # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
  4243. urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
  4244. # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
  4245. urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
  4246. urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
  4247. # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
  4248. def to_high_limit_path(path):
  4249. if sys.platform in ['win32', 'cygwin']:
  4250. # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
  4251. return '\\\\?\\' + os.path.abspath(path)
  4252. return path
  4253. def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
  4254. val = traversal.traverse_obj(obj, *variadic(field))
  4255. if not val if ignore is NO_DEFAULT else val in variadic(ignore):
  4256. return default
  4257. return template % func(val)
  4258. def clean_podcast_url(url):
  4259. return re.sub(r'''(?x)
  4260. (?:
  4261. (?:
  4262. chtbl\.com/track|
  4263. media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
  4264. play\.podtrac\.com
  4265. )/[^/]+|
  4266. (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
  4267. flex\.acast\.com|
  4268. pd(?:
  4269. cn\.co| # https://podcorn.com/analytics-prefix/
  4270. st\.fm # https://podsights.com/docs/
  4271. )/e
  4272. )/''', '', url)
  4273. _HEX_TABLE = '0123456789abcdef'
  4274. def random_uuidv4():
  4275. return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
  4276. def make_dir(path, to_screen=None):
  4277. try:
  4278. dn = os.path.dirname(path)
  4279. if dn:
  4280. os.makedirs(dn, exist_ok=True)
  4281. return True
  4282. except OSError as err:
  4283. if callable(to_screen) is not None:
  4284. to_screen(f'unable to create directory {err}')
  4285. return False
  4286. def get_executable_path():
  4287. from ..update import _get_variant_and_executable_path
  4288. return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
  4289. def get_user_config_dirs(package_name):
  4290. # .config (e.g. ~/.config/package_name)
  4291. xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
  4292. yield os.path.join(xdg_config_home, package_name)
  4293. # appdata (%APPDATA%/package_name)
  4294. appdata_dir = os.getenv('appdata')
  4295. if appdata_dir:
  4296. yield os.path.join(appdata_dir, package_name)
  4297. # home (~/.package_name)
  4298. yield os.path.join(compat_expanduser('~'), f'.{package_name}')
  4299. def get_system_config_dirs(package_name):
  4300. # /etc/package_name
  4301. yield os.path.join('/etc', package_name)
  4302. def time_seconds(**kwargs):
  4303. """
  4304. Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
  4305. """
  4306. return time.time() + datetime.timedelta(**kwargs).total_seconds()
  4307. # create a JSON Web Signature (jws) with HS256 algorithm
  4308. # the resulting format is in JWS Compact Serialization
  4309. # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
  4310. # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
  4311. def jwt_encode_hs256(payload_data, key, headers={}):
  4312. header_data = {
  4313. 'alg': 'HS256',
  4314. 'typ': 'JWT',
  4315. }
  4316. if headers:
  4317. header_data.update(headers)
  4318. header_b64 = base64.b64encode(json.dumps(header_data).encode())
  4319. payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
  4320. h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
  4321. signature_b64 = base64.b64encode(h.digest())
  4322. token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
  4323. return token
  4324. # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
  4325. def jwt_decode_hs256(jwt):
  4326. header_b64, payload_b64, signature_b64 = jwt.split('.')
  4327. # add trailing ='s that may have been stripped, superfluous ='s are ignored
  4328. payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
  4329. return payload_data
  4330. WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
  4331. @functools.cache
  4332. def supports_terminal_sequences(stream):
  4333. if compat_os_name == 'nt':
  4334. if not WINDOWS_VT_MODE:
  4335. return False
  4336. elif not os.getenv('TERM'):
  4337. return False
  4338. try:
  4339. return stream.isatty()
  4340. except BaseException:
  4341. return False
def windows_enable_vt_mode():
    """Enable virtual terminal processing for the Windows console output.

    Ref: https://bugs.python.org/issue30075

    Returns without doing anything when `get_windows_version()` reports a
    version older than 10.0.10586. On success the module-level
    `WINDOWS_VT_MODE` is set to True and the cache of
    `supports_terminal_sequences` is cleared so that it gets re-evaluated.

    @raises Exception  if GetConsoleMode or SetConsoleMode reports failure
    """
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        # Convert the C runtime file descriptor into an OS handle for kernel32
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the flags that are already set and add VT processing on top
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        # The descriptor is closed whether or not the mode change succeeded
        os.close(handle)

    # Only reached when no exception was raised above
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
  4367. _terminal_sequences_re = re.compile('\033\\[[^m]+m')
  4368. def remove_terminal_sequences(string):
  4369. return _terminal_sequences_re.sub('', string)
  4370. def number_of_digits(number):
  4371. return len('%d' % number)
  4372. def join_nonempty(*values, delim='-', from_dict=None):
  4373. if from_dict is not None:
  4374. values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
  4375. return delim.join(map(str, filter(None, values)))
  4376. def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
  4377. """
  4378. Find the largest format dimensions in terms of video width and, for each thumbnail:
  4379. * Modify the URL: Match the width with the provided regex and replace with the former width
  4380. * Update dimensions
  4381. This function is useful with video services that scale the provided thumbnails on demand
  4382. """
  4383. _keys = ('width', 'height')
  4384. max_dimensions = max(
  4385. (tuple(format.get(k) or 0 for k in _keys) for format in formats),
  4386. default=(0, 0))
  4387. if not max_dimensions[0]:
  4388. return thumbnails
  4389. return [
  4390. merge_dicts(
  4391. {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
  4392. dict(zip(_keys, max_dimensions)), thumbnail)
  4393. for thumbnail in thumbnails
  4394. ]
  4395. def parse_http_range(range):
  4396. """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
  4397. if not range:
  4398. return None, None, None
  4399. crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
  4400. if not crg:
  4401. return None, None, None
  4402. return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
  4403. def read_stdin(what):
  4404. eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
  4405. write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
  4406. return sys.stdin
  4407. def determine_file_encoding(data):
  4408. """
  4409. Detect the text encoding used
  4410. @returns (encoding, bytes to skip)
  4411. """
  4412. # BOM marks are given priority over declarations
  4413. for bom, enc in BOMS:
  4414. if data.startswith(bom):
  4415. return enc, len(bom)
  4416. # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
  4417. # We ignore the endianness to get a good enough match
  4418. data = data.replace(b'\0', b'')
  4419. mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
  4420. return mobj.group(1).decode() if mobj else None, 0
  4421. class Config:
  4422. own_args = None
  4423. parsed_args = None
  4424. filename = None
  4425. __initialized = False
  4426. def __init__(self, parser, label=None):
  4427. self.parser, self.label = parser, label
  4428. self._loaded_paths, self.configs = set(), []
  4429. def init(self, args=None, filename=None):
  4430. assert not self.__initialized
  4431. self.own_args, self.filename = args, filename
  4432. return self.load_configs()
  4433. def load_configs(self):
  4434. directory = ''
  4435. if self.filename:
  4436. location = os.path.realpath(self.filename)
  4437. directory = os.path.dirname(location)
  4438. if location in self._loaded_paths:
  4439. return False
  4440. self._loaded_paths.add(location)
  4441. self.__initialized = True
  4442. opts, _ = self.parser.parse_known_args(self.own_args)
  4443. self.parsed_args = self.own_args
  4444. for location in opts.config_locations or []:
  4445. if location == '-':
  4446. if location in self._loaded_paths:
  4447. continue
  4448. self._loaded_paths.add(location)
  4449. self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
  4450. continue
  4451. location = os.path.join(directory, expand_path(location))
  4452. if os.path.isdir(location):
  4453. location = os.path.join(location, 'yt-dlp.conf')
  4454. if not os.path.exists(location):
  4455. self.parser.error(f'config location {location} does not exist')
  4456. self.append_config(self.read_file(location), location)
  4457. return True
  4458. def __str__(self):
  4459. label = join_nonempty(
  4460. self.label, 'config', f'"{self.filename}"' if self.filename else '',
  4461. delim=' ')
  4462. return join_nonempty(
  4463. self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
  4464. *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
  4465. delim='\n')
  4466. @staticmethod
  4467. def read_file(filename, default=[]):
  4468. try:
  4469. optionf = open(filename, 'rb')
  4470. except OSError:
  4471. return default # silently skip if file is not present
  4472. try:
  4473. enc, skip = determine_file_encoding(optionf.read(512))
  4474. optionf.seek(skip, io.SEEK_SET)
  4475. except OSError:
  4476. enc = None # silently skip read errors
  4477. try:
  4478. # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
  4479. contents = optionf.read().decode(enc or preferredencoding())
  4480. res = shlex.split(contents, comments=True)
  4481. except Exception as err:
  4482. raise ValueError(f'Unable to parse "{filename}": {err}')
  4483. finally:
  4484. optionf.close()
  4485. return res
  4486. @staticmethod
  4487. def hide_login_info(opts):
  4488. PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
  4489. eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
  4490. def _scrub_eq(o):
  4491. m = eqre.match(o)
  4492. if m:
  4493. return m.group('key') + '=PRIVATE'
  4494. else:
  4495. return o
  4496. opts = list(map(_scrub_eq, opts))
  4497. for idx, opt in enumerate(opts):
  4498. if opt in PRIVATE_OPTS and idx + 1 < len(opts):
  4499. opts[idx + 1] = 'PRIVATE'
  4500. return opts
  4501. def append_config(self, *args, label=None):
  4502. config = type(self)(self.parser, label)
  4503. config._loaded_paths = self._loaded_paths
  4504. if config.init(*args):
  4505. self.configs.append(config)
  4506. @property
  4507. def all_args(self):
  4508. for config in reversed(self.configs):
  4509. yield from config.all_args
  4510. yield from self.parsed_args or []
  4511. def parse_known_args(self, **kwargs):
  4512. return self.parser.parse_known_args(self.all_args, **kwargs)
  4513. def parse_args(self):
  4514. return self.parser.parse_args(self.all_args)
  4515. class WebSocketsWrapper:
  4516. """Wraps websockets module to use in non-async scopes"""
  4517. pool = None
  4518. def __init__(self, url, headers=None, connect=True):
  4519. self.loop = asyncio.new_event_loop()
  4520. # XXX: "loop" is deprecated
  4521. self.conn = websockets.connect(
  4522. url, extra_headers=headers, ping_interval=None,
  4523. close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
  4524. if connect:
  4525. self.__enter__()
  4526. atexit.register(self.__exit__, None, None, None)
  4527. def __enter__(self):
  4528. if not self.pool:
  4529. self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
  4530. return self
  4531. def send(self, *args):
  4532. self.run_with_loop(self.pool.send(*args), self.loop)
  4533. def recv(self, *args):
  4534. return self.run_with_loop(self.pool.recv(*args), self.loop)
  4535. def __exit__(self, type, value, traceback):
  4536. try:
  4537. return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
  4538. finally:
  4539. self.loop.close()
  4540. self._cancel_all_tasks(self.loop)
  4541. # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
  4542. # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
  4543. @staticmethod
  4544. def run_with_loop(main, loop):
  4545. if not asyncio.iscoroutine(main):
  4546. raise ValueError(f'a coroutine was expected, got {main!r}')
  4547. try:
  4548. return loop.run_until_complete(main)
  4549. finally:
  4550. loop.run_until_complete(loop.shutdown_asyncgens())
  4551. if hasattr(loop, 'shutdown_default_executor'):
  4552. loop.run_until_complete(loop.shutdown_default_executor())
  4553. @staticmethod
  4554. def _cancel_all_tasks(loop):
  4555. to_cancel = asyncio.all_tasks(loop)
  4556. if not to_cancel:
  4557. return
  4558. for task in to_cancel:
  4559. task.cancel()
  4560. # XXX: "loop" is removed in python 3.10+
  4561. loop.run_until_complete(
  4562. asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
  4563. for task in to_cancel:
  4564. if task.cancelled():
  4565. continue
  4566. if task.exception() is not None:
  4567. loop.call_exception_handler({
  4568. 'message': 'unhandled exception during asyncio.run() shutdown',
  4569. 'exception': task.exception(),
  4570. 'task': task,
  4571. })
  4572. def merge_headers(*dicts):
  4573. """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
  4574. return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
  4575. def cached_method(f):
  4576. """Cache a method"""
  4577. signature = inspect.signature(f)
  4578. @functools.wraps(f)
  4579. def wrapper(self, *args, **kwargs):
  4580. bound_args = signature.bind(self, *args, **kwargs)
  4581. bound_args.apply_defaults()
  4582. key = tuple(bound_args.arguments.values())[1:]
  4583. cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
  4584. if key not in cache:
  4585. cache[key] = f(self, *args, **kwargs)
  4586. return cache[key]
  4587. return wrapper
  4588. class classproperty:
  4589. """property access for class methods with optional caching"""
  4590. def __new__(cls, func=None, *args, **kwargs):
  4591. if not func:
  4592. return functools.partial(cls, *args, **kwargs)
  4593. return super().__new__(cls)
  4594. def __init__(self, func, *, cache=False):
  4595. functools.update_wrapper(self, func)
  4596. self.func = func
  4597. self._cache = {} if cache else None
  4598. def __get__(self, _, cls):
  4599. if self._cache is None:
  4600. return self.func(cls)
  4601. elif cls not in self._cache:
  4602. self._cache[cls] = self.func(cls)
  4603. return self._cache[cls]
  4604. class function_with_repr:
  4605. def __init__(self, func, repr_=None):
  4606. functools.update_wrapper(self, func)
  4607. self.func, self.__repr = func, repr_
  4608. def __call__(self, *args, **kwargs):
  4609. return self.func(*args, **kwargs)
  4610. def __repr__(self):
  4611. if self.__repr:
  4612. return self.__repr
  4613. return f'{self.func.__module__}.{self.func.__qualname__}'
  4614. class Namespace(types.SimpleNamespace):
  4615. """Immutable namespace"""
  4616. def __iter__(self):
  4617. return iter(self.__dict__.values())
  4618. @property
  4619. def items_(self):
  4620. return self.__dict__.items()
  4621. MEDIA_EXTENSIONS = Namespace(
  4622. common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
  4623. video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
  4624. common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
  4625. audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
  4626. thumbnails=('jpg', 'png', 'webp'),
  4627. storyboards=('mhtml', ),
  4628. subtitles=('srt', 'vtt', 'ass', 'lrc'),
  4629. manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
  4630. )
  4631. MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
  4632. MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
  4633. KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
  4634. class RetryManager:
  4635. """Usage:
  4636. for retry in RetryManager(...):
  4637. try:
  4638. ...
  4639. except SomeException as err:
  4640. retry.error = err
  4641. continue
  4642. """
  4643. attempt, _error = 0, None
  4644. def __init__(self, _retries, _error_callback, **kwargs):
  4645. self.retries = _retries or 0
  4646. self.error_callback = functools.partial(_error_callback, **kwargs)
  4647. def _should_retry(self):
  4648. return self._error is not NO_DEFAULT and self.attempt <= self.retries
  4649. @property
  4650. def error(self):
  4651. if self._error is NO_DEFAULT:
  4652. return None
  4653. return self._error
  4654. @error.setter
  4655. def error(self, value):
  4656. self._error = value
  4657. def __iter__(self):
  4658. while self._should_retry():
  4659. self.error = NO_DEFAULT
  4660. self.attempt += 1
  4661. yield self
  4662. if self.error:
  4663. self.error_callback(self.error, self.attempt, self.retries)
  4664. @staticmethod
  4665. def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
  4666. """Utility function for reporting retries"""
  4667. if count > retries:
  4668. if error:
  4669. return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
  4670. raise e
  4671. if not count:
  4672. return warn(e)
  4673. elif isinstance(e, ExtractorError):
  4674. e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
  4675. warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
  4676. delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
  4677. if delay:
  4678. info(f'Sleeping {delay:.2f} seconds ...')
  4679. time.sleep(delay)
  4680. def make_archive_id(ie, video_id):
  4681. ie_key = ie if isinstance(ie, str) else ie.ie_key()
  4682. return f'{ie_key.lower()} {video_id}'
  4683. def truncate_string(s, left, right=0):
  4684. assert left > 3 and right >= 0
  4685. if s is None or len(s) <= left + right:
  4686. return s
  4687. return f'{s[:left-3]}...{s[-right:] if right else ""}'
  4688. def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
  4689. assert 'all' in alias_dict, '"all" alias is required'
  4690. requested = list(start or [])
  4691. for val in options:
  4692. discard = val.startswith('-')
  4693. if discard:
  4694. val = val[1:]
  4695. if val in alias_dict:
  4696. val = alias_dict[val] if not discard else [
  4697. i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
  4698. # NB: Do not allow regex in aliases for performance
  4699. requested = orderedSet_from_options(val, alias_dict, start=requested)
  4700. continue
  4701. current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
  4702. else [val] if val in alias_dict['all'] else None)
  4703. if current is None:
  4704. raise ValueError(val)
  4705. if discard:
  4706. for item in current:
  4707. while item in requested:
  4708. requested.remove(item)
  4709. else:
  4710. requested.extend(current)
  4711. return orderedSet(requested)
  4712. class FormatSorter:
  4713. regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
  4714. default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
  4715. 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
  4716. 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
  4717. ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
  4718. 'height', 'width', 'proto', 'vext', 'abr', 'aext',
  4719. 'fps', 'fs_approx', 'source', 'id')
  4720. settings = {
  4721. 'vcodec': {'type': 'ordered', 'regex': True,
  4722. 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
  4723. 'acodec': {'type': 'ordered', 'regex': True,
  4724. 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
  4725. 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
  4726. 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
  4727. 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
  4728. 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
  4729. 'vext': {'type': 'ordered', 'field': 'video_ext',
  4730. 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
  4731. 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
  4732. 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
  4733. 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
  4734. 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
  4735. 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
  4736. 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
  4737. 'field': ('vcodec', 'acodec'),
  4738. 'function': lambda it: int(any(v != 'none' for v in it))},
  4739. 'ie_pref': {'priority': True, 'type': 'extractor'},
  4740. 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
  4741. 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
  4742. 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
  4743. 'quality': {'convert': 'float', 'default': -1},
  4744. 'filesize': {'convert': 'bytes'},
  4745. 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
  4746. 'id': {'convert': 'string', 'field': 'format_id'},
  4747. 'height': {'convert': 'float_none'},
  4748. 'width': {'convert': 'float_none'},
  4749. 'fps': {'convert': 'float_none'},
  4750. 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
  4751. 'tbr': {'convert': 'float_none'},
  4752. 'vbr': {'convert': 'float_none'},
  4753. 'abr': {'convert': 'float_none'},
  4754. 'asr': {'convert': 'float_none'},
  4755. 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
  4756. 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
  4757. 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
  4758. 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
  4759. 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
  4760. 'res': {'type': 'multiple', 'field': ('height', 'width'),
  4761. 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
  4762. # Actual field names
  4763. 'format_id': {'type': 'alias', 'field': 'id'},
  4764. 'preference': {'type': 'alias', 'field': 'ie_pref'},
  4765. 'language_preference': {'type': 'alias', 'field': 'lang'},
  4766. 'source_preference': {'type': 'alias', 'field': 'source'},
  4767. 'protocol': {'type': 'alias', 'field': 'proto'},
  4768. 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
  4769. 'audio_channels': {'type': 'alias', 'field': 'channels'},
  4770. # Deprecated
  4771. 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
  4772. 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
  4773. 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
  4774. 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
  4775. 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
  4776. 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
  4777. 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
  4778. 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
  4779. 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
  4780. 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
  4781. 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
  4782. 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
  4783. 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
  4784. 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
  4785. 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  4786. 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
  4787. 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  4788. 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
  4789. 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  4790. 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
  4791. }
  4792. def __init__(self, ydl, field_preference):
  4793. self.ydl = ydl
  4794. self._order = []
  4795. self.evaluate_params(self.ydl.params, field_preference)
  4796. if ydl.params.get('verbose'):
  4797. self.print_verbose_info(self.ydl.write_debug)
  4798. def _get_field_setting(self, field, key):
  4799. if field not in self.settings:
  4800. if key in ('forced', 'priority'):
  4801. return False
  4802. self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
  4803. 'deprecated and may be removed in a future version')
  4804. self.settings[field] = {}
  4805. propObj = self.settings[field]
  4806. if key not in propObj:
  4807. type = propObj.get('type')
  4808. if key == 'field':
  4809. default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
  4810. elif key == 'convert':
  4811. default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
  4812. else:
  4813. default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
  4814. propObj[key] = default
  4815. return propObj[key]
  4816. def _resolve_field_value(self, field, value, convertNone=False):
  4817. if value is None:
  4818. if not convertNone:
  4819. return None
  4820. else:
  4821. value = value.lower()
  4822. conversion = self._get_field_setting(field, 'convert')
  4823. if conversion == 'ignore':
  4824. return None
  4825. if conversion == 'string':
  4826. return value
  4827. elif conversion == 'float_none':
  4828. return float_or_none(value)
  4829. elif conversion == 'bytes':
  4830. return parse_bytes(value)
  4831. elif conversion == 'order':
  4832. order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
  4833. use_regex = self._get_field_setting(field, 'regex')
  4834. list_length = len(order_list)
  4835. empty_pos = order_list.index('') if '' in order_list else list_length + 1
  4836. if use_regex and value is not None:
  4837. for i, regex in enumerate(order_list):
  4838. if regex and re.match(regex, value):
  4839. return list_length - i
  4840. return list_length - empty_pos # not in list
  4841. else: # not regex or value = None
  4842. return list_length - (order_list.index(value) if value in order_list else empty_pos)
  4843. else:
  4844. if value.isnumeric():
  4845. return float(value)
  4846. else:
  4847. self.settings[field]['convert'] = 'string'
  4848. return value
  4849. def evaluate_params(self, params, sort_extractor):
  4850. self._use_free_order = params.get('prefer_free_formats', False)
  4851. self._sort_user = params.get('format_sort', [])
  4852. self._sort_extractor = sort_extractor
  4853. def add_item(field, reverse, closest, limit_text):
  4854. field = field.lower()
  4855. if field in self._order:
  4856. return
  4857. self._order.append(field)
  4858. limit = self._resolve_field_value(field, limit_text)
  4859. data = {
  4860. 'reverse': reverse,
  4861. 'closest': False if limit is None else closest,
  4862. 'limit_text': limit_text,
  4863. 'limit': limit}
  4864. if field in self.settings:
  4865. self.settings[field].update(data)
  4866. else:
  4867. self.settings[field] = data
  4868. sort_list = (
  4869. tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
  4870. + (tuple() if params.get('format_sort_force', False)
  4871. else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
  4872. + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
  4873. for item in sort_list:
  4874. match = re.match(self.regex, item)
  4875. if match is None:
  4876. raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
  4877. field = match.group('field')
  4878. if field is None:
  4879. continue
  4880. if self._get_field_setting(field, 'type') == 'alias':
  4881. alias, field = field, self._get_field_setting(field, 'field')
  4882. if self._get_field_setting(alias, 'deprecated'):
  4883. self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
  4884. f'be removed in a future version. Please use {field} instead')
  4885. reverse = match.group('reverse') is not None
  4886. closest = match.group('separator') == '~'
  4887. limit_text = match.group('limit')
  4888. has_limit = limit_text is not None
  4889. has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
  4890. has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
  4891. fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
  4892. limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
  4893. limit_count = len(limits)
  4894. for (i, f) in enumerate(fields):
  4895. add_item(f, reverse, closest,
  4896. limits[i] if i < limit_count
  4897. else limits[0] if has_limit and not has_multiple_limits
  4898. else None)
  4899. def print_verbose_info(self, write_debug):
  4900. if self._sort_user:
  4901. write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
  4902. if self._sort_extractor:
  4903. write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
  4904. write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
  4905. '+' if self._get_field_setting(field, 'reverse') else '', field,
  4906. '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
  4907. self._get_field_setting(field, 'limit_text'),
  4908. self._get_field_setting(field, 'limit'))
  4909. if self._get_field_setting(field, 'limit_text') is not None else '')
  4910. for field in self._order if self._get_field_setting(field, 'visible')]))
  4911. def _calculate_field_preference_from_value(self, format, field, type, value):
  4912. reverse = self._get_field_setting(field, 'reverse')
  4913. closest = self._get_field_setting(field, 'closest')
  4914. limit = self._get_field_setting(field, 'limit')
  4915. if type == 'extractor':
  4916. maximum = self._get_field_setting(field, 'max')
  4917. if value is None or (maximum is not None and value >= maximum):
  4918. value = -1
  4919. elif type == 'boolean':
  4920. in_list = self._get_field_setting(field, 'in_list')
  4921. not_in_list = self._get_field_setting(field, 'not_in_list')
  4922. value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
  4923. elif type == 'ordered':
  4924. value = self._resolve_field_value(field, value, True)
  4925. # try to convert to number
  4926. val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
  4927. is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
  4928. if is_num:
  4929. value = val_num
  4930. return ((-10, 0) if value is None
  4931. else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
  4932. else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
  4933. else (0, value, 0) if not reverse and (limit is None or value <= limit)
  4934. else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
  4935. else (-1, value, 0))
  4936. def _calculate_field_preference(self, format, field):
  4937. type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
  4938. get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
  4939. if type == 'multiple':
  4940. type = 'field' # Only 'field' is allowed in multiple for now
  4941. actual_fields = self._get_field_setting(field, 'field')
  4942. value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
  4943. else:
  4944. value = get_value(field)
  4945. return self._calculate_field_preference_from_value(format, field, type, value)
  4946. def calculate_preference(self, format):
  4947. # Determine missing protocol
  4948. if not format.get('protocol'):
  4949. format['protocol'] = determine_protocol(format)
  4950. # Determine missing ext
  4951. if not format.get('ext') and 'url' in format:
  4952. format['ext'] = determine_ext(format['url'])
  4953. if format.get('vcodec') == 'none':
  4954. format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
  4955. format['video_ext'] = 'none'
  4956. else:
  4957. format['video_ext'] = format['ext']
  4958. format['audio_ext'] = 'none'
  4959. # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
  4960. # format['preference'] = -1000
  4961. if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
  4962. # HEVC-over-FLV is out-of-spec by FLV's original spec
  4963. # ref. https://trac.ffmpeg.org/ticket/6389
  4964. # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
  4965. format['preference'] = -100
  4966. # Determine missing bitrates
  4967. if format.get('tbr') is None:
  4968. if format.get('vbr') is not None and format.get('abr') is not None:
  4969. format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
  4970. else:
  4971. if format.get('vcodec') != 'none' and format.get('vbr') is None:
  4972. format['vbr'] = format.get('tbr') - format.get('abr', 0)
  4973. if format.get('acodec') != 'none' and format.get('abr') is None:
  4974. format['abr'] = format.get('tbr') - format.get('vbr', 0)
  4975. return tuple(self._calculate_field_preference(format, field) for field in self._order)