unicodeobject.c 457 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
26221262312624126251262612627126281262912630126311263212633126341263512636126371263812639126401264112642126431264412645126461264712648126491265012651126521265312654126551265612657126581265912660126611266212663126641266512666126671266812669126701267112672126731267412675126761267712678126791268012681126821268312684126851268612687126881268912690126911269212693126941269512696126971269812699127001270112702127031270412705127061270712708127091271012711127121271312714127151271612717127181271912720127211272212723127241272512726127271272812729127301273112732127331273412735127361273712738127391274012741127421274312744127451274612747127481274912750127511275212753127541275512756127571275812759127601276112762127631276412765127661276712768127691277012771127721277312774127751277612777127781277912780127811278212783127841278512786127871278812789127901279112792127931279412795127961279712798127991280012801128021280312804128051280612807128081280912810128111281212813128141281512816128171281812819128201282112822128231282412825128261282712828128291283012831128321283312834128351283612837128381283912840128411284212843128441284512846128471284812849128501285112852128531285412855128561285712858128591286012861128621286312864128651286612867128681286912870128711287212873128741287512876128771287812879128801288112882128831288412885128861288712888128891289012891128921289312894128951289612897128981289912900129011290212903129041290512906129071290812909129101291112912129131291412915129161291712918129191292012921129221292312924129251292612927129281292912930129311293212933129341293512936129371293812939129401294112942129431294412945129461294712948129491295012951129521295312954129551295612957129581295912960129611296212963129641296512966129671296812969129701297112972129731297412975129761297712978129791298012981129821298312984129851298612987129881298912990129911299212993129941299512996129971299812999130001300113002130031300413005130061300713008130091301013011130121301313014130151301613017130181301913020130211
30221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211
34221342313424134251342613427134281342913430134311343213433134341343513436134371343813439134401344113442134431344413445134461344713448134491345013451134521345313454134551345613457134581345913460134611346213463134641346513466134671346813469134701347113472134731347413475134761347713478134791348013481134821348313484134851348613487134881348913490134911349213493134941349513496134971349813499135001350113502135031350413505135061350713508135091351013511135121351313514135151351613517135181351913520135211352213523135241352513526135271352813529135301353113532135331353413535135361353713538135391354013541135421354313544135451354613547135481354913550135511355213553135541355513556135571355813559135601356113562135631356413565135661356713568135691357013571135721357313574135751357613577135781357913580135811358213583135841358513586135871358813589135901359113592135931359413595135961359713598135991360013601136021360313604136051360613607136081360913610136111361213613136141361513616136171361813619136201362113622136231362413625136261362713628136291363013631136321363313634136351363613637136381363913640136411364213643136441364513646136471364813649136501365113652136531365413655136561365713658136591366013661136621366313664136651366613667136681366913670136711367213673136741367513676136771367813679136801368113682136831368413685136861368713688136891369013691136921369313694136951369613697136981369913700137011370213703137041370513706137071370813709137101371113712137131371413715137161371713718137191372013721137221372313724137251372613727137281372913730137311373213733137341373513736137371373813739137401374113742137431374413745137461374713748137491375013751137521375313754137551375613757137581375913760137611376213763137641376513766137671376813769137701377113772137731377413775137761377713778137791378013781137821378313784137851378613787137881378913790137911379213793137941379513796137971379813799138001380113802138031380413805138061380713808138091381013811138121381313814138151381613817138181381913820138211
38221382313824138251382613827138281382913830138311383213833138341383513836138371383813839138401384113842138431384413845138461384713848138491385013851138521385313854138551385613857138581385913860138611386213863138641386513866138671386813869138701387113872138731387413875138761387713878138791388013881138821388313884138851388613887138881388913890138911389213893138941389513896138971389813899139001390113902139031390413905139061390713908139091391013911139121391313914139151391613917139181391913920139211392213923139241392513926139271392813929139301393113932139331393413935139361393713938139391394013941139421394313944139451394613947139481394913950139511395213953139541395513956139571395813959139601396113962139631396413965139661396713968139691397013971139721397313974139751397613977139781397913980139811398213983139841398513986139871398813989139901399113992139931399413995139961399713998139991400014001140021400314004140051400614007140081400914010140111401214013140141401514016140171401814019140201402114022140231402414025140261402714028140291403014031140321403314034140351403614037140381403914040140411404214043140441404514046140471404814049140501405114052140531405414055140561405714058140591406014061140621406314064140651406614067140681406914070140711407214073140741407514076140771407814079140801408114082140831408414085140861408714088140891409014091140921409314094140951409614097140981409914100141011410214103141041410514106141071410814109141101411114112141131411414115141161411714118141191412014121141221412314124141251412614127141281412914130141311413214133141341413514136141371413814139141401414114142141431414414145141461414714148141491415014151141521415314154141551415614157141581415914160141611416214163141641416514166141671416814169141701417114172141731417414175141761417714178141791418014181141821418314184141851418614187141881418914190141911419214193141941419514196141971419814199142001420114202142031420414205142061420714208142091421014211142121421314214142151421614217142181421914220142211
42221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211
46221462314624146251462614627146281462914630146311463214633146341463514636146371463814639146401464114642146431464414645146461464714648146491465014651146521465314654146551465614657146581465914660146611466214663146641466514666146671466814669146701467114672146731467414675146761467714678146791468014681146821468314684146851468614687146881468914690146911469214693146941469514696146971469814699147001470114702147031470414705147061470714708147091471014711147121471314714147151471614717147181471914720147211472214723147241472514726147271472814729147301473114732147331473414735147361473714738147391474014741147421474314744147451474614747147481474914750147511475214753147541475514756147571475814759147601476114762147631476414765147661476714768147691477014771147721477314774147751477614777147781477914780147811478214783147841478514786147871478814789147901479114792147931479414795147961479714798147991480014801148021480314804148051480614807148081480914810148111481214813148141481514816148171481814819148201482114822148231482414825148261482714828148291483014831148321483314834148351483614837148381483914840148411484214843148441484514846148471484814849148501485114852148531485414855148561485714858148591486014861148621486314864148651486614867148681486914870148711487214873148741487514876148771487814879148801488114882148831488414885148861488714888148891489014891148921489314894148951489614897148981489914900149011490214903149041490514906149071490814909149101491114912149131491414915149161491714918149191492014921149221492314924149251492614927149281492914930149311493214933149341493514936149371493814939149401494114942149431494414945149461494714948149491495014951149521495314954149551495614957149581495914960149611496214963149641496514966149671496814969149701497114972149731497414975149761497714978149791498014981149821498314984149851498614987149881498914990149911499214993149941499514996149971499814999150001500115002150031500415005150061500715008150091501015011150121501315014150151501615017150181501915020150211
50221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211
54221542315424154251542615427154281542915430154311543215433154341543515436154371543815439154401544115442154431544415445154461544715448154491545015451154521545315454154551545615457154581545915460154611546215463154641546515466154671546815469154701547115472154731547415475154761547715478154791548015481154821548315484154851548615487154881548915490154911549215493154941549515496154971549815499155001550115502155031550415505155061550715508155091551015511155121551315514155151551615517155181551915520155211552215523155241552515526155271552815529155301553115532155331553415535155361553715538155391554015541155421554315544155451554615547155481554915550155511555215553155541555515556155571555815559155601556115562155631556415565155661556715568155691557015571155721557315574155751557615577155781557915580155811558215583155841558515586155871558815589155901559115592155931559415595155961559715598155991560015601156021560315604156051560615607156081560915610156111561215613156141561515616156171561815619156201562115622156231562415625156261562715628156291563015631156321563315634156351563615637156381563915640156411564215643156441564515646156471564815649156501565115652156531565415655156561565715658156591566015661156621566315664156651566615667156681566915670156711567215673156741567515676156771567815679156801568115682156831568415685156861568715688156891569015691156921569315694156951569615697156981569915700157011570215703157041570515706157071570815709157101571115712157131571415715157161571715718157191572015721157221572315724157251572615727157281572915730157311573215733157341573515736157371573815739157401574115742157431574415745157461574715748157491575015751157521575315754157551575615757157581575915760157611576215763
  1. /*
  2. Unicode implementation based on original code by Fredrik Lundh,
  3. modified by Marc-Andre Lemburg <mal@lemburg.com>.
  4. Major speed upgrades to the method implementations at the Reykjavik
  5. NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
  6. Copyright (c) Corporation for National Research Initiatives.
  7. --------------------------------------------------------------------
  8. The original string type implementation is:
  9. Copyright (c) 1999 by Secret Labs AB
  10. Copyright (c) 1999 by Fredrik Lundh
  11. By obtaining, using, and/or copying this software and/or its
  12. associated documentation, you agree that you have read, understood,
  13. and will comply with the following terms and conditions:
  14. Permission to use, copy, modify, and distribute this software and its
  15. associated documentation for any purpose and without fee is hereby
  16. granted, provided that the above copyright notice appears in all
  17. copies, and that both that copyright notice and this permission notice
  18. appear in supporting documentation, and that the name of Secret Labs
  19. AB or the author not be used in advertising or publicity pertaining to
  20. distribution of the software without specific, written prior
  21. permission.
  22. SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  23. THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  24. FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  25. ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  26. WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  27. ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  28. OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  29. --------------------------------------------------------------------
  30. */
  31. #define PY_SSIZE_T_CLEAN
  32. #include "Python.h"
  33. #include "pycore_abstract.h" // _PyIndex_Check()
  34. #include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
  35. #include "pycore_bytesobject.h" // _PyBytes_Repeat()
  36. #include "pycore_bytes_methods.h" // _Py_bytes_lower()
  37. #include "pycore_format.h" // F_LJUST
  38. #include "pycore_initconfig.h" // _PyStatus_OK()
  39. #include "pycore_interp.h" // PyInterpreterState.fs_codec
  40. #include "pycore_long.h" // _PyLong_FormatWriter()
  41. #include "pycore_object.h" // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
  42. #include "pycore_pathconfig.h" // _Py_DumpPathConfig()
  43. #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
  44. #include "pycore_pystate.h" // _PyInterpreterState_GET()
  45. #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
  46. #include "pycore_unicodeobject.h" // struct _Py_unicode_state
  47. #include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()
  48. #include "stringlib/eq.h" // unicode_eq()
  49. #include <stddef.h> // ptrdiff_t
  50. #ifdef MS_WINDOWS
  51. #include <windows.h>
  52. #endif
  53. #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
  54. # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
  55. #endif
  56. /* Uncomment to display statistics on interned strings at exit
  57. in _PyUnicode_ClearInterned(). */
  58. /* #define INTERNED_STATS 1 */
  59. /*[clinic input]
  60. class str "PyObject *" "&PyUnicode_Type"
  61. [clinic start generated code]*/
  62. /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
  63. /*[python input]
  64. class Py_UCS4_converter(CConverter):
  65. type = 'Py_UCS4'
  66. converter = 'convert_uc'
  67. def converter_init(self):
  68. if self.default is not unspecified:
  69. self.c_default = ascii(self.default)
  70. if len(self.c_default) > 4 or self.c_default[0] != "'":
  71. self.c_default = hex(ord(self.default))
  72. [python start generated code]*/
  73. /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
  74. /* --- Globals ------------------------------------------------------------
  75. NOTE: In the interpreter's initialization phase, some globals are currently
  76. initialized dynamically as needed. In the process Unicode objects may
  77. be created before the Unicode type is ready.
  78. */
  79. #ifdef __cplusplus
  80. extern "C" {
  81. #endif
  /* Highest valid Unicode code point, U+10FFFF (1,114,111 in decimal).
     fileutils.c has to use the same value. */
  #define MAX_UNICODE 0x10ffff
  /* _PyUnicode_CHECK(op): validation used inside assert() by the accessor
     macros of this file.  Debug builds call _PyUnicode_CheckConsistency()
     with check_content=0; other builds only test the type. */
  #ifndef Py_DEBUG
  #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
  #else
  #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
  #endif
  /* Unchecked read of the `utf8` member, viewing op as a
     PyCompactUnicodeObject. */
  #define _PyUnicode_UTF8(op) (_PyCompactUnicodeObject_CAST(op)->utf8)

  /* UTF-8 buffer of op.  For a compact ASCII string this is the memory that
     directly follows the PyASCIIObject header; otherwise it is the `utf8`
     member. */
  #define PyUnicode_UTF8(op)                                \
      (assert(_PyUnicode_CHECK(op)),                        \
       PyUnicode_IS_COMPACT_ASCII(op) ?                     \
           ((char*)(_PyASCIIObject_CAST(op) + 1)) :         \
           _PyUnicode_UTF8(op))

  /* Unchecked read of the `utf8_length` member, viewing op as a
     PyCompactUnicodeObject. */
  #define _PyUnicode_UTF8_LENGTH(op) \
      (_PyCompactUnicodeObject_CAST(op)->utf8_length)

  /* Length of the UTF-8 buffer of op.  For a compact ASCII string this is the
     `length` member of the PyASCIIObject header; otherwise it is the
     `utf8_length` member. */
  #define PyUnicode_UTF8_LENGTH(op)                         \
      (assert(_PyUnicode_CHECK(op)),                        \
       PyUnicode_IS_COMPACT_ASCII(op) ?                     \
           _PyASCIIObject_CAST(op)->length :                \
           _PyUnicode_UTF8_LENGTH(op))
  /* Unchecked accessors for members of the PyASCIIObject header. */
  #define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
  #define _PyUnicode_STATE(op)  (_PyASCIIObject_CAST(op)->state)
  #define _PyUnicode_HASH(op)   (_PyASCIIObject_CAST(op)->hash)

  /* Accessors that first validate op with _PyUnicode_CHECK() (inside an
     assert, so the validation disappears when assertions are disabled). */
  #define _PyUnicode_KIND(op)                     \
      (assert(_PyUnicode_CHECK(op)),              \
       _PyASCIIObject_CAST(op)->state.kind)
  #define _PyUnicode_GET_LENGTH(op)               \
      (assert(_PyUnicode_CHECK(op)),              \
       _PyASCIIObject_CAST(op)->length)
  /* Unchecked read of the `data.any` member, viewing op as a
     PyUnicodeObject. */
  #define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)

  /* True if the UTF-8 pointer of op is the very same pointer as its character
     data.  Must not be used on compact ASCII strings (asserted). */
  #define _PyUnicode_SHARE_UTF8(op)                       \
      (assert(_PyUnicode_CHECK(op)),                      \
       assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
       (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

  /* True if the Unicode object has an allocated UTF-8 memory block
     (not shared with other data): op is not compact ASCII, its UTF-8 pointer
     is non-NULL, and that pointer differs from the character data. */
  #define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
      ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
        && _PyUnicode_UTF8(op)                            \
        && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
  /* Generic helper macro copying characters from one character width to
     another.

     from_type and to_type must be valid type names.  begin and end delimit the
     source characters and should be of type "from_type *".  to points to the
     destination buffer and should be of type "to_type *".  Each source
     character is cast to to_type.

     The bulk of the input is copied four characters per iteration; the
     remaining (at most three) characters are copied one by one. */
  #define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
      do {                                                             \
          to_type *_dst = (to_type *)(to);                             \
          const from_type *_src = (const from_type *)(begin);          \
          const from_type *_src_end = (const from_type *)(end);        \
          Py_ssize_t _count = (_src_end) - (_src);                     \
          const from_type *_src_end4 =                                 \
              _src + _Py_SIZE_ROUND_DOWN(_count, 4);                   \
          for (; _src < (_src_end4); _src += 4, _dst += 4) {           \
              _dst[0] = (to_type) _src[0];                             \
              _dst[1] = (to_type) _src[1];                             \
              _dst[2] = (to_type) _src[2];                             \
              _dst[3] = (to_type) _src[3];                             \
          }                                                            \
          for (; _src < (_src_end); _src++, _dst++) {                  \
              *_dst = (to_type) *_src;                                 \
          }                                                            \
      } while (0)
  /* Short local alias for _Py_LATIN1_CHR. */
  #define LATIN1 _Py_LATIN1_CHR

  /* Over-allocation factor: 2 stands for over-allocating by 50%,
     4 for over-allocating by 25%. */
  #if defined(MS_WINDOWS)
     /* On Windows, over-allocating by 50% is the best factor */
  #  define OVERALLOCATE_FACTOR 2
  #else
     /* On Linux, over-allocating by 25% is the best factor */
  #  define OVERALLOCATE_FACTOR 4
  #endif
  159. /* Forward declaration */
  160. static inline int
  161. _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
  162. static inline void
  163. _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
  164. static PyObject *
  165. unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
  166. const char *errors);
  167. static PyObject *
  168. unicode_decode_utf8(const char *s, Py_ssize_t size,
  169. _Py_error_handler error_handler, const char *errors,
  170. Py_ssize_t *consumed);
  171. #ifdef Py_DEBUG
  172. static inline int unicode_is_finalizing(void);
  173. static int unicode_is_singleton(PyObject *unicode);
  174. #endif
  175. // Return a borrowed reference to the empty string singleton.
  176. static inline PyObject* unicode_get_empty(void)
  177. {
  178. _Py_DECLARE_STR(empty, "");
  179. return &_Py_STR(empty);
  180. }
  181. // Return a strong reference to the empty string singleton.
  182. static inline PyObject* unicode_new_empty(void)
  183. {
  184. PyObject *empty = unicode_get_empty();
  185. return Py_NewRef(empty);
  186. }
  187. /* This dictionary holds per-interpreter interned strings.
  188. * See InternalDocs/string_interning.md for details.
  189. */
  190. static inline PyObject *get_interned_dict(PyInterpreterState *interp)
  191. {
  192. return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
  193. }
  /* Runtime-wide hashtable holding the statically allocated interned strings.
   * See InternalDocs/string_interning.md for details.
   */
  #define INTERNED_STRINGS (_PyRuntime.cached_objects.interned_strings)
  198. /* Get number of all interned strings for the current interpreter. */
  199. Py_ssize_t
  200. _PyUnicode_InternedSize(void)
  201. {
  202. PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
  203. return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
  204. }
  205. /* Get number of immortal interned strings for the current interpreter. */
  206. Py_ssize_t
  207. _PyUnicode_InternedSize_Immortal(void)
  208. {
  209. PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
  210. PyObject *key, *value;
  211. Py_ssize_t pos = 0;
  212. Py_ssize_t count = 0;
  213. // It's tempting to keep a count and avoid a loop here. But, this function
  214. // is intended for refleak tests. It spends extra work to report the true
  215. // value, to help detect bugs in optimizations.
  216. while (PyDict_Next(dict, &pos, &key, &value)) {
  217. assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
  218. if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
  219. count++;
  220. }
  221. }
  222. return _Py_hashtable_len(INTERNED_STRINGS) + count;
  223. }
  224. static Py_hash_t unicode_hash(PyObject *);
  225. static int unicode_compare_eq(PyObject *, PyObject *);
  226. static Py_uhash_t
  227. hashtable_unicode_hash(const void *key)
  228. {
  229. return unicode_hash((PyObject *)key);
  230. }
  231. static int
  232. hashtable_unicode_compare(const void *key1, const void *key2)
  233. {
  234. PyObject *obj1 = (PyObject *)key1;
  235. PyObject *obj2 = (PyObject *)key2;
  236. if (obj1 != NULL && obj2 != NULL) {
  237. return unicode_compare_eq(obj1, obj2);
  238. }
  239. else {
  240. return obj1 == obj2;
  241. }
  242. }
  243. /* Return true if this interpreter should share the main interpreter's
  244. intern_dict. That's important for interpreters which load basic
  245. single-phase init extension modules (m_size == -1). There could be interned
  246. immortal strings that are shared between interpreters, due to the
  247. PyDict_Update(mdict, m_copy) call in import_find_extension().
  248. It's not safe to deallocate those strings until all interpreters that
  249. potentially use them are freed. By storing them in the main interpreter, we
  250. ensure they get freed after all other interpreters are freed.
  251. */
  252. static bool
  253. has_shared_intern_dict(PyInterpreterState *interp)
  254. {
  255. PyInterpreterState *main_interp = _PyInterpreterState_Main();
  256. return interp != main_interp && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
  257. }
  258. static int
  259. init_interned_dict(PyInterpreterState *interp)
  260. {
  261. assert(get_interned_dict(interp) == NULL);
  262. PyObject *interned;
  263. if (has_shared_intern_dict(interp)) {
  264. interned = get_interned_dict(_PyInterpreterState_Main());
  265. Py_INCREF(interned);
  266. }
  267. else {
  268. interned = PyDict_New();
  269. if (interned == NULL) {
  270. return -1;
  271. }
  272. }
  273. _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
  274. return 0;
  275. }
  276. static void
  277. clear_interned_dict(PyInterpreterState *interp)
  278. {
  279. PyObject *interned = get_interned_dict(interp);
  280. if (interned != NULL) {
  281. if (!has_shared_intern_dict(interp)) {
  282. // only clear if the dict belongs to this interpreter
  283. PyDict_Clear(interned);
  284. }
  285. Py_DECREF(interned);
  286. _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
  287. }
  288. }
  289. static PyStatus
  290. init_global_interned_strings(PyInterpreterState *interp)
  291. {
  292. assert(INTERNED_STRINGS == NULL);
  293. _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
  294. INTERNED_STRINGS = _Py_hashtable_new_full(
  295. hashtable_unicode_hash,
  296. hashtable_unicode_compare,
  297. // Objects stored here are immortal and statically allocated,
  298. // so we don't need key_destroy_func & value_destroy_func:
  299. NULL,
  300. NULL,
  301. &hashtable_alloc
  302. );
  303. if (INTERNED_STRINGS == NULL) {
  304. PyErr_Clear();
  305. return _PyStatus_ERR("failed to create global interned dict");
  306. }
  307. /* Intern statically allocated string identifiers, deepfreeze strings,
  308. * and one-byte latin-1 strings.
  309. * This must be done before any module initialization so that statically
  310. * allocated string identifiers are used instead of heap allocated strings.
  311. * Deepfreeze uses the interned identifiers if present to save space
  312. * else generates them and they are interned to speed up dict lookups.
  313. */
  314. _PyUnicode_InitStaticStrings(interp);
  315. for (int i = 0; i < 256; i++) {
  316. PyObject *s = LATIN1(i);
  317. _PyUnicode_InternStatic(interp, &s);
  318. assert(s == LATIN1(i));
  319. }
  320. #ifdef Py_DEBUG
  321. assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
  322. for (int i = 0; i < 256; i++) {
  323. assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
  324. }
  325. #endif
  326. return _PyStatus_OK();
  327. }
  328. static void clear_global_interned_strings(void)
  329. {
  330. if (INTERNED_STRINGS != NULL) {
  331. _Py_hashtable_destroy(INTERNED_STRINGS);
  332. INTERNED_STRINGS = NULL;
  333. }
  334. }
  /* Return a strong reference to the empty string singleton from the
     enclosing function. */
  #define _Py_RETURN_UNICODE_EMPTY()      \
      do {                                \
          return unicode_new_empty();     \
      } while (0)
  339. static inline void
  340. unicode_fill(int kind, void *data, Py_UCS4 value,
  341. Py_ssize_t start, Py_ssize_t length)
  342. {
  343. assert(0 <= start);
  344. switch (kind) {
  345. case PyUnicode_1BYTE_KIND: {
  346. assert(value <= 0xff);
  347. Py_UCS1 ch = (unsigned char)value;
  348. Py_UCS1 *to = (Py_UCS1 *)data + start;
  349. memset(to, ch, length);
  350. break;
  351. }
  352. case PyUnicode_2BYTE_KIND: {
  353. assert(value <= 0xffff);
  354. Py_UCS2 ch = (Py_UCS2)value;
  355. Py_UCS2 *to = (Py_UCS2 *)data + start;
  356. const Py_UCS2 *end = to + length;
  357. for (; to < end; ++to) *to = ch;
  358. break;
  359. }
  360. case PyUnicode_4BYTE_KIND: {
  361. assert(value <= MAX_UNICODE);
  362. Py_UCS4 ch = value;
  363. Py_UCS4 * to = (Py_UCS4 *)data + start;
  364. const Py_UCS4 *end = to + length;
  365. for (; to < end; ++to) *to = ch;
  366. break;
  367. }
  368. default: Py_UNREACHABLE();
  369. }
  370. }
  /* Fast detection of the most frequent whitespace characters.
     The table has one entry per code point 0x00..0x7F (16 entries per row
     below); an entry is 1 if the character is whitespace and 0 otherwise. */
  const unsigned char _Py_ascii_whitespace[] = {
      /* 0x00..0x0F -- 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
         0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
      /* 0x10..0x1F -- 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
         0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
      /* 0x20..0x2F -- 0x20 SPACE */
      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x30..0x7F -- no whitespace */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  400. /* forward */
  401. static PyObject* get_latin1_char(unsigned char ch);
  402. static int unicode_modifiable(PyObject *unicode);
  403. static PyObject *
  404. _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
  405. static PyObject *
  406. _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
  407. static PyObject *
  408. _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
  409. static PyObject *
  410. unicode_encode_call_errorhandler(const char *errors,
  411. PyObject **errorHandler,const char *encoding, const char *reason,
  412. PyObject *unicode, PyObject **exceptionObject,
  413. Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
  414. static void
  415. raise_encode_exception(PyObject **exceptionObject,
  416. const char *encoding,
  417. PyObject *unicode,
  418. Py_ssize_t startpos, Py_ssize_t endpos,
  419. const char *reason);
  /* Fast detection of the most frequent line break characters.
     The table has one entry per code point 0x00..0x7F (16 entries per row
     below); an entry is 1 if the character is a line break and 0 otherwise. */
  static const unsigned char ascii_linebreak[] = {
      /* 0x00..0x0F -- 0x0A LINE FEED, 0x0B LINE TABULATION,
         0x0C FORM FEED, 0x0D CARRIAGE RETURN */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
      /* 0x10..0x1F -- 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
         0x1E RECORD SEPARATOR */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
      /* 0x20..0x7F -- no line break */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  446. static int convert_uc(PyObject *obj, void *addr);
  447. struct encoding_map;
  448. #include "clinic/unicodeobject.c.h"
  449. _Py_error_handler
  450. _Py_GetErrorHandler(const char *errors)
  451. {
  452. if (errors == NULL || strcmp(errors, "strict") == 0) {
  453. return _Py_ERROR_STRICT;
  454. }
  455. if (strcmp(errors, "surrogateescape") == 0) {
  456. return _Py_ERROR_SURROGATEESCAPE;
  457. }
  458. if (strcmp(errors, "replace") == 0) {
  459. return _Py_ERROR_REPLACE;
  460. }
  461. if (strcmp(errors, "ignore") == 0) {
  462. return _Py_ERROR_IGNORE;
  463. }
  464. if (strcmp(errors, "backslashreplace") == 0) {
  465. return _Py_ERROR_BACKSLASHREPLACE;
  466. }
  467. if (strcmp(errors, "surrogatepass") == 0) {
  468. return _Py_ERROR_SURROGATEPASS;
  469. }
  470. if (strcmp(errors, "xmlcharrefreplace") == 0) {
  471. return _Py_ERROR_XMLCHARREFREPLACE;
  472. }
  473. return _Py_ERROR_OTHER;
  474. }
  475. static _Py_error_handler
  476. get_error_handler_wide(const wchar_t *errors)
  477. {
  478. if (errors == NULL || wcscmp(errors, L"strict") == 0) {
  479. return _Py_ERROR_STRICT;
  480. }
  481. if (wcscmp(errors, L"surrogateescape") == 0) {
  482. return _Py_ERROR_SURROGATEESCAPE;
  483. }
  484. if (wcscmp(errors, L"replace") == 0) {
  485. return _Py_ERROR_REPLACE;
  486. }
  487. if (wcscmp(errors, L"ignore") == 0) {
  488. return _Py_ERROR_IGNORE;
  489. }
  490. if (wcscmp(errors, L"backslashreplace") == 0) {
  491. return _Py_ERROR_BACKSLASHREPLACE;
  492. }
  493. if (wcscmp(errors, L"surrogatepass") == 0) {
  494. return _Py_ERROR_SURROGATEPASS;
  495. }
  496. if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
  497. return _Py_ERROR_XMLCHARREFREPLACE;
  498. }
  499. return _Py_ERROR_OTHER;
  500. }
  501. static inline int
  502. unicode_check_encoding_errors(const char *encoding, const char *errors)
  503. {
  504. if (encoding == NULL && errors == NULL) {
  505. return 0;
  506. }
  507. PyInterpreterState *interp = _PyInterpreterState_GET();
  508. #ifndef Py_DEBUG
  509. /* In release mode, only check in development mode (-X dev) */
  510. if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
  511. return 0;
  512. }
  513. #else
  514. /* Always check in debug mode */
  515. #endif
  516. /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
  517. codec registry is ready: before_PyUnicode_InitEncodings() is called. */
  518. if (!interp->unicode.fs_codec.encoding) {
  519. return 0;
  520. }
  521. /* Disable checks during Python finalization. For example, it allows to
  522. call _PyObject_Dump() during finalization for debugging purpose. */
  523. if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
  524. return 0;
  525. }
  526. if (encoding != NULL
  527. // Fast path for the most common built-in encodings. Even if the codec
  528. // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
  529. // create a temporary Unicode string (the key in the cache).
  530. && strcmp(encoding, "utf-8") != 0
  531. && strcmp(encoding, "utf8") != 0
  532. && strcmp(encoding, "ascii") != 0)
  533. {
  534. PyObject *handler = _PyCodec_Lookup(encoding);
  535. if (handler == NULL) {
  536. return -1;
  537. }
  538. Py_DECREF(handler);
  539. }
  540. if (errors != NULL
  541. // Fast path for the most common built-in error handlers.
  542. && strcmp(errors, "strict") != 0
  543. && strcmp(errors, "ignore") != 0
  544. && strcmp(errors, "replace") != 0
  545. && strcmp(errors, "surrogateescape") != 0
  546. && strcmp(errors, "surrogatepass") != 0)
  547. {
  548. PyObject *handler = PyCodec_LookupError(errors);
  549. if (handler == NULL) {
  550. return -1;
  551. }
  552. Py_DECREF(handler);
  553. }
  554. return 0;
  555. }
  556. int
  557. _PyUnicode_CheckConsistency(PyObject *op, int check_content)
  558. {
  559. #define CHECK(expr) \
  560. do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
  561. assert(op != NULL);
  562. CHECK(PyUnicode_Check(op));
  563. PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
  564. int kind = ascii->state.kind;
  565. if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
  566. CHECK(kind == PyUnicode_1BYTE_KIND);
  567. }
  568. else {
  569. PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
  570. void *data;
  571. if (ascii->state.compact == 1) {
  572. data = compact + 1;
  573. CHECK(kind == PyUnicode_1BYTE_KIND
  574. || kind == PyUnicode_2BYTE_KIND
  575. || kind == PyUnicode_4BYTE_KIND);
  576. CHECK(ascii->state.ascii == 0);
  577. CHECK(compact->utf8 != data);
  578. }
  579. else {
  580. PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
  581. data = unicode->data.any;
  582. CHECK(kind == PyUnicode_1BYTE_KIND
  583. || kind == PyUnicode_2BYTE_KIND
  584. || kind == PyUnicode_4BYTE_KIND);
  585. CHECK(ascii->state.compact == 0);
  586. CHECK(data != NULL);
  587. if (ascii->state.ascii) {
  588. CHECK(compact->utf8 == data);
  589. CHECK(compact->utf8_length == ascii->length);
  590. }
  591. else {
  592. CHECK(compact->utf8 != data);
  593. }
  594. }
  595. if (compact->utf8 == NULL)
  596. CHECK(compact->utf8_length == 0);
  597. }
  598. /* check that the best kind is used: O(n) operation */
  599. if (check_content) {
  600. Py_ssize_t i;
  601. Py_UCS4 maxchar = 0;
  602. const void *data;
  603. Py_UCS4 ch;
  604. data = PyUnicode_DATA(ascii);
  605. for (i=0; i < ascii->length; i++)
  606. {
  607. ch = PyUnicode_READ(kind, data, i);
  608. if (ch > maxchar)
  609. maxchar = ch;
  610. }
  611. if (kind == PyUnicode_1BYTE_KIND) {
  612. if (ascii->state.ascii == 0) {
  613. CHECK(maxchar >= 128);
  614. CHECK(maxchar <= 255);
  615. }
  616. else
  617. CHECK(maxchar < 128);
  618. }
  619. else if (kind == PyUnicode_2BYTE_KIND) {
  620. CHECK(maxchar >= 0x100);
  621. CHECK(maxchar <= 0xFFFF);
  622. }
  623. else {
  624. CHECK(maxchar >= 0x10000);
  625. CHECK(maxchar <= MAX_UNICODE);
  626. }
  627. CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
  628. }
  629. /* Check interning state */
  630. #ifdef Py_DEBUG
  631. // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
  632. // extensions can make immortal strings mortal (but with a high enough
  633. // refcount).
  634. // The other way is extremely unlikely (worth a potential failed assertion
  635. // in a debug build), so we do check `!_Py_IsImmortal(op)`.
  636. switch (PyUnicode_CHECK_INTERNED(op)) {
  637. case SSTATE_NOT_INTERNED:
  638. if (ascii->state.statically_allocated) {
  639. // This state is for two exceptions:
  640. // - strings are currently checked before they're interned
  641. // - the 256 one-latin1-character strings
  642. // are static but use SSTATE_NOT_INTERNED
  643. }
  644. else {
  645. CHECK(!_Py_IsImmortal(op));
  646. }
  647. break;
  648. case SSTATE_INTERNED_MORTAL:
  649. CHECK(!ascii->state.statically_allocated);
  650. CHECK(!_Py_IsImmortal(op));
  651. break;
  652. case SSTATE_INTERNED_IMMORTAL:
  653. CHECK(!ascii->state.statically_allocated);
  654. break;
  655. case SSTATE_INTERNED_IMMORTAL_STATIC:
  656. CHECK(ascii->state.statically_allocated);
  657. break;
  658. default:
  659. Py_UNREACHABLE();
  660. }
  661. #endif
  662. return 1;
  663. #undef CHECK
  664. }
  665. static PyObject*
  666. unicode_result(PyObject *unicode)
  667. {
  668. assert(_PyUnicode_CHECK(unicode));
  669. Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
  670. if (length == 0) {
  671. PyObject *empty = unicode_get_empty();
  672. if (unicode != empty) {
  673. Py_DECREF(unicode);
  674. Py_INCREF(empty);
  675. }
  676. return empty;
  677. }
  678. if (length == 1) {
  679. int kind = PyUnicode_KIND(unicode);
  680. if (kind == PyUnicode_1BYTE_KIND) {
  681. const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
  682. Py_UCS1 ch = data[0];
  683. PyObject *latin1_char = LATIN1(ch);
  684. if (unicode != latin1_char) {
  685. Py_INCREF(latin1_char);
  686. Py_DECREF(unicode);
  687. }
  688. return latin1_char;
  689. }
  690. }
  691. assert(_PyUnicode_CheckConsistency(unicode, 1));
  692. return unicode;
  693. }
  694. static PyObject*
  695. unicode_result_unchanged(PyObject *unicode)
  696. {
  697. if (PyUnicode_CheckExact(unicode)) {
  698. return Py_NewRef(unicode);
  699. }
  700. else
  701. /* Subtype -- return genuine unicode string with the same value. */
  702. return _PyUnicode_Copy(unicode);
  703. }
  704. /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
  705. ASCII, Latin1, UTF-8, etc. */
  706. static char*
  707. backslashreplace(_PyBytesWriter *writer, char *str,
  708. PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
  709. {
  710. Py_ssize_t size, i;
  711. Py_UCS4 ch;
  712. int kind;
  713. const void *data;
  714. kind = PyUnicode_KIND(unicode);
  715. data = PyUnicode_DATA(unicode);
  716. size = 0;
  717. /* determine replacement size */
  718. for (i = collstart; i < collend; ++i) {
  719. Py_ssize_t incr;
  720. ch = PyUnicode_READ(kind, data, i);
  721. if (ch < 0x100)
  722. incr = 2+2;
  723. else if (ch < 0x10000)
  724. incr = 2+4;
  725. else {
  726. assert(ch <= MAX_UNICODE);
  727. incr = 2+8;
  728. }
  729. if (size > PY_SSIZE_T_MAX - incr) {
  730. PyErr_SetString(PyExc_OverflowError,
  731. "encoded result is too long for a Python string");
  732. return NULL;
  733. }
  734. size += incr;
  735. }
  736. str = _PyBytesWriter_Prepare(writer, str, size);
  737. if (str == NULL)
  738. return NULL;
  739. /* generate replacement */
  740. for (i = collstart; i < collend; ++i) {
  741. ch = PyUnicode_READ(kind, data, i);
  742. *str++ = '\\';
  743. if (ch >= 0x00010000) {
  744. *str++ = 'U';
  745. *str++ = Py_hexdigits[(ch>>28)&0xf];
  746. *str++ = Py_hexdigits[(ch>>24)&0xf];
  747. *str++ = Py_hexdigits[(ch>>20)&0xf];
  748. *str++ = Py_hexdigits[(ch>>16)&0xf];
  749. *str++ = Py_hexdigits[(ch>>12)&0xf];
  750. *str++ = Py_hexdigits[(ch>>8)&0xf];
  751. }
  752. else if (ch >= 0x100) {
  753. *str++ = 'u';
  754. *str++ = Py_hexdigits[(ch>>12)&0xf];
  755. *str++ = Py_hexdigits[(ch>>8)&0xf];
  756. }
  757. else
  758. *str++ = 'x';
  759. *str++ = Py_hexdigits[(ch>>4)&0xf];
  760. *str++ = Py_hexdigits[ch&0xf];
  761. }
  762. return str;
  763. }
  764. /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
  765. ASCII, Latin1, UTF-8, etc. */
  766. static char*
  767. xmlcharrefreplace(_PyBytesWriter *writer, char *str,
  768. PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
  769. {
  770. Py_ssize_t size, i;
  771. Py_UCS4 ch;
  772. int kind;
  773. const void *data;
  774. kind = PyUnicode_KIND(unicode);
  775. data = PyUnicode_DATA(unicode);
  776. size = 0;
  777. /* determine replacement size */
  778. for (i = collstart; i < collend; ++i) {
  779. Py_ssize_t incr;
  780. ch = PyUnicode_READ(kind, data, i);
  781. if (ch < 10)
  782. incr = 2+1+1;
  783. else if (ch < 100)
  784. incr = 2+2+1;
  785. else if (ch < 1000)
  786. incr = 2+3+1;
  787. else if (ch < 10000)
  788. incr = 2+4+1;
  789. else if (ch < 100000)
  790. incr = 2+5+1;
  791. else if (ch < 1000000)
  792. incr = 2+6+1;
  793. else {
  794. assert(ch <= MAX_UNICODE);
  795. incr = 2+7+1;
  796. }
  797. if (size > PY_SSIZE_T_MAX - incr) {
  798. PyErr_SetString(PyExc_OverflowError,
  799. "encoded result is too long for a Python string");
  800. return NULL;
  801. }
  802. size += incr;
  803. }
  804. str = _PyBytesWriter_Prepare(writer, str, size);
  805. if (str == NULL)
  806. return NULL;
  807. /* generate replacement */
  808. for (i = collstart; i < collend; ++i) {
  809. size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
  810. if (size < 0) {
  811. return NULL;
  812. }
  813. str += size;
  814. }
  815. return str;
  816. }
  817. /* --- Bloom Filters ----------------------------------------------------- */
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used: the low bits of each
   character, (ch & (BLOOM_WIDTH - 1)), select the bit index. */
  821. /* the linebreak mask is set up by _PyUnicode_Init() below */
  822. #if LONG_BIT >= 128
  823. #define BLOOM_WIDTH 128
  824. #elif LONG_BIT >= 64
  825. #define BLOOM_WIDTH 64
  826. #elif LONG_BIT >= 32
  827. #define BLOOM_WIDTH 32
  828. #else
  829. #error "LONG_BIT is smaller than 32"
  830. #endif
  831. #define BLOOM_MASK unsigned long
  832. static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
  833. #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
  834. #define BLOOM_LINEBREAK(ch) \
  835. ((ch) < 128U ? ascii_linebreak[(ch)] : \
  836. (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
  837. static inline BLOOM_MASK
  838. make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
  839. {
  840. #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
  841. do { \
  842. TYPE *data = (TYPE *)PTR; \
  843. TYPE *end = data + LEN; \
  844. Py_UCS4 ch; \
  845. for (; data != end; data++) { \
  846. ch = *data; \
  847. MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
  848. } \
  849. break; \
  850. } while (0)
  851. /* calculate simple bloom-style bitmask for a given unicode string */
  852. BLOOM_MASK mask;
  853. mask = 0;
  854. switch (kind) {
  855. case PyUnicode_1BYTE_KIND:
  856. BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
  857. break;
  858. case PyUnicode_2BYTE_KIND:
  859. BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
  860. break;
  861. case PyUnicode_4BYTE_KIND:
  862. BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
  863. break;
  864. default:
  865. Py_UNREACHABLE();
  866. }
  867. return mask;
  868. #undef BLOOM_UPDATE
  869. }
  870. static int
  871. ensure_unicode(PyObject *obj)
  872. {
  873. if (!PyUnicode_Check(obj)) {
  874. PyErr_Format(PyExc_TypeError,
  875. "must be str, not %.100s",
  876. Py_TYPE(obj)->tp_name);
  877. return -1;
  878. }
  879. return 0;
  880. }
  881. /* Compilation of templated routines */
  882. #define STRINGLIB_GET_EMPTY() unicode_get_empty()
  883. #include "stringlib/asciilib.h"
  884. #include "stringlib/fastsearch.h"
  885. #include "stringlib/partition.h"
  886. #include "stringlib/split.h"
  887. #include "stringlib/count.h"
  888. #include "stringlib/find.h"
  889. #include "stringlib/find_max_char.h"
  890. #include "stringlib/undef.h"
  891. #include "stringlib/ucs1lib.h"
  892. #include "stringlib/fastsearch.h"
  893. #include "stringlib/partition.h"
  894. #include "stringlib/split.h"
  895. #include "stringlib/count.h"
  896. #include "stringlib/find.h"
  897. #include "stringlib/replace.h"
  898. #include "stringlib/find_max_char.h"
  899. #include "stringlib/undef.h"
  900. #include "stringlib/ucs2lib.h"
  901. #include "stringlib/fastsearch.h"
  902. #include "stringlib/partition.h"
  903. #include "stringlib/split.h"
  904. #include "stringlib/count.h"
  905. #include "stringlib/find.h"
  906. #include "stringlib/replace.h"
  907. #include "stringlib/find_max_char.h"
  908. #include "stringlib/undef.h"
  909. #include "stringlib/ucs4lib.h"
  910. #include "stringlib/fastsearch.h"
  911. #include "stringlib/partition.h"
  912. #include "stringlib/split.h"
  913. #include "stringlib/count.h"
  914. #include "stringlib/find.h"
  915. #include "stringlib/replace.h"
  916. #include "stringlib/find_max_char.h"
  917. #include "stringlib/undef.h"
  918. #undef STRINGLIB_GET_EMPTY
  919. /* --- Unicode Object ----------------------------------------------------- */
  920. static inline Py_ssize_t
  921. findchar(const void *s, int kind,
  922. Py_ssize_t size, Py_UCS4 ch,
  923. int direction)
  924. {
  925. switch (kind) {
  926. case PyUnicode_1BYTE_KIND:
  927. if ((Py_UCS1) ch != ch)
  928. return -1;
  929. if (direction > 0)
  930. return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
  931. else
  932. return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
  933. case PyUnicode_2BYTE_KIND:
  934. if ((Py_UCS2) ch != ch)
  935. return -1;
  936. if (direction > 0)
  937. return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
  938. else
  939. return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
  940. case PyUnicode_4BYTE_KIND:
  941. if (direction > 0)
  942. return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
  943. else
  944. return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
  945. default:
  946. Py_UNREACHABLE();
  947. }
  948. }
  949. #ifdef Py_DEBUG
  950. /* Fill the data of a Unicode string with invalid characters to detect bugs
  951. earlier.
  952. _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
  953. ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
  954. invalid character in Unicode 6.0. */
  955. static void
  956. unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
  957. {
  958. int kind = PyUnicode_KIND(unicode);
  959. Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
  960. Py_ssize_t length = _PyUnicode_LENGTH(unicode);
  961. if (length <= old_length)
  962. return;
  963. memset(data + old_length * kind, 0xff, (length - old_length) * kind);
  964. }
  965. #endif
  966. static PyObject*
  967. resize_compact(PyObject *unicode, Py_ssize_t length)
  968. {
  969. Py_ssize_t char_size;
  970. Py_ssize_t struct_size;
  971. Py_ssize_t new_size;
  972. PyObject *new_unicode;
  973. #ifdef Py_DEBUG
  974. Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
  975. #endif
  976. assert(unicode_modifiable(unicode));
  977. assert(PyUnicode_IS_COMPACT(unicode));
  978. char_size = PyUnicode_KIND(unicode);
  979. if (PyUnicode_IS_ASCII(unicode))
  980. struct_size = sizeof(PyASCIIObject);
  981. else
  982. struct_size = sizeof(PyCompactUnicodeObject);
  983. if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
  984. PyErr_NoMemory();
  985. return NULL;
  986. }
  987. new_size = (struct_size + (length + 1) * char_size);
  988. if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
  989. PyObject_Free(_PyUnicode_UTF8(unicode));
  990. _PyUnicode_UTF8(unicode) = NULL;
  991. _PyUnicode_UTF8_LENGTH(unicode) = 0;
  992. }
  993. #ifdef Py_TRACE_REFS
  994. _Py_ForgetReference(unicode);
  995. #endif
  996. new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
  997. if (new_unicode == NULL) {
  998. _Py_NewReferenceNoTotal(unicode);
  999. PyErr_NoMemory();
  1000. return NULL;
  1001. }
  1002. unicode = new_unicode;
  1003. _Py_NewReferenceNoTotal(unicode);
  1004. _PyUnicode_LENGTH(unicode) = length;
  1005. #ifdef Py_DEBUG
  1006. unicode_fill_invalid(unicode, old_length);
  1007. #endif
  1008. PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
  1009. length, 0);
  1010. assert(_PyUnicode_CheckConsistency(unicode, 0));
  1011. return unicode;
  1012. }
  1013. static int
  1014. resize_inplace(PyObject *unicode, Py_ssize_t length)
  1015. {
  1016. assert(!PyUnicode_IS_COMPACT(unicode));
  1017. assert(Py_REFCNT(unicode) == 1);
  1018. Py_ssize_t new_size;
  1019. Py_ssize_t char_size;
  1020. int share_utf8;
  1021. void *data;
  1022. #ifdef Py_DEBUG
  1023. Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
  1024. #endif
  1025. data = _PyUnicode_DATA_ANY(unicode);
  1026. char_size = PyUnicode_KIND(unicode);
  1027. share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
  1028. if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
  1029. PyErr_NoMemory();
  1030. return -1;
  1031. }
  1032. new_size = (length + 1) * char_size;
  1033. if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
  1034. {
  1035. PyObject_Free(_PyUnicode_UTF8(unicode));
  1036. _PyUnicode_UTF8(unicode) = NULL;
  1037. _PyUnicode_UTF8_LENGTH(unicode) = 0;
  1038. }
  1039. data = (PyObject *)PyObject_Realloc(data, new_size);
  1040. if (data == NULL) {
  1041. PyErr_NoMemory();
  1042. return -1;
  1043. }
  1044. _PyUnicode_DATA_ANY(unicode) = data;
  1045. if (share_utf8) {
  1046. _PyUnicode_UTF8(unicode) = data;
  1047. _PyUnicode_UTF8_LENGTH(unicode) = length;
  1048. }
  1049. _PyUnicode_LENGTH(unicode) = length;
  1050. PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
  1051. #ifdef Py_DEBUG
  1052. unicode_fill_invalid(unicode, old_length);
  1053. #endif
  1054. /* check for integer overflow */
  1055. if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
  1056. PyErr_NoMemory();
  1057. return -1;
  1058. }
  1059. assert(_PyUnicode_CheckConsistency(unicode, 0));
  1060. return 0;
  1061. }
  1062. static PyObject*
  1063. resize_copy(PyObject *unicode, Py_ssize_t length)
  1064. {
  1065. Py_ssize_t copy_length;
  1066. PyObject *copy;
  1067. copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
  1068. if (copy == NULL)
  1069. return NULL;
  1070. copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
  1071. _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
  1072. return copy;
  1073. }
  1074. static const char*
  1075. unicode_kind_name(PyObject *unicode)
  1076. {
  1077. /* don't check consistency: unicode_kind_name() is called from
  1078. _PyUnicode_Dump() */
  1079. if (!PyUnicode_IS_COMPACT(unicode))
  1080. {
  1081. switch (PyUnicode_KIND(unicode))
  1082. {
  1083. case PyUnicode_1BYTE_KIND:
  1084. if (PyUnicode_IS_ASCII(unicode))
  1085. return "legacy ascii";
  1086. else
  1087. return "legacy latin1";
  1088. case PyUnicode_2BYTE_KIND:
  1089. return "legacy UCS2";
  1090. case PyUnicode_4BYTE_KIND:
  1091. return "legacy UCS4";
  1092. default:
  1093. return "<legacy invalid kind>";
  1094. }
  1095. }
  1096. switch (PyUnicode_KIND(unicode)) {
  1097. case PyUnicode_1BYTE_KIND:
  1098. if (PyUnicode_IS_ASCII(unicode))
  1099. return "ascii";
  1100. else
  1101. return "latin1";
  1102. case PyUnicode_2BYTE_KIND:
  1103. return "UCS2";
  1104. case PyUnicode_4BYTE_KIND:
  1105. return "UCS4";
  1106. default:
  1107. return "<invalid compact kind>";
  1108. }
  1109. }
  1110. #ifdef Py_DEBUG
  1111. /* Functions wrapping macros for use in debugger */
  1112. const char *_PyUnicode_utf8(void *unicode_raw){
  1113. PyObject *unicode = _PyObject_CAST(unicode_raw);
  1114. return PyUnicode_UTF8(unicode);
  1115. }
  1116. const void *_PyUnicode_compact_data(void *unicode_raw) {
  1117. PyObject *unicode = _PyObject_CAST(unicode_raw);
  1118. return _PyUnicode_COMPACT_DATA(unicode);
  1119. }
  1120. const void *_PyUnicode_data(void *unicode_raw) {
  1121. PyObject *unicode = _PyObject_CAST(unicode_raw);
  1122. printf("obj %p\n", (void*)unicode);
  1123. printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
  1124. printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
  1125. printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
  1126. printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
  1127. printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
  1128. return PyUnicode_DATA(unicode);
  1129. }
  1130. void
  1131. _PyUnicode_Dump(PyObject *op)
  1132. {
  1133. PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
  1134. PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
  1135. PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
  1136. const void *data;
  1137. if (ascii->state.compact)
  1138. {
  1139. if (ascii->state.ascii)
  1140. data = (ascii + 1);
  1141. else
  1142. data = (compact + 1);
  1143. }
  1144. else
  1145. data = unicode->data.any;
  1146. printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
  1147. if (!ascii->state.ascii) {
  1148. printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
  1149. }
  1150. printf(", data=%p\n", data);
  1151. }
  1152. #endif
  1153. PyObject *
  1154. PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
  1155. {
  1156. /* Optimization for empty strings */
  1157. if (size == 0) {
  1158. return unicode_new_empty();
  1159. }
  1160. PyObject *obj;
  1161. PyCompactUnicodeObject *unicode;
  1162. void *data;
  1163. int kind;
  1164. int is_ascii;
  1165. Py_ssize_t char_size;
  1166. Py_ssize_t struct_size;
  1167. is_ascii = 0;
  1168. struct_size = sizeof(PyCompactUnicodeObject);
  1169. if (maxchar < 128) {
  1170. kind = PyUnicode_1BYTE_KIND;
  1171. char_size = 1;
  1172. is_ascii = 1;
  1173. struct_size = sizeof(PyASCIIObject);
  1174. }
  1175. else if (maxchar < 256) {
  1176. kind = PyUnicode_1BYTE_KIND;
  1177. char_size = 1;
  1178. }
  1179. else if (maxchar < 65536) {
  1180. kind = PyUnicode_2BYTE_KIND;
  1181. char_size = 2;
  1182. }
  1183. else {
  1184. if (maxchar > MAX_UNICODE) {
  1185. PyErr_SetString(PyExc_SystemError,
  1186. "invalid maximum character passed to PyUnicode_New");
  1187. return NULL;
  1188. }
  1189. kind = PyUnicode_4BYTE_KIND;
  1190. char_size = 4;
  1191. }
  1192. /* Ensure we won't overflow the size. */
  1193. if (size < 0) {
  1194. PyErr_SetString(PyExc_SystemError,
  1195. "Negative size passed to PyUnicode_New");
  1196. return NULL;
  1197. }
  1198. if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
  1199. return PyErr_NoMemory();
  1200. /* Duplicated allocation code from _PyObject_New() instead of a call to
  1201. * PyObject_New() so we are able to allocate space for the object and
  1202. * it's data buffer.
  1203. */
  1204. obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
  1205. if (obj == NULL) {
  1206. return PyErr_NoMemory();
  1207. }
  1208. _PyObject_Init(obj, &PyUnicode_Type);
  1209. unicode = (PyCompactUnicodeObject *)obj;
  1210. if (is_ascii)
  1211. data = ((PyASCIIObject*)obj) + 1;
  1212. else
  1213. data = unicode + 1;
  1214. _PyUnicode_LENGTH(unicode) = size;
  1215. _PyUnicode_HASH(unicode) = -1;
  1216. _PyUnicode_STATE(unicode).interned = 0;
  1217. _PyUnicode_STATE(unicode).kind = kind;
  1218. _PyUnicode_STATE(unicode).compact = 1;
  1219. _PyUnicode_STATE(unicode).ascii = is_ascii;
  1220. _PyUnicode_STATE(unicode).statically_allocated = 0;
  1221. if (is_ascii) {
  1222. ((char*)data)[size] = 0;
  1223. }
  1224. else if (kind == PyUnicode_1BYTE_KIND) {
  1225. ((char*)data)[size] = 0;
  1226. unicode->utf8 = NULL;
  1227. unicode->utf8_length = 0;
  1228. }
  1229. else {
  1230. unicode->utf8 = NULL;
  1231. unicode->utf8_length = 0;
  1232. if (kind == PyUnicode_2BYTE_KIND)
  1233. ((Py_UCS2*)data)[size] = 0;
  1234. else /* kind == PyUnicode_4BYTE_KIND */
  1235. ((Py_UCS4*)data)[size] = 0;
  1236. }
  1237. #ifdef Py_DEBUG
  1238. unicode_fill_invalid((PyObject*)unicode, 0);
  1239. #endif
  1240. assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
  1241. return obj;
  1242. }
  1243. #if SIZEOF_WCHAR_T == 2
  1244. /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
  1245. will decode surrogate pairs, the other conversions are implemented as macros
  1246. for efficiency.
  1247. This function assumes that unicode can hold one more code point than wstr
  1248. characters for a terminating null character. */
  1249. static void
  1250. unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
  1251. PyObject *unicode)
  1252. {
  1253. const wchar_t *iter;
  1254. Py_UCS4 *ucs4_out;
  1255. assert(unicode != NULL);
  1256. assert(_PyUnicode_CHECK(unicode));
  1257. assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
  1258. ucs4_out = PyUnicode_4BYTE_DATA(unicode);
  1259. for (iter = begin; iter < end; ) {
  1260. assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
  1261. _PyUnicode_GET_LENGTH(unicode)));
  1262. if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
  1263. && (iter+1) < end
  1264. && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
  1265. {
  1266. *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
  1267. iter += 2;
  1268. }
  1269. else {
  1270. *ucs4_out++ = *iter;
  1271. iter++;
  1272. }
  1273. }
  1274. assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
  1275. _PyUnicode_GET_LENGTH(unicode)));
  1276. }
  1277. #endif
  1278. static int
  1279. unicode_check_modifiable(PyObject *unicode)
  1280. {
  1281. if (!unicode_modifiable(unicode)) {
  1282. PyErr_SetString(PyExc_SystemError,
  1283. "Cannot modify a string currently used");
  1284. return -1;
  1285. }
  1286. return 0;
  1287. }
  1288. static int
  1289. _copy_characters(PyObject *to, Py_ssize_t to_start,
  1290. PyObject *from, Py_ssize_t from_start,
  1291. Py_ssize_t how_many, int check_maxchar)
  1292. {
  1293. int from_kind, to_kind;
  1294. const void *from_data;
  1295. void *to_data;
  1296. assert(0 <= how_many);
  1297. assert(0 <= from_start);
  1298. assert(0 <= to_start);
  1299. assert(PyUnicode_Check(from));
  1300. assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
  1301. assert(to == NULL || PyUnicode_Check(to));
  1302. if (how_many == 0) {
  1303. return 0;
  1304. }
  1305. assert(to != NULL);
  1306. assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
  1307. from_kind = PyUnicode_KIND(from);
  1308. from_data = PyUnicode_DATA(from);
  1309. to_kind = PyUnicode_KIND(to);
  1310. to_data = PyUnicode_DATA(to);
  1311. #ifdef Py_DEBUG
  1312. if (!check_maxchar
  1313. && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
  1314. {
  1315. Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
  1316. Py_UCS4 ch;
  1317. Py_ssize_t i;
  1318. for (i=0; i < how_many; i++) {
  1319. ch = PyUnicode_READ(from_kind, from_data, from_start + i);
  1320. assert(ch <= to_maxchar);
  1321. }
  1322. }
  1323. #endif
  1324. if (from_kind == to_kind) {
  1325. if (check_maxchar
  1326. && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
  1327. {
  1328. /* Writing Latin-1 characters into an ASCII string requires to
  1329. check that all written characters are pure ASCII */
  1330. Py_UCS4 max_char;
  1331. max_char = ucs1lib_find_max_char(from_data,
  1332. (const Py_UCS1*)from_data + how_many);
  1333. if (max_char >= 128)
  1334. return -1;
  1335. }
  1336. memcpy((char*)to_data + to_kind * to_start,
  1337. (const char*)from_data + from_kind * from_start,
  1338. to_kind * how_many);
  1339. }
  1340. else if (from_kind == PyUnicode_1BYTE_KIND
  1341. && to_kind == PyUnicode_2BYTE_KIND)
  1342. {
  1343. _PyUnicode_CONVERT_BYTES(
  1344. Py_UCS1, Py_UCS2,
  1345. PyUnicode_1BYTE_DATA(from) + from_start,
  1346. PyUnicode_1BYTE_DATA(from) + from_start + how_many,
  1347. PyUnicode_2BYTE_DATA(to) + to_start
  1348. );
  1349. }
  1350. else if (from_kind == PyUnicode_1BYTE_KIND
  1351. && to_kind == PyUnicode_4BYTE_KIND)
  1352. {
  1353. _PyUnicode_CONVERT_BYTES(
  1354. Py_UCS1, Py_UCS4,
  1355. PyUnicode_1BYTE_DATA(from) + from_start,
  1356. PyUnicode_1BYTE_DATA(from) + from_start + how_many,
  1357. PyUnicode_4BYTE_DATA(to) + to_start
  1358. );
  1359. }
  1360. else if (from_kind == PyUnicode_2BYTE_KIND
  1361. && to_kind == PyUnicode_4BYTE_KIND)
  1362. {
  1363. _PyUnicode_CONVERT_BYTES(
  1364. Py_UCS2, Py_UCS4,
  1365. PyUnicode_2BYTE_DATA(from) + from_start,
  1366. PyUnicode_2BYTE_DATA(from) + from_start + how_many,
  1367. PyUnicode_4BYTE_DATA(to) + to_start
  1368. );
  1369. }
  1370. else {
  1371. assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
  1372. if (!check_maxchar) {
  1373. if (from_kind == PyUnicode_2BYTE_KIND
  1374. && to_kind == PyUnicode_1BYTE_KIND)
  1375. {
  1376. _PyUnicode_CONVERT_BYTES(
  1377. Py_UCS2, Py_UCS1,
  1378. PyUnicode_2BYTE_DATA(from) + from_start,
  1379. PyUnicode_2BYTE_DATA(from) + from_start + how_many,
  1380. PyUnicode_1BYTE_DATA(to) + to_start
  1381. );
  1382. }
  1383. else if (from_kind == PyUnicode_4BYTE_KIND
  1384. && to_kind == PyUnicode_1BYTE_KIND)
  1385. {
  1386. _PyUnicode_CONVERT_BYTES(
  1387. Py_UCS4, Py_UCS1,
  1388. PyUnicode_4BYTE_DATA(from) + from_start,
  1389. PyUnicode_4BYTE_DATA(from) + from_start + how_many,
  1390. PyUnicode_1BYTE_DATA(to) + to_start
  1391. );
  1392. }
  1393. else if (from_kind == PyUnicode_4BYTE_KIND
  1394. && to_kind == PyUnicode_2BYTE_KIND)
  1395. {
  1396. _PyUnicode_CONVERT_BYTES(
  1397. Py_UCS4, Py_UCS2,
  1398. PyUnicode_4BYTE_DATA(from) + from_start,
  1399. PyUnicode_4BYTE_DATA(from) + from_start + how_many,
  1400. PyUnicode_2BYTE_DATA(to) + to_start
  1401. );
  1402. }
  1403. else {
  1404. Py_UNREACHABLE();
  1405. }
  1406. }
  1407. else {
  1408. const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
  1409. Py_UCS4 ch;
  1410. Py_ssize_t i;
  1411. for (i=0; i < how_many; i++) {
  1412. ch = PyUnicode_READ(from_kind, from_data, from_start + i);
  1413. if (ch > to_maxchar)
  1414. return -1;
  1415. PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
  1416. }
  1417. }
  1418. }
  1419. return 0;
  1420. }
  1421. void
  1422. _PyUnicode_FastCopyCharacters(
  1423. PyObject *to, Py_ssize_t to_start,
  1424. PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
  1425. {
  1426. (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
  1427. }
  1428. Py_ssize_t
  1429. PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
  1430. PyObject *from, Py_ssize_t from_start,
  1431. Py_ssize_t how_many)
  1432. {
  1433. int err;
  1434. if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
  1435. PyErr_BadInternalCall();
  1436. return -1;
  1437. }
  1438. if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
  1439. PyErr_SetString(PyExc_IndexError, "string index out of range");
  1440. return -1;
  1441. }
  1442. if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
  1443. PyErr_SetString(PyExc_IndexError, "string index out of range");
  1444. return -1;
  1445. }
  1446. if (how_many < 0) {
  1447. PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
  1448. return -1;
  1449. }
  1450. how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
  1451. if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
  1452. PyErr_Format(PyExc_SystemError,
  1453. "Cannot write %zi characters at %zi "
  1454. "in a string of %zi characters",
  1455. how_many, to_start, PyUnicode_GET_LENGTH(to));
  1456. return -1;
  1457. }
  1458. if (how_many == 0)
  1459. return 0;
  1460. if (unicode_check_modifiable(to))
  1461. return -1;
  1462. err = _copy_characters(to, to_start, from, from_start, how_many, 1);
  1463. if (err) {
  1464. PyErr_Format(PyExc_SystemError,
  1465. "Cannot copy %s characters "
  1466. "into a string of %s characters",
  1467. unicode_kind_name(from),
  1468. unicode_kind_name(to));
  1469. return -1;
  1470. }
  1471. return how_many;
  1472. }
  1473. /* Find the maximum code point and count the number of surrogate pairs so a
  1474. correct string length can be computed before converting a string to UCS4.
  1475. This function counts single surrogates as a character and not as a pair.
  1476. Return 0 on success, or -1 on error. */
  1477. static int
  1478. find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
  1479. Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
  1480. {
  1481. const wchar_t *iter;
  1482. Py_UCS4 ch;
  1483. assert(num_surrogates != NULL && maxchar != NULL);
  1484. *num_surrogates = 0;
  1485. *maxchar = 0;
  1486. for (iter = begin; iter < end; ) {
  1487. #if SIZEOF_WCHAR_T == 2
  1488. if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
  1489. && (iter+1) < end
  1490. && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
  1491. {
  1492. ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
  1493. ++(*num_surrogates);
  1494. iter += 2;
  1495. }
  1496. else
  1497. #endif
  1498. {
  1499. ch = *iter;
  1500. iter++;
  1501. }
  1502. if (ch > *maxchar) {
  1503. *maxchar = ch;
  1504. if (*maxchar > MAX_UNICODE) {
  1505. PyErr_Format(PyExc_ValueError,
  1506. "character U+%x is not in range [U+0000; U+%x]",
  1507. ch, MAX_UNICODE);
  1508. return -1;
  1509. }
  1510. }
  1511. }
  1512. return 0;
  1513. }
  1514. static void
  1515. unicode_dealloc(PyObject *unicode)
  1516. {
  1517. #ifdef Py_DEBUG
  1518. if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
  1519. _Py_FatalRefcountError("deallocating an Unicode singleton");
  1520. }
  1521. #endif
  1522. if (_PyUnicode_STATE(unicode).statically_allocated) {
  1523. /* This should never get called, but we also don't want to SEGV if
  1524. * we accidentally decref an immortal string out of existence. Since
  1525. * the string is an immortal object, just re-set the reference count.
  1526. */
  1527. #ifdef Py_DEBUG
  1528. Py_UNREACHABLE();
  1529. #endif
  1530. _Py_SetImmortal(unicode);
  1531. return;
  1532. }
  1533. switch (_PyUnicode_STATE(unicode).interned) {
  1534. case SSTATE_NOT_INTERNED:
  1535. break;
  1536. case SSTATE_INTERNED_MORTAL:
  1537. /* Remove the object from the intern dict.
  1538. * Before doing so, we set the refcount to 3: the key and value
  1539. * in the interned_dict, plus one to work with.
  1540. */
  1541. assert(Py_REFCNT(unicode) == 0);
  1542. Py_SET_REFCNT(unicode, 3);
  1543. #ifdef Py_REF_DEBUG
  1544. /* let's be pedantic with the ref total */
  1545. _Py_IncRefTotal(_PyInterpreterState_GET());
  1546. _Py_IncRefTotal(_PyInterpreterState_GET());
  1547. _Py_IncRefTotal(_PyInterpreterState_GET());
  1548. #endif
  1549. PyInterpreterState *interp = _PyInterpreterState_GET();
  1550. PyObject *interned = get_interned_dict(interp);
  1551. assert(interned != NULL);
  1552. int r = PyDict_DelItem(interned, unicode);
  1553. if (r == -1) {
  1554. PyErr_WriteUnraisable(unicode);
  1555. // We don't know what happened to the string. It's probably
  1556. // best to leak it:
  1557. // - if it was not found, something is very wrong
  1558. // - if it was deleted, there are no more references to it
  1559. // so it can't cause trouble (except wasted memory)
  1560. // - if it wasn't deleted, it'll remain interned
  1561. _Py_SetImmortal(unicode);
  1562. _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
  1563. return;
  1564. }
  1565. // Only our work reference should be left; remove it too.
  1566. assert(Py_REFCNT(unicode) == 1);
  1567. Py_SET_REFCNT(unicode, 0);
  1568. #ifdef Py_REF_DEBUG
  1569. /* let's be pedantic with the ref total */
  1570. _Py_DecRefTotal(_PyInterpreterState_GET());
  1571. #endif
  1572. break;
  1573. default:
  1574. // As with `statically_allocated` above.
  1575. #ifdef Py_REF_DEBUG
  1576. Py_UNREACHABLE();
  1577. #endif
  1578. _Py_SetImmortal(unicode);
  1579. return;
  1580. }
  1581. if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
  1582. PyObject_Free(_PyUnicode_UTF8(unicode));
  1583. }
  1584. if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
  1585. PyObject_Free(_PyUnicode_DATA_ANY(unicode));
  1586. }
  1587. Py_TYPE(unicode)->tp_free(unicode);
  1588. }
  1589. #ifdef Py_DEBUG
  1590. static int
  1591. unicode_is_singleton(PyObject *unicode)
  1592. {
  1593. if (unicode == &_Py_STR(empty)) {
  1594. return 1;
  1595. }
  1596. PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
  1597. if (ascii->length == 1) {
  1598. Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
  1599. if (ch < 256 && LATIN1(ch) == unicode) {
  1600. return 1;
  1601. }
  1602. }
  1603. return 0;
  1604. }
  1605. #endif
  1606. static int
  1607. unicode_modifiable(PyObject *unicode)
  1608. {
  1609. assert(_PyUnicode_CHECK(unicode));
  1610. if (Py_REFCNT(unicode) != 1)
  1611. return 0;
  1612. if (_PyUnicode_HASH(unicode) != -1)
  1613. return 0;
  1614. if (PyUnicode_CHECK_INTERNED(unicode))
  1615. return 0;
  1616. if (!PyUnicode_CheckExact(unicode))
  1617. return 0;
  1618. #ifdef Py_DEBUG
  1619. /* singleton refcount is greater than 1 */
  1620. assert(!unicode_is_singleton(unicode));
  1621. #endif
  1622. return 1;
  1623. }
  1624. static int
  1625. unicode_resize(PyObject **p_unicode, Py_ssize_t length)
  1626. {
  1627. PyObject *unicode;
  1628. Py_ssize_t old_length;
  1629. assert(p_unicode != NULL);
  1630. unicode = *p_unicode;
  1631. assert(unicode != NULL);
  1632. assert(PyUnicode_Check(unicode));
  1633. assert(0 <= length);
  1634. old_length = PyUnicode_GET_LENGTH(unicode);
  1635. if (old_length == length)
  1636. return 0;
  1637. if (length == 0) {
  1638. PyObject *empty = unicode_new_empty();
  1639. Py_SETREF(*p_unicode, empty);
  1640. return 0;
  1641. }
  1642. if (!unicode_modifiable(unicode)) {
  1643. PyObject *copy = resize_copy(unicode, length);
  1644. if (copy == NULL)
  1645. return -1;
  1646. Py_SETREF(*p_unicode, copy);
  1647. return 0;
  1648. }
  1649. if (PyUnicode_IS_COMPACT(unicode)) {
  1650. PyObject *new_unicode = resize_compact(unicode, length);
  1651. if (new_unicode == NULL)
  1652. return -1;
  1653. *p_unicode = new_unicode;
  1654. return 0;
  1655. }
  1656. return resize_inplace(unicode, length);
  1657. }
  1658. int
  1659. PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
  1660. {
  1661. PyObject *unicode;
  1662. if (p_unicode == NULL) {
  1663. PyErr_BadInternalCall();
  1664. return -1;
  1665. }
  1666. unicode = *p_unicode;
  1667. if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
  1668. {
  1669. PyErr_BadInternalCall();
  1670. return -1;
  1671. }
  1672. return unicode_resize(p_unicode, length);
  1673. }
  1674. /* Copy an ASCII or latin1 char* string into a Python Unicode string.
  1675. WARNING: The function doesn't copy the terminating null character and
  1676. doesn't check the maximum character (may write a latin1 character in an
  1677. ASCII string). */
  1678. static void
  1679. unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
  1680. const char *str, Py_ssize_t len)
  1681. {
  1682. int kind = PyUnicode_KIND(unicode);
  1683. const void *data = PyUnicode_DATA(unicode);
  1684. const char *end = str + len;
  1685. assert(index + len <= PyUnicode_GET_LENGTH(unicode));
  1686. switch (kind) {
  1687. case PyUnicode_1BYTE_KIND: {
  1688. #ifdef Py_DEBUG
  1689. if (PyUnicode_IS_ASCII(unicode)) {
  1690. Py_UCS4 maxchar = ucs1lib_find_max_char(
  1691. (const Py_UCS1*)str,
  1692. (const Py_UCS1*)str + len);
  1693. assert(maxchar < 128);
  1694. }
  1695. #endif
  1696. memcpy((char *) data + index, str, len);
  1697. break;
  1698. }
  1699. case PyUnicode_2BYTE_KIND: {
  1700. Py_UCS2 *start = (Py_UCS2 *)data + index;
  1701. Py_UCS2 *ucs2 = start;
  1702. for (; str < end; ++ucs2, ++str)
  1703. *ucs2 = (Py_UCS2)*str;
  1704. assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
  1705. break;
  1706. }
  1707. case PyUnicode_4BYTE_KIND: {
  1708. Py_UCS4 *start = (Py_UCS4 *)data + index;
  1709. Py_UCS4 *ucs4 = start;
  1710. for (; str < end; ++ucs4, ++str)
  1711. *ucs4 = (Py_UCS4)*str;
  1712. assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
  1713. break;
  1714. }
  1715. default:
  1716. Py_UNREACHABLE();
  1717. }
  1718. }
  1719. static PyObject*
  1720. get_latin1_char(Py_UCS1 ch)
  1721. {
  1722. PyObject *o = LATIN1(ch);
  1723. return o;
  1724. }
  1725. static PyObject*
  1726. unicode_char(Py_UCS4 ch)
  1727. {
  1728. PyObject *unicode;
  1729. assert(ch <= MAX_UNICODE);
  1730. if (ch < 256) {
  1731. return get_latin1_char(ch);
  1732. }
  1733. unicode = PyUnicode_New(1, ch);
  1734. if (unicode == NULL)
  1735. return NULL;
  1736. assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
  1737. if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
  1738. PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
  1739. } else {
  1740. assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
  1741. PyUnicode_4BYTE_DATA(unicode)[0] = ch;
  1742. }
  1743. assert(_PyUnicode_CheckConsistency(unicode, 1));
  1744. return unicode;
  1745. }
  1746. PyObject *
  1747. PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
  1748. {
  1749. PyObject *unicode;
  1750. Py_UCS4 maxchar = 0;
  1751. Py_ssize_t num_surrogates;
  1752. if (u == NULL && size != 0) {
  1753. PyErr_BadInternalCall();
  1754. return NULL;
  1755. }
  1756. if (size == -1) {
  1757. size = wcslen(u);
  1758. }
  1759. /* If the Unicode data is known at construction time, we can apply
  1760. some optimizations which share commonly used objects. */
  1761. /* Optimization for empty strings */
  1762. if (size == 0)
  1763. _Py_RETURN_UNICODE_EMPTY();
  1764. #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
  1765. /* Oracle Solaris uses non-Unicode internal wchar_t form for
  1766. non-Unicode locales and hence needs conversion to UCS-4 first. */
  1767. if (_Py_LocaleUsesNonUnicodeWchar()) {
  1768. wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
  1769. if (!converted) {
  1770. return NULL;
  1771. }
  1772. PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
  1773. PyMem_Free(converted);
  1774. return unicode;
  1775. }
  1776. #endif
  1777. /* Single character Unicode objects in the Latin-1 range are
  1778. shared when using this constructor */
  1779. if (size == 1 && (Py_UCS4)*u < 256)
  1780. return get_latin1_char((unsigned char)*u);
  1781. /* If not empty and not single character, copy the Unicode data
  1782. into the new object */
  1783. if (find_maxchar_surrogates(u, u + size,
  1784. &maxchar, &num_surrogates) == -1)
  1785. return NULL;
  1786. unicode = PyUnicode_New(size - num_surrogates, maxchar);
  1787. if (!unicode)
  1788. return NULL;
  1789. switch (PyUnicode_KIND(unicode)) {
  1790. case PyUnicode_1BYTE_KIND:
  1791. _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
  1792. u, u + size, PyUnicode_1BYTE_DATA(unicode));
  1793. break;
  1794. case PyUnicode_2BYTE_KIND:
  1795. #if Py_UNICODE_SIZE == 2
  1796. memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
  1797. #else
  1798. _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
  1799. u, u + size, PyUnicode_2BYTE_DATA(unicode));
  1800. #endif
  1801. break;
  1802. case PyUnicode_4BYTE_KIND:
  1803. #if SIZEOF_WCHAR_T == 2
  1804. /* This is the only case which has to process surrogates, thus
  1805. a simple copy loop is not enough and we need a function. */
  1806. unicode_convert_wchar_to_ucs4(u, u + size, unicode);
  1807. #else
  1808. assert(num_surrogates == 0);
  1809. memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
  1810. #endif
  1811. break;
  1812. default:
  1813. Py_UNREACHABLE();
  1814. }
  1815. return unicode_result(unicode);
  1816. }
  1817. PyObject *
  1818. PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
  1819. {
  1820. if (size < 0) {
  1821. PyErr_SetString(PyExc_SystemError,
  1822. "Negative size passed to PyUnicode_FromStringAndSize");
  1823. return NULL;
  1824. }
  1825. if (u != NULL) {
  1826. return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
  1827. }
  1828. if (size > 0) {
  1829. PyErr_SetString(PyExc_SystemError,
  1830. "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
  1831. return NULL;
  1832. }
  1833. return unicode_new_empty();
  1834. }
  1835. PyObject *
  1836. PyUnicode_FromString(const char *u)
  1837. {
  1838. #if defined(__has_feature)
  1839. # if __has_feature(memory_sanitizer)
  1840. __msan_unpoison_string(u);
  1841. # endif
  1842. #endif
  1843. size_t size = strlen(u);
  1844. if (size > PY_SSIZE_T_MAX) {
  1845. PyErr_SetString(PyExc_OverflowError, "input too long");
  1846. return NULL;
  1847. }
  1848. return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
  1849. }
  1850. PyObject *
  1851. _PyUnicode_FromId(_Py_Identifier *id)
  1852. {
  1853. PyInterpreterState *interp = _PyInterpreterState_GET();
  1854. struct _Py_unicode_ids *ids = &interp->unicode.ids;
  1855. Py_ssize_t index = _Py_atomic_size_get(&id->index);
  1856. if (index < 0) {
  1857. struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
  1858. PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
  1859. // Check again to detect concurrent access. Another thread can have
  1860. // initialized the index while this thread waited for the lock.
  1861. index = _Py_atomic_size_get(&id->index);
  1862. if (index < 0) {
  1863. assert(rt_ids->next_index < PY_SSIZE_T_MAX);
  1864. index = rt_ids->next_index;
  1865. rt_ids->next_index++;
  1866. _Py_atomic_size_set(&id->index, index);
  1867. }
  1868. PyThread_release_lock(rt_ids->lock);
  1869. }
  1870. assert(index >= 0);
  1871. PyObject *obj;
  1872. if (index < ids->size) {
  1873. obj = ids->array[index];
  1874. if (obj) {
  1875. // Return a borrowed reference
  1876. return obj;
  1877. }
  1878. }
  1879. obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
  1880. NULL, NULL);
  1881. if (!obj) {
  1882. return NULL;
  1883. }
  1884. _PyUnicode_InternImmortal(interp, &obj);
  1885. if (index >= ids->size) {
  1886. // Overallocate to reduce the number of realloc
  1887. Py_ssize_t new_size = Py_MAX(index * 2, 16);
  1888. Py_ssize_t item_size = sizeof(ids->array[0]);
  1889. PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
  1890. if (new_array == NULL) {
  1891. PyErr_NoMemory();
  1892. return NULL;
  1893. }
  1894. memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
  1895. ids->array = new_array;
  1896. ids->size = new_size;
  1897. }
  1898. // The array stores a strong reference
  1899. ids->array[index] = obj;
  1900. // Return a borrowed reference
  1901. return obj;
  1902. }
  1903. static void
  1904. unicode_clear_identifiers(struct _Py_unicode_state *state)
  1905. {
  1906. struct _Py_unicode_ids *ids = &state->ids;
  1907. for (Py_ssize_t i=0; i < ids->size; i++) {
  1908. Py_XDECREF(ids->array[i]);
  1909. }
  1910. ids->size = 0;
  1911. PyMem_Free(ids->array);
  1912. ids->array = NULL;
  1913. // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
  1914. // after Py_Finalize().
  1915. }
  1916. /* Internal function, doesn't check maximum character */
  1917. PyObject*
  1918. _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
  1919. {
  1920. const unsigned char *s = (const unsigned char *)buffer;
  1921. PyObject *unicode;
  1922. if (size == 1) {
  1923. #ifdef Py_DEBUG
  1924. assert((unsigned char)s[0] < 128);
  1925. #endif
  1926. return get_latin1_char(s[0]);
  1927. }
  1928. unicode = PyUnicode_New(size, 127);
  1929. if (!unicode)
  1930. return NULL;
  1931. memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
  1932. assert(_PyUnicode_CheckConsistency(unicode, 1));
  1933. return unicode;
  1934. }
  1935. static Py_UCS4
  1936. kind_maxchar_limit(int kind)
  1937. {
  1938. switch (kind) {
  1939. case PyUnicode_1BYTE_KIND:
  1940. return 0x80;
  1941. case PyUnicode_2BYTE_KIND:
  1942. return 0x100;
  1943. case PyUnicode_4BYTE_KIND:
  1944. return 0x10000;
  1945. default:
  1946. Py_UNREACHABLE();
  1947. }
  1948. }
  1949. static PyObject*
  1950. _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
  1951. {
  1952. PyObject *res;
  1953. unsigned char max_char;
  1954. if (size == 0) {
  1955. _Py_RETURN_UNICODE_EMPTY();
  1956. }
  1957. assert(size > 0);
  1958. if (size == 1) {
  1959. return get_latin1_char(u[0]);
  1960. }
  1961. max_char = ucs1lib_find_max_char(u, u + size);
  1962. res = PyUnicode_New(size, max_char);
  1963. if (!res)
  1964. return NULL;
  1965. memcpy(PyUnicode_1BYTE_DATA(res), u, size);
  1966. assert(_PyUnicode_CheckConsistency(res, 1));
  1967. return res;
  1968. }
  1969. static PyObject*
  1970. _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
  1971. {
  1972. PyObject *res;
  1973. Py_UCS2 max_char;
  1974. if (size == 0)
  1975. _Py_RETURN_UNICODE_EMPTY();
  1976. assert(size > 0);
  1977. if (size == 1)
  1978. return unicode_char(u[0]);
  1979. max_char = ucs2lib_find_max_char(u, u + size);
  1980. res = PyUnicode_New(size, max_char);
  1981. if (!res)
  1982. return NULL;
  1983. if (max_char >= 256)
  1984. memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
  1985. else {
  1986. _PyUnicode_CONVERT_BYTES(
  1987. Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
  1988. }
  1989. assert(_PyUnicode_CheckConsistency(res, 1));
  1990. return res;
  1991. }
  1992. static PyObject*
  1993. _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
  1994. {
  1995. PyObject *res;
  1996. Py_UCS4 max_char;
  1997. if (size == 0)
  1998. _Py_RETURN_UNICODE_EMPTY();
  1999. assert(size > 0);
  2000. if (size == 1)
  2001. return unicode_char(u[0]);
  2002. max_char = ucs4lib_find_max_char(u, u + size);
  2003. res = PyUnicode_New(size, max_char);
  2004. if (!res)
  2005. return NULL;
  2006. if (max_char < 256)
  2007. _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
  2008. PyUnicode_1BYTE_DATA(res));
  2009. else if (max_char < 0x10000)
  2010. _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
  2011. PyUnicode_2BYTE_DATA(res));
  2012. else
  2013. memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
  2014. assert(_PyUnicode_CheckConsistency(res, 1));
  2015. return res;
  2016. }
  2017. PyObject*
  2018. PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
  2019. {
  2020. if (size < 0) {
  2021. PyErr_SetString(PyExc_ValueError, "size must be positive");
  2022. return NULL;
  2023. }
  2024. switch (kind) {
  2025. case PyUnicode_1BYTE_KIND:
  2026. return _PyUnicode_FromUCS1(buffer, size);
  2027. case PyUnicode_2BYTE_KIND:
  2028. return _PyUnicode_FromUCS2(buffer, size);
  2029. case PyUnicode_4BYTE_KIND:
  2030. return _PyUnicode_FromUCS4(buffer, size);
  2031. default:
  2032. PyErr_SetString(PyExc_SystemError, "invalid kind");
  2033. return NULL;
  2034. }
  2035. }
  2036. Py_UCS4
  2037. _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
  2038. {
  2039. int kind;
  2040. const void *startptr, *endptr;
  2041. assert(0 <= start);
  2042. assert(end <= PyUnicode_GET_LENGTH(unicode));
  2043. assert(start <= end);
  2044. if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
  2045. return PyUnicode_MAX_CHAR_VALUE(unicode);
  2046. if (start == end)
  2047. return 127;
  2048. if (PyUnicode_IS_ASCII(unicode))
  2049. return 127;
  2050. kind = PyUnicode_KIND(unicode);
  2051. startptr = PyUnicode_DATA(unicode);
  2052. endptr = (char *)startptr + end * kind;
  2053. startptr = (char *)startptr + start * kind;
  2054. switch(kind) {
  2055. case PyUnicode_1BYTE_KIND:
  2056. return ucs1lib_find_max_char(startptr, endptr);
  2057. case PyUnicode_2BYTE_KIND:
  2058. return ucs2lib_find_max_char(startptr, endptr);
  2059. case PyUnicode_4BYTE_KIND:
  2060. return ucs4lib_find_max_char(startptr, endptr);
  2061. default:
  2062. Py_UNREACHABLE();
  2063. }
  2064. }
  2065. /* Ensure that a string uses the most efficient storage, if it is not the
  2066. case: create a new string with of the right kind. Write NULL into *p_unicode
  2067. on error. */
  2068. static void
  2069. unicode_adjust_maxchar(PyObject **p_unicode)
  2070. {
  2071. PyObject *unicode, *copy;
  2072. Py_UCS4 max_char;
  2073. Py_ssize_t len;
  2074. int kind;
  2075. assert(p_unicode != NULL);
  2076. unicode = *p_unicode;
  2077. if (PyUnicode_IS_ASCII(unicode))
  2078. return;
  2079. len = PyUnicode_GET_LENGTH(unicode);
  2080. kind = PyUnicode_KIND(unicode);
  2081. if (kind == PyUnicode_1BYTE_KIND) {
  2082. const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
  2083. max_char = ucs1lib_find_max_char(u, u + len);
  2084. if (max_char >= 128)
  2085. return;
  2086. }
  2087. else if (kind == PyUnicode_2BYTE_KIND) {
  2088. const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
  2089. max_char = ucs2lib_find_max_char(u, u + len);
  2090. if (max_char >= 256)
  2091. return;
  2092. }
  2093. else if (kind == PyUnicode_4BYTE_KIND) {
  2094. const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
  2095. max_char = ucs4lib_find_max_char(u, u + len);
  2096. if (max_char >= 0x10000)
  2097. return;
  2098. }
  2099. else
  2100. Py_UNREACHABLE();
  2101. copy = PyUnicode_New(len, max_char);
  2102. if (copy != NULL)
  2103. _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
  2104. Py_DECREF(unicode);
  2105. *p_unicode = copy;
  2106. }
  2107. PyObject*
  2108. _PyUnicode_Copy(PyObject *unicode)
  2109. {
  2110. Py_ssize_t length;
  2111. PyObject *copy;
  2112. if (!PyUnicode_Check(unicode)) {
  2113. PyErr_BadInternalCall();
  2114. return NULL;
  2115. }
  2116. length = PyUnicode_GET_LENGTH(unicode);
  2117. copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
  2118. if (!copy)
  2119. return NULL;
  2120. assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
  2121. memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
  2122. length * PyUnicode_KIND(unicode));
  2123. assert(_PyUnicode_CheckConsistency(copy, 1));
  2124. return copy;
  2125. }
  2126. /* Widen Unicode objects to larger buffers. Don't write terminating null
  2127. character. Return NULL on error. */
  2128. static void*
  2129. unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
  2130. {
  2131. void *result;
  2132. assert(skind < kind);
  2133. switch (kind) {
  2134. case PyUnicode_2BYTE_KIND:
  2135. result = PyMem_New(Py_UCS2, len);
  2136. if (!result)
  2137. return PyErr_NoMemory();
  2138. assert(skind == PyUnicode_1BYTE_KIND);
  2139. _PyUnicode_CONVERT_BYTES(
  2140. Py_UCS1, Py_UCS2,
  2141. (const Py_UCS1 *)data,
  2142. ((const Py_UCS1 *)data) + len,
  2143. result);
  2144. return result;
  2145. case PyUnicode_4BYTE_KIND:
  2146. result = PyMem_New(Py_UCS4, len);
  2147. if (!result)
  2148. return PyErr_NoMemory();
  2149. if (skind == PyUnicode_2BYTE_KIND) {
  2150. _PyUnicode_CONVERT_BYTES(
  2151. Py_UCS2, Py_UCS4,
  2152. (const Py_UCS2 *)data,
  2153. ((const Py_UCS2 *)data) + len,
  2154. result);
  2155. }
  2156. else {
  2157. assert(skind == PyUnicode_1BYTE_KIND);
  2158. _PyUnicode_CONVERT_BYTES(
  2159. Py_UCS1, Py_UCS4,
  2160. (const Py_UCS1 *)data,
  2161. ((const Py_UCS1 *)data) + len,
  2162. result);
  2163. }
  2164. return result;
  2165. default:
  2166. Py_UNREACHABLE();
  2167. return NULL;
  2168. }
  2169. }
  2170. static Py_UCS4*
  2171. as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
  2172. int copy_null)
  2173. {
  2174. int kind;
  2175. const void *data;
  2176. Py_ssize_t len, targetlen;
  2177. kind = PyUnicode_KIND(string);
  2178. data = PyUnicode_DATA(string);
  2179. len = PyUnicode_GET_LENGTH(string);
  2180. targetlen = len;
  2181. if (copy_null)
  2182. targetlen++;
  2183. if (!target) {
  2184. target = PyMem_New(Py_UCS4, targetlen);
  2185. if (!target) {
  2186. PyErr_NoMemory();
  2187. return NULL;
  2188. }
  2189. }
  2190. else {
  2191. if (targetsize < targetlen) {
  2192. PyErr_Format(PyExc_SystemError,
  2193. "string is longer than the buffer");
  2194. if (copy_null && 0 < targetsize)
  2195. target[0] = 0;
  2196. return NULL;
  2197. }
  2198. }
  2199. if (kind == PyUnicode_1BYTE_KIND) {
  2200. const Py_UCS1 *start = (const Py_UCS1 *) data;
  2201. _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
  2202. }
  2203. else if (kind == PyUnicode_2BYTE_KIND) {
  2204. const Py_UCS2 *start = (const Py_UCS2 *) data;
  2205. _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
  2206. }
  2207. else if (kind == PyUnicode_4BYTE_KIND) {
  2208. memcpy(target, data, len * sizeof(Py_UCS4));
  2209. }
  2210. else {
  2211. Py_UNREACHABLE();
  2212. }
  2213. if (copy_null)
  2214. target[len] = 0;
  2215. return target;
  2216. }
  2217. Py_UCS4*
  2218. PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
  2219. int copy_null)
  2220. {
  2221. if (target == NULL || targetsize < 0) {
  2222. PyErr_BadInternalCall();
  2223. return NULL;
  2224. }
  2225. return as_ucs4(string, target, targetsize, copy_null);
  2226. }
  2227. Py_UCS4*
  2228. PyUnicode_AsUCS4Copy(PyObject *string)
  2229. {
  2230. return as_ucs4(string, NULL, 0, 1);
  2231. }
/* Maximum number of characters required for the output of %jo, %jd or %p.
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits (octal is the
   longest representation: 3 bits per digit),
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminal NUL.

   In the expression below, (bits-1)/3 is one less than the octal digit
   count, so the constant 5 is that missing digit plus the sign, the two
   prefix characters and the NUL. */
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
  2237. static int
  2238. unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
  2239. Py_ssize_t width, Py_ssize_t precision, int flags)
  2240. {
  2241. Py_ssize_t length, fill, arglen;
  2242. Py_UCS4 maxchar;
  2243. length = PyUnicode_GET_LENGTH(str);
  2244. if ((precision == -1 || precision >= length)
  2245. && width <= length)
  2246. return _PyUnicodeWriter_WriteStr(writer, str);
  2247. if (precision != -1)
  2248. length = Py_MIN(precision, length);
  2249. arglen = Py_MAX(length, width);
  2250. if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
  2251. maxchar = _PyUnicode_FindMaxChar(str, 0, length);
  2252. else
  2253. maxchar = writer->maxchar;
  2254. if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
  2255. return -1;
  2256. fill = Py_MAX(width - length, 0);
  2257. if (fill && !(flags & F_LJUST)) {
  2258. if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
  2259. return -1;
  2260. writer->pos += fill;
  2261. }
  2262. _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
  2263. str, 0, length);
  2264. writer->pos += length;
  2265. if (fill && (flags & F_LJUST)) {
  2266. if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
  2267. return -1;
  2268. writer->pos += fill;
  2269. }
  2270. return 0;
  2271. }
  2272. static int
  2273. unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
  2274. Py_ssize_t width, Py_ssize_t precision, int flags)
  2275. {
  2276. /* UTF-8 */
  2277. Py_ssize_t length;
  2278. PyObject *unicode;
  2279. int res;
  2280. if (precision == -1) {
  2281. length = strlen(str);
  2282. }
  2283. else {
  2284. length = 0;
  2285. while (length < precision && str[length]) {
  2286. length++;
  2287. }
  2288. }
  2289. unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
  2290. if (unicode == NULL)
  2291. return -1;
  2292. res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
  2293. Py_DECREF(unicode);
  2294. return res;
  2295. }
  2296. static int
  2297. unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
  2298. Py_ssize_t width, Py_ssize_t precision, int flags)
  2299. {
  2300. /* UTF-8 */
  2301. Py_ssize_t length;
  2302. PyObject *unicode;
  2303. int res;
  2304. if (precision == -1) {
  2305. length = wcslen(str);
  2306. }
  2307. else {
  2308. length = 0;
  2309. while (length < precision && str[length]) {
  2310. length++;
  2311. }
  2312. }
  2313. unicode = PyUnicode_FromWideChar(str, length);
  2314. if (unicode == NULL)
  2315. return -1;
  2316. res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
  2317. Py_DECREF(unicode);
  2318. return res;
  2319. }
/* Length-modifier codes stored in `sizemod` by unicode_fromformat_arg():
   "l", "ll", "z", "t" and "j" respectively.  0 means no modifier.  Each
   code is also the index of the matching entry in the tables below. */
#define F_LONG 1
#define F_LONGLONG 2
#define F_SIZE 3
#define F_PTRDIFF 4
#define F_INTMAX 5

/* C printf format strings for the integer conversions, one table per
   conversion character, indexed by length-modifier code (index 0 is the
   plain int form).  `formats` is used for both %d and %i. */
static const char * const formats[] = {"%d", "%ld", "%lld", "%zd", "%td", "%jd"};
static const char * const formats_o[] = {"%o", "%lo", "%llo", "%zo", "%to", "%jo"};
static const char * const formats_u[] = {"%u", "%lu", "%llu", "%zu", "%tu", "%ju"};
static const char * const formats_x[] = {"%x", "%lx", "%llx", "%zx", "%tx", "%jx"};
static const char * const formats_X[] = {"%X", "%lX", "%llX", "%zX", "%tX", "%jX"};
/* Process one conversion specification of a PyUnicode_FromFormatV() format
   string and append the formatted text to `writer`.

   f      points at the '%' that introduces the specification.
   vargs  points to the argument list; the arguments needed by the
          conversion (and by a '*' width or precision) are consumed from it.

   The specification is parsed in this order: flags ('-' and '0'), width,
   '.' precision, length modifier (l, ll, z, t, j), conversion character.

   Returns a pointer to the first character after the specification, or
   NULL on error.  An unsupported specification raises SystemError. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int flags = 0;
    Py_ssize_t width;
    Py_ssize_t precision;

    /* Remember where the specification starts, for the error message. */
    p = f;
    f++;
    /* "%%" writes a literal percent sign. */
    if (*f == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        f++;
        return f;
    }

    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
    /* Flags '+', ' ' and '#' are not particularly useful.
     * They are not worth the implementation and maintenance costs.
     * In addition, '#' should add "0" for "o" conversions for compatibility
     * with printf, but it would confuse Python users. */
    while (1) {
        switch (*f++) {
        case '-': flags |= F_LJUST; continue;
        case '0': flags |= F_ZERO; continue;
        }
        /* Not a flag character: step back onto it and stop. */
        f--;
        break;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (*f == '*') {
        /* Width taken from the argument list; a negative value means
           left-justify with the absolute value as width. */
        width = va_arg(*vargs, int);
        if (width < 0) {
            flags |= F_LJUST;
            width = -width;
        }
        f++;
    }
    else if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Overflow check before accumulating the next digit. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (*f == '*') {
            /* Precision taken from the argument list; every negative
               value is collapsed to -2. */
            precision = va_arg(*vargs, int);
            if (precision < 0) {
                precision = -2;
            }
            f++;
        }
        else if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Overflow check before accumulating the next digit. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
    }

    /* Parse the length modifier; 0 means none. */
    int sizemod = 0;
    if (*f == 'l') {
        if (f[1] == 'l') {
            sizemod = F_LONGLONG;
            f += 2;
        }
        else {
            sizemod = F_LONG;
            ++f;
        }
    }
    else if (*f == 'z') {
        sizemod = F_SIZE;
        ++f;
    }
    else if (*f == 't') {
        sizemod = F_PTRDIFF;
        ++f;
    }
    else if (*f == 'j') {
        sizemod = F_INTMAX;
        ++f;
    }
    /* The conversion character is the last character of the format
       string: stop overallocating the writer buffer. */
    if (f[0] != '\0' && f[1] == '\0')
        writer->overallocate = 0;

    /* Validate the combination of conversion, length modifier, width and
       precision before consuming any conversion argument. */
    switch (*f) {
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
        break;
    case 'c': case 'p':
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
        break;
    case 's':
    case 'V':
        /* Only the "l" modifier (wide string) is accepted here. */
        if (sizemod && sizemod != F_LONG) goto invalid_format;
        break;
    default:
        if (sizemod) goto invalid_format;
        break;
    }

    switch (*f) {
    case 'c':
    {
        /* Single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'd': case 'i':
    case 'o': case 'u': case 'x': case 'X':
    {
        /* used by sprintf */
        char buffer[MAX_INTMAX_CHARS];
        const char *fmt = NULL;
        /* Pick the C format string for this conversion and modifier. */
        switch (*f) {
            case 'o': fmt = formats_o[sizemod]; break;
            case 'u': fmt = formats_u[sizemod]; break;
            case 'x': fmt = formats_x[sizemod]; break;
            case 'X': fmt = formats_X[sizemod]; break;
            default: fmt = formats[sizemod]; break;
        }
        int issigned = (*f == 'd' || *f == 'i');
        /* Fetch the argument with the type matching the modifier and
           format the bare number; padding is applied afterwards. */
        switch (sizemod) {
            case F_LONG:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, long)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long));
                break;
            case F_LONGLONG:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, long long)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned long long));
                break;
            case F_SIZE:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, Py_ssize_t)) :
                    sprintf(buffer, fmt, va_arg(*vargs, size_t));
                break;
            case F_PTRDIFF:
                len = sprintf(buffer, fmt, va_arg(*vargs, ptrdiff_t));
                break;
            case F_INTMAX:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, intmax_t)) :
                    sprintf(buffer, fmt, va_arg(*vargs, uintmax_t));
                break;
            default:
                len = issigned ?
                    sprintf(buffer, fmt, va_arg(*vargs, int)) :
                    sprintf(buffer, fmt, va_arg(*vargs, unsigned int));
                break;
        }
        assert(len >= 0);

        /* From here on `len` counts digits only; the sign is written
           separately so zero padding can go between sign and digits. */
        int sign = (buffer[0] == '-');
        len -= sign;

        /* precision = minimum number of digits, width = minimum total
           field size including the sign. */
        precision = Py_MAX(precision, len);
        width = Py_MAX(width, precision + sign);
        /* The '0' flag pads with zeros up to the width, unless the field
           is left-justified. */
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
            precision = width - sign;
        }

        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);

        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
            return NULL;

        /* Output order: [spaces] [sign] [zeros] digits [spaces]. */
        if (spacepad && !(flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }

        if (sign) {
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
                return NULL;
        }

        if (zeropad) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
                return NULL;
            writer->pos += zeropad;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
            return NULL;

        if (spacepad && (flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }
        break;
    }

    case 'p':
    {
        char number[MAX_INTMAX_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (including its NUL) right
               by two and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        if (sizemod) {
            /* "%ls": wide character string. */
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        else {
            /* UTF-8 */
            const char *s = va_arg(*vargs, const char*);
            if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'U':
    {
        /* String object argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments: a string object that may be NULL, followed by a
           C string (wide with the "l" modifier) used when it is NULL.
           Both are always consumed. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str;
        const wchar_t *wstr;
        if (sizemod) {
            wstr = va_arg(*vargs, const wchar_t*);
        }
        else {
            str = va_arg(*vargs, const char *);
        }
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
                return NULL;
        }
        else if (sizemod) {
            assert(wstr != NULL);
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    default:
    invalid_format:
        /* `p` still points at the '%' of the offending specification. */
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
        return NULL;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
  2661. PyObject *
  2662. PyUnicode_FromFormatV(const char *format, va_list vargs)
  2663. {
  2664. va_list vargs2;
  2665. const char *f;
  2666. _PyUnicodeWriter writer;
  2667. _PyUnicodeWriter_Init(&writer);
  2668. writer.min_length = strlen(format) + 100;
  2669. writer.overallocate = 1;
  2670. // Copy varags to be able to pass a reference to a subfunction.
  2671. va_copy(vargs2, vargs);
  2672. for (f = format; *f; ) {
  2673. if (*f == '%') {
  2674. f = unicode_fromformat_arg(&writer, f, &vargs2);
  2675. if (f == NULL)
  2676. goto fail;
  2677. }
  2678. else {
  2679. const char *p;
  2680. Py_ssize_t len;
  2681. p = f;
  2682. do
  2683. {
  2684. if ((unsigned char)*p > 127) {
  2685. PyErr_Format(PyExc_ValueError,
  2686. "PyUnicode_FromFormatV() expects an ASCII-encoded format "
  2687. "string, got a non-ASCII byte: 0x%02x",
  2688. (unsigned char)*p);
  2689. goto fail;
  2690. }
  2691. p++;
  2692. }
  2693. while (*p != '\0' && *p != '%');
  2694. len = p - f;
  2695. if (*p == '\0')
  2696. writer.overallocate = 0;
  2697. if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
  2698. goto fail;
  2699. f = p;
  2700. }
  2701. }
  2702. va_end(vargs2);
  2703. return _PyUnicodeWriter_Finish(&writer);
  2704. fail:
  2705. va_end(vargs2);
  2706. _PyUnicodeWriter_Dealloc(&writer);
  2707. return NULL;
  2708. }
  2709. PyObject *
  2710. PyUnicode_FromFormat(const char *format, ...)
  2711. {
  2712. PyObject* ret;
  2713. va_list vargs;
  2714. va_start(vargs, format);
  2715. ret = PyUnicode_FromFormatV(format, vargs);
  2716. va_end(vargs);
  2717. return ret;
  2718. }
  2719. static Py_ssize_t
  2720. unicode_get_widechar_size(PyObject *unicode)
  2721. {
  2722. Py_ssize_t res;
  2723. assert(unicode != NULL);
  2724. assert(_PyUnicode_CHECK(unicode));
  2725. res = _PyUnicode_LENGTH(unicode);
  2726. #if SIZEOF_WCHAR_T == 2
  2727. if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
  2728. const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
  2729. const Py_UCS4 *end = s + res;
  2730. for (; s < end; ++s) {
  2731. if (*s > 0xFFFF) {
  2732. ++res;
  2733. }
  2734. }
  2735. }
  2736. #endif
  2737. return res;
  2738. }
  2739. static void
  2740. unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
  2741. {
  2742. assert(unicode != NULL);
  2743. assert(_PyUnicode_CHECK(unicode));
  2744. if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
  2745. memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
  2746. return;
  2747. }
  2748. if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
  2749. const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
  2750. for (; size--; ++s, ++w) {
  2751. *w = *s;
  2752. }
  2753. }
  2754. else {
  2755. #if SIZEOF_WCHAR_T == 4
  2756. assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
  2757. const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
  2758. for (; size--; ++s, ++w) {
  2759. *w = *s;
  2760. }
  2761. #else
  2762. assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
  2763. const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
  2764. for (; size--; ++s, ++w) {
  2765. Py_UCS4 ch = *s;
  2766. if (ch > 0xFFFF) {
  2767. assert(ch <= MAX_UNICODE);
  2768. /* encode surrogate pair in this case */
  2769. *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
  2770. if (!size--)
  2771. break;
  2772. *w = Py_UNICODE_LOW_SURROGATE(ch);
  2773. }
  2774. else {
  2775. *w = ch;
  2776. }
  2777. }
  2778. #endif
  2779. }
  2780. }
  2781. #ifdef HAVE_WCHAR_H
  2782. /* Convert a Unicode object to a wide character string.
  2783. - If w is NULL: return the number of wide characters (including the null
  2784. character) required to convert the unicode object. Ignore size argument.
  2785. - Otherwise: return the number of wide characters (excluding the null
  2786. character) written into w. Write at most size wide characters (including
  2787. the null character). */
  2788. Py_ssize_t
  2789. PyUnicode_AsWideChar(PyObject *unicode,
  2790. wchar_t *w,
  2791. Py_ssize_t size)
  2792. {
  2793. Py_ssize_t res;
  2794. if (unicode == NULL) {
  2795. PyErr_BadInternalCall();
  2796. return -1;
  2797. }
  2798. if (!PyUnicode_Check(unicode)) {
  2799. PyErr_BadArgument();
  2800. return -1;
  2801. }
  2802. res = unicode_get_widechar_size(unicode);
  2803. if (w == NULL) {
  2804. return res + 1;
  2805. }
  2806. if (size > res) {
  2807. size = res + 1;
  2808. }
  2809. else {
  2810. res = size;
  2811. }
  2812. unicode_copy_as_widechar(unicode, w, size);
  2813. #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
  2814. /* Oracle Solaris uses non-Unicode internal wchar_t form for
  2815. non-Unicode locales and hence needs conversion first. */
  2816. if (_Py_LocaleUsesNonUnicodeWchar()) {
  2817. if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
  2818. return -1;
  2819. }
  2820. }
  2821. #endif
  2822. return res;
  2823. }
  2824. wchar_t*
  2825. PyUnicode_AsWideCharString(PyObject *unicode,
  2826. Py_ssize_t *size)
  2827. {
  2828. wchar_t *buffer;
  2829. Py_ssize_t buflen;
  2830. if (unicode == NULL) {
  2831. PyErr_BadInternalCall();
  2832. return NULL;
  2833. }
  2834. if (!PyUnicode_Check(unicode)) {
  2835. PyErr_BadArgument();
  2836. return NULL;
  2837. }
  2838. buflen = unicode_get_widechar_size(unicode);
  2839. buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
  2840. if (buffer == NULL) {
  2841. PyErr_NoMemory();
  2842. return NULL;
  2843. }
  2844. unicode_copy_as_widechar(unicode, buffer, buflen + 1);
  2845. #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
  2846. /* Oracle Solaris uses non-Unicode internal wchar_t form for
  2847. non-Unicode locales and hence needs conversion first. */
  2848. if (_Py_LocaleUsesNonUnicodeWchar()) {
  2849. if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
  2850. return NULL;
  2851. }
  2852. }
  2853. #endif
  2854. if (size != NULL) {
  2855. *size = buflen;
  2856. }
  2857. else if (wcslen(buffer) != (size_t)buflen) {
  2858. PyMem_Free(buffer);
  2859. PyErr_SetString(PyExc_ValueError,
  2860. "embedded null character");
  2861. return NULL;
  2862. }
  2863. return buffer;
  2864. }
  2865. #endif /* HAVE_WCHAR_H */
  2866. int
  2867. _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
  2868. {
  2869. wchar_t **p = (wchar_t **)ptr;
  2870. if (obj == NULL) {
  2871. PyMem_Free(*p);
  2872. *p = NULL;
  2873. return 1;
  2874. }
  2875. if (PyUnicode_Check(obj)) {
  2876. *p = PyUnicode_AsWideCharString(obj, NULL);
  2877. if (*p == NULL) {
  2878. return 0;
  2879. }
  2880. return Py_CLEANUP_SUPPORTED;
  2881. }
  2882. PyErr_Format(PyExc_TypeError,
  2883. "argument must be str, not %.50s",
  2884. Py_TYPE(obj)->tp_name);
  2885. return 0;
  2886. }
  2887. int
  2888. _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
  2889. {
  2890. wchar_t **p = (wchar_t **)ptr;
  2891. if (obj == NULL) {
  2892. PyMem_Free(*p);
  2893. *p = NULL;
  2894. return 1;
  2895. }
  2896. if (obj == Py_None) {
  2897. *p = NULL;
  2898. return 1;
  2899. }
  2900. if (PyUnicode_Check(obj)) {
  2901. *p = PyUnicode_AsWideCharString(obj, NULL);
  2902. if (*p == NULL) {
  2903. return 0;
  2904. }
  2905. return Py_CLEANUP_SUPPORTED;
  2906. }
  2907. PyErr_Format(PyExc_TypeError,
  2908. "argument must be str or None, not %.50s",
  2909. Py_TYPE(obj)->tp_name);
  2910. return 0;
  2911. }
  2912. PyObject *
  2913. PyUnicode_FromOrdinal(int ordinal)
  2914. {
  2915. if (ordinal < 0 || ordinal > MAX_UNICODE) {
  2916. PyErr_SetString(PyExc_ValueError,
  2917. "chr() arg not in range(0x110000)");
  2918. return NULL;
  2919. }
  2920. return unicode_char((Py_UCS4)ordinal);
  2921. }
  2922. PyObject *
  2923. PyUnicode_FromObject(PyObject *obj)
  2924. {
  2925. /* XXX Perhaps we should make this API an alias of
  2926. PyObject_Str() instead ?! */
  2927. if (PyUnicode_CheckExact(obj)) {
  2928. return Py_NewRef(obj);
  2929. }
  2930. if (PyUnicode_Check(obj)) {
  2931. /* For a Unicode subtype that's not a Unicode object,
  2932. return a true Unicode object with the same data. */
  2933. return _PyUnicode_Copy(obj);
  2934. }
  2935. PyErr_Format(PyExc_TypeError,
  2936. "Can't convert '%.100s' object to str implicitly",
  2937. Py_TYPE(obj)->tp_name);
  2938. return NULL;
  2939. }
  2940. PyObject *
  2941. PyUnicode_FromEncodedObject(PyObject *obj,
  2942. const char *encoding,
  2943. const char *errors)
  2944. {
  2945. Py_buffer buffer;
  2946. PyObject *v;
  2947. if (obj == NULL) {
  2948. PyErr_BadInternalCall();
  2949. return NULL;
  2950. }
  2951. /* Decoding bytes objects is the most common case and should be fast */
  2952. if (PyBytes_Check(obj)) {
  2953. if (PyBytes_GET_SIZE(obj) == 0) {
  2954. if (unicode_check_encoding_errors(encoding, errors) < 0) {
  2955. return NULL;
  2956. }
  2957. _Py_RETURN_UNICODE_EMPTY();
  2958. }
  2959. return PyUnicode_Decode(
  2960. PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
  2961. encoding, errors);
  2962. }
  2963. if (PyUnicode_Check(obj)) {
  2964. PyErr_SetString(PyExc_TypeError,
  2965. "decoding str is not supported");
  2966. return NULL;
  2967. }
  2968. /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
  2969. if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
  2970. PyErr_Format(PyExc_TypeError,
  2971. "decoding to str: need a bytes-like object, %.80s found",
  2972. Py_TYPE(obj)->tp_name);
  2973. return NULL;
  2974. }
  2975. if (buffer.len == 0) {
  2976. PyBuffer_Release(&buffer);
  2977. if (unicode_check_encoding_errors(encoding, errors) < 0) {
  2978. return NULL;
  2979. }
  2980. _Py_RETURN_UNICODE_EMPTY();
  2981. }
  2982. v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
  2983. PyBuffer_Release(&buffer);
  2984. return v;
  2985. }
  /* Normalize an encoding name into `lower` (capacity `lower_len` bytes,
     terminating null included): similar to encodings.normalize_encoding(),
     but the result is also lowercased.  Alphanumerics and '.' are kept; each
     run of other characters becomes a single '_', except at the start of the
     name, where it is dropped.

     Return 1 on success, or 0 if the normalized name does not fit (it would
     be longer than lower_len-1). */
  int
  _Py_normalize_encoding(const char *encoding,
                         char *lower,
                         size_t lower_len)
  {
      assert(encoding != NULL);

      char *out = lower;
      char *const out_end = &lower[lower_len - 1];
      int pending_sep = 0;

      for (const char *in = encoding; *in != '\0'; in++) {
          const char c = *in;

          if (!(Py_ISALNUM(c) || c == '.')) {
              /* Punctuation: remember it, emit at most one '_' later. */
              pending_sep = 1;
              continue;
          }

          if (pending_sep && out != lower) {
              if (out == out_end) {
                  return 0;
              }
              *out++ = '_';
          }
          pending_sep = 0;

          if (out == out_end) {
              return 0;
          }
          *out++ = Py_TOLOWER(c);
      }

      *out = '\0';
      return 1;
  }
  3029. PyObject *
  3030. PyUnicode_Decode(const char *s,
  3031. Py_ssize_t size,
  3032. const char *encoding,
  3033. const char *errors)
  3034. {
  3035. PyObject *buffer = NULL, *unicode;
  3036. Py_buffer info;
  3037. char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
  3038. if (unicode_check_encoding_errors(encoding, errors) < 0) {
  3039. return NULL;
  3040. }
  3041. if (size == 0) {
  3042. _Py_RETURN_UNICODE_EMPTY();
  3043. }
  3044. if (encoding == NULL) {
  3045. return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
  3046. }
  3047. /* Shortcuts for common default encodings */
  3048. if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
  3049. char *lower = buflower;
  3050. /* Fast paths */
  3051. if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
  3052. lower += 3;
  3053. if (*lower == '_') {
  3054. /* Match "utf8" and "utf_8" */
  3055. lower++;
  3056. }
  3057. if (lower[0] == '8' && lower[1] == 0) {
  3058. return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
  3059. }
  3060. else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
  3061. return PyUnicode_DecodeUTF16(s, size, errors, 0);
  3062. }
  3063. else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
  3064. return PyUnicode_DecodeUTF32(s, size, errors, 0);
  3065. }
  3066. }
  3067. else {
  3068. if (strcmp(lower, "ascii") == 0
  3069. || strcmp(lower, "us_ascii") == 0) {
  3070. return PyUnicode_DecodeASCII(s, size, errors);
  3071. }
  3072. #ifdef MS_WINDOWS
  3073. else if (strcmp(lower, "mbcs") == 0) {
  3074. return PyUnicode_DecodeMBCS(s, size, errors);
  3075. }
  3076. #endif
  3077. else if (strcmp(lower, "latin1") == 0
  3078. || strcmp(lower, "latin_1") == 0
  3079. || strcmp(lower, "iso_8859_1") == 0
  3080. || strcmp(lower, "iso8859_1") == 0) {
  3081. return PyUnicode_DecodeLatin1(s, size, errors);
  3082. }
  3083. }
  3084. }
  3085. /* Decode via the codec registry */
  3086. buffer = NULL;
  3087. if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
  3088. goto onError;
  3089. buffer = PyMemoryView_FromBuffer(&info);
  3090. if (buffer == NULL)
  3091. goto onError;
  3092. unicode = _PyCodec_DecodeText(buffer, encoding, errors);
  3093. if (unicode == NULL)
  3094. goto onError;
  3095. if (!PyUnicode_Check(unicode)) {
  3096. PyErr_Format(PyExc_TypeError,
  3097. "'%.400s' decoder returned '%.400s' instead of 'str'; "
  3098. "use codecs.decode() to decode to arbitrary types",
  3099. encoding,
  3100. Py_TYPE(unicode)->tp_name);
  3101. Py_DECREF(unicode);
  3102. goto onError;
  3103. }
  3104. Py_DECREF(buffer);
  3105. return unicode_result(unicode);
  3106. onError:
  3107. Py_XDECREF(buffer);
  3108. return NULL;
  3109. }
  3110. PyObject *
  3111. PyUnicode_AsDecodedObject(PyObject *unicode,
  3112. const char *encoding,
  3113. const char *errors)
  3114. {
  3115. if (!PyUnicode_Check(unicode)) {
  3116. PyErr_BadArgument();
  3117. return NULL;
  3118. }
  3119. if (PyErr_WarnEx(PyExc_DeprecationWarning,
  3120. "PyUnicode_AsDecodedObject() is deprecated; "
  3121. "use PyCodec_Decode() to decode from str", 1) < 0)
  3122. return NULL;
  3123. if (encoding == NULL)
  3124. encoding = PyUnicode_GetDefaultEncoding();
  3125. /* Decode via the codec registry */
  3126. return PyCodec_Decode(unicode, encoding, errors);
  3127. }
  3128. PyObject *
  3129. PyUnicode_AsDecodedUnicode(PyObject *unicode,
  3130. const char *encoding,
  3131. const char *errors)
  3132. {
  3133. PyObject *v;
  3134. if (!PyUnicode_Check(unicode)) {
  3135. PyErr_BadArgument();
  3136. goto onError;
  3137. }
  3138. if (PyErr_WarnEx(PyExc_DeprecationWarning,
  3139. "PyUnicode_AsDecodedUnicode() is deprecated; "
  3140. "use PyCodec_Decode() to decode from str to str", 1) < 0)
  3141. return NULL;
  3142. if (encoding == NULL)
  3143. encoding = PyUnicode_GetDefaultEncoding();
  3144. /* Decode via the codec registry */
  3145. v = PyCodec_Decode(unicode, encoding, errors);
  3146. if (v == NULL)
  3147. goto onError;
  3148. if (!PyUnicode_Check(v)) {
  3149. PyErr_Format(PyExc_TypeError,
  3150. "'%.400s' decoder returned '%.400s' instead of 'str'; "
  3151. "use codecs.decode() to decode to arbitrary types",
  3152. encoding,
  3153. Py_TYPE(unicode)->tp_name);
  3154. Py_DECREF(v);
  3155. goto onError;
  3156. }
  3157. return unicode_result(v);
  3158. onError:
  3159. return NULL;
  3160. }
  3161. PyObject *
  3162. PyUnicode_AsEncodedObject(PyObject *unicode,
  3163. const char *encoding,
  3164. const char *errors)
  3165. {
  3166. PyObject *v;
  3167. if (!PyUnicode_Check(unicode)) {
  3168. PyErr_BadArgument();
  3169. goto onError;
  3170. }
  3171. if (PyErr_WarnEx(PyExc_DeprecationWarning,
  3172. "PyUnicode_AsEncodedObject() is deprecated; "
  3173. "use PyUnicode_AsEncodedString() to encode from str to bytes "
  3174. "or PyCodec_Encode() for generic encoding", 1) < 0)
  3175. return NULL;
  3176. if (encoding == NULL)
  3177. encoding = PyUnicode_GetDefaultEncoding();
  3178. /* Encode via the codec registry */
  3179. v = PyCodec_Encode(unicode, encoding, errors);
  3180. if (v == NULL)
  3181. goto onError;
  3182. return v;
  3183. onError:
  3184. return NULL;
  3185. }
  3186. static PyObject *
  3187. unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
  3188. int current_locale)
  3189. {
  3190. Py_ssize_t wlen;
  3191. wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
  3192. if (wstr == NULL) {
  3193. return NULL;
  3194. }
  3195. if ((size_t)wlen != wcslen(wstr)) {
  3196. PyErr_SetString(PyExc_ValueError, "embedded null character");
  3197. PyMem_Free(wstr);
  3198. return NULL;
  3199. }
  3200. char *str;
  3201. size_t error_pos;
  3202. const char *reason;
  3203. int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
  3204. current_locale, error_handler);
  3205. PyMem_Free(wstr);
  3206. if (res != 0) {
  3207. if (res == -2) {
  3208. PyObject *exc;
  3209. exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
  3210. "locale", unicode,
  3211. (Py_ssize_t)error_pos,
  3212. (Py_ssize_t)(error_pos+1),
  3213. reason);
  3214. if (exc != NULL) {
  3215. PyCodec_StrictErrors(exc);
  3216. Py_DECREF(exc);
  3217. }
  3218. }
  3219. else if (res == -3) {
  3220. PyErr_SetString(PyExc_ValueError, "unsupported error handler");
  3221. }
  3222. else {
  3223. PyErr_NoMemory();
  3224. }
  3225. return NULL;
  3226. }
  3227. PyObject *bytes = PyBytes_FromString(str);
  3228. PyMem_RawFree(str);
  3229. return bytes;
  3230. }
  3231. PyObject *
  3232. PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
  3233. {
  3234. _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
  3235. return unicode_encode_locale(unicode, error_handler, 1);
  3236. }
  3237. PyObject *
  3238. PyUnicode_EncodeFSDefault(PyObject *unicode)
  3239. {
  3240. PyInterpreterState *interp = _PyInterpreterState_GET();
  3241. struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
  3242. if (fs_codec->utf8) {
  3243. return unicode_encode_utf8(unicode,
  3244. fs_codec->error_handler,
  3245. fs_codec->errors);
  3246. }
  3247. #ifndef _Py_FORCE_UTF8_FS_ENCODING
  3248. else if (fs_codec->encoding) {
  3249. return PyUnicode_AsEncodedString(unicode,
  3250. fs_codec->encoding,
  3251. fs_codec->errors);
  3252. }
  3253. #endif
  3254. else {
  3255. /* Before _PyUnicode_InitEncodings() is called, the Python codec
  3256. machinery is not ready and so cannot be used:
  3257. use wcstombs() in this case. */
  3258. const PyConfig *config = _PyInterpreterState_GetConfig(interp);
  3259. const wchar_t *filesystem_errors = config->filesystem_errors;
  3260. assert(filesystem_errors != NULL);
  3261. _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
  3262. assert(errors != _Py_ERROR_UNKNOWN);
  3263. #ifdef _Py_FORCE_UTF8_FS_ENCODING
  3264. return unicode_encode_utf8(unicode, errors, NULL);
  3265. #else
  3266. return unicode_encode_locale(unicode, errors, 0);
  3267. #endif
  3268. }
  3269. }
  3270. PyObject *
  3271. PyUnicode_AsEncodedString(PyObject *unicode,
  3272. const char *encoding,
  3273. const char *errors)
  3274. {
  3275. PyObject *v;
  3276. char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
  3277. if (!PyUnicode_Check(unicode)) {
  3278. PyErr_BadArgument();
  3279. return NULL;
  3280. }
  3281. if (unicode_check_encoding_errors(encoding, errors) < 0) {
  3282. return NULL;
  3283. }
  3284. if (encoding == NULL) {
  3285. return _PyUnicode_AsUTF8String(unicode, errors);
  3286. }
  3287. /* Shortcuts for common default encodings */
  3288. if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
  3289. char *lower = buflower;
  3290. /* Fast paths */
  3291. if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
  3292. lower += 3;
  3293. if (*lower == '_') {
  3294. /* Match "utf8" and "utf_8" */
  3295. lower++;
  3296. }
  3297. if (lower[0] == '8' && lower[1] == 0) {
  3298. return _PyUnicode_AsUTF8String(unicode, errors);
  3299. }
  3300. else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
  3301. return _PyUnicode_EncodeUTF16(unicode, errors, 0);
  3302. }
  3303. else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
  3304. return _PyUnicode_EncodeUTF32(unicode, errors, 0);
  3305. }
  3306. }
  3307. else {
  3308. if (strcmp(lower, "ascii") == 0
  3309. || strcmp(lower, "us_ascii") == 0) {
  3310. return _PyUnicode_AsASCIIString(unicode, errors);
  3311. }
  3312. #ifdef MS_WINDOWS
  3313. else if (strcmp(lower, "mbcs") == 0) {
  3314. return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
  3315. }
  3316. #endif
  3317. else if (strcmp(lower, "latin1") == 0 ||
  3318. strcmp(lower, "latin_1") == 0 ||
  3319. strcmp(lower, "iso_8859_1") == 0 ||
  3320. strcmp(lower, "iso8859_1") == 0) {
  3321. return _PyUnicode_AsLatin1String(unicode, errors);
  3322. }
  3323. }
  3324. }
  3325. /* Encode via the codec registry */
  3326. v = _PyCodec_EncodeText(unicode, encoding, errors);
  3327. if (v == NULL)
  3328. return NULL;
  3329. /* The normal path */
  3330. if (PyBytes_Check(v))
  3331. return v;
  3332. /* If the codec returns a buffer, raise a warning and convert to bytes */
  3333. if (PyByteArray_Check(v)) {
  3334. int error;
  3335. PyObject *b;
  3336. error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
  3337. "encoder %s returned bytearray instead of bytes; "
  3338. "use codecs.encode() to encode to arbitrary types",
  3339. encoding);
  3340. if (error) {
  3341. Py_DECREF(v);
  3342. return NULL;
  3343. }
  3344. b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
  3345. PyByteArray_GET_SIZE(v));
  3346. Py_DECREF(v);
  3347. return b;
  3348. }
  3349. PyErr_Format(PyExc_TypeError,
  3350. "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
  3351. "use codecs.encode() to encode to arbitrary types",
  3352. encoding,
  3353. Py_TYPE(v)->tp_name);
  3354. Py_DECREF(v);
  3355. return NULL;
  3356. }
  3357. PyObject *
  3358. PyUnicode_AsEncodedUnicode(PyObject *unicode,
  3359. const char *encoding,
  3360. const char *errors)
  3361. {
  3362. PyObject *v;
  3363. if (!PyUnicode_Check(unicode)) {
  3364. PyErr_BadArgument();
  3365. goto onError;
  3366. }
  3367. if (PyErr_WarnEx(PyExc_DeprecationWarning,
  3368. "PyUnicode_AsEncodedUnicode() is deprecated; "
  3369. "use PyCodec_Encode() to encode from str to str", 1) < 0)
  3370. return NULL;
  3371. if (encoding == NULL)
  3372. encoding = PyUnicode_GetDefaultEncoding();
  3373. /* Encode via the codec registry */
  3374. v = PyCodec_Encode(unicode, encoding, errors);
  3375. if (v == NULL)
  3376. goto onError;
  3377. if (!PyUnicode_Check(v)) {
  3378. PyErr_Format(PyExc_TypeError,
  3379. "'%.400s' encoder returned '%.400s' instead of 'str'; "
  3380. "use codecs.encode() to encode to arbitrary types",
  3381. encoding,
  3382. Py_TYPE(v)->tp_name);
  3383. Py_DECREF(v);
  3384. goto onError;
  3385. }
  3386. return v;
  3387. onError:
  3388. return NULL;
  3389. }
  3390. static PyObject*
  3391. unicode_decode_locale(const char *str, Py_ssize_t len,
  3392. _Py_error_handler errors, int current_locale)
  3393. {
  3394. if (str[len] != '\0' || (size_t)len != strlen(str)) {
  3395. PyErr_SetString(PyExc_ValueError, "embedded null byte");
  3396. return NULL;
  3397. }
  3398. wchar_t *wstr;
  3399. size_t wlen;
  3400. const char *reason;
  3401. int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
  3402. current_locale, errors);
  3403. if (res != 0) {
  3404. if (res == -2) {
  3405. PyObject *exc;
  3406. exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
  3407. "locale", str, len,
  3408. (Py_ssize_t)wlen,
  3409. (Py_ssize_t)(wlen + 1),
  3410. reason);
  3411. if (exc != NULL) {
  3412. PyCodec_StrictErrors(exc);
  3413. Py_DECREF(exc);
  3414. }
  3415. }
  3416. else if (res == -3) {
  3417. PyErr_SetString(PyExc_ValueError, "unsupported error handler");
  3418. }
  3419. else {
  3420. PyErr_NoMemory();
  3421. }
  3422. return NULL;
  3423. }
  3424. PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
  3425. PyMem_RawFree(wstr);
  3426. return unicode;
  3427. }
  3428. PyObject*
  3429. PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
  3430. const char *errors)
  3431. {
  3432. _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
  3433. return unicode_decode_locale(str, len, error_handler, 1);
  3434. }
  3435. PyObject*
  3436. PyUnicode_DecodeLocale(const char *str, const char *errors)
  3437. {
  3438. Py_ssize_t size = (Py_ssize_t)strlen(str);
  3439. _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
  3440. return unicode_decode_locale(str, size, error_handler, 1);
  3441. }
  3442. PyObject*
  3443. PyUnicode_DecodeFSDefault(const char *s) {
  3444. Py_ssize_t size = (Py_ssize_t)strlen(s);
  3445. return PyUnicode_DecodeFSDefaultAndSize(s, size);
  3446. }
  3447. PyObject*
  3448. PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
  3449. {
  3450. PyInterpreterState *interp = _PyInterpreterState_GET();
  3451. struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
  3452. if (fs_codec->utf8) {
  3453. return unicode_decode_utf8(s, size,
  3454. fs_codec->error_handler,
  3455. fs_codec->errors,
  3456. NULL);
  3457. }
  3458. #ifndef _Py_FORCE_UTF8_FS_ENCODING
  3459. else if (fs_codec->encoding) {
  3460. return PyUnicode_Decode(s, size,
  3461. fs_codec->encoding,
  3462. fs_codec->errors);
  3463. }
  3464. #endif
  3465. else {
  3466. /* Before _PyUnicode_InitEncodings() is called, the Python codec
  3467. machinery is not ready and so cannot be used:
  3468. use mbstowcs() in this case. */
  3469. const PyConfig *config = _PyInterpreterState_GetConfig(interp);
  3470. const wchar_t *filesystem_errors = config->filesystem_errors;
  3471. assert(filesystem_errors != NULL);
  3472. _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
  3473. assert(errors != _Py_ERROR_UNKNOWN);
  3474. #ifdef _Py_FORCE_UTF8_FS_ENCODING
  3475. return unicode_decode_utf8(s, size, errors, NULL, NULL);
  3476. #else
  3477. return unicode_decode_locale(s, size, errors, 0);
  3478. #endif
  3479. }
  3480. }
  3481. int
  3482. PyUnicode_FSConverter(PyObject* arg, void* addr)
  3483. {
  3484. PyObject *path = NULL;
  3485. PyObject *output = NULL;
  3486. Py_ssize_t size;
  3487. const char *data;
  3488. if (arg == NULL) {
  3489. Py_DECREF(*(PyObject**)addr);
  3490. *(PyObject**)addr = NULL;
  3491. return 1;
  3492. }
  3493. path = PyOS_FSPath(arg);
  3494. if (path == NULL) {
  3495. return 0;
  3496. }
  3497. if (PyBytes_Check(path)) {
  3498. output = path;
  3499. }
  3500. else { // PyOS_FSPath() guarantees its returned value is bytes or str.
  3501. output = PyUnicode_EncodeFSDefault(path);
  3502. Py_DECREF(path);
  3503. if (!output) {
  3504. return 0;
  3505. }
  3506. assert(PyBytes_Check(output));
  3507. }
  3508. size = PyBytes_GET_SIZE(output);
  3509. data = PyBytes_AS_STRING(output);
  3510. if ((size_t)size != strlen(data)) {
  3511. PyErr_SetString(PyExc_ValueError, "embedded null byte");
  3512. Py_DECREF(output);
  3513. return 0;
  3514. }
  3515. *(PyObject**)addr = output;
  3516. return Py_CLEANUP_SUPPORTED;
  3517. }
  3518. int
  3519. PyUnicode_FSDecoder(PyObject* arg, void* addr)
  3520. {
  3521. if (arg == NULL) {
  3522. Py_DECREF(*(PyObject**)addr);
  3523. *(PyObject**)addr = NULL;
  3524. return 1;
  3525. }
  3526. PyObject *path = PyOS_FSPath(arg);
  3527. if (path == NULL) {
  3528. return 0;
  3529. }
  3530. PyObject *output = NULL;
  3531. if (PyUnicode_Check(path)) {
  3532. output = path;
  3533. }
  3534. else if (PyBytes_Check(path)) {
  3535. output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
  3536. PyBytes_GET_SIZE(path));
  3537. Py_DECREF(path);
  3538. if (!output) {
  3539. return 0;
  3540. }
  3541. }
  3542. else {
  3543. PyErr_Format(PyExc_TypeError,
  3544. "path should be string, bytes, or os.PathLike, not %.200s",
  3545. Py_TYPE(arg)->tp_name);
  3546. Py_DECREF(path);
  3547. return 0;
  3548. }
  3549. if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
  3550. PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
  3551. PyErr_SetString(PyExc_ValueError, "embedded null character");
  3552. Py_DECREF(output);
  3553. return 0;
  3554. }
  3555. *(PyObject**)addr = output;
  3556. return Py_CLEANUP_SUPPORTED;
  3557. }
  3558. static int unicode_fill_utf8(PyObject *unicode);
  3559. const char *
  3560. PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
  3561. {
  3562. if (!PyUnicode_Check(unicode)) {
  3563. PyErr_BadArgument();
  3564. return NULL;
  3565. }
  3566. if (PyUnicode_UTF8(unicode) == NULL) {
  3567. if (unicode_fill_utf8(unicode) == -1) {
  3568. return NULL;
  3569. }
  3570. }
  3571. if (psize)
  3572. *psize = PyUnicode_UTF8_LENGTH(unicode);
  3573. return PyUnicode_UTF8(unicode);
  3574. }
  3575. const char *
  3576. PyUnicode_AsUTF8(PyObject *unicode)
  3577. {
  3578. return PyUnicode_AsUTF8AndSize(unicode, NULL);
  3579. }
  3580. const char *
  3581. _PyUnicode_AsUTF8NoNUL(PyObject *unicode)
  3582. {
  3583. Py_ssize_t size;
  3584. const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
  3585. if (s && strlen(s) != (size_t)size) {
  3586. PyErr_SetString(PyExc_ValueError, "embedded null character");
  3587. return NULL;
  3588. }
  3589. return s;
  3590. }
  3591. /*
  3592. PyUnicode_GetSize() has been deprecated since Python 3.3
  3593. because it returned length of Py_UNICODE.
  3594. But this function is part of stable abi, because it don't
  3595. include Py_UNICODE in signature and it was not excluded from
  3596. stable abi in PEP 384.
  3597. */
  3598. PyAPI_FUNC(Py_ssize_t)
  3599. PyUnicode_GetSize(PyObject *unicode)
  3600. {
  3601. PyErr_SetString(PyExc_RuntimeError,
  3602. "PyUnicode_GetSize has been removed.");
  3603. return -1;
  3604. }
  3605. Py_ssize_t
  3606. PyUnicode_GetLength(PyObject *unicode)
  3607. {
  3608. if (!PyUnicode_Check(unicode)) {
  3609. PyErr_BadArgument();
  3610. return -1;
  3611. }
  3612. return PyUnicode_GET_LENGTH(unicode);
  3613. }
  3614. Py_UCS4
  3615. PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
  3616. {
  3617. const void *data;
  3618. int kind;
  3619. if (!PyUnicode_Check(unicode)) {
  3620. PyErr_BadArgument();
  3621. return (Py_UCS4)-1;
  3622. }
  3623. if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
  3624. PyErr_SetString(PyExc_IndexError, "string index out of range");
  3625. return (Py_UCS4)-1;
  3626. }
  3627. data = PyUnicode_DATA(unicode);
  3628. kind = PyUnicode_KIND(unicode);
  3629. return PyUnicode_READ(kind, data, index);
  3630. }
  3631. int
  3632. PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
  3633. {
  3634. if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
  3635. PyErr_BadArgument();
  3636. return -1;
  3637. }
  3638. if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
  3639. PyErr_SetString(PyExc_IndexError, "string index out of range");
  3640. return -1;
  3641. }
  3642. if (unicode_check_modifiable(unicode))
  3643. return -1;
  3644. if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
  3645. PyErr_SetString(PyExc_ValueError, "character out of range");
  3646. return -1;
  3647. }
  3648. PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
  3649. index, ch);
  3650. return 0;
  3651. }
  /* Return the name of the default encoding, the constant string "utf-8". */
  const char *
  PyUnicode_GetDefaultEncoding(void)
  {
      static const char default_encoding[] = "utf-8";
      return default_encoding;
  }
  3657. /* create or adjust a UnicodeDecodeError */
  3658. static void
  3659. make_decode_exception(PyObject **exceptionObject,
  3660. const char *encoding,
  3661. const char *input, Py_ssize_t length,
  3662. Py_ssize_t startpos, Py_ssize_t endpos,
  3663. const char *reason)
  3664. {
  3665. if (*exceptionObject == NULL) {
  3666. *exceptionObject = PyUnicodeDecodeError_Create(
  3667. encoding, input, length, startpos, endpos, reason);
  3668. }
  3669. else {
  3670. if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
  3671. goto onError;
  3672. if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
  3673. goto onError;
  3674. if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
  3675. goto onError;
  3676. }
  3677. return;
  3678. onError:
  3679. Py_CLEAR(*exceptionObject);
  3680. }
  3681. #ifdef MS_WINDOWS
  3682. static int
  3683. widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
  3684. {
  3685. if (newsize > *size) {
  3686. wchar_t *newbuf = *buf;
  3687. if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
  3688. PyErr_NoMemory();
  3689. return -1;
  3690. }
  3691. *buf = newbuf;
  3692. }
  3693. *size = newsize;
  3694. return 0;
  3695. }
  3696. /* error handling callback helper:
  3697. build arguments, call the callback and check the arguments,
  3698. if no exception occurred, copy the replacement to the output
  3699. and adjust various state variables.
  3700. return 0 on success, -1 on error
  3701. */
  3702. static int
  3703. unicode_decode_call_errorhandler_wchar(
  3704. const char *errors, PyObject **errorHandler,
  3705. const char *encoding, const char *reason,
  3706. const char **input, const char **inend, Py_ssize_t *startinpos,
  3707. Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
  3708. wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
  3709. {
  3710. static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
  3711. PyObject *restuple = NULL;
  3712. PyObject *repunicode = NULL;
  3713. Py_ssize_t outsize;
  3714. Py_ssize_t insize;
  3715. Py_ssize_t requiredsize;
  3716. Py_ssize_t newpos;
  3717. PyObject *inputobj = NULL;
  3718. Py_ssize_t repwlen;
  3719. if (*errorHandler == NULL) {
  3720. *errorHandler = PyCodec_LookupError(errors);
  3721. if (*errorHandler == NULL)
  3722. goto onError;
  3723. }
  3724. make_decode_exception(exceptionObject,
  3725. encoding,
  3726. *input, *inend - *input,
  3727. *startinpos, *endinpos,
  3728. reason);
  3729. if (*exceptionObject == NULL)
  3730. goto onError;
  3731. restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
  3732. if (restuple == NULL)
  3733. goto onError;
  3734. if (!PyTuple_Check(restuple)) {
  3735. PyErr_SetString(PyExc_TypeError, &argparse[3]);
  3736. goto onError;
  3737. }
  3738. if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
  3739. goto onError;
  3740. /* Copy back the bytes variables, which might have been modified by the
  3741. callback */
  3742. inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
  3743. if (!inputobj)
  3744. goto onError;
  3745. *input = PyBytes_AS_STRING(inputobj);
  3746. insize = PyBytes_GET_SIZE(inputobj);
  3747. *inend = *input + insize;
  3748. /* we can DECREF safely, as the exception has another reference,
  3749. so the object won't go away. */
  3750. Py_DECREF(inputobj);
  3751. if (newpos<0)
  3752. newpos = insize+newpos;
  3753. if (newpos<0 || newpos>insize) {
  3754. PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
  3755. goto onError;
  3756. }
  3757. repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
  3758. if (repwlen < 0)
  3759. goto onError;
  3760. repwlen--;
  3761. /* need more space? (at least enough for what we
  3762. have+the replacement+the rest of the string (starting
  3763. at the new input position), so we won't have to check space
  3764. when there are no errors in the rest of the string) */
  3765. requiredsize = *outpos;
  3766. if (requiredsize > PY_SSIZE_T_MAX - repwlen)
  3767. goto overflow;
  3768. requiredsize += repwlen;
  3769. if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
  3770. goto overflow;
  3771. requiredsize += insize - newpos;
  3772. outsize = *bufsize;
  3773. if (requiredsize > outsize) {
  3774. if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
  3775. requiredsize = 2*outsize;
  3776. if (widechar_resize(buf, bufsize, requiredsize) < 0) {
  3777. goto onError;
  3778. }
  3779. }
  3780. PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
  3781. *outpos += repwlen;
  3782. *endinpos = newpos;
  3783. *inptr = *input + newpos;
  3784. /* we made it! */
  3785. Py_DECREF(restuple);
  3786. return 0;
  3787. overflow:
  3788. PyErr_SetString(PyExc_OverflowError,
  3789. "decoded result is too long for a Python string");
  3790. onError:
  3791. Py_XDECREF(restuple);
  3792. return -1;
  3793. }
  3794. #endif /* MS_WINDOWS */
  3795. static int
  3796. unicode_decode_call_errorhandler_writer(
  3797. const char *errors, PyObject **errorHandler,
  3798. const char *encoding, const char *reason,
  3799. const char **input, const char **inend, Py_ssize_t *startinpos,
  3800. Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
  3801. _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
  3802. {
  3803. static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
  3804. PyObject *restuple = NULL;
  3805. PyObject *repunicode = NULL;
  3806. Py_ssize_t insize;
  3807. Py_ssize_t newpos;
  3808. Py_ssize_t replen;
  3809. Py_ssize_t remain;
  3810. PyObject *inputobj = NULL;
  3811. int need_to_grow = 0;
  3812. const char *new_inptr;
  3813. if (*errorHandler == NULL) {
  3814. *errorHandler = PyCodec_LookupError(errors);
  3815. if (*errorHandler == NULL)
  3816. goto onError;
  3817. }
  3818. make_decode_exception(exceptionObject,
  3819. encoding,
  3820. *input, *inend - *input,
  3821. *startinpos, *endinpos,
  3822. reason);
  3823. if (*exceptionObject == NULL)
  3824. goto onError;
  3825. restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
  3826. if (restuple == NULL)
  3827. goto onError;
  3828. if (!PyTuple_Check(restuple)) {
  3829. PyErr_SetString(PyExc_TypeError, &argparse[3]);
  3830. goto onError;
  3831. }
  3832. if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
  3833. goto onError;
  3834. /* Copy back the bytes variables, which might have been modified by the
  3835. callback */
  3836. inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
  3837. if (!inputobj)
  3838. goto onError;
  3839. remain = *inend - *input - *endinpos;
  3840. *input = PyBytes_AS_STRING(inputobj);
  3841. insize = PyBytes_GET_SIZE(inputobj);
  3842. *inend = *input + insize;
  3843. /* we can DECREF safely, as the exception has another reference,
  3844. so the object won't go away. */
  3845. Py_DECREF(inputobj);
  3846. if (newpos<0)
  3847. newpos = insize+newpos;
  3848. if (newpos<0 || newpos>insize) {
  3849. PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
  3850. goto onError;
  3851. }
  3852. replen = PyUnicode_GET_LENGTH(repunicode);
  3853. if (replen > 1) {
  3854. writer->min_length += replen - 1;
  3855. need_to_grow = 1;
  3856. }
  3857. new_inptr = *input + newpos;
  3858. if (*inend - new_inptr > remain) {
  3859. /* We don't know the decoding algorithm here so we make the worst
  3860. assumption that one byte decodes to one unicode character.
  3861. If unfortunately one byte could decode to more unicode characters,
  3862. the decoder may write out-of-bound then. Is it possible for the
  3863. algorithms using this function? */
  3864. writer->min_length += *inend - new_inptr - remain;
  3865. need_to_grow = 1;
  3866. }
  3867. if (need_to_grow) {
  3868. writer->overallocate = 1;
  3869. if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
  3870. PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
  3871. goto onError;
  3872. }
  3873. if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
  3874. goto onError;
  3875. *endinpos = newpos;
  3876. *inptr = new_inptr;
  3877. /* we made it! */
  3878. Py_DECREF(restuple);
  3879. return 0;
  3880. onError:
  3881. Py_XDECREF(restuple);
  3882. return -1;
  3883. }
  3884. /* --- UTF-7 Codec -------------------------------------------------------- */
  3885. /* See RFC2152 for details. We encode conservatively and decode liberally. */
/* Three simple macros defining base-64.
   All of them expand their argument more than once, so the argument must
   not have side effects. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */
#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else
   (i.e. '/' for valid input) -> 63.  The result is meaningless unless
   IS_BASE64(c) holds. */
#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off, so n does not have to be reduced
   beforehand. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 never decode directly. */
#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
  3908. /* The UTF-7 encoder treats ASCII characters differently according to
  3909. * whether they are Set D, Set O, Whitespace, or special (i.e. none of
  3910. * the above). See RFC2152. This array identifies these different
  3911. * sets:
  3912. * 0 : "Set D"
  3913. * alphanumeric and '(),-./:?
  3914. * 1 : "Set O"
  3915. * !"#$%&*;<=>@[]^_`{|}
  3916. * 2 : "whitespace"
  3917. * ht nl cr sp
  3918. * 3 : special (must be base64 encoded)
  3919. * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
  3920. */
  3921. static
  3922. char utf7_category[128] = {
  3923. /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
  3924. 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
  3925. /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
  3926. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3927. /* sp ! " # $ % & ' ( ) * + , - . / */
  3928. 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
  3929. /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
  3930. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
  3931. /* @ A B C D E F G H I J K L M N O */
  3932. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3933. /* P Q R S T U V W X Y Z [ \ ] ^ _ */
  3934. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
  3935. /* ` a b c d e f g h i j k l m n o */
  3936. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3937. /* p q r s t u v w x y z { | } ~ del */
  3938. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
  3939. };
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character; only values 1..127 can ever be direct
 *            (0 and anything >= 128 always yield false).  Expanded
 *            several times, so it must not have side effects.
 * directO  : non-zero if "Set O" characters (category 1) are direct.
 * directWS : non-zero if whitespace characters (category 2) are direct.
 * "Set D" characters (category 0) are always direct. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
  3950. PyObject *
  3951. PyUnicode_DecodeUTF7(const char *s,
  3952. Py_ssize_t size,
  3953. const char *errors)
  3954. {
  3955. return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
  3956. }
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 input bytes.
 * errors   : error handler name, passed on to
 *            unicode_decode_call_errorhandler_writer().
 * consumed : if non-NULL, receives the number of input bytes decoded;
 *            input ending inside a shift sequence is then not an error,
 *            instead input and output are rolled back to the '+' that
 *            opened the sequence.  If NULL, an inconsistent shift state
 *            at the end of input is reported as
 *            "unterminated shift sequence".
 *
 * Returns the decoded str object, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;      /* input offset of the current '+' / bad byte */
    Py_ssize_t endinpos;        /* input offset just past the erroneous data */
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;            /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;   /* writer.pos when the current section began */
    unsigned int base64bits = 0;    /* number of valid bits in base64buffer */
    unsigned long base64buffer = 0; /* accumulated, not yet emitted bits */
    Py_UCS4 surrogate = 0;      /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      /* Also entered from after the loop (end-of-input error handling) when
         the error handler moved the read position back before the end. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Not a pair: emit the lone high surrogate
                               as is, then handle outCh below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte decodes directly; otherwise it is
                   dropped by the reset below. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                /* Remember where the output stood so it can be rolled
                   back if the input ends inside this section. */
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Every jump here has set errmsg and advanced s past the
           offending byte; startinpos was set when the sequence began. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have set a resume position before the end
               of the input: go back into the loop body. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from only the output produced before
                   the unfinished shift sequence, instead of truncating
                   the writer.  NOTE(review): presumably so that the
                   result's character width is derived from the retained
                   prefix only - confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
  4149. PyObject *
  4150. _PyUnicode_EncodeUTF7(PyObject *str,
  4151. int base64SetO,
  4152. int base64WhiteSpace,
  4153. const char *errors)
  4154. {
  4155. int kind;
  4156. const void *data;
  4157. Py_ssize_t len;
  4158. PyObject *v;
  4159. int inShift = 0;
  4160. Py_ssize_t i;
  4161. unsigned int base64bits = 0;
  4162. unsigned long base64buffer = 0;
  4163. char * out;
  4164. const char * start;
  4165. kind = PyUnicode_KIND(str);
  4166. data = PyUnicode_DATA(str);
  4167. len = PyUnicode_GET_LENGTH(str);
  4168. if (len == 0)
  4169. return PyBytes_FromStringAndSize(NULL, 0);
  4170. /* It might be possible to tighten this worst case */
  4171. if (len > PY_SSIZE_T_MAX / 8)
  4172. return PyErr_NoMemory();
  4173. v = PyBytes_FromStringAndSize(NULL, len * 8);
  4174. if (v == NULL)
  4175. return NULL;
  4176. start = out = PyBytes_AS_STRING(v);
  4177. for (i = 0; i < len; ++i) {
  4178. Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  4179. if (inShift) {
  4180. if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
  4181. /* shifting out */
  4182. if (base64bits) { /* output remaining bits */
  4183. *out++ = TO_BASE64(base64buffer << (6-base64bits));
  4184. base64buffer = 0;
  4185. base64bits = 0;
  4186. }
  4187. inShift = 0;
  4188. /* Characters not in the BASE64 set implicitly unshift the sequence
  4189. so no '-' is required, except if the character is itself a '-' */
  4190. if (IS_BASE64(ch) || ch == '-') {
  4191. *out++ = '-';
  4192. }
  4193. *out++ = (char) ch;
  4194. }
  4195. else {
  4196. goto encode_char;
  4197. }
  4198. }
  4199. else { /* not in a shift sequence */
  4200. if (ch == '+') {
  4201. *out++ = '+';
  4202. *out++ = '-';
  4203. }
  4204. else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
  4205. *out++ = (char) ch;
  4206. }
  4207. else {
  4208. *out++ = '+';
  4209. inShift = 1;
  4210. goto encode_char;
  4211. }
  4212. }
  4213. continue;
  4214. encode_char:
  4215. if (ch >= 0x10000) {
  4216. assert(ch <= MAX_UNICODE);
  4217. /* code first surrogate */
  4218. base64bits += 16;
  4219. base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
  4220. while (base64bits >= 6) {
  4221. *out++ = TO_BASE64(base64buffer >> (base64bits-6));
  4222. base64bits -= 6;
  4223. }
  4224. /* prepare second surrogate */
  4225. ch = Py_UNICODE_LOW_SURROGATE(ch);
  4226. }
  4227. base64bits += 16;
  4228. base64buffer = (base64buffer << 16) | ch;
  4229. while (base64bits >= 6) {
  4230. *out++ = TO_BASE64(base64buffer >> (base64bits-6));
  4231. base64bits -= 6;
  4232. }
  4233. }
  4234. if (base64bits)
  4235. *out++= TO_BASE64(base64buffer << (6-base64bits) );
  4236. if (inShift)
  4237. *out++ = '-';
  4238. if (_PyBytes_Resize(&v, out - start) < 0)
  4239. return NULL;
  4240. return v;
  4241. }
  4242. #undef IS_BASE64
  4243. #undef FROM_BASE64
  4244. #undef TO_BASE64
  4245. #undef DECODE_DIRECT
  4246. #undef ENCODE_DIRECT
  4247. /* --- UTF-8 Codec -------------------------------------------------------- */
  4248. PyObject *
  4249. PyUnicode_DecodeUTF8(const char *s,
  4250. Py_ssize_t size,
  4251. const char *errors)
  4252. {
  4253. return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
  4254. }
  4255. #include "stringlib/asciilib.h"
  4256. #include "stringlib/codecs.h"
  4257. #include "stringlib/undef.h"
  4258. #include "stringlib/ucs1lib.h"
  4259. #include "stringlib/codecs.h"
  4260. #include "stringlib/undef.h"
  4261. #include "stringlib/ucs2lib.h"
  4262. #include "stringlib/codecs.h"
  4263. #include "stringlib/undef.h"
  4264. #include "stringlib/ucs4lib.h"
  4265. #include "stringlib/codecs.h"
  4266. #include "stringlib/undef.h"
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char.
   The mask has the top bit (0x80) of every byte set, so
   (value & ASCII_CHAR_MASK) is non-zero exactly when at least one byte
   of the size_t-sized word is >= 0x80.  Only 4- and 8-byte size_t are
   supported; anything else is a compile-time error. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif
/* Copy the leading run of ASCII bytes (values < 0x80) from [start, end)
   into dest and return its length.

   Copying stops at the first byte with the high bit set, or at `end`.
   dest must have room for (end - start) bytes and, when the word-copy path
   is compiled in, must be size_t-aligned (asserted below). */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;

#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both aligned: check and copy one
           size_t word at a time while every byte in the word is ASCII. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p + SIZEOF_SIZE_T <= end) {
            size_t value = *(const size_t *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((size_t *)q) = value;
            _p += SIZEOF_SIZE_T;
            q += SIZEOF_SIZE_T;
        }
        p = _p;
        /* Finish byte by byte: either the tail shorter than a word, or
           the ASCII prefix of the word that contained a non-ASCII byte. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* Source not word-aligned at the start (or word copy not compiled in):
       first only scan for the end of the ASCII run, then copy it with a
       single memcpy(). */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help allocation */
            const char *_p = p;
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
/* Decode `size` bytes of UTF-8 at `s` into a new str object.

   error_handler : pre-resolved error handler; _Py_ERROR_UNKNOWN means
                   "resolve it from `errors` on the first decoding error".
   errors        : error handler name, used for that lazy resolution and
                   for the generic callback path.
   consumed      : if non-NULL, receives the number of input bytes decoded;
                   in that case decoding stops quietly at a truncated
                   sequence at the end of the input instead of reporting
                   an error.

   Returns the decoded string, or NULL with an exception set on error. */
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
{
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed) {
            *consumed = 1;
        }
        return get_latin1_char((unsigned char)s[0]);
    }

    const char *starts = s;
    const char *end = s + size;

    // fast path: try ASCII string.
    // Allocate an ASCII-only result of the full input size and copy the
    // leading ASCII run straight into it.
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
    if (s == end) {
        // The whole input was ASCII: `u` is already the result.
        if (consumed) {
            *consumed = size;
        }
        return u;
    }

    // Use _PyUnicodeWriter after fast path is failed.
    // The writer takes over `u` as its buffer and continues after the
    // ASCII prefix that was already copied.
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = s - starts;

    Py_ssize_t startinpos, endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Run the stringlib decoder matching the writer's current
           character width.  Its return value is interpreted by the switch
           below: 0-4 are status codes, anything else is a character that
           is then written through the writer. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            /* Either the input is exhausted, or it ends in the middle of
               a sequence; the latter is only an error when the caller is
               not decoding incrementally (consumed == NULL). */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            /* The status code doubles as a length: the erroneous range
               spans (ch - 1) bytes. */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* A decoding error occurred: resolve the handler once, then
           apply it. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* Skip the invalid bytes without producing output. */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* Emit one U+FFFD for the whole invalid range. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each invalid byte b becomes the code point 0xDC00 + b,
               which needs at least 2-byte storage. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* Any other handler goes through the generic callback, which
               may also change starts/end/s. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
  4459. PyObject *
  4460. PyUnicode_DecodeUTF8Stateful(const char *s,
  4461. Py_ssize_t size,
  4462. const char *errors,
  4463. Py_ssize_t *consumed)
  4464. {
  4465. return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
  4466. }
/* UTF-8 decoder producing a wchar_t string.

   `errors` selects the error handling and must be one of
   _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS.

   On success, return 0, write a pointer to a newly allocated wide character
   string into *wstr (use PyMem_RawFree() to free the memory) and write the
   output length (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure (or if `size` is too large to allocate the
   output buffer), return -1.

   On decoding error (never with _Py_ERROR_SURROGATEESCAPE), return -2.  If
   wlen is non-NULL, write the offset of the illegal byte sequence into
   *wlen.  If reason is not NULL, write the decoding error message into
   *reason.

   If `errors` is not one of the supported error handlers, return -3. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
                 const char **reason, _Py_error_handler errors)
{
    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        return -3;
    }

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Guard the (size + 1) * sizeof(wchar_t) computation below against
       overflow. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
        return -1;
    }

    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
        return -1;
    }

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a decoder status code here; 0 with the input fully
               consumed means decoding finished normally. */
            if (!ch && s == e) {
                break;
            }

            if (surrogateescape) {
                /* Escape one undecodable byte b as 0xDC00 + b and retry
                   from the next byte. */
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
            }
            else {
                /* Is it a valid three-byte code? */
                /* With surrogatepass, any well-formed 3-byte sequence is
                   decoded directly; this is the form in which surrogate
                   code points appear. */
                if (surrogatepass
                    && (e - s) >= 3
                    && (s[0] & 0xf0) == 0xe0
                    && (s[1] & 0xc0) == 0x80
                    && (s[2] & 0xc0) == 0x80)
                {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    s += 3;
                    unicode[outpos++] = ch;
                }
                else {
                    /* Decoding error: release the buffer and report the
                       reason and the error offset to the caller. */
                    PyMem_RawFree(unicode );
                    if (reason != NULL) {
                        switch (ch) {
                        case 0:
                            *reason = "unexpected end of data";
                            break;
                        case 1:
                            *reason = "invalid start byte";
                            break;
                        /* 2, 3, 4 */
                        default:
                            *reason = "invalid continuation byte";
                            break;
                        }
                    }
                    if (wlen != NULL) {
                        *wlen = s - orig_s;
                    }
                    return -2;
                }
            }
        }
    }
    unicode[outpos] = L'\0';
    if (wlen) {
        *wlen = outpos;
    }
    *wstr = unicode;
    return 0;
}
  4578. wchar_t*
  4579. _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
  4580. size_t *wlen)
  4581. {
  4582. wchar_t *wstr;
  4583. int res = _Py_DecodeUTF8Ex(arg, arglen,
  4584. &wstr, wlen,
  4585. NULL, _Py_ERROR_SURROGATEESCAPE);
  4586. if (res != 0) {
  4587. /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
  4588. assert(res != -3);
  4589. if (wlen) {
  4590. *wlen = (size_t)res;
  4591. }
  4592. return NULL;
  4593. }
  4594. return wstr;
  4595. }
/* UTF-8 encoder supporting the strict, surrogateescape and surrogatepass
   error handlers (selected by `errors`).

   On success, return 0 and write the newly allocated character string into
   *str.  The buffer is allocated with PyMem_RawMalloc() if raw_malloc is
   non-zero and with PyMem_Malloc() otherwise; free it with the matching
   PyMem_RawFree() or PyMem_Free().

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If `errors` is not one of the supported error handlers, return -3. */
  4603. int
  4604. _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
  4605. const char **reason, int raw_malloc, _Py_error_handler errors)
  4606. {
  4607. const Py_ssize_t max_char_size = 4;
  4608. Py_ssize_t len = wcslen(text);
  4609. assert(len >= 0);
  4610. int surrogateescape = 0;
  4611. int surrogatepass = 0;
  4612. switch (errors)
  4613. {
  4614. case _Py_ERROR_STRICT:
  4615. break;
  4616. case _Py_ERROR_SURROGATEESCAPE:
  4617. surrogateescape = 1;
  4618. break;
  4619. case _Py_ERROR_SURROGATEPASS:
  4620. surrogatepass = 1;
  4621. break;
  4622. default:
  4623. return -3;
  4624. }
  4625. if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
  4626. return -1;
  4627. }
  4628. char *bytes;
  4629. if (raw_malloc) {
  4630. bytes = PyMem_RawMalloc((len + 1) * max_char_size);
  4631. }
  4632. else {
  4633. bytes = PyMem_Malloc((len + 1) * max_char_size);
  4634. }
  4635. if (bytes == NULL) {
  4636. return -1;
  4637. }
  4638. char *p = bytes;
  4639. Py_ssize_t i;
  4640. for (i = 0; i < len; ) {
  4641. Py_ssize_t ch_pos = i;
  4642. Py_UCS4 ch = text[i];
  4643. i++;
  4644. #if Py_UNICODE_SIZE == 2
  4645. if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
  4646. && i < len
  4647. && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
  4648. {
  4649. ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
  4650. i++;
  4651. }
  4652. #endif
  4653. if (ch < 0x80) {
  4654. /* Encode ASCII */
  4655. *p++ = (char) ch;
  4656. }
  4657. else if (ch < 0x0800) {
  4658. /* Encode Latin-1 */
  4659. *p++ = (char)(0xc0 | (ch >> 6));
  4660. *p++ = (char)(0x80 | (ch & 0x3f));
  4661. }
  4662. else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
  4663. /* surrogateescape error handler */
  4664. if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
  4665. if (error_pos != NULL) {
  4666. *error_pos = (size_t)ch_pos;
  4667. }
  4668. if (reason != NULL) {
  4669. *reason = "encoding error";
  4670. }
  4671. if (raw_malloc) {
  4672. PyMem_RawFree(bytes);
  4673. }
  4674. else {
  4675. PyMem_Free(bytes);
  4676. }
  4677. return -2;
  4678. }
  4679. *p++ = (char)(ch & 0xff);
  4680. }
  4681. else if (ch < 0x10000) {
  4682. *p++ = (char)(0xe0 | (ch >> 12));
  4683. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  4684. *p++ = (char)(0x80 | (ch & 0x3f));
  4685. }
  4686. else { /* ch >= 0x10000 */
  4687. assert(ch <= MAX_UNICODE);
  4688. /* Encode UCS4 Unicode ordinals */
  4689. *p++ = (char)(0xf0 | (ch >> 18));
  4690. *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
  4691. *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
  4692. *p++ = (char)(0x80 | (ch & 0x3f));
  4693. }
  4694. }
  4695. *p++ = '\0';
  4696. size_t final_size = (p - bytes);
  4697. char *bytes2;
  4698. if (raw_malloc) {
  4699. bytes2 = PyMem_RawRealloc(bytes, final_size);
  4700. }
  4701. else {
  4702. bytes2 = PyMem_Realloc(bytes, final_size);
  4703. }
  4704. if (bytes2 == NULL) {
  4705. if (error_pos != NULL) {
  4706. *error_pos = (size_t)-1;
  4707. }
  4708. if (raw_malloc) {
  4709. PyMem_RawFree(bytes);
  4710. }
  4711. else {
  4712. PyMem_Free(bytes);
  4713. }
  4714. return -1;
  4715. }
  4716. *str = bytes2;
  4717. return 0;
  4718. }
  4719. /* Primary internal function which creates utf8 encoded bytes objects.
  4720. Allocation strategy: if the string is short, convert into a stack buffer
  4721. and allocate exactly as much space needed at the end. Else allocate the
  4722. maximum possible needed (4 result bytes per Unicode character), and return
  4723. the excess memory at the end.
  4724. */
  4725. static PyObject *
  4726. unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
  4727. const char *errors)
  4728. {
  4729. if (!PyUnicode_Check(unicode)) {
  4730. PyErr_BadArgument();
  4731. return NULL;
  4732. }
  4733. if (PyUnicode_UTF8(unicode))
  4734. return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
  4735. PyUnicode_UTF8_LENGTH(unicode));
  4736. int kind = PyUnicode_KIND(unicode);
  4737. const void *data = PyUnicode_DATA(unicode);
  4738. Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
  4739. _PyBytesWriter writer;
  4740. char *end;
  4741. switch (kind) {
  4742. default:
  4743. Py_UNREACHABLE();
  4744. case PyUnicode_1BYTE_KIND:
  4745. /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
  4746. assert(!PyUnicode_IS_ASCII(unicode));
  4747. end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
  4748. break;
  4749. case PyUnicode_2BYTE_KIND:
  4750. end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
  4751. break;
  4752. case PyUnicode_4BYTE_KIND:
  4753. end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
  4754. break;
  4755. }
  4756. if (end == NULL) {
  4757. _PyBytesWriter_Dealloc(&writer);
  4758. return NULL;
  4759. }
  4760. return _PyBytesWriter_Finish(&writer, end);
  4761. }
  4762. static int
  4763. unicode_fill_utf8(PyObject *unicode)
  4764. {
  4765. /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
  4766. assert(!PyUnicode_IS_ASCII(unicode));
  4767. int kind = PyUnicode_KIND(unicode);
  4768. const void *data = PyUnicode_DATA(unicode);
  4769. Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
  4770. _PyBytesWriter writer;
  4771. char *end;
  4772. switch (kind) {
  4773. default:
  4774. Py_UNREACHABLE();
  4775. case PyUnicode_1BYTE_KIND:
  4776. end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
  4777. _Py_ERROR_STRICT, NULL);
  4778. break;
  4779. case PyUnicode_2BYTE_KIND:
  4780. end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
  4781. _Py_ERROR_STRICT, NULL);
  4782. break;
  4783. case PyUnicode_4BYTE_KIND:
  4784. end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
  4785. _Py_ERROR_STRICT, NULL);
  4786. break;
  4787. }
  4788. if (end == NULL) {
  4789. _PyBytesWriter_Dealloc(&writer);
  4790. return -1;
  4791. }
  4792. const char *start = writer.use_small_buffer ? writer.small_buffer :
  4793. PyBytes_AS_STRING(writer.buffer);
  4794. Py_ssize_t len = end - start;
  4795. char *cache = PyObject_Malloc(len + 1);
  4796. if (cache == NULL) {
  4797. _PyBytesWriter_Dealloc(&writer);
  4798. PyErr_NoMemory();
  4799. return -1;
  4800. }
  4801. _PyUnicode_UTF8(unicode) = cache;
  4802. _PyUnicode_UTF8_LENGTH(unicode) = len;
  4803. memcpy(cache, start, len);
  4804. cache[len] = '\0';
  4805. _PyBytesWriter_Dealloc(&writer);
  4806. return 0;
  4807. }
  4808. PyObject *
  4809. _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
  4810. {
  4811. return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
  4812. }
  4813. PyObject *
  4814. PyUnicode_AsUTF8String(PyObject *unicode)
  4815. {
  4816. return _PyUnicode_AsUTF8String(unicode, NULL);
  4817. }
  4818. /* --- UTF-32 Codec ------------------------------------------------------- */
  4819. PyObject *
  4820. PyUnicode_DecodeUTF32(const char *s,
  4821. Py_ssize_t size,
  4822. const char *errors,
  4823. int *byteorder)
  4824. {
  4825. return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
  4826. }
  /* Stateful UTF-32 decoder.

     Decodes the `size` bytes at `s` into a string built with a
     _PyUnicodeWriter and returns the result of _PyUnicodeWriter_Finish(), or
     NULL after taking the onError path.

     byteorder: optional in/out.  Its initial value (0 when byteorder is NULL)
     selects the byte order: negative means little-endian, positive means
     big-endian, 0 means native order.  When it is 0 and at least 4 bytes are
     available, a leading BOM is looked for; a BOM that is found is skipped
     and decides the order, and the resulting value is written back to
     *byteorder.

     consumed: optional out.  When non-NULL, trailing bytes that do not form a
     complete 4-byte unit are left undecoded instead of being reported as
     "truncated data", and *consumed receives the number of input bytes
     processed.

     errors: error handler name forwarded to
     unicode_decode_call_errorhandler_writer(). */
  4827. PyObject *
  4828. PyUnicode_DecodeUTF32Stateful(const char *s,
  4829. Py_ssize_t size,
  4830. const char *errors,
  4831. int *byteorder,
  4832. Py_ssize_t *consumed)
  4833. {
  4834. const char *starts = s;
  4835. Py_ssize_t startinpos;
  4836. Py_ssize_t endinpos;
  4837. _PyUnicodeWriter writer;
  4838. const unsigned char *q, *e;
  4839. int le, bo = 0; /* assume native ordering by default */
  4840. const char *encoding;
  4841. const char *errmsg = "";
  4842. PyObject *errorHandler = NULL;
  4843. PyObject *exc = NULL;
  /* q is the read cursor, e is one past the last input byte. */
  4844. q = (const unsigned char *)s;
  4845. e = q + size;
  4846. if (byteorder)
  4847. bo = *byteorder;
  4848. /* Check for BOM marks (U+FEFF) in the input and adjust current
  4849. byte order setting accordingly. In native mode, the leading BOM
  4850. mark is skipped, in all other modes, it is copied to the output
  4851. stream as-is (giving a ZWNBSP character). */
  4852. if (bo == 0 && size >= 4) {
  /* The first four bytes are assembled as a little-endian integer, so a
     little-endian BOM reads as 0x0000FEFF and a big-endian BOM (bytes
     00 00 FE FF) reads as 0xFFFE0000. */
  4853. Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
  4854. if (bom == 0x0000FEFF) {
  4855. bo = -1;
  4856. q += 4;
  4857. }
  4858. else if (bom == 0xFFFE0000) {
  4859. bo = 1;
  4860. q += 4;
  4861. }
  4862. if (byteorder)
  4863. *byteorder = bo;
  4864. }
  /* Nothing left to decode: empty input, or input that was only a BOM. */
  4865. if (q == e) {
  4866. if (consumed)
  4867. *consumed = size;
  4868. _Py_RETURN_UNICODE_EMPTY();
  4869. }
  /* Resolve bo against the host endianness: bo == 0 means "native", so it
     counts as little-endian only on a little-endian host. */
  4870. #ifdef WORDS_BIGENDIAN
  4871. le = bo < 0;
  4872. #else
  4873. le = bo <= 0;
  4874. #endif
  4875. encoding = le ? "utf-32-le" : "utf-32-be";
  4876. _PyUnicodeWriter_Init(&writer);
  /* One output character per 4 input bytes, counting a partial trailing
     unit as one. */
  4877. writer.min_length = (e - q + 3) / 4;
  4878. if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
  4879. goto onError;
  4880. while (1) {
  /* ch stays 0 when fewer than 4 bytes remain, which routes this iteration
     to the end-of-input / "truncated data" branch below. */
  4881. Py_UCS4 ch = 0;
  4882. Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
  /* Fast path: store units straight into the writer's buffer for as long as
     they fit its current maximum character and are not surrogates.  The loop
     leaves the offending unit in ch with q still pointing at it. */
  4883. if (e - q >= 4) {
  4884. int kind = writer.kind;
  4885. void *data = writer.data;
  4886. const unsigned char *last = e - 4;
  4887. Py_ssize_t pos = writer.pos;
  4888. if (le) {
  4889. do {
  4890. ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
  4891. if (ch > maxch)
  4892. break;
  4893. if (kind != PyUnicode_1BYTE_KIND &&
  4894. Py_UNICODE_IS_SURROGATE(ch))
  4895. break;
  4896. PyUnicode_WRITE(kind, data, pos++, ch);
  4897. q += 4;
  4898. } while (q <= last);
  4899. }
  4900. else {
  4901. do {
  4902. ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
  4903. if (ch > maxch)
  4904. break;
  4905. if (kind != PyUnicode_1BYTE_KIND &&
  4906. Py_UNICODE_IS_SURROGATE(ch))
  4907. break;
  4908. PyUnicode_WRITE(kind, data, pos++, ch);
  4909. q += 4;
  4910. } while (q <= last);
  4911. }
  4912. writer.pos = pos;
  4913. }
  /* Classify why the fast path stopped (or never ran). */
  4914. if (Py_UNICODE_IS_SURROGATE(ch)) {
  4915. errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
  4916. startinpos = ((const char *)q) - starts;
  4917. endinpos = startinpos + 4;
  4918. }
  4919. else if (ch <= maxch) {
  /* The last unit read was stored normally, so fewer than 4 bytes remain:
     either the input is exhausted, or the caller asked for stateful decoding
     and the leftover bytes are simply not consumed. */
  4920. if (q == e || consumed)
  4921. break;
  4922. /* remaining bytes at the end? (size should be divisible by 4) */
  4923. errmsg = "truncated data";
  4924. startinpos = ((const char *)q) - starts;
  4925. endinpos = ((const char *)e) - starts;
  4926. }
  4927. else {
  /* ch does not fit the writer's current maximum character.  A valid code
     point is written through _PyUnicodeWriter_WriteCharInline() and decoding
     resumes; anything >= 0x110000 is reported as an error. */
  4928. if (ch < 0x110000) {
  4929. if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
  4930. goto onError;
  4931. q += 4;
  4932. continue;
  4933. }
  4934. errmsg = "code point not in range(0x110000)";
  4935. startinpos = ((const char *)q) - starts;
  4936. endinpos = startinpos + 4;
  4937. }
  4938. /* The remaining input chars are ignored if the callback
  4939. chooses to skip the input */
  4940. if (unicode_decode_call_errorhandler_writer(
  4941. errors, &errorHandler,
  4942. encoding, errmsg,
  4943. &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
  4944. &writer))
  4945. goto onError;
  4946. }
  /* Normal exit: report how far the input was read when requested. */
  4947. if (consumed)
  4948. *consumed = (const char *)q-starts;
  4949. Py_XDECREF(errorHandler);
  4950. Py_XDECREF(exc);
  4951. return _PyUnicodeWriter_Finish(&writer);
  /* Failure exit: release the writer and the error-handler state. */
  4952. onError:
  4953. _PyUnicodeWriter_Dealloc(&writer);
  4954. Py_XDECREF(errorHandler);
  4955. Py_XDECREF(exc);
  4956. return NULL;
  4957. }
  /* Encode the string `str` as UTF-32 and return a new bytes object, or NULL
     on error.

     byteorder: 0 stores a BOM (0xFEFF, in host byte order) in front of the
     data and requests native ordering.  For any other value no BOM is written
     and native_ordering is derived from the sign of byteorder and the host
     endianness (negative selects little-endian, positive big-endian).

     errors: error handler name forwarded to
     unicode_encode_call_errorhandler() whenever the stringlib encoder stops
     before the end of the string (reported as "surrogates not allowed").

     The output buffer is sized up front at one 4-byte unit per character
     (plus the BOM), grown when an error handler returns a longer replacement,
     and trimmed to the bytes actually written at the end. */
  4958. PyObject *
  4959. _PyUnicode_EncodeUTF32(PyObject *str,
  4960. const char *errors,
  4961. int byteorder)
  4962. {
  4963. int kind;
  4964. const void *data;
  4965. Py_ssize_t len;
  4966. PyObject *v;
  4967. uint32_t *out;
  4968. #if PY_LITTLE_ENDIAN
  4969. int native_ordering = byteorder <= 0;
  4970. #else
  4971. int native_ordering = byteorder >= 0;
  4972. #endif
  4973. const char *encoding;
  4974. Py_ssize_t nsize, pos;
  4975. PyObject *errorHandler = NULL;
  4976. PyObject *exc = NULL;
  4977. PyObject *rep = NULL;
  4978. if (!PyUnicode_Check(str)) {
  4979. PyErr_BadArgument();
  4980. return NULL;
  4981. }
  4982. kind = PyUnicode_KIND(str);
  4983. data = PyUnicode_DATA(str);
  4984. len = PyUnicode_GET_LENGTH(str);
  /* Guard the nsize * 4 computation below against overflow; the BOM, when
     present, takes one extra unit. */
  4985. if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
  4986. return PyErr_NoMemory();
  4987. nsize = len + (byteorder == 0);
  4988. v = PyBytes_FromStringAndSize(NULL, nsize * 4);
  4989. if (v == NULL)
  4990. return NULL;
  4991. /* output buffer is 4-bytes aligned */
  4992. assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
  4993. out = (uint32_t *)PyBytes_AS_STRING(v);
  4994. if (byteorder == 0)
  4995. *out++ = 0xFEFF;
  /* Both jumps to done below happen before any error handler has been
     called, so errorHandler and exc are still NULL there. */
  4996. if (len == 0)
  4997. goto done;
  /* Encoding name handed to the error handler / exception. */
  4998. if (byteorder == -1)
  4999. encoding = "utf-32-le";
  5000. else if (byteorder == 1)
  5001. encoding = "utf-32-be";
  5002. else
  5003. encoding = "utf-32";
  /* 1-byte strings are encoded in a single call with no error handling. */
  5004. if (kind == PyUnicode_1BYTE_KIND) {
  5005. ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
  5006. goto done;
  5007. }
  5008. pos = 0;
  5009. while (pos < len) {
  5010. Py_ssize_t newpos, repsize, moreunits;
  /* Encode as far as the stringlib encoder gets; its return value is added
     to pos, and pos < len afterwards means it stopped early. */
  5011. if (kind == PyUnicode_2BYTE_KIND) {
  5012. pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
  5013. &out, native_ordering);
  5014. }
  5015. else {
  5016. assert(kind == PyUnicode_4BYTE_KIND);
  5017. pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
  5018. &out, native_ordering);
  5019. }
  5020. if (pos == len)
  5021. break;
  /* Ask the error handler what to emit for the character at pos; newpos is
     where encoding resumes. */
  5022. rep = unicode_encode_call_errorhandler(
  5023. errors, &errorHandler,
  5024. encoding, "surrogates not allowed",
  5025. str, &exc, pos, pos + 1, &newpos);
  5026. if (!rep)
  5027. goto error;
  5028. if (PyBytes_Check(rep)) {
  /* A bytes replacement is copied verbatim, so it must be a whole number of
     4-byte units. */
  5029. repsize = PyBytes_GET_SIZE(rep);
  5030. if (repsize & 3) {
  5031. raise_encode_exception(&exc, encoding,
  5032. str, pos, pos + 1,
  5033. "surrogates not allowed");
  5034. goto error;
  5035. }
  5036. moreunits = repsize / 4;
  5037. }
  5038. else {
  /* A string replacement is re-encoded with the 1-byte encoder below, so
     only ASCII replacements are accepted. */
  5039. assert(PyUnicode_Check(rep));
  5040. moreunits = repsize = PyUnicode_GET_LENGTH(rep);
  5041. if (!PyUnicode_IS_ASCII(rep)) {
  5042. raise_encode_exception(&exc, encoding,
  5043. str, pos, pos + 1,
  5044. "surrogates not allowed");
  5045. goto error;
  5046. }
  5047. }
  /* Net growth in units: what the replacement adds, minus the input
     characters the handler moved past (each already has a unit reserved). */
  5048. moreunits += pos - newpos;
  5049. pos = newpos;
  5050. /* four bytes are reserved for each surrogate */
  5051. if (moreunits > 0) {
  /* Resizing may move the buffer: remember the write offset and re-derive
     out afterwards. */
  5052. Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
  5053. if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
  5054. /* integer overflow */
  5055. PyErr_NoMemory();
  5056. goto error;
  5057. }
  5058. if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
  5059. goto error;
  5060. out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
  5061. }
  5062. if (PyBytes_Check(rep)) {
  5063. memcpy(out, PyBytes_AS_STRING(rep), repsize);
  5064. out += repsize / 4;
  5065. } else /* rep is unicode */ {
  5066. assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
  5067. ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
  5068. &out, native_ordering);
  5069. }
  5070. Py_CLEAR(rep);
  5071. }
  5072. /* Cut back to size actually needed. This is necessary for, for example,
  5073. encoding of a string containing isolated surrogates and the 'ignore'
  5074. handler is used. */
  5075. nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
  5076. if (nsize != PyBytes_GET_SIZE(v))
  5077. _PyBytes_Resize(&v, nsize);
  5078. Py_XDECREF(errorHandler);
  5079. Py_XDECREF(exc);
  5080. done:
  5081. return v;
  /* Failure exit: drop the pending replacement, the handler state and the
     partially filled output. */
  5082. error:
  5083. Py_XDECREF(rep);
  5084. Py_XDECREF(errorHandler);
  5085. Py_XDECREF(exc);
  5086. Py_XDECREF(v);
  5087. return NULL;
  5088. }
  5089. PyObject *
  5090. PyUnicode_AsUTF32String(PyObject *unicode)
  5091. {
  5092. return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
  5093. }
  5094. /* --- UTF-16 Codec ------------------------------------------------------- */
  5095. PyObject *
  5096. PyUnicode_DecodeUTF16(const char *s,
  5097. Py_ssize_t size,
  5098. const char *errors,
  5099. int *byteorder)
  5100. {
  5101. return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
  5102. }
  /* Stateful UTF-16 decoder.

     Decodes the `size` bytes at `s` into a string built with a
     _PyUnicodeWriter and returns the result of _PyUnicodeWriter_Finish(), or
     NULL after taking the onError path.

     byteorder: optional in/out.  Its initial value (0 when byteorder is NULL)
     selects the byte order: negative means little-endian, positive means
     big-endian, 0 means native order.  When it is 0 and at least 2 bytes are
     available, a leading BOM is looked for; a BOM that is found is skipped
     and decides the order, and the resulting value is written back to
     *byteorder.

     consumed: optional out.  When non-NULL, incomplete data at the end of the
     input is left undecoded instead of being reported as an error, and
     *consumed receives the number of input bytes processed.

     errors: error handler name forwarded to
     unicode_decode_call_errorhandler_writer(). */
  5103. PyObject *
  5104. PyUnicode_DecodeUTF16Stateful(const char *s,
  5105. Py_ssize_t size,
  5106. const char *errors,
  5107. int *byteorder,
  5108. Py_ssize_t *consumed)
  5109. {
  5110. const char *starts = s;
  5111. Py_ssize_t startinpos;
  5112. Py_ssize_t endinpos;
  5113. _PyUnicodeWriter writer;
  5114. const unsigned char *q, *e;
  5115. int bo = 0; /* assume native ordering by default */
  5116. int native_ordering;
  5117. const char *errmsg = "";
  5118. PyObject *errorHandler = NULL;
  5119. PyObject *exc = NULL;
  5120. const char *encoding;
  /* q is the read cursor, e is one past the last input byte. */
  5121. q = (const unsigned char *)s;
  5122. e = q + size;
  5123. if (byteorder)
  5124. bo = *byteorder;
  5125. /* Check for BOM marks (U+FEFF) in the input and adjust current
  5126. byte order setting accordingly. In native mode, the leading BOM
  5127. mark is skipped, in all other modes, it is copied to the output
  5128. stream as-is (giving a ZWNBSP character). */
  5129. if (bo == 0 && size >= 2) {
  /* The first two bytes are assembled as a little-endian integer, so a
     little-endian BOM reads as 0xFEFF and a big-endian BOM as 0xFFFE. */
  5130. const Py_UCS4 bom = (q[1] << 8) | q[0];
  5131. if (bom == 0xFEFF) {
  5132. q += 2;
  5133. bo = -1;
  5134. }
  5135. else if (bom == 0xFFFE) {
  5136. q += 2;
  5137. bo = 1;
  5138. }
  5139. if (byteorder)
  5140. *byteorder = bo;
  5141. }
  /* Nothing left to decode: empty input, or input that was only a BOM. */
  5142. if (q == e) {
  5143. if (consumed)
  5144. *consumed = size;
  5145. _Py_RETURN_UNICODE_EMPTY();
  5146. }
  /* Resolve bo against the host endianness; bo == 0 counts as native. */
  5147. #if PY_LITTLE_ENDIAN
  5148. native_ordering = bo <= 0;
  5149. encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
  5150. #else
  5151. native_ordering = bo >= 0;
  5152. encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
  5153. #endif
  5154. /* Note: size will always be longer than the resulting Unicode
  5155. character count normally. Error handler will take care of
  5156. resizing when needed. */
  5157. _PyUnicodeWriter_Init(&writer);
  5158. writer.min_length = (e - q + 1) / 2;
  5159. if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
  5160. goto onError;
  5161. while (1) {
  /* ch stays 0 when fewer than 2 bytes remain, which selects case 0 in the
     switch below. */
  5162. Py_UCS4 ch = 0;
  /* Bulk-decode with the stringlib decoder matching the writer's current
     buffer kind.  The decoder advances q and writer.pos and returns either a
     small status code (0..3, handled below) or a character for the default
     case to store. */
  5163. if (e - q >= 2) {
  5164. int kind = writer.kind;
  5165. if (kind == PyUnicode_1BYTE_KIND) {
  5166. if (PyUnicode_IS_ASCII(writer.buffer))
  5167. ch = asciilib_utf16_decode(&q, e,
  5168. (Py_UCS1*)writer.data, &writer.pos,
  5169. native_ordering);
  5170. else
  5171. ch = ucs1lib_utf16_decode(&q, e,
  5172. (Py_UCS1*)writer.data, &writer.pos,
  5173. native_ordering);
  5174. } else if (kind == PyUnicode_2BYTE_KIND) {
  5175. ch = ucs2lib_utf16_decode(&q, e,
  5176. (Py_UCS2*)writer.data, &writer.pos,
  5177. native_ordering);
  5178. } else {
  5179. assert(kind == PyUnicode_4BYTE_KIND);
  5180. ch = ucs4lib_utf16_decode(&q, e,
  5181. (Py_UCS4*)writer.data, &writer.pos,
  5182. native_ordering);
  5183. }
  5184. }
  5185. switch (ch)
  5186. {
  5187. case 0:
  5188. /* remaining byte at the end? (size should be even) */
  5189. if (q == e || consumed)
  5190. goto End;
  5191. errmsg = "truncated data";
  5192. startinpos = ((const char *)q) - starts;
  5193. endinpos = ((const char *)e) - starts;
  5194. break;
  5195. /* The remaining input chars are ignored if the callback
  5196. chooses to skip the input */
  5197. case 1:
  /* Step q back by one 2-byte unit so the incomplete sequence is either
     left unconsumed (stateful mode) or covered by the error span. */
  5198. q -= 2;
  5199. if (consumed)
  5200. goto End;
  5201. errmsg = "unexpected end of data";
  5202. startinpos = ((const char *)q) - starts;
  5203. endinpos = ((const char *)e) - starts;
  5204. break;
  5205. case 2:
  /* The error span is the single unit just before q. */
  5206. errmsg = "illegal encoding";
  5207. startinpos = ((const char *)q) - 2 - starts;
  5208. endinpos = startinpos + 2;
  5209. break;
  5210. case 3:
  /* The error span is the first of the two units just before q. */
  5211. errmsg = "illegal UTF-16 surrogate";
  5212. startinpos = ((const char *)q) - 4 - starts;
  5213. endinpos = startinpos + 2;
  5214. break;
  5215. default:
  /* A character the bulk decoder handed back: store it through
     _PyUnicodeWriter_WriteCharInline() and keep decoding. */
  5216. if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
  5217. goto onError;
  5218. continue;
  5219. }
  /* Only the error cases (0..3) reach this point. */
  5220. if (unicode_decode_call_errorhandler_writer(
  5221. errors,
  5222. &errorHandler,
  5223. encoding, errmsg,
  5224. &starts,
  5225. (const char **)&e,
  5226. &startinpos,
  5227. &endinpos,
  5228. &exc,
  5229. (const char **)&q,
  5230. &writer))
  5231. goto onError;
  5232. }
  /* Normal exit: report how far the input was read when requested. */
  5233. End:
  5234. if (consumed)
  5235. *consumed = (const char *)q-starts;
  5236. Py_XDECREF(errorHandler);
  5237. Py_XDECREF(exc);
  5238. return _PyUnicodeWriter_Finish(&writer);
  /* Failure exit: release the writer and the error-handler state. */
  5239. onError:
  5240. _PyUnicodeWriter_Dealloc(&writer);
  5241. Py_XDECREF(errorHandler);
  5242. Py_XDECREF(exc);
  5243. return NULL;
  5244. }
  /* Encode the string `str` as UTF-16 and return a new bytes object, or NULL
     on error.

     byteorder: 0 stores a BOM (0xFEFF, in host byte order) in front of the
     data and requests native ordering.  For any other value no BOM is written
     and native_ordering is derived from the sign of byteorder and the host
     endianness (negative selects little-endian, positive big-endian).

     errors: error handler name forwarded to
     unicode_encode_call_errorhandler() whenever the stringlib encoder stops
     before the end of the string (reported as "surrogates not allowed").

     The output buffer is sized up front at one 2-byte unit per character,
     plus one extra unit per character >= 0x10000 and one for the BOM.  It is
     grown when an error handler returns a longer replacement and trimmed to
     the bytes actually written at the end. */
  5245. PyObject *
  5246. _PyUnicode_EncodeUTF16(PyObject *str,
  5247. const char *errors,
  5248. int byteorder)
  5249. {
  5250. int kind;
  5251. const void *data;
  5252. Py_ssize_t len;
  5253. PyObject *v;
  5254. unsigned short *out;
  5255. Py_ssize_t pairs;
  5256. #if PY_BIG_ENDIAN
  5257. int native_ordering = byteorder >= 0;
  5258. #else
  5259. int native_ordering = byteorder <= 0;
  5260. #endif
  5261. const char *encoding;
  5262. Py_ssize_t nsize, pos;
  5263. PyObject *errorHandler = NULL;
  5264. PyObject *exc = NULL;
  5265. PyObject *rep = NULL;
  5266. if (!PyUnicode_Check(str)) {
  5267. PyErr_BadArgument();
  5268. return NULL;
  5269. }
  5270. kind = PyUnicode_KIND(str);
  5271. data = PyUnicode_DATA(str);
  5272. len = PyUnicode_GET_LENGTH(str);
  /* Count the characters that need two UTF-16 units; only 4-byte strings
     can contain characters >= 0x10000. */
  5273. pairs = 0;
  5274. if (kind == PyUnicode_4BYTE_KIND) {
  5275. const Py_UCS4 *in = (const Py_UCS4 *)data;
  5276. const Py_UCS4 *end = in + len;
  5277. while (in < end) {
  5278. if (*in++ >= 0x10000) {
  5279. pairs++;
  5280. }
  5281. }
  5282. }
  /* Guard the nsize * 2 computation below against overflow. */
  5283. if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
  5284. return PyErr_NoMemory();
  5285. }
  5286. nsize = len + pairs + (byteorder == 0);
  5287. v = PyBytes_FromStringAndSize(NULL, nsize * 2);
  5288. if (v == NULL) {
  5289. return NULL;
  5290. }
  5291. /* output buffer is 2-bytes aligned */
  5292. assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
  5293. out = (unsigned short *)PyBytes_AS_STRING(v);
  5294. if (byteorder == 0) {
  5295. *out++ = 0xFEFF;
  5296. }
  /* Both jumps to done below happen before any error handler has been
     called, so errorHandler and exc are still NULL there. */
  5297. if (len == 0) {
  5298. goto done;
  5299. }
  /* 1-byte strings are encoded in a single call with no error handling. */
  5300. if (kind == PyUnicode_1BYTE_KIND) {
  5301. ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
  5302. goto done;
  5303. }
  /* Encoding name handed to the error handler / exception. */
  5304. if (byteorder < 0) {
  5305. encoding = "utf-16-le";
  5306. }
  5307. else if (byteorder > 0) {
  5308. encoding = "utf-16-be";
  5309. }
  5310. else {
  5311. encoding = "utf-16";
  5312. }
  5313. pos = 0;
  5314. while (pos < len) {
  5315. Py_ssize_t newpos, repsize, moreunits;
  /* Encode as far as the stringlib encoder gets; its return value is added
     to pos, and pos < len afterwards means it stopped early. */
  5316. if (kind == PyUnicode_2BYTE_KIND) {
  5317. pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
  5318. &out, native_ordering);
  5319. }
  5320. else {
  5321. assert(kind == PyUnicode_4BYTE_KIND);
  5322. pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
  5323. &out, native_ordering);
  5324. }
  5325. if (pos == len)
  5326. break;
  /* Ask the error handler what to emit for the character at pos; newpos is
     where encoding resumes. */
  5327. rep = unicode_encode_call_errorhandler(
  5328. errors, &errorHandler,
  5329. encoding, "surrogates not allowed",
  5330. str, &exc, pos, pos + 1, &newpos);
  5331. if (!rep)
  5332. goto error;
  5333. if (PyBytes_Check(rep)) {
  /* A bytes replacement is copied verbatim, so it must be a whole number of
     2-byte units. */
  5334. repsize = PyBytes_GET_SIZE(rep);
  5335. if (repsize & 1) {
  5336. raise_encode_exception(&exc, encoding,
  5337. str, pos, pos + 1,
  5338. "surrogates not allowed");
  5339. goto error;
  5340. }
  5341. moreunits = repsize / 2;
  5342. }
  5343. else {
  /* A string replacement is re-encoded with the 1-byte encoder below, so
     only ASCII replacements are accepted. */
  5344. assert(PyUnicode_Check(rep));
  5345. moreunits = repsize = PyUnicode_GET_LENGTH(rep);
  5346. if (!PyUnicode_IS_ASCII(rep)) {
  5347. raise_encode_exception(&exc, encoding,
  5348. str, pos, pos + 1,
  5349. "surrogates not allowed");
  5350. goto error;
  5351. }
  5352. }
  /* Net growth in units: what the replacement adds, minus the input
     characters the handler moved past (each already has a unit reserved). */
  5353. moreunits += pos - newpos;
  5354. pos = newpos;
  5355. /* two bytes are reserved for each surrogate */
  5356. if (moreunits > 0) {
  /* Resizing may move the buffer: remember the write offset and re-derive
     out afterwards. */
  5357. Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
  5358. if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
  5359. /* integer overflow */
  5360. PyErr_NoMemory();
  5361. goto error;
  5362. }
  5363. if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
  5364. goto error;
  5365. out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
  5366. }
  5367. if (PyBytes_Check(rep)) {
  5368. memcpy(out, PyBytes_AS_STRING(rep), repsize);
  5369. out += repsize / 2;
  5370. } else /* rep is unicode */ {
  5371. assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
  5372. ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
  5373. &out, native_ordering);
  5374. }
  5375. Py_CLEAR(rep);
  5376. }
  5377. /* Cut back to size actually needed. This is necessary for, for example,
  5378. encoding of a string containing isolated surrogates and the 'ignore' handler
  5379. is used. */
  5380. nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
  5381. if (nsize != PyBytes_GET_SIZE(v))
  5382. _PyBytes_Resize(&v, nsize);
  5383. Py_XDECREF(errorHandler);
  5384. Py_XDECREF(exc);
  5385. done:
  5386. return v;
  /* Failure exit: drop the pending replacement, the handler state and the
     partially filled output. */
  5387. error:
  5388. Py_XDECREF(rep);
  5389. Py_XDECREF(errorHandler);
  5390. Py_XDECREF(exc);
  5391. Py_XDECREF(v);
  5392. return NULL;
  /* NOTE(review): nothing in this function defines STORECHAR, so this #undef
     looks like a leftover - confirm against the rest of the file before
     removing it. */
  5393. #undef STORECHAR
  5394. }
  5395. PyObject *
  5396. PyUnicode_AsUTF16String(PyObject *unicode)
  5397. {
  5398. return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
  5399. }
  5400. /* --- Unicode Escape Codec ----------------------------------------------- */
  /* Decode the `size` bytes at `s` as "unicode_escape" data.

     Returns the result of _PyUnicodeWriter_Finish(), or NULL after taking the
     onError path.

     errors: error handler name forwarded to
     unicode_decode_call_errorhandler_writer().

     consumed: optional out.  When non-NULL, an escape sequence cut off by the
     end of the input is not an error: decoding stops there and *consumed is
     set to the offset of its backslash.  *consumed is only written on that
     path and for empty input; it is left untouched when the whole input is
     decoded.

     first_invalid_escape: required out (it is dereferenced unconditionally).
     Set to NULL on entry, then to the position of the first unrecognised
     escape character, or of the first octal escape whose value exceeds 0377,
     if there is one.  Decoding continues after such an escape. */
  5401. PyObject *
  5402. _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
  5403. Py_ssize_t size,
  5404. const char *errors,
  5405. Py_ssize_t *consumed,
  5406. const char **first_invalid_escape)
  5407. {
  5408. const char *starts = s;
  5409. _PyUnicodeWriter writer;
  5410. const char *end;
  5411. PyObject *errorHandler = NULL;
  5412. PyObject *exc = NULL;
  5413. _PyUnicode_Name_CAPI *ucnhash_capi;
  5414. PyInterpreterState *interp = _PyInterpreterState_Get();
  5415. // so we can remember if we've seen an invalid escape char or not
  5416. *first_invalid_escape = NULL;
  5417. if (size == 0) {
  5418. if (consumed) {
  5419. *consumed = 0;
  5420. }
  5421. _Py_RETURN_UNICODE_EMPTY();
  5422. }
  5423. /* Escaped strings will always be longer than the resulting
  5424. Unicode string, so we start with size here and then reduce the
  5425. length after conversion to the true value.
  5426. (but if the error callback returns a long replacement string
  5427. we'll have to allocate more space) */
  5428. _PyUnicodeWriter_Init(&writer);
  5429. writer.min_length = size;
  5430. if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
  5431. goto onError;
  5432. }
  5433. end = s + size;
  5434. while (s < end) {
  5435. unsigned char c = (unsigned char) *s++;
  5436. Py_UCS4 ch;
  5437. int count;
  5438. const char *message;
  /* WRITE_ASCII_CHAR stores an ASCII character directly into the writer's
     buffer; it relies on the buffer having been pre-sized (see the asserts).
     The macro is #undef'd at the bottom of the loop body. */
  5439. #define WRITE_ASCII_CHAR(ch) \
  5440. do { \
  5441. assert(ch <= 127); \
  5442. assert(writer.pos < writer.size); \
  5443. PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
  5444. } while(0)
  /* WRITE_CHAR stores any character: directly when it fits the writer's
     current maxchar, otherwise through _PyUnicodeWriter_WriteCharInline(),
     jumping to onError if that fails. */
  5445. #define WRITE_CHAR(ch) \
  5446. do { \
  5447. if (ch <= writer.maxchar) { \
  5448. assert(writer.pos < writer.size); \
  5449. PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
  5450. } \
  5451. else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
  5452. goto onError; \
  5453. } \
  5454. } while(0)
  5455. /* Non-escape characters are interpreted as Unicode ordinals */
  5456. if (c != '\\') {
  5457. WRITE_CHAR(c);
  5458. continue;
  5459. }
  /* Offset of the backslash that starts this escape; used as the start of
     the error span and as the *consumed value for incomplete escapes. */
  5460. Py_ssize_t startinpos = s - starts - 1;
  5461. /* \ - Escapes */
  5462. if (s >= end) {
  5463. message = "\\ at end of string";
  5464. goto incomplete;
  5465. }
  5466. c = (unsigned char) *s++;
  5467. assert(writer.pos < writer.size);
  5468. switch (c) {
  5469. /* \x escapes */
  /* A backslash followed by a newline produces no output. */
  5470. case '\n': continue;
  5471. case '\\': WRITE_ASCII_CHAR('\\'); continue;
  5472. case '\'': WRITE_ASCII_CHAR('\''); continue;
  5473. case '\"': WRITE_ASCII_CHAR('\"'); continue;
  5474. case 'b': WRITE_ASCII_CHAR('\b'); continue;
  5475. /* FF */
  5476. case 'f': WRITE_ASCII_CHAR('\014'); continue;
  5477. case 't': WRITE_ASCII_CHAR('\t'); continue;
  5478. case 'n': WRITE_ASCII_CHAR('\n'); continue;
  5479. case 'r': WRITE_ASCII_CHAR('\r'); continue;
  5480. /* VT */
  5481. case 'v': WRITE_ASCII_CHAR('\013'); continue;
  5482. /* BEL, not classic C */
  5483. case 'a': WRITE_ASCII_CHAR('\007'); continue;
  5484. /* \OOO (octal) escapes */
  /* One to three octal digits are accumulated into ch. */
  5485. case '0': case '1': case '2': case '3':
  5486. case '4': case '5': case '6': case '7':
  5487. ch = c - '0';
  5488. if (s < end && '0' <= *s && *s <= '7') {
  5489. ch = (ch<<3) + *s++ - '0';
  5490. if (s < end && '0' <= *s && *s <= '7') {
  5491. ch = (ch<<3) + *s++ - '0';
  5492. }
  5493. }
  /* A value above 0377 is still written, but is recorded as the first
     invalid escape if none was seen before. */
  5494. if (ch > 0377) {
  5495. if (*first_invalid_escape == NULL) {
  5496. *first_invalid_escape = s-3; /* Back up 3 chars, since we've
  5497. already incremented s. */
  5498. }
  5499. }
  5500. WRITE_CHAR(ch);
  5501. continue;
  5502. /* hex escapes */
  5503. /* \xXX */
  5504. case 'x':
  5505. count = 2;
  5506. message = "truncated \\xXX escape";
  5507. goto hexescape;
  5508. /* \uXXXX */
  5509. case 'u':
  5510. count = 4;
  5511. message = "truncated \\uXXXX escape";
  5512. goto hexescape;
  5513. /* \UXXXXXXXX */
  5514. case 'U':
  5515. count = 8;
  5516. message = "truncated \\UXXXXXXXX escape";
  /* Shared tail of \x, \u and \U: read exactly `count` hex digits into ch.
     Running out of input is "incomplete"; a non-hex character is an error
     reported with the current `message`. */
  5517. hexescape:
  5518. for (ch = 0; count; ++s, --count) {
  5519. if (s >= end) {
  5520. goto incomplete;
  5521. }
  5522. c = (unsigned char)*s;
  5523. ch <<= 4;
  5524. if (c >= '0' && c <= '9') {
  5525. ch += c - '0';
  5526. }
  5527. else if (c >= 'a' && c <= 'f') {
  5528. ch += c - ('a' - 10);
  5529. }
  5530. else if (c >= 'A' && c <= 'F') {
  5531. ch += c - ('A' - 10);
  5532. }
  5533. else {
  5534. goto error;
  5535. }
  5536. }
  5537. /* when we get here, ch is a 32-bit unicode character */
  5538. if (ch > MAX_UNICODE) {
  5539. message = "illegal Unicode character";
  5540. goto error;
  5541. }
  5542. WRITE_CHAR(ch);
  5543. continue;
  5544. /* \N{name} */
  5545. case 'N':
  /* The name-lookup C API is imported on first use and cached on the
     interpreter state. */
  5546. ucnhash_capi = interp->unicode.ucnhash_capi;
  5547. if (ucnhash_capi == NULL) {
  5548. /* load the unicode data module */
  5549. ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
  5550. PyUnicodeData_CAPSULE_NAME, 1);
  5551. if (ucnhash_capi == NULL) {
  5552. PyErr_SetString(
  5553. PyExc_UnicodeError,
  5554. "\\N escapes not supported (can't load unicodedata module)"
  5555. );
  5556. goto onError;
  5557. }
  5558. interp->unicode.ucnhash_capi = ucnhash_capi;
  5559. }
  5560. message = "malformed \\N character escape";
  5561. if (s >= end) {
  5562. goto incomplete;
  5563. }
  5564. if (*s == '{') {
  5565. const char *start = ++s;
  5566. size_t namelen;
  5567. /* look for the closing brace */
  5568. while (s < end && *s != '}')
  5569. s++;
  5570. if (s >= end) {
  5571. goto incomplete;
  5572. }
  5573. namelen = s - start;
  5574. if (namelen) {
  5575. /* found a name. look it up in the unicode database */
  5576. s++;
  5577. ch = 0xffffffff; /* in case 'getcode' messes up */
  5578. if (namelen <= INT_MAX &&
  5579. ucnhash_capi->getcode(start, (int)namelen,
  5580. &ch, 0)) {
  5581. assert(ch <= MAX_UNICODE);
  5582. WRITE_CHAR(ch);
  5583. continue;
  5584. }
  5585. message = "unknown Unicode character name";
  5586. }
  5587. }
  /* Reached for a missing '{', an empty name, or a failed lookup. */
  5588. goto error;
  5589. default:
  /* Unrecognised escape: remember the first one and emit the backslash and
     the character unchanged. */
  5590. if (*first_invalid_escape == NULL) {
  5591. *first_invalid_escape = s-1; /* Back up one char, since we've
  5592. already incremented s. */
  5593. }
  5594. WRITE_ASCII_CHAR('\\');
  5595. WRITE_CHAR(c);
  5596. continue;
  5597. }
  /* The escape ran into the end of the input.  In stateful mode report where
     it started and stop; otherwise fall through and treat it as an error. */
  5598. incomplete:
  5599. if (consumed) {
  5600. *consumed = startinpos;
  5601. break;
  5602. }
  /* Hand the span [startinpos, endinpos) to the error handler.  min_length
     is refreshed first to the characters written so far plus the input bytes
     still to be read. */
  5603. error:;
  5604. Py_ssize_t endinpos = s-starts;
  5605. writer.min_length = end - s + writer.pos;
  5606. if (unicode_decode_call_errorhandler_writer(
  5607. errors, &errorHandler,
  5608. "unicodeescape", message,
  5609. &starts, &end, &startinpos, &endinpos, &exc, &s,
  5610. &writer)) {
  5611. goto onError;
  5612. }
  5613. assert(end - s <= writer.size - writer.pos);
  5614. #undef WRITE_ASCII_CHAR
  5615. #undef WRITE_CHAR
  5616. }
  5617. Py_XDECREF(errorHandler);
  5618. Py_XDECREF(exc);
  5619. return _PyUnicodeWriter_Finish(&writer);
  /* Failure exit: release the writer and the error-handler state. */
  5620. onError:
  5621. _PyUnicodeWriter_Dealloc(&writer);
  5622. Py_XDECREF(errorHandler);
  5623. Py_XDECREF(exc);
  5624. return NULL;
  5625. }
  5626. PyObject *
  5627. _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
  5628. Py_ssize_t size,
  5629. const char *errors,
  5630. Py_ssize_t *consumed)
  5631. {
  5632. const char *first_invalid_escape;
  5633. PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
  5634. consumed,
  5635. &first_invalid_escape);
  5636. if (result == NULL)
  5637. return NULL;
  5638. if (first_invalid_escape != NULL) {
  5639. unsigned char c = *first_invalid_escape;
  5640. if ('4' <= c && c <= '7') {
  5641. if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
  5642. "invalid octal escape sequence '\\%.3s'",
  5643. first_invalid_escape) < 0)
  5644. {
  5645. Py_DECREF(result);
  5646. return NULL;
  5647. }
  5648. }
  5649. else {
  5650. if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
  5651. "invalid escape sequence '\\%c'",
  5652. c) < 0)
  5653. {
  5654. Py_DECREF(result);
  5655. return NULL;
  5656. }
  5657. }
  5658. }
  5659. return result;
  5660. }
  5661. PyObject *
  5662. PyUnicode_DecodeUnicodeEscape(const char *s,
  5663. Py_ssize_t size,
  5664. const char *errors)
  5665. {
  5666. return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
  5667. }
  5668. /* Return a Unicode-Escape string version of the Unicode object. */
  5669. PyObject *
  5670. PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
  5671. {
  5672. Py_ssize_t i, len;
  5673. PyObject *repr;
  5674. char *p;
  5675. int kind;
  5676. const void *data;
  5677. Py_ssize_t expandsize;
  5678. /* Initial allocation is based on the longest-possible character
  5679. escape.
  5680. For UCS1 strings it's '\xxx', 4 bytes per source character.
  5681. For UCS2 strings it's '\uxxxx', 6 bytes per source character.
  5682. For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
  5683. */
  5684. if (!PyUnicode_Check(unicode)) {
  5685. PyErr_BadArgument();
  5686. return NULL;
  5687. }
  5688. len = PyUnicode_GET_LENGTH(unicode);
  5689. if (len == 0) {
  5690. return PyBytes_FromStringAndSize(NULL, 0);
  5691. }
  5692. kind = PyUnicode_KIND(unicode);
  5693. data = PyUnicode_DATA(unicode);
  5694. /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
  5695. bytes, and 1 byte characters 4. */
  5696. expandsize = kind * 2 + 2;
  5697. if (len > PY_SSIZE_T_MAX / expandsize) {
  5698. return PyErr_NoMemory();
  5699. }
  5700. repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
  5701. if (repr == NULL) {
  5702. return NULL;
  5703. }
  5704. p = PyBytes_AS_STRING(repr);
  5705. for (i = 0; i < len; i++) {
  5706. Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  5707. /* U+0000-U+00ff range */
  5708. if (ch < 0x100) {
  5709. if (ch >= ' ' && ch < 127) {
  5710. if (ch != '\\') {
  5711. /* Copy printable US ASCII as-is */
  5712. *p++ = (char) ch;
  5713. }
  5714. /* Escape backslashes */
  5715. else {
  5716. *p++ = '\\';
  5717. *p++ = '\\';
  5718. }
  5719. }
  5720. /* Map special whitespace to '\t', \n', '\r' */
  5721. else if (ch == '\t') {
  5722. *p++ = '\\';
  5723. *p++ = 't';
  5724. }
  5725. else if (ch == '\n') {
  5726. *p++ = '\\';
  5727. *p++ = 'n';
  5728. }
  5729. else if (ch == '\r') {
  5730. *p++ = '\\';
  5731. *p++ = 'r';
  5732. }
  5733. /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
  5734. else {
  5735. *p++ = '\\';
  5736. *p++ = 'x';
  5737. *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
  5738. *p++ = Py_hexdigits[ch & 0x000F];
  5739. }
  5740. }
  5741. /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
  5742. else if (ch < 0x10000) {
  5743. *p++ = '\\';
  5744. *p++ = 'u';
  5745. *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
  5746. *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
  5747. *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
  5748. *p++ = Py_hexdigits[ch & 0x000F];
  5749. }
  5750. /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
  5751. else {
  5752. /* Make sure that the first two digits are zero */
  5753. assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
  5754. *p++ = '\\';
  5755. *p++ = 'U';
  5756. *p++ = '0';
  5757. *p++ = '0';
  5758. *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
  5759. *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
  5760. *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
  5761. *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
  5762. *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
  5763. *p++ = Py_hexdigits[ch & 0x0000000F];
  5764. }
  5765. }
  5766. assert(p - PyBytes_AS_STRING(repr) > 0);
  5767. if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
  5768. return NULL;
  5769. }
  5770. return repr;
  5771. }
  5772. /* --- Raw Unicode Escape Codec ------------------------------------------- */
  5773. PyObject *
  5774. _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
  5775. Py_ssize_t size,
  5776. const char *errors,
  5777. Py_ssize_t *consumed)
  5778. {
  5779. const char *starts = s;
  5780. _PyUnicodeWriter writer;
  5781. const char *end;
  5782. PyObject *errorHandler = NULL;
  5783. PyObject *exc = NULL;
  5784. if (size == 0) {
  5785. if (consumed) {
  5786. *consumed = 0;
  5787. }
  5788. _Py_RETURN_UNICODE_EMPTY();
  5789. }
  5790. /* Escaped strings will always be longer than the resulting
  5791. Unicode string, so we start with size here and then reduce the
  5792. length after conversion to the true value. (But decoding error
  5793. handler might have to resize the string) */
  5794. _PyUnicodeWriter_Init(&writer);
  5795. writer.min_length = size;
  5796. if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
  5797. goto onError;
  5798. }
  5799. end = s + size;
  5800. while (s < end) {
  5801. unsigned char c = (unsigned char) *s++;
  5802. Py_UCS4 ch;
  5803. int count;
  5804. const char *message;
  5805. #define WRITE_CHAR(ch) \
  5806. do { \
  5807. if (ch <= writer.maxchar) { \
  5808. assert(writer.pos < writer.size); \
  5809. PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
  5810. } \
  5811. else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
  5812. goto onError; \
  5813. } \
  5814. } while(0)
  5815. /* Non-escape characters are interpreted as Unicode ordinals */
  5816. if (c != '\\' || (s >= end && !consumed)) {
  5817. WRITE_CHAR(c);
  5818. continue;
  5819. }
  5820. Py_ssize_t startinpos = s - starts - 1;
  5821. /* \ - Escapes */
  5822. if (s >= end) {
  5823. assert(consumed);
  5824. // Set message to silent compiler warning.
  5825. // Actually it is never used.
  5826. message = "\\ at end of string";
  5827. goto incomplete;
  5828. }
  5829. c = (unsigned char) *s++;
  5830. if (c == 'u') {
  5831. count = 4;
  5832. message = "truncated \\uXXXX escape";
  5833. }
  5834. else if (c == 'U') {
  5835. count = 8;
  5836. message = "truncated \\UXXXXXXXX escape";
  5837. }
  5838. else {
  5839. assert(writer.pos < writer.size);
  5840. PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
  5841. WRITE_CHAR(c);
  5842. continue;
  5843. }
  5844. /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
  5845. for (ch = 0; count; ++s, --count) {
  5846. if (s >= end) {
  5847. goto incomplete;
  5848. }
  5849. c = (unsigned char)*s;
  5850. ch <<= 4;
  5851. if (c >= '0' && c <= '9') {
  5852. ch += c - '0';
  5853. }
  5854. else if (c >= 'a' && c <= 'f') {
  5855. ch += c - ('a' - 10);
  5856. }
  5857. else if (c >= 'A' && c <= 'F') {
  5858. ch += c - ('A' - 10);
  5859. }
  5860. else {
  5861. goto error;
  5862. }
  5863. }
  5864. if (ch > MAX_UNICODE) {
  5865. message = "\\Uxxxxxxxx out of range";
  5866. goto error;
  5867. }
  5868. WRITE_CHAR(ch);
  5869. continue;
  5870. incomplete:
  5871. if (consumed) {
  5872. *consumed = startinpos;
  5873. break;
  5874. }
  5875. error:;
  5876. Py_ssize_t endinpos = s-starts;
  5877. writer.min_length = end - s + writer.pos;
  5878. if (unicode_decode_call_errorhandler_writer(
  5879. errors, &errorHandler,
  5880. "rawunicodeescape", message,
  5881. &starts, &end, &startinpos, &endinpos, &exc, &s,
  5882. &writer)) {
  5883. goto onError;
  5884. }
  5885. assert(end - s <= writer.size - writer.pos);
  5886. #undef WRITE_CHAR
  5887. }
  5888. Py_XDECREF(errorHandler);
  5889. Py_XDECREF(exc);
  5890. return _PyUnicodeWriter_Finish(&writer);
  5891. onError:
  5892. _PyUnicodeWriter_Dealloc(&writer);
  5893. Py_XDECREF(errorHandler);
  5894. Py_XDECREF(exc);
  5895. return NULL;
  5896. }
  5897. PyObject *
  5898. PyUnicode_DecodeRawUnicodeEscape(const char *s,
  5899. Py_ssize_t size,
  5900. const char *errors)
  5901. {
  5902. return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
  5903. }
  5904. PyObject *
  5905. PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
  5906. {
  5907. PyObject *repr;
  5908. char *p;
  5909. Py_ssize_t expandsize, pos;
  5910. int kind;
  5911. const void *data;
  5912. Py_ssize_t len;
  5913. if (!PyUnicode_Check(unicode)) {
  5914. PyErr_BadArgument();
  5915. return NULL;
  5916. }
  5917. kind = PyUnicode_KIND(unicode);
  5918. data = PyUnicode_DATA(unicode);
  5919. len = PyUnicode_GET_LENGTH(unicode);
  5920. if (kind == PyUnicode_1BYTE_KIND) {
  5921. return PyBytes_FromStringAndSize(data, len);
  5922. }
  5923. /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
  5924. bytes, and 1 byte characters 4. */
  5925. expandsize = kind * 2 + 2;
  5926. if (len > PY_SSIZE_T_MAX / expandsize) {
  5927. return PyErr_NoMemory();
  5928. }
  5929. repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
  5930. if (repr == NULL) {
  5931. return NULL;
  5932. }
  5933. if (len == 0) {
  5934. return repr;
  5935. }
  5936. p = PyBytes_AS_STRING(repr);
  5937. for (pos = 0; pos < len; pos++) {
  5938. Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
  5939. /* U+0000-U+00ff range: Copy 8-bit characters as-is */
  5940. if (ch < 0x100) {
  5941. *p++ = (char) ch;
  5942. }
  5943. /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
  5944. else if (ch < 0x10000) {
  5945. *p++ = '\\';
  5946. *p++ = 'u';
  5947. *p++ = Py_hexdigits[(ch >> 12) & 0xf];
  5948. *p++ = Py_hexdigits[(ch >> 8) & 0xf];
  5949. *p++ = Py_hexdigits[(ch >> 4) & 0xf];
  5950. *p++ = Py_hexdigits[ch & 15];
  5951. }
  5952. /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
  5953. else {
  5954. assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
  5955. *p++ = '\\';
  5956. *p++ = 'U';
  5957. *p++ = '0';
  5958. *p++ = '0';
  5959. *p++ = Py_hexdigits[(ch >> 20) & 0xf];
  5960. *p++ = Py_hexdigits[(ch >> 16) & 0xf];
  5961. *p++ = Py_hexdigits[(ch >> 12) & 0xf];
  5962. *p++ = Py_hexdigits[(ch >> 8) & 0xf];
  5963. *p++ = Py_hexdigits[(ch >> 4) & 0xf];
  5964. *p++ = Py_hexdigits[ch & 15];
  5965. }
  5966. }
  5967. assert(p > PyBytes_AS_STRING(repr));
  5968. if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
  5969. return NULL;
  5970. }
  5971. return repr;
  5972. }
  5973. /* --- Latin-1 Codec ------------------------------------------------------ */
  5974. PyObject *
  5975. PyUnicode_DecodeLatin1(const char *s,
  5976. Py_ssize_t size,
  5977. const char *errors)
  5978. {
  5979. /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
  5980. return _PyUnicode_FromUCS1((const unsigned char*)s, size);
  5981. }
  5982. /* create or adjust a UnicodeEncodeError */
  5983. static void
  5984. make_encode_exception(PyObject **exceptionObject,
  5985. const char *encoding,
  5986. PyObject *unicode,
  5987. Py_ssize_t startpos, Py_ssize_t endpos,
  5988. const char *reason)
  5989. {
  5990. if (*exceptionObject == NULL) {
  5991. *exceptionObject = PyObject_CallFunction(
  5992. PyExc_UnicodeEncodeError, "sOnns",
  5993. encoding, unicode, startpos, endpos, reason);
  5994. }
  5995. else {
  5996. if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
  5997. goto onError;
  5998. if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
  5999. goto onError;
  6000. if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
  6001. goto onError;
  6002. return;
  6003. onError:
  6004. Py_CLEAR(*exceptionObject);
  6005. }
  6006. }
  6007. /* raises a UnicodeEncodeError */
  6008. static void
  6009. raise_encode_exception(PyObject **exceptionObject,
  6010. const char *encoding,
  6011. PyObject *unicode,
  6012. Py_ssize_t startpos, Py_ssize_t endpos,
  6013. const char *reason)
  6014. {
  6015. make_encode_exception(exceptionObject,
  6016. encoding, unicode, startpos, endpos, reason);
  6017. if (*exceptionObject != NULL)
  6018. PyCodec_StrictErrors(*exceptionObject);
  6019. }
  6020. /* error handling callback helper:
  6021. build arguments, call the callback and check the arguments,
  6022. put the result into newpos and return the replacement string, which
  6023. has to be freed by the caller */
  6024. static PyObject *
  6025. unicode_encode_call_errorhandler(const char *errors,
  6026. PyObject **errorHandler,
  6027. const char *encoding, const char *reason,
  6028. PyObject *unicode, PyObject **exceptionObject,
  6029. Py_ssize_t startpos, Py_ssize_t endpos,
  6030. Py_ssize_t *newpos)
  6031. {
  6032. static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
  6033. Py_ssize_t len;
  6034. PyObject *restuple;
  6035. PyObject *resunicode;
  6036. if (*errorHandler == NULL) {
  6037. *errorHandler = PyCodec_LookupError(errors);
  6038. if (*errorHandler == NULL)
  6039. return NULL;
  6040. }
  6041. len = PyUnicode_GET_LENGTH(unicode);
  6042. make_encode_exception(exceptionObject,
  6043. encoding, unicode, startpos, endpos, reason);
  6044. if (*exceptionObject == NULL)
  6045. return NULL;
  6046. restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
  6047. if (restuple == NULL)
  6048. return NULL;
  6049. if (!PyTuple_Check(restuple)) {
  6050. PyErr_SetString(PyExc_TypeError, &argparse[3]);
  6051. Py_DECREF(restuple);
  6052. return NULL;
  6053. }
  6054. if (!PyArg_ParseTuple(restuple, argparse,
  6055. &resunicode, newpos)) {
  6056. Py_DECREF(restuple);
  6057. return NULL;
  6058. }
  6059. if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
  6060. PyErr_SetString(PyExc_TypeError, &argparse[3]);
  6061. Py_DECREF(restuple);
  6062. return NULL;
  6063. }
  6064. if (*newpos<0)
  6065. *newpos = len + *newpos;
  6066. if (*newpos<0 || *newpos>len) {
  6067. PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
  6068. Py_DECREF(restuple);
  6069. return NULL;
  6070. }
  6071. Py_INCREF(resunicode);
  6072. Py_DECREF(restuple);
  6073. return resunicode;
  6074. }
  6075. static PyObject *
  6076. unicode_encode_ucs1(PyObject *unicode,
  6077. const char *errors,
  6078. const Py_UCS4 limit)
  6079. {
  6080. /* input state */
  6081. Py_ssize_t pos=0, size;
  6082. int kind;
  6083. const void *data;
  6084. /* pointer into the output */
  6085. char *str;
  6086. const char *encoding = (limit == 256) ? "latin-1" : "ascii";
  6087. const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
  6088. PyObject *error_handler_obj = NULL;
  6089. PyObject *exc = NULL;
  6090. _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
  6091. PyObject *rep = NULL;
  6092. /* output object */
  6093. _PyBytesWriter writer;
  6094. size = PyUnicode_GET_LENGTH(unicode);
  6095. kind = PyUnicode_KIND(unicode);
  6096. data = PyUnicode_DATA(unicode);
  6097. /* allocate enough for a simple encoding without
  6098. replacements, if we need more, we'll resize */
  6099. if (size == 0)
  6100. return PyBytes_FromStringAndSize(NULL, 0);
  6101. _PyBytesWriter_Init(&writer);
  6102. str = _PyBytesWriter_Alloc(&writer, size);
  6103. if (str == NULL)
  6104. return NULL;
  6105. while (pos < size) {
  6106. Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
  6107. /* can we encode this? */
  6108. if (ch < limit) {
  6109. /* no overflow check, because we know that the space is enough */
  6110. *str++ = (char)ch;
  6111. ++pos;
  6112. }
  6113. else {
  6114. Py_ssize_t newpos, i;
  6115. /* startpos for collecting unencodable chars */
  6116. Py_ssize_t collstart = pos;
  6117. Py_ssize_t collend = collstart + 1;
  6118. /* find all unecodable characters */
  6119. while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
  6120. ++collend;
  6121. /* Only overallocate the buffer if it's not the last write */
  6122. writer.overallocate = (collend < size);
  6123. /* cache callback name lookup (if not done yet, i.e. it's the first error) */
  6124. if (error_handler == _Py_ERROR_UNKNOWN)
  6125. error_handler = _Py_GetErrorHandler(errors);
  6126. switch (error_handler) {
  6127. case _Py_ERROR_STRICT:
  6128. raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
  6129. goto onError;
  6130. case _Py_ERROR_REPLACE:
  6131. memset(str, '?', collend - collstart);
  6132. str += (collend - collstart);
  6133. /* fall through */
  6134. case _Py_ERROR_IGNORE:
  6135. pos = collend;
  6136. break;
  6137. case _Py_ERROR_BACKSLASHREPLACE:
  6138. /* subtract preallocated bytes */
  6139. writer.min_size -= (collend - collstart);
  6140. str = backslashreplace(&writer, str,
  6141. unicode, collstart, collend);
  6142. if (str == NULL)
  6143. goto onError;
  6144. pos = collend;
  6145. break;
  6146. case _Py_ERROR_XMLCHARREFREPLACE:
  6147. /* subtract preallocated bytes */
  6148. writer.min_size -= (collend - collstart);
  6149. str = xmlcharrefreplace(&writer, str,
  6150. unicode, collstart, collend);
  6151. if (str == NULL)
  6152. goto onError;
  6153. pos = collend;
  6154. break;
  6155. case _Py_ERROR_SURROGATEESCAPE:
  6156. for (i = collstart; i < collend; ++i) {
  6157. ch = PyUnicode_READ(kind, data, i);
  6158. if (ch < 0xdc80 || 0xdcff < ch) {
  6159. /* Not a UTF-8b surrogate */
  6160. break;
  6161. }
  6162. *str++ = (char)(ch - 0xdc00);
  6163. ++pos;
  6164. }
  6165. if (i >= collend)
  6166. break;
  6167. collstart = pos;
  6168. assert(collstart != collend);
  6169. /* fall through */
  6170. default:
  6171. rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
  6172. encoding, reason, unicode, &exc,
  6173. collstart, collend, &newpos);
  6174. if (rep == NULL)
  6175. goto onError;
  6176. if (newpos < collstart) {
  6177. writer.overallocate = 1;
  6178. str = _PyBytesWriter_Prepare(&writer, str,
  6179. collstart - newpos);
  6180. if (str == NULL)
  6181. goto onError;
  6182. }
  6183. else {
  6184. /* subtract preallocated bytes */
  6185. writer.min_size -= newpos - collstart;
  6186. /* Only overallocate the buffer if it's not the last write */
  6187. writer.overallocate = (newpos < size);
  6188. }
  6189. if (PyBytes_Check(rep)) {
  6190. /* Directly copy bytes result to output. */
  6191. str = _PyBytesWriter_WriteBytes(&writer, str,
  6192. PyBytes_AS_STRING(rep),
  6193. PyBytes_GET_SIZE(rep));
  6194. }
  6195. else {
  6196. assert(PyUnicode_Check(rep));
  6197. if (limit == 256 ?
  6198. PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
  6199. !PyUnicode_IS_ASCII(rep))
  6200. {
  6201. /* Not all characters are smaller than limit */
  6202. raise_encode_exception(&exc, encoding, unicode,
  6203. collstart, collend, reason);
  6204. goto onError;
  6205. }
  6206. assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
  6207. str = _PyBytesWriter_WriteBytes(&writer, str,
  6208. PyUnicode_DATA(rep),
  6209. PyUnicode_GET_LENGTH(rep));
  6210. }
  6211. if (str == NULL)
  6212. goto onError;
  6213. pos = newpos;
  6214. Py_CLEAR(rep);
  6215. }
  6216. /* If overallocation was disabled, ensure that it was the last
  6217. write. Otherwise, we missed an optimization */
  6218. assert(writer.overallocate || pos == size);
  6219. }
  6220. }
  6221. Py_XDECREF(error_handler_obj);
  6222. Py_XDECREF(exc);
  6223. return _PyBytesWriter_Finish(&writer, str);
  6224. onError:
  6225. Py_XDECREF(rep);
  6226. _PyBytesWriter_Dealloc(&writer);
  6227. Py_XDECREF(error_handler_obj);
  6228. Py_XDECREF(exc);
  6229. return NULL;
  6230. }
  6231. PyObject *
  6232. _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
  6233. {
  6234. if (!PyUnicode_Check(unicode)) {
  6235. PyErr_BadArgument();
  6236. return NULL;
  6237. }
  6238. /* Fast path: if it is a one-byte string, construct
  6239. bytes object directly. */
  6240. if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
  6241. return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
  6242. PyUnicode_GET_LENGTH(unicode));
  6243. /* Non-Latin-1 characters present. Defer to above function to
  6244. raise the exception. */
  6245. return unicode_encode_ucs1(unicode, errors, 256);
  6246. }
  6247. PyObject*
  6248. PyUnicode_AsLatin1String(PyObject *unicode)
  6249. {
  6250. return _PyUnicode_AsLatin1String(unicode, NULL);
  6251. }
  6252. /* --- 7-bit ASCII Codec -------------------------------------------------- */
  6253. PyObject *
  6254. PyUnicode_DecodeASCII(const char *s,
  6255. Py_ssize_t size,
  6256. const char *errors)
  6257. {
  6258. const char *starts = s;
  6259. const char *e = s + size;
  6260. PyObject *error_handler_obj = NULL;
  6261. PyObject *exc = NULL;
  6262. _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
  6263. if (size == 0)
  6264. _Py_RETURN_UNICODE_EMPTY();
  6265. /* ASCII is equivalent to the first 128 ordinals in Unicode. */
  6266. if (size == 1 && (unsigned char)s[0] < 128) {
  6267. return get_latin1_char((unsigned char)s[0]);
  6268. }
  6269. // Shortcut for simple case
  6270. PyObject *u = PyUnicode_New(size, 127);
  6271. if (u == NULL) {
  6272. return NULL;
  6273. }
  6274. Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
  6275. if (outpos == size) {
  6276. return u;
  6277. }
  6278. _PyUnicodeWriter writer;
  6279. _PyUnicodeWriter_InitWithBuffer(&writer, u);
  6280. writer.pos = outpos;
  6281. s += outpos;
  6282. int kind = writer.kind;
  6283. void *data = writer.data;
  6284. Py_ssize_t startinpos, endinpos;
  6285. while (s < e) {
  6286. unsigned char c = (unsigned char)*s;
  6287. if (c < 128) {
  6288. PyUnicode_WRITE(kind, data, writer.pos, c);
  6289. writer.pos++;
  6290. ++s;
  6291. continue;
  6292. }
  6293. /* byte outsize range 0x00..0x7f: call the error handler */
  6294. if (error_handler == _Py_ERROR_UNKNOWN)
  6295. error_handler = _Py_GetErrorHandler(errors);
  6296. switch (error_handler)
  6297. {
  6298. case _Py_ERROR_REPLACE:
  6299. case _Py_ERROR_SURROGATEESCAPE:
  6300. /* Fast-path: the error handler only writes one character,
  6301. but we may switch to UCS2 at the first write */
  6302. if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
  6303. goto onError;
  6304. kind = writer.kind;
  6305. data = writer.data;
  6306. if (error_handler == _Py_ERROR_REPLACE)
  6307. PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
  6308. else
  6309. PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
  6310. writer.pos++;
  6311. ++s;
  6312. break;
  6313. case _Py_ERROR_IGNORE:
  6314. ++s;
  6315. break;
  6316. default:
  6317. startinpos = s-starts;
  6318. endinpos = startinpos + 1;
  6319. if (unicode_decode_call_errorhandler_writer(
  6320. errors, &error_handler_obj,
  6321. "ascii", "ordinal not in range(128)",
  6322. &starts, &e, &startinpos, &endinpos, &exc, &s,
  6323. &writer))
  6324. goto onError;
  6325. kind = writer.kind;
  6326. data = writer.data;
  6327. }
  6328. }
  6329. Py_XDECREF(error_handler_obj);
  6330. Py_XDECREF(exc);
  6331. return _PyUnicodeWriter_Finish(&writer);
  6332. onError:
  6333. _PyUnicodeWriter_Dealloc(&writer);
  6334. Py_XDECREF(error_handler_obj);
  6335. Py_XDECREF(exc);
  6336. return NULL;
  6337. }
  6338. PyObject *
  6339. _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
  6340. {
  6341. if (!PyUnicode_Check(unicode)) {
  6342. PyErr_BadArgument();
  6343. return NULL;
  6344. }
  6345. /* Fast path: if it is an ASCII-only string, construct bytes object
  6346. directly. Else defer to above function to raise the exception. */
  6347. if (PyUnicode_IS_ASCII(unicode))
  6348. return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
  6349. PyUnicode_GET_LENGTH(unicode));
  6350. return unicode_encode_ucs1(unicode, errors, 128);
  6351. }
  6352. PyObject *
  6353. PyUnicode_AsASCIIString(PyObject *unicode)
  6354. {
  6355. return _PyUnicode_AsASCIIString(unicode, NULL);
  6356. }
  6357. #ifdef MS_WINDOWS
  6358. /* --- MBCS codecs for Windows -------------------------------------------- */
  6359. #if SIZEOF_INT < SIZEOF_SIZE_T
  6360. #define NEED_RETRY
  6361. #endif
  6362. /* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
  6363. transcoding from UTF-16), but INT_MAX / 4 performs better in
  6364. both cases also and avoids partial characters overrunning the
  6365. length limit in MultiByteToWideChar on Windows */
  6366. #define DECODING_CHUNK_SIZE (INT_MAX/4)
  6367. #ifndef WC_ERR_INVALID_CHARS
  6368. # define WC_ERR_INVALID_CHARS 0x0080
  6369. #endif
  6370. static const char*
  6371. code_page_name(UINT code_page, PyObject **obj)
  6372. {
  6373. *obj = NULL;
  6374. if (code_page == CP_ACP)
  6375. return "mbcs";
  6376. if (code_page == CP_UTF7)
  6377. return "CP_UTF7";
  6378. if (code_page == CP_UTF8)
  6379. return "CP_UTF8";
  6380. *obj = PyBytes_FromFormat("cp%u", code_page);
  6381. if (*obj == NULL)
  6382. return NULL;
  6383. return PyBytes_AS_STRING(*obj);
  6384. }
  6385. static DWORD
  6386. decode_code_page_flags(UINT code_page)
  6387. {
  6388. if (code_page == CP_UTF7) {
  6389. /* The CP_UTF7 decoder only supports flags=0 */
  6390. return 0;
  6391. }
  6392. else
  6393. return MB_ERR_INVALID_CHARS;
  6394. }
  6395. /*
  6396. * Decode a byte string from a Windows code page into unicode object in strict
  6397. * mode.
  6398. *
  6399. * Returns consumed size if succeed, returns -2 on decode error, or raise an
  6400. * OSError and returns -1 on other error.
  6401. */
  6402. static int
  6403. decode_code_page_strict(UINT code_page,
  6404. wchar_t **buf,
  6405. Py_ssize_t *bufsize,
  6406. const char *in,
  6407. int insize)
  6408. {
  6409. DWORD flags = MB_ERR_INVALID_CHARS;
  6410. wchar_t *out;
  6411. DWORD outsize;
  6412. /* First get the size of the result */
  6413. assert(insize > 0);
  6414. while ((outsize = MultiByteToWideChar(code_page, flags,
  6415. in, insize, NULL, 0)) <= 0)
  6416. {
  6417. if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
  6418. goto error;
  6419. }
  6420. /* For some code pages (e.g. UTF-7) flags must be set to 0. */
  6421. flags = 0;
  6422. }
  6423. /* Extend a wchar_t* buffer */
  6424. Py_ssize_t n = *bufsize; /* Get the current length */
  6425. if (widechar_resize(buf, bufsize, n + outsize) < 0) {
  6426. return -1;
  6427. }
  6428. out = *buf + n;
  6429. /* Do the conversion */
  6430. outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
  6431. if (outsize <= 0)
  6432. goto error;
  6433. return insize;
  6434. error:
  6435. if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
  6436. return -2;
  6437. PyErr_SetFromWindowsErr(0);
  6438. return -1;
  6439. }
  6440. /*
  6441. * Decode a byte string from a code page into unicode object with an error
  6442. * handler.
  6443. *
  6444. * Returns consumed size if succeed, or raise an OSError or
  6445. * UnicodeDecodeError exception and returns -1 on error.
  6446. */
  6447. static int
  6448. decode_code_page_errors(UINT code_page,
  6449. wchar_t **buf,
  6450. Py_ssize_t *bufsize,
  6451. const char *in, const int size,
  6452. const char *errors, int final)
  6453. {
  6454. const char *startin = in;
  6455. const char *endin = in + size;
  6456. DWORD flags = MB_ERR_INVALID_CHARS;
  6457. /* Ideally, we should get reason from FormatMessage. This is the Windows
  6458. 2000 English version of the message. */
  6459. const char *reason = "No mapping for the Unicode character exists "
  6460. "in the target code page.";
  6461. /* each step cannot decode more than 1 character, but a character can be
  6462. represented as a surrogate pair */
  6463. wchar_t buffer[2], *out;
  6464. int insize;
  6465. Py_ssize_t outsize;
  6466. PyObject *errorHandler = NULL;
  6467. PyObject *exc = NULL;
  6468. PyObject *encoding_obj = NULL;
  6469. const char *encoding;
  6470. DWORD err;
  6471. int ret = -1;
  6472. assert(size > 0);
  6473. encoding = code_page_name(code_page, &encoding_obj);
  6474. if (encoding == NULL)
  6475. return -1;
  6476. if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
  6477. /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
  6478. UnicodeDecodeError. */
  6479. make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
  6480. if (exc != NULL) {
  6481. PyCodec_StrictErrors(exc);
  6482. Py_CLEAR(exc);
  6483. }
  6484. goto error;
  6485. }
  6486. /* Extend a wchar_t* buffer */
  6487. Py_ssize_t n = *bufsize; /* Get the current length */
  6488. if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
  6489. PyErr_NoMemory();
  6490. goto error;
  6491. }
  6492. if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
  6493. goto error;
  6494. }
  6495. out = *buf + n;
  6496. /* Decode the byte string character per character */
  6497. while (in < endin)
  6498. {
  6499. /* Decode a character */
  6500. insize = 1;
  6501. do
  6502. {
  6503. outsize = MultiByteToWideChar(code_page, flags,
  6504. in, insize,
  6505. buffer, Py_ARRAY_LENGTH(buffer));
  6506. if (outsize > 0)
  6507. break;
  6508. err = GetLastError();
  6509. if (err == ERROR_INVALID_FLAGS && flags) {
  6510. /* For some code pages (e.g. UTF-7) flags must be set to 0. */
  6511. flags = 0;
  6512. continue;
  6513. }
  6514. if (err != ERROR_NO_UNICODE_TRANSLATION
  6515. && err != ERROR_INSUFFICIENT_BUFFER)
  6516. {
  6517. PyErr_SetFromWindowsErr(err);
  6518. goto error;
  6519. }
  6520. insize++;
  6521. }
  6522. /* 4=maximum length of a UTF-8 sequence */
  6523. while (insize <= 4 && (in + insize) <= endin);
  6524. if (outsize <= 0) {
  6525. Py_ssize_t startinpos, endinpos, outpos;
  6526. /* last character in partial decode? */
  6527. if (in + insize >= endin && !final)
  6528. break;
  6529. startinpos = in - startin;
  6530. endinpos = startinpos + 1;
  6531. outpos = out - *buf;
  6532. if (unicode_decode_call_errorhandler_wchar(
  6533. errors, &errorHandler,
  6534. encoding, reason,
  6535. &startin, &endin, &startinpos, &endinpos, &exc, &in,
  6536. buf, bufsize, &outpos))
  6537. {
  6538. goto error;
  6539. }
  6540. out = *buf + outpos;
  6541. }
  6542. else {
  6543. in += insize;
  6544. memcpy(out, buffer, outsize * sizeof(wchar_t));
  6545. out += outsize;
  6546. }
  6547. }
  6548. /* Shrink the buffer */
  6549. assert(out - *buf <= *bufsize);
  6550. *bufsize = out - *buf;
  6551. /* (in - startin) <= size and size is an int */
  6552. ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
  6553. error:
  6554. Py_XDECREF(encoding_obj);
  6555. Py_XDECREF(errorHandler);
  6556. Py_XDECREF(exc);
  6557. return ret;
  6558. }
  6559. static PyObject *
  6560. decode_code_page_stateful(int code_page,
  6561. const char *s, Py_ssize_t size,
  6562. const char *errors, Py_ssize_t *consumed)
  6563. {
  6564. wchar_t *buf = NULL;
  6565. Py_ssize_t bufsize = 0;
  6566. int chunk_size, final, converted, done;
  6567. if (code_page < 0) {
  6568. PyErr_SetString(PyExc_ValueError, "invalid code page number");
  6569. return NULL;
  6570. }
  6571. if (size < 0) {
  6572. PyErr_BadInternalCall();
  6573. return NULL;
  6574. }
  6575. if (consumed)
  6576. *consumed = 0;
  6577. do
  6578. {
  6579. #ifdef NEED_RETRY
  6580. if (size > DECODING_CHUNK_SIZE) {
  6581. chunk_size = DECODING_CHUNK_SIZE;
  6582. final = 0;
  6583. done = 0;
  6584. }
  6585. else
  6586. #endif
  6587. {
  6588. chunk_size = (int)size;
  6589. final = (consumed == NULL);
  6590. done = 1;
  6591. }
  6592. if (chunk_size == 0 && done) {
  6593. if (buf != NULL)
  6594. break;
  6595. _Py_RETURN_UNICODE_EMPTY();
  6596. }
  6597. converted = decode_code_page_strict(code_page, &buf, &bufsize,
  6598. s, chunk_size);
  6599. if (converted == -2)
  6600. converted = decode_code_page_errors(code_page, &buf, &bufsize,
  6601. s, chunk_size,
  6602. errors, final);
  6603. assert(converted != 0 || done);
  6604. if (converted < 0) {
  6605. PyMem_Free(buf);
  6606. return NULL;
  6607. }
  6608. if (consumed)
  6609. *consumed += converted;
  6610. s += converted;
  6611. size -= converted;
  6612. } while (!done);
  6613. PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
  6614. PyMem_Free(buf);
  6615. return v;
  6616. }
  6617. PyObject *
  6618. PyUnicode_DecodeCodePageStateful(int code_page,
  6619. const char *s,
  6620. Py_ssize_t size,
  6621. const char *errors,
  6622. Py_ssize_t *consumed)
  6623. {
  6624. return decode_code_page_stateful(code_page, s, size, errors, consumed);
  6625. }
  6626. PyObject *
  6627. PyUnicode_DecodeMBCSStateful(const char *s,
  6628. Py_ssize_t size,
  6629. const char *errors,
  6630. Py_ssize_t *consumed)
  6631. {
  6632. return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
  6633. }
  6634. PyObject *
  6635. PyUnicode_DecodeMBCS(const char *s,
  6636. Py_ssize_t size,
  6637. const char *errors)
  6638. {
  6639. return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
  6640. }
  6641. static DWORD
  6642. encode_code_page_flags(UINT code_page, const char *errors)
  6643. {
  6644. if (code_page == CP_UTF8) {
  6645. return WC_ERR_INVALID_CHARS;
  6646. }
  6647. else if (code_page == CP_UTF7) {
  6648. /* CP_UTF7 only supports flags=0 */
  6649. return 0;
  6650. }
  6651. else {
  6652. if (errors != NULL && strcmp(errors, "replace") == 0)
  6653. return 0;
  6654. else
  6655. return WC_NO_BEST_FIT_CHARS;
  6656. }
  6657. }
  6658. /*
  6659. * Encode a Unicode string to a Windows code page into a byte string in strict
  6660. * mode.
  6661. *
  6662. * Returns consumed characters if succeed, returns -2 on encode error, or raise
  6663. * an OSError and returns -1 on other error.
  6664. */
  6665. static int
  6666. encode_code_page_strict(UINT code_page, PyObject **outbytes,
  6667. PyObject *unicode, Py_ssize_t offset, int len,
  6668. const char* errors)
  6669. {
  6670. BOOL usedDefaultChar = FALSE;
  6671. BOOL *pusedDefaultChar = &usedDefaultChar;
  6672. int outsize;
  6673. wchar_t *p;
  6674. Py_ssize_t size;
  6675. const DWORD flags = encode_code_page_flags(code_page, NULL);
  6676. char *out;
  6677. /* Create a substring so that we can get the UTF-16 representation
  6678. of just the slice under consideration. */
  6679. PyObject *substring;
  6680. int ret = -1;
  6681. assert(len > 0);
  6682. if (code_page != CP_UTF8 && code_page != CP_UTF7)
  6683. pusedDefaultChar = &usedDefaultChar;
  6684. else
  6685. pusedDefaultChar = NULL;
  6686. substring = PyUnicode_Substring(unicode, offset, offset+len);
  6687. if (substring == NULL)
  6688. return -1;
  6689. p = PyUnicode_AsWideCharString(substring, &size);
  6690. Py_CLEAR(substring);
  6691. if (p == NULL) {
  6692. return -1;
  6693. }
  6694. assert(size <= INT_MAX);
  6695. /* First get the size of the result */
  6696. outsize = WideCharToMultiByte(code_page, flags,
  6697. p, (int)size,
  6698. NULL, 0,
  6699. NULL, pusedDefaultChar);
  6700. if (outsize <= 0)
  6701. goto error;
  6702. /* If we used a default char, then we failed! */
  6703. if (pusedDefaultChar && *pusedDefaultChar) {
  6704. ret = -2;
  6705. goto done;
  6706. }
  6707. if (*outbytes == NULL) {
  6708. /* Create string object */
  6709. *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
  6710. if (*outbytes == NULL) {
  6711. goto done;
  6712. }
  6713. out = PyBytes_AS_STRING(*outbytes);
  6714. }
  6715. else {
  6716. /* Extend string object */
  6717. const Py_ssize_t n = PyBytes_Size(*outbytes);
  6718. if (outsize > PY_SSIZE_T_MAX - n) {
  6719. PyErr_NoMemory();
  6720. goto done;
  6721. }
  6722. if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
  6723. goto done;
  6724. }
  6725. out = PyBytes_AS_STRING(*outbytes) + n;
  6726. }
  6727. /* Do the conversion */
  6728. outsize = WideCharToMultiByte(code_page, flags,
  6729. p, (int)size,
  6730. out, outsize,
  6731. NULL, pusedDefaultChar);
  6732. if (outsize <= 0)
  6733. goto error;
  6734. if (pusedDefaultChar && *pusedDefaultChar) {
  6735. ret = -2;
  6736. goto done;
  6737. }
  6738. ret = 0;
  6739. done:
  6740. PyMem_Free(p);
  6741. return ret;
  6742. error:
  6743. if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
  6744. ret = -2;
  6745. goto done;
  6746. }
  6747. PyErr_SetFromWindowsErr(0);
  6748. goto done;
  6749. }
  6750. /*
  6751. * Encode a Unicode string to a Windows code page into a byte string using an
  6752. * error handler.
  6753. *
  6754. * Returns consumed characters if succeed, or raise an OSError and returns
  6755. * -1 on other error.
  6756. */
  6757. static int
  6758. encode_code_page_errors(UINT code_page, PyObject **outbytes,
  6759. PyObject *unicode, Py_ssize_t unicode_offset,
  6760. Py_ssize_t insize, const char* errors)
  6761. {
  6762. const DWORD flags = encode_code_page_flags(code_page, errors);
  6763. Py_ssize_t pos = unicode_offset;
  6764. Py_ssize_t endin = unicode_offset + insize;
  6765. /* Ideally, we should get reason from FormatMessage. This is the Windows
  6766. 2000 English version of the message. */
  6767. const char *reason = "invalid character";
  6768. /* 4=maximum length of a UTF-8 sequence */
  6769. char buffer[4];
  6770. BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
  6771. Py_ssize_t outsize;
  6772. char *out;
  6773. PyObject *errorHandler = NULL;
  6774. PyObject *exc = NULL;
  6775. PyObject *encoding_obj = NULL;
  6776. const char *encoding;
  6777. Py_ssize_t newpos, newoutsize;
  6778. PyObject *rep;
  6779. int ret = -1;
  6780. assert(insize > 0);
  6781. encoding = code_page_name(code_page, &encoding_obj);
  6782. if (encoding == NULL)
  6783. return -1;
  6784. if (errors == NULL || strcmp(errors, "strict") == 0) {
  6785. /* The last error was ERROR_NO_UNICODE_TRANSLATION,
  6786. then we raise a UnicodeEncodeError. */
  6787. make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
  6788. if (exc != NULL) {
  6789. PyCodec_StrictErrors(exc);
  6790. Py_DECREF(exc);
  6791. }
  6792. Py_XDECREF(encoding_obj);
  6793. return -1;
  6794. }
  6795. if (code_page != CP_UTF8 && code_page != CP_UTF7)
  6796. pusedDefaultChar = &usedDefaultChar;
  6797. else
  6798. pusedDefaultChar = NULL;
  6799. if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
  6800. PyErr_NoMemory();
  6801. goto error;
  6802. }
  6803. outsize = insize * Py_ARRAY_LENGTH(buffer);
  6804. if (*outbytes == NULL) {
  6805. /* Create string object */
  6806. *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
  6807. if (*outbytes == NULL)
  6808. goto error;
  6809. out = PyBytes_AS_STRING(*outbytes);
  6810. }
  6811. else {
  6812. /* Extend string object */
  6813. Py_ssize_t n = PyBytes_Size(*outbytes);
  6814. if (n > PY_SSIZE_T_MAX - outsize) {
  6815. PyErr_NoMemory();
  6816. goto error;
  6817. }
  6818. if (_PyBytes_Resize(outbytes, n + outsize) < 0)
  6819. goto error;
  6820. out = PyBytes_AS_STRING(*outbytes) + n;
  6821. }
  6822. /* Encode the string character per character */
  6823. while (pos < endin)
  6824. {
  6825. Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
  6826. wchar_t chars[2];
  6827. int charsize;
  6828. if (ch < 0x10000) {
  6829. chars[0] = (wchar_t)ch;
  6830. charsize = 1;
  6831. }
  6832. else {
  6833. chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
  6834. chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
  6835. charsize = 2;
  6836. }
  6837. outsize = WideCharToMultiByte(code_page, flags,
  6838. chars, charsize,
  6839. buffer, Py_ARRAY_LENGTH(buffer),
  6840. NULL, pusedDefaultChar);
  6841. if (outsize > 0) {
  6842. if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
  6843. {
  6844. pos++;
  6845. memcpy(out, buffer, outsize);
  6846. out += outsize;
  6847. continue;
  6848. }
  6849. }
  6850. else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
  6851. PyErr_SetFromWindowsErr(0);
  6852. goto error;
  6853. }
  6854. rep = unicode_encode_call_errorhandler(
  6855. errors, &errorHandler, encoding, reason,
  6856. unicode, &exc,
  6857. pos, pos + 1, &newpos);
  6858. if (rep == NULL)
  6859. goto error;
  6860. Py_ssize_t morebytes = pos - newpos;
  6861. if (PyBytes_Check(rep)) {
  6862. outsize = PyBytes_GET_SIZE(rep);
  6863. morebytes += outsize;
  6864. if (morebytes > 0) {
  6865. Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
  6866. newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
  6867. if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
  6868. Py_DECREF(rep);
  6869. goto error;
  6870. }
  6871. out = PyBytes_AS_STRING(*outbytes) + offset;
  6872. }
  6873. memcpy(out, PyBytes_AS_STRING(rep), outsize);
  6874. out += outsize;
  6875. }
  6876. else {
  6877. Py_ssize_t i;
  6878. int kind;
  6879. const void *data;
  6880. outsize = PyUnicode_GET_LENGTH(rep);
  6881. morebytes += outsize;
  6882. if (morebytes > 0) {
  6883. Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
  6884. newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
  6885. if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
  6886. Py_DECREF(rep);
  6887. goto error;
  6888. }
  6889. out = PyBytes_AS_STRING(*outbytes) + offset;
  6890. }
  6891. kind = PyUnicode_KIND(rep);
  6892. data = PyUnicode_DATA(rep);
  6893. for (i=0; i < outsize; i++) {
  6894. Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  6895. if (ch > 127) {
  6896. raise_encode_exception(&exc,
  6897. encoding, unicode,
  6898. pos, pos + 1,
  6899. "unable to encode error handler result to ASCII");
  6900. Py_DECREF(rep);
  6901. goto error;
  6902. }
  6903. *out = (unsigned char)ch;
  6904. out++;
  6905. }
  6906. }
  6907. pos = newpos;
  6908. Py_DECREF(rep);
  6909. }
  6910. /* write a NUL byte */
  6911. *out = 0;
  6912. outsize = out - PyBytes_AS_STRING(*outbytes);
  6913. assert(outsize <= PyBytes_GET_SIZE(*outbytes));
  6914. if (_PyBytes_Resize(outbytes, outsize) < 0)
  6915. goto error;
  6916. ret = 0;
  6917. error:
  6918. Py_XDECREF(encoding_obj);
  6919. Py_XDECREF(errorHandler);
  6920. Py_XDECREF(exc);
  6921. return ret;
  6922. }
  6923. static PyObject *
  6924. encode_code_page(int code_page,
  6925. PyObject *unicode,
  6926. const char *errors)
  6927. {
  6928. Py_ssize_t len;
  6929. PyObject *outbytes = NULL;
  6930. Py_ssize_t offset;
  6931. int chunk_len, ret, done;
  6932. if (!PyUnicode_Check(unicode)) {
  6933. PyErr_BadArgument();
  6934. return NULL;
  6935. }
  6936. len = PyUnicode_GET_LENGTH(unicode);
  6937. if (code_page < 0) {
  6938. PyErr_SetString(PyExc_ValueError, "invalid code page number");
  6939. return NULL;
  6940. }
  6941. if (len == 0)
  6942. return PyBytes_FromStringAndSize(NULL, 0);
  6943. offset = 0;
  6944. do
  6945. {
  6946. #ifdef NEED_RETRY
  6947. if (len > DECODING_CHUNK_SIZE) {
  6948. chunk_len = DECODING_CHUNK_SIZE;
  6949. done = 0;
  6950. }
  6951. else
  6952. #endif
  6953. {
  6954. chunk_len = (int)len;
  6955. done = 1;
  6956. }
  6957. ret = encode_code_page_strict(code_page, &outbytes,
  6958. unicode, offset, chunk_len,
  6959. errors);
  6960. if (ret == -2)
  6961. ret = encode_code_page_errors(code_page, &outbytes,
  6962. unicode, offset,
  6963. chunk_len, errors);
  6964. if (ret < 0) {
  6965. Py_XDECREF(outbytes);
  6966. return NULL;
  6967. }
  6968. offset += chunk_len;
  6969. len -= chunk_len;
  6970. } while (!done);
  6971. return outbytes;
  6972. }
  6973. PyObject *
  6974. PyUnicode_EncodeCodePage(int code_page,
  6975. PyObject *unicode,
  6976. const char *errors)
  6977. {
  6978. return encode_code_page(code_page, unicode, errors);
  6979. }
  6980. PyObject *
  6981. PyUnicode_AsMBCSString(PyObject *unicode)
  6982. {
  6983. return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
  6984. }
  6985. #undef NEED_RETRY
  6986. #endif /* MS_WINDOWS */
  6987. /* --- Character Mapping Codec -------------------------------------------- */
  6988. static int
  6989. charmap_decode_string(const char *s,
  6990. Py_ssize_t size,
  6991. PyObject *mapping,
  6992. const char *errors,
  6993. _PyUnicodeWriter *writer)
  6994. {
  6995. const char *starts = s;
  6996. const char *e;
  6997. Py_ssize_t startinpos, endinpos;
  6998. PyObject *errorHandler = NULL, *exc = NULL;
  6999. Py_ssize_t maplen;
  7000. int mapkind;
  7001. const void *mapdata;
  7002. Py_UCS4 x;
  7003. unsigned char ch;
  7004. maplen = PyUnicode_GET_LENGTH(mapping);
  7005. mapdata = PyUnicode_DATA(mapping);
  7006. mapkind = PyUnicode_KIND(mapping);
  7007. e = s + size;
  7008. if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
  7009. /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
  7010. * is disabled in encoding aliases, latin1 is preferred because
  7011. * its implementation is faster. */
  7012. const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
  7013. Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
  7014. Py_UCS4 maxchar = writer->maxchar;
  7015. assert (writer->kind == PyUnicode_1BYTE_KIND);
  7016. while (s < e) {
  7017. ch = *s;
  7018. x = mapdata_ucs1[ch];
  7019. if (x > maxchar) {
  7020. if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
  7021. goto onError;
  7022. maxchar = writer->maxchar;
  7023. outdata = (Py_UCS1 *)writer->data;
  7024. }
  7025. outdata[writer->pos] = x;
  7026. writer->pos++;
  7027. ++s;
  7028. }
  7029. return 0;
  7030. }
  7031. while (s < e) {
  7032. if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
  7033. int outkind = writer->kind;
  7034. const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
  7035. if (outkind == PyUnicode_1BYTE_KIND) {
  7036. Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
  7037. Py_UCS4 maxchar = writer->maxchar;
  7038. while (s < e) {
  7039. ch = *s;
  7040. x = mapdata_ucs2[ch];
  7041. if (x > maxchar)
  7042. goto Error;
  7043. outdata[writer->pos] = x;
  7044. writer->pos++;
  7045. ++s;
  7046. }
  7047. break;
  7048. }
  7049. else if (outkind == PyUnicode_2BYTE_KIND) {
  7050. Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
  7051. while (s < e) {
  7052. ch = *s;
  7053. x = mapdata_ucs2[ch];
  7054. if (x == 0xFFFE)
  7055. goto Error;
  7056. outdata[writer->pos] = x;
  7057. writer->pos++;
  7058. ++s;
  7059. }
  7060. break;
  7061. }
  7062. }
  7063. ch = *s;
  7064. if (ch < maplen)
  7065. x = PyUnicode_READ(mapkind, mapdata, ch);
  7066. else
  7067. x = 0xfffe; /* invalid value */
  7068. Error:
  7069. if (x == 0xfffe)
  7070. {
  7071. /* undefined mapping */
  7072. startinpos = s-starts;
  7073. endinpos = startinpos+1;
  7074. if (unicode_decode_call_errorhandler_writer(
  7075. errors, &errorHandler,
  7076. "charmap", "character maps to <undefined>",
  7077. &starts, &e, &startinpos, &endinpos, &exc, &s,
  7078. writer)) {
  7079. goto onError;
  7080. }
  7081. continue;
  7082. }
  7083. if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
  7084. goto onError;
  7085. ++s;
  7086. }
  7087. Py_XDECREF(errorHandler);
  7088. Py_XDECREF(exc);
  7089. return 0;
  7090. onError:
  7091. Py_XDECREF(errorHandler);
  7092. Py_XDECREF(exc);
  7093. return -1;
  7094. }
  7095. static int
  7096. charmap_decode_mapping(const char *s,
  7097. Py_ssize_t size,
  7098. PyObject *mapping,
  7099. const char *errors,
  7100. _PyUnicodeWriter *writer)
  7101. {
  7102. const char *starts = s;
  7103. const char *e;
  7104. Py_ssize_t startinpos, endinpos;
  7105. PyObject *errorHandler = NULL, *exc = NULL;
  7106. unsigned char ch;
  7107. PyObject *key, *item = NULL;
  7108. e = s + size;
  7109. while (s < e) {
  7110. ch = *s;
  7111. /* Get mapping (char ordinal -> integer, Unicode char or None) */
  7112. key = PyLong_FromLong((long)ch);
  7113. if (key == NULL)
  7114. goto onError;
  7115. item = PyObject_GetItem(mapping, key);
  7116. Py_DECREF(key);
  7117. if (item == NULL) {
  7118. if (PyErr_ExceptionMatches(PyExc_LookupError)) {
  7119. /* No mapping found means: mapping is undefined. */
  7120. PyErr_Clear();
  7121. goto Undefined;
  7122. } else
  7123. goto onError;
  7124. }
  7125. /* Apply mapping */
  7126. if (item == Py_None)
  7127. goto Undefined;
  7128. if (PyLong_Check(item)) {
  7129. long value = PyLong_AS_LONG(item);
  7130. if (value == 0xFFFE)
  7131. goto Undefined;
  7132. if (value < 0 || value > MAX_UNICODE) {
  7133. PyErr_Format(PyExc_TypeError,
  7134. "character mapping must be in range(0x%x)",
  7135. (unsigned long)MAX_UNICODE + 1);
  7136. goto onError;
  7137. }
  7138. if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
  7139. goto onError;
  7140. }
  7141. else if (PyUnicode_Check(item)) {
  7142. if (PyUnicode_GET_LENGTH(item) == 1) {
  7143. Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
  7144. if (value == 0xFFFE)
  7145. goto Undefined;
  7146. if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
  7147. goto onError;
  7148. }
  7149. else {
  7150. writer->overallocate = 1;
  7151. if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
  7152. goto onError;
  7153. }
  7154. }
  7155. else {
  7156. /* wrong return value */
  7157. PyErr_SetString(PyExc_TypeError,
  7158. "character mapping must return integer, None or str");
  7159. goto onError;
  7160. }
  7161. Py_CLEAR(item);
  7162. ++s;
  7163. continue;
  7164. Undefined:
  7165. /* undefined mapping */
  7166. Py_CLEAR(item);
  7167. startinpos = s-starts;
  7168. endinpos = startinpos+1;
  7169. if (unicode_decode_call_errorhandler_writer(
  7170. errors, &errorHandler,
  7171. "charmap", "character maps to <undefined>",
  7172. &starts, &e, &startinpos, &endinpos, &exc, &s,
  7173. writer)) {
  7174. goto onError;
  7175. }
  7176. }
  7177. Py_XDECREF(errorHandler);
  7178. Py_XDECREF(exc);
  7179. return 0;
  7180. onError:
  7181. Py_XDECREF(item);
  7182. Py_XDECREF(errorHandler);
  7183. Py_XDECREF(exc);
  7184. return -1;
  7185. }
  7186. PyObject *
  7187. PyUnicode_DecodeCharmap(const char *s,
  7188. Py_ssize_t size,
  7189. PyObject *mapping,
  7190. const char *errors)
  7191. {
  7192. _PyUnicodeWriter writer;
  7193. /* Default to Latin-1 */
  7194. if (mapping == NULL)
  7195. return PyUnicode_DecodeLatin1(s, size, errors);
  7196. if (size == 0)
  7197. _Py_RETURN_UNICODE_EMPTY();
  7198. _PyUnicodeWriter_Init(&writer);
  7199. writer.min_length = size;
  7200. if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
  7201. goto onError;
  7202. if (PyUnicode_CheckExact(mapping)) {
  7203. if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
  7204. goto onError;
  7205. }
  7206. else {
  7207. if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
  7208. goto onError;
  7209. }
  7210. return _PyUnicodeWriter_Finish(&writer);
  7211. onError:
  7212. _PyUnicodeWriter_Dealloc(&writer);
  7213. return NULL;
  7214. }
  7215. /* Charmap encoding: the lookup table */
  7216. /*[clinic input]
  7217. class EncodingMap "struct encoding_map *" "&EncodingMapType"
  7218. [clinic start generated code]*/
  7219. /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
  7220. struct encoding_map {
  7221. PyObject_HEAD
  7222. unsigned char level1[32];
  7223. int count2, count3;
  7224. unsigned char level23[1];
  7225. };
  7226. /*[clinic input]
  7227. EncodingMap.size
  7228. Return the size (in bytes) of this object.
  7229. [clinic start generated code]*/
  7230. static PyObject *
  7231. EncodingMap_size_impl(struct encoding_map *self)
  7232. /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
  7233. {
  7234. return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
  7235. 128*self->count3);
  7236. }
  7237. static PyMethodDef encoding_map_methods[] = {
  7238. ENCODINGMAP_SIZE_METHODDEF
  7239. {NULL, NULL}
  7240. };
  7241. static PyTypeObject EncodingMapType = {
  7242. PyVarObject_HEAD_INIT(NULL, 0)
  7243. .tp_name = "EncodingMap",
  7244. .tp_basicsize = sizeof(struct encoding_map),
  7245. /* methods */
  7246. .tp_flags = Py_TPFLAGS_DEFAULT,
  7247. .tp_methods = encoding_map_methods,
  7248. };
  7249. PyObject*
  7250. PyUnicode_BuildEncodingMap(PyObject* string)
  7251. {
  7252. PyObject *result;
  7253. struct encoding_map *mresult;
  7254. int i;
  7255. int need_dict = 0;
  7256. unsigned char level1[32];
  7257. unsigned char level2[512];
  7258. unsigned char *mlevel1, *mlevel2, *mlevel3;
  7259. int count2 = 0, count3 = 0;
  7260. int kind;
  7261. const void *data;
  7262. Py_ssize_t length;
  7263. Py_UCS4 ch;
  7264. if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
  7265. PyErr_BadArgument();
  7266. return NULL;
  7267. }
  7268. kind = PyUnicode_KIND(string);
  7269. data = PyUnicode_DATA(string);
  7270. length = PyUnicode_GET_LENGTH(string);
  7271. length = Py_MIN(length, 256);
  7272. memset(level1, 0xFF, sizeof level1);
  7273. memset(level2, 0xFF, sizeof level2);
  7274. /* If there isn't a one-to-one mapping of NULL to \0,
  7275. or if there are non-BMP characters, we need to use
  7276. a mapping dictionary. */
  7277. if (PyUnicode_READ(kind, data, 0) != 0)
  7278. need_dict = 1;
  7279. for (i = 1; i < length; i++) {
  7280. int l1, l2;
  7281. ch = PyUnicode_READ(kind, data, i);
  7282. if (ch == 0 || ch > 0xFFFF) {
  7283. need_dict = 1;
  7284. break;
  7285. }
  7286. if (ch == 0xFFFE)
  7287. /* unmapped character */
  7288. continue;
  7289. l1 = ch >> 11;
  7290. l2 = ch >> 7;
  7291. if (level1[l1] == 0xFF)
  7292. level1[l1] = count2++;
  7293. if (level2[l2] == 0xFF)
  7294. level2[l2] = count3++;
  7295. }
  7296. if (count2 >= 0xFF || count3 >= 0xFF)
  7297. need_dict = 1;
  7298. if (need_dict) {
  7299. PyObject *result = PyDict_New();
  7300. if (!result)
  7301. return NULL;
  7302. for (i = 0; i < length; i++) {
  7303. Py_UCS4 c = PyUnicode_READ(kind, data, i);
  7304. PyObject *key = PyLong_FromLong(c);
  7305. if (key == NULL) {
  7306. Py_DECREF(result);
  7307. return NULL;
  7308. }
  7309. PyObject *value = PyLong_FromLong(i);
  7310. if (value == NULL) {
  7311. Py_DECREF(key);
  7312. Py_DECREF(result);
  7313. return NULL;
  7314. }
  7315. int rc = PyDict_SetItem(result, key, value);
  7316. Py_DECREF(key);
  7317. Py_DECREF(value);
  7318. if (rc < 0) {
  7319. Py_DECREF(result);
  7320. return NULL;
  7321. }
  7322. }
  7323. return result;
  7324. }
  7325. /* Create a three-level trie */
  7326. result = PyObject_Malloc(sizeof(struct encoding_map) +
  7327. 16*count2 + 128*count3 - 1);
  7328. if (!result) {
  7329. return PyErr_NoMemory();
  7330. }
  7331. _PyObject_Init(result, &EncodingMapType);
  7332. mresult = (struct encoding_map*)result;
  7333. mresult->count2 = count2;
  7334. mresult->count3 = count3;
  7335. mlevel1 = mresult->level1;
  7336. mlevel2 = mresult->level23;
  7337. mlevel3 = mresult->level23 + 16*count2;
  7338. memcpy(mlevel1, level1, 32);
  7339. memset(mlevel2, 0xFF, 16*count2);
  7340. memset(mlevel3, 0, 128*count3);
  7341. count3 = 0;
  7342. for (i = 1; i < length; i++) {
  7343. int o1, o2, o3, i2, i3;
  7344. Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  7345. if (ch == 0xFFFE)
  7346. /* unmapped character */
  7347. continue;
  7348. o1 = ch>>11;
  7349. o2 = (ch>>7) & 0xF;
  7350. i2 = 16*mlevel1[o1] + o2;
  7351. if (mlevel2[i2] == 0xFF)
  7352. mlevel2[i2] = count3++;
  7353. o3 = ch & 0x7F;
  7354. i3 = 128*mlevel2[i2] + o3;
  7355. mlevel3[i3] = i;
  7356. }
  7357. return result;
  7358. }
  7359. static int
  7360. encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
  7361. {
  7362. struct encoding_map *map = (struct encoding_map*)mapping;
  7363. int l1 = c>>11;
  7364. int l2 = (c>>7) & 0xF;
  7365. int l3 = c & 0x7F;
  7366. int i;
  7367. if (c > 0xFFFF)
  7368. return -1;
  7369. if (c == 0)
  7370. return 0;
  7371. /* level 1*/
  7372. i = map->level1[l1];
  7373. if (i == 0xFF) {
  7374. return -1;
  7375. }
  7376. /* level 2*/
  7377. i = map->level23[16*i+l2];
  7378. if (i == 0xFF) {
  7379. return -1;
  7380. }
  7381. /* level 3 */
  7382. i = map->level23[16*map->count2 + 128*i + l3];
  7383. if (i == 0) {
  7384. return -1;
  7385. }
  7386. return i;
  7387. }
  7388. /* Lookup the character ch in the mapping. If the character
  7389. can't be found, Py_None is returned (or NULL, if another
  7390. error occurred). */
  7391. static PyObject *
  7392. charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
  7393. {
  7394. PyObject *w = PyLong_FromLong((long)c);
  7395. PyObject *x;
  7396. if (w == NULL)
  7397. return NULL;
  7398. x = PyObject_GetItem(mapping, w);
  7399. Py_DECREF(w);
  7400. if (x == NULL) {
  7401. if (PyErr_ExceptionMatches(PyExc_LookupError)) {
  7402. /* No mapping found means: mapping is undefined. */
  7403. PyErr_Clear();
  7404. Py_RETURN_NONE;
  7405. } else
  7406. return NULL;
  7407. }
  7408. else if (x == Py_None)
  7409. return x;
  7410. else if (PyLong_Check(x)) {
  7411. long value = PyLong_AS_LONG(x);
  7412. if (value < 0 || value > 255) {
  7413. PyErr_SetString(PyExc_TypeError,
  7414. "character mapping must be in range(256)");
  7415. Py_DECREF(x);
  7416. return NULL;
  7417. }
  7418. return x;
  7419. }
  7420. else if (PyBytes_Check(x))
  7421. return x;
  7422. else {
  7423. /* wrong return value */
  7424. PyErr_Format(PyExc_TypeError,
  7425. "character mapping must return integer, bytes or None, not %.400s",
  7426. Py_TYPE(x)->tp_name);
  7427. Py_DECREF(x);
  7428. return NULL;
  7429. }
  7430. }
  7431. static int
  7432. charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
  7433. {
  7434. Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
  7435. /* exponentially overallocate to minimize reallocations */
  7436. if (requiredsize < 2*outsize)
  7437. requiredsize = 2*outsize;
  7438. if (_PyBytes_Resize(outobj, requiredsize))
  7439. return -1;
  7440. return 0;
  7441. }
  7442. typedef enum charmapencode_result {
  7443. enc_SUCCESS, enc_FAILED, enc_EXCEPTION
  7444. } charmapencode_result;
  7445. /* lookup the character, put the result in the output string and adjust
  7446. various state variables. Resize the output bytes object if not enough
  7447. space is available. Return a new reference to the object that
  7448. was put in the output buffer, or Py_None, if the mapping was undefined
  7449. (in which case no character was written) or NULL, if a
  7450. reallocation error occurred. The caller must decref the result */
  7451. static charmapencode_result
  7452. charmapencode_output(Py_UCS4 c, PyObject *mapping,
  7453. PyObject **outobj, Py_ssize_t *outpos)
  7454. {
  7455. PyObject *rep;
  7456. char *outstart;
  7457. Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
  7458. if (Py_IS_TYPE(mapping, &EncodingMapType)) {
  7459. int res = encoding_map_lookup(c, mapping);
  7460. Py_ssize_t requiredsize = *outpos+1;
  7461. if (res == -1)
  7462. return enc_FAILED;
  7463. if (outsize<requiredsize)
  7464. if (charmapencode_resize(outobj, outpos, requiredsize))
  7465. return enc_EXCEPTION;
  7466. outstart = PyBytes_AS_STRING(*outobj);
  7467. outstart[(*outpos)++] = (char)res;
  7468. return enc_SUCCESS;
  7469. }
  7470. rep = charmapencode_lookup(c, mapping);
  7471. if (rep==NULL)
  7472. return enc_EXCEPTION;
  7473. else if (rep==Py_None) {
  7474. Py_DECREF(rep);
  7475. return enc_FAILED;
  7476. } else {
  7477. if (PyLong_Check(rep)) {
  7478. Py_ssize_t requiredsize = *outpos+1;
  7479. if (outsize<requiredsize)
  7480. if (charmapencode_resize(outobj, outpos, requiredsize)) {
  7481. Py_DECREF(rep);
  7482. return enc_EXCEPTION;
  7483. }
  7484. outstart = PyBytes_AS_STRING(*outobj);
  7485. outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
  7486. }
  7487. else {
  7488. const char *repchars = PyBytes_AS_STRING(rep);
  7489. Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
  7490. Py_ssize_t requiredsize = *outpos+repsize;
  7491. if (outsize<requiredsize)
  7492. if (charmapencode_resize(outobj, outpos, requiredsize)) {
  7493. Py_DECREF(rep);
  7494. return enc_EXCEPTION;
  7495. }
  7496. outstart = PyBytes_AS_STRING(*outobj);
  7497. memcpy(outstart + *outpos, repchars, repsize);
  7498. *outpos += repsize;
  7499. }
  7500. }
  7501. Py_DECREF(rep);
  7502. return enc_SUCCESS;
  7503. }
  7504. /* handle an error in PyUnicode_EncodeCharmap
  7505. Return 0 on success, -1 on error */
  7506. static int
  7507. charmap_encoding_error(
  7508. PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
  7509. PyObject **exceptionObject,
  7510. _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
  7511. PyObject **res, Py_ssize_t *respos)
  7512. {
  7513. PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
  7514. Py_ssize_t size, repsize;
  7515. Py_ssize_t newpos;
  7516. int kind;
  7517. const void *data;
  7518. Py_ssize_t index;
  7519. /* startpos for collecting unencodable chars */
  7520. Py_ssize_t collstartpos = *inpos;
  7521. Py_ssize_t collendpos = *inpos+1;
  7522. Py_ssize_t collpos;
  7523. const char *encoding = "charmap";
  7524. const char *reason = "character maps to <undefined>";
  7525. charmapencode_result x;
  7526. Py_UCS4 ch;
  7527. int val;
  7528. size = PyUnicode_GET_LENGTH(unicode);
  7529. /* find all unencodable characters */
  7530. while (collendpos < size) {
  7531. PyObject *rep;
  7532. if (Py_IS_TYPE(mapping, &EncodingMapType)) {
  7533. ch = PyUnicode_READ_CHAR(unicode, collendpos);
  7534. val = encoding_map_lookup(ch, mapping);
  7535. if (val != -1)
  7536. break;
  7537. ++collendpos;
  7538. continue;
  7539. }
  7540. ch = PyUnicode_READ_CHAR(unicode, collendpos);
  7541. rep = charmapencode_lookup(ch, mapping);
  7542. if (rep==NULL)
  7543. return -1;
  7544. else if (rep!=Py_None) {
  7545. Py_DECREF(rep);
  7546. break;
  7547. }
  7548. Py_DECREF(rep);
  7549. ++collendpos;
  7550. }
  7551. /* cache callback name lookup
  7552. * (if not done yet, i.e. it's the first error) */
  7553. if (*error_handler == _Py_ERROR_UNKNOWN)
  7554. *error_handler = _Py_GetErrorHandler(errors);
  7555. switch (*error_handler) {
  7556. case _Py_ERROR_STRICT:
  7557. raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
  7558. return -1;
  7559. case _Py_ERROR_REPLACE:
  7560. for (collpos = collstartpos; collpos<collendpos; ++collpos) {
  7561. x = charmapencode_output('?', mapping, res, respos);
  7562. if (x==enc_EXCEPTION) {
  7563. return -1;
  7564. }
  7565. else if (x==enc_FAILED) {
  7566. raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
  7567. return -1;
  7568. }
  7569. }
  7570. /* fall through */
  7571. case _Py_ERROR_IGNORE:
  7572. *inpos = collendpos;
  7573. break;
  7574. case _Py_ERROR_XMLCHARREFREPLACE:
  7575. /* generate replacement (temporarily (mis)uses p) */
  7576. for (collpos = collstartpos; collpos < collendpos; ++collpos) {
  7577. char buffer[2+29+1+1];
  7578. char *cp;
  7579. sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
  7580. for (cp = buffer; *cp; ++cp) {
  7581. x = charmapencode_output(*cp, mapping, res, respos);
  7582. if (x==enc_EXCEPTION)
  7583. return -1;
  7584. else if (x==enc_FAILED) {
  7585. raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
  7586. return -1;
  7587. }
  7588. }
  7589. }
  7590. *inpos = collendpos;
  7591. break;
  7592. default:
  7593. repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
  7594. encoding, reason, unicode, exceptionObject,
  7595. collstartpos, collendpos, &newpos);
  7596. if (repunicode == NULL)
  7597. return -1;
  7598. if (PyBytes_Check(repunicode)) {
  7599. /* Directly copy bytes result to output. */
  7600. Py_ssize_t outsize = PyBytes_Size(*res);
  7601. Py_ssize_t requiredsize;
  7602. repsize = PyBytes_Size(repunicode);
  7603. requiredsize = *respos + repsize;
  7604. if (requiredsize > outsize)
  7605. /* Make room for all additional bytes. */
  7606. if (charmapencode_resize(res, respos, requiredsize)) {
  7607. Py_DECREF(repunicode);
  7608. return -1;
  7609. }
  7610. memcpy(PyBytes_AsString(*res) + *respos,
  7611. PyBytes_AsString(repunicode), repsize);
  7612. *respos += repsize;
  7613. *inpos = newpos;
  7614. Py_DECREF(repunicode);
  7615. break;
  7616. }
  7617. /* generate replacement */
  7618. repsize = PyUnicode_GET_LENGTH(repunicode);
  7619. data = PyUnicode_DATA(repunicode);
  7620. kind = PyUnicode_KIND(repunicode);
  7621. for (index = 0; index < repsize; index++) {
  7622. Py_UCS4 repch = PyUnicode_READ(kind, data, index);
  7623. x = charmapencode_output(repch, mapping, res, respos);
  7624. if (x==enc_EXCEPTION) {
  7625. Py_DECREF(repunicode);
  7626. return -1;
  7627. }
  7628. else if (x==enc_FAILED) {
  7629. Py_DECREF(repunicode);
  7630. raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
  7631. return -1;
  7632. }
  7633. }
  7634. *inpos = newpos;
  7635. Py_DECREF(repunicode);
  7636. }
  7637. return 0;
  7638. }
  7639. PyObject *
  7640. _PyUnicode_EncodeCharmap(PyObject *unicode,
  7641. PyObject *mapping,
  7642. const char *errors)
  7643. {
  7644. /* output object */
  7645. PyObject *res = NULL;
  7646. /* current input position */
  7647. Py_ssize_t inpos = 0;
  7648. Py_ssize_t size;
  7649. /* current output position */
  7650. Py_ssize_t respos = 0;
  7651. PyObject *error_handler_obj = NULL;
  7652. PyObject *exc = NULL;
  7653. _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
  7654. const void *data;
  7655. int kind;
  7656. size = PyUnicode_GET_LENGTH(unicode);
  7657. data = PyUnicode_DATA(unicode);
  7658. kind = PyUnicode_KIND(unicode);
  7659. /* Default to Latin-1 */
  7660. if (mapping == NULL)
  7661. return unicode_encode_ucs1(unicode, errors, 256);
  7662. /* allocate enough for a simple encoding without
  7663. replacements, if we need more, we'll resize */
  7664. res = PyBytes_FromStringAndSize(NULL, size);
  7665. if (res == NULL)
  7666. goto onError;
  7667. if (size == 0)
  7668. return res;
  7669. while (inpos<size) {
  7670. Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
  7671. /* try to encode it */
  7672. charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
  7673. if (x==enc_EXCEPTION) /* error */
  7674. goto onError;
  7675. if (x==enc_FAILED) { /* unencodable character */
  7676. if (charmap_encoding_error(unicode, &inpos, mapping,
  7677. &exc,
  7678. &error_handler, &error_handler_obj, errors,
  7679. &res, &respos)) {
  7680. goto onError;
  7681. }
  7682. }
  7683. else
  7684. /* done with this character => adjust input position */
  7685. ++inpos;
  7686. }
  7687. /* Resize if we allocated to much */
  7688. if (respos<PyBytes_GET_SIZE(res))
  7689. if (_PyBytes_Resize(&res, respos) < 0)
  7690. goto onError;
  7691. Py_XDECREF(exc);
  7692. Py_XDECREF(error_handler_obj);
  7693. return res;
  7694. onError:
  7695. Py_XDECREF(res);
  7696. Py_XDECREF(exc);
  7697. Py_XDECREF(error_handler_obj);
  7698. return NULL;
  7699. }
  7700. PyObject *
  7701. PyUnicode_AsCharmapString(PyObject *unicode,
  7702. PyObject *mapping)
  7703. {
  7704. if (!PyUnicode_Check(unicode) || mapping == NULL) {
  7705. PyErr_BadArgument();
  7706. return NULL;
  7707. }
  7708. return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
  7709. }
  7710. /* create or adjust a UnicodeTranslateError */
  7711. static void
  7712. make_translate_exception(PyObject **exceptionObject,
  7713. PyObject *unicode,
  7714. Py_ssize_t startpos, Py_ssize_t endpos,
  7715. const char *reason)
  7716. {
  7717. if (*exceptionObject == NULL) {
  7718. *exceptionObject = _PyUnicodeTranslateError_Create(
  7719. unicode, startpos, endpos, reason);
  7720. }
  7721. else {
  7722. if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
  7723. goto onError;
  7724. if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
  7725. goto onError;
  7726. if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
  7727. goto onError;
  7728. return;
  7729. onError:
  7730. Py_CLEAR(*exceptionObject);
  7731. }
  7732. }
  7733. /* error handling callback helper:
  7734. build arguments, call the callback and check the arguments,
  7735. put the result into newpos and return the replacement string, which
  7736. has to be freed by the caller */
  7737. static PyObject *
  7738. unicode_translate_call_errorhandler(const char *errors,
  7739. PyObject **errorHandler,
  7740. const char *reason,
  7741. PyObject *unicode, PyObject **exceptionObject,
  7742. Py_ssize_t startpos, Py_ssize_t endpos,
  7743. Py_ssize_t *newpos)
  7744. {
  7745. static const char *argparse = "Un;translating error handler must return (str, int) tuple";
  7746. Py_ssize_t i_newpos;
  7747. PyObject *restuple;
  7748. PyObject *resunicode;
  7749. if (*errorHandler == NULL) {
  7750. *errorHandler = PyCodec_LookupError(errors);
  7751. if (*errorHandler == NULL)
  7752. return NULL;
  7753. }
  7754. make_translate_exception(exceptionObject,
  7755. unicode, startpos, endpos, reason);
  7756. if (*exceptionObject == NULL)
  7757. return NULL;
  7758. restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
  7759. if (restuple == NULL)
  7760. return NULL;
  7761. if (!PyTuple_Check(restuple)) {
  7762. PyErr_SetString(PyExc_TypeError, &argparse[3]);
  7763. Py_DECREF(restuple);
  7764. return NULL;
  7765. }
  7766. if (!PyArg_ParseTuple(restuple, argparse,
  7767. &resunicode, &i_newpos)) {
  7768. Py_DECREF(restuple);
  7769. return NULL;
  7770. }
  7771. if (i_newpos<0)
  7772. *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
  7773. else
  7774. *newpos = i_newpos;
  7775. if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
  7776. PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
  7777. Py_DECREF(restuple);
  7778. return NULL;
  7779. }
  7780. Py_INCREF(resunicode);
  7781. Py_DECREF(restuple);
  7782. return resunicode;
  7783. }
  7784. /* Lookup the character ch in the mapping and put the result in result,
  7785. which must be decrefed by the caller.
  7786. Return 0 on success, -1 on error */
  7787. static int
  7788. charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
  7789. {
  7790. PyObject *w = PyLong_FromLong((long)c);
  7791. PyObject *x;
  7792. if (w == NULL)
  7793. return -1;
  7794. x = PyObject_GetItem(mapping, w);
  7795. Py_DECREF(w);
  7796. if (x == NULL) {
  7797. if (PyErr_ExceptionMatches(PyExc_LookupError)) {
  7798. /* No mapping found means: use 1:1 mapping. */
  7799. PyErr_Clear();
  7800. *result = NULL;
  7801. return 0;
  7802. } else
  7803. return -1;
  7804. }
  7805. else if (x == Py_None) {
  7806. *result = x;
  7807. return 0;
  7808. }
  7809. else if (PyLong_Check(x)) {
  7810. long value = PyLong_AS_LONG(x);
  7811. if (value < 0 || value > MAX_UNICODE) {
  7812. PyErr_Format(PyExc_ValueError,
  7813. "character mapping must be in range(0x%x)",
  7814. MAX_UNICODE+1);
  7815. Py_DECREF(x);
  7816. return -1;
  7817. }
  7818. *result = x;
  7819. return 0;
  7820. }
  7821. else if (PyUnicode_Check(x)) {
  7822. *result = x;
  7823. return 0;
  7824. }
  7825. else {
  7826. /* wrong return value */
  7827. PyErr_SetString(PyExc_TypeError,
  7828. "character mapping must return integer, None or str");
  7829. Py_DECREF(x);
  7830. return -1;
  7831. }
  7832. }
  7833. /* lookup the character, write the result into the writer.
  7834. Return 1 if the result was written into the writer, return 0 if the mapping
  7835. was undefined, raise an exception return -1 on error. */
  7836. static int
  7837. charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
  7838. _PyUnicodeWriter *writer)
  7839. {
  7840. PyObject *item;
  7841. if (charmaptranslate_lookup(ch, mapping, &item))
  7842. return -1;
  7843. if (item == NULL) {
  7844. /* not found => default to 1:1 mapping */
  7845. if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
  7846. return -1;
  7847. }
  7848. return 1;
  7849. }
  7850. if (item == Py_None) {
  7851. Py_DECREF(item);
  7852. return 0;
  7853. }
  7854. if (PyLong_Check(item)) {
  7855. long ch = (Py_UCS4)PyLong_AS_LONG(item);
  7856. /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
  7857. used it */
  7858. if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
  7859. Py_DECREF(item);
  7860. return -1;
  7861. }
  7862. Py_DECREF(item);
  7863. return 1;
  7864. }
  7865. if (!PyUnicode_Check(item)) {
  7866. Py_DECREF(item);
  7867. return -1;
  7868. }
  7869. if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
  7870. Py_DECREF(item);
  7871. return -1;
  7872. }
  7873. Py_DECREF(item);
  7874. return 1;
  7875. }
  7876. static int
  7877. unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
  7878. Py_UCS1 *translate)
  7879. {
  7880. PyObject *item = NULL;
  7881. int ret = 0;
  7882. if (charmaptranslate_lookup(ch, mapping, &item)) {
  7883. return -1;
  7884. }
  7885. if (item == Py_None) {
  7886. /* deletion */
  7887. translate[ch] = 0xfe;
  7888. }
  7889. else if (item == NULL) {
  7890. /* not found => default to 1:1 mapping */
  7891. translate[ch] = ch;
  7892. return 1;
  7893. }
  7894. else if (PyLong_Check(item)) {
  7895. long replace = PyLong_AS_LONG(item);
  7896. /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
  7897. used it */
  7898. if (127 < replace) {
  7899. /* invalid character or character outside ASCII:
  7900. skip the fast translate */
  7901. goto exit;
  7902. }
  7903. translate[ch] = (Py_UCS1)replace;
  7904. }
  7905. else if (PyUnicode_Check(item)) {
  7906. Py_UCS4 replace;
  7907. if (PyUnicode_GET_LENGTH(item) != 1)
  7908. goto exit;
  7909. replace = PyUnicode_READ_CHAR(item, 0);
  7910. if (replace > 127)
  7911. goto exit;
  7912. translate[ch] = (Py_UCS1)replace;
  7913. }
  7914. else {
  7915. /* not None, NULL, long or unicode */
  7916. goto exit;
  7917. }
  7918. ret = 1;
  7919. exit:
  7920. Py_DECREF(item);
  7921. return ret;
  7922. }
  7923. /* Fast path for ascii => ascii translation. Return 1 if the whole string
  7924. was translated into writer, return 0 if the input string was partially
  7925. translated into writer, raise an exception and return -1 on error. */
  7926. static int
  7927. unicode_fast_translate(PyObject *input, PyObject *mapping,
  7928. _PyUnicodeWriter *writer, int ignore,
  7929. Py_ssize_t *input_pos)
  7930. {
  7931. Py_UCS1 ascii_table[128], ch, ch2;
  7932. Py_ssize_t len;
  7933. const Py_UCS1 *in, *end;
  7934. Py_UCS1 *out;
  7935. int res = 0;
  7936. len = PyUnicode_GET_LENGTH(input);
  7937. memset(ascii_table, 0xff, 128);
  7938. in = PyUnicode_1BYTE_DATA(input);
  7939. end = in + len;
  7940. assert(PyUnicode_IS_ASCII(writer->buffer));
  7941. assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
  7942. out = PyUnicode_1BYTE_DATA(writer->buffer);
  7943. for (; in < end; in++) {
  7944. ch = *in;
  7945. ch2 = ascii_table[ch];
  7946. if (ch2 == 0xff) {
  7947. int translate = unicode_fast_translate_lookup(mapping, ch,
  7948. ascii_table);
  7949. if (translate < 0)
  7950. return -1;
  7951. if (translate == 0)
  7952. goto exit;
  7953. ch2 = ascii_table[ch];
  7954. }
  7955. if (ch2 == 0xfe) {
  7956. if (ignore)
  7957. continue;
  7958. goto exit;
  7959. }
  7960. assert(ch2 < 128);
  7961. *out = ch2;
  7962. out++;
  7963. }
  7964. res = 1;
  7965. exit:
  7966. writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
  7967. *input_pos = in - PyUnicode_1BYTE_DATA(input);
  7968. return res;
  7969. }
  7970. static PyObject *
  7971. _PyUnicode_TranslateCharmap(PyObject *input,
  7972. PyObject *mapping,
  7973. const char *errors)
  7974. {
  7975. /* input object */
  7976. const void *data;
  7977. Py_ssize_t size, i;
  7978. int kind;
  7979. /* output buffer */
  7980. _PyUnicodeWriter writer;
  7981. /* error handler */
  7982. const char *reason = "character maps to <undefined>";
  7983. PyObject *errorHandler = NULL;
  7984. PyObject *exc = NULL;
  7985. int ignore;
  7986. int res;
  7987. if (mapping == NULL) {
  7988. PyErr_BadArgument();
  7989. return NULL;
  7990. }
  7991. data = PyUnicode_DATA(input);
  7992. kind = PyUnicode_KIND(input);
  7993. size = PyUnicode_GET_LENGTH(input);
  7994. if (size == 0)
  7995. return PyUnicode_FromObject(input);
  7996. /* allocate enough for a simple 1:1 translation without
  7997. replacements, if we need more, we'll resize */
  7998. _PyUnicodeWriter_Init(&writer);
  7999. if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
  8000. goto onError;
  8001. ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
  8002. if (PyUnicode_IS_ASCII(input)) {
  8003. res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
  8004. if (res < 0) {
  8005. _PyUnicodeWriter_Dealloc(&writer);
  8006. return NULL;
  8007. }
  8008. if (res == 1)
  8009. return _PyUnicodeWriter_Finish(&writer);
  8010. }
  8011. else {
  8012. i = 0;
  8013. }
  8014. while (i<size) {
  8015. /* try to encode it */
  8016. int translate;
  8017. PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
  8018. Py_ssize_t newpos;
  8019. /* startpos for collecting untranslatable chars */
  8020. Py_ssize_t collstart;
  8021. Py_ssize_t collend;
  8022. Py_UCS4 ch;
  8023. ch = PyUnicode_READ(kind, data, i);
  8024. translate = charmaptranslate_output(ch, mapping, &writer);
  8025. if (translate < 0)
  8026. goto onError;
  8027. if (translate != 0) {
  8028. /* it worked => adjust input pointer */
  8029. ++i;
  8030. continue;
  8031. }
  8032. /* untranslatable character */
  8033. collstart = i;
  8034. collend = i+1;
  8035. /* find all untranslatable characters */
  8036. while (collend < size) {
  8037. PyObject *x;
  8038. ch = PyUnicode_READ(kind, data, collend);
  8039. if (charmaptranslate_lookup(ch, mapping, &x))
  8040. goto onError;
  8041. Py_XDECREF(x);
  8042. if (x != Py_None)
  8043. break;
  8044. ++collend;
  8045. }
  8046. if (ignore) {
  8047. i = collend;
  8048. }
  8049. else {
  8050. repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
  8051. reason, input, &exc,
  8052. collstart, collend, &newpos);
  8053. if (repunicode == NULL)
  8054. goto onError;
  8055. if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
  8056. Py_DECREF(repunicode);
  8057. goto onError;
  8058. }
  8059. Py_DECREF(repunicode);
  8060. i = newpos;
  8061. }
  8062. }
  8063. Py_XDECREF(exc);
  8064. Py_XDECREF(errorHandler);
  8065. return _PyUnicodeWriter_Finish(&writer);
  8066. onError:
  8067. _PyUnicodeWriter_Dealloc(&writer);
  8068. Py_XDECREF(exc);
  8069. Py_XDECREF(errorHandler);
  8070. return NULL;
  8071. }
  8072. PyObject *
  8073. PyUnicode_Translate(PyObject *str,
  8074. PyObject *mapping,
  8075. const char *errors)
  8076. {
  8077. if (ensure_unicode(str) < 0)
  8078. return NULL;
  8079. return _PyUnicode_TranslateCharmap(str, mapping, errors);
  8080. }
  8081. PyObject *
  8082. _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
  8083. {
  8084. if (!PyUnicode_Check(unicode)) {
  8085. PyErr_BadInternalCall();
  8086. return NULL;
  8087. }
  8088. if (PyUnicode_IS_ASCII(unicode)) {
  8089. /* If the string is already ASCII, just return the same string */
  8090. return Py_NewRef(unicode);
  8091. }
  8092. Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
  8093. PyObject *result = PyUnicode_New(len, 127);
  8094. if (result == NULL) {
  8095. return NULL;
  8096. }
  8097. Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
  8098. int kind = PyUnicode_KIND(unicode);
  8099. const void *data = PyUnicode_DATA(unicode);
  8100. Py_ssize_t i;
  8101. for (i = 0; i < len; ++i) {
  8102. Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  8103. if (ch < 127) {
  8104. out[i] = ch;
  8105. }
  8106. else if (Py_UNICODE_ISSPACE(ch)) {
  8107. out[i] = ' ';
  8108. }
  8109. else {
  8110. int decimal = Py_UNICODE_TODECIMAL(ch);
  8111. if (decimal < 0) {
  8112. out[i] = '?';
  8113. out[i+1] = '\0';
  8114. _PyUnicode_LENGTH(result) = i + 1;
  8115. break;
  8116. }
  8117. out[i] = '0' + decimal;
  8118. }
  8119. }
  8120. assert(_PyUnicode_CheckConsistency(result, 1));
  8121. return result;
  8122. }
  8123. /* --- Helpers ------------------------------------------------------------ */
  8124. /* helper macro to fixup start/end slice values */
  8125. #define ADJUST_INDICES(start, end, len) \
  8126. if (end > len) \
  8127. end = len; \
  8128. else if (end < 0) { \
  8129. end += len; \
  8130. if (end < 0) \
  8131. end = 0; \
  8132. } \
  8133. if (start < 0) { \
  8134. start += len; \
  8135. if (start < 0) \
  8136. start = 0; \
  8137. }
  8138. static Py_ssize_t
  8139. any_find_slice(PyObject* s1, PyObject* s2,
  8140. Py_ssize_t start,
  8141. Py_ssize_t end,
  8142. int direction)
  8143. {
  8144. int kind1, kind2;
  8145. const void *buf1, *buf2;
  8146. Py_ssize_t len1, len2, result;
  8147. kind1 = PyUnicode_KIND(s1);
  8148. kind2 = PyUnicode_KIND(s2);
  8149. if (kind1 < kind2)
  8150. return -1;
  8151. len1 = PyUnicode_GET_LENGTH(s1);
  8152. len2 = PyUnicode_GET_LENGTH(s2);
  8153. ADJUST_INDICES(start, end, len1);
  8154. if (end - start < len2)
  8155. return -1;
  8156. buf1 = PyUnicode_DATA(s1);
  8157. buf2 = PyUnicode_DATA(s2);
  8158. if (len2 == 1) {
  8159. Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
  8160. result = findchar((const char *)buf1 + kind1*start,
  8161. kind1, end - start, ch, direction);
  8162. if (result == -1)
  8163. return -1;
  8164. else
  8165. return start + result;
  8166. }
  8167. if (kind2 != kind1) {
  8168. buf2 = unicode_askind(kind2, buf2, len2, kind1);
  8169. if (!buf2)
  8170. return -2;
  8171. }
  8172. if (direction > 0) {
  8173. switch (kind1) {
  8174. case PyUnicode_1BYTE_KIND:
  8175. if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
  8176. result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
  8177. else
  8178. result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
  8179. break;
  8180. case PyUnicode_2BYTE_KIND:
  8181. result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
  8182. break;
  8183. case PyUnicode_4BYTE_KIND:
  8184. result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
  8185. break;
  8186. default:
  8187. Py_UNREACHABLE();
  8188. }
  8189. }
  8190. else {
  8191. switch (kind1) {
  8192. case PyUnicode_1BYTE_KIND:
  8193. if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
  8194. result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
  8195. else
  8196. result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
  8197. break;
  8198. case PyUnicode_2BYTE_KIND:
  8199. result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
  8200. break;
  8201. case PyUnicode_4BYTE_KIND:
  8202. result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
  8203. break;
  8204. default:
  8205. Py_UNREACHABLE();
  8206. }
  8207. }
  8208. assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
  8209. if (kind2 != kind1)
  8210. PyMem_Free((void *)buf2);
  8211. return result;
  8212. }
  8213. /* _PyUnicode_InsertThousandsGrouping() helper functions */
  8214. #include "stringlib/localeutil.h"
  8215. /**
  8216. * InsertThousandsGrouping:
  8217. * @writer: Unicode writer.
  8218. * @n_buffer: Number of characters in @buffer.
  8219. * @digits: Digits we're reading from. If count is non-NULL, this is unused.
  8220. * @d_pos: Start of digits string.
  8221. * @n_digits: The number of digits in the string, in which we want
  8222. * to put the grouping chars.
  8223. * @min_width: The minimum width of the digits in the output string.
  8224. * Output will be zero-padded on the left to fill.
  8225. * @grouping: see definition in localeconv().
  8226. * @thousands_sep: see definition in localeconv().
  8227. *
  8228. * There are 2 modes: counting and filling. If @writer is NULL,
  8229. * we are in counting mode, else filling mode.
  8230. * If counting, the required buffer size is returned.
  8231. * If filling, we know the buffer will be large enough, so we don't
  8232. * need to pass in the buffer size.
  8233. * Inserts thousand grouping characters (as defined by grouping and
  8234. * thousands_sep) into @writer.
  8235. *
  8236. * Return value: -1 on error, number of characters otherwise.
  8237. **/
  8238. Py_ssize_t
  8239. _PyUnicode_InsertThousandsGrouping(
  8240. _PyUnicodeWriter *writer,
  8241. Py_ssize_t n_buffer,
  8242. PyObject *digits,
  8243. Py_ssize_t d_pos,
  8244. Py_ssize_t n_digits,
  8245. Py_ssize_t min_width,
  8246. const char *grouping,
  8247. PyObject *thousands_sep,
  8248. Py_UCS4 *maxchar)
  8249. {
  8250. min_width = Py_MAX(0, min_width);
  8251. if (writer) {
  8252. assert(digits != NULL);
  8253. assert(maxchar == NULL);
  8254. }
  8255. else {
  8256. assert(digits == NULL);
  8257. assert(maxchar != NULL);
  8258. }
  8259. assert(0 <= d_pos);
  8260. assert(0 <= n_digits);
  8261. assert(grouping != NULL);
  8262. Py_ssize_t count = 0;
  8263. Py_ssize_t n_zeros;
  8264. int loop_broken = 0;
  8265. int use_separator = 0; /* First time through, don't append the
  8266. separator. They only go between
  8267. groups. */
  8268. Py_ssize_t buffer_pos;
  8269. Py_ssize_t digits_pos;
  8270. Py_ssize_t len;
  8271. Py_ssize_t n_chars;
  8272. Py_ssize_t remaining = n_digits; /* Number of chars remaining to
  8273. be looked at */
  8274. /* A generator that returns all of the grouping widths, until it
  8275. returns 0. */
  8276. GroupGenerator groupgen;
  8277. GroupGenerator_init(&groupgen, grouping);
  8278. const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
  8279. /* if digits are not grouped, thousands separator
  8280. should be an empty string */
  8281. assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
  8282. digits_pos = d_pos + n_digits;
  8283. if (writer) {
  8284. buffer_pos = writer->pos + n_buffer;
  8285. assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
  8286. assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
  8287. }
  8288. else {
  8289. buffer_pos = n_buffer;
  8290. }
  8291. if (!writer) {
  8292. *maxchar = 127;
  8293. }
  8294. while ((len = GroupGenerator_next(&groupgen)) > 0) {
  8295. len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
  8296. n_zeros = Py_MAX(0, len - remaining);
  8297. n_chars = Py_MAX(0, Py_MIN(remaining, len));
  8298. /* Use n_zero zero's and n_chars chars */
  8299. /* Count only, don't do anything. */
  8300. count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
  8301. /* Copy into the writer. */
  8302. InsertThousandsGrouping_fill(writer, &buffer_pos,
  8303. digits, &digits_pos,
  8304. n_chars, n_zeros,
  8305. use_separator ? thousands_sep : NULL,
  8306. thousands_sep_len, maxchar);
  8307. /* Use a separator next time. */
  8308. use_separator = 1;
  8309. remaining -= n_chars;
  8310. min_width -= len;
  8311. if (remaining <= 0 && min_width <= 0) {
  8312. loop_broken = 1;
  8313. break;
  8314. }
  8315. min_width -= thousands_sep_len;
  8316. }
  8317. if (!loop_broken) {
  8318. /* We left the loop without using a break statement. */
  8319. len = Py_MAX(Py_MAX(remaining, min_width), 1);
  8320. n_zeros = Py_MAX(0, len - remaining);
  8321. n_chars = Py_MAX(0, Py_MIN(remaining, len));
  8322. /* Use n_zero zero's and n_chars chars */
  8323. count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
  8324. /* Copy into the writer. */
  8325. InsertThousandsGrouping_fill(writer, &buffer_pos,
  8326. digits, &digits_pos,
  8327. n_chars, n_zeros,
  8328. use_separator ? thousands_sep : NULL,
  8329. thousands_sep_len, maxchar);
  8330. }
  8331. return count;
  8332. }
  8333. static Py_ssize_t
  8334. unicode_count_impl(PyObject *str,
  8335. PyObject *substr,
  8336. Py_ssize_t start,
  8337. Py_ssize_t end)
  8338. {
  8339. assert(PyUnicode_Check(str));
  8340. assert(PyUnicode_Check(substr));
  8341. Py_ssize_t result;
  8342. int kind1, kind2;
  8343. const void *buf1 = NULL, *buf2 = NULL;
  8344. Py_ssize_t len1, len2;
  8345. kind1 = PyUnicode_KIND(str);
  8346. kind2 = PyUnicode_KIND(substr);
  8347. if (kind1 < kind2)
  8348. return 0;
  8349. len1 = PyUnicode_GET_LENGTH(str);
  8350. len2 = PyUnicode_GET_LENGTH(substr);
  8351. ADJUST_INDICES(start, end, len1);
  8352. if (end - start < len2)
  8353. return 0;
  8354. buf1 = PyUnicode_DATA(str);
  8355. buf2 = PyUnicode_DATA(substr);
  8356. if (kind2 != kind1) {
  8357. buf2 = unicode_askind(kind2, buf2, len2, kind1);
  8358. if (!buf2)
  8359. goto onError;
  8360. }
  8361. // We don't reuse `anylib_count` here because of the explicit casts.
  8362. switch (kind1) {
  8363. case PyUnicode_1BYTE_KIND:
  8364. result = ucs1lib_count(
  8365. ((const Py_UCS1*)buf1) + start, end - start,
  8366. buf2, len2, PY_SSIZE_T_MAX
  8367. );
  8368. break;
  8369. case PyUnicode_2BYTE_KIND:
  8370. result = ucs2lib_count(
  8371. ((const Py_UCS2*)buf1) + start, end - start,
  8372. buf2, len2, PY_SSIZE_T_MAX
  8373. );
  8374. break;
  8375. case PyUnicode_4BYTE_KIND:
  8376. result = ucs4lib_count(
  8377. ((const Py_UCS4*)buf1) + start, end - start,
  8378. buf2, len2, PY_SSIZE_T_MAX
  8379. );
  8380. break;
  8381. default:
  8382. Py_UNREACHABLE();
  8383. }
  8384. assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
  8385. if (kind2 != kind1)
  8386. PyMem_Free((void *)buf2);
  8387. return result;
  8388. onError:
  8389. assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
  8390. if (kind2 != kind1)
  8391. PyMem_Free((void *)buf2);
  8392. return -1;
  8393. }
  8394. Py_ssize_t
  8395. PyUnicode_Count(PyObject *str,
  8396. PyObject *substr,
  8397. Py_ssize_t start,
  8398. Py_ssize_t end)
  8399. {
  8400. if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
  8401. return -1;
  8402. return unicode_count_impl(str, substr, start, end);
  8403. }
  8404. Py_ssize_t
  8405. PyUnicode_Find(PyObject *str,
  8406. PyObject *substr,
  8407. Py_ssize_t start,
  8408. Py_ssize_t end,
  8409. int direction)
  8410. {
  8411. if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
  8412. return -2;
  8413. return any_find_slice(str, substr, start, end, direction);
  8414. }
  8415. Py_ssize_t
  8416. PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
  8417. Py_ssize_t start, Py_ssize_t end,
  8418. int direction)
  8419. {
  8420. int kind;
  8421. Py_ssize_t len, result;
  8422. len = PyUnicode_GET_LENGTH(str);
  8423. ADJUST_INDICES(start, end, len);
  8424. if (end - start < 1)
  8425. return -1;
  8426. kind = PyUnicode_KIND(str);
  8427. result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
  8428. kind, end-start, ch, direction);
  8429. if (result == -1)
  8430. return -1;
  8431. else
  8432. return start + result;
  8433. }
  8434. static int
  8435. tailmatch(PyObject *self,
  8436. PyObject *substring,
  8437. Py_ssize_t start,
  8438. Py_ssize_t end,
  8439. int direction)
  8440. {
  8441. int kind_self;
  8442. int kind_sub;
  8443. const void *data_self;
  8444. const void *data_sub;
  8445. Py_ssize_t offset;
  8446. Py_ssize_t i;
  8447. Py_ssize_t end_sub;
  8448. ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
  8449. end -= PyUnicode_GET_LENGTH(substring);
  8450. if (end < start)
  8451. return 0;
  8452. if (PyUnicode_GET_LENGTH(substring) == 0)
  8453. return 1;
  8454. kind_self = PyUnicode_KIND(self);
  8455. data_self = PyUnicode_DATA(self);
  8456. kind_sub = PyUnicode_KIND(substring);
  8457. data_sub = PyUnicode_DATA(substring);
  8458. end_sub = PyUnicode_GET_LENGTH(substring) - 1;
  8459. if (direction > 0)
  8460. offset = end;
  8461. else
  8462. offset = start;
  8463. if (PyUnicode_READ(kind_self, data_self, offset) ==
  8464. PyUnicode_READ(kind_sub, data_sub, 0) &&
  8465. PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
  8466. PyUnicode_READ(kind_sub, data_sub, end_sub)) {
  8467. /* If both are of the same kind, memcmp is sufficient */
  8468. if (kind_self == kind_sub) {
  8469. return ! memcmp((char *)data_self +
  8470. (offset * PyUnicode_KIND(substring)),
  8471. data_sub,
  8472. PyUnicode_GET_LENGTH(substring) *
  8473. PyUnicode_KIND(substring));
  8474. }
  8475. /* otherwise we have to compare each character by first accessing it */
  8476. else {
  8477. /* We do not need to compare 0 and len(substring)-1 because
  8478. the if statement above ensured already that they are equal
  8479. when we end up here. */
  8480. for (i = 1; i < end_sub; ++i) {
  8481. if (PyUnicode_READ(kind_self, data_self, offset + i) !=
  8482. PyUnicode_READ(kind_sub, data_sub, i))
  8483. return 0;
  8484. }
  8485. return 1;
  8486. }
  8487. }
  8488. return 0;
  8489. }
  8490. Py_ssize_t
  8491. PyUnicode_Tailmatch(PyObject *str,
  8492. PyObject *substr,
  8493. Py_ssize_t start,
  8494. Py_ssize_t end,
  8495. int direction)
  8496. {
  8497. if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
  8498. return -1;
  8499. return tailmatch(str, substr, start, end, direction);
  8500. }
  8501. static PyObject *
  8502. ascii_upper_or_lower(PyObject *self, int lower)
  8503. {
  8504. Py_ssize_t len = PyUnicode_GET_LENGTH(self);
  8505. const char *data = PyUnicode_DATA(self);
  8506. char *resdata;
  8507. PyObject *res;
  8508. res = PyUnicode_New(len, 127);
  8509. if (res == NULL)
  8510. return NULL;
  8511. resdata = PyUnicode_DATA(res);
  8512. if (lower)
  8513. _Py_bytes_lower(resdata, data, len);
  8514. else
  8515. _Py_bytes_upper(resdata, data, len);
  8516. return res;
  8517. }
  8518. static Py_UCS4
  8519. handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
  8520. {
  8521. Py_ssize_t j;
  8522. int final_sigma;
  8523. Py_UCS4 c = 0; /* initialize to prevent gcc warning */
  8524. /* U+03A3 is in the Final_Sigma context when, it is found like this:
  8525. \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
  8526. where ! is a negation and \p{xxx} is a character with property xxx.
  8527. */
  8528. for (j = i - 1; j >= 0; j--) {
  8529. c = PyUnicode_READ(kind, data, j);
  8530. if (!_PyUnicode_IsCaseIgnorable(c))
  8531. break;
  8532. }
  8533. final_sigma = j >= 0 && _PyUnicode_IsCased(c);
  8534. if (final_sigma) {
  8535. for (j = i + 1; j < length; j++) {
  8536. c = PyUnicode_READ(kind, data, j);
  8537. if (!_PyUnicode_IsCaseIgnorable(c))
  8538. break;
  8539. }
  8540. final_sigma = j == length || !_PyUnicode_IsCased(c);
  8541. }
  8542. return (final_sigma) ? 0x3C2 : 0x3C3;
  8543. }
  8544. static int
  8545. lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
  8546. Py_UCS4 c, Py_UCS4 *mapped)
  8547. {
  8548. /* Obscure special case. */
  8549. if (c == 0x3A3) {
  8550. mapped[0] = handle_capital_sigma(kind, data, length, i);
  8551. return 1;
  8552. }
  8553. return _PyUnicode_ToLowerFull(c, mapped);
  8554. }
  8555. static Py_ssize_t
  8556. do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
  8557. {
  8558. Py_ssize_t i, k = 0;
  8559. int n_res, j;
  8560. Py_UCS4 c, mapped[3];
  8561. c = PyUnicode_READ(kind, data, 0);
  8562. n_res = _PyUnicode_ToTitleFull(c, mapped);
  8563. for (j = 0; j < n_res; j++) {
  8564. *maxchar = Py_MAX(*maxchar, mapped[j]);
  8565. res[k++] = mapped[j];
  8566. }
  8567. for (i = 1; i < length; i++) {
  8568. c = PyUnicode_READ(kind, data, i);
  8569. n_res = lower_ucs4(kind, data, length, i, c, mapped);
  8570. for (j = 0; j < n_res; j++) {
  8571. *maxchar = Py_MAX(*maxchar, mapped[j]);
  8572. res[k++] = mapped[j];
  8573. }
  8574. }
  8575. return k;
  8576. }
  8577. static Py_ssize_t
  8578. do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
  8579. Py_ssize_t i, k = 0;
  8580. for (i = 0; i < length; i++) {
  8581. Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
  8582. int n_res, j;
  8583. if (Py_UNICODE_ISUPPER(c)) {
  8584. n_res = lower_ucs4(kind, data, length, i, c, mapped);
  8585. }
  8586. else if (Py_UNICODE_ISLOWER(c)) {
  8587. n_res = _PyUnicode_ToUpperFull(c, mapped);
  8588. }
  8589. else {
  8590. n_res = 1;
  8591. mapped[0] = c;
  8592. }
  8593. for (j = 0; j < n_res; j++) {
  8594. *maxchar = Py_MAX(*maxchar, mapped[j]);
  8595. res[k++] = mapped[j];
  8596. }
  8597. }
  8598. return k;
  8599. }
  8600. static Py_ssize_t
  8601. do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
  8602. Py_UCS4 *maxchar, int lower)
  8603. {
  8604. Py_ssize_t i, k = 0;
  8605. for (i = 0; i < length; i++) {
  8606. Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
  8607. int n_res, j;
  8608. if (lower)
  8609. n_res = lower_ucs4(kind, data, length, i, c, mapped);
  8610. else
  8611. n_res = _PyUnicode_ToUpperFull(c, mapped);
  8612. for (j = 0; j < n_res; j++) {
  8613. *maxchar = Py_MAX(*maxchar, mapped[j]);
  8614. res[k++] = mapped[j];
  8615. }
  8616. }
  8617. return k;
  8618. }
  8619. static Py_ssize_t
  8620. do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
  8621. {
  8622. return do_upper_or_lower(kind, data, length, res, maxchar, 0);
  8623. }
  8624. static Py_ssize_t
  8625. do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
  8626. {
  8627. return do_upper_or_lower(kind, data, length, res, maxchar, 1);
  8628. }
  8629. static Py_ssize_t
  8630. do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
  8631. {
  8632. Py_ssize_t i, k = 0;
  8633. for (i = 0; i < length; i++) {
  8634. Py_UCS4 c = PyUnicode_READ(kind, data, i);
  8635. Py_UCS4 mapped[3];
  8636. int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
  8637. for (j = 0; j < n_res; j++) {
  8638. *maxchar = Py_MAX(*maxchar, mapped[j]);
  8639. res[k++] = mapped[j];
  8640. }
  8641. }
  8642. return k;
  8643. }
  8644. static Py_ssize_t
  8645. do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
  8646. {
  8647. Py_ssize_t i, k = 0;
  8648. int previous_is_cased;
  8649. previous_is_cased = 0;
  8650. for (i = 0; i < length; i++) {
  8651. const Py_UCS4 c = PyUnicode_READ(kind, data, i);
  8652. Py_UCS4 mapped[3];
  8653. int n_res, j;
  8654. if (previous_is_cased)
  8655. n_res = lower_ucs4(kind, data, length, i, c, mapped);
  8656. else
  8657. n_res = _PyUnicode_ToTitleFull(c, mapped);
  8658. for (j = 0; j < n_res; j++) {
  8659. *maxchar = Py_MAX(*maxchar, mapped[j]);
  8660. res[k++] = mapped[j];
  8661. }
  8662. previous_is_cased = _PyUnicode_IsCased(c);
  8663. }
  8664. return k;
  8665. }
  8666. static PyObject *
  8667. case_operation(PyObject *self,
  8668. Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
  8669. {
  8670. PyObject *res = NULL;
  8671. Py_ssize_t length, newlength = 0;
  8672. int kind, outkind;
  8673. const void *data;
  8674. void *outdata;
  8675. Py_UCS4 maxchar = 0, *tmp, *tmpend;
  8676. kind = PyUnicode_KIND(self);
  8677. data = PyUnicode_DATA(self);
  8678. length = PyUnicode_GET_LENGTH(self);
  8679. if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
  8680. PyErr_SetString(PyExc_OverflowError, "string is too long");
  8681. return NULL;
  8682. }
  8683. tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
  8684. if (tmp == NULL)
  8685. return PyErr_NoMemory();
  8686. newlength = perform(kind, data, length, tmp, &maxchar);
  8687. res = PyUnicode_New(newlength, maxchar);
  8688. if (res == NULL)
  8689. goto leave;
  8690. tmpend = tmp + newlength;
  8691. outdata = PyUnicode_DATA(res);
  8692. outkind = PyUnicode_KIND(res);
  8693. switch (outkind) {
  8694. case PyUnicode_1BYTE_KIND:
  8695. _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
  8696. break;
  8697. case PyUnicode_2BYTE_KIND:
  8698. _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
  8699. break;
  8700. case PyUnicode_4BYTE_KIND:
  8701. memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
  8702. break;
  8703. default:
  8704. Py_UNREACHABLE();
  8705. }
  8706. leave:
  8707. PyMem_Free(tmp);
  8708. return res;
  8709. }
  8710. PyObject *
  8711. PyUnicode_Join(PyObject *separator, PyObject *seq)
  8712. {
  8713. PyObject *res;
  8714. PyObject *fseq;
  8715. Py_ssize_t seqlen;
  8716. PyObject **items;
  8717. fseq = PySequence_Fast(seq, "can only join an iterable");
  8718. if (fseq == NULL) {
  8719. return NULL;
  8720. }
  8721. /* NOTE: the following code can't call back into Python code,
  8722. * so we are sure that fseq won't be mutated.
  8723. */
  8724. items = PySequence_Fast_ITEMS(fseq);
  8725. seqlen = PySequence_Fast_GET_SIZE(fseq);
  8726. res = _PyUnicode_JoinArray(separator, items, seqlen);
  8727. Py_DECREF(fseq);
  8728. return res;
  8729. }
  8730. PyObject *
  8731. _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
  8732. {
  8733. PyObject *res = NULL; /* the result */
  8734. PyObject *sep = NULL;
  8735. Py_ssize_t seplen;
  8736. PyObject *item;
  8737. Py_ssize_t sz, i, res_offset;
  8738. Py_UCS4 maxchar;
  8739. Py_UCS4 item_maxchar;
  8740. int use_memcpy;
  8741. unsigned char *res_data = NULL, *sep_data = NULL;
  8742. PyObject *last_obj;
  8743. int kind = 0;
  8744. /* If empty sequence, return u"". */
  8745. if (seqlen == 0) {
  8746. _Py_RETURN_UNICODE_EMPTY();
  8747. }
  8748. /* If singleton sequence with an exact Unicode, return that. */
  8749. last_obj = NULL;
  8750. if (seqlen == 1) {
  8751. if (PyUnicode_CheckExact(items[0])) {
  8752. res = items[0];
  8753. return Py_NewRef(res);
  8754. }
  8755. seplen = 0;
  8756. maxchar = 0;
  8757. }
  8758. else {
  8759. /* Set up sep and seplen */
  8760. if (separator == NULL) {
  8761. /* fall back to a blank space separator */
  8762. sep = PyUnicode_FromOrdinal(' ');
  8763. if (!sep)
  8764. goto onError;
  8765. seplen = 1;
  8766. maxchar = 32;
  8767. }
  8768. else {
  8769. if (!PyUnicode_Check(separator)) {
  8770. PyErr_Format(PyExc_TypeError,
  8771. "separator: expected str instance,"
  8772. " %.80s found",
  8773. Py_TYPE(separator)->tp_name);
  8774. goto onError;
  8775. }
  8776. sep = separator;
  8777. seplen = PyUnicode_GET_LENGTH(separator);
  8778. maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
  8779. /* inc refcount to keep this code path symmetric with the
  8780. above case of a blank separator */
  8781. Py_INCREF(sep);
  8782. }
  8783. last_obj = sep;
  8784. }
  8785. /* There are at least two things to join, or else we have a subclass
  8786. * of str in the sequence.
  8787. * Do a pre-pass to figure out the total amount of space we'll
  8788. * need (sz), and see whether all argument are strings.
  8789. */
  8790. sz = 0;
  8791. #ifdef Py_DEBUG
  8792. use_memcpy = 0;
  8793. #else
  8794. use_memcpy = 1;
  8795. #endif
  8796. for (i = 0; i < seqlen; i++) {
  8797. size_t add_sz;
  8798. item = items[i];
  8799. if (!PyUnicode_Check(item)) {
  8800. PyErr_Format(PyExc_TypeError,
  8801. "sequence item %zd: expected str instance,"
  8802. " %.80s found",
  8803. i, Py_TYPE(item)->tp_name);
  8804. goto onError;
  8805. }
  8806. add_sz = PyUnicode_GET_LENGTH(item);
  8807. item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
  8808. maxchar = Py_MAX(maxchar, item_maxchar);
  8809. if (i != 0) {
  8810. add_sz += seplen;
  8811. }
  8812. if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
  8813. PyErr_SetString(PyExc_OverflowError,
  8814. "join() result is too long for a Python string");
  8815. goto onError;
  8816. }
  8817. sz += add_sz;
  8818. if (use_memcpy && last_obj != NULL) {
  8819. if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
  8820. use_memcpy = 0;
  8821. }
  8822. last_obj = item;
  8823. }
  8824. res = PyUnicode_New(sz, maxchar);
  8825. if (res == NULL)
  8826. goto onError;
  8827. /* Catenate everything. */
  8828. #ifdef Py_DEBUG
  8829. use_memcpy = 0;
  8830. #else
  8831. if (use_memcpy) {
  8832. res_data = PyUnicode_1BYTE_DATA(res);
  8833. kind = PyUnicode_KIND(res);
  8834. if (seplen != 0)
  8835. sep_data = PyUnicode_1BYTE_DATA(sep);
  8836. }
  8837. #endif
  8838. if (use_memcpy) {
  8839. for (i = 0; i < seqlen; ++i) {
  8840. Py_ssize_t itemlen;
  8841. item = items[i];
  8842. /* Copy item, and maybe the separator. */
  8843. if (i && seplen != 0) {
  8844. memcpy(res_data,
  8845. sep_data,
  8846. kind * seplen);
  8847. res_data += kind * seplen;
  8848. }
  8849. itemlen = PyUnicode_GET_LENGTH(item);
  8850. if (itemlen != 0) {
  8851. memcpy(res_data,
  8852. PyUnicode_DATA(item),
  8853. kind * itemlen);
  8854. res_data += kind * itemlen;
  8855. }
  8856. }
  8857. assert(res_data == PyUnicode_1BYTE_DATA(res)
  8858. + kind * PyUnicode_GET_LENGTH(res));
  8859. }
  8860. else {
  8861. for (i = 0, res_offset = 0; i < seqlen; ++i) {
  8862. Py_ssize_t itemlen;
  8863. item = items[i];
  8864. /* Copy item, and maybe the separator. */
  8865. if (i && seplen != 0) {
  8866. _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
  8867. res_offset += seplen;
  8868. }
  8869. itemlen = PyUnicode_GET_LENGTH(item);
  8870. if (itemlen != 0) {
  8871. _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
  8872. res_offset += itemlen;
  8873. }
  8874. }
  8875. assert(res_offset == PyUnicode_GET_LENGTH(res));
  8876. }
  8877. Py_XDECREF(sep);
  8878. assert(_PyUnicode_CheckConsistency(res, 1));
  8879. return res;
  8880. onError:
  8881. Py_XDECREF(sep);
  8882. Py_XDECREF(res);
  8883. return NULL;
  8884. }
  8885. void
  8886. _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
  8887. Py_UCS4 fill_char)
  8888. {
  8889. const int kind = PyUnicode_KIND(unicode);
  8890. void *data = PyUnicode_DATA(unicode);
  8891. assert(unicode_modifiable(unicode));
  8892. assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
  8893. assert(start >= 0);
  8894. assert(start + length <= PyUnicode_GET_LENGTH(unicode));
  8895. unicode_fill(kind, data, fill_char, start, length);
  8896. }
  8897. Py_ssize_t
  8898. PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
  8899. Py_UCS4 fill_char)
  8900. {
  8901. Py_ssize_t maxlen;
  8902. if (!PyUnicode_Check(unicode)) {
  8903. PyErr_BadInternalCall();
  8904. return -1;
  8905. }
  8906. if (unicode_check_modifiable(unicode))
  8907. return -1;
  8908. if (start < 0) {
  8909. PyErr_SetString(PyExc_IndexError, "string index out of range");
  8910. return -1;
  8911. }
  8912. if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
  8913. PyErr_SetString(PyExc_ValueError,
  8914. "fill character is bigger than "
  8915. "the string maximum character");
  8916. return -1;
  8917. }
  8918. maxlen = PyUnicode_GET_LENGTH(unicode) - start;
  8919. length = Py_MIN(maxlen, length);
  8920. if (length <= 0)
  8921. return 0;
  8922. _PyUnicode_FastFill(unicode, start, length, fill_char);
  8923. return length;
  8924. }
  8925. static PyObject *
  8926. pad(PyObject *self,
  8927. Py_ssize_t left,
  8928. Py_ssize_t right,
  8929. Py_UCS4 fill)
  8930. {
  8931. PyObject *u;
  8932. Py_UCS4 maxchar;
  8933. int kind;
  8934. void *data;
  8935. if (left < 0)
  8936. left = 0;
  8937. if (right < 0)
  8938. right = 0;
  8939. if (left == 0 && right == 0)
  8940. return unicode_result_unchanged(self);
  8941. if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
  8942. right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
  8943. PyErr_SetString(PyExc_OverflowError, "padded string is too long");
  8944. return NULL;
  8945. }
  8946. maxchar = PyUnicode_MAX_CHAR_VALUE(self);
  8947. maxchar = Py_MAX(maxchar, fill);
  8948. u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
  8949. if (!u)
  8950. return NULL;
  8951. kind = PyUnicode_KIND(u);
  8952. data = PyUnicode_DATA(u);
  8953. if (left)
  8954. unicode_fill(kind, data, fill, 0, left);
  8955. if (right)
  8956. unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
  8957. _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
  8958. assert(_PyUnicode_CheckConsistency(u, 1));
  8959. return u;
  8960. }
  8961. PyObject *
  8962. PyUnicode_Splitlines(PyObject *string, int keepends)
  8963. {
  8964. PyObject *list;
  8965. if (ensure_unicode(string) < 0)
  8966. return NULL;
  8967. switch (PyUnicode_KIND(string)) {
  8968. case PyUnicode_1BYTE_KIND:
  8969. if (PyUnicode_IS_ASCII(string))
  8970. list = asciilib_splitlines(
  8971. string, PyUnicode_1BYTE_DATA(string),
  8972. PyUnicode_GET_LENGTH(string), keepends);
  8973. else
  8974. list = ucs1lib_splitlines(
  8975. string, PyUnicode_1BYTE_DATA(string),
  8976. PyUnicode_GET_LENGTH(string), keepends);
  8977. break;
  8978. case PyUnicode_2BYTE_KIND:
  8979. list = ucs2lib_splitlines(
  8980. string, PyUnicode_2BYTE_DATA(string),
  8981. PyUnicode_GET_LENGTH(string), keepends);
  8982. break;
  8983. case PyUnicode_4BYTE_KIND:
  8984. list = ucs4lib_splitlines(
  8985. string, PyUnicode_4BYTE_DATA(string),
  8986. PyUnicode_GET_LENGTH(string), keepends);
  8987. break;
  8988. default:
  8989. Py_UNREACHABLE();
  8990. }
  8991. return list;
  8992. }
  8993. static PyObject *
  8994. split(PyObject *self,
  8995. PyObject *substring,
  8996. Py_ssize_t maxcount)
  8997. {
  8998. int kind1, kind2;
  8999. const void *buf1, *buf2;
  9000. Py_ssize_t len1, len2;
  9001. PyObject* out;
  9002. len1 = PyUnicode_GET_LENGTH(self);
  9003. kind1 = PyUnicode_KIND(self);
  9004. if (substring == NULL) {
  9005. if (maxcount < 0) {
  9006. maxcount = (len1 - 1) / 2 + 1;
  9007. }
  9008. switch (kind1) {
  9009. case PyUnicode_1BYTE_KIND:
  9010. if (PyUnicode_IS_ASCII(self))
  9011. return asciilib_split_whitespace(
  9012. self, PyUnicode_1BYTE_DATA(self),
  9013. len1, maxcount
  9014. );
  9015. else
  9016. return ucs1lib_split_whitespace(
  9017. self, PyUnicode_1BYTE_DATA(self),
  9018. len1, maxcount
  9019. );
  9020. case PyUnicode_2BYTE_KIND:
  9021. return ucs2lib_split_whitespace(
  9022. self, PyUnicode_2BYTE_DATA(self),
  9023. len1, maxcount
  9024. );
  9025. case PyUnicode_4BYTE_KIND:
  9026. return ucs4lib_split_whitespace(
  9027. self, PyUnicode_4BYTE_DATA(self),
  9028. len1, maxcount
  9029. );
  9030. default:
  9031. Py_UNREACHABLE();
  9032. }
  9033. }
  9034. kind2 = PyUnicode_KIND(substring);
  9035. len2 = PyUnicode_GET_LENGTH(substring);
  9036. if (maxcount < 0) {
  9037. // if len2 == 0, it will raise ValueError.
  9038. maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
  9039. // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
  9040. maxcount = maxcount < 0 ? len1 : maxcount;
  9041. }
  9042. if (kind1 < kind2 || len1 < len2) {
  9043. out = PyList_New(1);
  9044. if (out == NULL)
  9045. return NULL;
  9046. PyList_SET_ITEM(out, 0, Py_NewRef(self));
  9047. return out;
  9048. }
  9049. buf1 = PyUnicode_DATA(self);
  9050. buf2 = PyUnicode_DATA(substring);
  9051. if (kind2 != kind1) {
  9052. buf2 = unicode_askind(kind2, buf2, len2, kind1);
  9053. if (!buf2)
  9054. return NULL;
  9055. }
  9056. switch (kind1) {
  9057. case PyUnicode_1BYTE_KIND:
  9058. if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
  9059. out = asciilib_split(
  9060. self, buf1, len1, buf2, len2, maxcount);
  9061. else
  9062. out = ucs1lib_split(
  9063. self, buf1, len1, buf2, len2, maxcount);
  9064. break;
  9065. case PyUnicode_2BYTE_KIND:
  9066. out = ucs2lib_split(
  9067. self, buf1, len1, buf2, len2, maxcount);
  9068. break;
  9069. case PyUnicode_4BYTE_KIND:
  9070. out = ucs4lib_split(
  9071. self, buf1, len1, buf2, len2, maxcount);
  9072. break;
  9073. default:
  9074. out = NULL;
  9075. }
  9076. assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
  9077. if (kind2 != kind1)
  9078. PyMem_Free((void *)buf2);
  9079. return out;
  9080. }
  9081. static PyObject *
  9082. rsplit(PyObject *self,
  9083. PyObject *substring,
  9084. Py_ssize_t maxcount)
  9085. {
  9086. int kind1, kind2;
  9087. const void *buf1, *buf2;
  9088. Py_ssize_t len1, len2;
  9089. PyObject* out;
  9090. len1 = PyUnicode_GET_LENGTH(self);
  9091. kind1 = PyUnicode_KIND(self);
  9092. if (substring == NULL) {
  9093. if (maxcount < 0) {
  9094. maxcount = (len1 - 1) / 2 + 1;
  9095. }
  9096. switch (kind1) {
  9097. case PyUnicode_1BYTE_KIND:
  9098. if (PyUnicode_IS_ASCII(self))
  9099. return asciilib_rsplit_whitespace(
  9100. self, PyUnicode_1BYTE_DATA(self),
  9101. len1, maxcount
  9102. );
  9103. else
  9104. return ucs1lib_rsplit_whitespace(
  9105. self, PyUnicode_1BYTE_DATA(self),
  9106. len1, maxcount
  9107. );
  9108. case PyUnicode_2BYTE_KIND:
  9109. return ucs2lib_rsplit_whitespace(
  9110. self, PyUnicode_2BYTE_DATA(self),
  9111. len1, maxcount
  9112. );
  9113. case PyUnicode_4BYTE_KIND:
  9114. return ucs4lib_rsplit_whitespace(
  9115. self, PyUnicode_4BYTE_DATA(self),
  9116. len1, maxcount
  9117. );
  9118. default:
  9119. Py_UNREACHABLE();
  9120. }
  9121. }
  9122. kind2 = PyUnicode_KIND(substring);
  9123. len2 = PyUnicode_GET_LENGTH(substring);
  9124. if (maxcount < 0) {
  9125. // if len2 == 0, it will raise ValueError.
  9126. maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
  9127. // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
  9128. maxcount = maxcount < 0 ? len1 : maxcount;
  9129. }
  9130. if (kind1 < kind2 || len1 < len2) {
  9131. out = PyList_New(1);
  9132. if (out == NULL)
  9133. return NULL;
  9134. PyList_SET_ITEM(out, 0, Py_NewRef(self));
  9135. return out;
  9136. }
  9137. buf1 = PyUnicode_DATA(self);
  9138. buf2 = PyUnicode_DATA(substring);
  9139. if (kind2 != kind1) {
  9140. buf2 = unicode_askind(kind2, buf2, len2, kind1);
  9141. if (!buf2)
  9142. return NULL;
  9143. }
  9144. switch (kind1) {
  9145. case PyUnicode_1BYTE_KIND:
  9146. if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
  9147. out = asciilib_rsplit(
  9148. self, buf1, len1, buf2, len2, maxcount);
  9149. else
  9150. out = ucs1lib_rsplit(
  9151. self, buf1, len1, buf2, len2, maxcount);
  9152. break;
  9153. case PyUnicode_2BYTE_KIND:
  9154. out = ucs2lib_rsplit(
  9155. self, buf1, len1, buf2, len2, maxcount);
  9156. break;
  9157. case PyUnicode_4BYTE_KIND:
  9158. out = ucs4lib_rsplit(
  9159. self, buf1, len1, buf2, len2, maxcount);
  9160. break;
  9161. default:
  9162. out = NULL;
  9163. }
  9164. assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
  9165. if (kind2 != kind1)
  9166. PyMem_Free((void *)buf2);
  9167. return out;
  9168. }
  9169. static Py_ssize_t
  9170. anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
  9171. PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
  9172. {
  9173. switch (kind) {
  9174. case PyUnicode_1BYTE_KIND:
  9175. if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
  9176. return asciilib_find(buf1, len1, buf2, len2, offset);
  9177. else
  9178. return ucs1lib_find(buf1, len1, buf2, len2, offset);
  9179. case PyUnicode_2BYTE_KIND:
  9180. return ucs2lib_find(buf1, len1, buf2, len2, offset);
  9181. case PyUnicode_4BYTE_KIND:
  9182. return ucs4lib_find(buf1, len1, buf2, len2, offset);
  9183. }
  9184. Py_UNREACHABLE();
  9185. }
  9186. static Py_ssize_t
  9187. anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
  9188. PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
  9189. {
  9190. switch (kind) {
  9191. case PyUnicode_1BYTE_KIND:
  9192. return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
  9193. case PyUnicode_2BYTE_KIND:
  9194. return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
  9195. case PyUnicode_4BYTE_KIND:
  9196. return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
  9197. }
  9198. Py_UNREACHABLE();
  9199. }
  9200. static void
  9201. replace_1char_inplace(PyObject *u, Py_ssize_t pos,
  9202. Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
  9203. {
  9204. int kind = PyUnicode_KIND(u);
  9205. void *data = PyUnicode_DATA(u);
  9206. Py_ssize_t len = PyUnicode_GET_LENGTH(u);
  9207. if (kind == PyUnicode_1BYTE_KIND) {
  9208. ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
  9209. (Py_UCS1 *)data + len,
  9210. u1, u2, maxcount);
  9211. }
  9212. else if (kind == PyUnicode_2BYTE_KIND) {
  9213. ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
  9214. (Py_UCS2 *)data + len,
  9215. u1, u2, maxcount);
  9216. }
  9217. else {
  9218. assert(kind == PyUnicode_4BYTE_KIND);
  9219. ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
  9220. (Py_UCS4 *)data + len,
  9221. u1, u2, maxcount);
  9222. }
  9223. }
  9224. static PyObject *
  9225. replace(PyObject *self, PyObject *str1,
  9226. PyObject *str2, Py_ssize_t maxcount)
  9227. {
  9228. PyObject *u;
  9229. const char *sbuf = PyUnicode_DATA(self);
  9230. const void *buf1 = PyUnicode_DATA(str1);
  9231. const void *buf2 = PyUnicode_DATA(str2);
  9232. int srelease = 0, release1 = 0, release2 = 0;
  9233. int skind = PyUnicode_KIND(self);
  9234. int kind1 = PyUnicode_KIND(str1);
  9235. int kind2 = PyUnicode_KIND(str2);
  9236. Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
  9237. Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
  9238. Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
  9239. int mayshrink;
  9240. Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
  9241. if (slen < len1)
  9242. goto nothing;
  9243. if (maxcount < 0)
  9244. maxcount = PY_SSIZE_T_MAX;
  9245. else if (maxcount == 0)
  9246. goto nothing;
  9247. if (str1 == str2)
  9248. goto nothing;
  9249. maxchar = PyUnicode_MAX_CHAR_VALUE(self);
  9250. maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
  9251. if (maxchar < maxchar_str1)
  9252. /* substring too wide to be present */
  9253. goto nothing;
  9254. maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
  9255. /* Replacing str1 with str2 may cause a maxchar reduction in the
  9256. result string. */
  9257. mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
  9258. maxchar = Py_MAX(maxchar, maxchar_str2);
  9259. if (len1 == len2) {
  9260. /* same length */
  9261. if (len1 == 0)
  9262. goto nothing;
  9263. if (len1 == 1) {
  9264. /* replace characters */
  9265. Py_UCS4 u1, u2;
  9266. Py_ssize_t pos;
  9267. u1 = PyUnicode_READ(kind1, buf1, 0);
  9268. pos = findchar(sbuf, skind, slen, u1, 1);
  9269. if (pos < 0)
  9270. goto nothing;
  9271. u2 = PyUnicode_READ(kind2, buf2, 0);
  9272. u = PyUnicode_New(slen, maxchar);
  9273. if (!u)
  9274. goto error;
  9275. _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
  9276. replace_1char_inplace(u, pos, u1, u2, maxcount);
  9277. }
  9278. else {
  9279. int rkind = skind;
  9280. char *res;
  9281. Py_ssize_t i;
  9282. if (kind1 < rkind) {
  9283. /* widen substring */
  9284. buf1 = unicode_askind(kind1, buf1, len1, rkind);
  9285. if (!buf1) goto error;
  9286. release1 = 1;
  9287. }
  9288. i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
  9289. if (i < 0)
  9290. goto nothing;
  9291. if (rkind > kind2) {
  9292. /* widen replacement */
  9293. buf2 = unicode_askind(kind2, buf2, len2, rkind);
  9294. if (!buf2) goto error;
  9295. release2 = 1;
  9296. }
  9297. else if (rkind < kind2) {
  9298. /* widen self and buf1 */
  9299. rkind = kind2;
  9300. if (release1) {
  9301. assert(buf1 != PyUnicode_DATA(str1));
  9302. PyMem_Free((void *)buf1);
  9303. buf1 = PyUnicode_DATA(str1);
  9304. release1 = 0;
  9305. }
  9306. sbuf = unicode_askind(skind, sbuf, slen, rkind);
  9307. if (!sbuf) goto error;
  9308. srelease = 1;
  9309. buf1 = unicode_askind(kind1, buf1, len1, rkind);
  9310. if (!buf1) goto error;
  9311. release1 = 1;
  9312. }
  9313. u = PyUnicode_New(slen, maxchar);
  9314. if (!u)
  9315. goto error;
  9316. assert(PyUnicode_KIND(u) == rkind);
  9317. res = PyUnicode_DATA(u);
  9318. memcpy(res, sbuf, rkind * slen);
  9319. /* change everything in-place, starting with this one */
  9320. memcpy(res + rkind * i,
  9321. buf2,
  9322. rkind * len2);
  9323. i += len1;
  9324. while ( --maxcount > 0) {
  9325. i = anylib_find(rkind, self,
  9326. sbuf+rkind*i, slen-i,
  9327. str1, buf1, len1, i);
  9328. if (i == -1)
  9329. break;
  9330. memcpy(res + rkind * i,
  9331. buf2,
  9332. rkind * len2);
  9333. i += len1;
  9334. }
  9335. }
  9336. }
  9337. else {
  9338. Py_ssize_t n, i, j, ires;
  9339. Py_ssize_t new_size;
  9340. int rkind = skind;
  9341. char *res;
  9342. if (kind1 < rkind) {
  9343. /* widen substring */
  9344. buf1 = unicode_askind(kind1, buf1, len1, rkind);
  9345. if (!buf1) goto error;
  9346. release1 = 1;
  9347. }
  9348. n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
  9349. if (n == 0)
  9350. goto nothing;
  9351. if (kind2 < rkind) {
  9352. /* widen replacement */
  9353. buf2 = unicode_askind(kind2, buf2, len2, rkind);
  9354. if (!buf2) goto error;
  9355. release2 = 1;
  9356. }
  9357. else if (kind2 > rkind) {
  9358. /* widen self and buf1 */
  9359. rkind = kind2;
  9360. sbuf = unicode_askind(skind, sbuf, slen, rkind);
  9361. if (!sbuf) goto error;
  9362. srelease = 1;
  9363. if (release1) {
  9364. assert(buf1 != PyUnicode_DATA(str1));
  9365. PyMem_Free((void *)buf1);
  9366. buf1 = PyUnicode_DATA(str1);
  9367. release1 = 0;
  9368. }
  9369. buf1 = unicode_askind(kind1, buf1, len1, rkind);
  9370. if (!buf1) goto error;
  9371. release1 = 1;
  9372. }
  9373. /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
  9374. PyUnicode_GET_LENGTH(str1)); */
  9375. if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
  9376. PyErr_SetString(PyExc_OverflowError,
  9377. "replace string is too long");
  9378. goto error;
  9379. }
  9380. new_size = slen + n * (len2 - len1);
  9381. if (new_size == 0) {
  9382. u = unicode_new_empty();
  9383. goto done;
  9384. }
  9385. if (new_size > (PY_SSIZE_T_MAX / rkind)) {
  9386. PyErr_SetString(PyExc_OverflowError,
  9387. "replace string is too long");
  9388. goto error;
  9389. }
  9390. u = PyUnicode_New(new_size, maxchar);
  9391. if (!u)
  9392. goto error;
  9393. assert(PyUnicode_KIND(u) == rkind);
  9394. res = PyUnicode_DATA(u);
  9395. ires = i = 0;
  9396. if (len1 > 0) {
  9397. while (n-- > 0) {
  9398. /* look for next match */
  9399. j = anylib_find(rkind, self,
  9400. sbuf + rkind * i, slen-i,
  9401. str1, buf1, len1, i);
  9402. if (j == -1)
  9403. break;
  9404. else if (j > i) {
  9405. /* copy unchanged part [i:j] */
  9406. memcpy(res + rkind * ires,
  9407. sbuf + rkind * i,
  9408. rkind * (j-i));
  9409. ires += j - i;
  9410. }
  9411. /* copy substitution string */
  9412. if (len2 > 0) {
  9413. memcpy(res + rkind * ires,
  9414. buf2,
  9415. rkind * len2);
  9416. ires += len2;
  9417. }
  9418. i = j + len1;
  9419. }
  9420. if (i < slen)
  9421. /* copy tail [i:] */
  9422. memcpy(res + rkind * ires,
  9423. sbuf + rkind * i,
  9424. rkind * (slen-i));
  9425. }
  9426. else {
  9427. /* interleave */
  9428. while (n > 0) {
  9429. memcpy(res + rkind * ires,
  9430. buf2,
  9431. rkind * len2);
  9432. ires += len2;
  9433. if (--n <= 0)
  9434. break;
  9435. memcpy(res + rkind * ires,
  9436. sbuf + rkind * i,
  9437. rkind);
  9438. ires++;
  9439. i++;
  9440. }
  9441. memcpy(res + rkind * ires,
  9442. sbuf + rkind * i,
  9443. rkind * (slen-i));
  9444. }
  9445. }
  9446. if (mayshrink) {
  9447. unicode_adjust_maxchar(&u);
  9448. if (u == NULL)
  9449. goto error;
  9450. }
  9451. done:
  9452. assert(srelease == (sbuf != PyUnicode_DATA(self)));
  9453. assert(release1 == (buf1 != PyUnicode_DATA(str1)));
  9454. assert(release2 == (buf2 != PyUnicode_DATA(str2)));
  9455. if (srelease)
  9456. PyMem_Free((void *)sbuf);
  9457. if (release1)
  9458. PyMem_Free((void *)buf1);
  9459. if (release2)
  9460. PyMem_Free((void *)buf2);
  9461. assert(_PyUnicode_CheckConsistency(u, 1));
  9462. return u;
  9463. nothing:
  9464. /* nothing to replace; return original string (when possible) */
  9465. assert(srelease == (sbuf != PyUnicode_DATA(self)));
  9466. assert(release1 == (buf1 != PyUnicode_DATA(str1)));
  9467. assert(release2 == (buf2 != PyUnicode_DATA(str2)));
  9468. if (srelease)
  9469. PyMem_Free((void *)sbuf);
  9470. if (release1)
  9471. PyMem_Free((void *)buf1);
  9472. if (release2)
  9473. PyMem_Free((void *)buf2);
  9474. return unicode_result_unchanged(self);
  9475. error:
  9476. assert(srelease == (sbuf != PyUnicode_DATA(self)));
  9477. assert(release1 == (buf1 != PyUnicode_DATA(str1)));
  9478. assert(release2 == (buf2 != PyUnicode_DATA(str2)));
  9479. if (srelease)
  9480. PyMem_Free((void *)sbuf);
  9481. if (release1)
  9482. PyMem_Free((void *)buf1);
  9483. if (release2)
  9484. PyMem_Free((void *)buf2);
  9485. return NULL;
  9486. }
  9487. /* --- Unicode Object Methods --------------------------------------------- */
  9488. /*[clinic input]
  9489. str.title as unicode_title
  9490. Return a version of the string where each word is titlecased.
  9491. More specifically, words start with uppercased characters and all remaining
  9492. cased characters have lower case.
  9493. [clinic start generated code]*/
  9494. static PyObject *
  9495. unicode_title_impl(PyObject *self)
  9496. /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
  9497. {
  9498. return case_operation(self, do_title);
  9499. }
  9500. /*[clinic input]
  9501. str.capitalize as unicode_capitalize
  9502. Return a capitalized version of the string.
  9503. More specifically, make the first character have upper case and the rest lower
  9504. case.
  9505. [clinic start generated code]*/
  9506. static PyObject *
  9507. unicode_capitalize_impl(PyObject *self)
  9508. /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
  9509. {
  9510. if (PyUnicode_GET_LENGTH(self) == 0)
  9511. return unicode_result_unchanged(self);
  9512. return case_operation(self, do_capitalize);
  9513. }
  9514. /*[clinic input]
  9515. str.casefold as unicode_casefold
  9516. Return a version of the string suitable for caseless comparisons.
  9517. [clinic start generated code]*/
  9518. static PyObject *
  9519. unicode_casefold_impl(PyObject *self)
  9520. /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
  9521. {
  9522. if (PyUnicode_IS_ASCII(self))
  9523. return ascii_upper_or_lower(self, 1);
  9524. return case_operation(self, do_casefold);
  9525. }
  9526. /* Argument converter. Accepts a single Unicode character. */
  9527. static int
  9528. convert_uc(PyObject *obj, void *addr)
  9529. {
  9530. Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
  9531. if (!PyUnicode_Check(obj)) {
  9532. PyErr_Format(PyExc_TypeError,
  9533. "The fill character must be a unicode character, "
  9534. "not %.100s", Py_TYPE(obj)->tp_name);
  9535. return 0;
  9536. }
  9537. if (PyUnicode_GET_LENGTH(obj) != 1) {
  9538. PyErr_SetString(PyExc_TypeError,
  9539. "The fill character must be exactly one character long");
  9540. return 0;
  9541. }
  9542. *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
  9543. return 1;
  9544. }
  9545. /*[clinic input]
  9546. str.center as unicode_center
  9547. width: Py_ssize_t
  9548. fillchar: Py_UCS4 = ' '
  9549. /
  9550. Return a centered string of length width.
  9551. Padding is done using the specified fill character (default is a space).
  9552. [clinic start generated code]*/
  9553. static PyObject *
  9554. unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
  9555. /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
  9556. {
  9557. Py_ssize_t marg, left;
  9558. if (PyUnicode_GET_LENGTH(self) >= width)
  9559. return unicode_result_unchanged(self);
  9560. marg = width - PyUnicode_GET_LENGTH(self);
  9561. left = marg / 2 + (marg & width & 1);
  9562. return pad(self, left, marg - left, fillchar);
  9563. }
  9564. /* This function assumes that str1 and str2 are readied by the caller. */
  9565. static int
  9566. unicode_compare(PyObject *str1, PyObject *str2)
  9567. {
  9568. #define COMPARE(TYPE1, TYPE2) \
  9569. do { \
  9570. TYPE1* p1 = (TYPE1 *)data1; \
  9571. TYPE2* p2 = (TYPE2 *)data2; \
  9572. TYPE1* end = p1 + len; \
  9573. Py_UCS4 c1, c2; \
  9574. for (; p1 != end; p1++, p2++) { \
  9575. c1 = *p1; \
  9576. c2 = *p2; \
  9577. if (c1 != c2) \
  9578. return (c1 < c2) ? -1 : 1; \
  9579. } \
  9580. } \
  9581. while (0)
  9582. int kind1, kind2;
  9583. const void *data1, *data2;
  9584. Py_ssize_t len1, len2, len;
  9585. kind1 = PyUnicode_KIND(str1);
  9586. kind2 = PyUnicode_KIND(str2);
  9587. data1 = PyUnicode_DATA(str1);
  9588. data2 = PyUnicode_DATA(str2);
  9589. len1 = PyUnicode_GET_LENGTH(str1);
  9590. len2 = PyUnicode_GET_LENGTH(str2);
  9591. len = Py_MIN(len1, len2);
  9592. switch(kind1) {
  9593. case PyUnicode_1BYTE_KIND:
  9594. {
  9595. switch(kind2) {
  9596. case PyUnicode_1BYTE_KIND:
  9597. {
  9598. int cmp = memcmp(data1, data2, len);
  9599. /* normalize result of memcmp() into the range [-1; 1] */
  9600. if (cmp < 0)
  9601. return -1;
  9602. if (cmp > 0)
  9603. return 1;
  9604. break;
  9605. }
  9606. case PyUnicode_2BYTE_KIND:
  9607. COMPARE(Py_UCS1, Py_UCS2);
  9608. break;
  9609. case PyUnicode_4BYTE_KIND:
  9610. COMPARE(Py_UCS1, Py_UCS4);
  9611. break;
  9612. default:
  9613. Py_UNREACHABLE();
  9614. }
  9615. break;
  9616. }
  9617. case PyUnicode_2BYTE_KIND:
  9618. {
  9619. switch(kind2) {
  9620. case PyUnicode_1BYTE_KIND:
  9621. COMPARE(Py_UCS2, Py_UCS1);
  9622. break;
  9623. case PyUnicode_2BYTE_KIND:
  9624. {
  9625. COMPARE(Py_UCS2, Py_UCS2);
  9626. break;
  9627. }
  9628. case PyUnicode_4BYTE_KIND:
  9629. COMPARE(Py_UCS2, Py_UCS4);
  9630. break;
  9631. default:
  9632. Py_UNREACHABLE();
  9633. }
  9634. break;
  9635. }
  9636. case PyUnicode_4BYTE_KIND:
  9637. {
  9638. switch(kind2) {
  9639. case PyUnicode_1BYTE_KIND:
  9640. COMPARE(Py_UCS4, Py_UCS1);
  9641. break;
  9642. case PyUnicode_2BYTE_KIND:
  9643. COMPARE(Py_UCS4, Py_UCS2);
  9644. break;
  9645. case PyUnicode_4BYTE_KIND:
  9646. {
  9647. #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
  9648. int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
  9649. /* normalize result of wmemcmp() into the range [-1; 1] */
  9650. if (cmp < 0)
  9651. return -1;
  9652. if (cmp > 0)
  9653. return 1;
  9654. #else
  9655. COMPARE(Py_UCS4, Py_UCS4);
  9656. #endif
  9657. break;
  9658. }
  9659. default:
  9660. Py_UNREACHABLE();
  9661. }
  9662. break;
  9663. }
  9664. default:
  9665. Py_UNREACHABLE();
  9666. }
  9667. if (len1 == len2)
  9668. return 0;
  9669. if (len1 < len2)
  9670. return -1;
  9671. else
  9672. return 1;
  9673. #undef COMPARE
  9674. }
  9675. static int
  9676. unicode_compare_eq(PyObject *str1, PyObject *str2)
  9677. {
  9678. int kind;
  9679. const void *data1, *data2;
  9680. Py_ssize_t len;
  9681. int cmp;
  9682. len = PyUnicode_GET_LENGTH(str1);
  9683. if (PyUnicode_GET_LENGTH(str2) != len)
  9684. return 0;
  9685. kind = PyUnicode_KIND(str1);
  9686. if (PyUnicode_KIND(str2) != kind)
  9687. return 0;
  9688. data1 = PyUnicode_DATA(str1);
  9689. data2 = PyUnicode_DATA(str2);
  9690. cmp = memcmp(data1, data2, len * kind);
  9691. return (cmp == 0);
  9692. }
  9693. int
  9694. _PyUnicode_Equal(PyObject *str1, PyObject *str2)
  9695. {
  9696. assert(PyUnicode_Check(str1));
  9697. assert(PyUnicode_Check(str2));
  9698. if (str1 == str2) {
  9699. return 1;
  9700. }
  9701. return unicode_compare_eq(str1, str2);
  9702. }
  9703. int
  9704. PyUnicode_Compare(PyObject *left, PyObject *right)
  9705. {
  9706. if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
  9707. /* a string is equal to itself */
  9708. if (left == right)
  9709. return 0;
  9710. return unicode_compare(left, right);
  9711. }
  9712. PyErr_Format(PyExc_TypeError,
  9713. "Can't compare %.100s and %.100s",
  9714. Py_TYPE(left)->tp_name,
  9715. Py_TYPE(right)->tp_name);
  9716. return -1;
  9717. }
  9718. int
  9719. PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
  9720. {
  9721. Py_ssize_t i;
  9722. int kind;
  9723. Py_UCS4 chr;
  9724. assert(_PyUnicode_CHECK(uni));
  9725. kind = PyUnicode_KIND(uni);
  9726. if (kind == PyUnicode_1BYTE_KIND) {
  9727. const void *data = PyUnicode_1BYTE_DATA(uni);
  9728. size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
  9729. size_t len, len2 = strlen(str);
  9730. int cmp;
  9731. len = Py_MIN(len1, len2);
  9732. cmp = memcmp(data, str, len);
  9733. if (cmp != 0) {
  9734. if (cmp < 0)
  9735. return -1;
  9736. else
  9737. return 1;
  9738. }
  9739. if (len1 > len2)
  9740. return 1; /* uni is longer */
  9741. if (len1 < len2)
  9742. return -1; /* str is longer */
  9743. return 0;
  9744. }
  9745. else {
  9746. const void *data = PyUnicode_DATA(uni);
  9747. /* Compare Unicode string and source character set string */
  9748. for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
  9749. if (chr != (unsigned char)str[i])
  9750. return (chr < (unsigned char)(str[i])) ? -1 : 1;
  9751. /* This check keeps Python strings that end in '\0' from comparing equal
  9752. to C strings identical up to that point. */
  9753. if (PyUnicode_GET_LENGTH(uni) != i || chr)
  9754. return 1; /* uni is longer */
  9755. if (str[i])
  9756. return -1; /* str is longer */
  9757. return 0;
  9758. }
  9759. }
  9760. int
  9761. _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
  9762. {
  9763. size_t len;
  9764. assert(_PyUnicode_CHECK(unicode));
  9765. assert(str);
  9766. #ifndef NDEBUG
  9767. for (const char *p = str; *p; p++) {
  9768. assert((unsigned char)*p < 128);
  9769. }
  9770. #endif
  9771. if (!PyUnicode_IS_ASCII(unicode))
  9772. return 0;
  9773. len = (size_t)PyUnicode_GET_LENGTH(unicode);
  9774. return strlen(str) == len &&
  9775. memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
  9776. }
  9777. int
  9778. _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
  9779. {
  9780. PyObject *right_uni;
  9781. assert(_PyUnicode_CHECK(left));
  9782. assert(right->string);
  9783. #ifndef NDEBUG
  9784. for (const char *p = right->string; *p; p++) {
  9785. assert((unsigned char)*p < 128);
  9786. }
  9787. #endif
  9788. if (!PyUnicode_IS_ASCII(left))
  9789. return 0;
  9790. right_uni = _PyUnicode_FromId(right); /* borrowed */
  9791. if (right_uni == NULL) {
  9792. /* memory error or bad data */
  9793. PyErr_Clear();
  9794. return _PyUnicode_EqualToASCIIString(left, right->string);
  9795. }
  9796. if (left == right_uni)
  9797. return 1;
  9798. assert(PyUnicode_CHECK_INTERNED(right_uni));
  9799. if (PyUnicode_CHECK_INTERNED(left)) {
  9800. return 0;
  9801. }
  9802. assert(_PyUnicode_HASH(right_uni) != -1);
  9803. Py_hash_t hash = _PyUnicode_HASH(left);
  9804. if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
  9805. return 0;
  9806. }
  9807. return unicode_compare_eq(left, right_uni);
  9808. }
  9809. PyObject *
  9810. PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
  9811. {
  9812. int result;
  9813. if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
  9814. Py_RETURN_NOTIMPLEMENTED;
  9815. if (left == right) {
  9816. switch (op) {
  9817. case Py_EQ:
  9818. case Py_LE:
  9819. case Py_GE:
  9820. /* a string is equal to itself */
  9821. Py_RETURN_TRUE;
  9822. case Py_NE:
  9823. case Py_LT:
  9824. case Py_GT:
  9825. Py_RETURN_FALSE;
  9826. default:
  9827. PyErr_BadArgument();
  9828. return NULL;
  9829. }
  9830. }
  9831. else if (op == Py_EQ || op == Py_NE) {
  9832. result = unicode_compare_eq(left, right);
  9833. result ^= (op == Py_NE);
  9834. return PyBool_FromLong(result);
  9835. }
  9836. else {
  9837. result = unicode_compare(left, right);
  9838. Py_RETURN_RICHCOMPARE(result, 0, op);
  9839. }
  9840. }
  9841. int
  9842. _PyUnicode_EQ(PyObject *aa, PyObject *bb)
  9843. {
  9844. return unicode_eq(aa, bb);
  9845. }
  9846. int
  9847. PyUnicode_Contains(PyObject *str, PyObject *substr)
  9848. {
  9849. int kind1, kind2;
  9850. const void *buf1, *buf2;
  9851. Py_ssize_t len1, len2;
  9852. int result;
  9853. if (!PyUnicode_Check(substr)) {
  9854. PyErr_Format(PyExc_TypeError,
  9855. "'in <string>' requires string as left operand, not %.100s",
  9856. Py_TYPE(substr)->tp_name);
  9857. return -1;
  9858. }
  9859. if (ensure_unicode(str) < 0)
  9860. return -1;
  9861. kind1 = PyUnicode_KIND(str);
  9862. kind2 = PyUnicode_KIND(substr);
  9863. if (kind1 < kind2)
  9864. return 0;
  9865. len1 = PyUnicode_GET_LENGTH(str);
  9866. len2 = PyUnicode_GET_LENGTH(substr);
  9867. if (len1 < len2)
  9868. return 0;
  9869. buf1 = PyUnicode_DATA(str);
  9870. buf2 = PyUnicode_DATA(substr);
  9871. if (len2 == 1) {
  9872. Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
  9873. result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
  9874. return result;
  9875. }
  9876. if (kind2 != kind1) {
  9877. buf2 = unicode_askind(kind2, buf2, len2, kind1);
  9878. if (!buf2)
  9879. return -1;
  9880. }
  9881. switch (kind1) {
  9882. case PyUnicode_1BYTE_KIND:
  9883. result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
  9884. break;
  9885. case PyUnicode_2BYTE_KIND:
  9886. result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
  9887. break;
  9888. case PyUnicode_4BYTE_KIND:
  9889. result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
  9890. break;
  9891. default:
  9892. Py_UNREACHABLE();
  9893. }
  9894. assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
  9895. if (kind2 != kind1)
  9896. PyMem_Free((void *)buf2);
  9897. return result;
  9898. }
  9899. /* Concat to string or Unicode object giving a new Unicode object. */
  9900. PyObject *
  9901. PyUnicode_Concat(PyObject *left, PyObject *right)
  9902. {
  9903. PyObject *result;
  9904. Py_UCS4 maxchar, maxchar2;
  9905. Py_ssize_t left_len, right_len, new_len;
  9906. if (ensure_unicode(left) < 0)
  9907. return NULL;
  9908. if (!PyUnicode_Check(right)) {
  9909. PyErr_Format(PyExc_TypeError,
  9910. "can only concatenate str (not \"%.200s\") to str",
  9911. Py_TYPE(right)->tp_name);
  9912. return NULL;
  9913. }
  9914. /* Shortcuts */
  9915. PyObject *empty = unicode_get_empty(); // Borrowed reference
  9916. if (left == empty) {
  9917. return PyUnicode_FromObject(right);
  9918. }
  9919. if (right == empty) {
  9920. return PyUnicode_FromObject(left);
  9921. }
  9922. left_len = PyUnicode_GET_LENGTH(left);
  9923. right_len = PyUnicode_GET_LENGTH(right);
  9924. if (left_len > PY_SSIZE_T_MAX - right_len) {
  9925. PyErr_SetString(PyExc_OverflowError,
  9926. "strings are too large to concat");
  9927. return NULL;
  9928. }
  9929. new_len = left_len + right_len;
  9930. maxchar = PyUnicode_MAX_CHAR_VALUE(left);
  9931. maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
  9932. maxchar = Py_MAX(maxchar, maxchar2);
  9933. /* Concat the two Unicode strings */
  9934. result = PyUnicode_New(new_len, maxchar);
  9935. if (result == NULL)
  9936. return NULL;
  9937. _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
  9938. _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
  9939. assert(_PyUnicode_CheckConsistency(result, 1));
  9940. return result;
  9941. }
  9942. void
  9943. PyUnicode_Append(PyObject **p_left, PyObject *right)
  9944. {
  9945. PyObject *left, *res;
  9946. Py_UCS4 maxchar, maxchar2;
  9947. Py_ssize_t left_len, right_len, new_len;
  9948. if (p_left == NULL) {
  9949. if (!PyErr_Occurred())
  9950. PyErr_BadInternalCall();
  9951. return;
  9952. }
  9953. left = *p_left;
  9954. if (right == NULL || left == NULL
  9955. || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
  9956. if (!PyErr_Occurred())
  9957. PyErr_BadInternalCall();
  9958. goto error;
  9959. }
  9960. /* Shortcuts */
  9961. PyObject *empty = unicode_get_empty(); // Borrowed reference
  9962. if (left == empty) {
  9963. Py_DECREF(left);
  9964. *p_left = Py_NewRef(right);
  9965. return;
  9966. }
  9967. if (right == empty) {
  9968. return;
  9969. }
  9970. left_len = PyUnicode_GET_LENGTH(left);
  9971. right_len = PyUnicode_GET_LENGTH(right);
  9972. if (left_len > PY_SSIZE_T_MAX - right_len) {
  9973. PyErr_SetString(PyExc_OverflowError,
  9974. "strings are too large to concat");
  9975. goto error;
  9976. }
  9977. new_len = left_len + right_len;
  9978. if (unicode_modifiable(left)
  9979. && PyUnicode_CheckExact(right)
  9980. && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
  9981. /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
  9982. to change the structure size, but characters are stored just after
  9983. the structure, and so it requires to move all characters which is
  9984. not so different than duplicating the string. */
  9985. && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
  9986. {
  9987. /* append inplace */
  9988. if (unicode_resize(p_left, new_len) != 0)
  9989. goto error;
  9990. /* copy 'right' into the newly allocated area of 'left' */
  9991. _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
  9992. }
  9993. else {
  9994. maxchar = PyUnicode_MAX_CHAR_VALUE(left);
  9995. maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
  9996. maxchar = Py_MAX(maxchar, maxchar2);
  9997. /* Concat the two Unicode strings */
  9998. res = PyUnicode_New(new_len, maxchar);
  9999. if (res == NULL)
  10000. goto error;
  10001. _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
  10002. _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
  10003. Py_DECREF(left);
  10004. *p_left = res;
  10005. }
  10006. assert(_PyUnicode_CheckConsistency(*p_left, 1));
  10007. return;
  10008. error:
  10009. Py_CLEAR(*p_left);
  10010. }
  10011. void
  10012. PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
  10013. {
  10014. PyUnicode_Append(pleft, right);
  10015. Py_XDECREF(right);
  10016. }
  10017. /*
  10018. Wraps asciilib_parse_args_finds() and additionally ensures that the
  10019. first argument is a unicode object.
  10020. */
  10021. static inline int
  10022. parse_args_finds_unicode(const char * function_name, PyObject *args,
  10023. PyObject **substring,
  10024. Py_ssize_t *start, Py_ssize_t *end)
  10025. {
  10026. if (asciilib_parse_args_finds(function_name, args, substring, start, end)) {
  10027. if (ensure_unicode(*substring) < 0)
  10028. return 0;
  10029. return 1;
  10030. }
  10031. return 0;
  10032. }
  10033. PyDoc_STRVAR(count__doc__,
  10034. "S.count(sub[, start[, end]]) -> int\n\
  10035. \n\
  10036. Return the number of non-overlapping occurrences of substring sub in\n\
  10037. string S[start:end]. Optional arguments start and end are\n\
  10038. interpreted as in slice notation.");
  10039. static PyObject *
  10040. unicode_count(PyObject *self, PyObject *args)
  10041. {
  10042. PyObject *substring = NULL; /* initialize to fix a compiler warning */
  10043. Py_ssize_t start = 0;
  10044. Py_ssize_t end = PY_SSIZE_T_MAX;
  10045. Py_ssize_t result;
  10046. if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
  10047. return NULL;
  10048. result = unicode_count_impl(self, substring, start, end);
  10049. if (result == -1)
  10050. return NULL;
  10051. return PyLong_FromSsize_t(result);
  10052. }
  10053. /*[clinic input]
  10054. str.encode as unicode_encode
  10055. encoding: str(c_default="NULL") = 'utf-8'
  10056. The encoding in which to encode the string.
  10057. errors: str(c_default="NULL") = 'strict'
  10058. The error handling scheme to use for encoding errors.
  10059. The default is 'strict' meaning that encoding errors raise a
  10060. UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
  10061. 'xmlcharrefreplace' as well as any other name registered with
  10062. codecs.register_error that can handle UnicodeEncodeErrors.
  10063. Encode the string using the codec registered for encoding.
  10064. [clinic start generated code]*/
  10065. static PyObject *
  10066. unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
  10067. /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
  10068. {
  10069. return PyUnicode_AsEncodedString(self, encoding, errors);
  10070. }
  10071. /*[clinic input]
  10072. str.expandtabs as unicode_expandtabs
  10073. tabsize: int = 8
  10074. Return a copy where all tab characters are expanded using spaces.
  10075. If tabsize is not given, a tab size of 8 characters is assumed.
  10076. [clinic start generated code]*/
  10077. static PyObject *
  10078. unicode_expandtabs_impl(PyObject *self, int tabsize)
  10079. /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
  10080. {
  10081. Py_ssize_t i, j, line_pos, src_len, incr;
  10082. Py_UCS4 ch;
  10083. PyObject *u;
  10084. const void *src_data;
  10085. void *dest_data;
  10086. int kind;
  10087. int found;
  10088. /* First pass: determine size of output string */
  10089. src_len = PyUnicode_GET_LENGTH(self);
  10090. i = j = line_pos = 0;
  10091. kind = PyUnicode_KIND(self);
  10092. src_data = PyUnicode_DATA(self);
  10093. found = 0;
  10094. for (; i < src_len; i++) {
  10095. ch = PyUnicode_READ(kind, src_data, i);
  10096. if (ch == '\t') {
  10097. found = 1;
  10098. if (tabsize > 0) {
  10099. incr = tabsize - (line_pos % tabsize); /* cannot overflow */
  10100. if (j > PY_SSIZE_T_MAX - incr)
  10101. goto overflow;
  10102. line_pos += incr;
  10103. j += incr;
  10104. }
  10105. }
  10106. else {
  10107. if (j > PY_SSIZE_T_MAX - 1)
  10108. goto overflow;
  10109. line_pos++;
  10110. j++;
  10111. if (ch == '\n' || ch == '\r')
  10112. line_pos = 0;
  10113. }
  10114. }
  10115. if (!found)
  10116. return unicode_result_unchanged(self);
  10117. /* Second pass: create output string and fill it */
  10118. u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
  10119. if (!u)
  10120. return NULL;
  10121. dest_data = PyUnicode_DATA(u);
  10122. i = j = line_pos = 0;
  10123. for (; i < src_len; i++) {
  10124. ch = PyUnicode_READ(kind, src_data, i);
  10125. if (ch == '\t') {
  10126. if (tabsize > 0) {
  10127. incr = tabsize - (line_pos % tabsize);
  10128. line_pos += incr;
  10129. unicode_fill(kind, dest_data, ' ', j, incr);
  10130. j += incr;
  10131. }
  10132. }
  10133. else {
  10134. line_pos++;
  10135. PyUnicode_WRITE(kind, dest_data, j, ch);
  10136. j++;
  10137. if (ch == '\n' || ch == '\r')
  10138. line_pos = 0;
  10139. }
  10140. }
  10141. assert (j == PyUnicode_GET_LENGTH(u));
  10142. return unicode_result(u);
  10143. overflow:
  10144. PyErr_SetString(PyExc_OverflowError, "new string is too long");
  10145. return NULL;
  10146. }
  10147. PyDoc_STRVAR(find__doc__,
  10148. "S.find(sub[, start[, end]]) -> int\n\
  10149. \n\
  10150. Return the lowest index in S where substring sub is found,\n\
  10151. such that sub is contained within S[start:end]. Optional\n\
  10152. arguments start and end are interpreted as in slice notation.\n\
  10153. \n\
  10154. Return -1 on failure.");
  10155. static PyObject *
  10156. unicode_find(PyObject *self, PyObject *args)
  10157. {
  10158. /* initialize variables to prevent gcc warning */
  10159. PyObject *substring = NULL;
  10160. Py_ssize_t start = 0;
  10161. Py_ssize_t end = 0;
  10162. Py_ssize_t result;
  10163. if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
  10164. return NULL;
  10165. result = any_find_slice(self, substring, start, end, 1);
  10166. if (result == -2)
  10167. return NULL;
  10168. return PyLong_FromSsize_t(result);
  10169. }
  10170. static PyObject *
  10171. unicode_getitem(PyObject *self, Py_ssize_t index)
  10172. {
  10173. const void *data;
  10174. int kind;
  10175. Py_UCS4 ch;
  10176. if (!PyUnicode_Check(self)) {
  10177. PyErr_BadArgument();
  10178. return NULL;
  10179. }
  10180. if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
  10181. PyErr_SetString(PyExc_IndexError, "string index out of range");
  10182. return NULL;
  10183. }
  10184. kind = PyUnicode_KIND(self);
  10185. data = PyUnicode_DATA(self);
  10186. ch = PyUnicode_READ(kind, data, index);
  10187. return unicode_char(ch);
  10188. }
  10189. /* Believe it or not, this produces the same value for ASCII strings
  10190. as bytes_hash(). */
  10191. static Py_hash_t
  10192. unicode_hash(PyObject *self)
  10193. {
  10194. Py_uhash_t x; /* Unsigned for defined overflow behavior. */
  10195. #ifdef Py_DEBUG
  10196. assert(_Py_HashSecret_Initialized);
  10197. #endif
  10198. if (_PyUnicode_HASH(self) != -1)
  10199. return _PyUnicode_HASH(self);
  10200. x = _Py_HashBytes(PyUnicode_DATA(self),
  10201. PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
  10202. _PyUnicode_HASH(self) = x;
  10203. return x;
  10204. }
  10205. PyDoc_STRVAR(index__doc__,
  10206. "S.index(sub[, start[, end]]) -> int\n\
  10207. \n\
  10208. Return the lowest index in S where substring sub is found,\n\
  10209. such that sub is contained within S[start:end]. Optional\n\
  10210. arguments start and end are interpreted as in slice notation.\n\
  10211. \n\
  10212. Raises ValueError when the substring is not found.");
  10213. static PyObject *
  10214. unicode_index(PyObject *self, PyObject *args)
  10215. {
  10216. /* initialize variables to prevent gcc warning */
  10217. Py_ssize_t result;
  10218. PyObject *substring = NULL;
  10219. Py_ssize_t start = 0;
  10220. Py_ssize_t end = 0;
  10221. if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
  10222. return NULL;
  10223. result = any_find_slice(self, substring, start, end, 1);
  10224. if (result == -2)
  10225. return NULL;
  10226. if (result < 0) {
  10227. PyErr_SetString(PyExc_ValueError, "substring not found");
  10228. return NULL;
  10229. }
  10230. return PyLong_FromSsize_t(result);
  10231. }
  10232. /*[clinic input]
  10233. str.isascii as unicode_isascii
  10234. Return True if all characters in the string are ASCII, False otherwise.
  10235. ASCII characters have code points in the range U+0000-U+007F.
  10236. Empty string is ASCII too.
  10237. [clinic start generated code]*/
  10238. static PyObject *
  10239. unicode_isascii_impl(PyObject *self)
  10240. /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
  10241. {
  10242. return PyBool_FromLong(PyUnicode_IS_ASCII(self));
  10243. }
  10244. /*[clinic input]
  10245. str.islower as unicode_islower
  10246. Return True if the string is a lowercase string, False otherwise.
  10247. A string is lowercase if all cased characters in the string are lowercase and
  10248. there is at least one cased character in the string.
  10249. [clinic start generated code]*/
  10250. static PyObject *
  10251. unicode_islower_impl(PyObject *self)
  10252. /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
  10253. {
  10254. Py_ssize_t i, length;
  10255. int kind;
  10256. const void *data;
  10257. int cased;
  10258. length = PyUnicode_GET_LENGTH(self);
  10259. kind = PyUnicode_KIND(self);
  10260. data = PyUnicode_DATA(self);
  10261. /* Shortcut for single character strings */
  10262. if (length == 1)
  10263. return PyBool_FromLong(
  10264. Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
  10265. /* Special case for empty strings */
  10266. if (length == 0)
  10267. Py_RETURN_FALSE;
  10268. cased = 0;
  10269. for (i = 0; i < length; i++) {
  10270. const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  10271. if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
  10272. Py_RETURN_FALSE;
  10273. else if (!cased && Py_UNICODE_ISLOWER(ch))
  10274. cased = 1;
  10275. }
  10276. return PyBool_FromLong(cased);
  10277. }
  10278. /*[clinic input]
  10279. str.isupper as unicode_isupper
  10280. Return True if the string is an uppercase string, False otherwise.
  10281. A string is uppercase if all cased characters in the string are uppercase and
  10282. there is at least one cased character in the string.
  10283. [clinic start generated code]*/
  10284. static PyObject *
  10285. unicode_isupper_impl(PyObject *self)
  10286. /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
  10287. {
  10288. Py_ssize_t i, length;
  10289. int kind;
  10290. const void *data;
  10291. int cased;
  10292. length = PyUnicode_GET_LENGTH(self);
  10293. kind = PyUnicode_KIND(self);
  10294. data = PyUnicode_DATA(self);
  10295. /* Shortcut for single character strings */
  10296. if (length == 1)
  10297. return PyBool_FromLong(
  10298. Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
  10299. /* Special case for empty strings */
  10300. if (length == 0)
  10301. Py_RETURN_FALSE;
  10302. cased = 0;
  10303. for (i = 0; i < length; i++) {
  10304. const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  10305. if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
  10306. Py_RETURN_FALSE;
  10307. else if (!cased && Py_UNICODE_ISUPPER(ch))
  10308. cased = 1;
  10309. }
  10310. return PyBool_FromLong(cased);
  10311. }
  10312. /*[clinic input]
  10313. str.istitle as unicode_istitle
  10314. Return True if the string is a title-cased string, False otherwise.
  10315. In a title-cased string, upper- and title-case characters may only
  10316. follow uncased characters and lowercase characters only cased ones.
  10317. [clinic start generated code]*/
  10318. static PyObject *
  10319. unicode_istitle_impl(PyObject *self)
  10320. /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
  10321. {
  10322. Py_ssize_t i, length;
  10323. int kind;
  10324. const void *data;
  10325. int cased, previous_is_cased;
  10326. length = PyUnicode_GET_LENGTH(self);
  10327. kind = PyUnicode_KIND(self);
  10328. data = PyUnicode_DATA(self);
  10329. /* Shortcut for single character strings */
  10330. if (length == 1) {
  10331. Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
  10332. return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
  10333. (Py_UNICODE_ISUPPER(ch) != 0));
  10334. }
  10335. /* Special case for empty strings */
  10336. if (length == 0)
  10337. Py_RETURN_FALSE;
  10338. cased = 0;
  10339. previous_is_cased = 0;
  10340. for (i = 0; i < length; i++) {
  10341. const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  10342. if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
  10343. if (previous_is_cased)
  10344. Py_RETURN_FALSE;
  10345. previous_is_cased = 1;
  10346. cased = 1;
  10347. }
  10348. else if (Py_UNICODE_ISLOWER(ch)) {
  10349. if (!previous_is_cased)
  10350. Py_RETURN_FALSE;
  10351. previous_is_cased = 1;
  10352. cased = 1;
  10353. }
  10354. else
  10355. previous_is_cased = 0;
  10356. }
  10357. return PyBool_FromLong(cased);
  10358. }
  10359. /*[clinic input]
  10360. str.isspace as unicode_isspace
  10361. Return True if the string is a whitespace string, False otherwise.
  10362. A string is whitespace if all characters in the string are whitespace and there
  10363. is at least one character in the string.
  10364. [clinic start generated code]*/
  10365. static PyObject *
  10366. unicode_isspace_impl(PyObject *self)
  10367. /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
  10368. {
  10369. Py_ssize_t i, length;
  10370. int kind;
  10371. const void *data;
  10372. length = PyUnicode_GET_LENGTH(self);
  10373. kind = PyUnicode_KIND(self);
  10374. data = PyUnicode_DATA(self);
  10375. /* Shortcut for single character strings */
  10376. if (length == 1)
  10377. return PyBool_FromLong(
  10378. Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
  10379. /* Special case for empty strings */
  10380. if (length == 0)
  10381. Py_RETURN_FALSE;
  10382. for (i = 0; i < length; i++) {
  10383. const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  10384. if (!Py_UNICODE_ISSPACE(ch))
  10385. Py_RETURN_FALSE;
  10386. }
  10387. Py_RETURN_TRUE;
  10388. }
  10389. /*[clinic input]
  10390. str.isalpha as unicode_isalpha
  10391. Return True if the string is an alphabetic string, False otherwise.
  10392. A string is alphabetic if all characters in the string are alphabetic and there
  10393. is at least one character in the string.
  10394. [clinic start generated code]*/
  10395. static PyObject *
  10396. unicode_isalpha_impl(PyObject *self)
  10397. /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
  10398. {
  10399. Py_ssize_t i, length;
  10400. int kind;
  10401. const void *data;
  10402. length = PyUnicode_GET_LENGTH(self);
  10403. kind = PyUnicode_KIND(self);
  10404. data = PyUnicode_DATA(self);
  10405. /* Shortcut for single character strings */
  10406. if (length == 1)
  10407. return PyBool_FromLong(
  10408. Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
  10409. /* Special case for empty strings */
  10410. if (length == 0)
  10411. Py_RETURN_FALSE;
  10412. for (i = 0; i < length; i++) {
  10413. if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
  10414. Py_RETURN_FALSE;
  10415. }
  10416. Py_RETURN_TRUE;
  10417. }
  10418. /*[clinic input]
  10419. str.isalnum as unicode_isalnum
  10420. Return True if the string is an alpha-numeric string, False otherwise.
  10421. A string is alpha-numeric if all characters in the string are alpha-numeric and
  10422. there is at least one character in the string.
  10423. [clinic start generated code]*/
  10424. static PyObject *
  10425. unicode_isalnum_impl(PyObject *self)
  10426. /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
  10427. {
  10428. int kind;
  10429. const void *data;
  10430. Py_ssize_t len, i;
  10431. kind = PyUnicode_KIND(self);
  10432. data = PyUnicode_DATA(self);
  10433. len = PyUnicode_GET_LENGTH(self);
  10434. /* Shortcut for single character strings */
  10435. if (len == 1) {
  10436. const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
  10437. return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
  10438. }
  10439. /* Special case for empty strings */
  10440. if (len == 0)
  10441. Py_RETURN_FALSE;
  10442. for (i = 0; i < len; i++) {
  10443. const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  10444. if (!Py_UNICODE_ISALNUM(ch))
  10445. Py_RETURN_FALSE;
  10446. }
  10447. Py_RETURN_TRUE;
  10448. }
  10449. /*[clinic input]
  10450. str.isdecimal as unicode_isdecimal
  10451. Return True if the string is a decimal string, False otherwise.
  10452. A string is a decimal string if all characters in the string are decimal and
  10453. there is at least one character in the string.
  10454. [clinic start generated code]*/
  10455. static PyObject *
  10456. unicode_isdecimal_impl(PyObject *self)
  10457. /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
  10458. {
  10459. Py_ssize_t i, length;
  10460. int kind;
  10461. const void *data;
  10462. length = PyUnicode_GET_LENGTH(self);
  10463. kind = PyUnicode_KIND(self);
  10464. data = PyUnicode_DATA(self);
  10465. /* Shortcut for single character strings */
  10466. if (length == 1)
  10467. return PyBool_FromLong(
  10468. Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
  10469. /* Special case for empty strings */
  10470. if (length == 0)
  10471. Py_RETURN_FALSE;
  10472. for (i = 0; i < length; i++) {
  10473. if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
  10474. Py_RETURN_FALSE;
  10475. }
  10476. Py_RETURN_TRUE;
  10477. }
  10478. /*[clinic input]
  10479. str.isdigit as unicode_isdigit
  10480. Return True if the string is a digit string, False otherwise.
  10481. A string is a digit string if all characters in the string are digits and there
  10482. is at least one character in the string.
  10483. [clinic start generated code]*/
  10484. static PyObject *
  10485. unicode_isdigit_impl(PyObject *self)
  10486. /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
  10487. {
  10488. Py_ssize_t i, length;
  10489. int kind;
  10490. const void *data;
  10491. length = PyUnicode_GET_LENGTH(self);
  10492. kind = PyUnicode_KIND(self);
  10493. data = PyUnicode_DATA(self);
  10494. /* Shortcut for single character strings */
  10495. if (length == 1) {
  10496. const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
  10497. return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
  10498. }
  10499. /* Special case for empty strings */
  10500. if (length == 0)
  10501. Py_RETURN_FALSE;
  10502. for (i = 0; i < length; i++) {
  10503. if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
  10504. Py_RETURN_FALSE;
  10505. }
  10506. Py_RETURN_TRUE;
  10507. }
  10508. /*[clinic input]
  10509. str.isnumeric as unicode_isnumeric
  10510. Return True if the string is a numeric string, False otherwise.
  10511. A string is numeric if all characters in the string are numeric and there is at
  10512. least one character in the string.
  10513. [clinic start generated code]*/
  10514. static PyObject *
  10515. unicode_isnumeric_impl(PyObject *self)
  10516. /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
  10517. {
  10518. Py_ssize_t i, length;
  10519. int kind;
  10520. const void *data;
  10521. length = PyUnicode_GET_LENGTH(self);
  10522. kind = PyUnicode_KIND(self);
  10523. data = PyUnicode_DATA(self);
  10524. /* Shortcut for single character strings */
  10525. if (length == 1)
  10526. return PyBool_FromLong(
  10527. Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
  10528. /* Special case for empty strings */
  10529. if (length == 0)
  10530. Py_RETURN_FALSE;
  10531. for (i = 0; i < length; i++) {
  10532. if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
  10533. Py_RETURN_FALSE;
  10534. }
  10535. Py_RETURN_TRUE;
  10536. }
  10537. Py_ssize_t
  10538. _PyUnicode_ScanIdentifier(PyObject *self)
  10539. {
  10540. Py_ssize_t i;
  10541. Py_ssize_t len = PyUnicode_GET_LENGTH(self);
  10542. if (len == 0) {
  10543. /* an empty string is not a valid identifier */
  10544. return 0;
  10545. }
  10546. int kind = PyUnicode_KIND(self);
  10547. const void *data = PyUnicode_DATA(self);
  10548. Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
  10549. /* PEP 3131 says that the first character must be in
  10550. XID_Start and subsequent characters in XID_Continue,
  10551. and for the ASCII range, the 2.x rules apply (i.e
  10552. start with letters and underscore, continue with
  10553. letters, digits, underscore). However, given the current
  10554. definition of XID_Start and XID_Continue, it is sufficient
  10555. to check just for these, except that _ must be allowed
  10556. as starting an identifier. */
  10557. if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
  10558. return 0;
  10559. }
  10560. for (i = 1; i < len; i++) {
  10561. ch = PyUnicode_READ(kind, data, i);
  10562. if (!_PyUnicode_IsXidContinue(ch)) {
  10563. return i;
  10564. }
  10565. }
  10566. return i;
  10567. }
  10568. int
  10569. PyUnicode_IsIdentifier(PyObject *self)
  10570. {
  10571. Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
  10572. Py_ssize_t len = PyUnicode_GET_LENGTH(self);
  10573. /* an empty string is not a valid identifier */
  10574. return len && i == len;
  10575. }
  10576. /*[clinic input]
  10577. str.isidentifier as unicode_isidentifier
  10578. Return True if the string is a valid Python identifier, False otherwise.
  10579. Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
  10580. such as "def" or "class".
  10581. [clinic start generated code]*/
  10582. static PyObject *
  10583. unicode_isidentifier_impl(PyObject *self)
  10584. /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
  10585. {
  10586. return PyBool_FromLong(PyUnicode_IsIdentifier(self));
  10587. }
  10588. /*[clinic input]
  10589. str.isprintable as unicode_isprintable
  10590. Return True if the string is printable, False otherwise.
  10591. A string is printable if all of its characters are considered printable in
  10592. repr() or if it is empty.
  10593. [clinic start generated code]*/
  10594. static PyObject *
  10595. unicode_isprintable_impl(PyObject *self)
  10596. /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
  10597. {
  10598. Py_ssize_t i, length;
  10599. int kind;
  10600. const void *data;
  10601. length = PyUnicode_GET_LENGTH(self);
  10602. kind = PyUnicode_KIND(self);
  10603. data = PyUnicode_DATA(self);
  10604. /* Shortcut for single character strings */
  10605. if (length == 1)
  10606. return PyBool_FromLong(
  10607. Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
  10608. for (i = 0; i < length; i++) {
  10609. if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
  10610. Py_RETURN_FALSE;
  10611. }
  10612. }
  10613. Py_RETURN_TRUE;
  10614. }
  10615. /*[clinic input]
  10616. str.join as unicode_join
  10617. iterable: object
  10618. /
  10619. Concatenate any number of strings.
  10620. The string whose method is called is inserted in between each given string.
  10621. The result is returned as a new string.
  10622. Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
  10623. [clinic start generated code]*/
  10624. static PyObject *
  10625. unicode_join(PyObject *self, PyObject *iterable)
  10626. /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
  10627. {
  10628. return PyUnicode_Join(self, iterable);
  10629. }
  10630. static Py_ssize_t
  10631. unicode_length(PyObject *self)
  10632. {
  10633. return PyUnicode_GET_LENGTH(self);
  10634. }
  10635. /*[clinic input]
  10636. str.ljust as unicode_ljust
  10637. width: Py_ssize_t
  10638. fillchar: Py_UCS4 = ' '
  10639. /
  10640. Return a left-justified string of length width.
  10641. Padding is done using the specified fill character (default is a space).
  10642. [clinic start generated code]*/
  10643. static PyObject *
  10644. unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
  10645. /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
  10646. {
  10647. if (PyUnicode_GET_LENGTH(self) >= width)
  10648. return unicode_result_unchanged(self);
  10649. return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
  10650. }
  10651. /*[clinic input]
  10652. str.lower as unicode_lower
  10653. Return a copy of the string converted to lowercase.
  10654. [clinic start generated code]*/
  10655. static PyObject *
  10656. unicode_lower_impl(PyObject *self)
  10657. /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
  10658. {
  10659. if (PyUnicode_IS_ASCII(self))
  10660. return ascii_upper_or_lower(self, 1);
  10661. return case_operation(self, do_lower);
  10662. }
  /* Selector values for the strip helpers.  They double as indices into
     stripfuncnames[], so the two must stay in the same order. */
  #define LEFTSTRIP 0
  #define RIGHTSTRIP 1
  #define BOTHSTRIP 2

  /* Method names indexed by the selectors above; STRIPNAME() is used when
     formatting error messages.  Both the strings and the table itself are
     read-only. */
  static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

  #define STRIPNAME(i) (stripfuncnames[i])
  10669. /* externally visible for str.strip(unicode) */
  10670. PyObject *
  10671. _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
  10672. {
  10673. const void *data;
  10674. int kind;
  10675. Py_ssize_t i, j, len;
  10676. BLOOM_MASK sepmask;
  10677. Py_ssize_t seplen;
  10678. kind = PyUnicode_KIND(self);
  10679. data = PyUnicode_DATA(self);
  10680. len = PyUnicode_GET_LENGTH(self);
  10681. seplen = PyUnicode_GET_LENGTH(sepobj);
  10682. sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
  10683. PyUnicode_DATA(sepobj),
  10684. seplen);
  10685. i = 0;
  10686. if (striptype != RIGHTSTRIP) {
  10687. while (i < len) {
  10688. Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  10689. if (!BLOOM(sepmask, ch))
  10690. break;
  10691. if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
  10692. break;
  10693. i++;
  10694. }
  10695. }
  10696. j = len;
  10697. if (striptype != LEFTSTRIP) {
  10698. j--;
  10699. while (j >= i) {
  10700. Py_UCS4 ch = PyUnicode_READ(kind, data, j);
  10701. if (!BLOOM(sepmask, ch))
  10702. break;
  10703. if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
  10704. break;
  10705. j--;
  10706. }
  10707. j++;
  10708. }
  10709. return PyUnicode_Substring(self, i, j);
  10710. }
  10711. PyObject*
  10712. PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
  10713. {
  10714. const unsigned char *data;
  10715. int kind;
  10716. Py_ssize_t length;
  10717. length = PyUnicode_GET_LENGTH(self);
  10718. end = Py_MIN(end, length);
  10719. if (start == 0 && end == length)
  10720. return unicode_result_unchanged(self);
  10721. if (start < 0 || end < 0) {
  10722. PyErr_SetString(PyExc_IndexError, "string index out of range");
  10723. return NULL;
  10724. }
  10725. if (start >= length || end < start)
  10726. _Py_RETURN_UNICODE_EMPTY();
  10727. length = end - start;
  10728. if (PyUnicode_IS_ASCII(self)) {
  10729. data = PyUnicode_1BYTE_DATA(self);
  10730. return _PyUnicode_FromASCII((const char*)(data + start), length);
  10731. }
  10732. else {
  10733. kind = PyUnicode_KIND(self);
  10734. data = PyUnicode_1BYTE_DATA(self);
  10735. return PyUnicode_FromKindAndData(kind,
  10736. data + kind * start,
  10737. length);
  10738. }
  10739. }
  10740. static PyObject *
  10741. do_strip(PyObject *self, int striptype)
  10742. {
  10743. Py_ssize_t len, i, j;
  10744. len = PyUnicode_GET_LENGTH(self);
  10745. if (PyUnicode_IS_ASCII(self)) {
  10746. const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
  10747. i = 0;
  10748. if (striptype != RIGHTSTRIP) {
  10749. while (i < len) {
  10750. Py_UCS1 ch = data[i];
  10751. if (!_Py_ascii_whitespace[ch])
  10752. break;
  10753. i++;
  10754. }
  10755. }
  10756. j = len;
  10757. if (striptype != LEFTSTRIP) {
  10758. j--;
  10759. while (j >= i) {
  10760. Py_UCS1 ch = data[j];
  10761. if (!_Py_ascii_whitespace[ch])
  10762. break;
  10763. j--;
  10764. }
  10765. j++;
  10766. }
  10767. }
  10768. else {
  10769. int kind = PyUnicode_KIND(self);
  10770. const void *data = PyUnicode_DATA(self);
  10771. i = 0;
  10772. if (striptype != RIGHTSTRIP) {
  10773. while (i < len) {
  10774. Py_UCS4 ch = PyUnicode_READ(kind, data, i);
  10775. if (!Py_UNICODE_ISSPACE(ch))
  10776. break;
  10777. i++;
  10778. }
  10779. }
  10780. j = len;
  10781. if (striptype != LEFTSTRIP) {
  10782. j--;
  10783. while (j >= i) {
  10784. Py_UCS4 ch = PyUnicode_READ(kind, data, j);
  10785. if (!Py_UNICODE_ISSPACE(ch))
  10786. break;
  10787. j--;
  10788. }
  10789. j++;
  10790. }
  10791. }
  10792. return PyUnicode_Substring(self, i, j);
  10793. }
  10794. static PyObject *
  10795. do_argstrip(PyObject *self, int striptype, PyObject *sep)
  10796. {
  10797. if (sep != Py_None) {
  10798. if (PyUnicode_Check(sep))
  10799. return _PyUnicode_XStrip(self, striptype, sep);
  10800. else {
  10801. PyErr_Format(PyExc_TypeError,
  10802. "%s arg must be None or str",
  10803. STRIPNAME(striptype));
  10804. return NULL;
  10805. }
  10806. }
  10807. return do_strip(self, striptype);
  10808. }
  10809. /*[clinic input]
  10810. str.strip as unicode_strip
  10811. chars: object = None
  10812. /
  10813. Return a copy of the string with leading and trailing whitespace removed.
  10814. If chars is given and not None, remove characters in chars instead.
  10815. [clinic start generated code]*/
  10816. static PyObject *
  10817. unicode_strip_impl(PyObject *self, PyObject *chars)
  10818. /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
  10819. {
  10820. return do_argstrip(self, BOTHSTRIP, chars);
  10821. }
  10822. /*[clinic input]
  10823. str.lstrip as unicode_lstrip
  10824. chars: object = None
  10825. /
  10826. Return a copy of the string with leading whitespace removed.
  10827. If chars is given and not None, remove characters in chars instead.
  10828. [clinic start generated code]*/
  10829. static PyObject *
  10830. unicode_lstrip_impl(PyObject *self, PyObject *chars)
  10831. /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
  10832. {
  10833. return do_argstrip(self, LEFTSTRIP, chars);
  10834. }
  10835. /*[clinic input]
  10836. str.rstrip as unicode_rstrip
  10837. chars: object = None
  10838. /
  10839. Return a copy of the string with trailing whitespace removed.
  10840. If chars is given and not None, remove characters in chars instead.
  10841. [clinic start generated code]*/
  10842. static PyObject *
  10843. unicode_rstrip_impl(PyObject *self, PyObject *chars)
  10844. /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
  10845. {
  10846. return do_argstrip(self, RIGHTSTRIP, chars);
  10847. }
  10848. static PyObject*
  10849. unicode_repeat(PyObject *str, Py_ssize_t len)
  10850. {
  10851. PyObject *u;
  10852. Py_ssize_t nchars, n;
  10853. if (len < 1)
  10854. _Py_RETURN_UNICODE_EMPTY();
  10855. /* no repeat, return original string */
  10856. if (len == 1)
  10857. return unicode_result_unchanged(str);
  10858. if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
  10859. PyErr_SetString(PyExc_OverflowError,
  10860. "repeated string is too long");
  10861. return NULL;
  10862. }
  10863. nchars = len * PyUnicode_GET_LENGTH(str);
  10864. u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
  10865. if (!u)
  10866. return NULL;
  10867. assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
  10868. if (PyUnicode_GET_LENGTH(str) == 1) {
  10869. int kind = PyUnicode_KIND(str);
  10870. Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
  10871. if (kind == PyUnicode_1BYTE_KIND) {
  10872. void *to = PyUnicode_DATA(u);
  10873. memset(to, (unsigned char)fill_char, len);
  10874. }
  10875. else if (kind == PyUnicode_2BYTE_KIND) {
  10876. Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
  10877. for (n = 0; n < len; ++n)
  10878. ucs2[n] = fill_char;
  10879. } else {
  10880. Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
  10881. assert(kind == PyUnicode_4BYTE_KIND);
  10882. for (n = 0; n < len; ++n)
  10883. ucs4[n] = fill_char;
  10884. }
  10885. }
  10886. else {
  10887. Py_ssize_t char_size = PyUnicode_KIND(str);
  10888. char *to = (char *) PyUnicode_DATA(u);
  10889. _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
  10890. PyUnicode_GET_LENGTH(str) * char_size);
  10891. }
  10892. assert(_PyUnicode_CheckConsistency(u, 1));
  10893. return u;
  10894. }
  10895. PyObject *
  10896. PyUnicode_Replace(PyObject *str,
  10897. PyObject *substr,
  10898. PyObject *replstr,
  10899. Py_ssize_t maxcount)
  10900. {
  10901. if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
  10902. ensure_unicode(replstr) < 0)
  10903. return NULL;
  10904. return replace(str, substr, replstr, maxcount);
  10905. }
  10906. /*[clinic input]
  10907. str.replace as unicode_replace
  10908. old: unicode
  10909. new: unicode
  10910. count: Py_ssize_t = -1
  10911. Maximum number of occurrences to replace.
  10912. -1 (the default value) means replace all occurrences.
  10913. /
  10914. Return a copy with all occurrences of substring old replaced by new.
  10915. If the optional argument count is given, only the first count occurrences are
  10916. replaced.
  10917. [clinic start generated code]*/
  10918. static PyObject *
  10919. unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
  10920. Py_ssize_t count)
  10921. /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
  10922. {
  10923. return replace(self, old, new, count);
  10924. }
  10925. /*[clinic input]
  10926. str.removeprefix as unicode_removeprefix
  10927. prefix: unicode
  10928. /
  10929. Return a str with the given prefix string removed if present.
  10930. If the string starts with the prefix string, return string[len(prefix):].
  10931. Otherwise, return a copy of the original string.
  10932. [clinic start generated code]*/
  10933. static PyObject *
  10934. unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
  10935. /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
  10936. {
  10937. int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
  10938. if (match == -1) {
  10939. return NULL;
  10940. }
  10941. if (match) {
  10942. return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
  10943. PyUnicode_GET_LENGTH(self));
  10944. }
  10945. return unicode_result_unchanged(self);
  10946. }
  10947. /*[clinic input]
  10948. str.removesuffix as unicode_removesuffix
  10949. suffix: unicode
  10950. /
  10951. Return a str with the given suffix string removed if present.
  10952. If the string ends with the suffix string and that suffix is not empty,
  10953. return string[:-len(suffix)]. Otherwise, return a copy of the original
  10954. string.
  10955. [clinic start generated code]*/
  10956. static PyObject *
  10957. unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
  10958. /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
  10959. {
  10960. int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
  10961. if (match == -1) {
  10962. return NULL;
  10963. }
  10964. if (match) {
  10965. return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
  10966. - PyUnicode_GET_LENGTH(suffix));
  10967. }
  10968. return unicode_result_unchanged(self);
  10969. }
  10970. static PyObject *
  10971. unicode_repr(PyObject *unicode)
  10972. {
  10973. PyObject *repr;
  10974. Py_ssize_t isize;
  10975. Py_ssize_t osize, squote, dquote, i, o;
  10976. Py_UCS4 max, quote;
  10977. int ikind, okind, unchanged;
  10978. const void *idata;
  10979. void *odata;
  10980. isize = PyUnicode_GET_LENGTH(unicode);
  10981. idata = PyUnicode_DATA(unicode);
  10982. /* Compute length of output, quote characters, and
  10983. maximum character */
  10984. osize = 0;
  10985. max = 127;
  10986. squote = dquote = 0;
  10987. ikind = PyUnicode_KIND(unicode);
  10988. for (i = 0; i < isize; i++) {
  10989. Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
  10990. Py_ssize_t incr = 1;
  10991. switch (ch) {
  10992. case '\'': squote++; break;
  10993. case '"': dquote++; break;
  10994. case '\\': case '\t': case '\r': case '\n':
  10995. incr = 2;
  10996. break;
  10997. default:
  10998. /* Fast-path ASCII */
  10999. if (ch < ' ' || ch == 0x7f)
  11000. incr = 4; /* \xHH */
  11001. else if (ch < 0x7f)
  11002. ;
  11003. else if (Py_UNICODE_ISPRINTABLE(ch))
  11004. max = ch > max ? ch : max;
  11005. else if (ch < 0x100)
  11006. incr = 4; /* \xHH */
  11007. else if (ch < 0x10000)
  11008. incr = 6; /* \uHHHH */
  11009. else
  11010. incr = 10; /* \uHHHHHHHH */
  11011. }
  11012. if (osize > PY_SSIZE_T_MAX - incr) {
  11013. PyErr_SetString(PyExc_OverflowError,
  11014. "string is too long to generate repr");
  11015. return NULL;
  11016. }
  11017. osize += incr;
  11018. }
  11019. quote = '\'';
  11020. unchanged = (osize == isize);
  11021. if (squote) {
  11022. unchanged = 0;
  11023. if (dquote)
  11024. /* Both squote and dquote present. Use squote,
  11025. and escape them */
  11026. osize += squote;
  11027. else
  11028. quote = '"';
  11029. }
  11030. osize += 2; /* quotes */
  11031. repr = PyUnicode_New(osize, max);
  11032. if (repr == NULL)
  11033. return NULL;
  11034. okind = PyUnicode_KIND(repr);
  11035. odata = PyUnicode_DATA(repr);
  11036. PyUnicode_WRITE(okind, odata, 0, quote);
  11037. PyUnicode_WRITE(okind, odata, osize-1, quote);
  11038. if (unchanged) {
  11039. _PyUnicode_FastCopyCharacters(repr, 1,
  11040. unicode, 0,
  11041. isize);
  11042. }
  11043. else {
  11044. for (i = 0, o = 1; i < isize; i++) {
  11045. Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
  11046. /* Escape quotes and backslashes */
  11047. if ((ch == quote) || (ch == '\\')) {
  11048. PyUnicode_WRITE(okind, odata, o++, '\\');
  11049. PyUnicode_WRITE(okind, odata, o++, ch);
  11050. continue;
  11051. }
  11052. /* Map special whitespace to '\t', \n', '\r' */
  11053. if (ch == '\t') {
  11054. PyUnicode_WRITE(okind, odata, o++, '\\');
  11055. PyUnicode_WRITE(okind, odata, o++, 't');
  11056. }
  11057. else if (ch == '\n') {
  11058. PyUnicode_WRITE(okind, odata, o++, '\\');
  11059. PyUnicode_WRITE(okind, odata, o++, 'n');
  11060. }
  11061. else if (ch == '\r') {
  11062. PyUnicode_WRITE(okind, odata, o++, '\\');
  11063. PyUnicode_WRITE(okind, odata, o++, 'r');
  11064. }
  11065. /* Map non-printable US ASCII to '\xhh' */
  11066. else if (ch < ' ' || ch == 0x7F) {
  11067. PyUnicode_WRITE(okind, odata, o++, '\\');
  11068. PyUnicode_WRITE(okind, odata, o++, 'x');
  11069. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
  11070. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
  11071. }
  11072. /* Copy ASCII characters as-is */
  11073. else if (ch < 0x7F) {
  11074. PyUnicode_WRITE(okind, odata, o++, ch);
  11075. }
  11076. /* Non-ASCII characters */
  11077. else {
  11078. /* Map Unicode whitespace and control characters
  11079. (categories Z* and C* except ASCII space)
  11080. */
  11081. if (!Py_UNICODE_ISPRINTABLE(ch)) {
  11082. PyUnicode_WRITE(okind, odata, o++, '\\');
  11083. /* Map 8-bit characters to '\xhh' */
  11084. if (ch <= 0xff) {
  11085. PyUnicode_WRITE(okind, odata, o++, 'x');
  11086. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
  11087. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
  11088. }
  11089. /* Map 16-bit characters to '\uxxxx' */
  11090. else if (ch <= 0xffff) {
  11091. PyUnicode_WRITE(okind, odata, o++, 'u');
  11092. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
  11093. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
  11094. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
  11095. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
  11096. }
  11097. /* Map 21-bit characters to '\U00xxxxxx' */
  11098. else {
  11099. PyUnicode_WRITE(okind, odata, o++, 'U');
  11100. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
  11101. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
  11102. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
  11103. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
  11104. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
  11105. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
  11106. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
  11107. PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
  11108. }
  11109. }
  11110. /* Copy characters as-is */
  11111. else {
  11112. PyUnicode_WRITE(okind, odata, o++, ch);
  11113. }
  11114. }
  11115. }
  11116. }
  11117. /* Closing quote already added at the beginning */
  11118. assert(_PyUnicode_CheckConsistency(repr, 1));
  11119. return repr;
  11120. }
  11121. PyDoc_STRVAR(rfind__doc__,
  11122. "S.rfind(sub[, start[, end]]) -> int\n\
  11123. \n\
  11124. Return the highest index in S where substring sub is found,\n\
  11125. such that sub is contained within S[start:end]. Optional\n\
  11126. arguments start and end are interpreted as in slice notation.\n\
  11127. \n\
  11128. Return -1 on failure.");
  11129. static PyObject *
  11130. unicode_rfind(PyObject *self, PyObject *args)
  11131. {
  11132. /* initialize variables to prevent gcc warning */
  11133. PyObject *substring = NULL;
  11134. Py_ssize_t start = 0;
  11135. Py_ssize_t end = 0;
  11136. Py_ssize_t result;
  11137. if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
  11138. return NULL;
  11139. result = any_find_slice(self, substring, start, end, -1);
  11140. if (result == -2)
  11141. return NULL;
  11142. return PyLong_FromSsize_t(result);
  11143. }
  11144. PyDoc_STRVAR(rindex__doc__,
  11145. "S.rindex(sub[, start[, end]]) -> int\n\
  11146. \n\
  11147. Return the highest index in S where substring sub is found,\n\
  11148. such that sub is contained within S[start:end]. Optional\n\
  11149. arguments start and end are interpreted as in slice notation.\n\
  11150. \n\
  11151. Raises ValueError when the substring is not found.");
  11152. static PyObject *
  11153. unicode_rindex(PyObject *self, PyObject *args)
  11154. {
  11155. /* initialize variables to prevent gcc warning */
  11156. PyObject *substring = NULL;
  11157. Py_ssize_t start = 0;
  11158. Py_ssize_t end = 0;
  11159. Py_ssize_t result;
  11160. if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
  11161. return NULL;
  11162. result = any_find_slice(self, substring, start, end, -1);
  11163. if (result == -2)
  11164. return NULL;
  11165. if (result < 0) {
  11166. PyErr_SetString(PyExc_ValueError, "substring not found");
  11167. return NULL;
  11168. }
  11169. return PyLong_FromSsize_t(result);
  11170. }
  11171. /*[clinic input]
  11172. str.rjust as unicode_rjust
  11173. width: Py_ssize_t
  11174. fillchar: Py_UCS4 = ' '
  11175. /
  11176. Return a right-justified string of length width.
  11177. Padding is done using the specified fill character (default is a space).
  11178. [clinic start generated code]*/
  11179. static PyObject *
  11180. unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
  11181. /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
  11182. {
  11183. if (PyUnicode_GET_LENGTH(self) >= width)
  11184. return unicode_result_unchanged(self);
  11185. return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
  11186. }
  11187. PyObject *
  11188. PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
  11189. {
  11190. if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
  11191. return NULL;
  11192. return split(s, sep, maxsplit);
  11193. }
  11194. /*[clinic input]
  11195. str.split as unicode_split
  11196. sep: object = None
  11197. The separator used to split the string.
  11198. When set to None (the default value), will split on any whitespace
  11199. character (including \n \r \t \f and spaces) and will discard
  11200. empty strings from the result.
  11201. maxsplit: Py_ssize_t = -1
  11202. Maximum number of splits.
  11203. -1 (the default value) means no limit.
  11204. Return a list of the substrings in the string, using sep as the separator string.
  11205. Splitting starts at the front of the string and works to the end.
  11206. Note, str.split() is mainly useful for data that has been intentionally
  11207. delimited. With natural text that includes punctuation, consider using
  11208. the regular expression module.
  11209. [clinic start generated code]*/
  11210. static PyObject *
  11211. unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
  11212. /*[clinic end generated code: output=3a65b1db356948dc input=a29bcc0c7a5af0eb]*/
  11213. {
  11214. if (sep == Py_None)
  11215. return split(self, NULL, maxsplit);
  11216. if (PyUnicode_Check(sep))
  11217. return split(self, sep, maxsplit);
  11218. PyErr_Format(PyExc_TypeError,
  11219. "must be str or None, not %.100s",
  11220. Py_TYPE(sep)->tp_name);
  11221. return NULL;
  11222. }
  11223. PyObject *
  11224. PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
  11225. {
  11226. PyObject* out;
  11227. int kind1, kind2;
  11228. const void *buf1, *buf2;
  11229. Py_ssize_t len1, len2;
  11230. if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
  11231. return NULL;
  11232. kind1 = PyUnicode_KIND(str_obj);
  11233. kind2 = PyUnicode_KIND(sep_obj);
  11234. len1 = PyUnicode_GET_LENGTH(str_obj);
  11235. len2 = PyUnicode_GET_LENGTH(sep_obj);
  11236. if (kind1 < kind2 || len1 < len2) {
  11237. PyObject *empty = unicode_get_empty(); // Borrowed reference
  11238. return PyTuple_Pack(3, str_obj, empty, empty);
  11239. }
  11240. buf1 = PyUnicode_DATA(str_obj);
  11241. buf2 = PyUnicode_DATA(sep_obj);
  11242. if (kind2 != kind1) {
  11243. buf2 = unicode_askind(kind2, buf2, len2, kind1);
  11244. if (!buf2)
  11245. return NULL;
  11246. }
  11247. switch (kind1) {
  11248. case PyUnicode_1BYTE_KIND:
  11249. if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
  11250. out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11251. else
  11252. out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11253. break;
  11254. case PyUnicode_2BYTE_KIND:
  11255. out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11256. break;
  11257. case PyUnicode_4BYTE_KIND:
  11258. out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11259. break;
  11260. default:
  11261. Py_UNREACHABLE();
  11262. }
  11263. assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
  11264. if (kind2 != kind1)
  11265. PyMem_Free((void *)buf2);
  11266. return out;
  11267. }
  11268. PyObject *
  11269. PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
  11270. {
  11271. PyObject* out;
  11272. int kind1, kind2;
  11273. const void *buf1, *buf2;
  11274. Py_ssize_t len1, len2;
  11275. if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
  11276. return NULL;
  11277. kind1 = PyUnicode_KIND(str_obj);
  11278. kind2 = PyUnicode_KIND(sep_obj);
  11279. len1 = PyUnicode_GET_LENGTH(str_obj);
  11280. len2 = PyUnicode_GET_LENGTH(sep_obj);
  11281. if (kind1 < kind2 || len1 < len2) {
  11282. PyObject *empty = unicode_get_empty(); // Borrowed reference
  11283. return PyTuple_Pack(3, empty, empty, str_obj);
  11284. }
  11285. buf1 = PyUnicode_DATA(str_obj);
  11286. buf2 = PyUnicode_DATA(sep_obj);
  11287. if (kind2 != kind1) {
  11288. buf2 = unicode_askind(kind2, buf2, len2, kind1);
  11289. if (!buf2)
  11290. return NULL;
  11291. }
  11292. switch (kind1) {
  11293. case PyUnicode_1BYTE_KIND:
  11294. if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
  11295. out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11296. else
  11297. out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11298. break;
  11299. case PyUnicode_2BYTE_KIND:
  11300. out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11301. break;
  11302. case PyUnicode_4BYTE_KIND:
  11303. out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
  11304. break;
  11305. default:
  11306. Py_UNREACHABLE();
  11307. }
  11308. assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
  11309. if (kind2 != kind1)
  11310. PyMem_Free((void *)buf2);
  11311. return out;
  11312. }
  11313. /*[clinic input]
  11314. str.partition as unicode_partition
  11315. sep: object
  11316. /
  11317. Partition the string into three parts using the given separator.
  11318. This will search for the separator in the string. If the separator is found,
  11319. returns a 3-tuple containing the part before the separator, the separator
  11320. itself, and the part after it.
  11321. If the separator is not found, returns a 3-tuple containing the original string
  11322. and two empty strings.
  11323. [clinic start generated code]*/
  11324. static PyObject *
  11325. unicode_partition(PyObject *self, PyObject *sep)
  11326. /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
  11327. {
  11328. return PyUnicode_Partition(self, sep);
  11329. }
  11330. /*[clinic input]
  11331. str.rpartition as unicode_rpartition = str.partition
  11332. Partition the string into three parts using the given separator.
  11333. This will search for the separator in the string, starting at the end. If
  11334. the separator is found, returns a 3-tuple containing the part before the
  11335. separator, the separator itself, and the part after it.
  11336. If the separator is not found, returns a 3-tuple containing two empty strings
  11337. and the original string.
  11338. [clinic start generated code]*/
  11339. static PyObject *
  11340. unicode_rpartition(PyObject *self, PyObject *sep)
  11341. /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
  11342. {
  11343. return PyUnicode_RPartition(self, sep);
  11344. }
  11345. PyObject *
  11346. PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
  11347. {
  11348. if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
  11349. return NULL;
  11350. return rsplit(s, sep, maxsplit);
  11351. }
  11352. /*[clinic input]
  11353. str.rsplit as unicode_rsplit = str.split
  11354. Return a list of the substrings in the string, using sep as the separator string.
  11355. Splitting starts at the end of the string and works to the front.
  11356. [clinic start generated code]*/
  11357. static PyObject *
  11358. unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
  11359. /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
  11360. {
  11361. if (sep == Py_None)
  11362. return rsplit(self, NULL, maxsplit);
  11363. if (PyUnicode_Check(sep))
  11364. return rsplit(self, sep, maxsplit);
  11365. PyErr_Format(PyExc_TypeError,
  11366. "must be str or None, not %.100s",
  11367. Py_TYPE(sep)->tp_name);
  11368. return NULL;
  11369. }
  11370. /*[clinic input]
  11371. str.splitlines as unicode_splitlines
  11372. keepends: bool = False
  11373. Return a list of the lines in the string, breaking at line boundaries.
  11374. Line breaks are not included in the resulting list unless keepends is given and
  11375. true.
  11376. [clinic start generated code]*/
  11377. static PyObject *
  11378. unicode_splitlines_impl(PyObject *self, int keepends)
  11379. /*[clinic end generated code: output=f664dcdad153ec40 input=ba6ad05ee85d2b55]*/
  11380. {
  11381. return PyUnicode_Splitlines(self, keepends);
  11382. }
  11383. static
  11384. PyObject *unicode_str(PyObject *self)
  11385. {
  11386. return unicode_result_unchanged(self);
  11387. }
  11388. /*[clinic input]
  11389. str.swapcase as unicode_swapcase
  11390. Convert uppercase characters to lowercase and lowercase characters to uppercase.
  11391. [clinic start generated code]*/
  11392. static PyObject *
  11393. unicode_swapcase_impl(PyObject *self)
  11394. /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
  11395. {
  11396. return case_operation(self, do_swapcase);
  11397. }
  11398. /*[clinic input]
  11399. @staticmethod
  11400. str.maketrans as unicode_maketrans
  11401. x: object
  11402. y: unicode=NULL
  11403. z: unicode=NULL
  11404. /
  11405. Return a translation table usable for str.translate().
  11406. If there is only one argument, it must be a dictionary mapping Unicode
  11407. ordinals (integers) or characters to Unicode ordinals, strings or None.
  11408. Character keys will be then converted to ordinals.
  11409. If there are two arguments, they must be strings of equal length, and
  11410. in the resulting dictionary, each character in x will be mapped to the
  11411. character at the same position in y. If there is a third argument, it
  11412. must be a string, whose characters will be mapped to None in the result.
  11413. [clinic start generated code]*/
  11414. static PyObject *
  11415. unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
  11416. /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
  11417. {
  11418. PyObject *new = NULL, *key, *value;
  11419. Py_ssize_t i = 0;
  11420. int res;
  11421. new = PyDict_New();
  11422. if (!new)
  11423. return NULL;
  11424. if (y != NULL) {
  11425. int x_kind, y_kind, z_kind;
  11426. const void *x_data, *y_data, *z_data;
  11427. /* x must be a string too, of equal length */
  11428. if (!PyUnicode_Check(x)) {
  11429. PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
  11430. "be a string if there is a second argument");
  11431. goto err;
  11432. }
  11433. if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
  11434. PyErr_SetString(PyExc_ValueError, "the first two maketrans "
  11435. "arguments must have equal length");
  11436. goto err;
  11437. }
  11438. /* create entries for translating chars in x to those in y */
  11439. x_kind = PyUnicode_KIND(x);
  11440. y_kind = PyUnicode_KIND(y);
  11441. x_data = PyUnicode_DATA(x);
  11442. y_data = PyUnicode_DATA(y);
  11443. for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
  11444. key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
  11445. if (!key)
  11446. goto err;
  11447. value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
  11448. if (!value) {
  11449. Py_DECREF(key);
  11450. goto err;
  11451. }
  11452. res = PyDict_SetItem(new, key, value);
  11453. Py_DECREF(key);
  11454. Py_DECREF(value);
  11455. if (res < 0)
  11456. goto err;
  11457. }
  11458. /* create entries for deleting chars in z */
  11459. if (z != NULL) {
  11460. z_kind = PyUnicode_KIND(z);
  11461. z_data = PyUnicode_DATA(z);
  11462. for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
  11463. key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
  11464. if (!key)
  11465. goto err;
  11466. res = PyDict_SetItem(new, key, Py_None);
  11467. Py_DECREF(key);
  11468. if (res < 0)
  11469. goto err;
  11470. }
  11471. }
  11472. } else {
  11473. int kind;
  11474. const void *data;
  11475. /* x must be a dict */
  11476. if (!PyDict_CheckExact(x)) {
  11477. PyErr_SetString(PyExc_TypeError, "if you give only one argument "
  11478. "to maketrans it must be a dict");
  11479. goto err;
  11480. }
  11481. /* copy entries into the new dict, converting string keys to int keys */
  11482. while (PyDict_Next(x, &i, &key, &value)) {
  11483. if (PyUnicode_Check(key)) {
  11484. /* convert string keys to integer keys */
  11485. PyObject *newkey;
  11486. if (PyUnicode_GET_LENGTH(key) != 1) {
  11487. PyErr_SetString(PyExc_ValueError, "string keys in translate "
  11488. "table must be of length 1");
  11489. goto err;
  11490. }
  11491. kind = PyUnicode_KIND(key);
  11492. data = PyUnicode_DATA(key);
  11493. newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
  11494. if (!newkey)
  11495. goto err;
  11496. res = PyDict_SetItem(new, newkey, value);
  11497. Py_DECREF(newkey);
  11498. if (res < 0)
  11499. goto err;
  11500. } else if (PyLong_Check(key)) {
  11501. /* just keep integer keys */
  11502. if (PyDict_SetItem(new, key, value) < 0)
  11503. goto err;
  11504. } else {
  11505. PyErr_SetString(PyExc_TypeError, "keys in translate table must "
  11506. "be strings or integers");
  11507. goto err;
  11508. }
  11509. }
  11510. }
  11511. return new;
  11512. err:
  11513. Py_DECREF(new);
  11514. return NULL;
  11515. }
  11516. /*[clinic input]
  11517. str.translate as unicode_translate
  11518. table: object
  11519. Translation table, which must be a mapping of Unicode ordinals to
  11520. Unicode ordinals, strings, or None.
  11521. /
  11522. Replace each character in the string using the given translation table.
  11523. The table must implement lookup/indexing via __getitem__, for instance a
  11524. dictionary or list. If this operation raises LookupError, the character is
  11525. left untouched. Characters mapped to None are deleted.
  11526. [clinic start generated code]*/
  11527. static PyObject *
  11528. unicode_translate(PyObject *self, PyObject *table)
  11529. /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
  11530. {
  11531. return _PyUnicode_TranslateCharmap(self, table, "ignore");
  11532. }
  11533. /*[clinic input]
  11534. str.upper as unicode_upper
  11535. Return a copy of the string converted to uppercase.
  11536. [clinic start generated code]*/
  11537. static PyObject *
  11538. unicode_upper_impl(PyObject *self)
  11539. /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
  11540. {
  11541. if (PyUnicode_IS_ASCII(self))
  11542. return ascii_upper_or_lower(self, 0);
  11543. return case_operation(self, do_upper);
  11544. }
  11545. /*[clinic input]
  11546. str.zfill as unicode_zfill
  11547. width: Py_ssize_t
  11548. /
  11549. Pad a numeric string with zeros on the left, to fill a field of the given width.
  11550. The string is never truncated.
  11551. [clinic start generated code]*/
  11552. static PyObject *
  11553. unicode_zfill_impl(PyObject *self, Py_ssize_t width)
  11554. /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
  11555. {
  11556. Py_ssize_t fill;
  11557. PyObject *u;
  11558. int kind;
  11559. const void *data;
  11560. Py_UCS4 chr;
  11561. if (PyUnicode_GET_LENGTH(self) >= width)
  11562. return unicode_result_unchanged(self);
  11563. fill = width - PyUnicode_GET_LENGTH(self);
  11564. u = pad(self, fill, 0, '0');
  11565. if (u == NULL)
  11566. return NULL;
  11567. kind = PyUnicode_KIND(u);
  11568. data = PyUnicode_DATA(u);
  11569. chr = PyUnicode_READ(kind, data, fill);
  11570. if (chr == '+' || chr == '-') {
  11571. /* move sign to beginning of string */
  11572. PyUnicode_WRITE(kind, data, 0, chr);
  11573. PyUnicode_WRITE(kind, data, fill, '0');
  11574. }
  11575. assert(_PyUnicode_CheckConsistency(u, 1));
  11576. return u;
  11577. }
  11578. PyDoc_STRVAR(startswith__doc__,
  11579. "S.startswith(prefix[, start[, end]]) -> bool\n\
  11580. \n\
  11581. Return True if S starts with the specified prefix, False otherwise.\n\
  11582. With optional start, test S beginning at that position.\n\
  11583. With optional end, stop comparing S at that position.\n\
  11584. prefix can also be a tuple of strings to try.");
  11585. static PyObject *
  11586. unicode_startswith(PyObject *self,
  11587. PyObject *args)
  11588. {
  11589. PyObject *subobj;
  11590. PyObject *substring;
  11591. Py_ssize_t start = 0;
  11592. Py_ssize_t end = PY_SSIZE_T_MAX;
  11593. int result;
  11594. if (!asciilib_parse_args_finds("startswith", args, &subobj, &start, &end))
  11595. return NULL;
  11596. if (PyTuple_Check(subobj)) {
  11597. Py_ssize_t i;
  11598. for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
  11599. substring = PyTuple_GET_ITEM(subobj, i);
  11600. if (!PyUnicode_Check(substring)) {
  11601. PyErr_Format(PyExc_TypeError,
  11602. "tuple for startswith must only contain str, "
  11603. "not %.100s",
  11604. Py_TYPE(substring)->tp_name);
  11605. return NULL;
  11606. }
  11607. result = tailmatch(self, substring, start, end, -1);
  11608. if (result == -1)
  11609. return NULL;
  11610. if (result) {
  11611. Py_RETURN_TRUE;
  11612. }
  11613. }
  11614. /* nothing matched */
  11615. Py_RETURN_FALSE;
  11616. }
  11617. if (!PyUnicode_Check(subobj)) {
  11618. PyErr_Format(PyExc_TypeError,
  11619. "startswith first arg must be str or "
  11620. "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
  11621. return NULL;
  11622. }
  11623. result = tailmatch(self, subobj, start, end, -1);
  11624. if (result == -1)
  11625. return NULL;
  11626. return PyBool_FromLong(result);
  11627. }
  11628. PyDoc_STRVAR(endswith__doc__,
  11629. "S.endswith(suffix[, start[, end]]) -> bool\n\
  11630. \n\
  11631. Return True if S ends with the specified suffix, False otherwise.\n\
  11632. With optional start, test S beginning at that position.\n\
  11633. With optional end, stop comparing S at that position.\n\
  11634. suffix can also be a tuple of strings to try.");
  11635. static PyObject *
  11636. unicode_endswith(PyObject *self,
  11637. PyObject *args)
  11638. {
  11639. PyObject *subobj;
  11640. PyObject *substring;
  11641. Py_ssize_t start = 0;
  11642. Py_ssize_t end = PY_SSIZE_T_MAX;
  11643. int result;
  11644. if (!asciilib_parse_args_finds("endswith", args, &subobj, &start, &end))
  11645. return NULL;
  11646. if (PyTuple_Check(subobj)) {
  11647. Py_ssize_t i;
  11648. for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
  11649. substring = PyTuple_GET_ITEM(subobj, i);
  11650. if (!PyUnicode_Check(substring)) {
  11651. PyErr_Format(PyExc_TypeError,
  11652. "tuple for endswith must only contain str, "
  11653. "not %.100s",
  11654. Py_TYPE(substring)->tp_name);
  11655. return NULL;
  11656. }
  11657. result = tailmatch(self, substring, start, end, +1);
  11658. if (result == -1)
  11659. return NULL;
  11660. if (result) {
  11661. Py_RETURN_TRUE;
  11662. }
  11663. }
  11664. Py_RETURN_FALSE;
  11665. }
  11666. if (!PyUnicode_Check(subobj)) {
  11667. PyErr_Format(PyExc_TypeError,
  11668. "endswith first arg must be str or "
  11669. "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
  11670. return NULL;
  11671. }
  11672. result = tailmatch(self, subobj, start, end, +1);
  11673. if (result == -1)
  11674. return NULL;
  11675. return PyBool_FromLong(result);
  11676. }
  11677. static inline void
  11678. _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
  11679. {
  11680. writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
  11681. writer->data = PyUnicode_DATA(writer->buffer);
  11682. if (!writer->readonly) {
  11683. writer->kind = PyUnicode_KIND(writer->buffer);
  11684. writer->size = PyUnicode_GET_LENGTH(writer->buffer);
  11685. }
  11686. else {
  11687. /* use a value smaller than PyUnicode_1BYTE_KIND() so
  11688. _PyUnicodeWriter_PrepareKind() will copy the buffer. */
  11689. writer->kind = 0;
  11690. assert(writer->kind <= PyUnicode_1BYTE_KIND);
  11691. /* Copy-on-write mode: set buffer size to 0 so
  11692. * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
  11693. * next write. */
  11694. writer->size = 0;
  11695. }
  11696. }
  11697. void
  11698. _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
  11699. {
  11700. memset(writer, 0, sizeof(*writer));
  11701. /* ASCII is the bare minimum */
  11702. writer->min_char = 127;
  11703. /* use a value smaller than PyUnicode_1BYTE_KIND() so
  11704. _PyUnicodeWriter_PrepareKind() will copy the buffer. */
  11705. writer->kind = 0;
  11706. assert(writer->kind <= PyUnicode_1BYTE_KIND);
  11707. }
  11708. // Initialize _PyUnicodeWriter with initial buffer
  11709. static inline void
  11710. _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
  11711. {
  11712. memset(writer, 0, sizeof(*writer));
  11713. writer->buffer = buffer;
  11714. _PyUnicodeWriter_Update(writer);
  11715. writer->min_length = writer->size;
  11716. }
  11717. int
  11718. _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
  11719. Py_ssize_t length, Py_UCS4 maxchar)
  11720. {
  11721. Py_ssize_t newlen;
  11722. PyObject *newbuffer;
  11723. assert(maxchar <= MAX_UNICODE);
  11724. /* ensure that the _PyUnicodeWriter_Prepare macro was used */
  11725. assert((maxchar > writer->maxchar && length >= 0)
  11726. || length > 0);
  11727. if (length > PY_SSIZE_T_MAX - writer->pos) {
  11728. PyErr_NoMemory();
  11729. return -1;
  11730. }
  11731. newlen = writer->pos + length;
  11732. maxchar = Py_MAX(maxchar, writer->min_char);
  11733. if (writer->buffer == NULL) {
  11734. assert(!writer->readonly);
  11735. if (writer->overallocate
  11736. && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
  11737. /* overallocate to limit the number of realloc() */
  11738. newlen += newlen / OVERALLOCATE_FACTOR;
  11739. }
  11740. if (newlen < writer->min_length)
  11741. newlen = writer->min_length;
  11742. writer->buffer = PyUnicode_New(newlen, maxchar);
  11743. if (writer->buffer == NULL)
  11744. return -1;
  11745. }
  11746. else if (newlen > writer->size) {
  11747. if (writer->overallocate
  11748. && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
  11749. /* overallocate to limit the number of realloc() */
  11750. newlen += newlen / OVERALLOCATE_FACTOR;
  11751. }
  11752. if (newlen < writer->min_length)
  11753. newlen = writer->min_length;
  11754. if (maxchar > writer->maxchar || writer->readonly) {
  11755. /* resize + widen */
  11756. maxchar = Py_MAX(maxchar, writer->maxchar);
  11757. newbuffer = PyUnicode_New(newlen, maxchar);
  11758. if (newbuffer == NULL)
  11759. return -1;
  11760. _PyUnicode_FastCopyCharacters(newbuffer, 0,
  11761. writer->buffer, 0, writer->pos);
  11762. Py_DECREF(writer->buffer);
  11763. writer->readonly = 0;
  11764. }
  11765. else {
  11766. newbuffer = resize_compact(writer->buffer, newlen);
  11767. if (newbuffer == NULL)
  11768. return -1;
  11769. }
  11770. writer->buffer = newbuffer;
  11771. }
  11772. else if (maxchar > writer->maxchar) {
  11773. assert(!writer->readonly);
  11774. newbuffer = PyUnicode_New(writer->size, maxchar);
  11775. if (newbuffer == NULL)
  11776. return -1;
  11777. _PyUnicode_FastCopyCharacters(newbuffer, 0,
  11778. writer->buffer, 0, writer->pos);
  11779. Py_SETREF(writer->buffer, newbuffer);
  11780. }
  11781. _PyUnicodeWriter_Update(writer);
  11782. return 0;
  11783. #undef OVERALLOCATE_FACTOR
  11784. }
  11785. int
  11786. _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
  11787. int kind)
  11788. {
  11789. Py_UCS4 maxchar;
  11790. /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
  11791. assert(writer->kind < kind);
  11792. switch (kind)
  11793. {
  11794. case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
  11795. case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
  11796. case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
  11797. default:
  11798. Py_UNREACHABLE();
  11799. }
  11800. return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
  11801. }
  11802. static inline int
  11803. _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
  11804. {
  11805. assert(ch <= MAX_UNICODE);
  11806. if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
  11807. return -1;
  11808. PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
  11809. writer->pos++;
  11810. return 0;
  11811. }
  11812. int
  11813. _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
  11814. {
  11815. return _PyUnicodeWriter_WriteCharInline(writer, ch);
  11816. }
  11817. int
  11818. _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
  11819. {
  11820. Py_UCS4 maxchar;
  11821. Py_ssize_t len;
  11822. len = PyUnicode_GET_LENGTH(str);
  11823. if (len == 0)
  11824. return 0;
  11825. maxchar = PyUnicode_MAX_CHAR_VALUE(str);
  11826. if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
  11827. if (writer->buffer == NULL && !writer->overallocate) {
  11828. assert(_PyUnicode_CheckConsistency(str, 1));
  11829. writer->readonly = 1;
  11830. writer->buffer = Py_NewRef(str);
  11831. _PyUnicodeWriter_Update(writer);
  11832. writer->pos += len;
  11833. return 0;
  11834. }
  11835. if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
  11836. return -1;
  11837. }
  11838. _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
  11839. str, 0, len);
  11840. writer->pos += len;
  11841. return 0;
  11842. }
  11843. int
  11844. _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
  11845. Py_ssize_t start, Py_ssize_t end)
  11846. {
  11847. Py_UCS4 maxchar;
  11848. Py_ssize_t len;
  11849. assert(0 <= start);
  11850. assert(end <= PyUnicode_GET_LENGTH(str));
  11851. assert(start <= end);
  11852. if (end == 0)
  11853. return 0;
  11854. if (start == 0 && end == PyUnicode_GET_LENGTH(str))
  11855. return _PyUnicodeWriter_WriteStr(writer, str);
  11856. if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
  11857. maxchar = _PyUnicode_FindMaxChar(str, start, end);
  11858. else
  11859. maxchar = writer->maxchar;
  11860. len = end - start;
  11861. if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
  11862. return -1;
  11863. _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
  11864. str, start, len);
  11865. writer->pos += len;
  11866. return 0;
  11867. }
  11868. int
  11869. _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
  11870. const char *ascii, Py_ssize_t len)
  11871. {
  11872. if (len == -1)
  11873. len = strlen(ascii);
  11874. assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
  11875. if (writer->buffer == NULL && !writer->overallocate) {
  11876. PyObject *str;
  11877. str = _PyUnicode_FromASCII(ascii, len);
  11878. if (str == NULL)
  11879. return -1;
  11880. writer->readonly = 1;
  11881. writer->buffer = str;
  11882. _PyUnicodeWriter_Update(writer);
  11883. writer->pos += len;
  11884. return 0;
  11885. }
  11886. if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
  11887. return -1;
  11888. switch (writer->kind)
  11889. {
  11890. case PyUnicode_1BYTE_KIND:
  11891. {
  11892. const Py_UCS1 *str = (const Py_UCS1 *)ascii;
  11893. Py_UCS1 *data = writer->data;
  11894. memcpy(data + writer->pos, str, len);
  11895. break;
  11896. }
  11897. case PyUnicode_2BYTE_KIND:
  11898. {
  11899. _PyUnicode_CONVERT_BYTES(
  11900. Py_UCS1, Py_UCS2,
  11901. ascii, ascii + len,
  11902. (Py_UCS2 *)writer->data + writer->pos);
  11903. break;
  11904. }
  11905. case PyUnicode_4BYTE_KIND:
  11906. {
  11907. _PyUnicode_CONVERT_BYTES(
  11908. Py_UCS1, Py_UCS4,
  11909. ascii, ascii + len,
  11910. (Py_UCS4 *)writer->data + writer->pos);
  11911. break;
  11912. }
  11913. default:
  11914. Py_UNREACHABLE();
  11915. }
  11916. writer->pos += len;
  11917. return 0;
  11918. }
  11919. int
  11920. _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
  11921. const char *str, Py_ssize_t len)
  11922. {
  11923. Py_UCS4 maxchar;
  11924. maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
  11925. if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
  11926. return -1;
  11927. unicode_write_cstr(writer->buffer, writer->pos, str, len);
  11928. writer->pos += len;
  11929. return 0;
  11930. }
  11931. PyObject *
  11932. _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
  11933. {
  11934. PyObject *str;
  11935. if (writer->pos == 0) {
  11936. Py_CLEAR(writer->buffer);
  11937. _Py_RETURN_UNICODE_EMPTY();
  11938. }
  11939. str = writer->buffer;
  11940. writer->buffer = NULL;
  11941. if (writer->readonly) {
  11942. assert(PyUnicode_GET_LENGTH(str) == writer->pos);
  11943. return str;
  11944. }
  11945. if (PyUnicode_GET_LENGTH(str) != writer->pos) {
  11946. PyObject *str2;
  11947. str2 = resize_compact(str, writer->pos);
  11948. if (str2 == NULL) {
  11949. Py_DECREF(str);
  11950. return NULL;
  11951. }
  11952. str = str2;
  11953. }
  11954. assert(_PyUnicode_CheckConsistency(str, 1));
  11955. return unicode_result(str);
  11956. }
  11957. void
  11958. _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
  11959. {
  11960. Py_CLEAR(writer->buffer);
  11961. }
  11962. #include "stringlib/unicode_format.h"
  11963. PyDoc_STRVAR(format__doc__,
  11964. "S.format(*args, **kwargs) -> str\n\
  11965. \n\
  11966. Return a formatted version of S, using substitutions from args and kwargs.\n\
  11967. The substitutions are identified by braces ('{' and '}').");
  11968. PyDoc_STRVAR(format_map__doc__,
  11969. "S.format_map(mapping) -> str\n\
  11970. \n\
  11971. Return a formatted version of S, using substitutions from mapping.\n\
  11972. The substitutions are identified by braces ('{' and '}').");
  11973. /*[clinic input]
  11974. str.__format__ as unicode___format__
  11975. format_spec: unicode
  11976. /
  11977. Return a formatted version of the string as described by format_spec.
  11978. [clinic start generated code]*/
  11979. static PyObject *
  11980. unicode___format___impl(PyObject *self, PyObject *format_spec)
  11981. /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
  11982. {
  11983. _PyUnicodeWriter writer;
  11984. int ret;
  11985. _PyUnicodeWriter_Init(&writer);
  11986. ret = _PyUnicode_FormatAdvancedWriter(&writer,
  11987. self, format_spec, 0,
  11988. PyUnicode_GET_LENGTH(format_spec));
  11989. if (ret == -1) {
  11990. _PyUnicodeWriter_Dealloc(&writer);
  11991. return NULL;
  11992. }
  11993. return _PyUnicodeWriter_Finish(&writer);
  11994. }
  11995. /*[clinic input]
  11996. str.__sizeof__ as unicode_sizeof
  11997. Return the size of the string in memory, in bytes.
  11998. [clinic start generated code]*/
  11999. static PyObject *
  12000. unicode_sizeof_impl(PyObject *self)
  12001. /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
  12002. {
  12003. Py_ssize_t size;
  12004. /* If it's a compact object, account for base structure +
  12005. character data. */
  12006. if (PyUnicode_IS_COMPACT_ASCII(self)) {
  12007. size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
  12008. }
  12009. else if (PyUnicode_IS_COMPACT(self)) {
  12010. size = sizeof(PyCompactUnicodeObject) +
  12011. (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
  12012. }
  12013. else {
  12014. /* If it is a two-block object, account for base object, and
  12015. for character block if present. */
  12016. size = sizeof(PyUnicodeObject);
  12017. if (_PyUnicode_DATA_ANY(self))
  12018. size += (PyUnicode_GET_LENGTH(self) + 1) *
  12019. PyUnicode_KIND(self);
  12020. }
  12021. if (_PyUnicode_HAS_UTF8_MEMORY(self))
  12022. size += PyUnicode_UTF8_LENGTH(self) + 1;
  12023. return PyLong_FromSsize_t(size);
  12024. }
  12025. static PyObject *
  12026. unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
  12027. {
  12028. PyObject *copy = _PyUnicode_Copy(v);
  12029. if (!copy)
  12030. return NULL;
  12031. return Py_BuildValue("(N)", copy);
  12032. }
  12033. static PyMethodDef unicode_methods[] = {
  12034. UNICODE_ENCODE_METHODDEF
  12035. UNICODE_REPLACE_METHODDEF
  12036. UNICODE_SPLIT_METHODDEF
  12037. UNICODE_RSPLIT_METHODDEF
  12038. UNICODE_JOIN_METHODDEF
  12039. UNICODE_CAPITALIZE_METHODDEF
  12040. UNICODE_CASEFOLD_METHODDEF
  12041. UNICODE_TITLE_METHODDEF
  12042. UNICODE_CENTER_METHODDEF
  12043. {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
  12044. UNICODE_EXPANDTABS_METHODDEF
  12045. {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
  12046. UNICODE_PARTITION_METHODDEF
  12047. {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
  12048. UNICODE_LJUST_METHODDEF
  12049. UNICODE_LOWER_METHODDEF
  12050. UNICODE_LSTRIP_METHODDEF
  12051. {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
  12052. {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
  12053. UNICODE_RJUST_METHODDEF
  12054. UNICODE_RSTRIP_METHODDEF
  12055. UNICODE_RPARTITION_METHODDEF
  12056. UNICODE_SPLITLINES_METHODDEF
  12057. UNICODE_STRIP_METHODDEF
  12058. UNICODE_SWAPCASE_METHODDEF
  12059. UNICODE_TRANSLATE_METHODDEF
  12060. UNICODE_UPPER_METHODDEF
  12061. {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
  12062. {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
  12063. UNICODE_REMOVEPREFIX_METHODDEF
  12064. UNICODE_REMOVESUFFIX_METHODDEF
  12065. UNICODE_ISASCII_METHODDEF
  12066. UNICODE_ISLOWER_METHODDEF
  12067. UNICODE_ISUPPER_METHODDEF
  12068. UNICODE_ISTITLE_METHODDEF
  12069. UNICODE_ISSPACE_METHODDEF
  12070. UNICODE_ISDECIMAL_METHODDEF
  12071. UNICODE_ISDIGIT_METHODDEF
  12072. UNICODE_ISNUMERIC_METHODDEF
  12073. UNICODE_ISALPHA_METHODDEF
  12074. UNICODE_ISALNUM_METHODDEF
  12075. UNICODE_ISIDENTIFIER_METHODDEF
  12076. UNICODE_ISPRINTABLE_METHODDEF
  12077. UNICODE_ZFILL_METHODDEF
  12078. {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
  12079. {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
  12080. UNICODE___FORMAT___METHODDEF
  12081. UNICODE_MAKETRANS_METHODDEF
  12082. UNICODE_SIZEOF_METHODDEF
  12083. {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
  12084. {NULL, NULL}
  12085. };
  12086. static PyObject *
  12087. unicode_mod(PyObject *v, PyObject *w)
  12088. {
  12089. if (!PyUnicode_Check(v))
  12090. Py_RETURN_NOTIMPLEMENTED;
  12091. return PyUnicode_Format(v, w);
  12092. }
  12093. static PyNumberMethods unicode_as_number = {
  12094. 0, /*nb_add*/
  12095. 0, /*nb_subtract*/
  12096. 0, /*nb_multiply*/
  12097. unicode_mod, /*nb_remainder*/
  12098. };
  12099. static PySequenceMethods unicode_as_sequence = {
  12100. (lenfunc) unicode_length, /* sq_length */
  12101. PyUnicode_Concat, /* sq_concat */
  12102. (ssizeargfunc) unicode_repeat, /* sq_repeat */
  12103. (ssizeargfunc) unicode_getitem, /* sq_item */
  12104. 0, /* sq_slice */
  12105. 0, /* sq_ass_item */
  12106. 0, /* sq_ass_slice */
  12107. PyUnicode_Contains, /* sq_contains */
  12108. };
  12109. static PyObject*
  12110. unicode_subscript(PyObject* self, PyObject* item)
  12111. {
  12112. if (_PyIndex_Check(item)) {
  12113. Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
  12114. if (i == -1 && PyErr_Occurred())
  12115. return NULL;
  12116. if (i < 0)
  12117. i += PyUnicode_GET_LENGTH(self);
  12118. return unicode_getitem(self, i);
  12119. } else if (PySlice_Check(item)) {
  12120. Py_ssize_t start, stop, step, slicelength, i;
  12121. size_t cur;
  12122. PyObject *result;
  12123. const void *src_data;
  12124. void *dest_data;
  12125. int src_kind, dest_kind;
  12126. Py_UCS4 ch, max_char, kind_limit;
  12127. if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
  12128. return NULL;
  12129. }
  12130. slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
  12131. &start, &stop, step);
  12132. if (slicelength <= 0) {
  12133. _Py_RETURN_UNICODE_EMPTY();
  12134. } else if (start == 0 && step == 1 &&
  12135. slicelength == PyUnicode_GET_LENGTH(self)) {
  12136. return unicode_result_unchanged(self);
  12137. } else if (step == 1) {
  12138. return PyUnicode_Substring(self,
  12139. start, start + slicelength);
  12140. }
  12141. /* General case */
  12142. src_kind = PyUnicode_KIND(self);
  12143. src_data = PyUnicode_DATA(self);
  12144. if (!PyUnicode_IS_ASCII(self)) {
  12145. kind_limit = kind_maxchar_limit(src_kind);
  12146. max_char = 0;
  12147. for (cur = start, i = 0; i < slicelength; cur += step, i++) {
  12148. ch = PyUnicode_READ(src_kind, src_data, cur);
  12149. if (ch > max_char) {
  12150. max_char = ch;
  12151. if (max_char >= kind_limit)
  12152. break;
  12153. }
  12154. }
  12155. }
  12156. else
  12157. max_char = 127;
  12158. result = PyUnicode_New(slicelength, max_char);
  12159. if (result == NULL)
  12160. return NULL;
  12161. dest_kind = PyUnicode_KIND(result);
  12162. dest_data = PyUnicode_DATA(result);
  12163. for (cur = start, i = 0; i < slicelength; cur += step, i++) {
  12164. Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
  12165. PyUnicode_WRITE(dest_kind, dest_data, i, ch);
  12166. }
  12167. assert(_PyUnicode_CheckConsistency(result, 1));
  12168. return result;
  12169. } else {
  12170. PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
  12171. Py_TYPE(item)->tp_name);
  12172. return NULL;
  12173. }
  12174. }
  12175. static PyMappingMethods unicode_as_mapping = {
  12176. (lenfunc)unicode_length, /* mp_length */
  12177. (binaryfunc)unicode_subscript, /* mp_subscript */
  12178. (objobjargproc)0, /* mp_ass_subscript */
  12179. };
  12180. /* Helpers for PyUnicode_Format() */
  12181. struct unicode_formatter_t {
  12182. PyObject *args;
  12183. int args_owned;
  12184. Py_ssize_t arglen, argidx;
  12185. PyObject *dict;
  12186. int fmtkind;
  12187. Py_ssize_t fmtcnt, fmtpos;
  12188. const void *fmtdata;
  12189. PyObject *fmtstr;
  12190. _PyUnicodeWriter writer;
  12191. };
  12192. struct unicode_format_arg_t {
  12193. Py_UCS4 ch;
  12194. int flags;
  12195. Py_ssize_t width;
  12196. int prec;
  12197. int sign;
  12198. };
  12199. static PyObject *
  12200. unicode_format_getnextarg(struct unicode_formatter_t *ctx)
  12201. {
  12202. Py_ssize_t argidx = ctx->argidx;
  12203. if (argidx < ctx->arglen) {
  12204. ctx->argidx++;
  12205. if (ctx->arglen < 0)
  12206. return ctx->args;
  12207. else
  12208. return PyTuple_GetItem(ctx->args, argidx);
  12209. }
  12210. PyErr_SetString(PyExc_TypeError,
  12211. "not enough arguments for format string");
  12212. return NULL;
  12213. }
  12214. /* Returns a new reference to a PyUnicode object, or NULL on failure. */
  12215. /* Format a float into the writer if the writer is not NULL, or into *p_output
  12216. otherwise.
  12217. Return 0 on success, raise an exception and return -1 on error. */
  12218. static int
  12219. formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
  12220. PyObject **p_output,
  12221. _PyUnicodeWriter *writer)
  12222. {
  12223. char *p;
  12224. double x;
  12225. Py_ssize_t len;
  12226. int prec;
  12227. int dtoa_flags = 0;
  12228. x = PyFloat_AsDouble(v);
  12229. if (x == -1.0 && PyErr_Occurred())
  12230. return -1;
  12231. prec = arg->prec;
  12232. if (prec < 0)
  12233. prec = 6;
  12234. if (arg->flags & F_ALT)
  12235. dtoa_flags |= Py_DTSF_ALT;
  12236. p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
  12237. if (p == NULL)
  12238. return -1;
  12239. len = strlen(p);
  12240. if (writer) {
  12241. if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
  12242. PyMem_Free(p);
  12243. return -1;
  12244. }
  12245. }
  12246. else
  12247. *p_output = _PyUnicode_FromASCII(p, len);
  12248. PyMem_Free(p);
  12249. return 0;
  12250. }
  12251. /* formatlong() emulates the format codes d, u, o, x and X, and
  12252. * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
  12253. * Python's regular ints.
  12254. * Return value: a new PyUnicodeObject*, or NULL if error.
  12255. * The output string is of the form
  12256. * "-"? ("0x" | "0X")? digit+
  12257. * "0x"/"0X" are present only for x and X conversions, with F_ALT
  12258. * set in flags. The case of hex digits will be correct,
  12259. * There will be at least prec digits, zero-filled on the left if
  12260. * necessary to get that many.
  12261. * val object to be converted
  12262. * flags bitmask of format flags; only F_ALT is looked at
  12263. * prec minimum number of digits; 0-fill on left if needed
  12264. * type a character in [duoxX]; u acts the same as d
  12265. *
  12266. * CAUTION: o, x and X conversions on regular ints can never
  12267. * produce a '-' sign, but can for Python's unbounded ints.
  12268. */
  12269. PyObject *
  12270. _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
  12271. {
  12272. PyObject *result = NULL;
  12273. char *buf;
  12274. Py_ssize_t i;
  12275. int sign; /* 1 if '-', else 0 */
  12276. int len; /* number of characters */
  12277. Py_ssize_t llen;
  12278. int numdigits; /* len == numnondigits + numdigits */
  12279. int numnondigits = 0;
  12280. /* Avoid exceeding SSIZE_T_MAX */
  12281. if (prec > INT_MAX-3) {
  12282. PyErr_SetString(PyExc_OverflowError,
  12283. "precision too large");
  12284. return NULL;
  12285. }
  12286. assert(PyLong_Check(val));
  12287. switch (type) {
  12288. default:
  12289. Py_UNREACHABLE();
  12290. case 'd':
  12291. case 'i':
  12292. case 'u':
  12293. /* int and int subclasses should print numerically when a numeric */
  12294. /* format code is used (see issue18780) */
  12295. result = PyNumber_ToBase(val, 10);
  12296. break;
  12297. case 'o':
  12298. numnondigits = 2;
  12299. result = PyNumber_ToBase(val, 8);
  12300. break;
  12301. case 'x':
  12302. case 'X':
  12303. numnondigits = 2;
  12304. result = PyNumber_ToBase(val, 16);
  12305. break;
  12306. }
  12307. if (!result)
  12308. return NULL;
  12309. assert(unicode_modifiable(result));
  12310. assert(PyUnicode_IS_ASCII(result));
  12311. /* To modify the string in-place, there can only be one reference. */
  12312. if (Py_REFCNT(result) != 1) {
  12313. Py_DECREF(result);
  12314. PyErr_BadInternalCall();
  12315. return NULL;
  12316. }
  12317. buf = PyUnicode_DATA(result);
  12318. llen = PyUnicode_GET_LENGTH(result);
  12319. if (llen > INT_MAX) {
  12320. Py_DECREF(result);
  12321. PyErr_SetString(PyExc_ValueError,
  12322. "string too large in _PyUnicode_FormatLong");
  12323. return NULL;
  12324. }
  12325. len = (int)llen;
  12326. sign = buf[0] == '-';
  12327. numnondigits += sign;
  12328. numdigits = len - numnondigits;
  12329. assert(numdigits > 0);
  12330. /* Get rid of base marker unless F_ALT */
  12331. if (((alt) == 0 &&
  12332. (type == 'o' || type == 'x' || type == 'X'))) {
  12333. assert(buf[sign] == '0');
  12334. assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
  12335. buf[sign+1] == 'o');
  12336. numnondigits -= 2;
  12337. buf += 2;
  12338. len -= 2;
  12339. if (sign)
  12340. buf[0] = '-';
  12341. assert(len == numnondigits + numdigits);
  12342. assert(numdigits > 0);
  12343. }
  12344. /* Fill with leading zeroes to meet minimum width. */
  12345. if (prec > numdigits) {
  12346. PyObject *r1 = PyBytes_FromStringAndSize(NULL,
  12347. numnondigits + prec);
  12348. char *b1;
  12349. if (!r1) {
  12350. Py_DECREF(result);
  12351. return NULL;
  12352. }
  12353. b1 = PyBytes_AS_STRING(r1);
  12354. for (i = 0; i < numnondigits; ++i)
  12355. *b1++ = *buf++;
  12356. for (i = 0; i < prec - numdigits; i++)
  12357. *b1++ = '0';
  12358. for (i = 0; i < numdigits; i++)
  12359. *b1++ = *buf++;
  12360. *b1 = '\0';
  12361. Py_SETREF(result, r1);
  12362. buf = PyBytes_AS_STRING(result);
  12363. len = numnondigits + prec;
  12364. }
  12365. /* Fix up case for hex conversions. */
  12366. if (type == 'X') {
  12367. /* Need to convert all lower case letters to upper case.
  12368. and need to convert 0x to 0X (and -0x to -0X). */
  12369. for (i = 0; i < len; i++)
  12370. if (buf[i] >= 'a' && buf[i] <= 'x')
  12371. buf[i] -= 'a'-'A';
  12372. }
  12373. if (!PyUnicode_Check(result)
  12374. || buf != PyUnicode_DATA(result)) {
  12375. PyObject *unicode;
  12376. unicode = _PyUnicode_FromASCII(buf, len);
  12377. Py_SETREF(result, unicode);
  12378. }
  12379. else if (len != PyUnicode_GET_LENGTH(result)) {
  12380. if (PyUnicode_Resize(&result, len) < 0)
  12381. Py_CLEAR(result);
  12382. }
  12383. return result;
  12384. }
  12385. /* Format an integer or a float as an integer.
  12386. * Return 1 if the number has been formatted into the writer,
  12387. * 0 if the number has been formatted into *p_output
  12388. * -1 and raise an exception on error */
  12389. static int
  12390. mainformatlong(PyObject *v,
  12391. struct unicode_format_arg_t *arg,
  12392. PyObject **p_output,
  12393. _PyUnicodeWriter *writer)
  12394. {
  12395. PyObject *iobj, *res;
  12396. char type = (char)arg->ch;
  12397. if (!PyNumber_Check(v))
  12398. goto wrongtype;
  12399. /* make sure number is a type of integer for o, x, and X */
  12400. if (!PyLong_Check(v)) {
  12401. if (type == 'o' || type == 'x' || type == 'X') {
  12402. iobj = _PyNumber_Index(v);
  12403. }
  12404. else {
  12405. iobj = PyNumber_Long(v);
  12406. }
  12407. if (iobj == NULL ) {
  12408. if (PyErr_ExceptionMatches(PyExc_TypeError))
  12409. goto wrongtype;
  12410. return -1;
  12411. }
  12412. assert(PyLong_Check(iobj));
  12413. }
  12414. else {
  12415. iobj = Py_NewRef(v);
  12416. }
  12417. if (PyLong_CheckExact(v)
  12418. && arg->width == -1 && arg->prec == -1
  12419. && !(arg->flags & (F_SIGN | F_BLANK))
  12420. && type != 'X')
  12421. {
  12422. /* Fast path */
  12423. int alternate = arg->flags & F_ALT;
  12424. int base;
  12425. switch(type)
  12426. {
  12427. default:
  12428. Py_UNREACHABLE();
  12429. case 'd':
  12430. case 'i':
  12431. case 'u':
  12432. base = 10;
  12433. break;
  12434. case 'o':
  12435. base = 8;
  12436. break;
  12437. case 'x':
  12438. case 'X':
  12439. base = 16;
  12440. break;
  12441. }
  12442. if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
  12443. Py_DECREF(iobj);
  12444. return -1;
  12445. }
  12446. Py_DECREF(iobj);
  12447. return 1;
  12448. }
  12449. res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
  12450. Py_DECREF(iobj);
  12451. if (res == NULL)
  12452. return -1;
  12453. *p_output = res;
  12454. return 0;
  12455. wrongtype:
  12456. switch(type)
  12457. {
  12458. case 'o':
  12459. case 'x':
  12460. case 'X':
  12461. PyErr_Format(PyExc_TypeError,
  12462. "%%%c format: an integer is required, "
  12463. "not %.200s",
  12464. type, Py_TYPE(v)->tp_name);
  12465. break;
  12466. default:
  12467. PyErr_Format(PyExc_TypeError,
  12468. "%%%c format: a real number is required, "
  12469. "not %.200s",
  12470. type, Py_TYPE(v)->tp_name);
  12471. break;
  12472. }
  12473. return -1;
  12474. }
  12475. static Py_UCS4
  12476. formatchar(PyObject *v)
  12477. {
  12478. /* presume that the buffer is at least 3 characters long */
  12479. if (PyUnicode_Check(v)) {
  12480. if (PyUnicode_GET_LENGTH(v) == 1) {
  12481. return PyUnicode_READ_CHAR(v, 0);
  12482. }
  12483. goto onError;
  12484. }
  12485. else {
  12486. int overflow;
  12487. long x = PyLong_AsLongAndOverflow(v, &overflow);
  12488. if (x == -1 && PyErr_Occurred()) {
  12489. if (PyErr_ExceptionMatches(PyExc_TypeError)) {
  12490. goto onError;
  12491. }
  12492. return (Py_UCS4) -1;
  12493. }
  12494. if (x < 0 || x > MAX_UNICODE) {
  12495. /* this includes an overflow in converting to C long */
  12496. PyErr_SetString(PyExc_OverflowError,
  12497. "%c arg not in range(0x110000)");
  12498. return (Py_UCS4) -1;
  12499. }
  12500. return (Py_UCS4) x;
  12501. }
  12502. onError:
  12503. PyErr_SetString(PyExc_TypeError,
  12504. "%c requires int or char");
  12505. return (Py_UCS4) -1;
  12506. }
  12507. /* Parse options of an argument: flags, width, precision.
  12508. Handle also "%(name)" syntax.
  12509. Return 0 if the argument has been formatted into arg->str.
  12510. Return 1 if the argument has been written into ctx->writer,
  12511. Raise an exception and return -1 on error. */
  12512. static int
  12513. unicode_format_arg_parse(struct unicode_formatter_t *ctx,
  12514. struct unicode_format_arg_t *arg)
  12515. {
  12516. #define FORMAT_READ(ctx) \
  12517. PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
  12518. PyObject *v;
  12519. if (arg->ch == '(') {
  12520. /* Get argument value from a dictionary. Example: "%(name)s". */
  12521. Py_ssize_t keystart;
  12522. Py_ssize_t keylen;
  12523. PyObject *key;
  12524. int pcount = 1;
  12525. if (ctx->dict == NULL) {
  12526. PyErr_SetString(PyExc_TypeError,
  12527. "format requires a mapping");
  12528. return -1;
  12529. }
  12530. ++ctx->fmtpos;
  12531. --ctx->fmtcnt;
  12532. keystart = ctx->fmtpos;
  12533. /* Skip over balanced parentheses */
  12534. while (pcount > 0 && --ctx->fmtcnt >= 0) {
  12535. arg->ch = FORMAT_READ(ctx);
  12536. if (arg->ch == ')')
  12537. --pcount;
  12538. else if (arg->ch == '(')
  12539. ++pcount;
  12540. ctx->fmtpos++;
  12541. }
  12542. keylen = ctx->fmtpos - keystart - 1;
  12543. if (ctx->fmtcnt < 0 || pcount > 0) {
  12544. PyErr_SetString(PyExc_ValueError,
  12545. "incomplete format key");
  12546. return -1;
  12547. }
  12548. key = PyUnicode_Substring(ctx->fmtstr,
  12549. keystart, keystart + keylen);
  12550. if (key == NULL)
  12551. return -1;
  12552. if (ctx->args_owned) {
  12553. ctx->args_owned = 0;
  12554. Py_DECREF(ctx->args);
  12555. }
  12556. ctx->args = PyObject_GetItem(ctx->dict, key);
  12557. Py_DECREF(key);
  12558. if (ctx->args == NULL)
  12559. return -1;
  12560. ctx->args_owned = 1;
  12561. ctx->arglen = -1;
  12562. ctx->argidx = -2;
  12563. }
  12564. /* Parse flags. Example: "%+i" => flags=F_SIGN. */
  12565. while (--ctx->fmtcnt >= 0) {
  12566. arg->ch = FORMAT_READ(ctx);
  12567. ctx->fmtpos++;
  12568. switch (arg->ch) {
  12569. case '-': arg->flags |= F_LJUST; continue;
  12570. case '+': arg->flags |= F_SIGN; continue;
  12571. case ' ': arg->flags |= F_BLANK; continue;
  12572. case '#': arg->flags |= F_ALT; continue;
  12573. case '0': arg->flags |= F_ZERO; continue;
  12574. }
  12575. break;
  12576. }
  12577. /* Parse width. Example: "%10s" => width=10 */
  12578. if (arg->ch == '*') {
  12579. v = unicode_format_getnextarg(ctx);
  12580. if (v == NULL)
  12581. return -1;
  12582. if (!PyLong_Check(v)) {
  12583. PyErr_SetString(PyExc_TypeError,
  12584. "* wants int");
  12585. return -1;
  12586. }
  12587. arg->width = PyLong_AsSsize_t(v);
  12588. if (arg->width == -1 && PyErr_Occurred())
  12589. return -1;
  12590. if (arg->width < 0) {
  12591. arg->flags |= F_LJUST;
  12592. arg->width = -arg->width;
  12593. }
  12594. if (--ctx->fmtcnt >= 0) {
  12595. arg->ch = FORMAT_READ(ctx);
  12596. ctx->fmtpos++;
  12597. }
  12598. }
  12599. else if (arg->ch >= '0' && arg->ch <= '9') {
  12600. arg->width = arg->ch - '0';
  12601. while (--ctx->fmtcnt >= 0) {
  12602. arg->ch = FORMAT_READ(ctx);
  12603. ctx->fmtpos++;
  12604. if (arg->ch < '0' || arg->ch > '9')
  12605. break;
  12606. /* Since arg->ch is unsigned, the RHS would end up as unsigned,
  12607. mixing signed and unsigned comparison. Since arg->ch is between
  12608. '0' and '9', casting to int is safe. */
  12609. if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
  12610. PyErr_SetString(PyExc_ValueError,
  12611. "width too big");
  12612. return -1;
  12613. }
  12614. arg->width = arg->width*10 + (arg->ch - '0');
  12615. }
  12616. }
  12617. /* Parse precision. Example: "%.3f" => prec=3 */
  12618. if (arg->ch == '.') {
  12619. arg->prec = 0;
  12620. if (--ctx->fmtcnt >= 0) {
  12621. arg->ch = FORMAT_READ(ctx);
  12622. ctx->fmtpos++;
  12623. }
  12624. if (arg->ch == '*') {
  12625. v = unicode_format_getnextarg(ctx);
  12626. if (v == NULL)
  12627. return -1;
  12628. if (!PyLong_Check(v)) {
  12629. PyErr_SetString(PyExc_TypeError,
  12630. "* wants int");
  12631. return -1;
  12632. }
  12633. arg->prec = _PyLong_AsInt(v);
  12634. if (arg->prec == -1 && PyErr_Occurred())
  12635. return -1;
  12636. if (arg->prec < 0)
  12637. arg->prec = 0;
  12638. if (--ctx->fmtcnt >= 0) {
  12639. arg->ch = FORMAT_READ(ctx);
  12640. ctx->fmtpos++;
  12641. }
  12642. }
  12643. else if (arg->ch >= '0' && arg->ch <= '9') {
  12644. arg->prec = arg->ch - '0';
  12645. while (--ctx->fmtcnt >= 0) {
  12646. arg->ch = FORMAT_READ(ctx);
  12647. ctx->fmtpos++;
  12648. if (arg->ch < '0' || arg->ch > '9')
  12649. break;
  12650. if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
  12651. PyErr_SetString(PyExc_ValueError,
  12652. "precision too big");
  12653. return -1;
  12654. }
  12655. arg->prec = arg->prec*10 + (arg->ch - '0');
  12656. }
  12657. }
  12658. }
  12659. /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
  12660. if (ctx->fmtcnt >= 0) {
  12661. if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
  12662. if (--ctx->fmtcnt >= 0) {
  12663. arg->ch = FORMAT_READ(ctx);
  12664. ctx->fmtpos++;
  12665. }
  12666. }
  12667. }
  12668. if (ctx->fmtcnt < 0) {
  12669. PyErr_SetString(PyExc_ValueError,
  12670. "incomplete format");
  12671. return -1;
  12672. }
  12673. return 0;
  12674. #undef FORMAT_READ
  12675. }
  12676. /* Format one argument. Supported conversion specifiers:
  12677. - "s", "r", "a": any type
  12678. - "i", "d", "u": int or float
  12679. - "o", "x", "X": int
  12680. - "e", "E", "f", "F", "g", "G": float
  12681. - "c": int or str (1 character)
  12682. When possible, the output is written directly into the Unicode writer
  12683. (ctx->writer). A string is created when padding is required.
  12684. Return 0 if the argument has been formatted into *p_str,
  12685. 1 if the argument has been written into ctx->writer,
  12686. -1 on error. */
  12687. static int
  12688. unicode_format_arg_format(struct unicode_formatter_t *ctx,
  12689. struct unicode_format_arg_t *arg,
  12690. PyObject **p_str)
  12691. {
  12692. PyObject *v;
  12693. _PyUnicodeWriter *writer = &ctx->writer;
  12694. if (ctx->fmtcnt == 0)
  12695. ctx->writer.overallocate = 0;
  12696. v = unicode_format_getnextarg(ctx);
  12697. if (v == NULL)
  12698. return -1;
  12699. switch (arg->ch) {
  12700. case 's':
  12701. case 'r':
  12702. case 'a':
  12703. if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
  12704. /* Fast path */
  12705. if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
  12706. return -1;
  12707. return 1;
  12708. }
  12709. if (PyUnicode_CheckExact(v) && arg->ch == 's') {
  12710. *p_str = Py_NewRef(v);
  12711. }
  12712. else {
  12713. if (arg->ch == 's')
  12714. *p_str = PyObject_Str(v);
  12715. else if (arg->ch == 'r')
  12716. *p_str = PyObject_Repr(v);
  12717. else
  12718. *p_str = PyObject_ASCII(v);
  12719. }
  12720. break;
  12721. case 'i':
  12722. case 'd':
  12723. case 'u':
  12724. case 'o':
  12725. case 'x':
  12726. case 'X':
  12727. {
  12728. int ret = mainformatlong(v, arg, p_str, writer);
  12729. if (ret != 0)
  12730. return ret;
  12731. arg->sign = 1;
  12732. break;
  12733. }
  12734. case 'e':
  12735. case 'E':
  12736. case 'f':
  12737. case 'F':
  12738. case 'g':
  12739. case 'G':
  12740. if (arg->width == -1 && arg->prec == -1
  12741. && !(arg->flags & (F_SIGN | F_BLANK)))
  12742. {
  12743. /* Fast path */
  12744. if (formatfloat(v, arg, NULL, writer) == -1)
  12745. return -1;
  12746. return 1;
  12747. }
  12748. arg->sign = 1;
  12749. if (formatfloat(v, arg, p_str, NULL) == -1)
  12750. return -1;
  12751. break;
  12752. case 'c':
  12753. {
  12754. Py_UCS4 ch = formatchar(v);
  12755. if (ch == (Py_UCS4) -1)
  12756. return -1;
  12757. if (arg->width == -1 && arg->prec == -1) {
  12758. /* Fast path */
  12759. if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
  12760. return -1;
  12761. return 1;
  12762. }
  12763. *p_str = PyUnicode_FromOrdinal(ch);
  12764. break;
  12765. }
  12766. default:
  12767. PyErr_Format(PyExc_ValueError,
  12768. "unsupported format character '%c' (0x%x) "
  12769. "at index %zd",
  12770. (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
  12771. (int)arg->ch,
  12772. ctx->fmtpos - 1);
  12773. return -1;
  12774. }
  12775. if (*p_str == NULL)
  12776. return -1;
  12777. assert (PyUnicode_Check(*p_str));
  12778. return 0;
  12779. }
  12780. static int
  12781. unicode_format_arg_output(struct unicode_formatter_t *ctx,
  12782. struct unicode_format_arg_t *arg,
  12783. PyObject *str)
  12784. {
  12785. Py_ssize_t len;
  12786. int kind;
  12787. const void *pbuf;
  12788. Py_ssize_t pindex;
  12789. Py_UCS4 signchar;
  12790. Py_ssize_t buflen;
  12791. Py_UCS4 maxchar;
  12792. Py_ssize_t sublen;
  12793. _PyUnicodeWriter *writer = &ctx->writer;
  12794. Py_UCS4 fill;
  12795. fill = ' ';
  12796. if (arg->sign && arg->flags & F_ZERO)
  12797. fill = '0';
  12798. len = PyUnicode_GET_LENGTH(str);
  12799. if ((arg->width == -1 || arg->width <= len)
  12800. && (arg->prec == -1 || arg->prec >= len)
  12801. && !(arg->flags & (F_SIGN | F_BLANK)))
  12802. {
  12803. /* Fast path */
  12804. if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
  12805. return -1;
  12806. return 0;
  12807. }
  12808. /* Truncate the string for "s", "r" and "a" formats
  12809. if the precision is set */
  12810. if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
  12811. if (arg->prec >= 0 && len > arg->prec)
  12812. len = arg->prec;
  12813. }
  12814. /* Adjust sign and width */
  12815. kind = PyUnicode_KIND(str);
  12816. pbuf = PyUnicode_DATA(str);
  12817. pindex = 0;
  12818. signchar = '\0';
  12819. if (arg->sign) {
  12820. Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
  12821. if (ch == '-' || ch == '+') {
  12822. signchar = ch;
  12823. len--;
  12824. pindex++;
  12825. }
  12826. else if (arg->flags & F_SIGN)
  12827. signchar = '+';
  12828. else if (arg->flags & F_BLANK)
  12829. signchar = ' ';
  12830. else
  12831. arg->sign = 0;
  12832. }
  12833. if (arg->width < len)
  12834. arg->width = len;
  12835. /* Prepare the writer */
  12836. maxchar = writer->maxchar;
  12837. if (!(arg->flags & F_LJUST)) {
  12838. if (arg->sign) {
  12839. if ((arg->width-1) > len)
  12840. maxchar = Py_MAX(maxchar, fill);
  12841. }
  12842. else {
  12843. if (arg->width > len)
  12844. maxchar = Py_MAX(maxchar, fill);
  12845. }
  12846. }
  12847. if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
  12848. Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
  12849. maxchar = Py_MAX(maxchar, strmaxchar);
  12850. }
  12851. buflen = arg->width;
  12852. if (arg->sign && len == arg->width)
  12853. buflen++;
  12854. if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
  12855. return -1;
  12856. /* Write the sign if needed */
  12857. if (arg->sign) {
  12858. if (fill != ' ') {
  12859. PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
  12860. writer->pos += 1;
  12861. }
  12862. if (arg->width > len)
  12863. arg->width--;
  12864. }
  12865. /* Write the numeric prefix for "x", "X" and "o" formats
  12866. if the alternate form is used.
  12867. For example, write "0x" for the "%#x" format. */
  12868. if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
  12869. assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
  12870. assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
  12871. if (fill != ' ') {
  12872. PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
  12873. PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
  12874. writer->pos += 2;
  12875. pindex += 2;
  12876. }
  12877. arg->width -= 2;
  12878. if (arg->width < 0)
  12879. arg->width = 0;
  12880. len -= 2;
  12881. }
  12882. /* Pad left with the fill character if needed */
  12883. if (arg->width > len && !(arg->flags & F_LJUST)) {
  12884. sublen = arg->width - len;
  12885. unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
  12886. writer->pos += sublen;
  12887. arg->width = len;
  12888. }
  12889. /* If padding with spaces: write sign if needed and/or numeric prefix if
  12890. the alternate form is used */
  12891. if (fill == ' ') {
  12892. if (arg->sign) {
  12893. PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
  12894. writer->pos += 1;
  12895. }
  12896. if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
  12897. assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
  12898. assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
  12899. PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
  12900. PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
  12901. writer->pos += 2;
  12902. pindex += 2;
  12903. }
  12904. }
  12905. /* Write characters */
  12906. if (len) {
  12907. _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
  12908. str, pindex, len);
  12909. writer->pos += len;
  12910. }
  12911. /* Pad right with the fill character if needed */
  12912. if (arg->width > len) {
  12913. sublen = arg->width - len;
  12914. unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
  12915. writer->pos += sublen;
  12916. }
  12917. return 0;
  12918. }
  12919. /* Helper of PyUnicode_Format(): format one arg.
  12920. Return 0 on success, raise an exception and return -1 on error. */
  12921. static int
  12922. unicode_format_arg(struct unicode_formatter_t *ctx)
  12923. {
  12924. struct unicode_format_arg_t arg;
  12925. PyObject *str;
  12926. int ret;
  12927. arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
  12928. if (arg.ch == '%') {
  12929. ctx->fmtpos++;
  12930. ctx->fmtcnt--;
  12931. if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
  12932. return -1;
  12933. return 0;
  12934. }
  12935. arg.flags = 0;
  12936. arg.width = -1;
  12937. arg.prec = -1;
  12938. arg.sign = 0;
  12939. str = NULL;
  12940. ret = unicode_format_arg_parse(ctx, &arg);
  12941. if (ret == -1)
  12942. return -1;
  12943. ret = unicode_format_arg_format(ctx, &arg, &str);
  12944. if (ret == -1)
  12945. return -1;
  12946. if (ret != 1) {
  12947. ret = unicode_format_arg_output(ctx, &arg, str);
  12948. Py_DECREF(str);
  12949. if (ret == -1)
  12950. return -1;
  12951. }
  12952. if (ctx->dict && (ctx->argidx < ctx->arglen)) {
  12953. PyErr_SetString(PyExc_TypeError,
  12954. "not all arguments converted during string formatting");
  12955. return -1;
  12956. }
  12957. return 0;
  12958. }
  12959. PyObject *
  12960. PyUnicode_Format(PyObject *format, PyObject *args)
  12961. {
  12962. struct unicode_formatter_t ctx;
  12963. if (format == NULL || args == NULL) {
  12964. PyErr_BadInternalCall();
  12965. return NULL;
  12966. }
  12967. if (ensure_unicode(format) < 0)
  12968. return NULL;
  12969. ctx.fmtstr = format;
  12970. ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
  12971. ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
  12972. ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
  12973. ctx.fmtpos = 0;
  12974. _PyUnicodeWriter_Init(&ctx.writer);
  12975. ctx.writer.min_length = ctx.fmtcnt + 100;
  12976. ctx.writer.overallocate = 1;
  12977. if (PyTuple_Check(args)) {
  12978. ctx.arglen = PyTuple_Size(args);
  12979. ctx.argidx = 0;
  12980. }
  12981. else {
  12982. ctx.arglen = -1;
  12983. ctx.argidx = -2;
  12984. }
  12985. ctx.args_owned = 0;
  12986. if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
  12987. ctx.dict = args;
  12988. else
  12989. ctx.dict = NULL;
  12990. ctx.args = args;
  12991. while (--ctx.fmtcnt >= 0) {
  12992. if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
  12993. Py_ssize_t nonfmtpos;
  12994. nonfmtpos = ctx.fmtpos++;
  12995. while (ctx.fmtcnt >= 0 &&
  12996. PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
  12997. ctx.fmtpos++;
  12998. ctx.fmtcnt--;
  12999. }
  13000. if (ctx.fmtcnt < 0) {
  13001. ctx.fmtpos--;
  13002. ctx.writer.overallocate = 0;
  13003. }
  13004. if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
  13005. nonfmtpos, ctx.fmtpos) < 0)
  13006. goto onError;
  13007. }
  13008. else {
  13009. ctx.fmtpos++;
  13010. if (unicode_format_arg(&ctx) == -1)
  13011. goto onError;
  13012. }
  13013. }
  13014. if (ctx.argidx < ctx.arglen && !ctx.dict) {
  13015. PyErr_SetString(PyExc_TypeError,
  13016. "not all arguments converted during string formatting");
  13017. goto onError;
  13018. }
  13019. if (ctx.args_owned) {
  13020. Py_DECREF(ctx.args);
  13021. }
  13022. return _PyUnicodeWriter_Finish(&ctx.writer);
  13023. onError:
  13024. _PyUnicodeWriter_Dealloc(&ctx.writer);
  13025. if (ctx.args_owned) {
  13026. Py_DECREF(ctx.args);
  13027. }
  13028. return NULL;
  13029. }
  13030. static PyObject *
  13031. unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
  13032. /*[clinic input]
  13033. @classmethod
  13034. str.__new__ as unicode_new
  13035. object as x: object = NULL
  13036. encoding: str = NULL
  13037. errors: str = NULL
  13038. [clinic start generated code]*/
  13039. static PyObject *
  13040. unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
  13041. const char *errors)
  13042. /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
  13043. {
  13044. PyObject *unicode;
  13045. if (x == NULL) {
  13046. unicode = unicode_new_empty();
  13047. }
  13048. else if (encoding == NULL && errors == NULL) {
  13049. unicode = PyObject_Str(x);
  13050. }
  13051. else {
  13052. unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
  13053. }
  13054. if (unicode != NULL && type != &PyUnicode_Type) {
  13055. Py_SETREF(unicode, unicode_subtype_new(type, unicode));
  13056. }
  13057. return unicode;
  13058. }
  13059. static PyObject *
  13060. unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
  13061. {
  13062. PyObject *self;
  13063. Py_ssize_t length, char_size;
  13064. int share_utf8;
  13065. int kind;
  13066. void *data;
  13067. assert(PyType_IsSubtype(type, &PyUnicode_Type));
  13068. assert(_PyUnicode_CHECK(unicode));
  13069. self = type->tp_alloc(type, 0);
  13070. if (self == NULL) {
  13071. return NULL;
  13072. }
  13073. kind = PyUnicode_KIND(unicode);
  13074. length = PyUnicode_GET_LENGTH(unicode);
  13075. _PyUnicode_LENGTH(self) = length;
  13076. #ifdef Py_DEBUG
  13077. _PyUnicode_HASH(self) = -1;
  13078. #else
  13079. _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
  13080. #endif
  13081. _PyUnicode_STATE(self).interned = 0;
  13082. _PyUnicode_STATE(self).kind = kind;
  13083. _PyUnicode_STATE(self).compact = 0;
  13084. _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
  13085. _PyUnicode_STATE(self).statically_allocated = 0;
  13086. _PyUnicode_UTF8_LENGTH(self) = 0;
  13087. _PyUnicode_UTF8(self) = NULL;
  13088. _PyUnicode_DATA_ANY(self) = NULL;
  13089. share_utf8 = 0;
  13090. if (kind == PyUnicode_1BYTE_KIND) {
  13091. char_size = 1;
  13092. if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
  13093. share_utf8 = 1;
  13094. }
  13095. else if (kind == PyUnicode_2BYTE_KIND) {
  13096. char_size = 2;
  13097. }
  13098. else {
  13099. assert(kind == PyUnicode_4BYTE_KIND);
  13100. char_size = 4;
  13101. }
  13102. /* Ensure we won't overflow the length. */
  13103. if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
  13104. PyErr_NoMemory();
  13105. goto onError;
  13106. }
  13107. data = PyObject_Malloc((length + 1) * char_size);
  13108. if (data == NULL) {
  13109. PyErr_NoMemory();
  13110. goto onError;
  13111. }
  13112. _PyUnicode_DATA_ANY(self) = data;
  13113. if (share_utf8) {
  13114. _PyUnicode_UTF8_LENGTH(self) = length;
  13115. _PyUnicode_UTF8(self) = data;
  13116. }
  13117. memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
  13118. assert(_PyUnicode_CheckConsistency(self, 1));
  13119. #ifdef Py_DEBUG
  13120. _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
  13121. #endif
  13122. return self;
  13123. onError:
  13124. Py_DECREF(self);
  13125. return NULL;
  13126. }
  13127. void
  13128. _PyUnicode_ExactDealloc(PyObject *op)
  13129. {
  13130. assert(PyUnicode_CheckExact(op));
  13131. unicode_dealloc(op);
  13132. }
/* Docstring of the `str` type (used as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
  13144. static PyObject *unicode_iter(PyObject *seq);
  13145. PyTypeObject PyUnicode_Type = {
  13146. PyVarObject_HEAD_INIT(&PyType_Type, 0)
  13147. "str", /* tp_name */
  13148. sizeof(PyUnicodeObject), /* tp_basicsize */
  13149. 0, /* tp_itemsize */
  13150. /* Slots */
  13151. (destructor)unicode_dealloc, /* tp_dealloc */
  13152. 0, /* tp_vectorcall_offset */
  13153. 0, /* tp_getattr */
  13154. 0, /* tp_setattr */
  13155. 0, /* tp_as_async */
  13156. unicode_repr, /* tp_repr */
  13157. &unicode_as_number, /* tp_as_number */
  13158. &unicode_as_sequence, /* tp_as_sequence */
  13159. &unicode_as_mapping, /* tp_as_mapping */
  13160. (hashfunc) unicode_hash, /* tp_hash*/
  13161. 0, /* tp_call*/
  13162. (reprfunc) unicode_str, /* tp_str */
  13163. PyObject_GenericGetAttr, /* tp_getattro */
  13164. 0, /* tp_setattro */
  13165. 0, /* tp_as_buffer */
  13166. Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
  13167. Py_TPFLAGS_UNICODE_SUBCLASS |
  13168. _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
  13169. unicode_doc, /* tp_doc */
  13170. 0, /* tp_traverse */
  13171. 0, /* tp_clear */
  13172. PyUnicode_RichCompare, /* tp_richcompare */
  13173. 0, /* tp_weaklistoffset */
  13174. unicode_iter, /* tp_iter */
  13175. 0, /* tp_iternext */
  13176. unicode_methods, /* tp_methods */
  13177. 0, /* tp_members */
  13178. 0, /* tp_getset */
  13179. 0, /* tp_base */
  13180. 0, /* tp_dict */
  13181. 0, /* tp_descr_get */
  13182. 0, /* tp_descr_set */
  13183. 0, /* tp_dictoffset */
  13184. 0, /* tp_init */
  13185. 0, /* tp_alloc */
  13186. unicode_new, /* tp_new */
  13187. PyObject_Del, /* tp_free */
  13188. };
  13189. /* Initialize the Unicode implementation */
  13190. static void
  13191. _init_global_state(void)
  13192. {
  13193. static int initialized = 0;
  13194. if (initialized) {
  13195. return;
  13196. }
  13197. initialized = 1;
  13198. /* initialize the linebreak bloom filter */
  13199. const Py_UCS2 linebreak[] = {
  13200. 0x000A, /* LINE FEED */
  13201. 0x000D, /* CARRIAGE RETURN */
  13202. 0x001C, /* FILE SEPARATOR */
  13203. 0x001D, /* GROUP SEPARATOR */
  13204. 0x001E, /* RECORD SEPARATOR */
  13205. 0x0085, /* NEXT LINE */
  13206. 0x2028, /* LINE SEPARATOR */
  13207. 0x2029, /* PARAGRAPH SEPARATOR */
  13208. };
  13209. bloom_linebreak = make_bloom_mask(
  13210. PyUnicode_2BYTE_KIND, linebreak,
  13211. Py_ARRAY_LENGTH(linebreak));
  13212. }
  13213. void
  13214. _PyUnicode_InitState(PyInterpreterState *interp)
  13215. {
  13216. if (!_Py_IsMainInterpreter(interp)) {
  13217. return;
  13218. }
  13219. _init_global_state();
  13220. }
  13221. PyStatus
  13222. _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
  13223. {
  13224. if (_Py_IsMainInterpreter(interp)) {
  13225. PyStatus status = init_global_interned_strings(interp);
  13226. if (_PyStatus_EXCEPTION(status)) {
  13227. return status;
  13228. }
  13229. }
  13230. assert(INTERNED_STRINGS);
  13231. return _PyStatus_OK();
  13232. }
  13233. PyStatus
  13234. _PyUnicode_InitInternDict(PyInterpreterState *interp)
  13235. {
  13236. assert(INTERNED_STRINGS);
  13237. if (init_interned_dict(interp)) {
  13238. PyErr_Clear();
  13239. return _PyStatus_ERR("failed to create interned dict");
  13240. }
  13241. return _PyStatus_OK();
  13242. }
  13243. PyStatus
  13244. _PyUnicode_InitTypes(PyInterpreterState *interp)
  13245. {
  13246. if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
  13247. goto error;
  13248. }
  13249. if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
  13250. goto error;
  13251. }
  13252. if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
  13253. goto error;
  13254. }
  13255. return _PyStatus_OK();
  13256. error:
  13257. return _PyStatus_ERR("Can't initialize unicode types");
  13258. }
  13259. static /* non-null */ PyObject*
  13260. intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
  13261. {
  13262. // Note that this steals a reference to `s`, but in many cases that
  13263. // stolen ref is returned, requiring no decref/incref.
  13264. assert(s != NULL);
  13265. assert(_PyUnicode_CHECK(s));
  13266. assert(_PyUnicode_STATE(s).statically_allocated);
  13267. assert(!PyUnicode_CHECK_INTERNED(s));
  13268. #ifdef Py_DEBUG
  13269. /* We must not add process-global interned string if there's already a
  13270. * per-interpreter interned_dict, which might contain duplicates.
  13271. */
  13272. PyObject *interned = get_interned_dict(interp);
  13273. // assert(interned == NULL);
  13274. #endif
  13275. /* Look in the global cache first. */
  13276. PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
  13277. /* We should only init each string once */
  13278. assert(r == NULL);
  13279. /* but just in case (for the non-debug build), handle this */
  13280. if (r != NULL && r != s) {
  13281. assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
  13282. assert(_PyUnicode_CHECK(r));
  13283. Py_DECREF(s);
  13284. return Py_NewRef(r);
  13285. }
  13286. if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
  13287. Py_FatalError("failed to intern static string");
  13288. }
  13289. _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
  13290. return s;
  13291. }
  13292. void
  13293. _PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
  13294. {
  13295. // This should only be called as part of runtime initialization
  13296. assert(!Py_IsInitialized());
  13297. *p = intern_static(interp, *p);
  13298. assert(*p);
  13299. }
  13300. static void
  13301. immortalize_interned(PyObject *s)
  13302. {
  13303. assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
  13304. assert(!_Py_IsImmortal(s));
  13305. #ifdef Py_REF_DEBUG
  13306. /* The reference count value should be excluded from the RefTotal.
  13307. The decrements to these objects will not be registered so they
  13308. need to be accounted for in here. */
  13309. for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
  13310. _Py_DecRefTotal(_PyInterpreterState_GET());
  13311. }
  13312. #endif
  13313. _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL;
  13314. _Py_SetImmortal(s);
  13315. }
  13316. static /* non-null */ PyObject*
  13317. intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
  13318. bool immortalize)
  13319. {
  13320. // Note that this steals a reference to `s`, but in many cases that
  13321. // stolen ref is returned, requiring no decref/incref.
  13322. #ifdef Py_DEBUG
  13323. assert(s != NULL);
  13324. assert(_PyUnicode_CHECK(s));
  13325. #else
  13326. if (s == NULL || !PyUnicode_Check(s)) {
  13327. return s;
  13328. }
  13329. #endif
  13330. /* If it's a subclass, we don't really know what putting
  13331. it in the interned dict might do. */
  13332. if (!PyUnicode_CheckExact(s)) {
  13333. return s;
  13334. }
  13335. /* Is it already interned? */
  13336. switch (PyUnicode_CHECK_INTERNED(s)) {
  13337. case SSTATE_NOT_INTERNED:
  13338. // no, go on
  13339. break;
  13340. case SSTATE_INTERNED_MORTAL:
  13341. // yes but we might need to make it immortal
  13342. if (immortalize) {
  13343. immortalize_interned(s);
  13344. }
  13345. return s;
  13346. default:
  13347. // all done
  13348. return s;
  13349. }
  13350. if (_PyUnicode_STATE(s).statically_allocated) {
  13351. return intern_static(interp, s);
  13352. }
  13353. /* If it's already immortal, intern it as such */
  13354. if (_Py_IsImmortal(s)) {
  13355. immortalize = 1;
  13356. }
  13357. /* if it's a short string, get the singleton */
  13358. if (PyUnicode_GET_LENGTH(s) == 1 &&
  13359. PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
  13360. PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
  13361. assert(PyUnicode_CHECK_INTERNED(r));
  13362. Py_DECREF(s);
  13363. return r;
  13364. }
  13365. #ifdef Py_DEBUG
  13366. assert(!unicode_is_singleton(s));
  13367. #endif
  13368. /* Look in the global cache now. */
  13369. {
  13370. PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
  13371. if (r != NULL) {
  13372. assert(_PyUnicode_STATE(r).statically_allocated);
  13373. assert(r != s); // r must be statically_allocated; s is not
  13374. Py_DECREF(s);
  13375. return Py_NewRef(r);
  13376. }
  13377. }
  13378. /* Do a setdefault on the per-interpreter cache. */
  13379. PyObject *interned = get_interned_dict(interp);
  13380. assert(interned != NULL);
  13381. PyObject *t = PyDict_SetDefault(interned, s, s); // t is borrowed
  13382. if (t == NULL) {
  13383. PyErr_Clear();
  13384. return s;
  13385. }
  13386. if (t != s) {
  13387. // value was already present (not inserted)
  13388. Py_INCREF(t);
  13389. Py_DECREF(s);
  13390. if (immortalize &&
  13391. PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
  13392. immortalize_interned(t);
  13393. }
  13394. return t;
  13395. }
  13396. else {
  13397. // value was newly inserted
  13398. }
  13399. /* NOT_INTERNED -> INTERNED_MORTAL */
  13400. assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
  13401. if (!_Py_IsImmortal(s)) {
  13402. /* The two references in interned dict (key and value) are not counted.
  13403. unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
  13404. Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
  13405. #ifdef Py_REF_DEBUG
  13406. /* let's be pedantic with the ref total */
  13407. _Py_DecRefTotal(_PyInterpreterState_GET());
  13408. _Py_DecRefTotal(_PyInterpreterState_GET());
  13409. #endif
  13410. }
  13411. _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
  13412. /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
  13413. #ifdef Py_DEBUG
  13414. if (_Py_IsImmortal(s)) {
  13415. assert(immortalize);
  13416. }
  13417. #endif
  13418. if (immortalize) {
  13419. immortalize_interned(s);
  13420. }
  13421. return s;
  13422. }
  13423. void
  13424. _PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
  13425. {
  13426. *p = intern_common(interp, *p, 1);
  13427. assert(*p);
  13428. }
  13429. void
  13430. _PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
  13431. {
  13432. *p = intern_common(interp, *p, 0);
  13433. assert(*p);
  13434. }
  13435. void
  13436. _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
  13437. {
  13438. _PyUnicode_InternImmortal(interp, p);
  13439. return;
  13440. }
  13441. void
  13442. PyUnicode_InternInPlace(PyObject **p)
  13443. {
  13444. PyInterpreterState *interp = _PyInterpreterState_GET();
  13445. _PyUnicode_InternMortal(interp, p);
  13446. }
  13447. // Public-looking name kept for the stable ABI; user should not call this:
  13448. PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
  13449. void
  13450. PyUnicode_InternImmortal(PyObject **p)
  13451. {
  13452. PyInterpreterState *interp = _PyInterpreterState_GET();
  13453. _PyUnicode_InternImmortal(interp, p);
  13454. }
  13455. PyObject *
  13456. PyUnicode_InternFromString(const char *cp)
  13457. {
  13458. PyObject *s = PyUnicode_FromString(cp);
  13459. if (s == NULL) {
  13460. return NULL;
  13461. }
  13462. PyInterpreterState *interp = _PyInterpreterState_GET();
  13463. _PyUnicode_InternMortal(interp, &s);
  13464. return s;
  13465. }
/* Release the interned strings of `interp`.
 *
 * Does nothing if the interpreter has no interned dict.  If the dict is
 * shared (has_shared_intern_dict()), only this interpreter's pointer to it
 * is cleared.  Otherwise every key of the dict is visited: its refcount is
 * adjusted according to its interned state so that clearing the dict
 * releases it correctly, and its state is reset to SSTATE_NOT_INTERNED
 * (except for static strings when `interp` is not the main interpreter).
 * Finally the dict is cleared and, for the main interpreter, the global
 * interned-strings table as well.
 */
void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{
    PyObject *interned = get_interned_dict(interp);
    if (interned == NULL) {
        return;
    }
    assert(PyDict_CheckExact(interned));

    if (has_shared_intern_dict(interp)) {
        // the dict doesn't belong to this interpreter, skip the debug
        // checks on it and just clear the pointer to it
        clear_interned_dict(interp);
        return;
    }

#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n",
            PyDict_GET_SIZE(interned));

    Py_ssize_t total_length = 0;
#endif
    Py_ssize_t pos = 0;
    PyObject *s, *ignored_value;
    /* Only the keys are inspected; the values are ignored. */
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
        assert(PyUnicode_IS_READY(s));
        /* Set when the string must keep its interned state because this is
           not the main interpreter. */
        int shared = 0;
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Make immortal interned strings mortal again.
             *
             * Currently, the runtime is not able to guarantee that it can exit
             * without allocations that carry over to a future initialization
             * of Python within the same process. i.e:
             *   ./python -X showrefcount -c 'import itertools'
             *   [237 refs, 237 blocks]
             *
             * This should remain disabled (`Py_DEBUG` only) until there is a
             * strict guarantee that no memory will be left after
             * `Py_Finalize`.
             */
#ifdef Py_DEBUG
            // Skip the Immortal Instance check and restore
            // the two references (key and value) ignored
            // by PyUnicode_InternInPlace().
            s->ob_refcnt = 2;
#ifdef Py_REF_DEBUG
            /* let's be pedantic with the ref total */
            _Py_IncRefTotal(_PyInterpreterState_GET());
            _Py_IncRefTotal(_PyInterpreterState_GET());
#endif
#ifdef INTERNED_STATS
            total_length += PyUnicode_GET_LENGTH(s);
#endif
#endif // Py_DEBUG
            break;
        case SSTATE_INTERNED_IMMORTAL_STATIC:
            /* It is shared between interpreters, so we should unmark it
               only when this is the last interpreter in which it's
               interned.  We immortalize all the statically initialized
               strings during startup, so we can rely on the
               main interpreter to be the last one. */
            if (!_Py_IsMainInterpreter(interp)) {
                shared = 1;
            }
            break;
        case SSTATE_INTERNED_MORTAL:
            // Restore 2 references held by the interned dict; these will
            // be decref'd by clear_interned_dict's PyDict_Clear.
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef Py_REF_DEBUG
            /* let's be pedantic with the ref total */
            _Py_IncRefTotal(_PyInterpreterState_GET());
            _Py_IncRefTotal(_PyInterpreterState_GET());
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            /* Every key of the interned dict must be in an interned state. */
            Py_UNREACHABLE();
        }
        if (!shared) {
            _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
        }
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total length of all interned strings: %zd characters\n",
            total_length);
#endif

    /* Take an extra reference on every non-NULL identifier string.
       NOTE(review): presumably this keeps them alive across the dict
       clearing below until the identifiers are released elsewhere;
       confirm against unicode_clear_identifiers(). */
    struct _Py_unicode_state *state = &interp->unicode;
    struct _Py_unicode_ids *ids = &state->ids;
    for (Py_ssize_t i=0; i < ids->size; i++) {
        Py_XINCREF(ids->array[i]);
    }
    clear_interned_dict(interp);
    if (_Py_IsMainInterpreter(interp)) {
        clear_global_interned_strings();
    }
}
  13563. /********************* Unicode Iterator **************************/
/* State of a str iterator; used by both the generic and the ASCII-only
   iterator types. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;  /* Index of the next character to return */
    PyObject *it_seq;     /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
/* tp_dealloc for the str iterator types: untrack the iterator from the
   GC, drop the reference to the iterated string (if any), then free the
   iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
/* tp_traverse for the str iterator types: the iterated string is the
   only object reference held by the iterator. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
  13582. static PyObject *
  13583. unicodeiter_next(unicodeiterobject *it)
  13584. {
  13585. PyObject *seq;
  13586. assert(it != NULL);
  13587. seq = it->it_seq;
  13588. if (seq == NULL)
  13589. return NULL;
  13590. assert(_PyUnicode_CHECK(seq));
  13591. if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
  13592. int kind = PyUnicode_KIND(seq);
  13593. const void *data = PyUnicode_DATA(seq);
  13594. Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
  13595. it->it_index++;
  13596. return unicode_char(chr);
  13597. }
  13598. it->it_seq = NULL;
  13599. Py_DECREF(seq);
  13600. return NULL;
  13601. }
  13602. static PyObject *
  13603. unicode_ascii_iter_next(unicodeiterobject *it)
  13604. {
  13605. assert(it != NULL);
  13606. PyObject *seq = it->it_seq;
  13607. if (seq == NULL) {
  13608. return NULL;
  13609. }
  13610. assert(_PyUnicode_CHECK(seq));
  13611. assert(PyUnicode_IS_COMPACT_ASCII(seq));
  13612. if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
  13613. const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
  13614. Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
  13615. data, it->it_index);
  13616. it->it_index++;
  13617. PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
  13618. return Py_NewRef(item);
  13619. }
  13620. it->it_seq = NULL;
  13621. Py_DECREF(seq);
  13622. return NULL;
  13623. }
  13624. static PyObject *
  13625. unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
  13626. {
  13627. Py_ssize_t len = 0;
  13628. if (it->it_seq)
  13629. len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
  13630. return PyLong_FromSsize_t(len);
  13631. }
  13632. PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
  13633. static PyObject *
  13634. unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
  13635. {
  13636. PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
  13637. /* _PyEval_GetBuiltin can invoke arbitrary code,
  13638. * call must be before access of iterator pointers.
  13639. * see issue #101765 */
  13640. if (it->it_seq != NULL) {
  13641. return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
  13642. } else {
  13643. PyObject *u = unicode_new_empty();
  13644. if (u == NULL) {
  13645. Py_XDECREF(iter);
  13646. return NULL;
  13647. }
  13648. return Py_BuildValue("N(N)", iter, u);
  13649. }
  13650. }
  13651. PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
  13652. static PyObject *
  13653. unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
  13654. {
  13655. Py_ssize_t index = PyLong_AsSsize_t(state);
  13656. if (index == -1 && PyErr_Occurred())
  13657. return NULL;
  13658. if (it->it_seq != NULL) {
  13659. if (index < 0)
  13660. index = 0;
  13661. else if (index > PyUnicode_GET_LENGTH(it->it_seq))
  13662. index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
  13663. it->it_index = index;
  13664. }
  13665. Py_RETURN_NONE;
  13666. }
  13667. PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
/* Method table shared by both str iterator types (length hint and
   pickling support). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
  13677. PyTypeObject PyUnicodeIter_Type = {
  13678. PyVarObject_HEAD_INIT(&PyType_Type, 0)
  13679. "str_iterator", /* tp_name */
  13680. sizeof(unicodeiterobject), /* tp_basicsize */
  13681. 0, /* tp_itemsize */
  13682. /* methods */
  13683. (destructor)unicodeiter_dealloc, /* tp_dealloc */
  13684. 0, /* tp_vectorcall_offset */
  13685. 0, /* tp_getattr */
  13686. 0, /* tp_setattr */
  13687. 0, /* tp_as_async */
  13688. 0, /* tp_repr */
  13689. 0, /* tp_as_number */
  13690. 0, /* tp_as_sequence */
  13691. 0, /* tp_as_mapping */
  13692. 0, /* tp_hash */
  13693. 0, /* tp_call */
  13694. 0, /* tp_str */
  13695. PyObject_GenericGetAttr, /* tp_getattro */
  13696. 0, /* tp_setattro */
  13697. 0, /* tp_as_buffer */
  13698. Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
  13699. 0, /* tp_doc */
  13700. (traverseproc)unicodeiter_traverse, /* tp_traverse */
  13701. 0, /* tp_clear */
  13702. 0, /* tp_richcompare */
  13703. 0, /* tp_weaklistoffset */
  13704. PyObject_SelfIter, /* tp_iter */
  13705. (iternextfunc)unicodeiter_next, /* tp_iternext */
  13706. unicodeiter_methods, /* tp_methods */
  13707. 0,
  13708. };
/* Type object of the ASCII-only str iterator ("str_ascii_iterator").
   It shares the object layout, deallocator, traverse function and method
   table with PyUnicodeIter_Type; only tp_iternext differs. */
PyTypeObject _PyUnicodeASCIIIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "str_ascii_iterator",
    .tp_basicsize = sizeof(unicodeiterobject),
    .tp_dealloc = (destructor)unicodeiter_dealloc,
    .tp_getattro = PyObject_GenericGetAttr,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
    .tp_traverse = (traverseproc)unicodeiter_traverse,
    .tp_iter = PyObject_SelfIter,
    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
    .tp_methods = unicodeiter_methods,
};
  13721. static PyObject *
  13722. unicode_iter(PyObject *seq)
  13723. {
  13724. unicodeiterobject *it;
  13725. if (!PyUnicode_Check(seq)) {
  13726. PyErr_BadInternalCall();
  13727. return NULL;
  13728. }
  13729. if (PyUnicode_IS_COMPACT_ASCII(seq)) {
  13730. it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
  13731. }
  13732. else {
  13733. it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
  13734. }
  13735. if (it == NULL)
  13736. return NULL;
  13737. it->it_index = 0;
  13738. it->it_seq = Py_NewRef(seq);
  13739. _PyObject_GC_TRACK(it);
  13740. return (PyObject *)it;
  13741. }
  13742. static int
  13743. encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
  13744. {
  13745. int res;
  13746. res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
  13747. if (res == -2) {
  13748. PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
  13749. return -1;
  13750. }
  13751. if (res < 0) {
  13752. PyErr_NoMemory();
  13753. return -1;
  13754. }
  13755. return 0;
  13756. }
  13757. static int
  13758. config_get_codec_name(wchar_t **config_encoding)
  13759. {
  13760. char *encoding;
  13761. if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
  13762. return -1;
  13763. }
  13764. PyObject *name_obj = NULL;
  13765. PyObject *codec = _PyCodec_Lookup(encoding);
  13766. PyMem_RawFree(encoding);
  13767. if (!codec)
  13768. goto error;
  13769. name_obj = PyObject_GetAttrString(codec, "name");
  13770. Py_CLEAR(codec);
  13771. if (!name_obj) {
  13772. goto error;
  13773. }
  13774. wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
  13775. Py_DECREF(name_obj);
  13776. if (wname == NULL) {
  13777. goto error;
  13778. }
  13779. wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
  13780. if (raw_wname == NULL) {
  13781. PyMem_Free(wname);
  13782. PyErr_NoMemory();
  13783. goto error;
  13784. }
  13785. PyMem_RawFree(*config_encoding);
  13786. *config_encoding = raw_wname;
  13787. PyMem_Free(wname);
  13788. return 0;
  13789. error:
  13790. Py_XDECREF(codec);
  13791. Py_XDECREF(name_obj);
  13792. return -1;
  13793. }
  13794. static PyStatus
  13795. init_stdio_encoding(PyInterpreterState *interp)
  13796. {
  13797. /* Update the stdio encoding to the normalized Python codec name. */
  13798. PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
  13799. if (config_get_codec_name(&config->stdio_encoding) < 0) {
  13800. return _PyStatus_ERR("failed to get the Python codec name "
  13801. "of the stdio encoding");
  13802. }
  13803. return _PyStatus_OK();
  13804. }
  13805. static int
  13806. init_fs_codec(PyInterpreterState *interp)
  13807. {
  13808. const PyConfig *config = _PyInterpreterState_GetConfig(interp);
  13809. _Py_error_handler error_handler;
  13810. error_handler = get_error_handler_wide(config->filesystem_errors);
  13811. if (error_handler == _Py_ERROR_UNKNOWN) {
  13812. PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
  13813. return -1;
  13814. }
  13815. char *encoding, *errors;
  13816. if (encode_wstr_utf8(config->filesystem_encoding,
  13817. &encoding,
  13818. "filesystem_encoding") < 0) {
  13819. return -1;
  13820. }
  13821. if (encode_wstr_utf8(config->filesystem_errors,
  13822. &errors,
  13823. "filesystem_errors") < 0) {
  13824. PyMem_RawFree(encoding);
  13825. return -1;
  13826. }
  13827. struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
  13828. PyMem_RawFree(fs_codec->encoding);
  13829. fs_codec->encoding = encoding;
  13830. /* encoding has been normalized by init_fs_encoding() */
  13831. fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
  13832. PyMem_RawFree(fs_codec->errors);
  13833. fs_codec->errors = errors;
  13834. fs_codec->error_handler = error_handler;
  13835. #ifdef _Py_FORCE_UTF8_FS_ENCODING
  13836. assert(fs_codec->utf8 == 1);
  13837. #endif
  13838. /* At this point, PyUnicode_EncodeFSDefault() and
  13839. PyUnicode_DecodeFSDefault() can now use the Python codec rather than
  13840. the C implementation of the filesystem encoding. */
  13841. /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
  13842. global configuration variables. */
  13843. if (_Py_IsMainInterpreter(interp)) {
  13844. if (_Py_SetFileSystemEncoding(fs_codec->encoding,
  13845. fs_codec->errors) < 0) {
  13846. PyErr_NoMemory();
  13847. return -1;
  13848. }
  13849. }
  13850. return 0;
  13851. }
  13852. static PyStatus
  13853. init_fs_encoding(PyThreadState *tstate)
  13854. {
  13855. PyInterpreterState *interp = tstate->interp;
  13856. /* Update the filesystem encoding to the normalized Python codec name.
  13857. For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
  13858. (Python codec name). */
  13859. PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
  13860. if (config_get_codec_name(&config->filesystem_encoding) < 0) {
  13861. _Py_DumpPathConfig(tstate);
  13862. return _PyStatus_ERR("failed to get the Python codec "
  13863. "of the filesystem encoding");
  13864. }
  13865. if (init_fs_codec(interp) < 0) {
  13866. return _PyStatus_ERR("cannot initialize filesystem codec");
  13867. }
  13868. return _PyStatus_OK();
  13869. }
  13870. PyStatus
  13871. _PyUnicode_InitEncodings(PyThreadState *tstate)
  13872. {
  13873. PyStatus status = init_fs_encoding(tstate);
  13874. if (_PyStatus_EXCEPTION(status)) {
  13875. return status;
  13876. }
  13877. return init_stdio_encoding(tstate->interp);
  13878. }
/* Release the strings owned by `fs_codec` and reset all of its fields
   (NULL pointers, utf8 flag cleared, unknown error handler). */
static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
{
    PyMem_RawFree(fs_codec->encoding);
    fs_codec->encoding = NULL;
    fs_codec->utf8 = 0;
    PyMem_RawFree(fs_codec->errors);
    fs_codec->errors = NULL;
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
}
  13889. #ifdef MS_WINDOWS
  13890. int
  13891. _PyUnicode_EnableLegacyWindowsFSEncoding(void)
  13892. {
  13893. PyInterpreterState *interp = _PyInterpreterState_GET();
  13894. PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
  13895. /* Set the filesystem encoding to mbcs/replace (PEP 529) */
  13896. wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
  13897. wchar_t *errors = _PyMem_RawWcsdup(L"replace");
  13898. if (encoding == NULL || errors == NULL) {
  13899. PyMem_RawFree(encoding);
  13900. PyMem_RawFree(errors);
  13901. PyErr_NoMemory();
  13902. return -1;
  13903. }
  13904. PyMem_RawFree(config->filesystem_encoding);
  13905. config->filesystem_encoding = encoding;
  13906. PyMem_RawFree(config->filesystem_errors);
  13907. config->filesystem_errors = errors;
  13908. return init_fs_codec(interp);
  13909. }
  13910. #endif
#ifdef Py_DEBUG
/* Debug-only helper: true once the main interpreter no longer has an
   interned dict. */
static inline int
unicode_is_finalizing(void)
{
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
}
#endif
  13918. void
  13919. _PyUnicode_FiniTypes(PyInterpreterState *interp)
  13920. {
  13921. _PyStaticType_Dealloc(interp, &EncodingMapType);
  13922. _PyStaticType_Dealloc(interp, &PyFieldNameIter_Type);
  13923. _PyStaticType_Dealloc(interp, &PyFormatterIter_Type);
  13924. }
  13925. void
  13926. _PyUnicode_Fini(PyInterpreterState *interp)
  13927. {
  13928. struct _Py_unicode_state *state = &interp->unicode;
  13929. if (!has_shared_intern_dict(interp)) {
  13930. // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
  13931. assert(get_interned_dict(interp) == NULL);
  13932. }
  13933. _PyUnicode_FiniEncodings(&state->fs_codec);
  13934. // bpo-47182: force a unicodedata CAPI capsule re-import on
  13935. // subsequent initialization of interpreter.
  13936. interp->unicode.ucnhash_capi = NULL;
  13937. unicode_clear_identifiers(state);
  13938. }
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Module-level functions of _string; both take a single argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}
};
/* Module slots of _string: declares support for interpreters with their
   own GIL. */
static PyModuleDef_Slot module_slots[] = {
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
    {0, NULL}
};
/* Definition of the _string module; it has no per-module state
   (m_size is 0). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_string",
    .m_doc = PyDoc_STR("string helper module"),
    .m_size = 0,
    .m_methods = _string_methods,
    .m_slots = module_slots,
};
/* Module initialization function of _string: returns the result of
   PyModuleDef_Init() on the module definition. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModuleDef_Init(&_string_module);
}
  13965. #ifdef __cplusplus
  13966. }
  13967. #endif