sre.c 88 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249
  1. /*
  2. * Secret Labs' Regular Expression Engine
  3. *
  4. * regular expression matching engine
  5. *
  6. * partial history:
  7. * 1999-10-24 fl created (based on existing template matcher code)
  8. * 2000-03-06 fl first alpha, sort of
  9. * 2000-08-01 fl fixes for 1.6b1
  10. * 2000-08-07 fl use PyOS_CheckStack() if available
  11. * 2000-09-20 fl added expand method
  12. * 2001-03-20 fl lots of fixes for 2.1b2
  13. * 2001-04-15 fl export copyright as Python attribute, not global
  14. * 2001-04-28 fl added __copy__ methods (work in progress)
  15. * 2001-05-14 fl fixes for 1.5.2 compatibility
  16. * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
  17. * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
  18. * 2001-10-20 fl added split primitive; re-enable unicode for 1.6/2.0/2.1
  19. * 2001-10-21 fl added sub/subn primitive
  20. * 2001-10-24 fl added finditer primitive (for 2.2 only)
  21. * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
  22. * 2002-11-09 fl fixed empty sub/subn return type
  23. * 2003-04-18 mvl fully support 4-byte codes
  24. * 2003-10-17 gn implemented non recursive scheme
  25. * 2013-02-04 mrab added fullmatch primitive
  26. *
  27. * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
  28. *
  29. * This version of the SRE library can be redistributed under CNRI's
  30. * Python 1.6 license. For any other use, please contact Secret Labs
  31. * AB (info@pythonware.com).
  32. *
  33. * Portions of this engine have been developed in cooperation with
  34. * CNRI. Hewlett-Packard provided funding for 1.6 integration and
  35. * other compatibility work.
  36. */
  37. static const char copyright[] =
  38. " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
  39. #define PY_SSIZE_T_CLEAN
  40. #include "Python.h"
  41. #include "pycore_long.h" // _PyLong_GetZero()
  42. #include "pycore_moduleobject.h" // _PyModule_GetState()
  43. #include "structmember.h" // PyMemberDef
  44. #include "sre.h"
  45. #define SRE_CODE_BITS (8 * sizeof(SRE_CODE))
  46. #include <ctype.h>
  47. /* defining this one enables tracing */
  48. #undef VERBOSE
  49. /* -------------------------------------------------------------------- */
  /* LOCAL(type) is the declaration prefix used for file-local helper
     functions returning `type`. */
  #if !defined(_MSC_VER)
  #define LOCAL(type) static inline type
  #else
  /* MSVC only: request extra optimisation (reported not to make much
     difference) and silence warning 4710 about functions that were not
     inlined. */
  #pragma optimize("agtw", on)
  #pragma warning(disable: 4710)
  /* fastest possible local call under MSVC */
  #define LOCAL(type) static __inline type __fastcall
  #endif
  /* Negative status codes; pattern_error() turns them into exceptions. */
  #define SRE_ERROR_ILLEGAL         -1   /* illegal opcode */
  #define SRE_ERROR_STATE           -2   /* illegal state */
  #define SRE_ERROR_RECURSION_LIMIT -3   /* runaway recursion */
  #define SRE_ERROR_MEMORY          -9   /* out of memory */
  #define SRE_ERROR_INTERRUPTED     -10  /* signal handler raised exception */

  /* TRACE((fmt, ...)) forwards its parenthesised argument list to printf
     when VERBOSE is defined and expands to nothing otherwise. */
  #if !defined(VERBOSE)
  #define TRACE(v)
  #else
  #define TRACE(v) printf v
  #endif
  69. /* -------------------------------------------------------------------- */
  /* Character predicates built on the Py_IS* macros.  Each one tests an
     upper bound first, so the Py_IS* macro is only consulted for values
     that are not above that bound. */
  #define SRE_IS_DIGIT(ch) ((ch) <= '9' && Py_ISDIGIT(ch))
  #define SRE_IS_SPACE(ch) ((ch) <= ' ' && Py_ISSPACE(ch))
  /* Only '\n' counts as a line break here. */
  #define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
  /* Word characters are alphanumerics plus the underscore. */
  #define SRE_IS_WORD(ch) ((ch) <= 'z' && (Py_ISALNUM(ch) || (ch) == '_'))
  /* Lower-case `ch` with Py_TOLOWER when it is below 128; any larger value
     is returned unchanged. */
  static unsigned int sre_lower_ascii(unsigned int ch)
  {
      if (ch >= 128) {
          return ch;
      }
      return Py_TOLOWER(ch);
  }
  /* Locale-aware character predicates. */
  /* For unsigned c, (c & ~N) == 0 is the same test as c < N+1, but it does
   * not trigger warnings when c's type can only hold values below N+1. */
  #define SRE_LOC_IS_ALNUM(ch) (((ch) & ~255) ? 0 : isalnum((ch)))
  /* A locale "word" character is alphanumeric or an underscore. */
  #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
  /* Lower-case `ch` with the C library's tolower() when it is below 256;
     larger values are returned unchanged. */
  static unsigned int sre_lower_locale(unsigned int ch)
  {
      if (ch >= 256) {
          return ch;
      }
      return (unsigned int)tolower(ch);
  }
  /* Upper-case `ch` with the C library's toupper() when it is below 256;
     larger values are returned unchanged. */
  static unsigned int sre_upper_locale(unsigned int ch)
  {
      if (ch >= 256) {
          return ch;
      }
      return (unsigned int)toupper(ch);
  }
  /* Unicode character predicates: thin aliases for the Py_UNICODE_IS*
     macros. */
  #define SRE_UNI_IS_DIGIT(ch)     Py_UNICODE_ISDECIMAL(ch)
  #define SRE_UNI_IS_SPACE(ch)     Py_UNICODE_ISSPACE(ch)
  #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK(ch)
  #define SRE_UNI_IS_ALNUM(ch)     Py_UNICODE_ISALNUM(ch)
  /* A Unicode "word" character is alphanumeric or an underscore. */
  #define SRE_UNI_IS_WORD(ch)      (SRE_UNI_IS_ALNUM(ch) || (ch) == '_')
  /* Return Py_UNICODE_TOLOWER(ch) as an unsigned int. */
  static unsigned int sre_lower_unicode(unsigned int ch)
  {
      const unsigned int lowered = (unsigned int) Py_UNICODE_TOLOWER(ch);
      return lowered;
  }
  /* Return Py_UNICODE_TOUPPER(ch) as an unsigned int. */
  static unsigned int sre_upper_unicode(unsigned int ch)
  {
      const unsigned int raised = (unsigned int) Py_UNICODE_TOUPPER(ch);
      return raised;
  }
  110. LOCAL(int)
  111. sre_category(SRE_CODE category, unsigned int ch)
  112. {
  113. switch (category) {
  114. case SRE_CATEGORY_DIGIT:
  115. return SRE_IS_DIGIT(ch);
  116. case SRE_CATEGORY_NOT_DIGIT:
  117. return !SRE_IS_DIGIT(ch);
  118. case SRE_CATEGORY_SPACE:
  119. return SRE_IS_SPACE(ch);
  120. case SRE_CATEGORY_NOT_SPACE:
  121. return !SRE_IS_SPACE(ch);
  122. case SRE_CATEGORY_WORD:
  123. return SRE_IS_WORD(ch);
  124. case SRE_CATEGORY_NOT_WORD:
  125. return !SRE_IS_WORD(ch);
  126. case SRE_CATEGORY_LINEBREAK:
  127. return SRE_IS_LINEBREAK(ch);
  128. case SRE_CATEGORY_NOT_LINEBREAK:
  129. return !SRE_IS_LINEBREAK(ch);
  130. case SRE_CATEGORY_LOC_WORD:
  131. return SRE_LOC_IS_WORD(ch);
  132. case SRE_CATEGORY_LOC_NOT_WORD:
  133. return !SRE_LOC_IS_WORD(ch);
  134. case SRE_CATEGORY_UNI_DIGIT:
  135. return SRE_UNI_IS_DIGIT(ch);
  136. case SRE_CATEGORY_UNI_NOT_DIGIT:
  137. return !SRE_UNI_IS_DIGIT(ch);
  138. case SRE_CATEGORY_UNI_SPACE:
  139. return SRE_UNI_IS_SPACE(ch);
  140. case SRE_CATEGORY_UNI_NOT_SPACE:
  141. return !SRE_UNI_IS_SPACE(ch);
  142. case SRE_CATEGORY_UNI_WORD:
  143. return SRE_UNI_IS_WORD(ch);
  144. case SRE_CATEGORY_UNI_NOT_WORD:
  145. return !SRE_UNI_IS_WORD(ch);
  146. case SRE_CATEGORY_UNI_LINEBREAK:
  147. return SRE_UNI_IS_LINEBREAK(ch);
  148. case SRE_CATEGORY_UNI_NOT_LINEBREAK:
  149. return !SRE_UNI_IS_LINEBREAK(ch);
  150. }
  151. return 0;
  152. }
  153. LOCAL(int)
  154. char_loc_ignore(SRE_CODE pattern, SRE_CODE ch)
  155. {
  156. return ch == pattern
  157. || (SRE_CODE) sre_lower_locale(ch) == pattern
  158. || (SRE_CODE) sre_upper_locale(ch) == pattern;
  159. }
  160. /* helpers */
  161. static void
  162. data_stack_dealloc(SRE_STATE* state)
  163. {
  164. if (state->data_stack) {
  165. PyMem_Free(state->data_stack);
  166. state->data_stack = NULL;
  167. }
  168. state->data_stack_size = state->data_stack_base = 0;
  169. }
  170. static int
  171. data_stack_grow(SRE_STATE* state, Py_ssize_t size)
  172. {
  173. Py_ssize_t minsize, cursize;
  174. minsize = state->data_stack_base+size;
  175. cursize = state->data_stack_size;
  176. if (cursize < minsize) {
  177. void* stack;
  178. cursize = minsize+minsize/4+1024;
  179. TRACE(("allocate/grow stack %zd\n", cursize));
  180. stack = PyMem_Realloc(state->data_stack, cursize);
  181. if (!stack) {
  182. data_stack_dealloc(state);
  183. return SRE_ERROR_MEMORY;
  184. }
  185. state->data_stack = (char *)stack;
  186. state->data_stack_size = cursize;
  187. }
  188. return 0;
  189. }
  190. /* generate 8-bit version */
  191. #define SRE_CHAR Py_UCS1
  192. #define SIZEOF_SRE_CHAR 1
  193. #define SRE(F) sre_ucs1_##F
  194. #include "sre_lib.h"
  195. /* generate 16-bit unicode version */
  196. #define SRE_CHAR Py_UCS2
  197. #define SIZEOF_SRE_CHAR 2
  198. #define SRE(F) sre_ucs2_##F
  199. #include "sre_lib.h"
  200. /* generate 32-bit unicode version */
  201. #define SRE_CHAR Py_UCS4
  202. #define SIZEOF_SRE_CHAR 4
  203. #define SRE(F) sre_ucs4_##F
  204. #include "sre_lib.h"
  205. /* -------------------------------------------------------------------- */
  206. /* factories and destructors */
  207. /* module state */
  208. typedef struct {
  209. PyTypeObject *Pattern_Type;
  210. PyTypeObject *Match_Type;
  211. PyTypeObject *Scanner_Type;
  212. PyTypeObject *Template_Type;
  213. PyObject *compile_template; // reference to re._compile_template
  214. } _sremodulestate;
  215. static _sremodulestate *
  216. get_sre_module_state(PyObject *m)
  217. {
  218. _sremodulestate *state = (_sremodulestate *)_PyModule_GetState(m);
  219. assert(state);
  220. return state;
  221. }
  222. static struct PyModuleDef sremodule;
  223. #define get_sre_module_state_by_class(cls) \
  224. (get_sre_module_state(PyType_GetModule(cls)))
  225. /* see sre.h for object declarations */
  226. static PyObject*pattern_new_match(_sremodulestate *, PatternObject*, SRE_STATE*, Py_ssize_t);
  227. static PyObject *pattern_scanner(_sremodulestate *, PatternObject *, PyObject *, Py_ssize_t, Py_ssize_t);
  228. /*[clinic input]
  229. module _sre
  230. class _sre.SRE_Pattern "PatternObject *" "get_sre_module_state_by_class(tp)->Pattern_Type"
  231. class _sre.SRE_Match "MatchObject *" "get_sre_module_state_by_class(tp)->Match_Type"
  232. class _sre.SRE_Scanner "ScannerObject *" "get_sre_module_state_by_class(tp)->Scanner_Type"
  233. [clinic start generated code]*/
  234. /*[clinic end generated code: output=da39a3ee5e6b4b0d input=fe2966e32b66a231]*/
  235. /*[clinic input]
  236. _sre.getcodesize -> int
  237. [clinic start generated code]*/
  238. static int
  239. _sre_getcodesize_impl(PyObject *module)
  240. /*[clinic end generated code: output=e0db7ce34a6dd7b1 input=bd6f6ecf4916bb2b]*/
  241. {
  242. return sizeof(SRE_CODE);
  243. }
  244. /*[clinic input]
  245. _sre.ascii_iscased -> bool
  246. character: int
  247. /
  248. [clinic start generated code]*/
  249. static int
  250. _sre_ascii_iscased_impl(PyObject *module, int character)
  251. /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
  252. {
  253. unsigned int ch = (unsigned int)character;
  254. return ch < 128 && Py_ISALPHA(ch);
  255. }
  256. /*[clinic input]
  257. _sre.unicode_iscased -> bool
  258. character: int
  259. /
  260. [clinic start generated code]*/
  261. static int
  262. _sre_unicode_iscased_impl(PyObject *module, int character)
  263. /*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
  264. {
  265. unsigned int ch = (unsigned int)character;
  266. return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
  267. }
  268. /*[clinic input]
  269. _sre.ascii_tolower -> int
  270. character: int
  271. /
  272. [clinic start generated code]*/
  273. static int
  274. _sre_ascii_tolower_impl(PyObject *module, int character)
  275. /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/
  276. {
  277. return sre_lower_ascii(character);
  278. }
  279. /*[clinic input]
  280. _sre.unicode_tolower -> int
  281. character: int
  282. /
  283. [clinic start generated code]*/
  284. static int
  285. _sre_unicode_tolower_impl(PyObject *module, int character)
  286. /*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/
  287. {
  288. return sre_lower_unicode(character);
  289. }
  290. LOCAL(void)
  291. state_reset(SRE_STATE* state)
  292. {
  293. /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
  294. /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
  295. state->lastmark = -1;
  296. state->lastindex = -1;
  297. state->repeat = NULL;
  298. data_stack_dealloc(state);
  299. }
  300. static const void*
  301. getstring(PyObject* string, Py_ssize_t* p_length,
  302. int* p_isbytes, int* p_charsize,
  303. Py_buffer *view)
  304. {
  305. /* given a python object, return a data pointer, a length (in
  306. characters), and a character size. return NULL if the object
  307. is not a string (or not compatible) */
  308. /* Unicode objects do not support the buffer API. So, get the data
  309. directly instead. */
  310. if (PyUnicode_Check(string)) {
  311. if (PyUnicode_READY(string) == -1)
  312. return NULL;
  313. *p_length = PyUnicode_GET_LENGTH(string);
  314. *p_charsize = PyUnicode_KIND(string);
  315. *p_isbytes = 0;
  316. return PyUnicode_DATA(string);
  317. }
  318. /* get pointer to byte string buffer */
  319. if (PyObject_GetBuffer(string, view, PyBUF_SIMPLE) != 0) {
  320. PyErr_Format(PyExc_TypeError, "expected string or bytes-like "
  321. "object, got '%.200s'", Py_TYPE(string)->tp_name);
  322. return NULL;
  323. }
  324. *p_length = view->len;
  325. *p_charsize = 1;
  326. *p_isbytes = 1;
  327. if (view->buf == NULL) {
  328. PyErr_SetString(PyExc_ValueError, "Buffer is NULL");
  329. PyBuffer_Release(view);
  330. view->buf = NULL;
  331. return NULL;
  332. }
  333. return view->buf;
  334. }
  335. LOCAL(PyObject*)
  336. state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
  337. Py_ssize_t start, Py_ssize_t end)
  338. {
  339. /* prepare state object */
  340. Py_ssize_t length;
  341. int isbytes, charsize;
  342. const void* ptr;
  343. memset(state, 0, sizeof(SRE_STATE));
  344. state->mark = PyMem_New(const void *, pattern->groups * 2);
  345. if (!state->mark) {
  346. PyErr_NoMemory();
  347. goto err;
  348. }
  349. state->lastmark = -1;
  350. state->lastindex = -1;
  351. state->buffer.buf = NULL;
  352. ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer);
  353. if (!ptr)
  354. goto err;
  355. if (isbytes && pattern->isbytes == 0) {
  356. PyErr_SetString(PyExc_TypeError,
  357. "cannot use a string pattern on a bytes-like object");
  358. goto err;
  359. }
  360. if (!isbytes && pattern->isbytes > 0) {
  361. PyErr_SetString(PyExc_TypeError,
  362. "cannot use a bytes pattern on a string-like object");
  363. goto err;
  364. }
  365. /* adjust boundaries */
  366. if (start < 0)
  367. start = 0;
  368. else if (start > length)
  369. start = length;
  370. if (end < 0)
  371. end = 0;
  372. else if (end > length)
  373. end = length;
  374. state->isbytes = isbytes;
  375. state->charsize = charsize;
  376. state->match_all = 0;
  377. state->must_advance = 0;
  378. state->beginning = ptr;
  379. state->start = (void*) ((char*) ptr + start * state->charsize);
  380. state->end = (void*) ((char*) ptr + end * state->charsize);
  381. state->string = Py_NewRef(string);
  382. state->pos = start;
  383. state->endpos = end;
  384. return string;
  385. err:
  386. /* We add an explicit cast here because MSVC has a bug when
  387. compiling C code where it believes that `const void**` cannot be
  388. safely casted to `void*`, see bpo-39943 for details. */
  389. PyMem_Free((void*) state->mark);
  390. state->mark = NULL;
  391. if (state->buffer.buf)
  392. PyBuffer_Release(&state->buffer);
  393. return NULL;
  394. }
  395. LOCAL(void)
  396. state_fini(SRE_STATE* state)
  397. {
  398. if (state->buffer.buf)
  399. PyBuffer_Release(&state->buffer);
  400. Py_XDECREF(state->string);
  401. data_stack_dealloc(state);
  402. /* See above PyMem_Del for why we explicitly cast here. */
  403. PyMem_Free((void*) state->mark);
  404. state->mark = NULL;
  405. }
  /* Convert a pointer into the subject (`member`) to a character offset
     from the start of the string: byte distance from state->beginning
     divided by the character size. */
  #define STATE_OFFSET(state, member) \
      (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
  409. LOCAL(PyObject*)
  410. getslice(int isbytes, const void *ptr,
  411. PyObject* string, Py_ssize_t start, Py_ssize_t end)
  412. {
  413. if (isbytes) {
  414. if (PyBytes_CheckExact(string) &&
  415. start == 0 && end == PyBytes_GET_SIZE(string)) {
  416. return Py_NewRef(string);
  417. }
  418. return PyBytes_FromStringAndSize(
  419. (const char *)ptr + start, end - start);
  420. }
  421. else {
  422. return PyUnicode_Substring(string, start, end);
  423. }
  424. }
  425. LOCAL(PyObject*)
  426. state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
  427. {
  428. Py_ssize_t i, j;
  429. index = (index - 1) * 2;
  430. if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
  431. if (empty)
  432. /* want empty string */
  433. i = j = 0;
  434. else {
  435. Py_RETURN_NONE;
  436. }
  437. } else {
  438. i = STATE_OFFSET(state, state->mark[index]);
  439. j = STATE_OFFSET(state, state->mark[index+1]);
  440. /* check wrong span */
  441. if (i > j) {
  442. PyErr_SetString(PyExc_SystemError,
  443. "The span of capturing group is wrong,"
  444. " please report a bug for the re module.");
  445. return NULL;
  446. }
  447. }
  448. return getslice(state->isbytes, state->beginning, string, i, j);
  449. }
  450. static void
  451. pattern_error(Py_ssize_t status)
  452. {
  453. switch (status) {
  454. case SRE_ERROR_RECURSION_LIMIT:
  455. /* This error code seems to be unused. */
  456. PyErr_SetString(
  457. PyExc_RecursionError,
  458. "maximum recursion limit exceeded"
  459. );
  460. break;
  461. case SRE_ERROR_MEMORY:
  462. PyErr_NoMemory();
  463. break;
  464. case SRE_ERROR_INTERRUPTED:
  465. /* An exception has already been raised, so let it fly */
  466. break;
  467. default:
  468. /* other error codes indicate compiler/engine bugs */
  469. PyErr_SetString(
  470. PyExc_RuntimeError,
  471. "internal error in regular expression engine"
  472. );
  473. }
  474. }
  475. static int
  476. pattern_traverse(PatternObject *self, visitproc visit, void *arg)
  477. {
  478. Py_VISIT(Py_TYPE(self));
  479. Py_VISIT(self->groupindex);
  480. Py_VISIT(self->indexgroup);
  481. Py_VISIT(self->pattern);
  482. return 0;
  483. }
  484. static int
  485. pattern_clear(PatternObject *self)
  486. {
  487. Py_CLEAR(self->groupindex);
  488. Py_CLEAR(self->indexgroup);
  489. Py_CLEAR(self->pattern);
  490. return 0;
  491. }
  492. static void
  493. pattern_dealloc(PatternObject* self)
  494. {
  495. PyTypeObject *tp = Py_TYPE(self);
  496. PyObject_GC_UnTrack(self);
  497. if (self->weakreflist != NULL) {
  498. PyObject_ClearWeakRefs((PyObject *) self);
  499. }
  500. (void)pattern_clear(self);
  501. tp->tp_free(self);
  502. Py_DECREF(tp);
  503. }
  504. LOCAL(Py_ssize_t)
  505. sre_match(SRE_STATE* state, SRE_CODE* pattern)
  506. {
  507. if (state->charsize == 1)
  508. return sre_ucs1_match(state, pattern, 1);
  509. if (state->charsize == 2)
  510. return sre_ucs2_match(state, pattern, 1);
  511. assert(state->charsize == 4);
  512. return sre_ucs4_match(state, pattern, 1);
  513. }
  514. LOCAL(Py_ssize_t)
  515. sre_search(SRE_STATE* state, SRE_CODE* pattern)
  516. {
  517. if (state->charsize == 1)
  518. return sre_ucs1_search(state, pattern);
  519. if (state->charsize == 2)
  520. return sre_ucs2_search(state, pattern);
  521. assert(state->charsize == 4);
  522. return sre_ucs4_search(state, pattern);
  523. }
  524. /*[clinic input]
  525. _sre.SRE_Pattern.match
  526. cls: defining_class
  527. /
  528. string: object
  529. pos: Py_ssize_t = 0
  530. endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
  531. Matches zero or more characters at the beginning of the string.
  532. [clinic start generated code]*/
  533. static PyObject *
  534. _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls,
  535. PyObject *string, Py_ssize_t pos,
  536. Py_ssize_t endpos)
  537. /*[clinic end generated code: output=ec6208ea58a0cca0 input=4bdb9c3e564d13ac]*/
  538. {
  539. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  540. SRE_STATE state;
  541. Py_ssize_t status;
  542. PyObject *match;
  543. if (!state_init(&state, (PatternObject *)self, string, pos, endpos))
  544. return NULL;
  545. state.ptr = state.start;
  546. TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
  547. status = sre_match(&state, PatternObject_GetCode(self));
  548. TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
  549. if (PyErr_Occurred()) {
  550. state_fini(&state);
  551. return NULL;
  552. }
  553. match = pattern_new_match(module_state, self, &state, status);
  554. state_fini(&state);
  555. return match;
  556. }
  557. /*[clinic input]
  558. _sre.SRE_Pattern.fullmatch
  559. cls: defining_class
  560. /
  561. string: object
  562. pos: Py_ssize_t = 0
  563. endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
  564. Matches against all of the string.
  565. [clinic start generated code]*/
  566. static PyObject *
  567. _sre_SRE_Pattern_fullmatch_impl(PatternObject *self, PyTypeObject *cls,
  568. PyObject *string, Py_ssize_t pos,
  569. Py_ssize_t endpos)
  570. /*[clinic end generated code: output=625b75b027ef94da input=50981172ab0fcfdd]*/
  571. {
  572. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  573. SRE_STATE state;
  574. Py_ssize_t status;
  575. PyObject *match;
  576. if (!state_init(&state, self, string, pos, endpos))
  577. return NULL;
  578. state.ptr = state.start;
  579. TRACE(("|%p|%p|FULLMATCH\n", PatternObject_GetCode(self), state.ptr));
  580. state.match_all = 1;
  581. status = sre_match(&state, PatternObject_GetCode(self));
  582. TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
  583. if (PyErr_Occurred()) {
  584. state_fini(&state);
  585. return NULL;
  586. }
  587. match = pattern_new_match(module_state, self, &state, status);
  588. state_fini(&state);
  589. return match;
  590. }
  591. /*[clinic input]
  592. _sre.SRE_Pattern.search
  593. cls: defining_class
  594. /
  595. string: object
  596. pos: Py_ssize_t = 0
  597. endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
  598. Scan through string looking for a match, and return a corresponding match object instance.
  599. Return None if no position in the string matches.
  600. [clinic start generated code]*/
  601. static PyObject *
  602. _sre_SRE_Pattern_search_impl(PatternObject *self, PyTypeObject *cls,
  603. PyObject *string, Py_ssize_t pos,
  604. Py_ssize_t endpos)
  605. /*[clinic end generated code: output=bd7f2d9d583e1463 input=afa9afb66a74a4b3]*/
  606. {
  607. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  608. SRE_STATE state;
  609. Py_ssize_t status;
  610. PyObject *match;
  611. if (!state_init(&state, self, string, pos, endpos))
  612. return NULL;
  613. TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
  614. status = sre_search(&state, PatternObject_GetCode(self));
  615. TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
  616. if (PyErr_Occurred()) {
  617. state_fini(&state);
  618. return NULL;
  619. }
  620. match = pattern_new_match(module_state, self, &state, status);
  621. state_fini(&state);
  622. return match;
  623. }
  624. /*[clinic input]
  625. _sre.SRE_Pattern.findall
  626. string: object
  627. pos: Py_ssize_t = 0
  628. endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
  629. Return a list of all non-overlapping matches of pattern in string.
  630. [clinic start generated code]*/
  631. static PyObject *
  632. _sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
  633. Py_ssize_t pos, Py_ssize_t endpos)
  634. /*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
  635. {
  636. SRE_STATE state;
  637. PyObject* list;
  638. Py_ssize_t status;
  639. Py_ssize_t i, b, e;
  640. if (!state_init(&state, self, string, pos, endpos))
  641. return NULL;
  642. list = PyList_New(0);
  643. if (!list) {
  644. state_fini(&state);
  645. return NULL;
  646. }
  647. while (state.start <= state.end) {
  648. PyObject* item;
  649. state_reset(&state);
  650. state.ptr = state.start;
  651. status = sre_search(&state, PatternObject_GetCode(self));
  652. if (PyErr_Occurred())
  653. goto error;
  654. if (status <= 0) {
  655. if (status == 0)
  656. break;
  657. pattern_error(status);
  658. goto error;
  659. }
  660. /* don't bother to build a match object */
  661. switch (self->groups) {
  662. case 0:
  663. b = STATE_OFFSET(&state, state.start);
  664. e = STATE_OFFSET(&state, state.ptr);
  665. item = getslice(state.isbytes, state.beginning,
  666. string, b, e);
  667. if (!item)
  668. goto error;
  669. break;
  670. case 1:
  671. item = state_getslice(&state, 1, string, 1);
  672. if (!item)
  673. goto error;
  674. break;
  675. default:
  676. item = PyTuple_New(self->groups);
  677. if (!item)
  678. goto error;
  679. for (i = 0; i < self->groups; i++) {
  680. PyObject* o = state_getslice(&state, i+1, string, 1);
  681. if (!o) {
  682. Py_DECREF(item);
  683. goto error;
  684. }
  685. PyTuple_SET_ITEM(item, i, o);
  686. }
  687. break;
  688. }
  689. status = PyList_Append(list, item);
  690. Py_DECREF(item);
  691. if (status < 0)
  692. goto error;
  693. state.must_advance = (state.ptr == state.start);
  694. state.start = state.ptr;
  695. }
  696. state_fini(&state);
  697. return list;
  698. error:
  699. Py_DECREF(list);
  700. state_fini(&state);
  701. return NULL;
  702. }
  703. /*[clinic input]
  704. _sre.SRE_Pattern.finditer
  705. cls: defining_class
  706. /
  707. string: object
  708. pos: Py_ssize_t = 0
  709. endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
  710. Return an iterator over all non-overlapping matches for the RE pattern in string.
  711. For each match, the iterator returns a match object.
  712. [clinic start generated code]*/
  713. static PyObject *
  714. _sre_SRE_Pattern_finditer_impl(PatternObject *self, PyTypeObject *cls,
  715. PyObject *string, Py_ssize_t pos,
  716. Py_ssize_t endpos)
  717. /*[clinic end generated code: output=1791dbf3618ade56 input=812e332a4848cbaf]*/
  718. {
  719. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  720. PyObject* scanner;
  721. PyObject* search;
  722. PyObject* iterator;
  723. scanner = pattern_scanner(module_state, self, string, pos, endpos);
  724. if (!scanner)
  725. return NULL;
  726. search = PyObject_GetAttrString(scanner, "search");
  727. Py_DECREF(scanner);
  728. if (!search)
  729. return NULL;
  730. iterator = PyCallIter_New(search, Py_None);
  731. Py_DECREF(search);
  732. return iterator;
  733. }
  734. /*[clinic input]
  735. _sre.SRE_Pattern.scanner
  736. cls: defining_class
  737. /
  738. string: object
  739. pos: Py_ssize_t = 0
  740. endpos: Py_ssize_t(c_default="PY_SSIZE_T_MAX") = sys.maxsize
  741. [clinic start generated code]*/
  742. static PyObject *
  743. _sre_SRE_Pattern_scanner_impl(PatternObject *self, PyTypeObject *cls,
  744. PyObject *string, Py_ssize_t pos,
  745. Py_ssize_t endpos)
  746. /*[clinic end generated code: output=f70cd506112f1bd9 input=2e487e5151bcee4c]*/
  747. {
  748. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  749. return pattern_scanner(module_state, self, string, pos, endpos);
  750. }
  751. /*[clinic input]
  752. _sre.SRE_Pattern.split
  753. string: object
  754. maxsplit: Py_ssize_t = 0
  755. Split string by the occurrences of pattern.
  756. [clinic start generated code]*/
  757. static PyObject *
  758. _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
  759. Py_ssize_t maxsplit)
  760. /*[clinic end generated code: output=7ac66f381c45e0be input=1eeeb10dafc9947a]*/
  761. {
  762. SRE_STATE state;
  763. PyObject* list;
  764. PyObject* item;
  765. Py_ssize_t status;
  766. Py_ssize_t n;
  767. Py_ssize_t i;
  768. const void* last;
  769. assert(self->codesize != 0);
  770. if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX))
  771. return NULL;
  772. list = PyList_New(0);
  773. if (!list) {
  774. state_fini(&state);
  775. return NULL;
  776. }
  777. n = 0;
  778. last = state.start;
  779. while (!maxsplit || n < maxsplit) {
  780. state_reset(&state);
  781. state.ptr = state.start;
  782. status = sre_search(&state, PatternObject_GetCode(self));
  783. if (PyErr_Occurred())
  784. goto error;
  785. if (status <= 0) {
  786. if (status == 0)
  787. break;
  788. pattern_error(status);
  789. goto error;
  790. }
  791. /* get segment before this match */
  792. item = getslice(state.isbytes, state.beginning,
  793. string, STATE_OFFSET(&state, last),
  794. STATE_OFFSET(&state, state.start)
  795. );
  796. if (!item)
  797. goto error;
  798. status = PyList_Append(list, item);
  799. Py_DECREF(item);
  800. if (status < 0)
  801. goto error;
  802. /* add groups (if any) */
  803. for (i = 0; i < self->groups; i++) {
  804. item = state_getslice(&state, i+1, string, 0);
  805. if (!item)
  806. goto error;
  807. status = PyList_Append(list, item);
  808. Py_DECREF(item);
  809. if (status < 0)
  810. goto error;
  811. }
  812. n = n + 1;
  813. state.must_advance = (state.ptr == state.start);
  814. last = state.start = state.ptr;
  815. }
  816. /* get segment following last match (even if empty) */
  817. item = getslice(state.isbytes, state.beginning,
  818. string, STATE_OFFSET(&state, last), state.endpos
  819. );
  820. if (!item)
  821. goto error;
  822. status = PyList_Append(list, item);
  823. Py_DECREF(item);
  824. if (status < 0)
  825. goto error;
  826. state_fini(&state);
  827. return list;
  828. error:
  829. Py_DECREF(list);
  830. state_fini(&state);
  831. return NULL;
  832. }
  833. static PyObject *
  834. compile_template(_sremodulestate *module_state,
  835. PatternObject *pattern, PyObject *template)
  836. {
  837. /* delegate to Python code */
  838. PyObject *func = module_state->compile_template;
  839. if (func == NULL) {
  840. func = _PyImport_GetModuleAttrString("re", "_compile_template");
  841. if (func == NULL) {
  842. return NULL;
  843. }
  844. Py_XSETREF(module_state->compile_template, func);
  845. }
  846. PyObject *args[] = {(PyObject *)pattern, template};
  847. PyObject *result = PyObject_Vectorcall(func, args, 2, NULL);
  848. if (result == NULL && PyErr_ExceptionMatches(PyExc_TypeError)) {
  849. /* If the replacement string is unhashable (e.g. bytearray),
  850. * convert it to the basic type (str or bytes) and repeat. */
  851. if (PyUnicode_Check(template) && !PyUnicode_CheckExact(template)) {
  852. PyErr_Clear();
  853. template = _PyUnicode_Copy(template);
  854. }
  855. else if (PyObject_CheckBuffer(template) && !PyBytes_CheckExact(template)) {
  856. PyErr_Clear();
  857. template = PyBytes_FromObject(template);
  858. }
  859. else {
  860. return NULL;
  861. }
  862. if (template == NULL) {
  863. return NULL;
  864. }
  865. args[1] = template;
  866. result = PyObject_Vectorcall(func, args, 2, NULL);
  867. Py_DECREF(template);
  868. }
  869. if (result != NULL && Py_TYPE(result) != module_state->Template_Type) {
  870. PyErr_Format(PyExc_RuntimeError,
  871. "the result of compiling a replacement string is %.200s",
  872. Py_TYPE(result)->tp_name);
  873. Py_DECREF(result);
  874. return NULL;
  875. }
  876. return result;
  877. }
  878. static PyObject *expand_template(TemplateObject *, MatchObject *); /* Forward */
  879. static PyObject*
  880. pattern_subx(_sremodulestate* module_state,
  881. PatternObject* self,
  882. PyObject* ptemplate,
  883. PyObject* string,
  884. Py_ssize_t count,
  885. Py_ssize_t subn)
  886. {
  887. SRE_STATE state;
  888. PyObject* list;
  889. PyObject* joiner;
  890. PyObject* item;
  891. PyObject* filter;
  892. PyObject* match;
  893. const void* ptr;
  894. Py_ssize_t status;
  895. Py_ssize_t n;
  896. Py_ssize_t i, b, e;
  897. int isbytes, charsize;
  898. enum {LITERAL, TEMPLATE, CALLABLE} filter_type;
  899. Py_buffer view;
  900. if (PyCallable_Check(ptemplate)) {
  901. /* sub/subn takes either a function or a template */
  902. filter = Py_NewRef(ptemplate);
  903. filter_type = CALLABLE;
  904. } else {
  905. /* if not callable, check if it's a literal string */
  906. int literal;
  907. view.buf = NULL;
  908. ptr = getstring(ptemplate, &n, &isbytes, &charsize, &view);
  909. if (ptr) {
  910. if (charsize == 1)
  911. literal = memchr(ptr, '\\', n) == NULL;
  912. else
  913. literal = PyUnicode_FindChar(ptemplate, '\\', 0, n, 1) == -1;
  914. } else {
  915. PyErr_Clear();
  916. literal = 0;
  917. }
  918. if (view.buf)
  919. PyBuffer_Release(&view);
  920. if (literal) {
  921. filter = Py_NewRef(ptemplate);
  922. filter_type = LITERAL;
  923. } else {
  924. /* not a literal; hand it over to the template compiler */
  925. filter = compile_template(module_state, self, ptemplate);
  926. if (!filter)
  927. return NULL;
  928. assert(Py_TYPE(filter) == module_state->Template_Type);
  929. if (Py_SIZE(filter) == 0) {
  930. Py_SETREF(filter,
  931. Py_NewRef(((TemplateObject *)filter)->literal));
  932. filter_type = LITERAL;
  933. }
  934. else {
  935. filter_type = TEMPLATE;
  936. }
  937. }
  938. }
  939. if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX)) {
  940. Py_DECREF(filter);
  941. return NULL;
  942. }
  943. list = PyList_New(0);
  944. if (!list) {
  945. Py_DECREF(filter);
  946. state_fini(&state);
  947. return NULL;
  948. }
  949. n = i = 0;
  950. while (!count || n < count) {
  951. state_reset(&state);
  952. state.ptr = state.start;
  953. status = sre_search(&state, PatternObject_GetCode(self));
  954. if (PyErr_Occurred())
  955. goto error;
  956. if (status <= 0) {
  957. if (status == 0)
  958. break;
  959. pattern_error(status);
  960. goto error;
  961. }
  962. b = STATE_OFFSET(&state, state.start);
  963. e = STATE_OFFSET(&state, state.ptr);
  964. if (i < b) {
  965. /* get segment before this match */
  966. item = getslice(state.isbytes, state.beginning,
  967. string, i, b);
  968. if (!item)
  969. goto error;
  970. status = PyList_Append(list, item);
  971. Py_DECREF(item);
  972. if (status < 0)
  973. goto error;
  974. }
  975. if (filter_type != LITERAL) {
  976. /* pass match object through filter */
  977. match = pattern_new_match(module_state, self, &state, 1);
  978. if (!match)
  979. goto error;
  980. if (filter_type == TEMPLATE) {
  981. item = expand_template((TemplateObject *)filter,
  982. (MatchObject *)match);
  983. }
  984. else {
  985. assert(filter_type == CALLABLE);
  986. item = PyObject_CallOneArg(filter, match);
  987. }
  988. Py_DECREF(match);
  989. if (!item)
  990. goto error;
  991. } else {
  992. /* filter is literal string */
  993. item = Py_NewRef(filter);
  994. }
  995. /* add to list */
  996. if (item != Py_None) {
  997. status = PyList_Append(list, item);
  998. Py_DECREF(item);
  999. if (status < 0)
  1000. goto error;
  1001. }
  1002. i = e;
  1003. n = n + 1;
  1004. state.must_advance = (state.ptr == state.start);
  1005. state.start = state.ptr;
  1006. }
  1007. /* get segment following last match */
  1008. if (i < state.endpos) {
  1009. item = getslice(state.isbytes, state.beginning,
  1010. string, i, state.endpos);
  1011. if (!item)
  1012. goto error;
  1013. status = PyList_Append(list, item);
  1014. Py_DECREF(item);
  1015. if (status < 0)
  1016. goto error;
  1017. }
  1018. state_fini(&state);
  1019. Py_DECREF(filter);
  1020. /* convert list to single string (also removes list) */
  1021. joiner = getslice(state.isbytes, state.beginning, string, 0, 0);
  1022. if (!joiner) {
  1023. Py_DECREF(list);
  1024. return NULL;
  1025. }
  1026. if (PyList_GET_SIZE(list) == 0) {
  1027. Py_DECREF(list);
  1028. item = joiner;
  1029. }
  1030. else {
  1031. if (state.isbytes)
  1032. item = _PyBytes_Join(joiner, list);
  1033. else
  1034. item = PyUnicode_Join(joiner, list);
  1035. Py_DECREF(joiner);
  1036. Py_DECREF(list);
  1037. if (!item)
  1038. return NULL;
  1039. }
  1040. if (subn)
  1041. return Py_BuildValue("Nn", item, n);
  1042. return item;
  1043. error:
  1044. Py_DECREF(list);
  1045. state_fini(&state);
  1046. Py_DECREF(filter);
  1047. return NULL;
  1048. }
  1049. /*[clinic input]
  1050. _sre.SRE_Pattern.sub
  1051. cls: defining_class
  1052. /
  1053. repl: object
  1054. string: object
  1055. count: Py_ssize_t = 0
  1056. Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl.
  1057. [clinic start generated code]*/
  1058. static PyObject *
  1059. _sre_SRE_Pattern_sub_impl(PatternObject *self, PyTypeObject *cls,
  1060. PyObject *repl, PyObject *string, Py_ssize_t count)
  1061. /*[clinic end generated code: output=4be141ab04bca60d input=d8d1d4ac2311a07c]*/
  1062. {
  1063. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  1064. return pattern_subx(module_state, self, repl, string, count, 0);
  1065. }
  1066. /*[clinic input]
  1067. _sre.SRE_Pattern.subn
  1068. cls: defining_class
  1069. /
  1070. repl: object
  1071. string: object
  1072. count: Py_ssize_t = 0
  1073. Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl.
  1074. [clinic start generated code]*/
  1075. static PyObject *
  1076. _sre_SRE_Pattern_subn_impl(PatternObject *self, PyTypeObject *cls,
  1077. PyObject *repl, PyObject *string,
  1078. Py_ssize_t count)
  1079. /*[clinic end generated code: output=da02fd85258b1e1f input=8b78a65b8302e58d]*/
  1080. {
  1081. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  1082. return pattern_subx(module_state, self, repl, string, count, 1);
  1083. }
  1084. /*[clinic input]
  1085. _sre.SRE_Pattern.__copy__
  1086. [clinic start generated code]*/
  1087. static PyObject *
  1088. _sre_SRE_Pattern___copy___impl(PatternObject *self)
  1089. /*[clinic end generated code: output=85dedc2db1bd8694 input=a730a59d863bc9f5]*/
  1090. {
  1091. return Py_NewRef(self);
  1092. }
  1093. /*[clinic input]
  1094. _sre.SRE_Pattern.__deepcopy__
  1095. memo: object
  1096. /
  1097. [clinic start generated code]*/
  1098. static PyObject *
  1099. _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo)
  1100. /*[clinic end generated code: output=2ad25679c1f1204a input=a465b1602f997bed]*/
  1101. {
  1102. return Py_NewRef(self);
  1103. }
  1104. static PyObject *
  1105. pattern_repr(PatternObject *obj)
  1106. {
  1107. static const struct {
  1108. const char *name;
  1109. int value;
  1110. } flag_names[] = {
  1111. {"re.TEMPLATE", SRE_FLAG_TEMPLATE},
  1112. {"re.IGNORECASE", SRE_FLAG_IGNORECASE},
  1113. {"re.LOCALE", SRE_FLAG_LOCALE},
  1114. {"re.MULTILINE", SRE_FLAG_MULTILINE},
  1115. {"re.DOTALL", SRE_FLAG_DOTALL},
  1116. {"re.UNICODE", SRE_FLAG_UNICODE},
  1117. {"re.VERBOSE", SRE_FLAG_VERBOSE},
  1118. {"re.DEBUG", SRE_FLAG_DEBUG},
  1119. {"re.ASCII", SRE_FLAG_ASCII},
  1120. };
  1121. PyObject *result = NULL;
  1122. PyObject *flag_items;
  1123. size_t i;
  1124. int flags = obj->flags;
  1125. /* Omit re.UNICODE for valid string patterns. */
  1126. if (obj->isbytes == 0 &&
  1127. (flags & (SRE_FLAG_LOCALE|SRE_FLAG_UNICODE|SRE_FLAG_ASCII)) ==
  1128. SRE_FLAG_UNICODE)
  1129. flags &= ~SRE_FLAG_UNICODE;
  1130. flag_items = PyList_New(0);
  1131. if (!flag_items)
  1132. return NULL;
  1133. for (i = 0; i < Py_ARRAY_LENGTH(flag_names); i++) {
  1134. if (flags & flag_names[i].value) {
  1135. PyObject *item = PyUnicode_FromString(flag_names[i].name);
  1136. if (!item)
  1137. goto done;
  1138. if (PyList_Append(flag_items, item) < 0) {
  1139. Py_DECREF(item);
  1140. goto done;
  1141. }
  1142. Py_DECREF(item);
  1143. flags &= ~flag_names[i].value;
  1144. }
  1145. }
  1146. if (flags) {
  1147. PyObject *item = PyUnicode_FromFormat("0x%x", flags);
  1148. if (!item)
  1149. goto done;
  1150. if (PyList_Append(flag_items, item) < 0) {
  1151. Py_DECREF(item);
  1152. goto done;
  1153. }
  1154. Py_DECREF(item);
  1155. }
  1156. if (PyList_Size(flag_items) > 0) {
  1157. PyObject *flags_result;
  1158. PyObject *sep = PyUnicode_FromString("|");
  1159. if (!sep)
  1160. goto done;
  1161. flags_result = PyUnicode_Join(sep, flag_items);
  1162. Py_DECREF(sep);
  1163. if (!flags_result)
  1164. goto done;
  1165. result = PyUnicode_FromFormat("re.compile(%.200R, %S)",
  1166. obj->pattern, flags_result);
  1167. Py_DECREF(flags_result);
  1168. }
  1169. else {
  1170. result = PyUnicode_FromFormat("re.compile(%.200R)", obj->pattern);
  1171. }
  1172. done:
  1173. Py_DECREF(flag_items);
  1174. return result;
  1175. }
  1176. PyDoc_STRVAR(pattern_doc, "Compiled regular expression object.");
  1177. /* PatternObject's 'groupindex' method. */
  1178. static PyObject *
  1179. pattern_groupindex(PatternObject *self, void *Py_UNUSED(ignored))
  1180. {
  1181. if (self->groupindex == NULL)
  1182. return PyDict_New();
  1183. return PyDictProxy_New(self->groupindex);
  1184. }
  1185. static int _validate(PatternObject *self); /* Forward */
  1186. /*[clinic input]
  1187. _sre.compile
  1188. pattern: object
  1189. flags: int
  1190. code: object(subclass_of='&PyList_Type')
  1191. groups: Py_ssize_t
  1192. groupindex: object(subclass_of='&PyDict_Type')
  1193. indexgroup: object(subclass_of='&PyTuple_Type')
  1194. [clinic start generated code]*/
  1195. static PyObject *
  1196. _sre_compile_impl(PyObject *module, PyObject *pattern, int flags,
  1197. PyObject *code, Py_ssize_t groups, PyObject *groupindex,
  1198. PyObject *indexgroup)
  1199. /*[clinic end generated code: output=ef9c2b3693776404 input=0a68476dbbe5db30]*/
  1200. {
  1201. /* "compile" pattern descriptor to pattern object */
  1202. _sremodulestate *module_state = get_sre_module_state(module);
  1203. PatternObject* self;
  1204. Py_ssize_t i, n;
  1205. n = PyList_GET_SIZE(code);
  1206. /* coverity[ampersand_in_size] */
  1207. self = PyObject_GC_NewVar(PatternObject, module_state->Pattern_Type, n);
  1208. if (!self)
  1209. return NULL;
  1210. self->weakreflist = NULL;
  1211. self->pattern = NULL;
  1212. self->groupindex = NULL;
  1213. self->indexgroup = NULL;
  1214. self->codesize = n;
  1215. for (i = 0; i < n; i++) {
  1216. PyObject *o = PyList_GET_ITEM(code, i);
  1217. unsigned long value = PyLong_AsUnsignedLong(o);
  1218. if (value == (unsigned long)-1 && PyErr_Occurred()) {
  1219. break;
  1220. }
  1221. self->code[i] = (SRE_CODE) value;
  1222. if ((unsigned long) self->code[i] != value) {
  1223. PyErr_SetString(PyExc_OverflowError,
  1224. "regular expression code size limit exceeded");
  1225. break;
  1226. }
  1227. }
  1228. PyObject_GC_Track(self);
  1229. if (PyErr_Occurred()) {
  1230. Py_DECREF(self);
  1231. return NULL;
  1232. }
  1233. if (pattern == Py_None) {
  1234. self->isbytes = -1;
  1235. }
  1236. else {
  1237. Py_ssize_t p_length;
  1238. int charsize;
  1239. Py_buffer view;
  1240. view.buf = NULL;
  1241. if (!getstring(pattern, &p_length, &self->isbytes,
  1242. &charsize, &view)) {
  1243. Py_DECREF(self);
  1244. return NULL;
  1245. }
  1246. if (view.buf)
  1247. PyBuffer_Release(&view);
  1248. }
  1249. self->pattern = Py_NewRef(pattern);
  1250. self->flags = flags;
  1251. self->groups = groups;
  1252. if (PyDict_GET_SIZE(groupindex) > 0) {
  1253. self->groupindex = Py_NewRef(groupindex);
  1254. if (PyTuple_GET_SIZE(indexgroup) > 0) {
  1255. self->indexgroup = Py_NewRef(indexgroup);
  1256. }
  1257. }
  1258. if (!_validate(self)) {
  1259. Py_DECREF(self);
  1260. return NULL;
  1261. }
  1262. return (PyObject*) self;
  1263. }
  1264. /*[clinic input]
  1265. _sre.template
  1266. pattern: object
  1267. template: object(subclass_of="&PyList_Type")
  1268. A list containing interleaved literal strings (str or bytes) and group
  1269. indices (int), as returned by re._parser.parse_template():
  1270. [literal1, group1, ..., literalN, groupN]
  1271. /
  1272. [clinic start generated code]*/
  1273. static PyObject *
  1274. _sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
  1275. /*[clinic end generated code: output=d51290e596ebca86 input=af55380b27f02942]*/
  1276. {
  1277. /* template is a list containing interleaved literal strings (str or bytes)
  1278. * and group indices (int), as returned by _parser.parse_template:
  1279. * [literal1, group1, literal2, ..., literalN].
  1280. */
  1281. _sremodulestate *module_state = get_sre_module_state(module);
  1282. TemplateObject *self = NULL;
  1283. Py_ssize_t n = PyList_GET_SIZE(template);
  1284. if ((n & 1) == 0 || n < 1) {
  1285. goto bad_template;
  1286. }
  1287. n /= 2;
  1288. self = PyObject_GC_NewVar(TemplateObject, module_state->Template_Type, n);
  1289. if (!self)
  1290. return NULL;
  1291. self->chunks = 1 + 2*n;
  1292. self->literal = Py_NewRef(PyList_GET_ITEM(template, 0));
  1293. for (Py_ssize_t i = 0; i < n; i++) {
  1294. Py_ssize_t index = PyLong_AsSsize_t(PyList_GET_ITEM(template, 2*i+1));
  1295. if (index == -1 && PyErr_Occurred()) {
  1296. Py_SET_SIZE(self, i);
  1297. Py_DECREF(self);
  1298. return NULL;
  1299. }
  1300. if (index < 0) {
  1301. Py_SET_SIZE(self, i);
  1302. goto bad_template;
  1303. }
  1304. self->items[i].index = index;
  1305. PyObject *literal = PyList_GET_ITEM(template, 2*i+2);
  1306. // Skip empty literals.
  1307. if ((PyUnicode_Check(literal) && !PyUnicode_GET_LENGTH(literal)) ||
  1308. (PyBytes_Check(literal) && !PyBytes_GET_SIZE(literal)))
  1309. {
  1310. literal = NULL;
  1311. self->chunks--;
  1312. }
  1313. self->items[i].literal = Py_XNewRef(literal);
  1314. }
  1315. return (PyObject*) self;
  1316. bad_template:
  1317. PyErr_SetString(PyExc_TypeError, "invalid template");
  1318. Py_XDECREF(self);
  1319. return NULL;
  1320. }
  1321. /* -------------------------------------------------------------------- */
  1322. /* Code validation */
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type.
*/
  1339. /* Defining this one enables tracing of the validator */
  1340. #undef VVERBOSE
  1341. /* Trace macro for the validator */
  1342. #if defined(VVERBOSE)
  1343. #define VTRACE(v) printf v
  1344. #else
  1345. #define VTRACE(v) do {} while(0) /* do nothing */
  1346. #endif
  1347. /* Report failure */
  1348. #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return -1; } while (0)
  1349. /* Extract opcode, argument, or skip count from code array */
  1350. #define GET_OP \
  1351. do { \
  1352. VTRACE(("%p: ", code)); \
  1353. if (code >= end) FAIL; \
  1354. op = *code++; \
  1355. VTRACE(("%lu (op)\n", (unsigned long)op)); \
  1356. } while (0)
  1357. #define GET_ARG \
  1358. do { \
  1359. VTRACE(("%p= ", code)); \
  1360. if (code >= end) FAIL; \
  1361. arg = *code++; \
  1362. VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
  1363. } while (0)
  1364. #define GET_SKIP_ADJ(adj) \
  1365. do { \
  1366. VTRACE(("%p= ", code)); \
  1367. if (code >= end) FAIL; \
  1368. skip = *code; \
  1369. VTRACE(("%lu (skip to %p)\n", \
  1370. (unsigned long)skip, code+skip)); \
  1371. if (skip-adj > (uintptr_t)(end - code)) \
  1372. FAIL; \
  1373. code++; \
  1374. } while (0)
  1375. #define GET_SKIP GET_SKIP_ADJ(0)
  1376. static int
  1377. _validate_charset(SRE_CODE *code, SRE_CODE *end)
  1378. {
  1379. /* Some variables are manipulated by the macros above */
  1380. SRE_CODE op;
  1381. SRE_CODE arg;
  1382. SRE_CODE offset;
  1383. int i;
  1384. while (code < end) {
  1385. GET_OP;
  1386. switch (op) {
  1387. case SRE_OP_NEGATE:
  1388. break;
  1389. case SRE_OP_LITERAL:
  1390. GET_ARG;
  1391. break;
  1392. case SRE_OP_RANGE:
  1393. case SRE_OP_RANGE_UNI_IGNORE:
  1394. GET_ARG;
  1395. GET_ARG;
  1396. break;
  1397. case SRE_OP_CHARSET:
  1398. offset = 256/SRE_CODE_BITS; /* 256-bit bitmap */
  1399. if (offset > (uintptr_t)(end - code))
  1400. FAIL;
  1401. code += offset;
  1402. break;
  1403. case SRE_OP_BIGCHARSET:
  1404. GET_ARG; /* Number of blocks */
  1405. offset = 256/sizeof(SRE_CODE); /* 256-byte table */
  1406. if (offset > (uintptr_t)(end - code))
  1407. FAIL;
  1408. /* Make sure that each byte points to a valid block */
  1409. for (i = 0; i < 256; i++) {
  1410. if (((unsigned char *)code)[i] >= arg)
  1411. FAIL;
  1412. }
  1413. code += offset;
  1414. offset = arg * (256/SRE_CODE_BITS); /* 256-bit bitmap times arg */
  1415. if (offset > (uintptr_t)(end - code))
  1416. FAIL;
  1417. code += offset;
  1418. break;
  1419. case SRE_OP_CATEGORY:
  1420. GET_ARG;
  1421. switch (arg) {
  1422. case SRE_CATEGORY_DIGIT:
  1423. case SRE_CATEGORY_NOT_DIGIT:
  1424. case SRE_CATEGORY_SPACE:
  1425. case SRE_CATEGORY_NOT_SPACE:
  1426. case SRE_CATEGORY_WORD:
  1427. case SRE_CATEGORY_NOT_WORD:
  1428. case SRE_CATEGORY_LINEBREAK:
  1429. case SRE_CATEGORY_NOT_LINEBREAK:
  1430. case SRE_CATEGORY_LOC_WORD:
  1431. case SRE_CATEGORY_LOC_NOT_WORD:
  1432. case SRE_CATEGORY_UNI_DIGIT:
  1433. case SRE_CATEGORY_UNI_NOT_DIGIT:
  1434. case SRE_CATEGORY_UNI_SPACE:
  1435. case SRE_CATEGORY_UNI_NOT_SPACE:
  1436. case SRE_CATEGORY_UNI_WORD:
  1437. case SRE_CATEGORY_UNI_NOT_WORD:
  1438. case SRE_CATEGORY_UNI_LINEBREAK:
  1439. case SRE_CATEGORY_UNI_NOT_LINEBREAK:
  1440. break;
  1441. default:
  1442. FAIL;
  1443. }
  1444. break;
  1445. default:
  1446. FAIL;
  1447. }
  1448. }
  1449. return 0;
  1450. }
  1451. /* Returns 0 on success, -1 on failure, and 1 if the last op is JUMP. */
  1452. static int
  1453. _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
  1454. {
  1455. /* Some variables are manipulated by the macros above */
  1456. SRE_CODE op;
  1457. SRE_CODE arg;
  1458. SRE_CODE skip;
  1459. VTRACE(("code=%p, end=%p\n", code, end));
  1460. if (code > end)
  1461. FAIL;
  1462. while (code < end) {
  1463. GET_OP;
  1464. switch (op) {
  1465. case SRE_OP_MARK:
  1466. /* We don't check whether marks are properly nested; the
  1467. sre_match() code is robust even if they don't, and the worst
  1468. you can get is nonsensical match results. */
  1469. GET_ARG;
  1470. if (arg > 2 * (size_t)groups + 1) {
  1471. VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
  1472. FAIL;
  1473. }
  1474. break;
  1475. case SRE_OP_LITERAL:
  1476. case SRE_OP_NOT_LITERAL:
  1477. case SRE_OP_LITERAL_IGNORE:
  1478. case SRE_OP_NOT_LITERAL_IGNORE:
  1479. case SRE_OP_LITERAL_UNI_IGNORE:
  1480. case SRE_OP_NOT_LITERAL_UNI_IGNORE:
  1481. case SRE_OP_LITERAL_LOC_IGNORE:
  1482. case SRE_OP_NOT_LITERAL_LOC_IGNORE:
  1483. GET_ARG;
  1484. /* The arg is just a character, nothing to check */
  1485. break;
  1486. case SRE_OP_SUCCESS:
  1487. case SRE_OP_FAILURE:
  1488. /* Nothing to check; these normally end the matching process */
  1489. break;
  1490. case SRE_OP_AT:
  1491. GET_ARG;
  1492. switch (arg) {
  1493. case SRE_AT_BEGINNING:
  1494. case SRE_AT_BEGINNING_STRING:
  1495. case SRE_AT_BEGINNING_LINE:
  1496. case SRE_AT_END:
  1497. case SRE_AT_END_LINE:
  1498. case SRE_AT_END_STRING:
  1499. case SRE_AT_BOUNDARY:
  1500. case SRE_AT_NON_BOUNDARY:
  1501. case SRE_AT_LOC_BOUNDARY:
  1502. case SRE_AT_LOC_NON_BOUNDARY:
  1503. case SRE_AT_UNI_BOUNDARY:
  1504. case SRE_AT_UNI_NON_BOUNDARY:
  1505. break;
  1506. default:
  1507. FAIL;
  1508. }
  1509. break;
  1510. case SRE_OP_ANY:
  1511. case SRE_OP_ANY_ALL:
  1512. /* These have no operands */
  1513. break;
  1514. case SRE_OP_IN:
  1515. case SRE_OP_IN_IGNORE:
  1516. case SRE_OP_IN_UNI_IGNORE:
  1517. case SRE_OP_IN_LOC_IGNORE:
  1518. GET_SKIP;
  1519. /* Stop 1 before the end; we check the FAILURE below */
  1520. if (_validate_charset(code, code+skip-2))
  1521. FAIL;
  1522. if (code[skip-2] != SRE_OP_FAILURE)
  1523. FAIL;
  1524. code += skip-1;
  1525. break;
  1526. case SRE_OP_INFO:
  1527. {
  1528. /* A minimal info field is
  1529. <INFO> <1=skip> <2=flags> <3=min> <4=max>;
  1530. If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
  1531. more follows. */
  1532. SRE_CODE flags, i;
  1533. SRE_CODE *newcode;
  1534. GET_SKIP;
  1535. newcode = code+skip-1;
  1536. GET_ARG; flags = arg;
  1537. GET_ARG;
  1538. GET_ARG;
  1539. /* Check that only valid flags are present */
  1540. if ((flags & ~(SRE_INFO_PREFIX |
  1541. SRE_INFO_LITERAL |
  1542. SRE_INFO_CHARSET)) != 0)
  1543. FAIL;
  1544. /* PREFIX and CHARSET are mutually exclusive */
  1545. if ((flags & SRE_INFO_PREFIX) &&
  1546. (flags & SRE_INFO_CHARSET))
  1547. FAIL;
  1548. /* LITERAL implies PREFIX */
  1549. if ((flags & SRE_INFO_LITERAL) &&
  1550. !(flags & SRE_INFO_PREFIX))
  1551. FAIL;
  1552. /* Validate the prefix */
  1553. if (flags & SRE_INFO_PREFIX) {
  1554. SRE_CODE prefix_len;
  1555. GET_ARG; prefix_len = arg;
  1556. GET_ARG;
  1557. /* Here comes the prefix string */
  1558. if (prefix_len > (uintptr_t)(newcode - code))
  1559. FAIL;
  1560. code += prefix_len;
  1561. /* And here comes the overlap table */
  1562. if (prefix_len > (uintptr_t)(newcode - code))
  1563. FAIL;
  1564. /* Each overlap value should be < prefix_len */
  1565. for (i = 0; i < prefix_len; i++) {
  1566. if (code[i] >= prefix_len)
  1567. FAIL;
  1568. }
  1569. code += prefix_len;
  1570. }
  1571. /* Validate the charset */
  1572. if (flags & SRE_INFO_CHARSET) {
  1573. if (_validate_charset(code, newcode-1))
  1574. FAIL;
  1575. if (newcode[-1] != SRE_OP_FAILURE)
  1576. FAIL;
  1577. code = newcode;
  1578. }
  1579. else if (code != newcode) {
  1580. VTRACE(("code=%p, newcode=%p\n", code, newcode));
  1581. FAIL;
  1582. }
  1583. }
  1584. break;
  1585. case SRE_OP_BRANCH:
  1586. {
  1587. SRE_CODE *target = NULL;
  1588. for (;;) {
  1589. GET_SKIP;
  1590. if (skip == 0)
  1591. break;
  1592. /* Stop 2 before the end; we check the JUMP below */
  1593. if (_validate_inner(code, code+skip-3, groups))
  1594. FAIL;
  1595. code += skip-3;
  1596. /* Check that it ends with a JUMP, and that each JUMP
  1597. has the same target */
  1598. GET_OP;
  1599. if (op != SRE_OP_JUMP)
  1600. FAIL;
  1601. GET_SKIP;
  1602. if (target == NULL)
  1603. target = code+skip-1;
  1604. else if (code+skip-1 != target)
  1605. FAIL;
  1606. }
  1607. if (code != target)
  1608. FAIL;
  1609. }
  1610. break;
  1611. case SRE_OP_REPEAT_ONE:
  1612. case SRE_OP_MIN_REPEAT_ONE:
  1613. case SRE_OP_POSSESSIVE_REPEAT_ONE:
  1614. {
  1615. SRE_CODE min, max;
  1616. GET_SKIP;
  1617. GET_ARG; min = arg;
  1618. GET_ARG; max = arg;
  1619. if (min > max)
  1620. FAIL;
  1621. if (max > SRE_MAXREPEAT)
  1622. FAIL;
  1623. if (_validate_inner(code, code+skip-4, groups))
  1624. FAIL;
  1625. code += skip-4;
  1626. GET_OP;
  1627. if (op != SRE_OP_SUCCESS)
  1628. FAIL;
  1629. }
  1630. break;
  1631. case SRE_OP_REPEAT:
  1632. case SRE_OP_POSSESSIVE_REPEAT:
  1633. {
  1634. SRE_CODE op1 = op, min, max;
  1635. GET_SKIP;
  1636. GET_ARG; min = arg;
  1637. GET_ARG; max = arg;
  1638. if (min > max)
  1639. FAIL;
  1640. if (max > SRE_MAXREPEAT)
  1641. FAIL;
  1642. if (_validate_inner(code, code+skip-3, groups))
  1643. FAIL;
  1644. code += skip-3;
  1645. GET_OP;
  1646. if (op1 == SRE_OP_POSSESSIVE_REPEAT) {
  1647. if (op != SRE_OP_SUCCESS)
  1648. FAIL;
  1649. }
  1650. else {
  1651. if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
  1652. FAIL;
  1653. }
  1654. }
  1655. break;
  1656. case SRE_OP_ATOMIC_GROUP:
  1657. {
  1658. GET_SKIP;
  1659. if (_validate_inner(code, code+skip-2, groups))
  1660. FAIL;
  1661. code += skip-2;
  1662. GET_OP;
  1663. if (op != SRE_OP_SUCCESS)
  1664. FAIL;
  1665. }
  1666. break;
  1667. case SRE_OP_GROUPREF:
  1668. case SRE_OP_GROUPREF_IGNORE:
  1669. case SRE_OP_GROUPREF_UNI_IGNORE:
  1670. case SRE_OP_GROUPREF_LOC_IGNORE:
  1671. GET_ARG;
  1672. if (arg >= (size_t)groups)
  1673. FAIL;
  1674. break;
  1675. case SRE_OP_GROUPREF_EXISTS:
  1676. /* The regex syntax for this is: '(?(group)then|else)', where
  1677. 'group' is either an integer group number or a group name,
  1678. 'then' and 'else' are sub-regexes, and 'else' is optional. */
  1679. GET_ARG;
  1680. if (arg >= (size_t)groups)
  1681. FAIL;
  1682. GET_SKIP_ADJ(1);
  1683. code--; /* The skip is relative to the first arg! */
  1684. /* There are two possibilities here: if there is both a 'then'
  1685. part and an 'else' part, the generated code looks like:
  1686. GROUPREF_EXISTS
  1687. <group>
  1688. <skipyes>
  1689. ...then part...
  1690. JUMP
  1691. <skipno>
  1692. (<skipyes> jumps here)
  1693. ...else part...
  1694. (<skipno> jumps here)
  1695. If there is only a 'then' part, it looks like:
  1696. GROUPREF_EXISTS
  1697. <group>
  1698. <skip>
  1699. ...then part...
  1700. (<skip> jumps here)
  1701. There is no direct way to decide which it is, and we don't want
  1702. to allow arbitrary jumps anywhere in the code; so we just look
  1703. for a JUMP opcode preceding our skip target.
  1704. */
  1705. VTRACE(("then part:\n"));
  1706. int rc = _validate_inner(code+1, code+skip-1, groups);
  1707. if (rc == 1) {
  1708. VTRACE(("else part:\n"));
  1709. code += skip-2; /* Position after JUMP, at <skipno> */
  1710. GET_SKIP;
  1711. rc = _validate_inner(code, code+skip-1, groups);
  1712. }
  1713. if (rc)
  1714. FAIL;
  1715. code += skip-1;
  1716. break;
  1717. case SRE_OP_ASSERT:
  1718. case SRE_OP_ASSERT_NOT:
  1719. GET_SKIP;
  1720. GET_ARG; /* 0 for lookahead, width for lookbehind */
  1721. code--; /* Back up over arg to simplify math below */
  1722. /* Stop 1 before the end; we check the SUCCESS below */
  1723. if (_validate_inner(code+1, code+skip-2, groups))
  1724. FAIL;
  1725. code += skip-2;
  1726. GET_OP;
  1727. if (op != SRE_OP_SUCCESS)
  1728. FAIL;
  1729. break;
  1730. case SRE_OP_JUMP:
  1731. if (code + 1 != end)
  1732. FAIL;
  1733. VTRACE(("JUMP: %d\n", __LINE__));
  1734. return 1;
  1735. default:
  1736. FAIL;
  1737. }
  1738. }
  1739. VTRACE(("okay\n"));
  1740. return 0;
  1741. }
  1742. static int
  1743. _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
  1744. {
  1745. if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
  1746. code >= end || end[-1] != SRE_OP_SUCCESS)
  1747. FAIL;
  1748. return _validate_inner(code, end-1, groups);
  1749. }
  1750. static int
  1751. _validate(PatternObject *self)
  1752. {
  1753. if (_validate_outer(self->code, self->code+self->codesize, self->groups))
  1754. {
  1755. PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
  1756. return 0;
  1757. }
  1758. else
  1759. VTRACE(("Success!\n"));
  1760. return 1;
  1761. }
  1762. /* -------------------------------------------------------------------- */
  1763. /* match methods */
  1764. static int
  1765. match_traverse(MatchObject *self, visitproc visit, void *arg)
  1766. {
  1767. Py_VISIT(Py_TYPE(self));
  1768. Py_VISIT(self->string);
  1769. Py_VISIT(self->regs);
  1770. Py_VISIT(self->pattern);
  1771. return 0;
  1772. }
  1773. static int
  1774. match_clear(MatchObject *self)
  1775. {
  1776. Py_CLEAR(self->string);
  1777. Py_CLEAR(self->regs);
  1778. Py_CLEAR(self->pattern);
  1779. return 0;
  1780. }
  1781. static void
  1782. match_dealloc(MatchObject* self)
  1783. {
  1784. PyTypeObject *tp = Py_TYPE(self);
  1785. PyObject_GC_UnTrack(self);
  1786. (void)match_clear(self);
  1787. tp->tp_free(self);
  1788. Py_DECREF(tp);
  1789. }
  1790. static PyObject*
  1791. match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
  1792. {
  1793. Py_ssize_t length;
  1794. int isbytes, charsize;
  1795. Py_buffer view;
  1796. PyObject *result;
  1797. const void* ptr;
  1798. Py_ssize_t i, j;
  1799. assert(0 <= index && index < self->groups);
  1800. index *= 2;
  1801. if (self->string == Py_None || self->mark[index] < 0) {
  1802. /* return default value if the string or group is undefined */
  1803. return Py_NewRef(def);
  1804. }
  1805. ptr = getstring(self->string, &length, &isbytes, &charsize, &view);
  1806. if (ptr == NULL)
  1807. return NULL;
  1808. i = self->mark[index];
  1809. j = self->mark[index+1];
  1810. i = Py_MIN(i, length);
  1811. j = Py_MIN(j, length);
  1812. result = getslice(isbytes, ptr, self->string, i, j);
  1813. if (isbytes && view.buf != NULL)
  1814. PyBuffer_Release(&view);
  1815. return result;
  1816. }
  1817. static Py_ssize_t
  1818. match_getindex(MatchObject* self, PyObject* index)
  1819. {
  1820. Py_ssize_t i;
  1821. if (index == NULL)
  1822. /* Default value */
  1823. return 0;
  1824. if (PyIndex_Check(index)) {
  1825. i = PyNumber_AsSsize_t(index, NULL);
  1826. }
  1827. else {
  1828. i = -1;
  1829. if (self->pattern->groupindex) {
  1830. index = PyDict_GetItemWithError(self->pattern->groupindex, index);
  1831. if (index && PyLong_Check(index)) {
  1832. i = PyLong_AsSsize_t(index);
  1833. }
  1834. }
  1835. }
  1836. if (i < 0 || i >= self->groups) {
  1837. /* raise IndexError if we were given a bad group number */
  1838. if (!PyErr_Occurred()) {
  1839. PyErr_SetString(PyExc_IndexError, "no such group");
  1840. }
  1841. return -1;
  1842. }
  1843. return i;
  1844. }
  1845. static PyObject*
  1846. match_getslice(MatchObject* self, PyObject* index, PyObject* def)
  1847. {
  1848. Py_ssize_t i = match_getindex(self, index);
  1849. if (i < 0) {
  1850. return NULL;
  1851. }
  1852. return match_getslice_by_index(self, i, def);
  1853. }
  1854. /*[clinic input]
  1855. _sre.SRE_Match.expand
  1856. template: object
  1857. Return the string obtained by doing backslash substitution on the string template, as done by the sub() method.
  1858. [clinic start generated code]*/
  1859. static PyObject *
  1860. _sre_SRE_Match_expand_impl(MatchObject *self, PyObject *template)
  1861. /*[clinic end generated code: output=931b58ccc323c3a1 input=4bfdb22c2f8b146a]*/
  1862. {
  1863. _sremodulestate *module_state = get_sre_module_state_by_class(Py_TYPE(self));
  1864. PyObject *filter = compile_template(module_state, self->pattern, template);
  1865. if (filter == NULL) {
  1866. return NULL;
  1867. }
  1868. PyObject *result = expand_template((TemplateObject *)filter, self);
  1869. Py_DECREF(filter);
  1870. return result;
  1871. }
  1872. static PyObject*
  1873. match_group(MatchObject* self, PyObject* args)
  1874. {
  1875. PyObject* result;
  1876. Py_ssize_t i, size;
  1877. size = PyTuple_GET_SIZE(args);
  1878. switch (size) {
  1879. case 0:
  1880. result = match_getslice(self, _PyLong_GetZero(), Py_None);
  1881. break;
  1882. case 1:
  1883. result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
  1884. break;
  1885. default:
  1886. /* fetch multiple items */
  1887. result = PyTuple_New(size);
  1888. if (!result)
  1889. return NULL;
  1890. for (i = 0; i < size; i++) {
  1891. PyObject* item = match_getslice(
  1892. self, PyTuple_GET_ITEM(args, i), Py_None
  1893. );
  1894. if (!item) {
  1895. Py_DECREF(result);
  1896. return NULL;
  1897. }
  1898. PyTuple_SET_ITEM(result, i, item);
  1899. }
  1900. break;
  1901. }
  1902. return result;
  1903. }
  1904. static PyObject*
  1905. match_getitem(MatchObject* self, PyObject* name)
  1906. {
  1907. return match_getslice(self, name, Py_None);
  1908. }
  1909. /*[clinic input]
  1910. _sre.SRE_Match.groups
  1911. default: object = None
  1912. Is used for groups that did not participate in the match.
  1913. Return a tuple containing all the subgroups of the match, from 1.
  1914. [clinic start generated code]*/
  1915. static PyObject *
  1916. _sre_SRE_Match_groups_impl(MatchObject *self, PyObject *default_value)
  1917. /*[clinic end generated code: output=daf8e2641537238a input=bb069ef55dabca91]*/
  1918. {
  1919. PyObject* result;
  1920. Py_ssize_t index;
  1921. result = PyTuple_New(self->groups-1);
  1922. if (!result)
  1923. return NULL;
  1924. for (index = 1; index < self->groups; index++) {
  1925. PyObject* item;
  1926. item = match_getslice_by_index(self, index, default_value);
  1927. if (!item) {
  1928. Py_DECREF(result);
  1929. return NULL;
  1930. }
  1931. PyTuple_SET_ITEM(result, index-1, item);
  1932. }
  1933. return result;
  1934. }
  1935. /*[clinic input]
  1936. _sre.SRE_Match.groupdict
  1937. default: object = None
  1938. Is used for groups that did not participate in the match.
  1939. Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.
  1940. [clinic start generated code]*/
  1941. static PyObject *
  1942. _sre_SRE_Match_groupdict_impl(MatchObject *self, PyObject *default_value)
  1943. /*[clinic end generated code: output=29917c9073e41757 input=0ded7960b23780aa]*/
  1944. {
  1945. PyObject *result;
  1946. PyObject *key;
  1947. PyObject *value;
  1948. Py_ssize_t pos = 0;
  1949. Py_hash_t hash;
  1950. result = PyDict_New();
  1951. if (!result || !self->pattern->groupindex)
  1952. return result;
  1953. while (_PyDict_Next(self->pattern->groupindex, &pos, &key, &value, &hash)) {
  1954. int status;
  1955. Py_INCREF(key);
  1956. value = match_getslice(self, key, default_value);
  1957. if (!value) {
  1958. Py_DECREF(key);
  1959. goto failed;
  1960. }
  1961. status = _PyDict_SetItem_KnownHash(result, key, value, hash);
  1962. Py_DECREF(value);
  1963. Py_DECREF(key);
  1964. if (status < 0)
  1965. goto failed;
  1966. }
  1967. return result;
  1968. failed:
  1969. Py_DECREF(result);
  1970. return NULL;
  1971. }
  1972. /*[clinic input]
  1973. _sre.SRE_Match.start -> Py_ssize_t
  1974. group: object(c_default="NULL") = 0
  1975. /
  1976. Return index of the start of the substring matched by group.
  1977. [clinic start generated code]*/
  1978. static Py_ssize_t
  1979. _sre_SRE_Match_start_impl(MatchObject *self, PyObject *group)
  1980. /*[clinic end generated code: output=3f6e7f9df2fb5201 input=ced8e4ed4b33ee6c]*/
  1981. {
  1982. Py_ssize_t index = match_getindex(self, group);
  1983. if (index < 0) {
  1984. return -1;
  1985. }
  1986. /* mark is -1 if group is undefined */
  1987. return self->mark[index*2];
  1988. }
  1989. /*[clinic input]
  1990. _sre.SRE_Match.end -> Py_ssize_t
  1991. group: object(c_default="NULL") = 0
  1992. /
  1993. Return index of the end of the substring matched by group.
  1994. [clinic start generated code]*/
  1995. static Py_ssize_t
  1996. _sre_SRE_Match_end_impl(MatchObject *self, PyObject *group)
  1997. /*[clinic end generated code: output=f4240b09911f7692 input=1b799560c7f3d7e6]*/
  1998. {
  1999. Py_ssize_t index = match_getindex(self, group);
  2000. if (index < 0) {
  2001. return -1;
  2002. }
  2003. /* mark is -1 if group is undefined */
  2004. return self->mark[index*2+1];
  2005. }
  2006. LOCAL(PyObject*)
  2007. _pair(Py_ssize_t i1, Py_ssize_t i2)
  2008. {
  2009. PyObject* pair;
  2010. PyObject* item;
  2011. pair = PyTuple_New(2);
  2012. if (!pair)
  2013. return NULL;
  2014. item = PyLong_FromSsize_t(i1);
  2015. if (!item)
  2016. goto error;
  2017. PyTuple_SET_ITEM(pair, 0, item);
  2018. item = PyLong_FromSsize_t(i2);
  2019. if (!item)
  2020. goto error;
  2021. PyTuple_SET_ITEM(pair, 1, item);
  2022. return pair;
  2023. error:
  2024. Py_DECREF(pair);
  2025. return NULL;
  2026. }
  2027. /*[clinic input]
  2028. _sre.SRE_Match.span
  2029. group: object(c_default="NULL") = 0
  2030. /
  2031. For match object m, return the 2-tuple (m.start(group), m.end(group)).
  2032. [clinic start generated code]*/
  2033. static PyObject *
  2034. _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group)
  2035. /*[clinic end generated code: output=f02ae40594d14fe6 input=8fa6014e982d71d4]*/
  2036. {
  2037. Py_ssize_t index = match_getindex(self, group);
  2038. if (index < 0) {
  2039. return NULL;
  2040. }
  2041. /* marks are -1 if group is undefined */
  2042. return _pair(self->mark[index*2], self->mark[index*2+1]);
  2043. }
  2044. static PyObject*
  2045. match_regs(MatchObject* self)
  2046. {
  2047. PyObject* regs;
  2048. PyObject* item;
  2049. Py_ssize_t index;
  2050. regs = PyTuple_New(self->groups);
  2051. if (!regs)
  2052. return NULL;
  2053. for (index = 0; index < self->groups; index++) {
  2054. item = _pair(self->mark[index*2], self->mark[index*2+1]);
  2055. if (!item) {
  2056. Py_DECREF(regs);
  2057. return NULL;
  2058. }
  2059. PyTuple_SET_ITEM(regs, index, item);
  2060. }
  2061. self->regs = Py_NewRef(regs);
  2062. return regs;
  2063. }
  2064. /*[clinic input]
  2065. _sre.SRE_Match.__copy__
  2066. [clinic start generated code]*/
  2067. static PyObject *
  2068. _sre_SRE_Match___copy___impl(MatchObject *self)
  2069. /*[clinic end generated code: output=a779c5fc8b5b4eb4 input=3bb4d30b6baddb5b]*/
  2070. {
  2071. return Py_NewRef(self);
  2072. }
  2073. /*[clinic input]
  2074. _sre.SRE_Match.__deepcopy__
  2075. memo: object
  2076. /
  2077. [clinic start generated code]*/
  2078. static PyObject *
  2079. _sre_SRE_Match___deepcopy__(MatchObject *self, PyObject *memo)
  2080. /*[clinic end generated code: output=ba7cb46d655e4ee2 input=779d12a31c2c325e]*/
  2081. {
  2082. return Py_NewRef(self);
  2083. }
  2084. PyDoc_STRVAR(match_doc,
  2085. "The result of re.match() and re.search().\n\
  2086. Match objects always have a boolean value of True.");
  2087. PyDoc_STRVAR(match_group_doc,
  2088. "group([group1, ...]) -> str or tuple.\n\
  2089. Return subgroup(s) of the match by indices or names.\n\
  2090. For 0 returns the entire match.");
  2091. static PyObject *
  2092. match_lastindex_get(MatchObject *self, void *Py_UNUSED(ignored))
  2093. {
  2094. if (self->lastindex >= 0)
  2095. return PyLong_FromSsize_t(self->lastindex);
  2096. Py_RETURN_NONE;
  2097. }
  2098. static PyObject *
  2099. match_lastgroup_get(MatchObject *self, void *Py_UNUSED(ignored))
  2100. {
  2101. if (self->pattern->indexgroup &&
  2102. self->lastindex >= 0 &&
  2103. self->lastindex < PyTuple_GET_SIZE(self->pattern->indexgroup))
  2104. {
  2105. PyObject *result = PyTuple_GET_ITEM(self->pattern->indexgroup,
  2106. self->lastindex);
  2107. return Py_NewRef(result);
  2108. }
  2109. Py_RETURN_NONE;
  2110. }
  2111. static PyObject *
  2112. match_regs_get(MatchObject *self, void *Py_UNUSED(ignored))
  2113. {
  2114. if (self->regs) {
  2115. return Py_NewRef(self->regs);
  2116. } else
  2117. return match_regs(self);
  2118. }
  2119. static PyObject *
  2120. match_repr(MatchObject *self)
  2121. {
  2122. PyObject *result;
  2123. PyObject *group0 = match_getslice_by_index(self, 0, Py_None);
  2124. if (group0 == NULL)
  2125. return NULL;
  2126. result = PyUnicode_FromFormat(
  2127. "<%s object; span=(%zd, %zd), match=%.50R>",
  2128. Py_TYPE(self)->tp_name,
  2129. self->mark[0], self->mark[1], group0);
  2130. Py_DECREF(group0);
  2131. return result;
  2132. }
  2133. static PyObject*
  2134. pattern_new_match(_sremodulestate* module_state,
  2135. PatternObject* pattern,
  2136. SRE_STATE* state,
  2137. Py_ssize_t status)
  2138. {
  2139. /* create match object (from state object) */
  2140. MatchObject* match;
  2141. Py_ssize_t i, j;
  2142. char* base;
  2143. int n;
  2144. if (status > 0) {
  2145. /* create match object (with room for extra group marks) */
  2146. /* coverity[ampersand_in_size] */
  2147. match = PyObject_GC_NewVar(MatchObject,
  2148. module_state->Match_Type,
  2149. 2*(pattern->groups+1));
  2150. if (!match)
  2151. return NULL;
  2152. match->pattern = (PatternObject*)Py_NewRef(pattern);
  2153. match->string = Py_NewRef(state->string);
  2154. match->regs = NULL;
  2155. match->groups = pattern->groups+1;
  2156. /* fill in group slices */
  2157. base = (char*) state->beginning;
  2158. n = state->charsize;
  2159. match->mark[0] = ((char*) state->start - base) / n;
  2160. match->mark[1] = ((char*) state->ptr - base) / n;
  2161. for (i = j = 0; i < pattern->groups; i++, j+=2)
  2162. if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
  2163. match->mark[j+2] = ((char*) state->mark[j] - base) / n;
  2164. match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
  2165. /* check wrong span */
  2166. if (match->mark[j+2] > match->mark[j+3]) {
  2167. PyErr_SetString(PyExc_SystemError,
  2168. "The span of capturing group is wrong,"
  2169. " please report a bug for the re module.");
  2170. Py_DECREF(match);
  2171. return NULL;
  2172. }
  2173. } else
  2174. match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
  2175. match->pos = state->pos;
  2176. match->endpos = state->endpos;
  2177. match->lastindex = state->lastindex;
  2178. PyObject_GC_Track(match);
  2179. return (PyObject*) match;
  2180. } else if (status == 0) {
  2181. /* no match */
  2182. Py_RETURN_NONE;
  2183. }
  2184. /* internal error */
  2185. pattern_error(status);
  2186. return NULL;
  2187. }
  2188. /* -------------------------------------------------------------------- */
  2189. /* scanner methods (experimental) */
  2190. static int
  2191. scanner_traverse(ScannerObject *self, visitproc visit, void *arg)
  2192. {
  2193. Py_VISIT(Py_TYPE(self));
  2194. Py_VISIT(self->pattern);
  2195. return 0;
  2196. }
  2197. static int
  2198. scanner_clear(ScannerObject *self)
  2199. {
  2200. Py_CLEAR(self->pattern);
  2201. return 0;
  2202. }
  2203. static void
  2204. scanner_dealloc(ScannerObject* self)
  2205. {
  2206. PyTypeObject *tp = Py_TYPE(self);
  2207. PyObject_GC_UnTrack(self);
  2208. state_fini(&self->state);
  2209. (void)scanner_clear(self);
  2210. tp->tp_free(self);
  2211. Py_DECREF(tp);
  2212. }
  2213. static int
  2214. scanner_begin(ScannerObject* self)
  2215. {
  2216. if (self->executing) {
  2217. PyErr_SetString(PyExc_ValueError,
  2218. "regular expression scanner already executing");
  2219. return 0;
  2220. }
  2221. self->executing = 1;
  2222. return 1;
  2223. }
  2224. static void
  2225. scanner_end(ScannerObject* self)
  2226. {
  2227. assert(self->executing);
  2228. self->executing = 0;
  2229. }
  2230. /*[clinic input]
  2231. _sre.SRE_Scanner.match
  2232. cls: defining_class
  2233. /
  2234. [clinic start generated code]*/
  2235. static PyObject *
  2236. _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls)
  2237. /*[clinic end generated code: output=6e22c149dc0f0325 input=b5146e1f30278cb7]*/
  2238. {
  2239. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  2240. SRE_STATE* state = &self->state;
  2241. PyObject* match;
  2242. Py_ssize_t status;
  2243. if (!scanner_begin(self)) {
  2244. return NULL;
  2245. }
  2246. if (state->start == NULL) {
  2247. scanner_end(self);
  2248. Py_RETURN_NONE;
  2249. }
  2250. state_reset(state);
  2251. state->ptr = state->start;
  2252. status = sre_match(state, PatternObject_GetCode(self->pattern));
  2253. if (PyErr_Occurred()) {
  2254. scanner_end(self);
  2255. return NULL;
  2256. }
  2257. match = pattern_new_match(module_state, (PatternObject*) self->pattern,
  2258. state, status);
  2259. if (status == 0)
  2260. state->start = NULL;
  2261. else {
  2262. state->must_advance = (state->ptr == state->start);
  2263. state->start = state->ptr;
  2264. }
  2265. scanner_end(self);
  2266. return match;
  2267. }
  2268. /*[clinic input]
  2269. _sre.SRE_Scanner.search
  2270. cls: defining_class
  2271. /
  2272. [clinic start generated code]*/
  2273. static PyObject *
  2274. _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls)
  2275. /*[clinic end generated code: output=23e8fc78013f9161 input=056c2d37171d0bf2]*/
  2276. {
  2277. _sremodulestate *module_state = get_sre_module_state_by_class(cls);
  2278. SRE_STATE* state = &self->state;
  2279. PyObject* match;
  2280. Py_ssize_t status;
  2281. if (!scanner_begin(self)) {
  2282. return NULL;
  2283. }
  2284. if (state->start == NULL) {
  2285. scanner_end(self);
  2286. Py_RETURN_NONE;
  2287. }
  2288. state_reset(state);
  2289. state->ptr = state->start;
  2290. status = sre_search(state, PatternObject_GetCode(self->pattern));
  2291. if (PyErr_Occurred()) {
  2292. scanner_end(self);
  2293. return NULL;
  2294. }
  2295. match = pattern_new_match(module_state, (PatternObject*) self->pattern,
  2296. state, status);
  2297. if (status == 0)
  2298. state->start = NULL;
  2299. else {
  2300. state->must_advance = (state->ptr == state->start);
  2301. state->start = state->ptr;
  2302. }
  2303. scanner_end(self);
  2304. return match;
  2305. }
  2306. static PyObject *
  2307. pattern_scanner(_sremodulestate *module_state,
  2308. PatternObject *self,
  2309. PyObject *string,
  2310. Py_ssize_t pos,
  2311. Py_ssize_t endpos)
  2312. {
  2313. ScannerObject* scanner;
  2314. /* create scanner object */
  2315. scanner = PyObject_GC_New(ScannerObject, module_state->Scanner_Type);
  2316. if (!scanner)
  2317. return NULL;
  2318. scanner->pattern = NULL;
  2319. scanner->executing = 0;
  2320. /* create search state object */
  2321. if (!state_init(&scanner->state, self, string, pos, endpos)) {
  2322. Py_DECREF(scanner);
  2323. return NULL;
  2324. }
  2325. scanner->pattern = Py_NewRef(self);
  2326. PyObject_GC_Track(scanner);
  2327. return (PyObject*) scanner;
  2328. }
  2329. /* -------------------------------------------------------------------- */
  2330. /* template methods */
  2331. static int
  2332. template_traverse(TemplateObject *self, visitproc visit, void *arg)
  2333. {
  2334. Py_VISIT(Py_TYPE(self));
  2335. Py_VISIT(self->literal);
  2336. for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
  2337. Py_VISIT(self->items[i].literal);
  2338. }
  2339. return 0;
  2340. }
  2341. static int
  2342. template_clear(TemplateObject *self)
  2343. {
  2344. Py_CLEAR(self->literal);
  2345. for (Py_ssize_t i = 0, n = Py_SIZE(self); i < n; i++) {
  2346. Py_CLEAR(self->items[i].literal);
  2347. }
  2348. return 0;
  2349. }
  2350. static void
  2351. template_dealloc(TemplateObject *self)
  2352. {
  2353. PyTypeObject *tp = Py_TYPE(self);
  2354. PyObject_GC_UnTrack(self);
  2355. (void)template_clear(self);
  2356. tp->tp_free(self);
  2357. Py_DECREF(tp);
  2358. }
  2359. static PyObject *
  2360. expand_template(TemplateObject *self, MatchObject *match)
  2361. {
  2362. if (Py_SIZE(self) == 0) {
  2363. return Py_NewRef(self->literal);
  2364. }
  2365. PyObject *result = NULL;
  2366. Py_ssize_t count = 0; // the number of non-empty chunks
  2367. /* For small number of strings use a buffer allocated on the stack,
  2368. * otherwise use a list object. */
  2369. PyObject *buffer[10];
  2370. PyObject **out = buffer;
  2371. PyObject *list = NULL;
  2372. if (self->chunks > (int)Py_ARRAY_LENGTH(buffer) ||
  2373. !PyUnicode_Check(self->literal))
  2374. {
  2375. list = PyList_New(self->chunks);
  2376. if (!list) {
  2377. return NULL;
  2378. }
  2379. out = &PyList_GET_ITEM(list, 0);
  2380. }
  2381. out[count++] = Py_NewRef(self->literal);
  2382. for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
  2383. Py_ssize_t index = self->items[i].index;
  2384. if (index >= match->groups) {
  2385. PyErr_SetString(PyExc_IndexError, "no such group");
  2386. goto cleanup;
  2387. }
  2388. PyObject *item = match_getslice_by_index(match, index, Py_None);
  2389. if (item == NULL) {
  2390. goto cleanup;
  2391. }
  2392. if (item != Py_None) {
  2393. out[count++] = Py_NewRef(item);
  2394. }
  2395. Py_DECREF(item);
  2396. PyObject *literal = self->items[i].literal;
  2397. if (literal != NULL) {
  2398. out[count++] = Py_NewRef(literal);
  2399. }
  2400. }
  2401. if (PyUnicode_Check(self->literal)) {
  2402. result = _PyUnicode_JoinArray(&_Py_STR(empty), out, count);
  2403. }
  2404. else {
  2405. Py_SET_SIZE(list, count);
  2406. result = _PyBytes_Join((PyObject *)&_Py_SINGLETON(bytes_empty), list);
  2407. }
  2408. cleanup:
  2409. if (list) {
  2410. Py_DECREF(list);
  2411. }
  2412. else {
  2413. for (Py_ssize_t i = 0; i < count; i++) {
  2414. Py_DECREF(out[i]);
  2415. }
  2416. }
  2417. return result;
  2418. }
  2419. static Py_hash_t
  2420. pattern_hash(PatternObject *self)
  2421. {
  2422. Py_hash_t hash, hash2;
  2423. hash = PyObject_Hash(self->pattern);
  2424. if (hash == -1) {
  2425. return -1;
  2426. }
  2427. hash2 = _Py_HashBytes(self->code, sizeof(self->code[0]) * self->codesize);
  2428. hash ^= hash2;
  2429. hash ^= self->flags;
  2430. hash ^= self->isbytes;
  2431. hash ^= self->codesize;
  2432. if (hash == -1) {
  2433. hash = -2;
  2434. }
  2435. return hash;
  2436. }
  2437. static PyObject*
  2438. pattern_richcompare(PyObject *lefto, PyObject *righto, int op)
  2439. {
  2440. PyTypeObject *tp = Py_TYPE(lefto);
  2441. _sremodulestate *module_state = get_sre_module_state_by_class(tp);
  2442. PatternObject *left, *right;
  2443. int cmp;
  2444. if (op != Py_EQ && op != Py_NE) {
  2445. Py_RETURN_NOTIMPLEMENTED;
  2446. }
  2447. if (!Py_IS_TYPE(righto, module_state->Pattern_Type))
  2448. {
  2449. Py_RETURN_NOTIMPLEMENTED;
  2450. }
  2451. if (lefto == righto) {
  2452. /* a pattern is equal to itself */
  2453. return PyBool_FromLong(op == Py_EQ);
  2454. }
  2455. left = (PatternObject *)lefto;
  2456. right = (PatternObject *)righto;
  2457. cmp = (left->flags == right->flags
  2458. && left->isbytes == right->isbytes
  2459. && left->codesize == right->codesize);
  2460. if (cmp) {
  2461. /* Compare the code and the pattern because the same pattern can
  2462. produce different codes depending on the locale used to compile the
  2463. pattern when the re.LOCALE flag is used. Don't compare groups,
  2464. indexgroup nor groupindex: they are derivated from the pattern. */
  2465. cmp = (memcmp(left->code, right->code,
  2466. sizeof(left->code[0]) * left->codesize) == 0);
  2467. }
  2468. if (cmp) {
  2469. cmp = PyObject_RichCompareBool(left->pattern, right->pattern,
  2470. Py_EQ);
  2471. if (cmp < 0) {
  2472. return NULL;
  2473. }
  2474. }
  2475. if (op == Py_NE) {
  2476. cmp = !cmp;
  2477. }
  2478. return PyBool_FromLong(cmp);
  2479. }
  2480. #include "clinic/sre.c.h"
  2481. static PyMethodDef pattern_methods[] = {
  2482. _SRE_SRE_PATTERN_MATCH_METHODDEF
  2483. _SRE_SRE_PATTERN_FULLMATCH_METHODDEF
  2484. _SRE_SRE_PATTERN_SEARCH_METHODDEF
  2485. _SRE_SRE_PATTERN_SUB_METHODDEF
  2486. _SRE_SRE_PATTERN_SUBN_METHODDEF
  2487. _SRE_SRE_PATTERN_FINDALL_METHODDEF
  2488. _SRE_SRE_PATTERN_SPLIT_METHODDEF
  2489. _SRE_SRE_PATTERN_FINDITER_METHODDEF
  2490. _SRE_SRE_PATTERN_SCANNER_METHODDEF
  2491. _SRE_SRE_PATTERN___COPY___METHODDEF
  2492. _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF
  2493. {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
  2494. PyDoc_STR("See PEP 585")},
  2495. {NULL, NULL}
  2496. };
  2497. static PyGetSetDef pattern_getset[] = {
  2498. {"groupindex", (getter)pattern_groupindex, (setter)NULL,
  2499. "A dictionary mapping group names to group numbers."},
  2500. {NULL} /* Sentinel */
  2501. };
  2502. #define PAT_OFF(x) offsetof(PatternObject, x)
  2503. static PyMemberDef pattern_members[] = {
  2504. {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY,
  2505. "The pattern string from which the RE object was compiled."},
  2506. {"flags", T_INT, PAT_OFF(flags), READONLY,
  2507. "The regex matching flags."},
  2508. {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY,
  2509. "The number of capturing groups in the pattern."},
  2510. {"__weaklistoffset__", T_PYSSIZET, offsetof(PatternObject, weakreflist), READONLY},
  2511. {NULL} /* Sentinel */
  2512. };
  2513. static PyType_Slot pattern_slots[] = {
  2514. {Py_tp_dealloc, (destructor)pattern_dealloc},
  2515. {Py_tp_repr, (reprfunc)pattern_repr},
  2516. {Py_tp_hash, (hashfunc)pattern_hash},
  2517. {Py_tp_doc, (void *)pattern_doc},
  2518. {Py_tp_richcompare, pattern_richcompare},
  2519. {Py_tp_methods, pattern_methods},
  2520. {Py_tp_members, pattern_members},
  2521. {Py_tp_getset, pattern_getset},
  2522. {Py_tp_traverse, pattern_traverse},
  2523. {Py_tp_clear, pattern_clear},
  2524. {0, NULL},
  2525. };
  2526. static PyType_Spec pattern_spec = {
  2527. .name = "re.Pattern",
  2528. .basicsize = sizeof(PatternObject),
  2529. .itemsize = sizeof(SRE_CODE),
  2530. .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
  2531. Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
  2532. .slots = pattern_slots,
  2533. };
  2534. static PyMethodDef match_methods[] = {
  2535. {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
  2536. _SRE_SRE_MATCH_START_METHODDEF
  2537. _SRE_SRE_MATCH_END_METHODDEF
  2538. _SRE_SRE_MATCH_SPAN_METHODDEF
  2539. _SRE_SRE_MATCH_GROUPS_METHODDEF
  2540. _SRE_SRE_MATCH_GROUPDICT_METHODDEF
  2541. _SRE_SRE_MATCH_EXPAND_METHODDEF
  2542. _SRE_SRE_MATCH___COPY___METHODDEF
  2543. _SRE_SRE_MATCH___DEEPCOPY___METHODDEF
  2544. {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS,
  2545. PyDoc_STR("See PEP 585")},
  2546. {NULL, NULL}
  2547. };
  2548. static PyGetSetDef match_getset[] = {
  2549. {"lastindex", (getter)match_lastindex_get, (setter)NULL,
  2550. "The integer index of the last matched capturing group."},
  2551. {"lastgroup", (getter)match_lastgroup_get, (setter)NULL,
  2552. "The name of the last matched capturing group."},
  2553. {"regs", (getter)match_regs_get, (setter)NULL},
  2554. {NULL}
  2555. };
  2556. #define MATCH_OFF(x) offsetof(MatchObject, x)
  2557. static PyMemberDef match_members[] = {
  2558. {"string", T_OBJECT, MATCH_OFF(string), READONLY,
  2559. "The string passed to match() or search()."},
  2560. {"re", T_OBJECT, MATCH_OFF(pattern), READONLY,
  2561. "The regular expression object."},
  2562. {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY,
  2563. "The index into the string at which the RE engine started looking for a match."},
  2564. {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY,
  2565. "The index into the string beyond which the RE engine will not go."},
  2566. {NULL}
  2567. };
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
  2570. static PyType_Slot match_slots[] = {
  2571. {Py_tp_dealloc, match_dealloc},
  2572. {Py_tp_repr, match_repr},
  2573. {Py_tp_doc, (void *)match_doc},
  2574. {Py_tp_methods, match_methods},
  2575. {Py_tp_members, match_members},
  2576. {Py_tp_getset, match_getset},
  2577. {Py_tp_traverse, match_traverse},
  2578. {Py_tp_clear, match_clear},
  2579. /* As mapping.
  2580. *
  2581. * Match objects do not support length or assignment, but do support
  2582. * __getitem__.
  2583. */
  2584. {Py_mp_subscript, match_getitem},
  2585. {0, NULL},
  2586. };
  2587. static PyType_Spec match_spec = {
  2588. .name = "re.Match",
  2589. .basicsize = sizeof(MatchObject),
  2590. .itemsize = sizeof(Py_ssize_t),
  2591. .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
  2592. Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
  2593. .slots = match_slots,
  2594. };
  2595. static PyMethodDef scanner_methods[] = {
  2596. _SRE_SRE_SCANNER_MATCH_METHODDEF
  2597. _SRE_SRE_SCANNER_SEARCH_METHODDEF
  2598. {NULL, NULL}
  2599. };
  2600. #define SCAN_OFF(x) offsetof(ScannerObject, x)
  2601. static PyMemberDef scanner_members[] = {
  2602. {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
  2603. {NULL} /* Sentinel */
  2604. };
  2605. static PyType_Slot scanner_slots[] = {
  2606. {Py_tp_dealloc, scanner_dealloc},
  2607. {Py_tp_methods, scanner_methods},
  2608. {Py_tp_members, scanner_members},
  2609. {Py_tp_traverse, scanner_traverse},
  2610. {Py_tp_clear, scanner_clear},
  2611. {0, NULL},
  2612. };
  2613. static PyType_Spec scanner_spec = {
  2614. .name = "_sre.SRE_Scanner",
  2615. .basicsize = sizeof(ScannerObject),
  2616. .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
  2617. Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
  2618. .slots = scanner_slots,
  2619. };
  2620. static PyType_Slot template_slots[] = {
  2621. {Py_tp_dealloc, template_dealloc},
  2622. {Py_tp_traverse, template_traverse},
  2623. {Py_tp_clear, template_clear},
  2624. {0, NULL},
  2625. };
  2626. static PyType_Spec template_spec = {
  2627. .name = "_sre.SRE_Template",
  2628. .basicsize = sizeof(TemplateObject),
  2629. .itemsize = sizeof(((TemplateObject *)0)->items[0]),
  2630. .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE |
  2631. Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC),
  2632. .slots = template_slots,
  2633. };
  2634. static PyMethodDef _functions[] = {
  2635. _SRE_COMPILE_METHODDEF
  2636. _SRE_TEMPLATE_METHODDEF
  2637. _SRE_GETCODESIZE_METHODDEF
  2638. _SRE_ASCII_ISCASED_METHODDEF
  2639. _SRE_UNICODE_ISCASED_METHODDEF
  2640. _SRE_ASCII_TOLOWER_METHODDEF
  2641. _SRE_UNICODE_TOLOWER_METHODDEF
  2642. {NULL, NULL}
  2643. };
  2644. static int
  2645. sre_traverse(PyObject *module, visitproc visit, void *arg)
  2646. {
  2647. _sremodulestate *state = get_sre_module_state(module);
  2648. Py_VISIT(state->Pattern_Type);
  2649. Py_VISIT(state->Match_Type);
  2650. Py_VISIT(state->Scanner_Type);
  2651. Py_VISIT(state->Template_Type);
  2652. Py_VISIT(state->compile_template);
  2653. return 0;
  2654. }
  2655. static int
  2656. sre_clear(PyObject *module)
  2657. {
  2658. _sremodulestate *state = get_sre_module_state(module);
  2659. Py_CLEAR(state->Pattern_Type);
  2660. Py_CLEAR(state->Match_Type);
  2661. Py_CLEAR(state->Scanner_Type);
  2662. Py_CLEAR(state->Template_Type);
  2663. Py_CLEAR(state->compile_template);
  2664. return 0;
  2665. }
  2666. static void
  2667. sre_free(void *module)
  2668. {
  2669. sre_clear((PyObject *)module);
  2670. }
  2671. #define CREATE_TYPE(m, type, spec) \
  2672. do { \
  2673. type = (PyTypeObject *)PyType_FromModuleAndSpec(m, spec, NULL); \
  2674. if (type == NULL) { \
  2675. goto error; \
  2676. } \
  2677. } while (0)
  2678. #define ADD_ULONG_CONSTANT(module, name, value) \
  2679. do { \
  2680. PyObject *o = PyLong_FromUnsignedLong(value); \
  2681. if (!o) \
  2682. goto error; \
  2683. int res = PyModule_AddObjectRef(module, name, o); \
  2684. Py_DECREF(o); \
  2685. if (res < 0) { \
  2686. goto error; \
  2687. } \
  2688. } while (0)
  2689. static int
  2690. sre_exec(PyObject *m)
  2691. {
  2692. _sremodulestate *state;
  2693. /* Create heap types */
  2694. state = get_sre_module_state(m);
  2695. CREATE_TYPE(m, state->Pattern_Type, &pattern_spec);
  2696. CREATE_TYPE(m, state->Match_Type, &match_spec);
  2697. CREATE_TYPE(m, state->Scanner_Type, &scanner_spec);
  2698. CREATE_TYPE(m, state->Template_Type, &template_spec);
  2699. if (PyModule_AddIntConstant(m, "MAGIC", SRE_MAGIC) < 0) {
  2700. goto error;
  2701. }
  2702. if (PyModule_AddIntConstant(m, "CODESIZE", sizeof(SRE_CODE)) < 0) {
  2703. goto error;
  2704. }
  2705. ADD_ULONG_CONSTANT(m, "MAXREPEAT", SRE_MAXREPEAT);
  2706. ADD_ULONG_CONSTANT(m, "MAXGROUPS", SRE_MAXGROUPS);
  2707. if (PyModule_AddStringConstant(m, "copyright", copyright) < 0) {
  2708. goto error;
  2709. }
  2710. return 0;
  2711. error:
  2712. return -1;
  2713. }
  2714. static PyModuleDef_Slot sre_slots[] = {
  2715. {Py_mod_exec, sre_exec},
  2716. {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
  2717. {0, NULL},
  2718. };
  2719. static struct PyModuleDef sremodule = {
  2720. .m_base = PyModuleDef_HEAD_INIT,
  2721. .m_name = "_sre",
  2722. .m_size = sizeof(_sremodulestate),
  2723. .m_methods = _functions,
  2724. .m_slots = sre_slots,
  2725. .m_traverse = sre_traverse,
  2726. .m_free = sre_free,
  2727. .m_clear = sre_clear,
  2728. };
  2729. PyMODINIT_FUNC
  2730. PyInit__sre(void)
  2731. {
  2732. return PyModuleDef_Init(&sremodule);
  2733. }
  2734. /* vim:ts=4:sw=4:et
  2735. */