regex.h 84 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2002-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: regex.h
  9. * encoding: UTF-8
  10. * indentation:4
  11. *
  12. * created on: 2002oct22
  13. * created by: Andy Heninger
  14. *
  15. * ICU Regular Expressions, API for C++
  16. */
  17. #ifndef REGEX_H
  18. #define REGEX_H
  19. //#define REGEX_DEBUG
  20. /**
  21. * \file
  22. * \brief C++ API: Regular Expressions
  23. *
  24. * The ICU API for processing regular expressions consists of two classes,
  25. * `RegexPattern` and `RegexMatcher`.
  26. * `RegexPattern` objects represent a pre-processed, or compiled
  27. * regular expression. They are created from a regular expression pattern string,
  28. * and can be used to create `RegexMatcher` objects for the pattern.
  29. *
  30. * Class `RegexMatcher` bundles together a regular expression
  31. * pattern and a target string to which the search pattern will be applied.
  32. * `RegexMatcher` includes API for doing plain find or search
  33. * operations, for search and replace operations, and for obtaining detailed
  34. * information about bounds of a match.
  35. *
  36. * Note that by constructing `RegexMatcher` objects directly from regular
  37. * expression pattern strings application code can be simplified and the explicit
  38. * need for `RegexPattern` objects can usually be eliminated.
  39. *
  40. */
  41. #include "unicode/utypes.h"
  42. #if U_SHOW_CPLUSPLUS_API
  43. #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  44. #include "unicode/uobject.h"
  45. #include "unicode/unistr.h"
  46. #include "unicode/utext.h"
  47. #include "unicode/parseerr.h"
  48. #include "unicode/uregex.h"
  49. // Forward Declarations
  50. struct UHashtable;
  51. U_NAMESPACE_BEGIN
  52. struct Regex8BitSet;
  53. class RegexCImpl;
  54. class RegexMatcher;
  55. class RegexPattern;
  56. struct REStackFrame;
  57. class BreakIterator;
  58. class UnicodeSet;
  59. class UVector;
  60. class UVector32;
  61. class UVector64;
  62. /**
  63. * Class `RegexPattern` represents a compiled regular expression. It includes
  64. * factory methods for creating a RegexPattern object from the source (string) form
  65. * of a regular expression, methods for creating RegexMatchers that allow the pattern
  66. * to be applied to input text, and a few convenience methods for simple common
  67. * uses of regular expressions.
  68. *
  69. * Class RegexPattern is not intended to be subclassed.
  70. *
  71. * @stable ICU 2.4
  72. */
  73. class U_I18N_API RegexPattern final : public UObject {
  74. public:
  75. /**
  76. * Default constructor. Create a RegexPattern object that refers to no actual
  77. * pattern. Not normally needed; RegexPattern objects are usually
  78. * created using the factory method `compile()`.
  79. *
  80. * @stable ICU 2.4
  81. */
  82. RegexPattern();
  83. /**
  84. * Copy Constructor. Create a new RegexPattern object that is equivalent
  85. * to the source object.
  86. * @param source the pattern object to be copied.
  87. * @stable ICU 2.4
  88. */
  89. RegexPattern(const RegexPattern &source);
  90. /**
  91. * Destructor. Note that a RegexPattern object must persist so long as any
  92. * RegexMatcher objects that were created from the RegexPattern are active.
  93. * @stable ICU 2.4
  94. */
  95. virtual ~RegexPattern();
  96. /**
  97. * Comparison operator. Two RegexPattern objects are considered equal if they
  98. * were constructed from identical source patterns using the same #URegexpFlag
  99. * settings.
  100. * @param that a RegexPattern object to compare with "this".
  101. * @return true if the objects are equivalent.
  102. * @stable ICU 2.4
  103. */
  104. bool operator==(const RegexPattern& that) const;
  105. /**
  106. * Inequality operator. Two RegexPattern objects are considered equal if they
  107. * were constructed from identical source patterns using the same #URegexpFlag
  108. * settings.
  109. * @param that a RegexPattern object to compare with "this".
  110. * @return true if the objects are different.
  111. * @stable ICU 2.4
  112. */
  113. inline bool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
  114. /**
  115. * Assignment operator. After assignment, this RegexPattern will behave identically
  116. * to the source object.
       * @param source the pattern object to be assigned from.
  117. * @stable ICU 2.4
  118. */
  119. RegexPattern &operator =(const RegexPattern &source);
  120. /**
  121. * Create an exact copy of this RegexPattern object. Since RegexPattern is not
  122. * intended to be subclassed, <code>clone()</code> and the copy construction are
  123. * equivalent operations.
  124. * @return the copy of this RegexPattern
  125. * @stable ICU 2.4
  126. */
  127. virtual RegexPattern *clone() const;
  128. /**
  129. * Compiles the regular expression in string form into a RegexPattern
  130. * object. These compile methods, rather than the constructors, are the usual
  131. * way that RegexPattern objects are created.
  132. *
  133. * Note that RegexPattern objects must not be deleted while RegexMatcher
  134. * objects created from the pattern are active. RegexMatchers keep a pointer
  135. * back to their pattern, so premature deletion of the pattern is a
  136. * catastrophic error.
  137. *
  138. * All #URegexpFlag pattern match mode flags are set to their default values.
  139. *
  140. * Note that it is often more convenient to construct a RegexMatcher directly
  141. * from a pattern string rather than separately compiling the pattern and
  142. * then creating a RegexMatcher object from the pattern.
  143. *
  144. * @param regex The regular expression to be compiled.
  145. * @param pe Receives the position (line and column numbers) of any error
  146. * within the regular expression.
  147. * @param status A reference to a UErrorCode to receive any errors.
  148. * @return A regexPattern object for the compiled pattern.
  149. *
  150. * @stable ICU 2.4
  151. */
  152. static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
  153. UParseError &pe,
  154. UErrorCode &status);
  155. /**
  156. * Compiles the regular expression in string form into a RegexPattern
  157. * object. These compile methods, rather than the constructors, are the usual
  158. * way that RegexPattern objects are created.
  159. *
  160. * Note that RegexPattern objects must not be deleted while RegexMatcher
  161. * objects created from the pattern are active. RegexMatchers keep a pointer
  162. * back to their pattern, so premature deletion of the pattern is a
  163. * catastrophic error.
  164. *
  165. * All #URegexpFlag pattern match mode flags are set to their default values.
  166. *
  167. * Note that it is often more convenient to construct a RegexMatcher directly
  168. * from a pattern string rather than separately compiling the pattern and
  169. * then creating a RegexMatcher object from the pattern.
  170. *
  171. * @param regex The regular expression to be compiled. Note, the text referred
  172. * to by this UText must not be deleted during the lifetime of the
  173. * RegexPattern object or any RegexMatcher object created from it.
  174. * @param pe Receives the position (line and column numbers) of any error
  175. * within the regular expression.
  176. * @param status A reference to a UErrorCode to receive any errors.
  177. * @return A regexPattern object for the compiled pattern.
  178. *
  179. * @stable ICU 4.6
  180. */
  181. static RegexPattern * U_EXPORT2 compile( UText *regex,
  182. UParseError &pe,
  183. UErrorCode &status);
  184. /**
  185. * Compiles the regular expression in string form into a RegexPattern
  186. * object using the specified #URegexpFlag match mode flags. These compile methods,
  187. * rather than the constructors, are the usual way that RegexPattern objects
  188. * are created.
  189. *
  190. * Note that RegexPattern objects must not be deleted while RegexMatcher
  191. * objects created from the pattern are active. RegexMatchers keep a pointer
  192. * back to their pattern, so premature deletion of the pattern is a
  193. * catastrophic error.
  194. *
  195. * Note that it is often more convenient to construct a RegexMatcher directly
  196. * from a pattern string rather than separately compiling the pattern and
  197. * then creating a RegexMatcher object from the pattern.
  198. *
  199. * @param regex The regular expression to be compiled.
  200. * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
  201. * @param pe Receives the position (line and column numbers) of any error
  202. * within the regular expression.
  203. * @param status A reference to a UErrorCode to receive any errors.
  204. * @return A regexPattern object for the compiled pattern.
  205. *
  206. * @stable ICU 2.4
  207. */
  208. static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
  209. uint32_t flags,
  210. UParseError &pe,
  211. UErrorCode &status);
  212. /**
  213. * Compiles the regular expression in string form into a RegexPattern
  214. * object using the specified #URegexpFlag match mode flags. These compile methods,
  215. * rather than the constructors, are the usual way that RegexPattern objects
  216. * are created.
  217. *
  218. * Note that RegexPattern objects must not be deleted while RegexMatcher
  219. * objects created from the pattern are active. RegexMatchers keep a pointer
  220. * back to their pattern, so premature deletion of the pattern is a
  221. * catastrophic error.
  222. *
  223. * Note that it is often more convenient to construct a RegexMatcher directly
  224. * from a pattern string rather than separately compiling the pattern and
  225. * then creating a RegexMatcher object from the pattern.
  226. *
  227. * @param regex The regular expression to be compiled. Note, the text referred
  228. * to by this UText must not be deleted during the lifetime of the
  229. * RegexPattern object or any RegexMatcher object created from it.
  230. * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
  231. * @param pe Receives the position (line and column numbers) of any error
  232. * within the regular expression.
  233. * @param status A reference to a UErrorCode to receive any errors.
  234. * @return A regexPattern object for the compiled pattern.
  235. *
  236. * @stable ICU 4.6
  237. */
  238. static RegexPattern * U_EXPORT2 compile( UText *regex,
  239. uint32_t flags,
  240. UParseError &pe,
  241. UErrorCode &status);
  242. /**
  243. * Compiles the regular expression in string form into a RegexPattern
  244. * object using the specified #URegexpFlag match mode flags. These compile methods,
  245. * rather than the constructors, are the usual way that RegexPattern objects
  246. * are created.
  247. *
  248. * Note that RegexPattern objects must not be deleted while RegexMatcher
  249. * objects created from the pattern are active. RegexMatchers keep a pointer
  250. * back to their pattern, so premature deletion of the pattern is a
  251. * catastrophic error.
  252. *
  253. * Note that it is often more convenient to construct a RegexMatcher directly
  254. * from a pattern string rather than separately compiling the pattern and
  255. * then creating a RegexMatcher object from the pattern.
  256. *
  257. * @param regex The regular expression to be compiled.
  258. * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
  259. * @param status A reference to a UErrorCode to receive any errors.
  260. * @return A regexPattern object for the compiled pattern.
  261. *
  262. * @stable ICU 2.6
  263. */
  264. static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
  265. uint32_t flags,
  266. UErrorCode &status);
  267. /**
  268. * Compiles the regular expression in string form into a RegexPattern
  269. * object using the specified #URegexpFlag match mode flags. These compile methods,
  270. * rather than the constructors, are the usual way that RegexPattern objects
  271. * are created.
  272. *
  273. * Note that RegexPattern objects must not be deleted while RegexMatcher
  274. * objects created from the pattern are active. RegexMatchers keep a pointer
  275. * back to their pattern, so premature deletion of the pattern is a
  276. * catastrophic error.
  277. *
  278. * Note that it is often more convenient to construct a RegexMatcher directly
  279. * from a pattern string rather than separately compiling the pattern and
  280. * then creating a RegexMatcher object from the pattern.
  281. *
  282. * @param regex The regular expression to be compiled. Note, the text referred
  283. * to by this UText must not be deleted during the lifetime of the
  284. * RegexPattern object or any RegexMatcher object created from it.
  285. * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
  286. * @param status A reference to a UErrorCode to receive any errors.
  287. * @return A regexPattern object for the compiled pattern.
  288. *
  289. * @stable ICU 4.6
  290. */
  291. static RegexPattern * U_EXPORT2 compile( UText *regex,
  292. uint32_t flags,
  293. UErrorCode &status);
  294. /**
  295. * Get the #URegexpFlag match mode flags that were used when compiling this pattern.
  296. * @return the #URegexpFlag match mode flags
  297. * @stable ICU 2.4
  298. */
  299. virtual uint32_t flags() const;
  300. /**
  301. * Creates a RegexMatcher that will match the given input against this pattern. The
  302. * RegexMatcher can then be used to perform match, find or replace operations
  303. * on the input. Note that a RegexPattern object must not be deleted while
  304. * RegexMatchers created from it still exist and might possibly be used again.
  305. *
  306. * The matcher will retain a reference to the supplied input string, and all regexp
  307. * pattern matching operations happen directly on this original string. It is
  308. * critical that the string not be altered or deleted before use by the regular
  309. * expression operations is complete.
  310. *
  311. * @param input The input string to which the regular expression will be applied.
  312. * @param status A reference to a UErrorCode to receive any errors.
  313. * @return A RegexMatcher object for this pattern and input.
  314. *
  315. * @stable ICU 2.4
  316. */
  317. virtual RegexMatcher *matcher(const UnicodeString &input,
  318. UErrorCode &status) const;
  319. private:
  320. /**
  321. * Cause a compilation error if an application accidentally attempts to
  322. * create a matcher with a (char16_t *) string as input rather than
  323. * a UnicodeString. Avoids a dangling reference to a temporary string.
  324. *
  325. * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
  326. * using one of the aliasing constructors, such as
  327. * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
  328. * or in a UText, using
  329. * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
  330. *
  331. */
  332. RegexMatcher *matcher(const char16_t *input,
  333. UErrorCode &status) const = delete;
  334. public:
  335. /**
  336. * Creates a RegexMatcher that will match against this pattern. The
  337. * RegexMatcher can be used to perform match, find or replace operations.
  338. * Note that a RegexPattern object must not be deleted while
  339. * RegexMatchers created from it still exist and might possibly be used again.
  340. *
  341. * @param status A reference to a UErrorCode to receive any errors.
  342. * @return A RegexMatcher object for this pattern; no input text is supplied to this overload.
  343. *
  344. * @stable ICU 2.6
  345. */
  346. virtual RegexMatcher *matcher(UErrorCode &status) const;
  347. /**
  348. * Test whether a string matches a regular expression. This convenience function
  349. * both compiles the regular expression and applies it in a single operation.
  350. * Note that if the same pattern needs to be applied repeatedly, this method will be
  351. * less efficient than creating and reusing a RegexMatcher object.
  352. *
  353. * @param regex The regular expression
  354. * @param input The string data to be matched
  355. * @param pe Receives the position of any syntax errors within the regular expression
  356. * @param status A reference to a UErrorCode to receive any errors.
  357. * @return True if the regular expression exactly matches the full input string.
  358. *
  359. * @stable ICU 2.4
  360. */
  361. static UBool U_EXPORT2 matches(const UnicodeString &regex,
  362. const UnicodeString &input,
  363. UParseError &pe,
  364. UErrorCode &status);
  365. /**
  366. * Test whether a string matches a regular expression. This convenience function
  367. * both compiles the regular expression and applies it in a single operation.
  368. * Note that if the same pattern needs to be applied repeatedly, this method will be
  369. * less efficient than creating and reusing a RegexMatcher object.
  370. *
  371. * @param regex The regular expression
  372. * @param input The string data to be matched
  373. * @param pe Receives the position of any syntax errors within the regular expression
  374. * @param status A reference to a UErrorCode to receive any errors.
  375. * @return True if the regular expression exactly matches the full input string.
  376. *
  377. * @stable ICU 4.6
  378. */
  379. static UBool U_EXPORT2 matches(UText *regex,
  380. UText *input,
  381. UParseError &pe,
  382. UErrorCode &status);
  383. /**
  384. * Returns the regular expression from which this pattern was compiled. This method will work
  385. * even if the pattern was compiled from a UText.
  386. *
  387. * Note: If the pattern was originally compiled from a UText, and that UText was modified,
  388. * the returned string may no longer reflect the RegexPattern object.
       * @return the regular expression from which this pattern was compiled, as a UnicodeString.
  389. * @stable ICU 2.4
  390. */
  391. virtual UnicodeString pattern() const;
  392. /**
  393. * Returns the regular expression from which this pattern was compiled. This method will work
  394. * even if the pattern was compiled from a UnicodeString.
  395. *
  396. * Note: This is the original input, not a clone. If the pattern was originally compiled from a
  397. * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
  398. * object.
  399. *
       * @param status A reference to a UErrorCode to receive any errors.
       * @return the regular expression from which this pattern was compiled, as a UText
       * (the original input, not a clone).
  400. * @stable ICU 4.6
  401. */
  402. virtual UText *patternText(UErrorCode &status) const;
  403. /**
  404. * Get the group number corresponding to a named capture group.
  405. * The returned number can be used with any function that accesses
  406. * capture groups by number.
  407. *
  408. * The function returns an error status if the specified name does not
  409. * appear in the pattern.
  410. *
  411. * @param groupName The capture group name.
  412. * @param status A UErrorCode to receive any errors.
       * @return the group number corresponding to the named capture group.
  413. *
  414. * @stable ICU 55
  415. */
  416. virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
  417. /**
  418. * Get the group number corresponding to a named capture group.
  419. * The returned number can be used with any function that accesses
  420. * capture groups by number.
  421. *
  422. * The function returns an error status if the specified name does not
  423. * appear in the pattern.
  424. *
  425. * @param groupName The capture group name,
  426. * platform invariant characters only.
  427. * @param nameLength The length of the name, or -1 if the name is
  428. * nul-terminated.
  429. * @param status A UErrorCode to receive any errors.
       * @return the group number corresponding to the named capture group.
  430. *
  431. * @stable ICU 55
  432. */
  433. virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
  434. /**
  435. * Split a string into fields. Somewhat like split() from Perl or Java.
  436. * Pattern matches identify delimiters that separate the input
  437. * into fields. The input data between the delimiters becomes the
  438. * fields themselves.
  439. *
  440. * If the delimiter pattern includes capture groups, the captured text will
  441. * also appear in the destination array of output strings, interspersed
  442. * with the fields. This is similar to Perl, but differs from Java,
  443. * which ignores the presence of capture groups in the pattern.
  444. *
  445. * Trailing empty fields will always be returned, assuming sufficient
  446. * destination capacity. This differs from the default behavior for Java
  447. * and Perl where trailing empty fields are not returned.
  448. *
  449. * The number of strings produced by the split operation is returned.
  450. * This count includes the strings from capture groups in the delimiter pattern.
  451. * This behavior differs from Java, which ignores capture groups.
  452. *
  453. * For the best performance on split() operations,
  454. * <code>RegexMatcher::split</code> is preferable to this function.
  455. *
  456. * @param input The string to be split into fields. The field delimiters
  457. * match the pattern (in the "this" object)
  458. * @param dest An array of UnicodeStrings to receive the results of the split.
  459. * This is an array of actual UnicodeString objects, not an
  460. * array of pointers to strings. Local (stack based) arrays can
  461. * work well here.
  462. * @param destCapacity The number of elements in the destination array.
  463. * If the number of fields found is less than destCapacity, the
  464. * extra strings in the destination array are not altered.
  465. * If the number of destination strings is less than the number
  466. * of fields, the trailing part of the input string, including any
  467. * field delimiters, is placed in the last destination string.
  468. * @param status A reference to a UErrorCode to receive any errors.
  469. * @return The number of fields into which the input string was split.
  470. * @stable ICU 2.4
  471. */
  472. virtual int32_t split(const UnicodeString &input,
  473. UnicodeString dest[],
  474. int32_t destCapacity,
  475. UErrorCode &status) const;
  476. /**
  477. * Split a string into fields. Somewhat like %split() from Perl or Java.
  478. * Pattern matches identify delimiters that separate the input
  479. * into fields. The input data between the delimiters becomes the
  480. * fields themselves.
  481. *
  482. * If the delimiter pattern includes capture groups, the captured text will
  483. * also appear in the destination array of output strings, interspersed
  484. * with the fields. This is similar to Perl, but differs from Java,
  485. * which ignores the presence of capture groups in the pattern.
  486. *
  487. * Trailing empty fields will always be returned, assuming sufficient
  488. * destination capacity. This differs from the default behavior for Java
  489. * and Perl where trailing empty fields are not returned.
  490. *
  491. * The number of strings produced by the split operation is returned.
  492. * This count includes the strings from capture groups in the delimiter pattern.
  493. * This behavior differs from Java, which ignores capture groups.
  494. *
  495. * For the best performance on split() operations,
  496. * `RegexMatcher::split()` is preferable to this function.
  497. *
  498. * @param input The string to be split into fields. The field delimiters
  499. * match the pattern (in the "this" object)
  500. * @param dest An array of mutable UText structs to receive the results of the split.
  501. * If a field is nullptr, a new UText is allocated to contain the results for
  502. * that field. This new UText is not guaranteed to be mutable.
  503. * @param destCapacity The number of elements in the destination array.
  504. * If the number of fields found is less than destCapacity, the
  505. * extra strings in the destination array are not altered.
  506. * If the number of destination strings is less than the number
  507. * of fields, the trailing part of the input string, including any
  508. * field delimiters, is placed in the last destination string.
  509. * @param status A reference to a UErrorCode to receive any errors.
  510. * @return The number of destination strings used.
  511. *
  512. * @stable ICU 4.6
  513. */
  514. virtual int32_t split(UText *input,
  515. UText *dest[],
  516. int32_t destCapacity,
  517. UErrorCode &status) const;
  518. /**
  519. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  520. *
  521. * @stable ICU 2.4
  522. */
  523. virtual UClassID getDynamicClassID() const override;
  524. /**
  525. * ICU "poor man's RTTI", returns a UClassID for this class.
  526. *
  527. * @stable ICU 2.4
  528. */
  529. static UClassID U_EXPORT2 getStaticClassID();
  530. private:
  531. //
  532. // Implementation Data
  533. //
  534. UText *fPattern; // The original pattern string.
  535. UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
  536. uint32_t fFlags; // The flags used when compiling the pattern.
  537. //
  538. UVector64 *fCompiledPat; // The compiled pattern p-code.
  539. UnicodeString fLiteralText; // Any literal string data from the pattern,
  540. // after un-escaping, for use during the match.
  541. UVector *fSets; // Any UnicodeSets referenced from the pattern.
  542. Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
  543. UErrorCode fDeferredStatus; // status if some prior error has left this
  544. // RegexPattern in an unusable state.
  545. int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
  546. // >= this value. For some patterns, this calculated
  547. // value may be less than the true shortest
  548. // possible match.
  549. int32_t fFrameSize; // Size of a state stack frame in the
  550. // execution engine.
  551. int32_t fDataSize; // The size of the data needed by the pattern that
  552. // does not go on the state stack, but has just
  553. // a single copy per matcher.
  554. UVector32 *fGroupMap; // Map from capture group number to position of
  555. // the group's variables in the matcher stack frame.
  556. int32_t fStartType; // Info on how a match must start.
  557. int32_t fInitialStringIdx; // NOTE(review): presumably the index into fLiteralText of a required initial string - confirm.
  558. int32_t fInitialStringLen;
  559. UnicodeSet *fInitialChars;
  560. UChar32 fInitialChar;
  561. Regex8BitSet *fInitialChars8;
  562. UBool fNeedsAltInput;
  563. UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
  564. friend class RegexCompile;
  565. friend class RegexMatcher;
  566. friend class RegexCImpl;
  567. //
  568. // Implementation Methods
  569. //
  570. void init(); // Common initialization, for use by constructors.
  571. bool initNamedCaptureMap(); // Lazy init for fNamedCaptureMap.
  572. void zap(); // Common cleanup
  573. void dumpOp(int32_t index) const;
  574. public:
  575. #ifndef U_HIDE_INTERNAL_API
  576. /**
  577. * Dump a compiled pattern. Internal debug function.
  578. * @internal
  579. */
  580. void dumpPattern() const;
  581. #endif /* U_HIDE_INTERNAL_API */
  582. };
  583. /**
  584. * class RegexMatcher bundles together a regular expression pattern and
  585. * input text to which the expression can be applied. It includes methods
  586. * for testing for matches, and for find and replace operations.
  587. *
  588. * <p>Class RegexMatcher is not intended to be subclassed.</p>
  589. *
  590. * @stable ICU 2.4
  591. */
  592. class U_I18N_API RegexMatcher final : public UObject {
  593. public:
  594. /**
  595. * Construct a RegexMatcher for a regular expression.
  596. * This is a convenience method that avoids the need to explicitly create
  597. * a RegexPattern object. Note that if several RegexMatchers need to be
  598. * created for the same expression, it will be more efficient to
  599. * separately create and cache a RegexPattern object, and use
  600. * its matcher() method to create the RegexMatcher objects.
  601. *
  602. * @param regexp The Regular Expression to be compiled.
  603. * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
  604. * @param status Any errors are reported by setting this UErrorCode variable.
  605. * @stable ICU 2.6
  606. */
  607. RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
  608. /**
  609. * Construct a RegexMatcher for a regular expression.
  610. * This is a convenience method that avoids the need to explicitly create
  611. * a RegexPattern object. Note that if several RegexMatchers need to be
  612. * created for the same expression, it will be more efficient to
  613. * separately create and cache a RegexPattern object, and use
  614. * its matcher() method to create the RegexMatcher objects.
  615. *
  616. * @param regexp The regular expression to be compiled.
  617. * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
  618. * @param status Any errors are reported by setting this UErrorCode variable.
  619. *
  620. * @stable ICU 4.6
  621. */
  622. RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
  623. /**
  624. * Construct a RegexMatcher for a regular expression.
  625. * This is a convenience method that avoids the need to explicitly create
  626. * a RegexPattern object. Note that if several RegexMatchers need to be
  627. * created for the same expression, it will be more efficient to
  628. * separately create and cache a RegexPattern object, and use
  629. * its matcher() method to create the RegexMatcher objects.
  630. *
  631. * The matcher will retain a reference to the supplied input string, and all regexp
  632. * pattern matching operations happen directly on the original string. It is
  633. * critical that the string not be altered or deleted before use by the regular
  634. * expression operations is complete.
  635. *
  636. * @param regexp The Regular Expression to be compiled.
  637. * @param input The string to match. The matcher retains a reference to the
  638. * caller's string; no copy is made.
  639. * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
  640. * @param status Any errors are reported by setting this UErrorCode variable.
  641. * @stable ICU 2.6
  642. */
  643. RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
  644. uint32_t flags, UErrorCode &status);
  645. /**
  646. * Construct a RegexMatcher for a regular expression.
  647. * This is a convenience method that avoids the need to explicitly create
  648. * a RegexPattern object. Note that if several RegexMatchers need to be
  649. * created for the same expression, it will be more efficient to
  650. * separately create and cache a RegexPattern object, and use
  651. * its matcher() method to create the RegexMatcher objects.
  652. *
  653. * The matcher will make a shallow clone of the supplied input text, and all regexp
  654. * pattern matching operations happen on this clone. While read-only operations on
  655. * the supplied text are permitted, it is critical that the underlying string not be
  656. * altered or deleted before use by the regular expression operations is complete.
  657. *
  658. * @param regexp The Regular Expression to be compiled.
  659. * @param input The string to match. The matcher retains a shallow clone of the text.
  660. * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
  661. * @param status Any errors are reported by setting this UErrorCode variable.
  662. *
  663. * @stable ICU 4.6
  664. */
  665. RegexMatcher(UText *regexp, UText *input,
  666. uint32_t flags, UErrorCode &status);
  667. private:
  668. /**
  669. * Cause a compilation error if an application accidentally attempts to
  670. * create a matcher with a (char16_t *) string as input rather than
  671. * a UnicodeString. Avoids a dangling reference to a temporary string.
  672. *
  673. * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
  674. * using one of the aliasing constructors, such as
  675. * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
  676. * or in a UText, using
  677. * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
  678. */
  679. RegexMatcher(const UnicodeString &regexp, const char16_t *input,
  680. uint32_t flags, UErrorCode &status) = delete;
  681. public:
  682. /**
  683. * Destructor.
  684. *
  685. * @stable ICU 2.4
  686. */
  687. virtual ~RegexMatcher();
  688. /**
  689. * Attempts to match the entire input region against the pattern.
  690. * @param status A reference to a UErrorCode to receive any errors.
  691. * @return true if there is a match
  692. * @stable ICU 2.4
  693. */
  694. virtual UBool matches(UErrorCode &status);
  695. /**
  696. * Resets the matcher, then attempts to match the input beginning
  697. * at the specified startIndex, and extending to the end of the input.
  698. * The input region is reset to include the entire input string.
  699. * A successful match must extend to the end of the input.
  700. * @param startIndex The input string (native) index at which to begin matching.
  701. * @param status A reference to a UErrorCode to receive any errors.
  702. * @return true if there is a match
  703. * @stable ICU 2.8
  704. */
  705. virtual UBool matches(int64_t startIndex, UErrorCode &status);
  706. /**
  707. * Attempts to match the input string, starting from the beginning of the region,
  708. * against the pattern. Like the matches() method, this function
  709. * always starts at the beginning of the input region;
  710. * unlike that function, it does not require that the entire region be matched.
  711. *
  712. * If the match succeeds then more information can be obtained via the start(),
  713. * end(), and group() functions.
  714. *
  715. * @param status A reference to a UErrorCode to receive any errors.
  716. * @return true if there is a match at the start of the input string.
  717. * @stable ICU 2.4
  718. */
  719. virtual UBool lookingAt(UErrorCode &status);
  720. /**
  721. * Attempts to match the input string, starting from the specified index, against the pattern.
  722. * The match may be of any length, and is not required to extend to the end
  723. * of the input string. Contrast with matches().
  724. *
  725. * If the match succeeds then more information can be obtained via the start(),
  726. * end(), and group() functions.
  727. *
  728. * @param startIndex The input string (native) index at which to begin matching.
  729. * @param status A reference to a UErrorCode to receive any errors.
  730. * @return true if there is a match.
  731. * @stable ICU 2.8
  732. */
  733. virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
  734. /**
  735. * Find the next pattern match in the input string.
  736. * The find begins searching the input at the location following the end of
  737. * the previous match, or at the start of the string if there is no previous match.
  738. * If a match is found, `start()`, `end()` and `group()`
  739. * will provide more information regarding the match.
  740. * Note that if the input string is changed by the application,
  741. * use find(startPos, status) instead of find(), because the saved starting
  742. * position may not be valid with the altered input string.
  743. * @return true if a match is found.
  744. * @stable ICU 2.4
  745. */
  746. virtual UBool find();
  747. /**
  748. * Find the next pattern match in the input string.
  749. * The find begins searching the input at the location following the end of
  750. * the previous match, or at the start of the string if there is no previous match.
  751. * If a match is found, `start()`, `end()` and `group()`
  752. * will provide more information regarding the match.
  753. *
  754. * Note that if the input string is changed by the application,
  755. * use find(startPos, status) instead of find(), because the saved starting
  756. * position may not be valid with the altered input string.
  757. * @param status A reference to a UErrorCode to receive any errors.
  758. * @return true if a match is found.
  759. * @stable ICU 55
  760. */
  761. virtual UBool find(UErrorCode &status);
  762. /**
  763. * Resets this RegexMatcher and then attempts to find the next substring of the
  764. * input string that matches the pattern, starting at the specified index.
  765. *
  766. * @param start The (native) index in the input string to begin the search.
  767. * @param status A reference to a UErrorCode to receive any errors.
  768. * @return true if a match is found.
  769. * @stable ICU 2.4
  770. */
  771. virtual UBool find(int64_t start, UErrorCode &status);
  772. /**
  773. * Returns a string containing the text matched by the previous match.
  774. * If the pattern can match an empty string, an empty string may be returned.
  775. * @param status A reference to a UErrorCode to receive any errors.
  776. * Possible errors are U_REGEX_INVALID_STATE if no match
  777. * has been attempted or the last match failed.
  778. * @return a string containing the matched input text.
  779. * @stable ICU 2.4
  780. */
  781. virtual UnicodeString group(UErrorCode &status) const;
  782. /**
  783. * Returns a string containing the text captured by the given group
  784. * during the previous match operation. Group(0) is the entire match.
  785. *
  786. * A zero length string is returned both for capture groups that did not
  787. * participate in the match and for actual zero length matches.
  788. * To distinguish between these two cases use the function start(),
  789. * which returns -1 for non-participating groups.
  790. *
  791. * @param groupNum the capture group number
  792. * @param status A reference to a UErrorCode to receive any errors.
  793. * Possible errors are U_REGEX_INVALID_STATE if no match
  794. * has been attempted or the last match failed and
  795. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  796. * @return the captured text
  797. * @stable ICU 2.4
  798. */
  799. virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
  800. /**
  801. * Returns the number of capturing groups in this matcher's pattern.
  802. * @return the number of capture groups
  803. * @stable ICU 2.4
  804. */
  805. virtual int32_t groupCount() const;
  806. /**
  807. * Returns a shallow clone of the entire live input string with the UText current native index
  808. * set to the beginning of the requested group.
  809. *
  810. * @param dest The UText into which the input should be cloned, or nullptr to create a new UText
  811. * @param group_len A reference to receive the length of the desired capture group
  812. * @param status A reference to a UErrorCode to receive any errors.
  813. * Possible errors are U_REGEX_INVALID_STATE if no match
  814. * has been attempted or the last match failed and
  815. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  816. * @return dest if non-nullptr, a shallow copy of the input text otherwise
  817. *
  818. * @stable ICU 4.6
  819. */
  820. virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
  821. /**
  822. * Returns a shallow clone of the entire live input string with the UText current native index
  823. * set to the beginning of the requested group.
  824. *
  825. * A group length of zero is returned both for capture groups that did not
  826. * participate in the match and for actual zero length matches.
  827. * To distinguish between these two cases use the function start(),
  828. * which returns -1 for non-participating groups.
  829. *
  830. * @param groupNum The capture group number.
  831. * @param dest The UText into which the input should be cloned, or nullptr to create a new UText.
  832. * @param group_len A reference to receive the length of the desired capture group
  833. * @param status A reference to a UErrorCode to receive any errors.
  834. * Possible errors are U_REGEX_INVALID_STATE if no match
  835. * has been attempted or the last match failed and
  836. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  837. * @return dest if non-nullptr, a shallow copy of the input text otherwise
  838. *
  839. * @stable ICU 4.6
  840. */
  841. virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
  842. /**
  843. * Returns the index in the input string of the start of the text matched
  844. * during the previous match operation.
  845. * @param status a reference to a UErrorCode to receive any errors.
  846. * @return The (native) position in the input string of the start of the last match.
  847. * @stable ICU 2.4
  848. */
  849. virtual int32_t start(UErrorCode &status) const;
  850. /**
  851. * Returns the index in the input string of the start of the text matched
  852. * during the previous match operation.
  853. * @param status a reference to a UErrorCode to receive any errors.
  854. * @return The (native) position in the input string of the start of the last match.
  855. * @stable ICU 4.6
  856. */
  857. virtual int64_t start64(UErrorCode &status) const;
  858. /**
  859. * Returns the index in the input string of the start of the text matched by the
  860. * specified capture group during the previous match operation. Return -1 if
  861. * the capture group exists in the pattern, but was not part of the last match.
  862. *
  863. * @param group the capture group number
  864. * @param status A reference to a UErrorCode to receive any errors. Possible
  865. * errors are U_REGEX_INVALID_STATE if no match has been
  866. * attempted or the last match failed, and
  867. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  868. * @return the (native) start position of substring matched by the specified group.
  869. * @stable ICU 2.4
  870. */
  871. virtual int32_t start(int32_t group, UErrorCode &status) const;
  872. /**
  873. * Returns the index in the input string of the start of the text matched by the
  874. * specified capture group during the previous match operation. Return -1 if
  875. * the capture group exists in the pattern, but was not part of the last match.
  876. *
  877. * @param group the capture group number.
  878. * @param status A reference to a UErrorCode to receive any errors. Possible
  879. * errors are U_REGEX_INVALID_STATE if no match has been
  880. * attempted or the last match failed, and
  881. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
  882. * @return the (native) start position of substring matched by the specified group.
  883. * @stable ICU 4.6
  884. */
  885. virtual int64_t start64(int32_t group, UErrorCode &status) const;
  886. /**
  887. * Returns the index in the input string of the first character following the
  888. * text matched during the previous match operation.
  889. *
  890. * @param status A reference to a UErrorCode to receive any errors. Possible
  891. * errors are U_REGEX_INVALID_STATE if no match has been
  892. * attempted or the last match failed.
  893. * @return the index of the last character matched, plus one.
  894. * The index value returned is a native index, corresponding to
  895. * code units for the underlying encoding type, for example,
  896. * a byte index for UTF-8.
  897. * @stable ICU 2.4
  898. */
  899. virtual int32_t end(UErrorCode &status) const;
  900. /**
  901. * Returns the index in the input string of the first character following the
  902. * text matched during the previous match operation.
  903. *
  904. * @param status A reference to a UErrorCode to receive any errors. Possible
  905. * errors are U_REGEX_INVALID_STATE if no match has been
  906. * attempted or the last match failed.
  907. * @return the index of the last character matched, plus one.
  908. * The index value returned is a native index, corresponding to
  909. * code units for the underlying encoding type, for example,
  910. * a byte index for UTF-8.
  911. * @stable ICU 4.6
  912. */
  913. virtual int64_t end64(UErrorCode &status) const;
  914. /**
  915. * Returns the index in the input string of the character following the
  916. * text matched by the specified capture group during the previous match operation.
  917. *
  918. * @param group the capture group number
  919. * @param status A reference to a UErrorCode to receive any errors. Possible
  920. * errors are U_REGEX_INVALID_STATE if no match has been
  921. * attempted or the last match failed and
  922. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  923. * @return the index of the first character following the text
  924. * captured by the specified group during the previous match operation.
  925. * Return -1 if the capture group exists in the pattern but was not part of the match.
  926. * The index value returned is a native index, corresponding to
  927. * code units for the underlying encoding type, for example,
  928. * a byte index for UTF-8.
  929. * @stable ICU 2.4
  930. */
  931. virtual int32_t end(int32_t group, UErrorCode &status) const;
  932. /**
  933. * Returns the index in the input string of the character following the
  934. * text matched by the specified capture group during the previous match operation.
  935. *
  936. * @param group the capture group number
  937. * @param status A reference to a UErrorCode to receive any errors. Possible
  938. * errors are U_REGEX_INVALID_STATE if no match has been
  939. * attempted or the last match failed and
  940. * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
  941. * @return the index of the first character following the text
  942. * captured by the specified group during the previous match operation.
  943. * Return -1 if the capture group exists in the pattern but was not part of the match.
  944. * The index value returned is a native index, corresponding to
  945. * code units for the underlying encoding type, for example,
  946. * a byte index for UTF-8.
  947. * @stable ICU 4.6
  948. */
  949. virtual int64_t end64(int32_t group, UErrorCode &status) const;
  950. /**
  951. * Resets this matcher. The effect is to remove any memory of previous matches,
  952. * and to cause subsequent find() operations to begin at the beginning of
  953. * the input string.
  954. *
  955. * @return this RegexMatcher.
  956. * @stable ICU 2.4
  957. */
  958. virtual RegexMatcher &reset();
  959. /**
  960. * Resets this matcher, and sets the current input position.
  961. * The effect is to remove any memory of previous matches,
  962. * and to cause subsequent find() operations to begin at
  963. * the specified (native) position in the input string.
  964. *
  965. * The matcher's region is reset to its default, which is the entire
  966. * input string.
  967. *
  968. * An alternative to this function is to set a match region
  969. * beginning at the desired index.
  970. *
  971. * @return this RegexMatcher.
  972. * @stable ICU 2.8
  973. */
  974. virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
  975. /**
  976. * Resets this matcher with a new input string. This allows instances of RegexMatcher
  977. * to be reused, which is more efficient than creating a new RegexMatcher for
  978. * each input string to be processed.
  979. * @param input The new string on which subsequent pattern matches will operate.
  980. * The matcher retains a reference to the caller's string, and operates
  981. * directly on that. Ownership of the string remains with the caller.
  982. * Because no copy of the string is made, it is essential that the
  983. * caller not delete the string until after regexp operations on it
  984. * are done.
  985. * Note that while a reset on the matcher with an input string that is then
  986. * modified across/during matcher operations may be supported currently for UnicodeString,
  987. * this was not originally intended behavior, and support for this is not guaranteed
  988. * in upcoming versions of ICU.
  989. * @return this RegexMatcher.
  990. * @stable ICU 2.4
  991. */
  992. virtual RegexMatcher &reset(const UnicodeString &input);
  993. /**
  994. * Resets this matcher with a new input string. This allows instances of RegexMatcher
  995. * to be reused, which is more efficient than creating a new RegexMatcher for
  996. * each input string to be processed.
  997. * @param input The new string on which subsequent pattern matches will operate.
  998. * The matcher makes a shallow clone of the given text; ownership of the
  999. * original string remains with the caller. Because no deep copy of the
  1000. * text is made, it is essential that the caller not modify the string
  1001. * until after regexp operations on it are done.
  1002. * @return this RegexMatcher.
  1003. *
  1004. * @stable ICU 4.6
  1005. */
  1006. virtual RegexMatcher &reset(UText *input);
  1007. /**
  1008. * Set the subject text string upon which the regular expression is looking for matches
  1009. * without changing any other aspect of the matching state.
  1010. * The new and previous text strings must have the same content.
  1011. *
  1012. * This function is intended for use in environments where ICU is operating on
  1013. * strings that may move around in memory. It provides a mechanism for notifying
  1014. * ICU that the string has been relocated, and providing a new UText to access the
  1015. * string in its new position.
  1016. *
  1017. * Note that the regular expression implementation never copies the underlying text
  1018. * of a string being matched, but always operates directly on the original text
  1019. * provided by the user. Refreshing simply drops the references to the old text
  1020. * and replaces them with references to the new.
  1021. *
  1022. * Caution: this function is normally used only by very specialized,
  1023. * system-level code. One example use case is with garbage collection that moves
  1024. * the text in memory.
  1025. *
  1026. * @param input The new (moved) text string.
  1027. * @param status Receives errors detected by this function.
  1028. *
  1029. * @stable ICU 4.8
  1030. */
  1031. virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
  1032. private:
  1033. /**
  1034. * Cause a compilation error if an application accidentally attempts to
  1035. * reset a matcher with a (char16_t *) string as input rather than
  1036. * a UnicodeString. Avoids a dangling reference to a temporary string.
  1037. *
  1038. * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
  1039. * using one of the aliasing constructors, such as
  1040. * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
  1041. * or in a UText, using
  1042. * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
  1043. *
  1044. */
  1045. RegexMatcher &reset(const char16_t *input) = delete;
  1046. public:
  1047. /**
  1048. * Returns the input string being matched. Ownership of the string belongs to
  1049. * the matcher; it should not be altered or deleted. This method will work even if the input
  1050. * was originally supplied as a UText.
  1051. * @return the input string
  1052. * @stable ICU 2.4
  1053. */
  1054. virtual const UnicodeString &input() const;
  1055. /**
  1056. * Returns the input string being matched. This is the live input text; it should not be
  1057. * altered or deleted. This method will work even if the input was originally supplied as
  1058. * a UnicodeString.
  1059. * @return the input text
  1060. *
  1061. * @stable ICU 4.6
  1062. */
  1063. virtual UText *inputText() const;
  1064. /**
  1065. * Returns the input string being matched, either by copying it into the provided
  1066. * UText parameter or by returning a shallow clone of the live input. Note that copying
  1067. * the entire input may cause significant performance and memory issues.
  1068. * @param dest The UText into which the input should be copied, or nullptr to create a new UText
  1069. * @param status error code
  1070. * @return dest if non-nullptr, a shallow copy of the input text otherwise
  1071. *
  1072. * @stable ICU 4.6
  1073. */
  1074. virtual UText *getInput(UText *dest, UErrorCode &status) const;
  1075. /** Sets the limits of this matcher's region.
  1076. * The region is the part of the input string that will be searched to find a match.
  1077. * Invoking this method resets the matcher, and then sets the region to start
  1078. * at the index specified by the start parameter and end at the index specified
  1079. * by the end parameter.
  1080. *
  1081. * Depending on the transparency and anchoring being used (see useTransparentBounds
  1082. * and useAnchoringBounds), certain constructs such as anchors may behave differently
  1083. * at or around the boundaries of the region.
  1084. *
  1085. * The function will fail if start is greater than limit, or if either index
  1086. * is less than zero or greater than the length of the string being matched.
  1087. *
  1088. * @param start The (native) index to begin searches at.
  1089. * @param limit The index to end searches at (exclusive).
  1090. * @param status A reference to a UErrorCode to receive any errors.
  1091. * @stable ICU 4.0
  1092. */
  1093. virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
  1094. /**
  1095. * Identical to region(start, limit, status) but also allows a start position without
  1096. * resetting the region state.
  1097. * @param regionStart The region start
  1098. * @param regionLimit the limit of the region
  1099. * @param startIndex The (native) index within the region bounds at which to begin searches.
  1100. * @param status A reference to a UErrorCode to receive any errors.
  1101. * If startIndex is not within the specified region bounds,
  1102. * U_INDEX_OUTOFBOUNDS_ERROR is returned.
  1103. * @stable ICU 4.6
  1104. */
  1105. virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
  1106. /**
  1107. * Reports the start index of this matcher's region. The searches this matcher
  1108. * conducts are limited to finding matches within regionStart (inclusive) and
  1109. * regionEnd (exclusive).
  1110. *
  1111. * @return The starting (native) index of this matcher's region.
  1112. * @stable ICU 4.0
  1113. */
  1114. virtual int32_t regionStart() const;
  1115. /**
  1116. * Reports the start index of this matcher's region. The searches this matcher
  1117. * conducts are limited to finding matches within regionStart (inclusive) and
  1118. * regionEnd (exclusive).
  1119. *
  1120. * @return The starting (native) index of this matcher's region.
  1121. * @stable ICU 4.6
  1122. */
  1123. virtual int64_t regionStart64() const;
  1124. /**
  1125. * Reports the end (limit) index (exclusive) of this matcher's region. The searches
  1126. * this matcher conducts are limited to finding matches within regionStart
  1127. * (inclusive) and regionEnd (exclusive).
  1128. *
  1129. * @return The ending point (native) of this matcher's region.
  1130. * @stable ICU 4.0
  1131. */
  1132. virtual int32_t regionEnd() const;
  1133. /**
  1134. * Reports the end (limit) index (exclusive) of this matcher's region. The searches
  1135. * this matcher conducts are limited to finding matches within regionStart
  1136. * (inclusive) and regionEnd (exclusive).
  1137. *
  1138. * @return The ending point (native) of this matcher's region.
  1139. * @stable ICU 4.6
  1140. */
  1141. virtual int64_t regionEnd64() const;
  1142. /**
  1143. * Queries the transparency of region bounds for this matcher.
  1144. * See useTransparentBounds for a description of transparent and opaque bounds.
  1145. * By default, a matcher uses opaque region boundaries.
  1146. *
  1147. * @return true if this matcher is using transparent bounds, false if it is using opaque bounds.
  1148. * @stable ICU 4.0
  1149. */
  1150. virtual UBool hasTransparentBounds() const;
  1151. /**
  1152. * Sets the transparency of region bounds for this matcher.
  1153. * Invoking this function with an argument of true will set this matcher to use transparent bounds.
  1154. * If the boolean argument is false, then opaque bounds will be used.
  1155. *
  1156. * Using transparent bounds, the boundaries of this matcher's region are transparent
  1157. * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
  1158. * see text beyond the boundaries of the region while checking for a match.
  1159. *
  1160. * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
  1161. * lookbehind, and boundary matching constructs.
  1162. *
  1163. * By default, a matcher uses opaque bounds.
  1164. *
  1165. * @param b true for transparent bounds; false for opaque bounds
  1166. * @return This Matcher
  1167. * @stable ICU 4.0
  1168. */
  1169. virtual RegexMatcher &useTransparentBounds(UBool b);
  1170. /**
  1171. * Return true if this matcher is using anchoring bounds.
  1172. * By default, matchers use anchoring region bounds.
  1173. *
  1174. * @return true if this matcher is using anchoring bounds.
  1175. * @stable ICU 4.0
  1176. */
  1177. virtual UBool hasAnchoringBounds() const;
  1178. /**
  1179. * Set whether this matcher is using Anchoring Bounds for its region.
  1180. * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
  1181. * and end of the region. Without Anchoring Bounds, anchors will only match at
  1182. * the positions they would in the complete text.
  1183. *
  1184. * Anchoring Bounds are the default for regions.
  1185. *
  1186. * @param b true to enable anchoring bounds; false to disable them.
  1187. * @return This Matcher
  1188. * @stable ICU 4.0
  1189. */
  1190. virtual RegexMatcher &useAnchoringBounds(UBool b);
  1191. /**
  1192. * Return true if the most recent matching operation attempted to access
  1193. * additional input beyond the available input text.
  1194. * In this case, additional input text could change the results of the match.
  1195. *
  1196. * hitEnd() is defined for both successful and unsuccessful matches.
  1197. * In either case hitEnd() will return true if the end of the text was
  1198. * reached at any point during the matching process.
  1199. *
  1200. * @return true if the most recent match hit the end of input
  1201. * @stable ICU 4.0
  1202. */
  1203. virtual UBool hitEnd() const;
  1204. /**
  1205. * Return true if the most recent match succeeded and additional input could cause
  1206. * it to fail. If this method returns false and a match was found, then more input
  1207. * might change the match but the match won't be lost. If a match was not found,
  1208. * then requireEnd has no meaning.
  1209. *
  1210. * @return true if more input could cause the most recent match to no longer match.
  1211. * @stable ICU 4.0
  1212. */
  1213. virtual UBool requireEnd() const;
  1214. /**
  1215. * Returns the pattern that is interpreted by this matcher.
  1216. * @return the RegexPattern for this RegexMatcher
  1217. * @stable ICU 2.4
  1218. */
  1219. virtual const RegexPattern &pattern() const;
  1220. /**
  1221. * Replaces every substring of the input that matches the pattern
  1222. * with the given replacement string. This is a convenience function that
  1223. * provides a complete find-and-replace-all operation.
  1224. *
  1225. * This method first resets this matcher. It then scans the input string
  1226. * looking for matches of the pattern. Input that is not part of any
  1227. * match is left unchanged; each match is replaced in the result by the
  1228. * replacement string. The replacement string may contain references to
  1229. * capture groups.
  1230. *
  1231. * @param replacement a string containing the replacement text.
  1232. * @param status a reference to a UErrorCode to receive any errors.
  1233. * @return a string containing the results of the find and replace.
  1234. * @stable ICU 2.4
  1235. */
  1236. virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
  1237. /**
  1238. * Replaces every substring of the input that matches the pattern
  1239. * with the given replacement string. This is a convenience function that
  1240. * provides a complete find-and-replace-all operation.
  1241. *
  1242. * This method first resets this matcher. It then scans the input string
  1243. * looking for matches of the pattern. Input that is not part of any
  1244. * match is left unchanged; each match is replaced in the result by the
  1245. * replacement string. The replacement string may contain references to
  1246. * capture groups.
  1247. *
  1248. * @param replacement a UText containing the replacement text.
  1249. * @param dest a mutable UText in which the results are placed.
  1250. * If nullptr, a new UText will be created (which may not be mutable).
  1251. * @param status a reference to a UErrorCode to receive any errors.
  1252. * @return a UText containing the results of the find and replace.
  1253. * If a pre-allocated UText was provided, it will always be used and returned.
  1254. *
  1255. * @stable ICU 4.6
  1256. */
  1257. virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
  1258. /**
  1259. * Replaces the first substring of the input that matches
  1260. * the pattern with the replacement string. This is a convenience
  1261. * function that provides a complete find-and-replace operation.
  1262. *
  1263. * This function first resets this RegexMatcher. It then scans the input string
  1264. * looking for a match of the pattern. Input that is not part
  1265. * of the match is appended directly to the result string; the match is replaced
  1266. * in the result by the replacement string. The replacement string may contain
  1267. * references to captured groups.
  1268. *
  1269. * The state of the matcher (the position at which a subsequent find()
  1270. * would begin) after completing a replaceFirst() is not specified. The
  1271. * RegexMatcher should be reset before doing additional find() operations.
  1272. *
  1273. * @param replacement a string containing the replacement text.
  1274. * @param status a reference to a UErrorCode to receive any errors.
  1275. * @return a string containing the results of the find and replace.
  1276. * @stable ICU 2.4
  1277. */
  1278. virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
  1279. /**
  1280. * Replaces the first substring of the input that matches
  1281. * the pattern with the replacement string. This is a convenience
  1282. * function that provides a complete find-and-replace operation.
  1283. *
  1284. * This function first resets this RegexMatcher. It then scans the input string
  1285. * looking for a match of the pattern. Input that is not part
  1286. * of the match is appended directly to the result string; the match is replaced
  1287. * in the result by the replacement string. The replacement string may contain
  1288. * references to captured groups.
  1289. *
  1290. * The state of the matcher (the position at which a subsequent find()
  1291. * would begin) after completing a replaceFirst() is not specified. The
  1292. * RegexMatcher should be reset before doing additional find() operations.
  1293. *
  1294. * @param replacement a UText containing the replacement text.
  1295. * @param dest a mutable UText in which the results are placed.
  1296. * If nullptr, a new UText will be created (which may not be mutable).
  1297. * @param status a reference to a UErrorCode to receive any errors.
  1298. * @return a UText containing the results of the find and replace.
  1299. * If a pre-allocated UText was provided, it will always be used and returned.
  1300. *
  1301. * @stable ICU 4.6
  1302. */
  1303. virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
  1304. /**
  1305. * Implements a replace operation intended to be used as part of an
  1306. * incremental find-and-replace.
  1307. *
  1308. * The input string, starting from the end of the previous replacement and ending at
  1309. * the start of the current match, is appended to the destination string. Then the
  1310. * replacement string is appended to the output string,
  1311. * including handling any substitutions of captured text.
  1312. *
  1313. * For simple, prepackaged, non-incremental find-and-replace
  1314. * operations, see replaceFirst() or replaceAll().
  1315. *
  1316. * @param dest A UnicodeString to which the results of the find-and-replace are appended.
  1317. * @param replacement A UnicodeString that provides the text to be substituted for
  1318. * the input text that matched the regexp pattern. The replacement
  1319. * text may contain references to captured text from the
  1320. * input.
  1321. * @param status A reference to a UErrorCode to receive any errors. Possible
  1322. * errors are U_REGEX_INVALID_STATE if no match has been
  1323. * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
  1324. * if the replacement text specifies a capture group that
  1325. * does not exist in the pattern.
  1326. *
  1327. * @return this RegexMatcher
  1328. * @stable ICU 2.4
  1329. *
  1330. */
  1331. virtual RegexMatcher &appendReplacement(UnicodeString &dest,
  1332. const UnicodeString &replacement, UErrorCode &status);
  1333. /**
  1334. * Implements a replace operation intended to be used as part of an
  1335. * incremental find-and-replace.
  1336. *
  1337. * The input string, starting from the end of the previous replacement and ending at
  1338. * the start of the current match, is appended to the destination string. Then the
  1339. * replacement string is appended to the output string,
  1340. * including handling any substitutions of captured text.
  1341. *
  1342. * For simple, prepackaged, non-incremental find-and-replace
  1343. * operations, see replaceFirst() or replaceAll().
  1344. *
  1345. * @param dest A mutable UText to which the results of the find-and-replace are appended.
  1346. * Must not be nullptr.
  1347. * @param replacement A UText that provides the text to be substituted for
  1348. * the input text that matched the regexp pattern. The replacement
  1349. * text may contain references to captured text from the input.
  1350. * @param status A reference to a UErrorCode to receive any errors. Possible
  1351. * errors are U_REGEX_INVALID_STATE if no match has been
  1352. * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
  1353. * if the replacement text specifies a capture group that
  1354. * does not exist in the pattern.
  1355. *
  1356. * @return this RegexMatcher
  1357. *
  1358. * @stable ICU 4.6
  1359. */
  1360. virtual RegexMatcher &appendReplacement(UText *dest,
  1361. UText *replacement, UErrorCode &status);
  1362. /**
  1363. * As the final step in a find-and-replace operation, append the remainder
  1364. * of the input string, starting at the position following the last appendReplacement(),
  1365. * to the destination string. `appendTail()` is intended to be invoked after one
  1366. * or more invocations of the `RegexMatcher::appendReplacement()`.
  1367. *
  1368. * @param dest A UnicodeString to which the results of the find-and-replace are appended.
  1369. * @return the destination string.
  1370. * @stable ICU 2.4
  1371. */
  1372. virtual UnicodeString &appendTail(UnicodeString &dest);
  1373. /**
  1374. * As the final step in a find-and-replace operation, append the remainder
  1375. * of the input string, starting at the position following the last appendReplacement(),
  1376. * to the destination string. `appendTail()` is intended to be invoked after one
  1377. * or more invocations of the `RegexMatcher::appendReplacement()`.
  1378. *
  1379. * @param dest A mutable UText to which the results of the find-and-replace are appended.
  1380. * Must not be nullptr.
  1381. * @param status A reference to a UErrorCode to receive any errors.
  1382. * @return the destination UText.
  1383. *
  1384. * @stable ICU 4.6
  1385. */
  1386. virtual UText *appendTail(UText *dest, UErrorCode &status);
  1387. /**
  1388. * Split a string into fields. Somewhat like %split() from Perl.
  1389. * The pattern matches identify delimiters that separate the input
  1390. * into fields. The input data between the matches becomes the
  1391. * fields themselves.
  1392. *
  1393. * @param input The string to be split into fields. The field delimiters
  1394. * match the pattern (in the "this" object). This matcher
  1395. * will be reset to this input string.
  1396. * @param dest An array of UnicodeStrings to receive the results of the split.
  1397. * This is an array of actual UnicodeString objects, not an
  1398. * array of pointers to strings. Local (stack based) arrays can
  1399. * work well here.
  1400. * @param destCapacity The number of elements in the destination array.
  1401. * If the number of fields found is less than destCapacity, the
  1402. * extra strings in the destination array are not altered.
  1403. * If the number of destination strings is less than the number
  1404. * of fields, the trailing part of the input string, including any
  1405. * field delimiters, is placed in the last destination string.
  1406. * @param status A reference to a UErrorCode to receive any errors.
  1407. * @return The number of fields into which the input string was split.
  1408. * @stable ICU 2.6
  1409. */
  1410. virtual int32_t split(const UnicodeString &input,
  1411. UnicodeString dest[],
  1412. int32_t destCapacity,
  1413. UErrorCode &status);
  1414. /**
  1415. * Split a string into fields. Somewhat like %split() from Perl.
  1416. * The pattern matches identify delimiters that separate the input
  1417. * into fields. The input data between the matches becomes the
  1418. * fields themselves.
  1419. *
  1420. * @param input The string to be split into fields. The field delimiters
  1421. * match the pattern (in the "this" object). This matcher
  1422. * will be reset to this input string.
  1423. * @param dest An array of mutable UText structs to receive the results of the split.
  1424. * If a field is nullptr, a new UText is allocated to contain the results for
  1425. * that field. This new UText is not guaranteed to be mutable.
  1426. * @param destCapacity The number of elements in the destination array.
  1427. * If the number of fields found is less than destCapacity, the
  1428. * extra strings in the destination array are not altered.
  1429. * If the number of destination strings is less than the number
  1430. * of fields, the trailing part of the input string, including any
  1431. * field delimiters, is placed in the last destination string.
  1432. * @param status A reference to a UErrorCode to receive any errors.
  1433. * @return The number of fields into which the input string was split.
  1434. *
  1435. * @stable ICU 4.6
  1436. */
  1437. virtual int32_t split(UText *input,
  1438. UText *dest[],
  1439. int32_t destCapacity,
  1440. UErrorCode &status);
  1441. /**
  1442. * Set a processing time limit for match operations with this Matcher.
  1443. *
  1444. * Some patterns, when matching certain strings, can run in exponential time.
  1445. * For practical purposes, the match operation may appear to be in an
  1446. * infinite loop.
  1447. * When a limit is set a match operation will fail with an error if the
  1448. * limit is exceeded.
  1449. *
  1450. * The units of the limit are steps of the match engine.
  1451. * Correspondence with actual processor time will depend on the speed
  1452. * of the processor and the details of the specific pattern, but will
  1453. * typically be on the order of milliseconds.
  1454. *
  1455. * By default, the matching time is not limited.
  1456. *
  1457. *
  1458. * @param limit The limit value, or 0 for no limit.
  1459. * @param status A reference to a UErrorCode to receive any errors.
  1460. * @stable ICU 4.0
  1461. */
  1462. virtual void setTimeLimit(int32_t limit, UErrorCode &status);
  1463. /**
  1464. * Get the time limit, if any, for match operations made with this Matcher.
  1465. *
  1466. * @return the maximum allowed time for a match, in units of processing steps.
  1467. * @stable ICU 4.0
  1468. */
  1469. virtual int32_t getTimeLimit() const;
  1470. /**
  1471. * Set the amount of heap storage available for use by the match backtracking stack.
  1472. * The matcher is also reset, discarding any results from previous matches.
  1473. *
  1474. * ICU uses a backtracking regular expression engine, with the backtrack stack
  1475. * maintained on the heap. This function sets the limit to the amount of memory
  1476. * that can be used for this purpose. A backtracking stack overflow will
  1477. * result in an error from the match operation that caused it.
  1478. *
  1479. * A limit is desirable because a malicious or poorly designed pattern can use
  1480. * excessive memory, potentially crashing the process. A limit is enabled
  1481. * by default.
  1482. *
  1483. * @param limit The maximum size, in bytes, of the matching backtrack stack.
  1484. * A value of zero means no limit.
  1485. * The limit must be greater than or equal to zero.
  1486. *
  1487. * @param status A reference to a UErrorCode to receive any errors.
  1488. *
  1489. * @stable ICU 4.0
  1490. */
  1491. virtual void setStackLimit(int32_t limit, UErrorCode &status);
  1492. /**
  1493. * Get the size of the heap storage available for use by the back tracking stack.
  1494. *
  1495. * @return the maximum backtracking stack size, in bytes, or zero if the
  1496. * stack size is unlimited.
  1497. * @stable ICU 4.0
  1498. */
  1499. virtual int32_t getStackLimit() const;
  1500. /**
  1501. * Set a callback function for use with this Matcher.
  1502. * During matching operations the function will be called periodically,
  1503. * giving the application the opportunity to terminate a long-running
  1504. * match.
  1505. *
  1506. * @param callback A pointer to the user-supplied callback function.
  1507. * @param context User context pointer. The value supplied at the
  1508. * time the callback function is set will be saved
  1509. * and passed to the callback each time that it is called.
  1510. * @param status A reference to a UErrorCode to receive any errors.
  1511. * @stable ICU 4.0
  1512. */
  1513. virtual void setMatchCallback(URegexMatchCallback *callback,
  1514. const void *context,
  1515. UErrorCode &status);
  1516. /**
  1517. * Get the callback function for this Matcher.
  1518. *
  1519. * @param callback Out parameter, receives a pointer to the user-supplied
  1520. * callback function.
  1521. * @param context Out parameter, receives the user context pointer that
  1522. * was set when setMatchCallback() was called.
  1523. * @param status A reference to a UErrorCode to receive any errors.
  1524. * @stable ICU 4.0
  1525. */
  1526. virtual void getMatchCallback(URegexMatchCallback *&callback,
  1527. const void *&context,
  1528. UErrorCode &status);
  1529. /**
  1530. * Set a progress callback function for use with find operations on this Matcher.
  1531. * During find operations, the callback will be invoked after each return from a
  1532. * match attempt, giving the application the opportunity to terminate a long-running
  1533. * find operation.
  1534. *
  1535. * @param callback A pointer to the user-supplied callback function.
  1536. * @param context User context pointer. The value supplied at the
  1537. * time the callback function is set will be saved
  1538. * and passed to the callback each time that it is called.
  1539. * @param status A reference to a UErrorCode to receive any errors.
  1540. * @stable ICU 4.6
  1541. */
  1542. virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
  1543. const void *context,
  1544. UErrorCode &status);
  1545. /**
  1546. * Get the find progress callback function for this Matcher.
  1547. *
  1548. * @param callback Out parameter, receives a pointer to the user-supplied
  1549. * callback function.
  1550. * @param context Out parameter, receives the user context pointer that
  1551. * was set when setFindProgressCallback() was called.
  1552. * @param status A reference to a UErrorCode to receive any errors.
  1553. * @stable ICU 4.6
  1554. */
  1555. virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
  1556. const void *&context,
  1557. UErrorCode &status);
  1558. #ifndef U_HIDE_INTERNAL_API
  1559. /**
  1560. * setTrace Debug function, enable/disable tracing of the matching engine.
  1561. * For internal ICU development use only. DO NOT USE!!!!
  1562. * @internal
  1563. */
  1564. void setTrace(UBool state);
  1565. #endif /* U_HIDE_INTERNAL_API */
  1566. /**
  1567. * ICU "poor man's RTTI", returns a UClassID for this class.
  1568. *
  1569. * @stable ICU 2.2
  1570. */
  1571. static UClassID U_EXPORT2 getStaticClassID();
  1572. /**
  1573. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  1574. *
  1575. * @stable ICU 2.2
  1576. */
  1577. virtual UClassID getDynamicClassID() const override;
  1578. private:
  1579. // Constructors and other object boilerplate are private.
  1580. // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
  1581. RegexMatcher() = delete; // default constructor not implemented
  1582. RegexMatcher(const RegexPattern *pat);
  1583. RegexMatcher(const RegexMatcher &other) = delete;
  1584. RegexMatcher &operator =(const RegexMatcher &rhs) = delete;
  1585. void init(UErrorCode &status); // Common initialization
  1586. void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
  1587. friend class RegexPattern;
  1588. friend class RegexCImpl;
  1589. public:
  1590. #ifndef U_HIDE_INTERNAL_API
  1591. /** @internal */
  1592. void resetPreserveRegion(); // Reset matcher state, but preserve any region.
  1593. #endif /* U_HIDE_INTERNAL_API */
  1594. private:
  1595. //
  1596. // MatchAt This is the internal interface to the match engine itself.
  1597. // Match status comes back in matcher member variables.
  1598. //
  1599. void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
  1600. inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
  1601. UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
  1602. UBool isUWordBoundary(int64_t pos, UErrorCode &status); // perform RBBI based \b test
  1603. // Find a grapheme cluster boundary using a break iterator. For handling \X in regexes.
  1604. int64_t followingGCBoundary(int64_t pos, UErrorCode &status);
  1605. REStackFrame *resetStack();
  1606. inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
  1607. void IncrementTime(UErrorCode &status);
  1608. // Call user find callback function, if set. Return true if operation should be interrupted.
  1609. inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
  1610. int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
  1611. UBool findUsingChunk(UErrorCode &status);
  1612. void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
  1613. UBool isChunkWordBoundary(int32_t pos);
  1614. const RegexPattern *fPattern;
  1615. RegexPattern *fPatternOwned; // Non-nullptr if this matcher owns the pattern, and
  1616. // should delete it when through.
  1617. const UnicodeString *fInput; // The string being matched. Only used for input()
  1618. UText *fInputText; // The text being matched. Is never nullptr.
  1619. UText *fAltInputText; // A shallow copy of the text being matched.
  1620. // Only created if the pattern contains backreferences.
  1621. int64_t fInputLength; // Full length of the input text.
  1622. int32_t fFrameSize; // The size of a frame in the backtrack stack.
  1623. int64_t fRegionStart; // Start of the input region, default = 0.
  1624. int64_t fRegionLimit; // End of input region, default to input.length.
  1625. int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
  1626. int64_t fAnchorLimit; // See useAnchoringBounds
  1627. int64_t fLookStart; // Region bounds for look-ahead/behind and
  1628. int64_t fLookLimit; // other boundary tests. See
  1629. // useTransparentBounds
  1630. int64_t fActiveStart; // Currently active bounds for matching.
  1631. int64_t fActiveLimit; // Usually is the same as region, but
  1632. // is changed to fLookStart/Limit when
  1633. // entering look around regions.
  1634. UBool fTransparentBounds; // True if using transparent bounds.
  1635. UBool fAnchoringBounds; // True if using anchoring bounds.
  1636. UBool fMatch; // True if the last attempted match was successful.
  1637. int64_t fMatchStart; // Position of the start of the most recent match
  1638. int64_t fMatchEnd; // First position after the end of the most recent match
  1639. // Zero if no previous match, even when a region
  1640. // is active.
  1641. int64_t fLastMatchEnd; // First position after the end of the previous match,
  1642. // or -1 if there was no previous match.
  1643. int64_t fAppendPosition; // First position after the end of the previous
  1644. // appendReplacement(). As described by the
  1645. // JavaDoc for Java Matcher, where it is called
  1646. // "append position"
  1647. UBool fHitEnd; // True if the last match touched the end of input.
  1648. UBool fRequireEnd; // True if the last match required end-of-input
  1649. // (matched $ or Z)
  1650. UVector64 *fStack;
  1651. REStackFrame *fFrame; // After finding a match, the last active stack frame,
  1652. // which will contain the capture group results.
  1653. // NOT valid while match engine is running.
  1654. int64_t *fData; // Data area for use by the compiled pattern.
  1655. int64_t fSmallData[8]; // Use this for data if it's enough.
  1656. int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
  1657. // match engine run. Zero for unlimited.
  1658. int32_t fTime; // Match time, accumulates while matching.
  1659. int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
  1660. // Kept separately from fTime to keep as much
  1661. // code as possible out of the inline
  1662. // StateSave function.
  1663. int32_t fStackLimit; // Maximum memory size to use for the backtrack
  1664. // stack, in bytes. Zero for unlimited.
  1665. URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
  1666. // nullptr if there is no callback.
  1667. const void *fCallbackContext; // User Context ptr for callback function.
  1668. URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
  1669. // nullptr if there is no callback.
  1670. const void *fFindProgressCallbackContext; // User Context ptr for callback function.
  1671. UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
  1672. UBool fTraceDebug; // Set true for debug tracing of match engine.
  1673. UErrorCode fDeferredStatus; // Save error state that cannot be immediately
  1674. // reported, or that permanently disables this matcher.
  1675. BreakIterator *fWordBreakItr;
  1676. BreakIterator *fGCBreakItr;
  1677. };
  1678. U_NAMESPACE_END
  1679. #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
  1680. #endif /* U_SHOW_CPLUSPLUS_API */
  1681. #endif