uregex.h 72 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2004-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: uregex.h
  9. * encoding: UTF-8
  10. * indentation:4
  11. *
  12. * created on: 2004mar09
  13. * created by: Andy Heninger
  14. *
  15. * ICU Regular Expressions, API for C
  16. */
  17. /**
  18. * \file
  19. * \brief C API: Regular Expressions
  20. *
  21. * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
  22. */
  23. #ifndef UREGEX_H
  24. #define UREGEX_H
  25. #include "unicode/utext.h"
  26. #include "unicode/utypes.h"
  27. #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  28. #include "unicode/parseerr.h"
  29. #if U_SHOW_CPLUSPLUS_API
  30. #include "unicode/localpointer.h"
  31. #endif // U_SHOW_CPLUSPLUS_API
  32. struct URegularExpression;
  33. /**
  34. * Structure representing a compiled regular expression, plus the results
  35. * of a match operation.
      * Only a forward declaration of the struct appears here; the functions
      * declared in this part of the header refer to it through pointers.
  36. * @stable ICU 3.0
  37. */
  38. typedef struct URegularExpression URegularExpression;
  39. /**
  40. * Constants for Regular Expression Match Modes.
      * Each constant is a distinct single bit, so several modes can be
      * combined with bitwise OR into one flags value.
  41. * @stable ICU 2.4
  42. */
  43. typedef enum URegexpFlag{
  44. #ifndef U_HIDE_DRAFT_API
  45. /** Forces normalization of pattern and strings.
  46. Not implemented yet, just a placeholder, hence draft.
  47. @draft ICU 2.4 */
  48. UREGEX_CANON_EQ = 128,
  49. #endif /* U_HIDE_DRAFT_API */
  50. /** Enable case insensitive matching. @stable ICU 2.4 */
  51. UREGEX_CASE_INSENSITIVE = 2,
  52. /** Allow white space and comments within patterns. @stable ICU 2.4 */
  53. UREGEX_COMMENTS = 4,
  54. /** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
  55. * @stable ICU 2.4 */
  56. UREGEX_DOTALL = 32,
  57. /** If set, treat the entire pattern as a literal string.
  58. * Metacharacters or escape sequences in the input sequence will be given
  59. * no special meaning.
  60. *
  61. * The flag UREGEX_CASE_INSENSITIVE retains its impact
  62. * on matching when used in conjunction with this flag.
  63. * The other flags become superfluous.
  64. *
  65. * @stable ICU 4.0
  66. */
  67. UREGEX_LITERAL = 16,
  68. /** Control behavior of "$" and "^".
  69. * If set, recognize line terminators within string,
  70. * otherwise, match only at start and end of input string.
  71. * @stable ICU 2.4 */
  72. UREGEX_MULTILINE = 8,
  73. /** Unix-only line endings.
  74. * When this mode is enabled, only \\u000a is recognized as a line ending
  75. * in the behavior of ., ^, and $.
  76. * @stable ICU 4.0
  77. */
  78. UREGEX_UNIX_LINES = 1,
  79. /** Unicode word boundaries.
  80. * If set, \b uses the Unicode TR 29 definition of word boundaries.
  81. * Warning: Unicode word boundaries are quite different from
  82. * traditional regular expression word boundaries. See
  83. * http://unicode.org/reports/tr29/#Word_Boundaries
  84. * @stable ICU 2.8
  85. */
  86. UREGEX_UWORD = 256,
  87. /** Error on Unrecognized backslash escapes.
  88. * If set, fail with an error on patterns that contain
  89. * backslash-escaped ASCII letters without a known special
  90. * meaning. If this flag is not set, these
  91. * escaped letters represent themselves.
  92. * @stable ICU 4.0
  93. */
  94. UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
  95. } URegexpFlag;
  96. /**
  97. * Open (compile) an ICU regular expression. Compiles the regular expression in
  98. * string form into an internal representation using the specified match mode flags.
  99. * The resulting regular expression handle can then be used to perform various
  100. * matching operations.
  101. *
  102. *
  103. * @param pattern The Regular Expression pattern to be compiled.
  104. * @param patternLength The length of the pattern, or -1 if the pattern is
  105. * NUL terminated.
  106. * @param flags Flags that alter the default matching behavior for
  107. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  108. * example. For default behavior, set this parameter to zero.
  109. * See <code>enum URegexpFlag</code>. All desired flags
  110. * are bitwise-ORed together.
  111. * @param pe Receives the position (line and column numbers) of any syntax
  112. * error within the source regular expression string. If this
  113. * information is not wanted, pass NULL for this parameter.
  114. * @param status Receives errors detected by this function.
       * @return The handle for the compiled regular expression.
  115. * @stable ICU 3.0
  116. *
  117. */
  118. U_CAPI URegularExpression * U_EXPORT2
  119. uregex_open( const UChar *pattern,
  120. int32_t patternLength,
  121. uint32_t flags,
  122. UParseError *pe,
  123. UErrorCode *status);
  124. /**
  125. * Open (compile) an ICU regular expression. Compiles the regular expression in
  126. * string form into an internal representation using the specified match mode flags.
  127. * The resulting regular expression handle can then be used to perform various
  128. * matching operations.
  129. * <p>
  130. * The contents of the pattern UText will be extracted and saved. Ownership of the
  131. * UText struct itself remains with the caller. This is to match the behavior of
  132. * uregex_open().
  133. *
  134. * @param pattern The Regular Expression pattern to be compiled.
  135. * @param flags Flags that alter the default matching behavior for
  136. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  137. * example. For default behavior, set this parameter to zero.
  138. * See <code>enum URegexpFlag</code>. All desired flags
  139. * are bitwise-ORed together.
  140. * @param pe Receives the position (line and column numbers) of any syntax
  141. * error within the source regular expression string. If this
  142. * information is not wanted, pass NULL for this parameter.
  143. * @param status Receives errors detected by this function.
       * @return The handle for the compiled regular expression.
  144. *
  145. * @stable ICU 4.6
  146. */
  147. U_CAPI URegularExpression * U_EXPORT2
  148. uregex_openUText(UText *pattern,
  149. uint32_t flags,
  150. UParseError *pe,
  151. UErrorCode *status);
  152. #if !UCONFIG_NO_CONVERSION
  153. /**
  154. * Open (compile) an ICU regular expression. The resulting regular expression
  155. * handle can then be used to perform various matching operations.
  156. * <p>
  157. * This function is the same as uregex_open, except that the pattern
  158. * is supplied as an 8 bit char * string in the default code page.
  159. *
  160. * @param pattern The Regular Expression pattern to be compiled,
  161. * NUL terminated.
  162. * @param flags Flags that alter the default matching behavior for
  163. * the regular expression, UREGEX_CASE_INSENSITIVE, for
  164. * example. For default behavior, set this parameter to zero.
  165. * See <code>enum URegexpFlag</code>. All desired flags
  166. * are bitwise-ORed together.
  167. * @param pe Receives the position (line and column numbers) of any syntax
  168. * error within the source regular expression string. If this
  169. * information is not wanted, pass NULL for this parameter.
  170. * @param status Receives errors detected by this function.
  171. * @return The URegularExpression object representing the compiled
  172. * pattern.
  173. *
       * @see uregex_open
  174. * @stable ICU 3.0
  175. */
  176. U_CAPI URegularExpression * U_EXPORT2
  177. uregex_openC( const char *pattern,
  178. uint32_t flags,
  179. UParseError *pe,
  180. UErrorCode *status);
  181. #endif /* !UCONFIG_NO_CONVERSION */
  182. /**
  183. * Close the regular expression, recovering all resources (memory) it
  184. * was holding.
  185. *
  186. * @param regexp The regular expression to be closed.
       * @see uregex_open
  187. * @stable ICU 3.0
  188. */
  189. U_CAPI void U_EXPORT2
  190. uregex_close(URegularExpression *regexp);
  191. #if U_SHOW_CPLUSPLUS_API
  192. U_NAMESPACE_BEGIN
  193. /**
  194. * \class LocalURegularExpressionPointer
  195. * "Smart pointer" class, closes a URegularExpression via uregex_close().
  196. * For most methods see the LocalPointerBase base class.
  197. *
  198. * @see LocalPointerBase
  199. * @see LocalPointer
  200. * @stable ICU 4.4
  201. */
  202. U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close);
  203. U_NAMESPACE_END
  204. #endif // U_SHOW_CPLUSPLUS_API
  205. /**
  206. * Make a copy of a compiled regular expression. Cloning a regular
  207. * expression is faster than opening a second instance from the source
  208. * form of the expression, and requires less memory.
  209. * <p>
  210. * Note that the current input string and the position of any matched text
  211. * within it are not cloned; only the pattern itself and the
  212. * match mode flags are copied.
  213. * <p>
  214. * Cloning can be particularly useful to threaded applications that perform
  215. * multiple match operations in parallel. Each concurrent RE
  216. * operation requires its own instance of a URegularExpression.
  217. *
  218. * @param regexp The compiled regular expression to be cloned.
  219. * @param status Receives indication of any errors encountered.
  220. * @return The cloned copy of the compiled regular expression.
  221. * @stable ICU 3.0
  222. */
  223. U_CAPI URegularExpression * U_EXPORT2
  224. uregex_clone(const URegularExpression *regexp, UErrorCode *status);
  225. /**
  226. * Returns a pointer to the source form of the pattern for this regular expression.
  227. * This function will work even if the pattern was originally specified as a UText.
  228. *
  229. * @param regexp The compiled regular expression.
  230. * @param patLength This output parameter will be set to the length of the
  231. * pattern string. A NULL pointer may be used here if the
  232. * pattern length is not needed, as would be the case if
  233. * the pattern is known in advance to be a NUL terminated
  234. * string.
  235. * @param status Receives errors detected by this function.
  236. * @return a pointer to the pattern string. The storage for the string is
  237. * owned by the regular expression object, and must not be
  238. * altered or deleted by the application. The returned string
  239. * will remain valid until the regular expression is closed.
       * @see uregex_patternUText
  240. * @stable ICU 3.0
  241. */
  242. U_CAPI const UChar * U_EXPORT2
  243. uregex_pattern(const URegularExpression *regexp,
  244. int32_t *patLength,
  245. UErrorCode *status);
  246. /**
  247. * Returns the source text of the pattern for this regular expression.
  248. * This function will work even if the pattern was originally specified as a UChar string.
  249. *
  250. * @param regexp The compiled regular expression.
  251. * @param status Receives errors detected by this function.
  252. * @return the pattern text. The storage for the text is owned by the regular expression
  253. * object, and must not be altered or deleted.
  254. *
       * @see uregex_pattern
  255. * @stable ICU 4.6
  256. */
  257. U_CAPI UText * U_EXPORT2
  258. uregex_patternUText(const URegularExpression *regexp,
  259. UErrorCode *status);
  260. /**
  261. * Get the match mode flags that were specified when compiling this regular expression.
  262. * @param regexp The compiled regular expression.
  263. * @param status Receives errors detected by this function.
  264. * @return The match mode flags.
  265. * @see URegexpFlag
  266. * @stable ICU 3.0
  267. */
  268. U_CAPI int32_t U_EXPORT2
  269. uregex_flags(const URegularExpression *regexp,
  270. UErrorCode *status);
  271. /**
  272. * Set the subject text string upon which the regular expression will look for matches.
  273. * This function may be called any number of times, allowing the regular
  274. * expression pattern to be applied to different strings.
  275. * <p>
  276. * Regular expression matching operations work directly on the application's
  277. * string data. No copy is made. The subject string data must not be
  278. * altered after calling this function until after all regular expression
  279. * operations involving this string data are completed.
  280. * <p>
  281. * Zero length strings are permitted. In this case, no subsequent match
  282. * operation will dereference the text string pointer.
  283. *
  284. * @param regexp The compiled regular expression.
  285. * @param text The subject text string.
  286. * @param textLength The length of the subject text, or -1 if the string
  287. * is NUL terminated.
  288. * @param status Receives errors detected by this function.
       * @see uregex_setUText
  289. * @stable ICU 3.0
  290. */
  291. U_CAPI void U_EXPORT2
  292. uregex_setText(URegularExpression *regexp,
  293. const UChar *text,
  294. int32_t textLength,
  295. UErrorCode *status);
  296. /**
  297. * Set the subject text string upon which the regular expression will look for matches.
  298. * This function may be called any number of times, allowing the regular
  299. * expression pattern to be applied to different strings.
  300. * <p>
  301. * Regular expression matching operations work directly on the application's
  302. * string data; only a shallow clone is made. The subject string data must not be
  303. * altered after calling this function until after all regular expression
  304. * operations involving this string data are completed.
  305. *
  306. * @param regexp The compiled regular expression.
  307. * @param text The subject text string.
  308. * @param status Receives errors detected by this function.
  309. *
       * @see uregex_setText
  310. * @stable ICU 4.6
  311. */
  312. U_CAPI void U_EXPORT2
  313. uregex_setUText(URegularExpression *regexp,
  314. UText *text,
  315. UErrorCode *status);
  316. /**
  317. * Get the subject text that is currently associated with this
  318. * regular expression object. If the input was supplied using uregex_setText(),
  319. * that pointer will be returned. Otherwise, the characters in the input will
  320. * be extracted to a buffer and returned. In either case, ownership remains
  321. * with the regular expression object.
  322. *
  323. * This function will work even if the input was originally specified as a UText.
  324. *
  325. * @param regexp The compiled regular expression.
  326. * @param textLength The length of the string is returned in this output parameter.
  327. * A NULL pointer may be used here if the
  328. * text length is not needed, as would be the case if
  329. * the text is known in advance to be a NUL terminated
  330. * string.
  331. * @param status Receives errors detected by this function.
  332. * @return Pointer to the subject text string currently associated with
  333. * this regular expression.
       * @see uregex_getUText
  334. * @stable ICU 3.0
  335. */
  336. U_CAPI const UChar * U_EXPORT2
  337. uregex_getText(URegularExpression *regexp,
  338. int32_t *textLength,
  339. UErrorCode *status);
  340. /**
  341. * Get the subject text that is currently associated with this
  342. * regular expression object.
  343. *
  344. * This function will work even if the input was originally specified as a UChar string.
  345. *
  346. * @param regexp The compiled regular expression.
  347. * @param dest A mutable UText in which to store the current input.
  348. * If NULL, a new UText will be created as an immutable shallow clone
  349. * of the actual input string.
  350. * @param status Receives errors detected by this function.
  351. * @return The subject text currently associated with this regular expression.
  352. * If a pre-allocated UText was provided, it will always be used and returned.
  353. *
       * @see uregex_getText
  354. * @stable ICU 4.6
  355. */
  356. U_CAPI UText * U_EXPORT2
  357. uregex_getUText(URegularExpression *regexp,
  358. UText *dest,
  359. UErrorCode *status);
  360. /**
  361. * Set the subject text string upon which the regular expression is looking for matches
  362. * without changing any other aspect of the matching state.
  363. * The new and previous text strings must have the same content.
  364. *
  365. * This function is intended for use in environments where ICU is operating on
  366. * strings that may move around in memory. It provides a mechanism for notifying
  367. * ICU that the string has been relocated, and providing a new UText to access the
  368. * string in its new position.
  369. *
  370. * Note that the regular expression implementation never copies the underlying text
  371. * of a string being matched, but always operates directly on the original text
  372. * provided by the user. Refreshing simply drops the references to the old text
  373. * and replaces them with references to the new.
  374. *
  375. * Caution: this function is normally used only by very specialized
  376. * system-level code. One example use case is with garbage collection
  377. * that moves the text in memory.
  378. *
  379. * @param regexp The compiled regular expression.
  380. * @param text The new (moved) text string.
  381. * @param status Receives errors detected by this function.
  382. *
       * @see uregex_setUText
  383. * @stable ICU 4.8
  384. */
  385. U_CAPI void U_EXPORT2
  386. uregex_refreshUText(URegularExpression *regexp,
  387. UText *text,
  388. UErrorCode *status);
  389. /**
  390. * Attempts to match the input string against the pattern.
  391. * To succeed, the match must extend to the end of the string,
  392. * or cover the complete match region.
  393. *
  394. * If startIndex >= zero the match operation starts at the specified
  395. * index and must extend to the end of the input string. Any region
  396. * that has been specified is reset.
  397. *
  398. * If startIndex == -1 the match must cover the input region, or the entire
  399. * input string if no region has been set. This directly corresponds to
  400. * Matcher.matches() in Java.
  401. *
  402. * @param regexp The compiled regular expression.
  403. * @param startIndex The input string (native) index at which to begin matching, or -1
  404. * to match the input Region.
  405. * @param status Receives errors detected by this function.
  406. * @return true if there is a match.
       * @see uregex_matches64
  407. * @stable ICU 3.0
  408. */
  409. U_CAPI UBool U_EXPORT2
  410. uregex_matches(URegularExpression *regexp,
  411. int32_t startIndex,
  412. UErrorCode *status);
  413. /**
  414. * 64bit version of uregex_matches.
  415. * Attempts to match the input string against the pattern.
  416. * To succeed, the match must extend to the end of the string,
  417. * or cover the complete match region.
  418. *
  419. * If startIndex >= zero the match operation starts at the specified
  420. * index and must extend to the end of the input string. Any region
  421. * that has been specified is reset.
  422. *
  423. * If startIndex == -1 the match must cover the input region, or the entire
  424. * input string if no region has been set. This directly corresponds to
  425. * Matcher.matches() in Java.
  426. *
  427. * @param regexp The compiled regular expression.
  428. * @param startIndex The input string (native) index at which to begin matching, or -1
  429. * to match the input Region.
  430. * @param status Receives errors detected by this function.
  431. * @return true if there is a match.
       * @see uregex_matches
  432. * @stable ICU 4.6
  433. */
  434. U_CAPI UBool U_EXPORT2
  435. uregex_matches64(URegularExpression *regexp,
  436. int64_t startIndex,
  437. UErrorCode *status);
  438. /**
  439. * Attempts to match the input string, starting from the specified index, against the pattern.
  440. * The match may be of any length, and is not required to extend to the end
  441. * of the input string. Contrast with uregex_matches().
  442. *
  443. * <p>If startIndex is >= 0 any input region that was set for this
  444. * URegularExpression is reset before the operation begins.
  445. *
  446. * <p>If the specified starting index == -1 the match begins at the start of the input
  447. * region, or at the start of the full string if no region has been specified.
  448. * This corresponds directly with Matcher.lookingAt() in Java.
  449. *
  450. * <p>If the match succeeds then more information can be obtained via the
  451. * <code>uregex_start()</code>, <code>uregex_end()</code>,
  452. * and <code>uregex_group()</code> functions.</p>
  453. *
  454. * @param regexp The compiled regular expression.
  455. * @param startIndex The input string (native) index at which to begin matching, or
  456. * -1 to match the Input Region.
  457. * @param status A reference to a UErrorCode to receive any errors.
  458. * @return true if there is a match.
  459. * @stable ICU 3.0
  460. */
  461. U_CAPI UBool U_EXPORT2
  462. uregex_lookingAt(URegularExpression *regexp,
  463. int32_t startIndex,
  464. UErrorCode *status);
  465. /**
  466. * 64bit version of uregex_lookingAt.
  467. * Attempts to match the input string, starting from the specified index, against the pattern.
  468. * The match may be of any length, and is not required to extend to the end
  469. * of the input string. Contrast with uregex_matches().
  470. *
  471. * <p>If startIndex is >= 0 any input region that was set for this
  472. * URegularExpression is reset before the operation begins.
  473. *
  474. * <p>If the specified starting index == -1 the match begins at the start of the input
  475. * region, or at the start of the full string if no region has been specified.
  476. * This corresponds directly with Matcher.lookingAt() in Java.
  477. *
  478. * <p>If the match succeeds then more information can be obtained via the
  479. * <code>uregex_start()</code>, <code>uregex_end()</code>,
  480. * and <code>uregex_group()</code> functions.</p>
  481. *
  482. * @param regexp The compiled regular expression.
  483. * @param startIndex The input string (native) index at which to begin matching, or
  484. * -1 to match the Input Region.
  485. * @param status A reference to a UErrorCode to receive any errors.
  486. * @return true if there is a match.
  487. * @stable ICU 4.6
  488. */
  489. U_CAPI UBool U_EXPORT2
  490. uregex_lookingAt64(URegularExpression *regexp,
  491. int64_t startIndex,
  492. UErrorCode *status);
  493. /**
  494. * Find the first matching substring of the input string that matches the pattern.
  495. * If startIndex is >= zero the search for a match begins at the specified index,
  496. * and any match region is reset. This corresponds directly with
  497. * Matcher.find(startIndex) in Java.
  498. *
  499. * If startIndex == -1 the search begins at the start of the input region,
  500. * or at the start of the full string if no region has been specified.
  501. *
  502. * If a match is found, <code>uregex_start(), uregex_end()</code>, and
  503. * <code>uregex_group()</code> will provide more information regarding the match.
  504. *
  505. * @param regexp The compiled regular expression.
  506. * @param startIndex The position (native) in the input string to begin the search, or
  507. * -1 to search within the Input Region.
  508. * @param status A reference to a UErrorCode to receive any errors.
  509. * @return true if a match is found.
       * @see uregex_find64
       * @see uregex_findNext
  510. * @stable ICU 3.0
  511. */
  512. U_CAPI UBool U_EXPORT2
  513. uregex_find(URegularExpression *regexp,
  514. int32_t startIndex,
  515. UErrorCode *status);
  516. /**
  517. * 64bit version of uregex_find.
  518. * Find the first matching substring of the input string that matches the pattern.
  519. * If startIndex is >= zero the search for a match begins at the specified index,
  520. * and any match region is reset. This corresponds directly with
  521. * Matcher.find(startIndex) in Java.
  522. *
  523. * If startIndex == -1 the search begins at the start of the input region,
  524. * or at the start of the full string if no region has been specified.
  525. *
  526. * If a match is found, <code>uregex_start(), uregex_end()</code>, and
  527. * <code>uregex_group()</code> will provide more information regarding the match.
  528. *
  529. * @param regexp The compiled regular expression.
  530. * @param startIndex The position (native) in the input string to begin the search, or
  531. * -1 to search within the Input Region.
  532. * @param status A reference to a UErrorCode to receive any errors.
  533. * @return true if a match is found.
       * @see uregex_find
       * @see uregex_findNext
  534. * @stable ICU 4.6
  535. */
  536. U_CAPI UBool U_EXPORT2
  537. uregex_find64(URegularExpression *regexp,
  538. int64_t startIndex,
  539. UErrorCode *status);
  540. /**
  541. * Find the next pattern match in the input string. Begin searching
  542. * the input at the location following the end of the previous match,
  543. * or at the start of the string (or region) if there is no
  544. * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and
  545. * <code>uregex_group()</code> will provide more information regarding the match.
  546. *
  547. * @param regexp The compiled regular expression.
  548. * @param status A reference to a UErrorCode to receive any errors.
  549. * @return true if a match is found.
  550. * @see uregex_reset
  551. * @stable ICU 3.0
  552. */
  553. U_CAPI UBool U_EXPORT2
  554. uregex_findNext(URegularExpression *regexp,
  555. UErrorCode *status);
  556. /**
  557. * Get the number of capturing groups in this regular expression's pattern.
  558. * @param regexp The compiled regular expression.
  559. * @param status A reference to a UErrorCode to receive any errors.
  560. * @return the number of capture groups.
       * @see uregex_group
  561. * @stable ICU 3.0
  562. */
  563. U_CAPI int32_t U_EXPORT2
  564. uregex_groupCount(URegularExpression *regexp,
  565. UErrorCode *status);
  566. /**
  567. * Get the group number corresponding to a named capture group.
  568. * The returned number can be used with any function that accesses
  569. * capture groups by number.
  570. *
  571. * The function returns an error status if the specified name does not
  572. * appear in the pattern.
  573. *
  574. * @param regexp The compiled regular expression.
  575. * @param groupName The capture group name.
  576. * @param nameLength The length of the name, or -1 if the name is a
  577. * nul-terminated string.
  578. * @param status A pointer to a UErrorCode to receive any errors.
       * @return The group number corresponding to the named capture group.
  579. *
  580. * @stable ICU 55
  581. */
  582. U_CAPI int32_t U_EXPORT2
  583. uregex_groupNumberFromName(URegularExpression *regexp,
  584. const UChar *groupName,
  585. int32_t nameLength,
  586. UErrorCode *status);
  587. /**
  588. * Get the group number corresponding to a named capture group.
  589. * The returned number can be used with any function that accesses
  590. * capture groups by number.
  591. *
  592. * The function returns an error status if the specified name does not
  593. * appear in the pattern.
  594. *
  595. * @param regexp The compiled regular expression.
  596. * @param groupName The capture group name,
  597. * platform invariant characters only.
  598. * @param nameLength The length of the name, or -1 if the name is
  599. * nul-terminated.
  600. * @param status A pointer to a UErrorCode to receive any errors.
       * @return The group number corresponding to the named capture group.
  601. *
  602. * @stable ICU 55
  603. */
  604. U_CAPI int32_t U_EXPORT2
  605. uregex_groupNumberFromCName(URegularExpression *regexp,
  606. const char *groupName,
  607. int32_t nameLength,
  608. UErrorCode *status);
  609. /** Extract the string for the specified matching expression or subexpression.
  610. * Group #0 is the complete string of matched text.
  611. * Group #1 is the text matched by the first set of capturing parentheses.
  612. *
  613. * @param regexp The compiled regular expression.
  614. * @param groupNum The capture group to extract. Group 0 is the complete
  615. * match. The value of this parameter must be
  616. * less than or equal to the number of capture groups in
  617. * the pattern.
  618. * @param dest Buffer to receive the matching string data.
  619. * @param destCapacity Capacity of the dest buffer.
  620. * @param status A reference to a UErrorCode to receive any errors.
  621. * @return Length of the matching data,
  622. * or -1 if there is no applicable match.
  623. * @stable ICU 3.0
  624. */
  625. U_CAPI int32_t U_EXPORT2
  626. uregex_group(URegularExpression *regexp,
  627. int32_t groupNum,
  628. UChar *dest,
  629. int32_t destCapacity,
  630. UErrorCode *status);
  631. /** Returns a shallow immutable clone of the entire input string with the current index set
  632. * to the beginning of the requested capture group. The capture group length is also
  633. * returned via groupLength.
  634. * Group #0 is the complete string of matched text.
  635. * Group #1 is the text matched by the first set of capturing parentheses.
  636. *
  637. * @param regexp The compiled regular expression.
  638. * @param groupNum The capture group to extract. Group 0 is the complete
  639. * match. The value of this parameter must be
  640. * less than or equal to the number of capture groups in
  641. * the pattern.
  642. * @param dest A mutable UText in which to store the current input.
  643. * If NULL, a new UText will be created as an immutable shallow clone
  644. * of the entire input string.
  645. * @param groupLength Output parameter; receives the length of the requested capture group.
  646. * @param status A reference to a UErrorCode to receive any errors.
  647. * @return The subject text currently associated with this regular expression.
  648. * If a pre-allocated UText was provided, it will always be used and returned.
  649. *
  650. * @stable ICU 4.6
  651. */
  652. U_CAPI UText * U_EXPORT2
  653. uregex_groupUText(URegularExpression *regexp,
  654. int32_t groupNum,
  655. UText *dest,
  656. int64_t *groupLength,
  657. UErrorCode *status);
  658. /**
  659. * Returns the index in the input string of the start of the text matched by the
  660. * specified capture group during the previous match operation. Returns -1 if
  661. * the capture group was not part of the last match.
  662. * Group #0 refers to the complete range of matched text.
  663. * Group #1 refers to the text matched by the first set of capturing parentheses.
  664. *
  665. * @param regexp The compiled regular expression.
  666. * @param groupNum The capture group number.
  667. * @param status A reference to a UErrorCode to receive any errors.
  668. * @return The starting (native) position in the input of the text matched
  669. * by the specified group, or -1 if the group was not part of the last match.
  670. * @stable ICU 3.0
  671. */
  672. U_CAPI int32_t U_EXPORT2
  673. uregex_start(URegularExpression *regexp,
  674. int32_t groupNum,
  675. UErrorCode *status);
  676. /**
  677. * 64-bit version of uregex_start().
  678. * Returns the index in the input string of the start of the text matched by the
  679. * specified capture group during the previous match operation. Returns -1 if
  680. * the capture group was not part of the last match.
  681. * Group #0 refers to the complete range of matched text.
  682. * Group #1 refers to the text matched by the first set of capturing parentheses.
  683. *
  684. * @param regexp The compiled regular expression.
  685. * @param groupNum The capture group number.
  686. * @param status A reference to a UErrorCode to receive any errors.
  687. * @return The starting (native) position in the input of the text matched
  688. * by the specified group, or -1 if the group was not part of the last match.
  689. * @stable ICU 4.6
  690. */
  691. U_CAPI int64_t U_EXPORT2
  692. uregex_start64(URegularExpression *regexp,
  693. int32_t groupNum,
  694. UErrorCode *status);
  695. /**
  696. * Returns the index in the input string of the position following the end
  697. * of the text matched by the specified capture group.
  698. * Returns -1 if the capture group was not part of the last match.
  699. * Group #0 refers to the complete range of matched text.
  700. * Group #1 refers to the text matched by the first set of capturing parentheses.
  701. *
  702. * @param regexp The compiled regular expression.
  703. * @param groupNum The capture group number.
  704. * @param status A reference to a UErrorCode to receive any errors.
  705. * @return The (native) index of the position following the last matched character, or -1 if the group was not part of the last match.
  706. * @stable ICU 3.0
  707. */
  708. U_CAPI int32_t U_EXPORT2
  709. uregex_end(URegularExpression *regexp,
  710. int32_t groupNum,
  711. UErrorCode *status);
  712. /**
  713. * 64-bit version of uregex_end().
  714. * Returns the index in the input string of the position following the end
  715. * of the text matched by the specified capture group.
  716. * Returns -1 if the capture group was not part of the last match.
  717. * Group #0 refers to the complete range of matched text.
  718. * Group #1 refers to the text matched by the first set of capturing parentheses.
  719. *
  720. * @param regexp The compiled regular expression.
  721. * @param groupNum The capture group number.
  722. * @param status A reference to a UErrorCode to receive any errors.
  723. * @return The (native) index of the position following the last matched character, or -1 if the group was not part of the last match.
  724. * @stable ICU 4.6
  725. */
  726. U_CAPI int64_t U_EXPORT2
  727. uregex_end64(URegularExpression *regexp,
  728. int32_t groupNum,
  729. UErrorCode *status);
  730. /**
  731. * Reset any saved state from the previous match. Has the effect of
  732. * causing uregex_findNext() to begin at the specified index, and causing
  733. * uregex_start(), uregex_end() and uregex_group() to return an error
  734. * indicating that there is no match information available. Clears any
  735. * match region that may have been set.
  736. *
  737. * @param regexp The compiled regular expression.
  738. * @param index The position (native) in the text at which
  739. * uregex_findNext() should begin searching.
  740. * @param status A reference to a UErrorCode to receive any errors.
  741. * @stable ICU 3.0
  742. */
  743. U_CAPI void U_EXPORT2
  744. uregex_reset(URegularExpression *regexp,
  745. int32_t index,
  746. UErrorCode *status);
  747. /**
  748. * 64-bit version of uregex_reset().
  749. * Reset any saved state from the previous match. Has the effect of
  750. * causing uregex_findNext() to begin at the specified index, and causing
  751. * uregex_start(), uregex_end() and uregex_group() to return an error
  752. * indicating that there is no match information available. Clears any
  753. * match region that may have been set.
  754. *
  755. * @param regexp The compiled regular expression.
  756. * @param index The position (native) in the text at which
  757. * uregex_findNext() should begin searching.
  758. * @param status A reference to a UErrorCode to receive any errors.
  759. * @stable ICU 4.6
  760. */
  761. U_CAPI void U_EXPORT2
  762. uregex_reset64(URegularExpression *regexp,
  763. int64_t index,
  764. UErrorCode *status);
  765. /**
  766. * Sets the limits of the matching region for this URegularExpression.
  767. * The region is the part of the input string that will be considered when matching.
  768. * Invoking this function resets any saved state from the previous match,
  769. * then sets the region to start at the index specified by the regionStart parameter
  770. * and end at the index specified by the regionLimit parameter.
  771. *
  772. * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds()
  773. * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently
  774. * at or around the boundaries of the region.
  775. *
  776. * The function will fail if regionStart is greater than regionLimit, or if either index
  777. * is less than zero or greater than the length of the string being matched.
  778. *
  779. * @param regexp The compiled regular expression.
  780. * @param regionStart The (native) index to begin searches at.
  781. * @param regionLimit The (native) index to end searches at (exclusive).
  782. * @param status A pointer to a UErrorCode to receive any errors.
  783. * @stable ICU 4.0
  784. */
  785. U_CAPI void U_EXPORT2
  786. uregex_setRegion(URegularExpression *regexp,
  787. int32_t regionStart,
  788. int32_t regionLimit,
  789. UErrorCode *status);
  790. /**
  791. * 64-bit version of uregex_setRegion().
  792. * Sets the limits of the matching region for this URegularExpression.
  793. * The region is the part of the input string that will be considered when matching.
  794. * Invoking this function resets any saved state from the previous match,
  795. * then sets the region to start at the index specified by the regionStart parameter
  796. * and end at the index specified by the regionLimit parameter.
  797. *
  798. * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds()
  799. * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently
  800. * at or around the boundaries of the region.
  801. *
  802. * The function will fail if regionStart is greater than regionLimit, or if either index
  803. * is less than zero or greater than the length of the string being matched.
  804. *
  805. * @param regexp The compiled regular expression.
  806. * @param regionStart The (native) index to begin searches at.
  807. * @param regionLimit The (native) index to end searches at (exclusive).
  808. * @param status A pointer to a UErrorCode to receive any errors.
  809. * @stable ICU 4.6
  810. */
  811. U_CAPI void U_EXPORT2
  812. uregex_setRegion64(URegularExpression *regexp,
  813. int64_t regionStart,
  814. int64_t regionLimit,
  815. UErrorCode *status);
  816. /**
  817. * Set the matching region and the starting index for subsequent matches
  818. * in a single operation.
  819. * This is useful because the usual function for setting the starting
  820. * index, uregex_reset(), also resets any region limits.
  821. *
  822. * @param regexp The compiled regular expression.
  823. * @param regionStart The (native) index to begin searches at.
  824. * @param regionLimit The (native) index to end searches at (exclusive).
  825. * @param startIndex The index in the input text at which the next
  826. * match operation should begin.
  827. * @param status A pointer to a UErrorCode to receive any errors.
  828. * @stable ICU 4.6
  829. */
  830. U_CAPI void U_EXPORT2
  831. uregex_setRegionAndStart(URegularExpression *regexp,
  832. int64_t regionStart,
  833. int64_t regionLimit,
  834. int64_t startIndex,
  835. UErrorCode *status);
  836. /**
  837. * Reports the start index of the matching region. Any matches found are limited
  838. * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
  839. *
  840. * @param regexp The compiled regular expression.
  841. * @param status A pointer to a UErrorCode to receive any errors.
  842. * @return The starting (native) index of this matcher's region.
  843. * @stable ICU 4.0
  844. */
  845. U_CAPI int32_t U_EXPORT2
  846. uregex_regionStart(const URegularExpression *regexp,
  847. UErrorCode *status);
  848. /**
  849. * 64-bit version of uregex_regionStart().
  850. * Reports the start index of the matching region. Any matches found are limited
  851. * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
  852. *
  853. * @param regexp The compiled regular expression.
  854. * @param status A pointer to a UErrorCode to receive any errors.
  855. * @return The starting (native) index of this matcher's region.
  856. * @stable ICU 4.6
  857. */
  858. U_CAPI int64_t U_EXPORT2
  859. uregex_regionStart64(const URegularExpression *regexp,
  860. UErrorCode *status);
  861. /**
  862. * Reports the end index (exclusive) of the matching region for this URegularExpression.
  863. * Any matches found are limited to the region bounded by regionStart (inclusive)
  864. * and regionEnd (exclusive).
  865. *
  866. * @param regexp The compiled regular expression.
  867. * @param status A pointer to a UErrorCode to receive any errors.
  868. * @return The ending point (native) of this matcher's region.
  869. * @stable ICU 4.0
  870. */
  871. U_CAPI int32_t U_EXPORT2
  872. uregex_regionEnd(const URegularExpression *regexp,
  873. UErrorCode *status);
  874. /**
  875. * 64-bit version of uregex_regionEnd().
  876. * Reports the end index (exclusive) of the matching region for this URegularExpression.
  877. * Any matches found are limited to the region bounded by regionStart (inclusive)
  878. * and regionEnd (exclusive).
  879. *
  880. * @param regexp The compiled regular expression.
  881. * @param status A pointer to a UErrorCode to receive any errors.
  882. * @return The ending point (native) of this matcher's region.
  883. * @stable ICU 4.6
  884. */
  885. U_CAPI int64_t U_EXPORT2
  886. uregex_regionEnd64(const URegularExpression *regexp,
  887. UErrorCode *status);
  888. /**
  889. * Queries the transparency of region bounds for this URegularExpression.
  890. * See uregex_useTransparentBounds() for a description of transparent and opaque bounds.
  891. * By default, matching boundaries are opaque.
  892. *
  893. * @param regexp The compiled regular expression.
  894. * @param status A pointer to a UErrorCode to receive any errors.
  895. * @return true if this matcher is using transparent bounds, false if it is using opaque bounds.
  896. * @stable ICU 4.0
  897. */
  898. U_CAPI UBool U_EXPORT2
  899. uregex_hasTransparentBounds(const URegularExpression *regexp,
  900. UErrorCode *status);
  901. /**
  902. * Sets the transparency of region bounds for this URegularExpression.
  903. * Invoking this function with an argument of true will cause matching to use transparent bounds.
  904. * If the boolean argument is false, then opaque bounds will be used.
  905. *
  906. * Using transparent bounds, the boundaries of the matching region are transparent
  907. * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
  908. * see text beyond the boundaries of the region while checking for a match.
  909. *
  910. * With opaque bounds, no text outside of the matching region is visible to lookahead,
  911. * lookbehind, and boundary matching constructs.
  912. *
  913. * By default, opaque bounds are used.
  914. *
  915. * @param regexp The compiled regular expression.
  916. * @param b true for transparent bounds; false for opaque bounds.
  917. * @param status A pointer to a UErrorCode to receive any errors.
  918. * @stable ICU 4.0
  919. **/
  920. U_CAPI void U_EXPORT2
  921. uregex_useTransparentBounds(URegularExpression *regexp,
  922. UBool b,
  923. UErrorCode *status);
  924. /**
  925. * Return true if this URegularExpression is using anchoring bounds.
  926. * By default, anchoring region bounds are used.
  927. *
  928. * @param regexp The compiled regular expression.
  929. * @param status A pointer to a UErrorCode to receive any errors.
  930. * @return true if this matcher is using anchoring bounds, false otherwise.
  931. * @stable ICU 4.0
  932. */
  933. U_CAPI UBool U_EXPORT2
  934. uregex_hasAnchoringBounds(const URegularExpression *regexp,
  935. UErrorCode *status);
  936. /**
  937. * Set whether this URegularExpression is using Anchoring Bounds for its region.
  938. * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
  939. * and end of the region. Without Anchoring Bounds, anchors will only match at
  940. * the positions they would in the complete text.
  941. *
  942. * Anchoring Bounds are the default for regions.
  943. *
  944. * @param regexp The compiled regular expression.
  945. * @param b true to enable anchoring bounds; false to disable them.
  946. * @param status A pointer to a UErrorCode to receive any errors.
  947. * @stable ICU 4.0
  948. */
  949. U_CAPI void U_EXPORT2
  950. uregex_useAnchoringBounds(URegularExpression *regexp,
  951. UBool b,
  952. UErrorCode *status);
  953. /**
  954. * Return true if the most recent matching operation touched the
  955. * end of the text being processed. In this case, additional input text could
  956. * change the results of that match.
  957. *
  958. * @param regexp The compiled regular expression.
  959. * @param status A pointer to a UErrorCode to receive any errors.
  960. * @return true if the most recent match hit the end of input, false otherwise.
  961. * @stable ICU 4.0
  962. */
  963. U_CAPI UBool U_EXPORT2
  964. uregex_hitEnd(const URegularExpression *regexp,
  965. UErrorCode *status);
  966. /**
  967. * Return true if the most recent match succeeded and additional input could cause
  968. * it to fail. If this function returns false and a match was found, then more input
  969. * might change the match but the match won't be lost. If a match was not found,
  970. * then the result of uregex_requireEnd() has no meaning.
  971. *
  972. * @param regexp The compiled regular expression.
  973. * @param status A pointer to a UErrorCode to receive any errors.
  974. * @return true if more input could cause the most recent match to no longer match.
  975. * @stable ICU 4.0
  976. */
  977. U_CAPI UBool U_EXPORT2
  978. uregex_requireEnd(const URegularExpression *regexp,
  979. UErrorCode *status);
  980. /**
  981. * Replaces every substring of the input that matches the pattern
  982. * with the given replacement string. This is a convenience function that
  983. * provides a complete find-and-replace-all operation.
  984. *
  985. * This function scans the input string looking for matches of the pattern.
  986. * Input that is not part of any match is copied unchanged to the
  987. * destination buffer. Matched regions are replaced in the output
  988. * buffer by the replacement string. The replacement string may contain
  989. * references to capture groups; these take the form of $1, $2, etc.
  990. *
  991. * @param regexp The compiled regular expression.
  992. * @param replacementText A string containing the replacement text.
  993. * @param replacementLength The length of the replacement string, or
  994. * -1 if it is NUL-terminated.
  995. * @param destBuf A (UChar *) buffer that will receive the result.
  996. * @param destCapacity The capacity of the destination buffer.
  997. * @param status A reference to a UErrorCode to receive any errors.
  998. * @return The length of the string resulting from the find
  999. * and replace operation. In the event that the
  1000. * destination capacity is inadequate, the return value
  1001. * is still the full length of the untruncated string.
  1002. * @stable ICU 3.0
  1003. */
  1004. U_CAPI int32_t U_EXPORT2
  1005. uregex_replaceAll(URegularExpression *regexp,
  1006. const UChar *replacementText,
  1007. int32_t replacementLength,
  1008. UChar *destBuf,
  1009. int32_t destCapacity,
  1010. UErrorCode *status);
  1011. /**
  1012. * Replaces every substring of the input that matches the pattern
  1013. * with the given replacement string. This is a convenience function that
  1014. * provides a complete find-and-replace-all operation.
  1015. *
  1016. * This function scans the input string looking for matches of the pattern.
  1017. * Input that is not part of any match is copied unchanged to the
  1018. * destination UText. Matched regions are replaced in the output
  1019. * by the replacement string. The replacement string may contain
  1020. * references to capture groups; these take the form of $1, $2, etc.
  1021. *
  1022. * @param regexp The compiled regular expression.
  1023. * @param replacement A string containing the replacement text.
  1024. * @param dest A mutable UText that will receive the result.
  1025. * If NULL, a new UText will be created (which may not be mutable).
  1026. * @param status A reference to a UErrorCode to receive any errors.
  1027. * @return A UText containing the results of the find and replace.
  1028. * If a pre-allocated UText was provided, it will always be used and returned.
  1029. *
  1030. * @stable ICU 4.6
  1031. */
  1032. U_CAPI UText * U_EXPORT2
  1033. uregex_replaceAllUText(URegularExpression *regexp,
  1034. UText *replacement,
  1035. UText *dest,
  1036. UErrorCode *status);
  1037. /**
  1038. * Replaces the first substring of the input that matches the pattern
  1039. * with the given replacement string. This is a convenience function that
  1040. * provides a complete find-and-replace operation.
  1041. *
  1042. * This function scans the input string looking for a match of the pattern.
  1043. * All input that is not part of the match is copied unchanged to the
  1044. * destination buffer. The matched region is replaced in the output
  1045. * buffer by the replacement string. The replacement string may contain
  1046. * references to capture groups; these take the form of $1, $2, etc.
  1047. *
  1048. * @param regexp The compiled regular expression.
  1049. * @param replacementText A string containing the replacement text.
  1050. * @param replacementLength The length of the replacement string, or
  1051. * -1 if it is NUL-terminated.
  1052. * @param destBuf A (UChar *) buffer that will receive the result.
  1053. * @param destCapacity The capacity of the destination buffer.
  1054. * @param status A reference to a UErrorCode to receive any errors.
  1055. * @return The length of the string resulting from the find
  1056. * and replace operation. In the event that the
  1057. * destination capacity is inadequate, the return value
  1058. * is still the full length of the untruncated string.
  1059. * @stable ICU 3.0
  1060. */
  1061. U_CAPI int32_t U_EXPORT2
  1062. uregex_replaceFirst(URegularExpression *regexp,
  1063. const UChar *replacementText,
  1064. int32_t replacementLength,
  1065. UChar *destBuf,
  1066. int32_t destCapacity,
  1067. UErrorCode *status);
  1068. /**
  1069. * Replaces the first substring of the input that matches the pattern
  1070. * with the given replacement string. This is a convenience function that
  1071. * provides a complete find-and-replace operation.
  1072. *
  1073. * This function scans the input string looking for a match of the pattern.
  1074. * All input that is not part of the match is copied unchanged to the
  1075. * destination UText. The matched region is replaced in the output
  1076. * by the replacement string. The replacement string may contain
  1077. * references to capture groups; these take the form of $1, $2, etc.
  1078. *
  1079. * @param regexp The compiled regular expression.
  1080. * @param replacement A string containing the replacement text.
  1081. * @param dest A mutable UText that will receive the result.
  1082. * If NULL, a new UText will be created (which may not be mutable).
  1083. * @param status A reference to a UErrorCode to receive any errors.
  1084. * @return A UText containing the results of the find and replace.
  1085. * If a pre-allocated UText was provided, it will always be used and returned.
  1086. *
  1087. * @stable ICU 4.6
  1088. */
  1089. U_CAPI UText * U_EXPORT2
  1090. uregex_replaceFirstUText(URegularExpression *regexp,
  1091. UText *replacement,
  1092. UText *dest,
  1093. UErrorCode *status);
  1094. /**
  1095. * Implements a replace operation intended to be used as part of an
  1096. * incremental find-and-replace.
  1097. *
  1098. * <p>The input string, starting from the end of the previous match and ending at
  1099. * the start of the current match, is appended to the destination string. Then the
  1100. * replacement string is appended to the output string,
  1101. * including handling any substitutions of captured text.</p>
  1102. *
  1103. * <p>A note on preflight computation of buffer size and error handling:
  1104. * Calls to uregex_appendReplacement() and uregex_appendTail() are
  1105. * designed to be chained, one after another, with the destination
  1106. * buffer pointer and buffer capacity updated after each in preparation
  1107. * for the next. If the destination buffer is exhausted partway through such a
  1108. * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal
  1109. * ICU conventions are for a function to perform no action if it is
  1110. * called with an error status, but for this one case, uregex_appendReplacement()
  1111. * will operate normally so that buffer size computations will complete
  1112. * correctly.</p>
  1113. *
  1114. * <p>For simple, prepackaged, non-incremental find-and-replace
  1115. * operations, see uregex_replaceFirst() or uregex_replaceAll().</p>
  1116. *
  1117. * @param regexp The regular expression object.
  1118. * @param replacementText The string that will replace the matched portion of the
  1119. * input string as it is copied to the destination buffer.
  1120. * The replacement text may contain references ($1, for
  1121. * example) to capture groups from the match.
  1122. * @param replacementLength The length of the replacement text string,
  1123. * or -1 if the string is NUL-terminated.
  1124. * @param destBuf The buffer into which the results of the
  1125. * find-and-replace are placed. On return, this pointer
  1126. * will be updated to refer to the beginning of the
  1127. * unused portion of buffer, leaving it in position for
  1128. * a subsequent call to this function.
  1129. * @param destCapacity The size of the output buffer. On return, this
  1130. * parameter will be updated to reflect the space remaining
  1131. * unused in the output buffer.
  1132. * @param status A reference to a UErrorCode to receive any errors.
  1133. * @return The length of the result string. In the event that
  1134. * destCapacity is inadequate, the full length of the
  1135. * untruncated output string is returned.
  1136. *
  1137. * @stable ICU 3.0
  1138. *
  1139. */
  1140. U_CAPI int32_t U_EXPORT2
  1141. uregex_appendReplacement(URegularExpression *regexp,
  1142. const UChar *replacementText,
  1143. int32_t replacementLength,
  1144. UChar **destBuf,
  1145. int32_t *destCapacity,
  1146. UErrorCode *status);
  1147. /**
  1148. * Implements a replace operation intended to be used as part of an
  1149. * incremental find-and-replace.
  1150. *
  1151. * <p>The input string, starting from the end of the previous match and ending at
  1152. * the start of the current match, is appended to the destination string. Then the
  1153. * replacement string is appended to the output string,
  1154. * including handling any substitutions of captured text.</p>
  1155. *
  1156. * <p>For simple, prepackaged, non-incremental find-and-replace
  1157. * operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p>
  1158. *
  1159. * @param regexp The regular expression object.
  1160. * @param replacementText The string that will replace the matched portion of the
  1161. * input string as it is copied to the destination UText.
  1162. * The replacement text may contain references ($1, for
  1163. * example) to capture groups from the match.
  1164. * @param dest A mutable UText that will receive the result. Must not be NULL.
  1165. * @param status A reference to a UErrorCode to receive any errors.
  1166. *
  1167. * @stable ICU 4.6
  1168. */
  1169. U_CAPI void U_EXPORT2
  1170. uregex_appendReplacementUText(URegularExpression *regexp,
  1171. UText *replacementText,
  1172. UText *dest,
  1173. UErrorCode *status);
  1174. /**
  1175. * As the final step in a find-and-replace operation, append the remainder
  1176. * of the input string, starting at the position following the last match,
  1177. * to the destination string. <code>uregex_appendTail()</code> is intended
  1178. * to be invoked after one or more invocations of the
  1179. * <code>uregex_appendReplacement()</code> function.
  1180. *
  1181. * @param regexp The regular expression object. This is needed to
  1182. * obtain the input string along with the position
  1183. * of the last match within it.
  1184. * @param destBuf The buffer in which the results of the
  1185. * find-and-replace are placed. On return, the pointer
  1186. * will be updated to refer to the beginning of the
  1187. * unused portion of buffer.
  1188. * @param destCapacity The size of the output buffer. On return, this
  1189. * value will be updated to reflect the space remaining
  1190. * unused in the output buffer.
  1191. * @param status A reference to a UErrorCode to receive any errors.
  1192. * @return The length of the result string. In the event that
  1193. * destCapacity is inadequate, the full length of the
  1194. * untruncated output string is returned.
  1195. *
  1196. * @stable ICU 3.0
  1197. */
  1198. U_CAPI int32_t U_EXPORT2
  1199. uregex_appendTail(URegularExpression *regexp,
  1200. UChar **destBuf,
  1201. int32_t *destCapacity,
  1202. UErrorCode *status);
  1203. /**
  1204. * As the final step in a find-and-replace operation, append the remainder
  1205. * of the input string, starting at the position following the last match,
  1206. * to the destination string. <code>uregex_appendTailUText()</code> is intended
  1207. * to be invoked after one or more invocations of the
  1208. * <code>uregex_appendReplacementUText()</code> function.
  1209. *
  1210. * @param regexp The regular expression object. This is needed to
  1211. * obtain the input string along with the position
  1212. * of the last match within it.
  1213. * @param dest A mutable UText that will receive the result. Must not be NULL.
  1214. *
  1215. * @param status A reference to a UErrorCode to receive any errors.
  1216. *
  1217. * @return The destination UText.
  1218. *
  1219. * @stable ICU 4.6
  1220. */
  1221. U_CAPI UText * U_EXPORT2
  1222. uregex_appendTailUText(URegularExpression *regexp,
  1223. UText *dest,
  1224. UErrorCode *status);
  1225. /**
  1226. * Split a string into fields. Somewhat like split() from Perl.
  1227. * The pattern matches identify delimiters that separate the input
  1228. * into fields. The input data between the matches becomes the
  1229. * fields themselves.
  1230. *
  1231. * Each of the fields is copied from the input string to the destination
  1232. * buffer, and NUL terminated. The position of each field within
  1233. * the destination buffer is returned in the destFields array.
  1234. *
  1235. * If the delimiter pattern includes capture groups, the captured text will
  1236. * also appear in the destination array of output strings, interspersed
  1237. * with the fields. This is similar to Perl, but differs from Java,
  1238. * which ignores the presence of capture groups in the pattern.
  1239. *
  1240. * Trailing empty fields will always be returned, assuming sufficient
  1241. * destination capacity. This differs from the default behavior for Java
  1242. * and Perl, where trailing empty fields are not returned.
  1243. *
  1244. * The number of strings produced by the split operation is returned.
  1245. * This count includes the strings from capture groups in the delimiter pattern.
  1246. * This behavior differs from Java, which ignores capture groups.
  1247. *
  1248. * @param regexp The compiled regular expression.
  1249. * @param destBuf A (UChar *) buffer to receive the fields that
  1250. * are extracted from the input string. The field
  1251. * pointers stored in destFields will refer to positions
  1252. * within this caller-supplied buffer. Any
  1253. * extra positions within the destFields array will be
  1254. * set to NULL.
  1255. * @param destCapacity The capacity of the destBuf.
  1256. * @param requiredCapacity The actual capacity required of the destBuf.
  1257. * If destCapacity is too small, requiredCapacity will return
  1258. * the total capacity required to hold all of the output, and
  1259. * a U_BUFFER_OVERFLOW_ERROR will be returned.
  1260. * @param destFields An array to be filled with the position of each
  1261. * of the extracted fields within destBuf.
  1262. * @param destFieldsCapacity The number of elements in the destFields array.
  1263. * If the number of fields found is less than destFieldsCapacity,
  1264. * the extra destFields elements are set to NULL.
  1265. * If destFieldsCapacity is too small, the trailing part of the
  1266. * input, including any field delimiters, is treated as if it
  1267. * were the last field - it is copied to the destBuf, and
  1268. * its position in the destBuf is stored in the last element
  1269. * of destFields. This behavior mimics that of Perl. It is not
  1270. * an error condition, and no error status is returned when all destFields
  1271. * positions are used.
  1272. * @param status A reference to a UErrorCode to receive any errors.
  1273. * @return The number of fields into which the input string was split.
  1274. * @stable ICU 3.0
  1275. */
  1276. U_CAPI int32_t U_EXPORT2
  1277. uregex_split( URegularExpression *regexp,
  1278. UChar *destBuf,
  1279. int32_t destCapacity,
  1280. int32_t *requiredCapacity,
  1281. UChar *destFields[],
  1282. int32_t destFieldsCapacity,
  1283. UErrorCode *status);
  1284. /**
  1285. * Split a string into fields. Somewhat like split() from Perl.
  1286. * The pattern matches identify delimiters that separate the input
  1287. * into fields. The input data between the matches becomes the
  1288. * fields themselves.
  1289. * <p>
  1290. * The behavior of this function is not very closely aligned with uregex_split();
  1291. * instead, it is based on (and implemented directly on top of) the C++ split method.
  1292. *
  1293. * @param regexp The compiled regular expression.
  1294. * @param destFields An array of mutable UText structs to receive the results of the split.
  1295. * If a field is NULL, a new UText is allocated to contain the results for
  1296. * that field. This new UText is not guaranteed to be mutable.
  1297. * @param destFieldsCapacity The number of elements in the destination array.
  1298. * If the number of fields found is less than destFieldsCapacity, the
  1299. * extra strings in the destination array are not altered.
  1300. * If the number of destination strings is less than the number
  1301. * of fields, the trailing part of the input string, including any
  1302. * field delimiters, is placed in the last destination string.
  1303. * This behavior mimics that of Perl. It is not an error condition, and no
  1304. * error status is returned when all destFields positions are used.
  1305. * @param status A reference to a UErrorCode to receive any errors.
  1306. * @return The number of fields into which the input string was split.
  1307. *
  1308. * @stable ICU 4.6
  1309. */
  1310. U_CAPI int32_t U_EXPORT2
  1311. uregex_splitUText(URegularExpression *regexp,
  1312. UText *destFields[],
  1313. int32_t destFieldsCapacity,
  1314. UErrorCode *status);
  1315. /**
  1316. * Set a processing time limit for match operations with this URegularExpression.
  1317. *
  1318. * Some patterns, when matching certain strings, can run in exponential time.
  1319. * For practical purposes, the match operation may appear to be in an
  1320. * infinite loop.
  1321. * When a limit is set, a match operation will fail with an error if the
  1322. * limit is exceeded.
  1323. * <p>
  1324. * The units of the limit are steps of the match engine.
  1325. * Correspondence with actual processor time will depend on the speed
  1326. * of the processor and the details of the specific pattern, but will
  1327. * typically be on the order of milliseconds.
  1328. * <p>
  1329. * By default, the matching time is not limited.
  1330. *
  1331. *
  1332. * @param regexp The compiled regular expression.
  1333. * @param limit The limit value, or 0 for no limit.
  1334. * @param status A reference to a UErrorCode to receive any errors.
  1335. * @stable ICU 4.0
  1336. */
  1337. U_CAPI void U_EXPORT2
  1338. uregex_setTimeLimit(URegularExpression *regexp,
  1339. int32_t limit,
  1340. UErrorCode *status);
  1341. /**
  1342. * Get the time limit for matches with this URegularExpression.
  1343. * A return value of zero indicates that there is no limit.
  1344. *
  1345. * @param regexp The compiled regular expression.
  1346. * @param status A reference to a UErrorCode to receive any errors.
  1347. * @return the maximum allowed time for a match, in units of processing steps.
  1348. * @stable ICU 4.0
  1349. */
  1350. U_CAPI int32_t U_EXPORT2
  1351. uregex_getTimeLimit(const URegularExpression *regexp,
  1352. UErrorCode *status);
  1353. /**
  1354. * Set the amount of heap storage available for use by the match backtracking stack.
  1355. * <p>
  1356. * ICU uses a backtracking regular expression engine, with the backtrack stack
  1357. * maintained on the heap. This function sets the limit on the amount of memory
  1358. * that can be used for this purpose. A backtracking stack overflow will
  1359. * result in an error from the match operation that caused it.
  1360. * <p>
  1361. * A limit is desirable because a malicious or poorly designed pattern can use
  1362. * excessive memory, potentially crashing the process. A limit is enabled
  1363. * by default.
  1364. *
  1365. * @param regexp The compiled regular expression.
  1366. * @param limit The maximum size, in bytes, of the matching backtrack stack.
  1367. * A value of zero means no limit.
  1368. * The limit must be greater than or equal to zero.
  1369. * @param status A reference to a UErrorCode to receive any errors.
  1370. *
  1371. * @stable ICU 4.0
  1372. */
  1373. U_CAPI void U_EXPORT2
  1374. uregex_setStackLimit(URegularExpression *regexp,
  1375. int32_t limit,
  1376. UErrorCode *status);
  1377. /**
  1378. * Get the size of the heap storage available for use by the backtracking stack.
  1379. * @param regexp The compiled regular expression.
  1380. * @param status A reference to a UErrorCode to receive any errors.
  1381. * @return the maximum backtracking stack size, in bytes, or zero if the stack size is unlimited.
  1382. * @stable ICU 4.0
  1383. */
  1384. U_CAPI int32_t U_EXPORT2
  1385. uregex_getStackLimit(const URegularExpression *regexp,
  1386. UErrorCode *status);
  1387. /**
  1388. * Function pointer for a regular expression matching callback function.
  1389. * When set, a callback function will be called periodically during matching
  1390. * operations. If the callback function returns false, the matching
  1391. * operation will be terminated early.
  1392. *
  1393. * Note: the callback function must not call other functions on this
  1394. * URegularExpression.
  1395. *
  1396. * @param context context pointer. The callback function will be invoked
  1397. * with the context specified at the time that
  1398. * uregex_setMatchCallback() is called.
  1399. * @param steps the accumulated processing time, in match steps,
  1400. * for this matching operation.
  1401. * @return true to continue the matching operation.
  1402. * false to terminate the matching operation.
  1403. * @stable ICU 4.0
  1404. */
  1405. U_CDECL_BEGIN
  1406. typedef UBool U_CALLCONV URegexMatchCallback (
  1407. const void *context,
  1408. int32_t steps);
  1409. U_CDECL_END
  1410. /**
  1411. * Set the match callback function for this URegularExpression.
  1412. * During matching operations the function will be called periodically,
  1413. * giving the application the opportunity to terminate a long-running
  1414. * match.
  1415. *
  1416. * @param regexp The compiled regular expression.
  1417. * @param callback A pointer to the user-supplied URegexMatchCallback function.
  1418. * @param context User context pointer. The value supplied at the
  1419. * time the callback function is set will be saved
  1420. * and passed to the callback each time that it is called.
  1421. * @param status A reference to a UErrorCode to receive any errors.
  1422. * @stable ICU 4.0
  1423. */
  1424. U_CAPI void U_EXPORT2
  1425. uregex_setMatchCallback(URegularExpression *regexp,
  1426. URegexMatchCallback *callback,
  1427. const void *context,
  1428. UErrorCode *status);
  1429. /**
  1430. * Get the match callback function for this URegularExpression.
  1431. *
  1432. * @param regexp The compiled regular expression.
  1433. * @param callback Out parameter, receives a pointer to the user-supplied
  1434. * URegexMatchCallback function.
  1435. * @param context Out parameter, receives the user context pointer that
  1436. * was set when uregex_setMatchCallback() was called.
  1437. * @param status A reference to a UErrorCode to receive any errors.
  1438. * @stable ICU 4.0
  1439. */
  1440. U_CAPI void U_EXPORT2
  1441. uregex_getMatchCallback(const URegularExpression *regexp,
  1442. URegexMatchCallback **callback,
  1443. const void **context,
  1444. UErrorCode *status);
  1445. /**
  1446. * Function pointer for a regular expression find callback function.
  1447. *
  1448. * When set, a callback function will be called during a find operation
  1449. * and for operations that depend on find, such as findNext, split and some replace
  1450. * operations like replaceFirst.
  1451. * The callback will usually be called after each attempt at a match, but this is not a
  1452. * guarantee that the callback will be invoked at each character. For finds where the
  1453. * match engine is invoked at each character, this may be close to true, but less likely
  1454. * for more optimized loops where the pattern is known to only start, and the match
  1455. * engine invoked, at certain characters.
  1456. * When invoked, this callback will specify the index at which a match operation is about
  1457. * to be attempted, giving the application the opportunity to terminate a long-running
  1458. * find operation.
  1459. *
  1460. * If the callback function returns false, the find operation will be terminated early.
  1461. *
  1462. * Note: the callback function must not call other functions on this
  1463. * URegularExpression.
  1464. *
  1465. * @param context context pointer. The callback function will be invoked
  1466. * with the context specified at the time that
  1467. * uregex_setFindProgressCallback() is called.
  1468. * @param matchIndex the next index at which a match will be attempted for this
  1469. * find operation. If this callback interrupts the search, this is the
  1470. * index at which a find/findNext operation may be re-initiated.
  1471. * @return true to continue the matching operation.
  1472. * false to terminate the matching operation.
  1473. * @stable ICU 4.6
  1474. */
  1475. U_CDECL_BEGIN
  1476. typedef UBool U_CALLCONV URegexFindProgressCallback (
  1477. const void *context,
  1478. int64_t matchIndex);
  1479. U_CDECL_END
  1480. /**
  1481. * Set the find progress callback function for this URegularExpression.
  1482. *
  1483. * @param regexp The compiled regular expression.
  1484. * @param callback A pointer to the user-supplied URegexFindProgressCallback function.
  1485. * @param context User context pointer. The value supplied at the
  1486. * time the callback function is set will be saved
  1487. * and passed to the callback each time that it is called.
  1488. * @param status A reference to a UErrorCode to receive any errors.
  1489. * @stable ICU 4.6
  1490. */
  1491. U_CAPI void U_EXPORT2
  1492. uregex_setFindProgressCallback(URegularExpression *regexp,
  1493. URegexFindProgressCallback *callback,
  1494. const void *context,
  1495. UErrorCode *status);
  1496. /**
  1497. * Get the find progress callback function for this URegularExpression.
  1498. *
  1499. * @param regexp The compiled regular expression.
  1500. * @param callback Out parameter, receives a pointer to the user-supplied
  1501. * URegexFindProgressCallback function.
  1502. * @param context Out parameter, receives the user context pointer that
  1503. * was set when uregex_setFindProgressCallback() was called.
  1504. * @param status A reference to a UErrorCode to receive any errors.
  1505. * @stable ICU 4.6
  1506. */
  1507. U_CAPI void U_EXPORT2
  1508. uregex_getFindProgressCallback(const URegularExpression *regexp,
  1509. URegexFindProgressCallback **callback,
  1510. const void **context,
  1511. UErrorCode *status);
  1512. #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
  1513. #endif /* UREGEX_H */