usearch.h 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2001-2011,2014 IBM and others. All rights reserved.
  6. **********************************************************************
  7. * Date Name Description
  8. * 06/28/2001 synwee Creation.
  9. **********************************************************************
  10. */
  11. #ifndef USEARCH_H
  12. #define USEARCH_H
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
  15. #include "unicode/ucol.h"
  16. #include "unicode/ucoleitr.h"
  17. #include "unicode/ubrk.h"
  18. #if U_SHOW_CPLUSPLUS_API
  19. #include "unicode/localpointer.h"
  20. #endif // U_SHOW_CPLUSPLUS_API
  21. /**
  22. * \file
  23. * \brief C API: StringSearch
  24. *
  25. * C APIs for an engine that provides language-sensitive text searching based
  26. * on the comparison rules defined in a <code>UCollator</code> data struct,
  27. * see <code>ucol.h</code>. This ensures that language eccentricity can be
  28. * handled, e.g. for the German collator, characters &szlig; and SS will be matched
  29. * if case is chosen to be ignored.
  30. * See the <a href="https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm">
  31. * "ICU Collation Design Document"</a> for more information.
  32. * <p>
  33. * As of ICU4C 4.0 / ICU4J 53, the implementation uses a linear search. In previous versions,
  34. * a modified form of the Boyer-Moore searching algorithm was used. For more information
  35. * on the modified Boyer-Moore algorithm see
  36. * <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.html">
  37. * "Efficient Text Searching in Java"</a>, published in <i>Java Report</i>
  38. * in February, 1999.
  39. * <p>
  40. * There are 2 match options for selection:<br>
  41. * Let S' be the sub-string of a text string S between the offsets start and
  42. * end <start, end>.
  43. * <br>
  44. * A pattern string P matches a text string S at the offsets <start, end>
  45. * if
  46. * <pre>
  47. * option 1. Some canonical equivalent of P matches some canonical equivalent
  48. * of S'
  49. * option 2. P matches S' and if P starts or ends with a combining mark,
  50. * there exists no non-ignorable combining mark before or after S'
  51. * in S respectively.
  52. * </pre>
  53. * Option 2. will be the default.
  54. * <p>
  55. * This search has APIs similar to that of other text iteration mechanisms
  56. * such as the break iterators in <code>ubrk.h</code>. Using these
  57. * APIs, it is easy to scan through text looking for all occurrences of
  58. * a given pattern. This search iterator allows changing of direction by
  59. * calling a <code>reset</code> followed by a <code>next</code> or <code>previous</code>.
  60. * Though a direction change can occur without calling <code>reset</code> first,
  61. * this operation comes with some speed penalty.
  62. * Generally, the matches found in the forward direction are the same as the
  63. * matches found in the backward direction, in reverse order.
  64. * <p>
  65. * <code>usearch.h</code> provides APIs to specify the starting position
  66. * within the text string to be searched, e.g. <code>usearch_setOffset</code>,
  67. * <code>usearch_preceding</code> and <code>usearch_following</code>. Since the
  68. * starting position will be set as it is specified, please take note that
  69. * there are some dangerous positions which the search may render incorrect
  70. * results:
  71. * <ul>
  72. * <li> The midst of a substring that requires normalization.
  73. * <li> If the following match is to be found, the position should not be the
  74. * second character which requires to be swapped with the preceding
  75. * character. Vice versa, if the preceding match is to be found,
  76. * position to search from should not be the first character which
  77. * requires to be swapped with the next character. E.g certain Thai and
  78. * Lao characters require swapping.
  79. * <li> If a following pattern match is to be found, any position within a
  80. * contracting sequence except the first will fail. Vice versa if a
  81. * preceding pattern match is to be found, an invalid starting point
  82. * would be any character within a contracting sequence except the last.
  83. * </ul>
  84. * <p>
  85. * A breakiterator can be used if only matches at logical breaks are desired.
  86. * Using a breakiterator will only give you results that exactly match the
  87. * boundaries given by the breakiterator. For instance the pattern "e" will
  88. * not be found in the string "\u00e9" if a character break iterator is used.
  89. * <p>
  90. * Options are provided to handle overlapping matches.
  91. * E.g. In English, overlapping matches produce the results 0 and 2
  92. * for the pattern "abab" in the text "ababab", whereas mutually
  93. * exclusive matches only produce the result 0.
  94. * <p>
  95. * Options are also provided to implement "asymmetric search" as described in
  96. * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
  97. * UTS #10 Unicode Collation Algorithm</a>, specifically the USearchAttribute
  98. * USEARCH_ELEMENT_COMPARISON and its values.
  99. * <p>
  100. * Though collator attributes will be taken into consideration while
  101. * performing matches, there are no APIs here for setting and getting the
  102. * attributes. These attributes can be set by getting the collator
  103. * from <code>usearch_getCollator</code> and using the APIs in <code>ucol.h</code>.
  104. * Lastly to update String Search to the new collator attributes,
  105. * usearch_reset() has to be called.
  106. * <p>
  107. * Restriction: <br>
  108. * Currently there are no composite characters that consist of a
  109. * character with combining class > 0 before a character with combining
  110. * class == 0. However, if such a character exists in the future, the
  111. * search mechanism does not guarantee the results for option 1.
  112. *
  113. * <p>
  114. * Example of use:<br>
  115. * <pre><code>
  116. * char *tgtstr = "The quick brown fox jumped over the lazy fox";
  117. * char *patstr = "fox";
  118. * UChar target[64];
  119. * UChar pattern[16];
  120. * UErrorCode status = U_ZERO_ERROR;
  121. * u_uastrcpy(target, tgtstr);
  122. * u_uastrcpy(pattern, patstr);
  123. *
  124. * UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US",
  125. * NULL, &status);
  126. * if (U_SUCCESS(status)) {
  127. * for (int pos = usearch_first(search, &status);
  128. * pos != USEARCH_DONE;
  129. * pos = usearch_next(search, &status))
  130. * {
  131. * printf("Found match at %d pos, length is %d\n", pos,
  132. * usearch_getMatchedLength(search));
  133. * }
  134. * }
  135. *
  136. * usearch_close(search);
  137. * </code></pre>
  138. * @stable ICU 2.4
  139. */
  140. /**
  141. * DONE is returned by previous() and next() after all valid matches have
  142. * been returned, and by first() and last() if there are no matches at all.
  143. * @stable ICU 2.4
  144. */
  145. #define USEARCH_DONE -1
  146. /**
  147. * Data structure for searching
  148. * @stable ICU 2.4
  149. */
  150. struct UStringSearch;
  151. /**
  152. * Data structure for searching
  153. * @stable ICU 2.4
  154. */
  155. typedef struct UStringSearch UStringSearch;
  156. /**
  157. * @stable ICU 2.4
  158. */
  159. typedef enum {
  160. /**
  161. * Option for overlapping matches
  162. * @stable ICU 2.4
  163. */
  164. USEARCH_OVERLAP = 0,
  165. #ifndef U_HIDE_DEPRECATED_API
  166. /**
  167. * Option for canonical matches; option 1 in header documentation.
  168. * The default value will be USEARCH_OFF.
  169. * Note: Setting this option to USEARCH_ON currently has no effect on
  170. * search behavior, and this option is deprecated. Instead, to control
  171. * canonical match behavior, you must set UCOL_NORMALIZATION_MODE
  172. * appropriately (to UCOL_OFF or UCOL_ON) in the UCollator used by
  173. * the UStringSearch object.
  174. * @see usearch_openFromCollator
  175. * @see usearch_getCollator
  176. * @see usearch_setCollator
  177. * @see ucol_getAttribute
  178. * @deprecated ICU 53
  179. */
  180. USEARCH_CANONICAL_MATCH = 1,
  181. #endif /* U_HIDE_DEPRECATED_API */
  182. /**
  183. * Option to control how collation elements are compared.
  184. * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON.
  185. * @stable ICU 4.4
  186. */
  187. USEARCH_ELEMENT_COMPARISON = 2,
  188. #ifndef U_HIDE_DEPRECATED_API
  189. /**
  190. * One more than the highest normal USearchAttribute value.
  191. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
  192. */
  193. USEARCH_ATTRIBUTE_COUNT = 3
  194. #endif /* U_HIDE_DEPRECATED_API */
  195. } USearchAttribute;
  196. /**
  197. * @stable ICU 2.4
  198. */
  199. typedef enum {
  200. /**
  201. * Default value for any USearchAttribute
  202. * @stable ICU 2.4
  203. */
  204. USEARCH_DEFAULT = -1,
  205. /**
  206. * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH
  207. * @stable ICU 2.4
  208. */
  209. USEARCH_OFF,
  210. /**
  211. * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH
  212. * @stable ICU 2.4
  213. */
  214. USEARCH_ON,
  215. /**
  216. * Value (default) for USEARCH_ELEMENT_COMPARISON;
  217. * standard collation element comparison at the specified collator
  218. * strength.
  219. * @stable ICU 4.4
  220. */
  221. USEARCH_STANDARD_ELEMENT_COMPARISON,
  222. /**
  223. * Value for USEARCH_ELEMENT_COMPARISON;
  224. * collation element comparison is modified to effectively provide
  225. * behavior between the specified strength and strength - 1. Collation
  226. * elements in the pattern that have the base weight for the specified
  227. * strength are treated as "wildcards" that match an element with any
  228. * other weight at that collation level in the searched text. For
  229. * example, with a secondary-strength English collator, a plain 'e' in
  230. * the pattern will match a plain e or an e with any diacritic in the
  231. * searched text, but an e with diacritic in the pattern will only
  232. * match an e with the same diacritic in the searched text.
  233. *
  234. * This supports "asymmetric search" as described in
  235. * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
  236. * UTS #10 Unicode Collation Algorithm</a>.
  237. *
  238. * @stable ICU 4.4
  239. */
  240. USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD,
  241. /**
  242. * Value for USEARCH_ELEMENT_COMPARISON;
  243. * collation element comparison is modified to effectively provide
  244. * behavior between the specified strength and strength - 1. Collation
  245. * elements in either the pattern or the searched text that have the
  246. * base weight for the specified strength are treated as "wildcards"
  247. * that match an element with any other weight at that collation level.
  248. * For example, with a secondary-strength English collator, a plain 'e'
  249. * in the pattern will match a plain e or an e with any diacritic in the
  250. * searched text, but an e with diacritic in the pattern will only
  251. * match an e with the same diacritic or a plain e in the searched text.
  252. *
  253. * This option is similar to "asymmetric search" as described in
  254. * [UTS #10 Unicode Collation Algorithm](http://www.unicode.org/reports/tr10/#Asymmetric_Search),
  255. * but also allows unmarked characters in the searched text to match
  256. * marked or unmarked versions of that character in the pattern.
  257. *
  258. * @stable ICU 4.4
  259. */
  260. USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD,
  261. #ifndef U_HIDE_DEPRECATED_API
  262. /**
  263. * One more than the highest normal USearchAttributeValue value.
  264. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
  265. */
  266. USEARCH_ATTRIBUTE_VALUE_COUNT
  267. #endif /* U_HIDE_DEPRECATED_API */
  268. } USearchAttributeValue;
  269. /* open and close ------------------------------------------------------ */
  270. /**
  271. * Creates a String Search iterator data struct using the argument locale language
  272. * rule set. A collator will be created in the process, which will be owned by
  273. * this String Search and will be deleted in <code>usearch_close</code>.
  274. *
  275. * The UStringSearch retains a pointer to both the pattern and text strings.
  276. * The caller must not modify or delete them while using the UStringSearch.
  277. *
  278. * @param pattern for matching
  279. * @param patternlength length of the pattern, -1 for null-termination
  280. * @param text text string
  281. * @param textlength length of the text string, -1 for null-termination
  282. * @param locale name of locale for the rules to be used
  283. * @param breakiter A BreakIterator that will be used to restrict the points
  284. * at which matches are detected. If a match is found, but
  285. * the match's start or end index is not a boundary as
  286. * determined by the <code>BreakIterator</code>, the match will
  287. * be rejected and another will be searched for.
  288. * If this parameter is <code>NULL</code>, no break detection is
  289. * attempted.
  290. * @param status for errors if it occurs. If pattern or text is NULL, or if
  291. * patternlength or textlength is 0 then an
  292. * U_ILLEGAL_ARGUMENT_ERROR is returned.
  293. * @return search iterator data structure, or NULL if there is an error.
  294. * @stable ICU 2.4
  295. */
  296. U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
  297. int32_t patternlength,
  298. const UChar *text,
  299. int32_t textlength,
  300. const char *locale,
  301. UBreakIterator *breakiter,
  302. UErrorCode *status);
  303. /**
  304. * Creates a String Search iterator data struct using the argument collator language
  305. * rule set. Note, user retains the ownership of this collator, thus the
  306. * responsibility of deletion lies with the user.
  307. * NOTE: String Search cannot be instantiated from a collator that has
  308. * collate digits as numbers (CODAN) turned on (UCOL_NUMERIC_COLLATION).
  309. *
  310. * The UStringSearch retains a pointer to both the pattern and text strings.
  311. * The caller must not modify or delete them while using the UStringSearch.
  312. *
  313. * @param pattern for matching
  314. * @param patternlength length of the pattern, -1 for null-termination
  315. * @param text text string
  316. * @param textlength length of the text string, -1 for null-termination
  317. * @param collator used for the language rules
  318. * @param breakiter A BreakIterator that will be used to restrict the points
  319. * at which matches are detected. If a match is found, but
  320. * the match's start or end index is not a boundary as
  321. * determined by the <code>BreakIterator</code>, the match will
  322. * be rejected and another will be searched for.
  323. * If this parameter is <code>NULL</code>, no break detection is
  324. * attempted.
  325. * @param status for errors if it occurs. If collator, pattern or text is NULL,
  326. * or if patternlength or textlength is 0 then an
  327. * U_ILLEGAL_ARGUMENT_ERROR is returned.
  328. * @return search iterator data structure, or NULL if there is an error.
  329. * @stable ICU 2.4
  330. */
  331. U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
  332. const UChar *pattern,
  333. int32_t patternlength,
  334. const UChar *text,
  335. int32_t textlength,
  336. const UCollator *collator,
  337. UBreakIterator *breakiter,
  338. UErrorCode *status);
  339. /**
  340. * Destroys and cleans up the String Search iterator data struct.
  341. * If a collator was created in <code>usearch_open</code>, then it will be destroyed here.
  342. * @param searchiter The UStringSearch to clean up
  343. * @stable ICU 2.4
  344. */
  345. U_CAPI void U_EXPORT2 usearch_close(UStringSearch *searchiter);
  346. #if U_SHOW_CPLUSPLUS_API
  347. U_NAMESPACE_BEGIN
  348. /**
  349. * \class LocalUStringSearchPointer
  350. * "Smart pointer" class, closes a UStringSearch via usearch_close().
  351. * For most methods see the LocalPointerBase base class.
  352. *
  353. * @see LocalPointerBase
  354. * @see LocalPointer
  355. * @stable ICU 4.4
  356. */
  357. U_DEFINE_LOCAL_OPEN_POINTER(LocalUStringSearchPointer, UStringSearch, usearch_close);
  358. U_NAMESPACE_END
  359. #endif
  360. /* get and set methods -------------------------------------------------- */
  361. /**
  362. * Sets the current position in the text string which the next search will
  363. * start from. Clears previous states.
  364. * This method takes the argument index and sets the position in the text
  365. * string accordingly without checking if the index is pointing to a
  366. * valid starting point to begin searching.
  367. * Search positions that may render incorrect results are highlighted in the
  368. * header comments
  369. * @param strsrch search iterator data struct
  370. * @param position position to start next search from. If position is less
  371. * than or greater than the text range for searching,
  372. * an U_INDEX_OUTOFBOUNDS_ERROR will be returned
  373. * @param status error status if any.
  374. * @stable ICU 2.4
  375. */
  376. U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
  377. int32_t position,
  378. UErrorCode *status);
  379. /**
  380. * Return the current index in the string text being searched.
  381. * If the iteration has gone past the end of the text (or past the beginning
  382. * for a backwards search), <code>USEARCH_DONE</code> is returned.
  383. * @param strsrch search iterator data struct
  384. * @see #USEARCH_DONE
  385. * @stable ICU 2.4
  386. */
  387. U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch);
  388. /**
  389. * Sets the text searching attributes located in the enum USearchAttribute
  390. * with values from the enum USearchAttributeValue.
  391. * <code>USEARCH_DEFAULT</code> can be used for all attributes for resetting.
  392. * @param strsrch search iterator data struct
  393. * @param attribute text attribute to be set
  394. * @param value text attribute value
  395. * @param status for errors if it occurs
  396. * @see #usearch_getAttribute
  397. * @stable ICU 2.4
  398. */
  399. U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
  400. USearchAttribute attribute,
  401. USearchAttributeValue value,
  402. UErrorCode *status);
  403. /**
  404. * Gets the current value of a text searching attribute.
  405. * @param strsrch search iterator data struct
  406. * @param attribute text attribute to be retrieved
  407. * @return text attribute value
  408. * @see #usearch_setAttribute
  409. * @stable ICU 2.4
  410. */
  411. U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
  412. const UStringSearch *strsrch,
  413. USearchAttribute attribute);
  414. /**
  415. * Returns the index to the match in the text string that was searched.
  416. * This call returns a valid result only after a successful call to
  417. * <code>usearch_first</code>, <code>usearch_next</code>, <code>usearch_previous</code>,
  418. * or <code>usearch_last</code>.
  419. * Just after construction, or after a searching method returns
  420. * <code>USEARCH_DONE</code>, this method will return <code>USEARCH_DONE</code>.
  421. * <p>
  422. * Use <code>usearch_getMatchedLength</code> to get the matched string length.
  423. * @param strsrch search iterator data struct
  424. * @return index to a substring within the text string that is being
  425. * searched.
  426. * @see #usearch_first
  427. * @see #usearch_next
  428. * @see #usearch_previous
  429. * @see #usearch_last
  430. * @see #USEARCH_DONE
  431. * @stable ICU 2.4
  432. */
  433. U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
  434. const UStringSearch *strsrch);
  435. /**
  436. * Returns the length of text in the string which matches the search pattern.
  437. * This call returns a valid result only after a successful call to
  438. * <code>usearch_first</code>, <code>usearch_next</code>, <code>usearch_previous</code>,
  439. * or <code>usearch_last</code>.
  440. * Just after construction, or after a searching method returns
  441. * <code>USEARCH_DONE</code>, this method will return 0.
  442. * @param strsrch search iterator data struct
  443. * @return The length of the match in the string text, or 0 if there is no
  444. * match currently.
  445. * @see #usearch_first
  446. * @see #usearch_next
  447. * @see #usearch_previous
  448. * @see #usearch_last
  449. * @see #USEARCH_DONE
  450. * @stable ICU 2.4
  451. */
  452. U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
  453. const UStringSearch *strsrch);
  454. /**
  455. * Returns the text that was matched by the most recent call to
  456. * <code>usearch_first</code>, <code>usearch_next</code>, <code>usearch_previous</code>,
  457. * or <code>usearch_last</code>.
  458. * If the iterator is not pointing at a valid match (e.g. just after
  459. * construction or after <code>USEARCH_DONE</code> has been returned), returns
  460. * an empty string. If result is not large enough to store the matched text,
  461. * result will be filled with the partial text and a U_BUFFER_OVERFLOW_ERROR
  462. * will be returned in status. result will be null-terminated whenever
  463. * possible. If the buffer fits the matched text exactly, null-termination
  464. * is not possible and a U_STRING_NOT_TERMINATED_ERROR is set in status.
  465. * Pre-flighting can be done either with resultCapacity = 0 or with the API
  466. * <code>usearch_getMatchedLength</code>.
  467. * @param strsrch search iterator data struct
  468. * @param result UChar buffer to store the matched string
  469. * @param resultCapacity capacity of the result buffer, in UChars
  470. * @param status error returned if result is not large enough
  471. * @return exact length of the matched text, not counting the null-termination
  472. * @see #usearch_first
  473. * @see #usearch_next
  474. * @see #usearch_previous
  475. * @see #usearch_last
  476. * @see #USEARCH_DONE
  477. * @stable ICU 2.4
  478. */
  479. U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
  480. UChar *result,
  481. int32_t resultCapacity,
  482. UErrorCode *status);
  483. #if !UCONFIG_NO_BREAK_ITERATION
  484. /**
  485. * Set the BreakIterator that will be used to restrict the points at which
  486. * matches are detected.
  487. * @param strsrch search iterator data struct
  488. * @param breakiter A BreakIterator that will be used to restrict the points
  489. * at which matches are detected. If a match is found, but
  490. * the match's start or end index is not a boundary as
  491. * determined by the <code>BreakIterator</code>, the match will
  492. * be rejected and another will be searched for.
  493. * If this parameter is <code>NULL</code>, no break detection is
  494. * attempted.
  495. * @param status for errors if it occurs
  496. * @see #usearch_getBreakIterator
  497. * @stable ICU 2.4
  498. */
  499. U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
  500. UBreakIterator *breakiter,
  501. UErrorCode *status);
  502. /**
  503. * Returns the BreakIterator that is used to restrict the points at which
  504. * matches are detected. This will be the same object that was passed to the
  505. * constructor or to <code>usearch_setBreakIterator</code>. Note that
  506. * <code>NULL</code>
  507. * is a legal value; it means that break detection should not be attempted.
  508. * @param strsrch search iterator data struct
  509. * @return break iterator used
  510. * @see #usearch_setBreakIterator
  511. * @stable ICU 2.4
  512. */
  513. U_CAPI const UBreakIterator * U_EXPORT2 usearch_getBreakIterator(
  514. const UStringSearch *strsrch);
  515. #endif
  516. /**
  517. * Set the string text to be searched. Text iteration will hence begin at the
  518. * start of the text string. This method is useful if you want to re-use an
  519. * iterator to search for the same pattern within a different body of text.
  520. *
  521. * The UStringSearch retains a pointer to the text string. The caller must not
  522. * modify or delete the string while using the UStringSearch.
  523. *
  524. * @param strsrch search iterator data struct
  525. * @param text new string to look for match
  526. * @param textlength length of the new string, -1 for null-termination
  527. * @param status for errors if it occurs. If text is NULL, or textlength is 0
  528. * then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change
  529. * done to strsrch.
  530. * @see #usearch_getText
  531. * @stable ICU 2.4
  532. */
  533. U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
  534. const UChar *text,
  535. int32_t textlength,
  536. UErrorCode *status);
  537. /**
  538. * Return the string text to be searched.
  539. * @param strsrch search iterator data struct
  540. * @param length returned string text length
  541. * @return string text
  542. * @see #usearch_setText
  543. * @stable ICU 2.4
  544. */
  545. U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
  546. int32_t *length);
  547. /**
  548. * Gets the collator used for the language rules.
  549. * <p>
  550. * Deleting the returned <code>UCollator</code> before calling
  551. * <code>usearch_close</code> would cause the string search to fail.
  552. * <code>usearch_close</code> will delete the collator if this search owns it.
  553. * @param strsrch search iterator data struct
  554. * @return collator
  555. * @stable ICU 2.4
  556. */
  557. U_CAPI UCollator * U_EXPORT2 usearch_getCollator(
  558. const UStringSearch *strsrch);
  559. /**
  560. * Sets the collator used for the language rules. User retains the ownership
  561. * of this collator, thus the responsibility of deletion lies with the user.
  562. * This method causes internal data such as the pattern collation elements
  563. * and shift tables to be recalculated, but the iterator's position is unchanged.
  564. * @param strsrch search iterator data struct
  565. * @param collator to be used
  566. * @param status for errors if it occurs
  567. * @stable ICU 2.4
  568. */
  569. U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
  570. const UCollator *collator,
  571. UErrorCode *status);
  572. /**
  573. * Sets the pattern used for matching.
  574. * Internal data like the pattern collation elements will be recalculated, but the
  575. * iterator's position is unchanged.
  576. *
  577. * The UStringSearch retains a pointer to the pattern string. The caller must not
  578. * modify or delete the string while using the UStringSearch.
  579. *
  580. * @param strsrch search iterator data struct
  581. * @param pattern string
  582. * @param patternlength pattern length, -1 for null-terminated string
  583. * @param status for errors if it occurs. If pattern is NULL, or patternlength is 0
  584. * then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change
  585. * done to strsrch.
  586. * @stable ICU 2.4
  587. */
  588. U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
  589. const UChar *pattern,
  590. int32_t patternlength,
  591. UErrorCode *status);
  592. /**
  593. * Gets the search pattern
  594. * @param strsrch search iterator data struct
  595. * @param length return length of the pattern, -1 indicates that the pattern
  596. * is null-terminated
  597. * @return pattern string
  598. * @stable ICU 2.4
  599. */
  600. U_CAPI const UChar * U_EXPORT2 usearch_getPattern(
  601. const UStringSearch *strsrch,
  602. int32_t *length);
  603. /* methods ------------------------------------------------------------- */
  604. /**
  605. * Returns the first index at which the string text matches the search
  606. * pattern.
  607. * The iterator is adjusted so that its current index (as returned by
  608. * <code>usearch_getOffset</code>) is the match position if one was found.
  609. * If a match is not found, <code>USEARCH_DONE</code> will be returned and
  610. * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
  611. * @param strsrch search iterator data struct
  612. * @param status for errors if it occurs
  613. * @return The character index of the first match, or
  614. * <code>USEARCH_DONE</code> if there are no matches.
  615. * @see #usearch_getOffset
  616. * @see #USEARCH_DONE
  617. * @stable ICU 2.4
  618. */
  619. U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
  620. UErrorCode *status);
  621. /**
  622. * Returns the first index equal to or greater than <code>position</code> at which
  623. * the string text
  624. * matches the search pattern. The iterator is adjusted so that its current
  625. * index (as returned by <code>usearch_getOffset</code>) is the match position if
  626. * one was found.
  627. * If a match is not found, <code>USEARCH_DONE</code> will be returned and
  628. * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
  629. * <p>
  630. * Search positions that may render incorrect results are highlighted in the
  631. * header comments. If position is less than or greater than the text range
  632. * for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned.
  633. * @param strsrch search iterator data struct
  634. * @param position to start the search at
  635. * @param status for errors if it occurs
  636. * @return The character index of the first match at or after <code>position</code>,
  637. * or <code>USEARCH_DONE</code> if there are no matches.
  638. * @see #usearch_getOffset
  639. * @see #USEARCH_DONE
  640. * @stable ICU 2.4
  641. */
  642. U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
  643. int32_t position,
  644. UErrorCode *status);
  645. /**
  646. * Returns the last index in the target text at which it matches the search
  647. * pattern. The iterator is adjusted so that its current
  648. * index (as returned by <code>usearch_getOffset</code>) is the match position if
  649. * one was found.
  650. * If a match is not found, <code>USEARCH_DONE</code> will be returned and
  651. * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
  652. * @param strsrch search iterator data struct
  653. * @param status for errors if they occur
  654. * @return The index of the last match, or <code>USEARCH_DONE</code> if there
  655. * are no matches.
  656. * @see #usearch_getOffset
  657. * @see #USEARCH_DONE
  658. * @stable ICU 2.4
  659. */
  660. U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
  661. UErrorCode *status);
  662. /**
  663. * Returns the first index less than <code>position</code> at which the string text
  664. * matches the search pattern. The iterator is adjusted so that its current
  665. * index (as returned by <code>usearch_getOffset</code>) is the match position if
  666. * one was found.
  667. * If a match is not found, <code>USEARCH_DONE</code> will be returned and
  668. * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
  669. * <p>
  670. * Search positions that may render incorrect results are highlighted in the
  671. * header comments. If position is less than or greater than the text range
  672. * for searching, a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
  673. * <p>
  674. * When the <code>USEARCH_OVERLAP</code> option is off, the last index of the
  675. * result match is always less than <code>position</code>.
  676. * When <code>USEARCH_OVERLAP</code> is on, the result match may span across
  677. * <code>position</code>.
  678. * @param strsrch search iterator data struct
  679. * @param position index position the search is to begin at
  680. * @param status for errors if they occur
  681. * @return The character index of the first match preceding <code>position</code>,
  682. * or <code>USEARCH_DONE</code> if there are no matches.
  683. * @see #usearch_getOffset
  684. * @see #USEARCH_DONE
  685. * @stable ICU 2.4
  686. */
  687. U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
  688. int32_t position,
  689. UErrorCode *status);
  690. /**
  691. * Returns the index of the next point at which the string text matches the
  692. * search pattern, starting from the current position.
  693. * The iterator is adjusted so that its current
  694. * index (as returned by <code>usearch_getOffset</code>) is the match position if
  695. * one was found.
  696. * If a match is not found, <code>USEARCH_DONE</code> will be returned and
  697. * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
  698. * @param strsrch search iterator data struct
  699. * @param status for errors if they occur
  700. * @return The index of the next match after the current position, or
  701. * <code>USEARCH_DONE</code> if there are no more matches.
  702. * @see #usearch_first
  703. * @see #usearch_getOffset
  704. * @see #USEARCH_DONE
  705. * @stable ICU 2.4
  706. */
  707. U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
  708. UErrorCode *status);
  709. /**
  710. * Returns the index of the previous point at which the string text matches
  711. * the search pattern, starting at the current position.
  712. * The iterator is adjusted so that its current
  713. * index (as returned by <code>usearch_getOffset</code>) is the match position if
  714. * one was found.
  715. * If a match is not found, <code>USEARCH_DONE</code> will be returned and
  716. * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
  717. * @param strsrch search iterator data struct
  718. * @param status for errors if they occur
  719. * @return The index of the previous match before the current position,
  720. * or <code>USEARCH_DONE</code> if there are no more matches.
  721. * @see #usearch_last
  722. * @see #usearch_getOffset
  723. * @see #USEARCH_DONE
  724. * @stable ICU 2.4
  725. */
  726. U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
  727. UErrorCode *status);
  728. /**
  729. * Resets the iteration.
  730. * The search will begin at the start of the text string if a forward
  731. * iteration is initiated before a backwards iteration. Otherwise, if a backwards
  732. * iteration is initiated before a forwards iteration, the search will begin
  733. * at the end of the text string.
  734. * @param strsrch search iterator data struct
  735. * @see #usearch_first
  736. * @stable ICU 2.4
  737. */
  738. U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch);
  739. #ifndef U_HIDE_INTERNAL_API
  740. /**
  741. * Simple forward search for the pattern, starting at a specified index,
  742. * and using a default set of search options.
  743. *
  744. * This is an experimental function, and is not an official part of the
  745. * ICU API.
  746. *
  747. * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
  748. *
  749. * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
  750. * any Break Iterator are ignored.
  751. *
  752. * Matches obey the following constraints:
  753. *
  754. * Characters at the start or end positions of a match that are ignorable
  755. * for collation are not included as part of the match, unless they
  756. * are part of a combining sequence, as described below.
  757. *
  758. * A match will not include a partial combining sequence. Combining
  759. * character sequences are considered to be inseparable units,
  760. * and either match the pattern completely, or are considered to not match
  761. * at all. Thus, for example, an A followed by a combining accent mark will
  762. * not be found when searching for a plain (unaccented) A (unless
  763. * the collation strength has been set to ignore all accents).
  764. *
  765. * When beginning a search, the initial starting position, startIdx,
  766. * is assumed to be an acceptable match boundary with respect to
  767. * combining characters. A combining sequence that spans across the
  768. * starting point will not suppress a match beginning at startIdx.
  769. *
  770. * Characters that expand to multiple collation elements
  771. * (German sharp-S becoming 'ss', or the composed forms of accented
  772. * characters, for example) also must match completely.
  773. * Searching for a single 's' in a string containing only a sharp-s will
  774. * find no match.
  775. *
  776. *
  777. * @param strsrch the UStringSearch struct, which references both
  778. * the text to be searched and the pattern being sought.
  779. * @param startIdx The index into the text to begin the search.
  780. * @param matchStart An out parameter, the starting index of the matched text.
  781. * This parameter may be NULL.
  782. * A value of -1 will be returned if no match was found.
  783. * @param matchLimit An out parameter, the index of the first position following the matched text.
  784. * The matchLimit will be at a suitable position for beginning a subsequent search
  785. * in the input text.
  786. * This parameter may be NULL.
  787. * A value of -1 will be returned if no match was found.
  788. *
  789. * @param status Reports any errors. Note that no match found is not an error.
  790. * @return true if a match was found, false otherwise.
  791. *
  792. * @internal
  793. */
  794. U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
  795. int32_t startIdx,
  796. int32_t *matchStart,
  797. int32_t *matchLimit,
  798. UErrorCode *status);
  799. /**
  800. * Simple backwards search for the pattern, starting at a specified index,
  801. * and using a default set of search options.
  802. *
  803. * This is an experimental function, and is not an official part of the
  804. * ICU API.
  805. *
  806. * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
  807. *
  808. * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
  809. * any Break Iterator are ignored.
  810. *
  811. * Matches obey the following constraints:
  812. *
  813. * Characters at the start or end positions of a match that are ignorable
  814. * for collation are not included as part of the match, unless they
  815. * are part of a combining sequence, as described below.
  816. *
  817. * A match will not include a partial combining sequence. Combining
  818. * character sequences are considered to be inseparable units,
  819. * and either match the pattern completely, or are considered to not match
  820. * at all. Thus, for example, an A followed by a combining accent mark will
  821. * not be found when searching for a plain (unaccented) A (unless
  822. * the collation strength has been set to ignore all accents).
  823. *
  824. * When beginning a search, the initial starting position, startIdx,
  825. * is assumed to be an acceptable match boundary with respect to
  826. * combining characters. A combining sequence that spans across the
  827. * starting point will not suppress a match beginning at startIdx.
  828. *
  829. * Characters that expand to multiple collation elements
  830. * (German sharp-S becoming 'ss', or the composed forms of accented
  831. * characters, for example) also must match completely.
  832. * Searching for a single 's' in a string containing only a sharp-s will
  833. * find no match.
  834. *
  835. *
  836. * @param strsrch the UStringSearch struct, which references both
  837. * the text to be searched and the pattern being sought.
  838. * @param startIdx The index into the text to begin the search.
  839. * @param matchStart An out parameter, the starting index of the matched text.
  840. * This parameter may be NULL.
  841. * A value of -1 will be returned if no match was found.
  842. * @param matchLimit An out parameter, the index of the first position following the matched text.
  843. * The matchLimit will be at a suitable position for beginning a subsequent search
  844. * in the input text.
  845. * This parameter may be NULL.
  846. * A value of -1 will be returned if no match was found.
  847. *
  848. * @param status Reports any errors. Note that no match found is not an error.
  849. * @return true if a match was found, false otherwise.
  850. *
  851. * @internal
  852. */
  853. U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
  854. int32_t startIdx,
  855. int32_t *matchStart,
  856. int32_t *matchLimit,
  857. UErrorCode *status);
  858. #endif /* U_HIDE_INTERNAL_API */
  859. #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
  860. #endif