search.h 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2001-2011 IBM and others. All rights reserved.
  6. **********************************************************************
  7. * Date Name Description
  8. * 03/22/2000 helena Creation.
  9. **********************************************************************
  10. */
  11. #ifndef SEARCH_H
  12. #define SEARCH_H
  13. #include "unicode/utypes.h"
  14. #if U_SHOW_CPLUSPLUS_API
  15. /**
  16. * \file
  17. * \brief C++ API: SearchIterator object.
  18. */
  19. #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
  20. #include "unicode/uobject.h"
  21. #include "unicode/unistr.h"
  22. #include "unicode/chariter.h"
  23. #include "unicode/brkiter.h"
  24. #include "unicode/usearch.h"
  25. /**
  26. * @stable ICU 2.0
  27. */
  28. struct USearch;
  29. /**
  30. * @stable ICU 2.0
  31. */
  32. typedef struct USearch USearch;
  33. U_NAMESPACE_BEGIN
  34. /**
  35. *
  36. * <tt>SearchIterator</tt> is an abstract base class that provides
  37. * methods to search for a pattern within a text string. Instances of
  38. * <tt>SearchIterator</tt> maintain a current position and scan over the
  39. * target text, returning the indices at which the pattern is matched and the length
  40. * of each match.
  41. * <p>
  42. * <tt>SearchIterator</tt> defines a protocol for text searching.
  43. * Subclasses provide concrete implementations of various search algorithms.
  44. * For example, <tt>StringSearch</tt> implements language-sensitive pattern
  45. * matching based on the comparison rules defined in a
  46. * <tt>RuleBasedCollator</tt> object.
  47. * <p>
  48. * Other options for searching include using a BreakIterator to restrict
  49. * the points at which matches are detected.
  50. * <p>
  51. * <tt>SearchIterator</tt> provides an API that is similar to that of
  52. * other text iteration classes such as <tt>BreakIterator</tt>. Using
  53. * this class, it is easy to scan through text looking for all occurrences of
  54. * a given pattern. The following example uses a <tt>StringSearch</tt>
  55. * object to find all instances of "fox" in the target string. Any other
  56. * subclass of <tt>SearchIterator</tt> can be used in an identical
  57. * manner.
  58. * <pre><code>
  59. * UnicodeString target("The quick brown fox jumped over the lazy fox");
  60. * UnicodeString pattern("fox");
  61. *
  62. * SearchIterator *iter = new StringSearch(pattern, target);
  63. * UErrorCode error = U_ZERO_ERROR;
  64. * for (int pos = iter->first(error); pos != USEARCH_DONE;
  65. * pos = iter->next(error)) {
  66. * printf("Found match at %d pos, length is %d\n", pos, iter->getMatchedLength());
  67. * }
  68. * </code></pre>
  69. *
  70. * @see StringSearch
  71. * @see RuleBasedCollator
  72. */
  73. class U_I18N_API SearchIterator : public UObject {
  74. public:
  75. // public constructors and destructors -------------------------------
  76. /**
  77. * Copy constructor that creates a SearchIterator instance with the same
  78. * behavior, and iterating over the same text.
  79. * @param other the SearchIterator instance to be copied.
  80. * @stable ICU 2.0
  81. */
  82. SearchIterator(const SearchIterator &other);
  83. /**
  84. * Destructor. Cleans up the search iterator data struct.
  85. * @stable ICU 2.0
  86. */
  87. virtual ~SearchIterator();
  88. // public get and set methods ----------------------------------------
  89. /**
  90. * Sets the index to point to the given position, and clears any state
  91. * that's affected.
  92. * <p>
  93. * This method takes the argument index and sets the position in the text
  94. * string accordingly without checking if the index is pointing to a
  95. * valid starting point to begin searching.
  96. * @param position within the text to be set. If position is outside
  97. * the text range for searching, a
  98. * U_INDEX_OUTOFBOUNDS_ERROR will be set in status
  99. * @param status for errors if it occurs
  100. * @stable ICU 2.0
  101. */
  102. virtual void setOffset(int32_t position, UErrorCode &status) = 0;
  103. /**
  104. * Return the current index in the text being searched.
  105. * If the iteration has gone past the end of the text
  106. * (or past the beginning for a backwards search), USEARCH_DONE
  107. * is returned.
  108. * @return current index in the text being searched.
  109. * @stable ICU 2.0
  110. */
  111. virtual int32_t getOffset() const = 0;
  112. /**
  113. * Sets the text searching attributes located in the enum
  114. * USearchAttribute with values from the enum USearchAttributeValue.
  115. * USEARCH_DEFAULT can be used for all attributes for resetting.
  116. * @param attribute text attribute (enum USearchAttribute) to be set
  117. * @param value text attribute value
  118. * @param status for errors if it occurs
  119. * @stable ICU 2.0
  120. */
  121. void setAttribute(USearchAttribute attribute,
  122. USearchAttributeValue value,
  123. UErrorCode &status);
  124. /**
  125. * Gets the text searching attributes
  126. * @param attribute text attribute (enum USearchAttribute) to be retrieved
  127. * @return text attribute value
  128. * @stable ICU 2.0
  129. */
  130. USearchAttributeValue getAttribute(USearchAttribute attribute) const;
  131. /**
  132. * Returns the index to the match in the text string that was searched.
  133. * This call returns a valid result only after a successful call to
  134. * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
  135. * Just after construction, or after a searching method returns
  136. * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
  137. * <p>
  138. * Use getMatchedLength to get the matched string length.
  139. * @return index of a substring within the text string that is being
  140. * searched.
  141. * @see #first
  142. * @see #next
  143. * @see #previous
  144. * @see #last
  145. * @stable ICU 2.0
  146. */
  147. int32_t getMatchedStart() const;
  148. /**
  149. * Returns the length of text in the string which matches the search
  150. * pattern. This call returns a valid result only after a successful call
  151. * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
  152. * Just after construction, or after a searching method returns
  153. * <tt>USEARCH_DONE</tt>, this method will return 0.
  154. * @return The length of the match in the target text, or 0 if there
  155. * is no match currently.
  156. * @see #first
  157. * @see #next
  158. * @see #previous
  159. * @see #last
  160. * @stable ICU 2.0
  161. */
  162. int32_t getMatchedLength() const;
  163. /**
  164. * Returns the text that was matched by the most recent call to
  165. * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
  166. * If the iterator is not pointing at a valid match (e.g. just after
  167. * construction or after <tt>USEARCH_DONE</tt> has been returned),
  168. * the result is set to an empty string.
  169. * @param result stores the matched string or an empty string if a match
  170. * is not found.
  171. * @see #first
  172. * @see #next
  173. * @see #previous
  174. * @see #last
  175. * @stable ICU 2.0
  176. */
  177. void getMatchedText(UnicodeString &result) const;
  178. /**
  179. * Set the BreakIterator that will be used to restrict the points
  180. * at which matches are detected. The user is responsible for deleting
  181. * the BreakIterator.
  182. * @param breakiter A BreakIterator that will be used to restrict the
  183. * points at which matches are detected. If a match is
  184. * found, but the match's start or end index is not a
  185. * boundary as determined by the <tt>BreakIterator</tt>,
  186. * the match will be rejected and another will be searched
  187. * for. If this parameter is <tt>nullptr</tt>, no break
  188. * detection is attempted.
  189. * @param status for errors if it occurs
  190. * @see BreakIterator
  191. * @stable ICU 2.0
  192. */
  193. void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
  194. /**
  195. * Returns the BreakIterator that is used to restrict the points at
  196. * which matches are detected. This will be the same object that was
  197. * passed to the constructor or to <tt>setBreakIterator</tt>.
  198. * Note that <tt>nullptr</tt> is a legal value; it means that break
  199. * detection should not be attempted.
  200. * @return BreakIterator used to restrict matchings.
  201. * @see #setBreakIterator
  202. * @stable ICU 2.0
  203. */
  204. const BreakIterator* getBreakIterator() const;
  205. /**
  206. * Set the string text to be searched. Text iteration will hence begin at
  207. * the start of the text string. This method is useful if you want to
  208. * re-use an iterator to search for the same pattern within a different
  209. * body of text. The user is responsible for deleting the text.
  210. * @param text string to be searched.
  211. * @param status for errors. If the text length is 0,
  212. * a U_ILLEGAL_ARGUMENT_ERROR is set.
  213. * @stable ICU 2.0
  214. */
  215. virtual void setText(const UnicodeString &text, UErrorCode &status);
  216. /**
  217. * Set the string text to be searched. Text iteration will hence begin at
  218. * the start of the text string. This method is useful if you want to
  219. * re-use an iterator to search for the same pattern within a different
  220. * body of text.
  221. * <p>
  222. * Note: No parsing of the text within the <tt>CharacterIterator</tt>
  223. * will be done during searching for this version. The block of text
  224. * in <tt>CharacterIterator</tt> will be used as it is.
  225. * The user is responsible for deleting the text.
  226. * @param text string iterator to be searched.
  227. * @param status for errors if any. If the text length is 0 then a
  228. * U_ILLEGAL_ARGUMENT_ERROR is set.
  229. * @stable ICU 2.0
  230. */
  231. virtual void setText(CharacterIterator &text, UErrorCode &status);
  232. /**
  233. * Return the string text to be searched.
  234. * @return text string to be searched.
  235. * @stable ICU 2.0
  236. */
  237. const UnicodeString& getText() const;
  238. // operator overloading ----------------------------------------------
  239. /**
  240. * Equality operator.
  241. * @param that SearchIterator instance to be compared.
  242. * @return true if both SearchIterators are of the same class, have the
  243. * same behavior, iterate over the same text and have the same
  244. * attributes. false otherwise.
  245. * @stable ICU 2.0
  246. */
  247. virtual bool operator==(const SearchIterator &that) const;
  248. /**
  249. * Not-equal operator.
  250. * @param that SearchIterator instance to be compared.
  251. * @return false if operator== returns true, and vice versa.
  252. * @stable ICU 2.0
  253. */
  254. bool operator!=(const SearchIterator &that) const;
  255. // public methods ----------------------------------------------------
  256. /**
  257. * Returns a copy of SearchIterator with the same behavior, and
  258. * iterating over the same text, as this one. Note that all data will be
  259. * replicated, except for the text string to be searched.
  260. * @return cloned object
  261. * @stable ICU 2.0
  262. */
  263. virtual SearchIterator* safeClone() const = 0;
  264. /**
  265. * Returns the first index at which the string text matches the search
  266. * pattern. The iterator is adjusted so that its current index (as
  267. * returned by <tt>getOffset</tt>) is the match position if one
  268. * was found.
  269. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  270. * the iterator will be adjusted to the index USEARCH_DONE.
  271. * @param status for errors if it occurs
  272. * @return The character index of the first match, or
  273. * <tt>USEARCH_DONE</tt> if there are no matches.
  274. * @see #getOffset
  275. * @stable ICU 2.0
  276. */
  277. int32_t first(UErrorCode &status);
  278. /**
  279. * Returns the first index equal or greater than <tt>position</tt> at which the
  280. * string text matches the search pattern. The iterator is adjusted so
  281. * that its current index (as returned by <tt>getOffset</tt>) is the
  282. * match position if one was found.
  283. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
  284. * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
  285. * @param position where search is to start from. If position is outside
  286. * the text range for searching, a
  287. * U_INDEX_OUTOFBOUNDS_ERROR will be set in status
  288. * @param status for errors if it occurs
  289. * @return The character index of the first match following
  290. * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
  291. * matches.
  292. * @see #getOffset
  293. * @stable ICU 2.0
  294. */
  295. int32_t following(int32_t position, UErrorCode &status);
  296. /**
  297. * Returns the last index in the target text at which it matches the
  298. * search pattern. The iterator is adjusted so that its current index
  299. * (as returned by <tt>getOffset</tt>) is the match position if one was
  300. * found.
  301. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  302. * the iterator will be adjusted to the index USEARCH_DONE.
  303. * @param status for errors if it occurs
  304. * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
  305. * there are no matches.
  306. * @see #getOffset
  307. * @stable ICU 2.0
  308. */
  309. int32_t last(UErrorCode &status);
  310. /**
  311. * Returns the first index less than <tt>position</tt> at which the string
  312. * text matches the search pattern. The iterator is adjusted so that its
  313. * current index (as returned by <tt>getOffset</tt>) is the match
  314. * position if one was found. If a match is not found,
  315. * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
  316. * adjusted to the index USEARCH_DONE.
  317. * <p>
  318. * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
  319. * result match is always less than <tt>position</tt>.
  320. * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
  321. * <tt>position</tt>.
  322. *
  323. * @param position where search is to start from. If position is outside
  324. * the text range for searching, a
  325. * U_INDEX_OUTOFBOUNDS_ERROR will be set in status
  326. * @param status for errors if it occurs
  327. * @return The character index of the first match preceding
  328. * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
  329. * no matches.
  330. * @see #getOffset
  331. * @stable ICU 2.0
  332. */
  333. int32_t preceding(int32_t position, UErrorCode &status);
  334. /**
  335. * Returns the index of the next point at which the text matches the
  336. * search pattern, starting from the current position.
  337. * The iterator is adjusted so that its current index (as returned by
  338. * <tt>getOffset</tt>) is the match position if one was found.
  339. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  340. * the iterator will be adjusted to a position after the end of the text
  341. * string.
  342. * @param status for errors if it occurs
  343. * @return The index of the next match after the current position,
  344. * or <tt>USEARCH_DONE</tt> if there are no more matches.
  345. * @see #getOffset
  346. * @stable ICU 2.0
  347. */
  348. int32_t next(UErrorCode &status);
  349. /**
  350. * Returns the index of the previous point at which the string text
  351. * matches the search pattern, starting at the current position.
  352. * The iterator is adjusted so that its current index (as returned by
  353. * <tt>getOffset</tt>) is the match position if one was found.
  354. * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
  355. * the iterator will be adjusted to the index USEARCH_DONE.
  356. * @param status for errors if it occurs
  357. * @return The index of the previous match before the current position,
  358. * or <tt>USEARCH_DONE</tt> if there are no more matches.
  359. * @see #getOffset
  360. * @stable ICU 2.0
  361. */
  362. int32_t previous(UErrorCode &status);
  363. /**
  364. * Resets the iteration.
  365. * Search will begin at the start of the text string if a forward
  366. * iteration is initiated before a backwards iteration. Otherwise if a
  367. * backwards iteration is initiated before a forwards iteration, the
  368. * search will begin at the end of the text string.
  369. * @stable ICU 2.0
  370. */
  371. virtual void reset();
  372. protected:
  373. // protected data members ---------------------------------------------
  374. /**
  375. * C search data struct
  376. * @stable ICU 2.0
  377. */
  378. USearch *m_search_;
  379. /**
  380. * Break iterator.
  381. * Currently the C++ breakiterator does not have getRules etc to reproduce
  382. * another in C. Hence we keep the original around and do the verification
  383. * at the end of the match. The user is responsible for deleting this
  384. * break iterator.
  385. * @stable ICU 2.0
  386. */
  387. BreakIterator *m_breakiterator_;
  388. /**
  389. * Unicode string version of the search text
  390. * @stable ICU 2.0
  391. */
  392. UnicodeString m_text_;
  393. // protected constructors and destructors -----------------------------
  394. /**
  395. * Default constructor.
  396. * Initializes data to the default values.
  397. * @stable ICU 2.0
  398. */
  399. SearchIterator();
  400. /**
  401. * Constructor for use by subclasses.
  402. * @param text The target text to be searched.
  403. * @param breakiter A {@link BreakIterator} that is used to restrict the
  404. * points at which matches are detected. If
  405. * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
  406. * match, but the match's start or end index is not a
  407. * boundary as determined by the <tt>BreakIterator</tt>,
  408. * the match is rejected and <tt>handleNext</tt> or
  409. * <tt>handlePrev</tt> is called again. If this parameter
  410. * is <tt>nullptr</tt>, no break detection is attempted.
  411. * @see #handleNext
  412. * @see #handlePrev
  413. * @stable ICU 2.0
  414. */
  415. SearchIterator(const UnicodeString &text,
  416. BreakIterator *breakiter = nullptr);
  417. /**
  418. * Constructor for use by subclasses.
  419. * <p>
  420. * Note: No parsing of the text within the <tt>CharacterIterator</tt>
  421. * will be done during searching for this version. The block of text
  422. * in <tt>CharacterIterator</tt> will be used as it is.
  423. * @param text The target text to be searched.
  424. * @param breakiter A {@link BreakIterator} that is used to restrict the
  425. * points at which matches are detected. If
  426. * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
  427. * match, but the match's start or end index is not a
  428. * boundary as determined by the <tt>BreakIterator</tt>,
  429. * the match is rejected and <tt>handleNext</tt> or
  430. * <tt>handlePrev</tt> is called again. If this parameter
  431. * is <tt>nullptr</tt>, no break detection is attempted.
  432. * @see #handleNext
  433. * @see #handlePrev
  434. * @stable ICU 2.0
  435. */
  436. SearchIterator(CharacterIterator &text, BreakIterator *breakiter = nullptr);
  437. // protected methods --------------------------------------------------
  438. /**
  439. * Assignment operator. Sets this iterator to have the same behavior,
  440. * and iterate over the same text, as the one passed in.
  441. * @param that instance to be copied.
  442. * @stable ICU 2.0
  443. */
  444. SearchIterator & operator=(const SearchIterator &that);
  445. /**
  446. * Abstract method which subclasses override to provide the mechanism
  447. * for finding the next match in the target text. This allows different
  448. * subclasses to provide different search algorithms.
  449. * <p>
  450. * If a match is found, the implementation should return the index at
  451. * which the match starts and should call
  452. * <tt>setMatchLength</tt> with the number of characters
  453. * in the target text that make up the match. If no match is found, the
  454. * method should return USEARCH_DONE.
  455. * <p>
  456. * @param position The index in the target text at which the search
  457. * should start.
  458. * @param status for error codes if it occurs.
  459. * @return index at which the match starts, else if match is not found
  460. * USEARCH_DONE is returned
  461. * @see #setMatchLength
  462. * @stable ICU 2.0
  463. */
  464. virtual int32_t handleNext(int32_t position, UErrorCode &status)
  465. = 0;
  466. /**
  467. * Abstract method which subclasses override to provide the mechanism for
  468. * finding the previous match in the target text. This allows different
  469. * subclasses to provide different search algorithms.
  470. * <p>
  471. * If a match is found, the implementation should return the index at
  472. * which the match starts and should call
  473. * <tt>setMatchLength</tt> with the number of characters
  474. * in the target text that make up the match. If no match is found, the
  475. * method should return USEARCH_DONE.
  476. * <p>
  477. * @param position The index in the target text at which the search
  478. * should start.
  479. * @param status for error codes if it occurs.
  480. * @return index at which the match starts, else if match is not found
  481. * USEARCH_DONE is returned
  482. * @see #setMatchLength
  483. * @stable ICU 2.0
  484. */
  485. virtual int32_t handlePrev(int32_t position, UErrorCode &status)
  486. = 0;
  487. /**
  488. * Sets the length of the currently matched string in the text string to
  489. * be searched.
  490. * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
  491. * methods should call this when they find a match in the target text.
  492. * @param length length of the matched text.
  493. * @see #handleNext
  494. * @see #handlePrev
  495. * @stable ICU 2.0
  496. */
  497. virtual void setMatchLength(int32_t length);
  498. /**
  499. * Sets the offset of the currently matched string in the text string to
  500. * be searched.
  501. * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
  502. * methods should call this when they find a match in the target text.
  503. * @param position start offset of the matched text.
  504. * @see #handleNext
  505. * @see #handlePrev
  506. * @stable ICU 2.0
  507. */
  508. virtual void setMatchStart(int32_t position);
  509. /**
  510. * Sets the match state to "not found".
  511. * @stable ICU 2.0
  512. */
  513. void setMatchNotFound();
  514. };
  515. inline bool SearchIterator::operator!=(const SearchIterator &that) const
  516. {
  517. return !operator==(that); // logical negation of operator==, which the class declares virtual
  518. }
  519. U_NAMESPACE_END
  520. #endif /* #if !UCONFIG_NO_COLLATION */
  521. #endif /* U_SHOW_CPLUSPLUS_API */
  522. #endif