utf8collationiterator.h 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2012-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * utf8collationiterator.h
  9. *
  10. * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
  11. * created by: Markus W. Scherer
  12. */
  13. #ifndef __UTF8COLLATIONITERATOR_H__
  14. #define __UTF8COLLATIONITERATOR_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "cmemory.h"
  18. #include "collation.h"
  19. #include "collationdata.h"
  20. #include "collationiterator.h"
  21. #include "normalizer2impl.h"
  22. U_NAMESPACE_BEGIN
  23. /**
  24. * UTF-8 collation element and character iterator.
  25. * Handles normalized UTF-8 text inline, with length or NUL-terminated.
  26. * Unnormalized text is handled by a subclass.
  27. */
  28. class U_I18N_API UTF8CollationIterator : public CollationIterator {
  29. public:
  30. UTF8CollationIterator(const CollationData *d, UBool numeric,
  31. const uint8_t *s, int32_t p, int32_t len)
  32. : CollationIterator(d, numeric),
  33. u8(s), pos(p), length(len) {}
  34. virtual ~UTF8CollationIterator();
  35. virtual void resetToOffset(int32_t newOffset) override;
  36. virtual int32_t getOffset() const override;
  37. virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
  38. virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
  39. protected:
  40. /**
  41. * For byte sequences that are illegal in UTF-8, an error value may be returned
  42. * together with a bogus code point. The caller will ignore that code point.
  43. *
  44. * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
  45. * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true.
  46. *
  47. * Valid lead surrogates are returned from inside a normalized text segment,
  48. * where handleGetTrailSurrogate() will return the matching trail surrogate.
  49. */
  50. virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
  51. virtual UBool foundNULTerminator() override;
  52. virtual UBool forbidSurrogateCodePoints() const override;
  53. virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
  54. virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
  55. const uint8_t *u8;
  56. int32_t pos;
  57. int32_t length; // <0 for NUL-terminated strings
  58. };
  59. /**
  60. * Incrementally checks the input text for FCD and normalizes where necessary.
  61. */
  62. class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
  63. public:
  64. FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
  65. const uint8_t *s, int32_t p, int32_t len)
  66. : UTF8CollationIterator(data, numeric, s, p, len),
  67. state(CHECK_FWD), start(p),
  68. nfcImpl(data->nfcImpl) {}
  69. virtual ~FCDUTF8CollationIterator();
  70. virtual void resetToOffset(int32_t newOffset) override;
  71. virtual int32_t getOffset() const override;
  72. virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
  73. virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
  74. protected:
  75. virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
  76. virtual char16_t handleGetTrailSurrogate() override;
  77. virtual UBool foundNULTerminator() override;
  78. virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
  79. virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
  80. private:
  81. UBool nextHasLccc() const;
  82. UBool previousHasTccc() const;
  83. /**
  84. * Switches to forward checking if possible.
  85. */
  86. void switchToForward();
  87. /**
  88. * Extends the FCD text segment forward or normalizes around pos.
  89. * @return true if success
  90. */
  91. UBool nextSegment(UErrorCode &errorCode);
  92. /**
  93. * Switches to backward checking.
  94. */
  95. void switchToBackward();
  96. /**
  97. * Extends the FCD text segment backward or normalizes around pos.
  98. * @return true if success
  99. */
  100. UBool previousSegment(UErrorCode &errorCode);
  101. UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
  102. enum State {
  103. /**
  104. * The input text [start..pos[ passes the FCD check.
  105. * Moving forward checks incrementally.
  106. * limit is undefined.
  107. */
  108. CHECK_FWD,
  109. /**
  110. * The input text [pos..limit[ passes the FCD check.
  111. * Moving backward checks incrementally.
  112. * start is undefined.
  113. */
  114. CHECK_BWD,
  115. /**
  116. * The input text [start..limit[ passes the FCD check.
  117. * pos tracks the current text index.
  118. */
  119. IN_FCD_SEGMENT,
  120. /**
  121. * The input text [start..limit[ failed the FCD check and was normalized.
  122. * pos tracks the current index in the normalized string.
  123. */
  124. IN_NORMALIZED
  125. };
  126. State state;
  127. int32_t start;
  128. int32_t limit;
  129. const Normalizer2Impl &nfcImpl;
  130. UnicodeString normalized;
  131. };
  132. U_NAMESPACE_END
  133. #endif // !UCONFIG_NO_COLLATION
  134. #endif // __UTF8COLLATIONITERATOR_H__