123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 2012-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- *******************************************************************************
- * utf8collationiterator.h
- *
- * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
- * created by: Markus W. Scherer
- */
- #ifndef __UTF8COLLATIONITERATOR_H__
- #define __UTF8COLLATIONITERATOR_H__
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_COLLATION
- #include "cmemory.h"
- #include "collation.h"
- #include "collationdata.h"
- #include "collationiterator.h"
- #include "normalizer2impl.h"
- U_NAMESPACE_BEGIN
- /**
- * UTF-8 collation element and character iterator.
- * Handles normalized UTF-8 text inline, with length or NUL-terminated.
- * Unnormalized text is handled by a subclass.
- */
- class U_I18N_API UTF8CollationIterator : public CollationIterator {
- public:
- UTF8CollationIterator(const CollationData *d, UBool numeric,
- const uint8_t *s, int32_t p, int32_t len)
- : CollationIterator(d, numeric),
- u8(s), pos(p), length(len) {}
- virtual ~UTF8CollationIterator();
- virtual void resetToOffset(int32_t newOffset) override;
- virtual int32_t getOffset() const override;
- virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
- virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
- protected:
- /**
- * For byte sequences that are illegal in UTF-8, an error value may be returned
- * together with a bogus code point. The caller will ignore that code point.
- *
- * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
- * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true.
- *
- * Valid lead surrogates are returned from inside a normalized text segment,
- * where handleGetTrailSurrogate() will return the matching trail surrogate.
- */
- virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
- virtual UBool foundNULTerminator() override;
- virtual UBool forbidSurrogateCodePoints() const override;
- virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
- virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
- const uint8_t *u8;
- int32_t pos;
- int32_t length; // <0 for NUL-terminated strings
- };
- /**
- * Incrementally checks the input text for FCD and normalizes where necessary.
- */
- class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
- public:
- FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
- const uint8_t *s, int32_t p, int32_t len)
- : UTF8CollationIterator(data, numeric, s, p, len),
- state(CHECK_FWD), start(p),
- nfcImpl(data->nfcImpl) {}
- virtual ~FCDUTF8CollationIterator();
- virtual void resetToOffset(int32_t newOffset) override;
- virtual int32_t getOffset() const override;
- virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
- virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
- protected:
- virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
- virtual char16_t handleGetTrailSurrogate() override;
- virtual UBool foundNULTerminator() override;
- virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
- virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
- private:
- UBool nextHasLccc() const;
- UBool previousHasTccc() const;
- /**
- * Switches to forward checking if possible.
- */
- void switchToForward();
- /**
- * Extends the FCD text segment forward or normalizes around pos.
- * @return true if success
- */
- UBool nextSegment(UErrorCode &errorCode);
- /**
- * Switches to backward checking.
- */
- void switchToBackward();
- /**
- * Extends the FCD text segment backward or normalizes around pos.
- * @return true if success
- */
- UBool previousSegment(UErrorCode &errorCode);
- UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
- enum State {
- /**
- * The input text [start..pos[ passes the FCD check.
- * Moving forward checks incrementally.
- * limit is undefined.
- */
- CHECK_FWD,
- /**
- * The input text [pos..limit[ passes the FCD check.
- * Moving backward checks incrementally.
- * start is undefined.
- */
- CHECK_BWD,
- /**
- * The input text [start..limit[ passes the FCD check.
- * pos tracks the current text index.
- */
- IN_FCD_SEGMENT,
- /**
- * The input text [start..limit[ failed the FCD check and was normalized.
- * pos tracks the current index in the normalized string.
- */
- IN_NORMALIZED
- };
- State state;
- int32_t start;
- int32_t limit;
- const Normalizer2Impl &nfcImpl;
- UnicodeString normalized;
- };
- U_NAMESPACE_END
- #endif // !UCONFIG_NO_COLLATION
- #endif // __UTF8COLLATIONITERATOR_H__
|