// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: normalizer2impl.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2009nov22 * created by: Markus W. Scherer */ #ifndef __NORMALIZER2IMPL_H__ #define __NORMALIZER2IMPL_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "unicode/normalizer2.h" #include "unicode/ucptrie.h" #include "unicode/unistr.h" #include "unicode/unorm.h" #include "unicode/utf.h" #include "unicode/utf16.h" #include "mutex.h" #include "udataswp.h" #include "uset_imp.h" // When the nfc.nrm data is *not* hardcoded into the common library // (with this constant set to 0), // then it needs to be built into the data package: // Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT #define NORM2_HARDCODE_NFC_DATA 1 U_NAMESPACE_BEGIN struct CanonIterData; class ByteSink; class Edits; class InitCanonIterData; class LcccContext; class U_COMMON_API Hangul { public: /* Korean Hangul and Jamo constants */ enum { JAMO_L_BASE=0x1100, /* "lead" jamo */ JAMO_L_END=0x1112, JAMO_V_BASE=0x1161, /* "vowel" jamo */ JAMO_V_END=0x1175, JAMO_T_BASE=0x11a7, /* "trail" jamo */ JAMO_T_END=0x11c2, HANGUL_BASE=0xac00, HANGUL_END=0xd7a3, JAMO_L_COUNT=19, JAMO_V_COUNT=21, JAMO_T_COUNT=28, JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT }; static inline UBool isHangul(UChar32 c) { return HANGUL_BASE<=c && c(c - JAMO_L_BASE) < JAMO_L_COUNT; } static inline UBool isJamoV(UChar32 c) { return static_cast(c - JAMO_V_BASE) < JAMO_V_COUNT; } static inline UBool isJamoT(UChar32 c) { int32_t t=c-JAMO_T_BASE; return 0(JAMO_L_BASE + c / JAMO_V_COUNT); buffer[1] = static_cast(JAMO_V_BASE + c % JAMO_V_COUNT); if(c2==0) { return 2; } else { buffer[2] = static_cast(JAMO_T_BASE + c2); return 3; } } /** * Decomposes c, which must be a Hangul syllable, into buffer. * This is the raw, not recursive, decomposition. Its length is always 2. */ static inline void getRawDecomposition(UChar32 c, char16_t buffer[2]) { UChar32 orig=c; c-=HANGUL_BASE; UChar32 c2=c%JAMO_T_COUNT; if(c2==0) { c/=JAMO_T_COUNT; buffer[0] = static_cast(JAMO_L_BASE + c / JAMO_V_COUNT); buffer[1] = static_cast(JAMO_V_BASE + c % JAMO_V_COUNT); } else { buffer[0] = static_cast(orig - c2); // LV syllable buffer[1] = static_cast(JAMO_T_BASE + c2); } } private: Hangul() = delete; // no instantiation }; class Normalizer2Impl; class U_COMMON_API ReorderingBuffer : public UMemory { public: /** Constructs only; init() should be called. */ ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : impl(ni), str(dest), start(nullptr), reorderStart(nullptr), limit(nullptr), remainingCapacity(0), lastCC(0) {} /** Constructs, removes the string contents, and initializes for a small initial capacity. */ ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode); ~ReorderingBuffer() { if (start != nullptr) { str.releaseBuffer(static_cast(limit - start)); } } UBool init(int32_t destCapacity, UErrorCode &errorCode); UBool isEmpty() const { return start==limit; } int32_t length() const { return static_cast(limit - start); } char16_t *getStart() { return start; } char16_t *getLimit() { return limit; } uint8_t getLastCC() const { return lastCC; } UBool equals(const char16_t *start, const char16_t *limit) const; UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const; UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) { return (c<=0xffff) ? appendBMP(static_cast(c), cc, errorCode) : appendSupplementary(c, cc, errorCode); } UBool append(const char16_t *s, int32_t length, UBool isNFD, uint8_t leadCC, uint8_t trailCC, UErrorCode &errorCode); UBool appendBMP(char16_t c, uint8_t cc, UErrorCode &errorCode) { if(remainingCapacity==0 && !resize(1, errorCode)) { return false; } if(lastCC<=cc || cc==0) { *limit++=c; lastCC=cc; if(cc<=1) { reorderStart=limit; } } else { insert(c, cc); } --remainingCapacity; return true; } UBool appendZeroCC(UChar32 c, UErrorCode &errorCode); UBool appendZeroCC(const char16_t *s, const char16_t *sLimit, UErrorCode &errorCode); void remove(); void removeSuffix(int32_t suffixLength); void setReorderingLimit(char16_t *newLimit) { remainingCapacity += static_cast(limit - newLimit); reorderStart=limit=newLimit; lastCC=0; } void copyReorderableSuffixTo(UnicodeString &s) const { s.setTo(ConstChar16Ptr(reorderStart), static_cast(limit - reorderStart)); } private: /* * TODO: Revisit whether it makes sense to track reorderStart. * It is set to after the last known character with cc<=1, * which stops previousCC() before it reads that character and looks up its cc. * previousCC() is normally only called from insert(). * In other words, reorderStart speeds up the insertion of a combining mark * into a multi-combining mark sequence where it does not belong at the end. * This might not be worth the trouble. * On the other hand, it's not a huge amount of trouble. * * We probably need it for UNORM_SIMPLE_APPEND. */ UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode); void insert(UChar32 c, uint8_t cc); static void writeCodePoint(char16_t *p, UChar32 c) { if(c<=0xffff) { *p = static_cast(c); } else { p[0]=U16_LEAD(c); p[1]=U16_TRAIL(c); } } UBool resize(int32_t appendLength, UErrorCode &errorCode); const Normalizer2Impl &impl; UnicodeString &str; char16_t *start, *reorderStart, *limit; int32_t remainingCapacity; uint8_t lastCC; // private backward iterator void setIterator() { codePointStart=limit; } void skipPrevious(); // Requires start(INERT) : UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); } uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); } UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { if(norm16=MIN_NORMAL_MAYBE_YES) { return getCCFromNormalYesOrMaybe(norm16); } if(norm16(norm16 >> OFFSET_SHIFT); } static uint8_t getCCFromYesOrMaybeYes(uint16_t norm16) { return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; } uint8_t getCCFromYesOrMaybeYesCP(UChar32 c) const { if (c < minCompNoMaybeCP) { return 0; } return getCCFromYesOrMaybeYes(getNorm16(c)); } /** * Returns the FCD data for code point c. * @param c A Unicode code point. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. */ uint16_t getFCD16(UChar32 c) const { if(c>8]; if(bits==0) { return false; } return (bits >> ((lead >> 5) & 7)) & 1; } /** Returns the FCD value from the regular normalization data. */ uint16_t getFCD16FromNormData(UChar32 c) const; uint16_t getFCD16FromMaybeOrNonZeroCC(uint16_t norm16) const; /** * Gets the decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions * @param length out-only, takes the length of the decomposition, if any * @return pointer to the decomposition, or NULL if none */ const char16_t *getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const; /** * Gets the raw decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions * @param length out-only, takes the length of the decomposition, if any * @return pointer to the decomposition, or NULL if none */ const char16_t *getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &length) const; UChar32 composePair(UChar32 a, UChar32 b) const; UBool isCanonSegmentStarter(UChar32 c) const; UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; enum { // Fixed norm16 values. MIN_YES_YES_WITH_CC=0xfe02, JAMO_VT=0xfe00, MIN_NORMAL_MAYBE_YES=0xfc00, JAMO_L=2, // offset=1 hasCompBoundaryAfter=false INERT=1, // offset=0 hasCompBoundaryAfter=true // norm16 bit 0 is comp-boundary-after. HAS_COMP_BOUNDARY_AFTER=1, OFFSET_SHIFT=1, // For algorithmic one-way mappings, norm16 bits 2..1 indicate the // tccc (0, 1, >1) for quick FCC boundary-after tests. DELTA_TCCC_0=0, DELTA_TCCC_1=2, DELTA_TCCC_GT_1=4, DELTA_TCCC_MASK=6, DELTA_SHIFT=3, MAX_DELTA=0x40 }; enum { // Byte offsets from the start of the data, after the generic header. IX_NORM_TRIE_OFFSET, IX_EXTRA_DATA_OFFSET, IX_SMALL_FCD_OFFSET, IX_RESERVED3_OFFSET, IX_RESERVED4_OFFSET, IX_RESERVED5_OFFSET, IX_RESERVED6_OFFSET, IX_TOTAL_SIZE, // Code point thresholds for quick check codes. IX_MIN_DECOMP_NO_CP, IX_MIN_COMP_NO_MAYBE_CP, // Norm16 value thresholds for quick check combinations and types of extra data. /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ IX_MIN_YES_NO, /** Mappings are comp-normalized. */ IX_MIN_NO_NO, IX_LIMIT_NO_NO, IX_MIN_MAYBE_YES, /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ IX_MIN_YES_NO_MAPPINGS_ONLY, /** Mappings are not comp-normalized but have a comp boundary before. */ IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE, /** Mappings do not have a comp boundary before. */ IX_MIN_NO_NO_COMP_NO_MAYBE_CC, /** Mappings to the empty string. */ IX_MIN_NO_NO_EMPTY, IX_MIN_LCCC_CP, IX_RESERVED19, /** Two-way mappings; each starts with a character that combines backward. */ IX_MIN_MAYBE_NO, // 20 /** Two-way mappings & compositions. */ IX_MIN_MAYBE_NO_COMBINES_FWD, IX_COUNT // 22 }; enum { MAPPING_HAS_CCC_LCCC_WORD=0x80, MAPPING_HAS_RAW_MAPPING=0x40, // unused bit 0x20, MAPPING_LENGTH_MASK=0x1f }; enum { COMP_1_LAST_TUPLE=0x8000, COMP_1_TRIPLE=1, COMP_1_TRAIL_LIMIT=0x3400, COMP_1_TRAIL_MASK=0x7ffe, COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit COMP_2_TRAIL_SHIFT=6, COMP_2_TRAIL_MASK=0xffc0 }; // higher-level functionality ------------------------------------------ *** // NFD without an NFD Normalizer2 instance. UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const; /** * Decomposes [src, limit[ and writes the result to dest. * limit can be NULL if src is NUL-terminated. * destLengthEstimate is the initial dest buffer capacity and can be -1. */ void decompose(const char16_t *src, const char16_t *limit, UnicodeString &dest, int32_t destLengthEstimate, UErrorCode &errorCode) const; const char16_t *decompose(const char16_t *src, const char16_t *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void decomposeAndAppend(const char16_t *src, const char16_t *limit, UBool doDecompose, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; /** sink==nullptr: isNormalized()/spanQuickCheckYes() */ const uint8_t *decomposeUTF8(uint32_t options, const uint8_t *src, const uint8_t *limit, ByteSink *sink, Edits *edits, UErrorCode &errorCode) const; UBool compose(const char16_t *src, const char16_t *limit, UBool onlyContiguous, UBool doCompose, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const char16_t *composeQuickCheck(const char16_t *src, const char16_t *limit, UBool onlyContiguous, UNormalizationCheckResult *pQCResult) const; void composeAndAppend(const char16_t *src, const char16_t *limit, UBool doCompose, UBool onlyContiguous, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; /** sink==nullptr: isNormalized() */ UBool composeUTF8(uint32_t options, UBool onlyContiguous, const uint8_t *src, const uint8_t *limit, ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const; const char16_t *makeFCD(const char16_t *src, const char16_t *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void makeFCDAndAppend(const char16_t *src, const char16_t *limit, UBool doMakeFCD, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool hasDecompBoundaryBefore(UChar32 c) const; UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const; UBool hasDecompBoundaryAfter(UChar32 c) const; UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const; UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } UBool hasCompBoundaryBefore(UChar32 c) const { return c=minMaybeYes; } static UBool isInert(uint16_t norm16) { return norm16==INERT; } static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; } static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; } UBool isHangulLVT(uint16_t norm16) const { return norm16==hangulLVT(); } UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16=MIN_YES_YES_WITH_CC || norm16=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; // } uint8_t getCCFromNoNo(uint16_t norm16) const { const uint16_t *mapping=getDataForYesOrNo(norm16); if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { return static_cast(*(mapping - 1)); } else { return 0; } } // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const { if(norm16<=minYesNo) { return 0; // yesYes and Hangul LV have ccc=tccc=0 } else { // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. return static_cast(*getDataForYesOrNo(norm16) >> 8); // tccc from yesNo } } uint8_t getPreviousTrailCC(const char16_t *start, const char16_t *p) const; uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const; // Requires algorithmic-NoNo. UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; } UChar32 getAlgorithmicDelta(uint16_t norm16) const { return (norm16>>DELTA_SHIFT)-centerNoNoDelta; } const uint16_t *getDataForYesOrNo(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); } const uint16_t *getDataForMaybe(uint16_t norm16) const { return extraData+((norm16-minMaybeNo+limitNoNo)>>OFFSET_SHIFT); } const uint16_t *getData(uint16_t norm16) const { if(norm16>=minMaybeNo) { norm16=norm16-minMaybeNo+limitNoNo; } return extraData+(norm16>>OFFSET_SHIFT); } const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { if(norm16