123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- *
- * Copyright (C) 2009-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- *******************************************************************************
- * file name: normalizer2.cpp
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2009nov22
- * created by: Markus W. Scherer
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_NORMALIZATION
- #include "unicode/edits.h"
- #include "unicode/normalizer2.h"
- #include "unicode/stringoptions.h"
- #include "unicode/unistr.h"
- #include "unicode/unorm.h"
- #include "cstring.h"
- #include "mutex.h"
- #include "norm2allmodes.h"
- #include "normalizer2impl.h"
- #include "uassert.h"
- #include "ucln_cmn.h"
- using icu::Normalizer2Impl;
- #if NORM2_HARDCODE_NFC_DATA
- // NFC/NFD data machine-generated by gennorm2 --csource
- #define INCLUDED_FROM_NORMALIZER2_CPP
- #include "norm2_nfc_data.h"
- #endif
- U_NAMESPACE_BEGIN
- // Public API dispatch via Normalizer2 subclasses -------------------------- ***
- Normalizer2::~Normalizer2() {}
- void
- Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
- Edits *edits, UErrorCode &errorCode) const {
- if (U_FAILURE(errorCode)) {
- return;
- }
- if (edits != nullptr) {
- errorCode = U_UNSUPPORTED_ERROR;
- return;
- }
- UnicodeString src16 = UnicodeString::fromUTF8(src);
- normalize(src16, errorCode).toUTF8(sink);
- }
- UBool
- Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
- return false;
- }
- UChar32
- Normalizer2::composePair(UChar32, UChar32) const {
- return U_SENTINEL;
- }
- uint8_t
- Normalizer2::getCombiningClass(UChar32 /*c*/) const {
- return 0;
- }
- UBool
- Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
- return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);
- }
- // Normalizer2 implementation for the old UNORM_NONE.
- class NoopNormalizer2 : public Normalizer2 {
- virtual ~NoopNormalizer2();
- virtual UnicodeString &
- normalize(const UnicodeString &src,
- UnicodeString &dest,
- UErrorCode &errorCode) const override {
- if(U_SUCCESS(errorCode)) {
- if(&dest!=&src) {
- dest=src;
- } else {
- errorCode=U_ILLEGAL_ARGUMENT_ERROR;
- }
- }
- return dest;
- }
- virtual void
- normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
- Edits *edits, UErrorCode &errorCode) const override {
- if(U_SUCCESS(errorCode)) {
- if (edits != nullptr) {
- if ((options & U_EDITS_NO_RESET) == 0) {
- edits->reset();
- }
- edits->addUnchanged(src.length());
- }
- if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
- sink.Append(src.data(), src.length());
- }
- sink.Flush();
- }
- }
- virtual UnicodeString &
- normalizeSecondAndAppend(UnicodeString &first,
- const UnicodeString &second,
- UErrorCode &errorCode) const override {
- if(U_SUCCESS(errorCode)) {
- if(&first!=&second) {
- first.append(second);
- } else {
- errorCode=U_ILLEGAL_ARGUMENT_ERROR;
- }
- }
- return first;
- }
- virtual UnicodeString &
- append(UnicodeString &first,
- const UnicodeString &second,
- UErrorCode &errorCode) const override {
- if(U_SUCCESS(errorCode)) {
- if(&first!=&second) {
- first.append(second);
- } else {
- errorCode=U_ILLEGAL_ARGUMENT_ERROR;
- }
- }
- return first;
- }
- virtual UBool
- getDecomposition(UChar32, UnicodeString &) const override {
- return false;
- }
- // No need to override the default getRawDecomposition().
- virtual UBool
- isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
- return U_SUCCESS(errorCode);
- }
- virtual UBool
- isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
- return U_SUCCESS(errorCode);
- }
- virtual UNormalizationCheckResult
- quickCheck(const UnicodeString &, UErrorCode &) const override {
- return UNORM_YES;
- }
- virtual int32_t
- spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override {
- return s.length();
- }
- virtual UBool hasBoundaryBefore(UChar32) const override { return true; }
- virtual UBool hasBoundaryAfter(UChar32) const override { return true; }
- virtual UBool isInert(UChar32) const override { return true; }
- };
- NoopNormalizer2::~NoopNormalizer2() {}
- Normalizer2WithImpl::~Normalizer2WithImpl() {}
- DecomposeNormalizer2::~DecomposeNormalizer2() {}
- ComposeNormalizer2::~ComposeNormalizer2() {}
- FCDNormalizer2::~FCDNormalizer2() {}
- // instance cache ---------------------------------------------------------- ***
- U_CDECL_BEGIN
- static UBool U_CALLCONV uprv_normalizer2_cleanup();
- U_CDECL_END
- static Normalizer2 *noopSingleton;
- static icu::UInitOnce noopInitOnce {};
- static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
- return;
- }
- noopSingleton=new NoopNormalizer2;
- if(noopSingleton==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
- }
- const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return nullptr; }
- umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
- return noopSingleton;
- }
- const Normalizer2Impl *
- Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
- return &((Normalizer2WithImpl *)norm2)->impl;
- }
- Norm2AllModes::~Norm2AllModes() {
- delete impl;
- }
- Norm2AllModes *
- Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
- delete impl;
- return nullptr;
- }
- Norm2AllModes *allModes=new Norm2AllModes(impl);
- if(allModes==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- delete impl;
- return nullptr;
- }
- return allModes;
- }
- #if NORM2_HARDCODE_NFC_DATA
- Norm2AllModes *
- Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
- return nullptr;
- }
- Normalizer2Impl *impl=new Normalizer2Impl;
- if(impl==nullptr) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
- norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
- return createInstance(impl, errorCode);
- }
- static Norm2AllModes *nfcSingleton;
- static icu::UInitOnce nfcInitOnce {};
- static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
- nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
- ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
- }
- const Norm2AllModes *
- Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return nullptr; }
- umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
- return nfcSingleton;
- }
- const Normalizer2 *
- Normalizer2::getNFCInstance(UErrorCode &errorCode) {
- const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
- return allModes!=nullptr ? &allModes->comp : nullptr;
- }
- const Normalizer2 *
- Normalizer2::getNFDInstance(UErrorCode &errorCode) {
- const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
- return allModes!=nullptr ? &allModes->decomp : nullptr;
- }
- const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
- const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
- return allModes!=nullptr ? &allModes->fcd : nullptr;
- }
- const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
- const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
- return allModes!=nullptr ? &allModes->fcc : nullptr;
- }
- const Normalizer2Impl *
- Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
- const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
- return allModes!=nullptr ? allModes->impl : nullptr;
- }
- #endif // NORM2_HARDCODE_NFC_DATA
- U_CDECL_BEGIN
- static UBool U_CALLCONV uprv_normalizer2_cleanup() {
- delete noopSingleton;
- noopSingleton = nullptr;
- noopInitOnce.reset();
- #if NORM2_HARDCODE_NFC_DATA
- delete nfcSingleton;
- nfcSingleton = nullptr;
- nfcInitOnce.reset();
- #endif
- return true;
- }
- U_CDECL_END
- U_NAMESPACE_END
- // C API ------------------------------------------------------------------- ***
- U_NAMESPACE_USE
- U_CAPI const UNormalizer2 * U_EXPORT2
- unorm2_getNFCInstance(UErrorCode *pErrorCode) {
- return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
- }
- U_CAPI const UNormalizer2 * U_EXPORT2
- unorm2_getNFDInstance(UErrorCode *pErrorCode) {
- return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
- }
- U_CAPI void U_EXPORT2
- unorm2_close(UNormalizer2 *norm2) {
- delete (Normalizer2 *)norm2;
- }
- U_CAPI int32_t U_EXPORT2
- unorm2_normalize(const UNormalizer2 *norm2,
- const char16_t *src, int32_t length,
- char16_t *dest, int32_t capacity,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if( (src==nullptr ? length!=0 : length<-1) ||
- (dest==nullptr ? capacity!=0 : capacity<0) ||
- (src==dest && src!=nullptr)
- ) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- UnicodeString destString(dest, 0, capacity);
- // length==0: Nothing to do, and n2wi->normalize(nullptr, nullptr, buffer, ...) would crash.
- if(length!=0) {
- const Normalizer2 *n2=(const Normalizer2 *)norm2;
- const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
- if(n2wi!=nullptr) {
- // Avoid duplicate argument checking and support NUL-terminated src.
- ReorderingBuffer buffer(n2wi->impl, destString);
- if(buffer.init(length, *pErrorCode)) {
- n2wi->normalize(src, length>=0 ? src+length : nullptr, buffer, *pErrorCode);
- }
- } else {
- UnicodeString srcString(length<0, src, length);
- n2->normalize(srcString, destString, *pErrorCode);
- }
- }
- return destString.extract(dest, capacity, *pErrorCode);
- }
- static int32_t
- normalizeSecondAndAppend(const UNormalizer2 *norm2,
- char16_t *first, int32_t firstLength, int32_t firstCapacity,
- const char16_t *second, int32_t secondLength,
- UBool doNormalize,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if( (second==nullptr ? secondLength!=0 : secondLength<-1) ||
- (first==nullptr ? (firstCapacity!=0 || firstLength!=0) :
- (firstCapacity<0 || firstLength<-1)) ||
- (first==second && first!=nullptr)
- ) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- UnicodeString firstString(first, firstLength, firstCapacity);
- firstLength=firstString.length(); // In case it was -1.
- // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(nullptr, nullptr, buffer, ...) would crash.
- if(secondLength!=0) {
- const Normalizer2* n2 = reinterpret_cast<const Normalizer2*>(norm2);
- const Normalizer2WithImpl* n2wi = dynamic_cast<const Normalizer2WithImpl*>(n2);
- if(n2wi!=nullptr) {
- // Avoid duplicate argument checking and support NUL-terminated src.
- UnicodeString safeMiddle;
- {
- ReorderingBuffer buffer(n2wi->impl, firstString);
- if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
- n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : nullptr,
- doNormalize, safeMiddle, buffer, *pErrorCode);
- }
- } // The ReorderingBuffer destructor finalizes firstString.
- if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
- // Restore the modified suffix of the first string.
- // This does not restore first[] array contents between firstLength and firstCapacity.
- // (That might be uninitialized memory, as far as we know.)
- if(first!=nullptr) { /* don't dereference nullptr */
- safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
- if(firstLength<firstCapacity) {
- first[firstLength]=0; // NUL-terminate in case it was originally.
- }
- }
- }
- } else {
- UnicodeString secondString(secondLength<0, second, secondLength);
- if(doNormalize) {
- n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
- } else {
- n2->append(firstString, secondString, *pErrorCode);
- }
- }
- }
- return firstString.extract(first, firstCapacity, *pErrorCode);
- }
- U_CAPI int32_t U_EXPORT2
- unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
- char16_t *first, int32_t firstLength, int32_t firstCapacity,
- const char16_t *second, int32_t secondLength,
- UErrorCode *pErrorCode) {
- return normalizeSecondAndAppend(norm2,
- first, firstLength, firstCapacity,
- second, secondLength,
- true, pErrorCode);
- }
- U_CAPI int32_t U_EXPORT2
- unorm2_append(const UNormalizer2 *norm2,
- char16_t *first, int32_t firstLength, int32_t firstCapacity,
- const char16_t *second, int32_t secondLength,
- UErrorCode *pErrorCode) {
- return normalizeSecondAndAppend(norm2,
- first, firstLength, firstCapacity,
- second, secondLength,
- false, pErrorCode);
- }
- U_CAPI int32_t U_EXPORT2
- unorm2_getDecomposition(const UNormalizer2 *norm2,
- UChar32 c, char16_t *decomposition, int32_t capacity,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if(decomposition==nullptr ? capacity!=0 : capacity<0) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- UnicodeString destString(decomposition, 0, capacity);
- if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
- return destString.extract(decomposition, capacity, *pErrorCode);
- } else {
- return -1;
- }
- }
- U_CAPI int32_t U_EXPORT2
- unorm2_getRawDecomposition(const UNormalizer2 *norm2,
- UChar32 c, char16_t *decomposition, int32_t capacity,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if(decomposition==nullptr ? capacity!=0 : capacity<0) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- UnicodeString destString(decomposition, 0, capacity);
- if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
- return destString.extract(decomposition, capacity, *pErrorCode);
- } else {
- return -1;
- }
- }
- U_CAPI UChar32 U_EXPORT2
- unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
- return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
- }
- U_CAPI uint8_t U_EXPORT2
- unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
- return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
- }
- U_CAPI UBool U_EXPORT2
- unorm2_isNormalized(const UNormalizer2 *norm2,
- const char16_t *s, int32_t length,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if((s==nullptr && length!=0) || length<-1) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- UnicodeString sString(length<0, s, length);
- return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
- }
- U_CAPI UNormalizationCheckResult U_EXPORT2
- unorm2_quickCheck(const UNormalizer2 *norm2,
- const char16_t *s, int32_t length,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return UNORM_NO;
- }
- if((s==nullptr && length!=0) || length<-1) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return UNORM_NO;
- }
- UnicodeString sString(length<0, s, length);
- return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
- }
- U_CAPI int32_t U_EXPORT2
- unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
- const char16_t *s, int32_t length,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if((s==nullptr && length!=0) || length<-1) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- UnicodeString sString(length<0, s, length);
- return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
- }
- U_CAPI UBool U_EXPORT2
- unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
- return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
- }
- U_CAPI UBool U_EXPORT2
- unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
- return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
- }
- U_CAPI UBool U_EXPORT2
- unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
- return ((const Normalizer2 *)norm2)->isInert(c);
- }
- // Some properties APIs ---------------------------------------------------- ***
- U_CAPI uint8_t U_EXPORT2
- u_getCombiningClass(UChar32 c) {
- UErrorCode errorCode=U_ZERO_ERROR;
- const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
- if(U_SUCCESS(errorCode)) {
- return nfd->getCombiningClass(c);
- } else {
- return 0;
- }
- }
- U_CFUNC uint16_t
- unorm_getFCD16(UChar32 c) {
- UErrorCode errorCode=U_ZERO_ERROR;
- const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
- if(U_SUCCESS(errorCode)) {
- return impl->getFCD16(c);
- } else {
- return 0;
- }
- }
- #endif // !UCONFIG_NO_NORMALIZATION
|