123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *************************************************************************
- * COPYRIGHT:
- * Copyright (c) 1996-2012, International Business Machines Corporation and
- * others. All Rights Reserved.
- *************************************************************************
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_NORMALIZATION
- #include "unicode/uniset.h"
- #include "unicode/unistr.h"
- #include "unicode/chariter.h"
- #include "unicode/schriter.h"
- #include "unicode/uchriter.h"
- #include "unicode/normlzr.h"
- #include "unicode/utf16.h"
- #include "cmemory.h"
- #include "normalizer2impl.h"
- #include "uprops.h" // for uniset_getUnicode32Instance()
- #if defined(move32)
- // System can define move32 intrinsics, but the char iters define move32 method
- // using same undef trick in headers, so undef here to re-enable the method.
- #undef move32
- #endif
- U_NAMESPACE_BEGIN
- UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
- //-------------------------------------------------------------------------
- // Constructors and other boilerplate
- //-------------------------------------------------------------------------
- Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
- UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
- text(new StringCharacterIterator(str)),
- currentIndex(0), nextIndex(0),
- buffer(), bufferPos(0)
- {
- init();
- }
- Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
- UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
- text(new UCharCharacterIterator(str, length)),
- currentIndex(0), nextIndex(0),
- buffer(), bufferPos(0)
- {
- init();
- }
- Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
- UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
- text(iter.clone()),
- currentIndex(0), nextIndex(0),
- buffer(), bufferPos(0)
- {
- init();
- }
- Normalizer::Normalizer(const Normalizer ©) :
- UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions),
- text(copy.text->clone()),
- currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
- buffer(copy.buffer), bufferPos(copy.bufferPos)
- {
- init();
- }
- void
- Normalizer::init() {
- UErrorCode errorCode=U_ZERO_ERROR;
- fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
- if(fOptions&UNORM_UNICODE_3_2) {
- delete fFilteredNorm2;
- fNorm2=fFilteredNorm2=
- new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
- }
- if(U_FAILURE(errorCode)) {
- errorCode=U_ZERO_ERROR;
- fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
- }
- }
- Normalizer::~Normalizer()
- {
- delete fFilteredNorm2;
- delete text;
- }
- Normalizer*
- Normalizer::clone() const
- {
- return new Normalizer(*this);
- }
- /**
- * Generates a hash code for this iterator.
- */
- int32_t Normalizer::hashCode() const
- {
- return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
- }
-
- bool Normalizer::operator==(const Normalizer& that) const
- {
- return
- this==&that ||
- (fUMode==that.fUMode &&
- fOptions==that.fOptions &&
- *text==*that.text &&
- buffer==that.buffer &&
- bufferPos==that.bufferPos &&
- nextIndex==that.nextIndex);
- }
- //-------------------------------------------------------------------------
- // Static utility methods
- //-------------------------------------------------------------------------
- void U_EXPORT2
- Normalizer::normalize(const UnicodeString& source,
- UNormalizationMode mode, int32_t options,
- UnicodeString& result,
- UErrorCode &status) {
- if(source.isBogus() || U_FAILURE(status)) {
- result.setToBogus();
- if(U_SUCCESS(status)) {
- status=U_ILLEGAL_ARGUMENT_ERROR;
- }
- } else {
- UnicodeString localDest;
- UnicodeString *dest;
- if(&source!=&result) {
- dest=&result;
- } else {
- // the source and result strings are the same object, use a temporary one
- dest=&localDest;
- }
- const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
- if(U_SUCCESS(status)) {
- if(options&UNORM_UNICODE_3_2) {
- FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
- normalize(source, *dest, status);
- } else {
- n2->normalize(source, *dest, status);
- }
- }
- if(dest==&localDest && U_SUCCESS(status)) {
- result=*dest;
- }
- }
- }
- void U_EXPORT2
- Normalizer::compose(const UnicodeString& source,
- UBool compat, int32_t options,
- UnicodeString& result,
- UErrorCode &status) {
- normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
- }
- void U_EXPORT2
- Normalizer::decompose(const UnicodeString& source,
- UBool compat, int32_t options,
- UnicodeString& result,
- UErrorCode &status) {
- normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
- }
- UNormalizationCheckResult
- Normalizer::quickCheck(const UnicodeString& source,
- UNormalizationMode mode, int32_t options,
- UErrorCode &status) {
- const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
- if(U_SUCCESS(status)) {
- if(options&UNORM_UNICODE_3_2) {
- return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
- quickCheck(source, status);
- } else {
- return n2->quickCheck(source, status);
- }
- } else {
- return UNORM_MAYBE;
- }
- }
- UBool
- Normalizer::isNormalized(const UnicodeString& source,
- UNormalizationMode mode, int32_t options,
- UErrorCode &status) {
- const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
- if(U_SUCCESS(status)) {
- if(options&UNORM_UNICODE_3_2) {
- return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
- isNormalized(source, status);
- } else {
- return n2->isNormalized(source, status);
- }
- } else {
- return false;
- }
- }
- UnicodeString & U_EXPORT2
- Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
- UnicodeString &result,
- UNormalizationMode mode, int32_t options,
- UErrorCode &errorCode) {
- if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
- result.setToBogus();
- if(U_SUCCESS(errorCode)) {
- errorCode=U_ILLEGAL_ARGUMENT_ERROR;
- }
- } else {
- UnicodeString localDest;
- UnicodeString *dest;
- if(&right!=&result) {
- dest=&result;
- } else {
- // the right and result strings are the same object, use a temporary one
- dest=&localDest;
- }
- *dest=left;
- const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
- if(U_SUCCESS(errorCode)) {
- if(options&UNORM_UNICODE_3_2) {
- FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
- append(*dest, right, errorCode);
- } else {
- n2->append(*dest, right, errorCode);
- }
- }
- if(dest==&localDest && U_SUCCESS(errorCode)) {
- result=*dest;
- }
- }
- return result;
- }
- //-------------------------------------------------------------------------
- // Iteration API
- //-------------------------------------------------------------------------
- /**
- * Return the current character in the normalized text.
- */
- UChar32 Normalizer::current() {
- if(bufferPos<buffer.length() || nextNormalize()) {
- return buffer.char32At(bufferPos);
- } else {
- return DONE;
- }
- }
- /**
- * Return the next character in the normalized text and advance
- * the iteration position by one. If the end
- * of the text has already been reached, {@link #DONE} is returned.
- */
- UChar32 Normalizer::next() {
- if(bufferPos<buffer.length() || nextNormalize()) {
- UChar32 c=buffer.char32At(bufferPos);
- bufferPos+=U16_LENGTH(c);
- return c;
- } else {
- return DONE;
- }
- }
- /**
- * Return the previous character in the normalized text and decrement
- * the iteration position by one. If the beginning
- * of the text has already been reached, {@link #DONE} is returned.
- */
- UChar32 Normalizer::previous() {
- if(bufferPos>0 || previousNormalize()) {
- UChar32 c=buffer.char32At(bufferPos-1);
- bufferPos-=U16_LENGTH(c);
- return c;
- } else {
- return DONE;
- }
- }
- void Normalizer::reset() {
- currentIndex=nextIndex=text->setToStart();
- clearBuffer();
- }
- void
- Normalizer::setIndexOnly(int32_t index) {
- text->setIndex(index); // pins index
- currentIndex=nextIndex=text->getIndex();
- clearBuffer();
- }
- /**
- * Return the first character in the normalized text. This resets
- * the <tt>Normalizer's</tt> position to the beginning of the text.
- */
- UChar32 Normalizer::first() {
- reset();
- return next();
- }
- /**
- * Return the last character in the normalized text. This resets
- * the <tt>Normalizer's</tt> position to be just before the
- * the input text corresponding to that normalized character.
- */
- UChar32 Normalizer::last() {
- currentIndex=nextIndex=text->setToEnd();
- clearBuffer();
- return previous();
- }
- /**
- * Retrieve the current iteration position in the input text that is
- * being normalized. This method is useful in applications such as
- * searching, where you need to be able to determine the position in
- * the input text that corresponds to a given normalized output character.
- * <p>
- * <b>Note:</b> This method sets the position in the <em>input</em>, while
- * {@link #next} and {@link #previous} iterate through characters in the
- * <em>output</em>. This means that there is not necessarily a one-to-one
- * correspondence between characters returned by <tt>next</tt> and
- * <tt>previous</tt> and the indices passed to and returned from
- * <tt>setIndex</tt> and {@link #getIndex}.
- *
- */
- int32_t Normalizer::getIndex() const {
- if(bufferPos<buffer.length()) {
- return currentIndex;
- } else {
- return nextIndex;
- }
- }
- /**
- * Retrieve the index of the start of the input text. This is the begin index
- * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
- * over which this <tt>Normalizer</tt> is iterating
- */
- int32_t Normalizer::startIndex() const {
- return text->startIndex();
- }
- /**
- * Retrieve the index of the end of the input text. This is the end index
- * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
- * over which this <tt>Normalizer</tt> is iterating
- */
- int32_t Normalizer::endIndex() const {
- return text->endIndex();
- }
- //-------------------------------------------------------------------------
- // Property access methods
- //-------------------------------------------------------------------------
- void
- Normalizer::setMode(UNormalizationMode newMode)
- {
- fUMode = newMode;
- init();
- }
- UNormalizationMode
- Normalizer::getUMode() const
- {
- return fUMode;
- }
- void
- Normalizer::setOption(int32_t option,
- UBool value)
- {
- if (value) {
- fOptions |= option;
- } else {
- fOptions &= (~option);
- }
- init();
- }
- UBool
- Normalizer::getOption(int32_t option) const
- {
- return (fOptions & option) != 0;
- }
- /**
- * Set the input text over which this <tt>Normalizer</tt> will iterate.
- * The iteration position is set to the beginning of the input text.
- */
- void
- Normalizer::setText(const UnicodeString& newText,
- UErrorCode &status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- CharacterIterator *newIter = new StringCharacterIterator(newText);
- if (newIter == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- delete text;
- text = newIter;
- reset();
- }
- /**
- * Set the input text over which this <tt>Normalizer</tt> will iterate.
- * The iteration position is set to the beginning of the string.
- */
- void
- Normalizer::setText(const CharacterIterator& newText,
- UErrorCode &status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- CharacterIterator *newIter = newText.clone();
- if (newIter == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- delete text;
- text = newIter;
- reset();
- }
- void
- Normalizer::setText(ConstChar16Ptr newText,
- int32_t length,
- UErrorCode &status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
- if (newIter == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- delete text;
- text = newIter;
- reset();
- }
- /**
- * Copies the text under iteration into the UnicodeString referred to by "result".
- * @param result Receives a copy of the text under iteration.
- */
- void
- Normalizer::getText(UnicodeString& result)
- {
- text->getText(result);
- }
- //-------------------------------------------------------------------------
- // Private utility methods
- //-------------------------------------------------------------------------
- void Normalizer::clearBuffer() {
- buffer.remove();
- bufferPos=0;
- }
- UBool
- Normalizer::nextNormalize() {
- clearBuffer();
- currentIndex=nextIndex;
- text->setIndex(nextIndex);
- if(!text->hasNext()) {
- return false;
- }
- // Skip at least one character so we make progress.
- UnicodeString segment(text->next32PostInc());
- while(text->hasNext()) {
- UChar32 c;
- if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
- text->move32(-1, CharacterIterator::kCurrent);
- break;
- }
- segment.append(c);
- }
- nextIndex=text->getIndex();
- UErrorCode errorCode=U_ZERO_ERROR;
- fNorm2->normalize(segment, buffer, errorCode);
- return U_SUCCESS(errorCode) && !buffer.isEmpty();
- }
- UBool
- Normalizer::previousNormalize() {
- clearBuffer();
- nextIndex=currentIndex;
- text->setIndex(currentIndex);
- if(!text->hasPrevious()) {
- return false;
- }
- UnicodeString segment;
- while(text->hasPrevious()) {
- UChar32 c=text->previous32();
- segment.insert(0, c);
- if(fNorm2->hasBoundaryBefore(c)) {
- break;
- }
- }
- currentIndex=text->getIndex();
- UErrorCode errorCode=U_ZERO_ERROR;
- fNorm2->normalize(segment, buffer, errorCode);
- bufferPos=buffer.length();
- return U_SUCCESS(errorCode) && !buffer.isEmpty();
- }
- U_NAMESPACE_END
- #endif /* #if !UCONFIG_NO_NORMALIZATION */
|