123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- *
- * Copyright (C) 1999-2014, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- *******************************************************************************
- * file name: unistr_cnv.cpp
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:2
- *
- * created on: 2004aug19
- * created by: Markus W. Scherer
- *
- * Character conversion functions moved here from unistr.cpp
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_CONVERSION
- #include "unicode/putil.h"
- #include "cstring.h"
- #include "cmemory.h"
- #include "unicode/ustring.h"
- #include "unicode/unistr.h"
- #include "unicode/ucnv.h"
- #include "ucnv_imp.h"
- #include "putilimp.h"
- #include "ustr_cnv.h"
- #include "ustr_imp.h"
- U_NAMESPACE_BEGIN
- //========================================
- // Constructors
- //========================================
- #if !U_CHARSET_IS_UTF8
- UnicodeString::UnicodeString(const char *codepageData) {
- fUnion.fFields.fLengthAndFlags = kShortString;
- if(codepageData != 0) {
- doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
- }
- }
- UnicodeString::UnicodeString(const char *codepageData,
- int32_t dataLength) {
- fUnion.fFields.fLengthAndFlags = kShortString;
- if(codepageData != 0) {
- doCodepageCreate(codepageData, dataLength, 0);
- }
- }
- // else see unistr.cpp
- #endif
- UnicodeString::UnicodeString(const char *codepageData,
- const char *codepage) {
- fUnion.fFields.fLengthAndFlags = kShortString;
- if (codepageData != nullptr) {
- doCodepageCreate(codepageData, static_cast<int32_t>(uprv_strlen(codepageData)), codepage);
- }
- }
- UnicodeString::UnicodeString(const char *codepageData,
- int32_t dataLength,
- const char *codepage) {
- fUnion.fFields.fLengthAndFlags = kShortString;
- if (codepageData != nullptr) {
- doCodepageCreate(codepageData, dataLength, codepage);
- }
- }
- UnicodeString::UnicodeString(const char *src, int32_t srcLength,
- UConverter *cnv,
- UErrorCode &errorCode) {
- fUnion.fFields.fLengthAndFlags = kShortString;
- if(U_SUCCESS(errorCode)) {
- // check arguments
- if(src==nullptr) {
- // treat as an empty string, do nothing more
- } else if(srcLength<-1) {
- errorCode=U_ILLEGAL_ARGUMENT_ERROR;
- } else {
- // get input length
- if(srcLength==-1) {
- srcLength = static_cast<int32_t>(uprv_strlen(src));
- }
- if(srcLength>0) {
- if (cnv != nullptr) {
- // use the provided converter
- ucnv_resetToUnicode(cnv);
- doCodepageCreate(src, srcLength, cnv, errorCode);
- } else {
- // use the default converter
- cnv=u_getDefaultConverter(&errorCode);
- doCodepageCreate(src, srcLength, cnv, errorCode);
- u_releaseDefaultConverter(cnv);
- }
- }
- }
- if(U_FAILURE(errorCode)) {
- setToBogus();
- }
- }
- }
- //========================================
- // Codeset conversion
- //========================================
- #if !U_CHARSET_IS_UTF8
- int32_t
- UnicodeString::extract(int32_t start,
- int32_t length,
- char *target,
- uint32_t dstSize) const {
- return extract(start, length, target, dstSize, 0);
- }
- // else see unistr.cpp
- #endif
- int32_t
- UnicodeString::extract(int32_t start,
- int32_t length,
- char *target,
- uint32_t dstSize,
- const char *codepage) const
- {
- // if the arguments are illegal, then do nothing
- if (/*dstSize < 0 || */(dstSize > 0 && target == nullptr)) {
- return 0;
- }
- // pin the indices to legal values
- pinIndices(start, length);
- // We need to cast dstSize to int32_t for all subsequent code.
- // I don't know why the API was defined with uint32_t but we are stuck with it.
- // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
- // as a limit in some functions, it may wrap around and yield a pointer
- // that compares less-than target.
- int32_t capacity;
- if(dstSize < 0x7fffffff) {
- // Assume that the capacity is real and a limit pointer won't wrap around.
- capacity = static_cast<int32_t>(dstSize);
- } else {
- // Pin the capacity so that a limit pointer does not wrap around.
- char* targetLimit = static_cast<char*>(U_MAX_PTR(target));
- // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
- // greater than target and does not wrap around the top of the address space.
- capacity = static_cast<int32_t>(targetLimit - target);
- }
- // create the converter
- UConverter *converter;
- UErrorCode status = U_ZERO_ERROR;
- // just write the NUL if the string length is 0
- if(length == 0) {
- return u_terminateChars(target, capacity, 0, &status);
- }
- // if the codepage is the default, use our cache
- // if it is an empty string, then use the "invariant character" conversion
- if (codepage == nullptr) {
- const char *defaultName = ucnv_getDefaultName();
- if(UCNV_FAST_IS_UTF8(defaultName)) {
- return toUTF8(start, length, target, capacity);
- }
- converter = u_getDefaultConverter(&status);
- } else if (*codepage == 0) {
- // use the "invariant characters" conversion
- int32_t destLength;
- if(length <= capacity) {
- destLength = length;
- } else {
- destLength = capacity;
- }
- u_UCharsToChars(getArrayStart() + start, target, destLength);
- return u_terminateChars(target, capacity, length, &status);
- } else {
- converter = ucnv_open(codepage, &status);
- }
- length = doExtract(start, length, target, capacity, converter, status);
- // close the converter
- if (codepage == nullptr) {
- u_releaseDefaultConverter(converter);
- } else {
- ucnv_close(converter);
- }
- return length;
- }
- int32_t
- UnicodeString::extract(char *dest, int32_t destCapacity,
- UConverter *cnv,
- UErrorCode &errorCode) const
- {
- if(U_FAILURE(errorCode)) {
- return 0;
- }
- if (isBogus() || destCapacity < 0 || (destCapacity > 0 && dest == nullptr)) {
- errorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- // nothing to do?
- if(isEmpty()) {
- return u_terminateChars(dest, destCapacity, 0, &errorCode);
- }
- // get the converter
- UBool isDefaultConverter;
- if (cnv == nullptr) {
- isDefaultConverter=true;
- cnv=u_getDefaultConverter(&errorCode);
- if(U_FAILURE(errorCode)) {
- return 0;
- }
- } else {
- isDefaultConverter=false;
- ucnv_resetFromUnicode(cnv);
- }
- // convert
- int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
- // release the converter
- if(isDefaultConverter) {
- u_releaseDefaultConverter(cnv);
- }
- return len;
- }
- int32_t
- UnicodeString::doExtract(int32_t start, int32_t length,
- char *dest, int32_t destCapacity,
- UConverter *cnv,
- UErrorCode &errorCode) const
- {
- if(U_FAILURE(errorCode)) {
- if(destCapacity!=0) {
- *dest=0;
- }
- return 0;
- }
- const char16_t *src=getArrayStart()+start, *srcLimit=src+length;
- char *originalDest=dest;
- const char *destLimit;
- if(destCapacity==0) {
- destLimit=dest=nullptr;
- } else if(destCapacity==-1) {
- // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
- destLimit = static_cast<char*>(U_MAX_PTR(dest));
- // for NUL-termination, translate into highest int32_t
- destCapacity=0x7fffffff;
- } else {
- destLimit=dest+destCapacity;
- }
- // perform the conversion
- ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &errorCode);
- length = static_cast<int32_t>(dest - originalDest);
- // if an overflow occurs, then get the preflighting length
- if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
- char buffer[1024];
- destLimit=buffer+sizeof(buffer);
- do {
- dest=buffer;
- errorCode=U_ZERO_ERROR;
- ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &errorCode);
- length += static_cast<int32_t>(dest - buffer);
- } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
- }
- return u_terminateChars(originalDest, destCapacity, length, &errorCode);
- }
- void
- UnicodeString::doCodepageCreate(const char *codepageData,
- int32_t dataLength,
- const char *codepage)
- {
- // if there's nothing to convert, do nothing
- if (codepageData == nullptr || dataLength == 0 || dataLength < -1) {
- return;
- }
- if(dataLength == -1) {
- dataLength = static_cast<int32_t>(uprv_strlen(codepageData));
- }
- UErrorCode status = U_ZERO_ERROR;
- // create the converter
- // if the codepage is the default, use our cache
- // if it is an empty string, then use the "invariant character" conversion
- UConverter *converter;
- if (codepage == nullptr) {
- const char *defaultName = ucnv_getDefaultName();
- if(UCNV_FAST_IS_UTF8(defaultName)) {
- setToUTF8(StringPiece(codepageData, dataLength));
- return;
- }
- converter = u_getDefaultConverter(&status);
- } else if (*codepage == 0) {
- // use the "invariant characters" conversion
- if(cloneArrayIfNeeded(dataLength, dataLength, false)) {
- u_charsToUChars(codepageData, getArrayStart(), dataLength);
- setLength(dataLength);
- } else {
- setToBogus();
- }
- return;
- } else {
- converter = ucnv_open(codepage, &status);
- }
- // if we failed, set the appropriate flags and return
- if(U_FAILURE(status)) {
- setToBogus();
- return;
- }
- // perform the conversion
- doCodepageCreate(codepageData, dataLength, converter, status);
- if(U_FAILURE(status)) {
- setToBogus();
- }
- // close the converter
- if (codepage == nullptr) {
- u_releaseDefaultConverter(converter);
- } else {
- ucnv_close(converter);
- }
- }
- void
- UnicodeString::doCodepageCreate(const char *codepageData,
- int32_t dataLength,
- UConverter *converter,
- UErrorCode &status)
- {
- if(U_FAILURE(status)) {
- return;
- }
- // set up the conversion parameters
- const char *mySource = codepageData;
- const char *mySourceEnd = mySource + dataLength;
- char16_t *array, *myTarget;
- // estimate the size needed:
- int32_t arraySize;
- if(dataLength <= US_STACKBUF_SIZE) {
- // try to use the stack buffer
- arraySize = US_STACKBUF_SIZE;
- } else {
- // 1.25 char16_t's per source byte should cover most cases
- arraySize = dataLength + (dataLength >> 2);
- }
- // we do not care about the current contents
- UBool doCopyArray = false;
- for(;;) {
- if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
- setToBogus();
- break;
- }
- // perform the conversion
- array = getArrayStart();
- myTarget = array + length();
- ucnv_toUnicode(converter, &myTarget, array + getCapacity(),
- &mySource, mySourceEnd, nullptr, true, &status);
- // update the conversion parameters
- setLength(static_cast<int32_t>(myTarget - array));
- // allocate more space and copy data, if needed
- if(status == U_BUFFER_OVERFLOW_ERROR) {
- // reset the error code
- status = U_ZERO_ERROR;
- // keep the previous conversion results
- doCopyArray = true;
- // estimate the new size needed, larger than before
- // try 2 char16_t's per remaining source byte
- arraySize = static_cast<int32_t>(length() + 2 * (mySourceEnd - mySource));
- } else {
- break;
- }
- }
- }
- U_NAMESPACE_END
- #endif
|