12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- ******************************************************************************
- *
- * Copyright (C) 2001-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- ******************************************************************************
- *
- * File ustrtrns.cpp
- *
- * Modification History:
- *
- * Date Name Description
- * 9/10/2001 Ram Creation.
- ******************************************************************************
- */
- /*******************************************************************************
- *
- * u_strTo* and u_strFrom* APIs
- * WCS functions moved to ustr_wcs.c for better modularization
- *
- *******************************************************************************
- */
- #include "unicode/putil.h"
- #include "unicode/ustring.h"
- #include "unicode/utf.h"
- #include "unicode/utf8.h"
- #include "unicode/utf16.h"
- #include "cstring.h"
- #include "cmemory.h"
- #include "ustr_imp.h"
- #include "uassert.h"
- /*
-  * Converts a UTF-32 string to UTF-16.
-  *
-  * dest/destCapacity  output buffer and its size in char16_t units
-  *                    (dest may be nullptr only if destCapacity is 0)
-  * pDestLength        optional; receives the full UTF-16 length of the result,
-  *                    including units that did not fit into dest
-  * src/srcLength      input code points; srcLength == -1 means NUL-terminated
-  * subchar            replacement for surrogate or out-of-range input values;
-  *                    a negative value makes such input an error
-  * pNumSubstitutions  optional; receives the number of replacements made
-  * pErrorCode         in/out status; set to U_ILLEGAL_ARGUMENT_ERROR for bad
-  *                    arguments and to U_INVALID_CHAR_FOUND for invalid input
-  *                    when subchar is negative
-  *
-  * Returns dest, or nullptr if an error was detected here or on entry.
-  */
- U_CAPI char16_t* U_EXPORT2
- u_strFromUTF32WithSub(char16_t *dest,
-                       int32_t destCapacity,
-                       int32_t *pDestLength,
-                       const UChar32 *src,
-                       int32_t srcLength,
-                       UChar32 subchar, int32_t *pNumSubstitutions,
-                       UErrorCode *pErrorCode) {
-     /* Do nothing if the caller already carries a failure. */
-     if(U_FAILURE(*pErrorCode)) {
-         return nullptr;
-     }
- 
-     /* Validate the arguments. */
-     const bool badSource = (src == nullptr && srcLength != 0) || srcLength < -1;
-     const bool badTarget = (destCapacity < 0) || (dest == nullptr && destCapacity > 0);
-     const bool badSubchar = subchar > 0x10ffff || U_IS_SURROGATE(subchar);
-     if(badSource || badTarget || badSubchar) {
-         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
-         return nullptr;
-     }
- 
-     if(pNumSubstitutions != nullptr) {
-         *pNumSubstitutions = 0;
-     }
- 
-     char16_t *out = dest;
-     char16_t *outLimit = (dest != nullptr) ? (dest + destCapacity) : nullptr;
-     int32_t overflowUnits = 0;   /* units that did not fit into dest */
-     int32_t substitutions = 0;
-     const UChar32 *srcEnd;
- 
-     if(srcLength >= 0) {
-         srcEnd = (src != nullptr) ? (src + srcLength) : nullptr;
-     } else {
-         /* NUL-terminated input: quick pass over the leading BMP code points. */
-         UChar32 c;
-         for(; (c = *src) != 0; ++src) {
-             if(!((uint32_t)c < 0xd800 || (0xe000 <= c && c <= 0xffff))) {
-                 break;
-             }
-             if(out < outLimit) {
-                 *out++ = (char16_t)c;
-             } else {
-                 ++overflowUnits;
-             }
-         }
-         /* If we stopped early, locate the terminator for the general loop. */
-         srcEnd = src;
-         if(c != 0) {
-             while(*++srcEnd != 0) {}
-         }
-     }
- 
-     /* General loop over [src, srcEnd). */
-     while(src < srcEnd) {
-         UChar32 c = *src++;
-         const bool isBmpScalar = (uint32_t)c < 0xd800 || (0xe000 <= c && c <= 0xffff);
-         const bool isSupplementary = 0x10000 <= c && c <= 0x10ffff;
-         if(!isBmpScalar && !isSupplementary) {
-             /* surrogate code point, or not a Unicode code point at all */
-             if(subchar < 0) {
-                 *pErrorCode = U_INVALID_CHAR_FOUND;
-                 return nullptr;
-             }
-             /* subchar was validated above, so it is writable below */
-             c = subchar;
-             ++substitutions;
-         }
- 
-         if((uint32_t)c < 0xd800 || (0xe000 <= c && c <= 0xffff)) {
-             if(out < outLimit) {
-                 *out++ = (char16_t)c;
-             } else {
-                 ++overflowUnits;
-             }
-         } else {
-             /* supplementary code point: needs a surrogate pair */
-             if(out != nullptr && ((out + 2) <= outLimit)) {
-                 *out++ = U16_LEAD(c);
-                 *out++ = U16_TRAIL(c);
-             } else {
-                 overflowUnits += 2;
-             }
-         }
-     }
- 
-     const int32_t totalLength = overflowUnits + (int32_t)(out - dest);
-     if(pDestLength) {
-         *pDestLength = totalLength;
-     }
-     if(pNumSubstitutions != nullptr) {
-         *pNumSubstitutions = substitutions;
-     }
- 
-     /* Terminate the buffer */
-     u_terminateUChars(dest, destCapacity, totalLength, pErrorCode);
- 
-     return dest;
- }
- /*
-  * Converts UTF-32 to UTF-16 without substitution: forwards to
-  * u_strFromUTF32WithSub() with U_SENTINEL as the substitution character
-  * and no substitution counter.
-  */
- U_CAPI char16_t* U_EXPORT2
- u_strFromUTF32(char16_t *dest,
-                int32_t destCapacity,
-                int32_t *pDestLength,
-                const UChar32 *src,
-                int32_t srcLength,
-                UErrorCode *pErrorCode) {
-     char16_t *converted = u_strFromUTF32WithSub(dest, destCapacity, pDestLength,
-                                                 src, srcLength,
-                                                 U_SENTINEL, nullptr,
-                                                 pErrorCode);
-     return converted;
- }
- /*
-  * Converts a UTF-16 string to UTF-32.
-  *
-  * dest/destCapacity  output buffer and its size in UChar32 units
-  *                    (dest may be nullptr only if destCapacity is 0)
-  * pDestLength        optional; receives the full number of code points of the
-  *                    result, including those that did not fit into dest
-  * src/srcLength      input UTF-16; srcLength == -1 means NUL-terminated
-  * subchar            replacement for unpaired surrogates; a negative value
-  *                    makes an unpaired surrogate an error
-  * pNumSubstitutions  optional; receives the number of replacements made
-  * pErrorCode         in/out status; set to U_ILLEGAL_ARGUMENT_ERROR for bad
-  *                    arguments and to U_INVALID_CHAR_FOUND for an unpaired
-  *                    surrogate when subchar is negative
-  *
-  * Returns dest, or nullptr if an error was detected here or on entry.
-  */
- U_CAPI UChar32* U_EXPORT2
- u_strToUTF32WithSub(UChar32 *dest,
-                     int32_t destCapacity,
-                     int32_t *pDestLength,
-                     const char16_t *src,
-                     int32_t srcLength,
-                     UChar32 subchar, int32_t *pNumSubstitutions,
-                     UErrorCode *pErrorCode) {
-     /* Do nothing if the caller already carries a failure. */
-     if(U_FAILURE(*pErrorCode)) {
-         return nullptr;
-     }
- 
-     /* Validate the arguments. */
-     const bool badSource = (src == nullptr && srcLength != 0) || srcLength < -1;
-     const bool badTarget = (destCapacity < 0) || (dest == nullptr && destCapacity > 0);
-     const bool badSubchar = subchar > 0x10ffff || U_IS_SURROGATE(subchar);
-     if(badSource || badTarget || badSubchar) {
-         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
-         return nullptr;
-     }
- 
-     if(pNumSubstitutions != nullptr) {
-         *pNumSubstitutions = 0;
-     }
- 
-     UChar32 *out = dest;
-     UChar32 *outLimit = (dest != nullptr) ? (dest + destCapacity) : nullptr;
-     int32_t overflowUnits = 0;   /* code points that did not fit into dest */
-     int32_t substitutions = 0;
-     const char16_t *srcEnd;
- 
-     if(srcLength >= 0) {
-         srcEnd = (src != nullptr) ? (src + srcLength) : nullptr;
-     } else {
-         /* NUL-terminated input: quick pass over leading non-surrogates. */
-         UChar32 c;
-         for(; (c = *src) != 0; ++src) {
-             if(U16_IS_SURROGATE(c)) {
-                 break;
-             }
-             if(out < outLimit) {
-                 *out++ = c;
-             } else {
-                 ++overflowUnits;
-             }
-         }
-         /* If we stopped early, locate the terminator for the general loop. */
-         srcEnd = src;
-         if(c != 0) {
-             while(*++srcEnd != 0) {}
-         }
-     }
- 
-     /* General loop over [src, srcEnd). */
-     while(src < srcEnd) {
-         UChar32 c = *src++;
-         if(U16_IS_SURROGATE(c)) {
-             char16_t trail;
-             if(U16_IS_SURROGATE_LEAD(c) && src < srcEnd && U16_IS_TRAIL(trail = *src)) {
-                 /* well-formed pair: combine into one code point */
-                 ++src;
-                 c = U16_GET_SUPPLEMENTARY(c, trail);
-             } else {
-                 /* unpaired surrogate */
-                 if(subchar < 0) {
-                     *pErrorCode = U_INVALID_CHAR_FOUND;
-                     return nullptr;
-                 }
-                 c = subchar;
-                 ++substitutions;
-             }
-         }
- 
-         /* write the code point, or just count it if dest is full */
-         if(out < outLimit) {
-             *out++ = c;
-         } else {
-             ++overflowUnits;
-         }
-     }
- 
-     const int32_t totalLength = overflowUnits + (int32_t)(out - dest);
-     if(pDestLength) {
-         *pDestLength = totalLength;
-     }
-     if(pNumSubstitutions != nullptr) {
-         *pNumSubstitutions = substitutions;
-     }
- 
-     /* Terminate the buffer */
-     u_terminateUChar32s(dest, destCapacity, totalLength, pErrorCode);
-     return dest;
- }
- /*
-  * Converts UTF-16 to UTF-32 without substitution: forwards to
-  * u_strToUTF32WithSub() with U_SENTINEL as the substitution character
-  * and no substitution counter.
-  */
- U_CAPI UChar32* U_EXPORT2
- u_strToUTF32(UChar32 *dest,
-              int32_t destCapacity,
-              int32_t *pDestLength,
-              const char16_t *src,
-              int32_t srcLength,
-              UErrorCode *pErrorCode) {
-     UChar32 *converted = u_strToUTF32WithSub(dest, destCapacity, pDestLength,
-                                              src, srcLength,
-                                              U_SENTINEL, nullptr,
-                                              pErrorCode);
-     return converted;
- }
- U_CAPI char16_t* U_EXPORT2 // Converts UTF-8 in src to UTF-16 in dest; ill-formed input is replaced by subchar, or is an error if subchar < 0.
- u_strFromUTF8WithSub(char16_t *dest, // output buffer; nullptr is accepted only together with destCapacity == 0
- int32_t destCapacity, // capacity of dest in char16_t units; negative values are rejected
- int32_t *pDestLength, // optional out: number of char16_t units the complete result needs
- const char* src, // UTF-8 input; nullptr is accepted only together with srcLength == 0
- int32_t srcLength, // number of bytes in src, or -1 if src is NUL-terminated
- UChar32 subchar, int32_t *pNumSubstitutions, // replacement code point (negative: none) and optional out: number of replacements
- UErrorCode *pErrorCode){ // in/out status; returns nullptr at once if it already holds a failure
- /* args check */
- if(U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
- (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
- subchar > 0x10ffff || U_IS_SURROGATE(subchar)
- ) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- if(pNumSubstitutions!=nullptr) {
- *pNumSubstitutions=0; // reset early so the early error returns below leave it at 0
- }
- char16_t *pDest = dest; // next write position
- char16_t *pDestLimit = dest+destCapacity; // one past the last writable unit
- int32_t reqLength = 0; // units that were counted but not written
- int32_t numSubstitutions=0;
- /*
- * Inline processing of UTF-8 byte sequences:
- *
- * Byte sequences for the most common characters are handled inline in
- * the conversion loops. In order to reduce the path lengths for those
- * characters, the tests are arranged in a kind of binary search.
- * ASCII (<=0x7f) is checked first, followed by the dividing point
- * between 2- and 3-byte sequences (0xe0).
- * The 3-byte branch is tested first to speed up CJK text.
- * The compiler should combine the subtractions for the two tests for 0xe0.
- * Each branch then tests for the other end of its range.
- */
- if(srcLength < 0){
- /*
- * Transform a NUL-terminated string.
- * The code explicitly checks for NULs only in the lead byte position.
- * A NUL byte in the trail byte position fails the trail byte range check anyway.
- */
- int32_t i; // byte index into src, shared by the convert and pre-flight loops
- UChar32 c;
- for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) { // convert while dest has room for at least one unit
- // modified copy of U8_NEXT()
- ++i;
- if(U8_IS_SINGLE(c)) {
- *pDest++=(char16_t)c;
- } else {
- uint8_t __t1, __t2;
- if( /* handle U+0800..U+FFFF inline */
- (0xe0<=(c) && (c)<0xf0) &&
- U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
- (__t2=src[(i)+1]-0x80)<=0x3f) {
- *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
- i+=2;
- } else if( /* handle U+0080..U+07FF inline */
- ((c)<0xe0 && (c)>=0xc2) &&
- (__t1=src[i]-0x80)<=0x3f) {
- *pDest++ = (((c)&0x1f)<<6)|__t1;
- ++(i);
- } else {
- /* function call for "complicated" and error cases */
- (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
- if(c<0 && (++numSubstitutions, c = subchar) < 0) { // a negative result is handled as ill-formed input: substitute, or fail if subchar is negative
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- } else if(c<=0xFFFF) {
- *(pDest++)=(char16_t)c;
- } else {
- *(pDest++)=U16_LEAD(c);
- if(pDest<pDestLimit) {
- *(pDest++)=U16_TRAIL(c);
- } else {
- reqLength++; // the trail surrogate did not fit: count it and fall through to pre-flighting
- break;
- }
- }
- }
- }
- }
- /* Pre-flight the rest of the string. */
- while((c = (uint8_t)src[i]) != 0) { // same decoding as above, but only counts output units
- // modified copy of U8_NEXT()
- ++i;
- if(U8_IS_SINGLE(c)) {
- ++reqLength;
- } else {
- uint8_t __t1, __t2;
- if( /* handle U+0800..U+FFFF inline */
- (0xe0<=(c) && (c)<0xf0) &&
- U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
- (__t2=src[(i)+1]-0x80)<=0x3f) {
- ++reqLength;
- i+=2;
- } else if( /* handle U+0080..U+07FF inline */
- ((c)<0xe0 && (c)>=0xc2) &&
- (__t1=src[i]-0x80)<=0x3f) {
- ++reqLength;
- ++(i);
- } else {
- /* function call for "complicated" and error cases */
- (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
- if(c<0 && (++numSubstitutions, c = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- }
- reqLength += U16_LENGTH(c);
- }
- }
- }
- } else /* srcLength >= 0 */ {
- /* Faster loop without ongoing checking for srcLength and pDestLimit. */
- int32_t i = 0; // byte index into src, shared by all three loops below
- UChar32 c;
- for(;;) {
- /*
- * Each iteration of the inner loop progresses by at most 3 UTF-8
- * bytes and one char16_t, for most characters.
- * For supplementary code points (4 & 2), which are rare,
- * there is an additional adjustment.
- */
- int32_t count = (int32_t)(pDestLimit - pDest); // remaining output units
- int32_t count2 = (srcLength - i) / 3; // characters guaranteed readable at 3 bytes each
- if(count > count2) {
- count = count2; /* min(remaining dest, remaining src/3) */
- }
- if(count < 3) {
- /*
- * Too much overhead if we get near the end of the string,
- * continue with the next loop.
- */
- break;
- }
- do {
- // modified copy of U8_NEXT()
- c = (uint8_t)src[i++];
- if(U8_IS_SINGLE(c)) {
- *pDest++=(char16_t)c;
- } else {
- uint8_t __t1, __t2;
- if( /* handle U+0800..U+FFFF inline */
- (0xe0<=(c) && (c)<0xf0) &&
- ((i)+1)<srcLength &&
- U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
- (__t2=src[(i)+1]-0x80)<=0x3f) {
- *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
- i+=2;
- } else if( /* handle U+0080..U+07FF inline */
- ((c)<0xe0 && (c)>=0xc2) &&
- ((i)!=srcLength) &&
- (__t1=src[i]-0x80)<=0x3f) {
- *pDest++ = (((c)&0x1f)<<6)|__t1;
- ++(i);
- } else {
- if(c >= 0xf0 || subchar > 0xffff) { // this character, or its replacement, may need two output units
- // We may read up to four bytes and write up to two UChars,
- // which we didn't account for with computing count,
- // so we adjust it here.
- if(--count == 0) {
- --i; // back out byte c
- break; // leaves the do-loop; the enclosing for(;;) recomputes count
- }
- }
- /* function call for "complicated" and error cases */
- (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
- if(c<0 && (++numSubstitutions, c = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- } else if(c<=0xFFFF) {
- *(pDest++)=(char16_t)c;
- } else {
- *(pDest++)=U16_LEAD(c); // no limit check here: the count adjustment above reserved the second unit
- *(pDest++)=U16_TRAIL(c);
- }
- }
- }
- } while(--count > 0);
- }
- while(i < srcLength && (pDest < pDestLimit)) { // careful loop: checks both the source and the destination limit per character
- // modified copy of U8_NEXT()
- c = (uint8_t)src[i++];
- if(U8_IS_SINGLE(c)) {
- *pDest++=(char16_t)c;
- } else {
- uint8_t __t1, __t2;
- if( /* handle U+0800..U+FFFF inline */
- (0xe0<=(c) && (c)<0xf0) &&
- ((i)+1)<srcLength &&
- U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
- (__t2=src[(i)+1]-0x80)<=0x3f) {
- *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
- i+=2;
- } else if( /* handle U+0080..U+07FF inline */
- ((c)<0xe0 && (c)>=0xc2) &&
- ((i)!=srcLength) &&
- (__t1=src[i]-0x80)<=0x3f) {
- *pDest++ = (((c)&0x1f)<<6)|__t1;
- ++(i);
- } else {
- /* function call for "complicated" and error cases */
- (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
- if(c<0 && (++numSubstitutions, c = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- } else if(c<=0xFFFF) {
- *(pDest++)=(char16_t)c;
- } else {
- *(pDest++)=U16_LEAD(c);
- if(pDest<pDestLimit) {
- *(pDest++)=U16_TRAIL(c);
- } else {
- reqLength++; // the trail surrogate did not fit: count it and fall through to pre-flighting
- break;
- }
- }
- }
- }
- }
- /* Pre-flight the rest of the string. */
- while(i < srcLength) { // same decoding as above, but only counts output units
- // modified copy of U8_NEXT()
- c = (uint8_t)src[i++];
- if(U8_IS_SINGLE(c)) {
- ++reqLength;
- } else {
- uint8_t __t1, __t2;
- if( /* handle U+0800..U+FFFF inline */
- (0xe0<=(c) && (c)<0xf0) &&
- ((i)+1)<srcLength &&
- U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
- (__t2=src[(i)+1]-0x80)<=0x3f) {
- ++reqLength;
- i+=2;
- } else if( /* handle U+0080..U+07FF inline */
- ((c)<0xe0 && (c)>=0xc2) &&
- ((i)!=srcLength) &&
- (__t1=src[i]-0x80)<=0x3f) {
- ++reqLength;
- ++(i);
- } else {
- /* function call for "complicated" and error cases */
- (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
- if(c<0 && (++numSubstitutions, c = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- }
- reqLength += U16_LENGTH(c);
- }
- }
- }
- }
- reqLength+=(int32_t)(pDest - dest); // total = units written + units only counted
- if(pNumSubstitutions!=nullptr) {
- *pNumSubstitutions=numSubstitutions;
- }
- if(pDestLength){
- *pDestLength = reqLength;
- }
- /* Terminate the buffer */
- u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); // defined elsewhere; NOTE(review): presumably appends the NUL and reports overflow through pErrorCode - confirm
- return dest;
- }
- /*
-  * Converts UTF-8 to UTF-16 without substitution: forwards to
-  * u_strFromUTF8WithSub() with U_SENTINEL as the substitution character
-  * and no substitution counter.
-  */
- U_CAPI char16_t* U_EXPORT2
- u_strFromUTF8(char16_t *dest,
-               int32_t destCapacity,
-               int32_t *pDestLength,
-               const char* src,
-               int32_t srcLength,
-               UErrorCode *pErrorCode){
-     char16_t *converted = u_strFromUTF8WithSub(dest, destCapacity, pDestLength,
-                                                src, srcLength,
-                                                U_SENTINEL, nullptr,
-                                                pErrorCode);
-     return converted;
- }
- /*
-  * Converts UTF-8 to UTF-16 without validating the input.
-  *
-  * Lead bytes select the sequence length; trail bytes are combined
-  * arithmetically without range checks, so ill-formed input yields
-  * unspecified (but memory-safe) output. A trail byte in lead position is
-  * copied like a single byte. A sequence that is cut off by the end of the
-  * input produces U+FFFD and ends the conversion.
-  *
-  * dest/destCapacity  output buffer and its size in char16_t units
-  *                    (dest may be nullptr only if destCapacity is 0)
-  * pDestLength        optional; receives the UTF-16 length of the result
-  * src/srcLength      UTF-8 input; srcLength == -1 means NUL-terminated
-  * pErrorCode         in/out status; set to U_ILLEGAL_ARGUMENT_ERROR for bad
-  *                    arguments
-  *
-  * If srcLength >= 0, destCapacity must be at least srcLength. Otherwise
-  * *pDestLength is set to srcLength (an overestimate), *pErrorCode is set to
-  * U_BUFFER_OVERFLOW_ERROR and nullptr is returned without converting.
-  * If srcLength == -1, output that does not fit is counted (pre-flighted).
-  *
-  * Returns dest, or nullptr on error.
-  */
- U_CAPI char16_t * U_EXPORT2
- u_strFromUTF8Lenient(char16_t *dest,
-                      int32_t destCapacity,
-                      int32_t *pDestLength,
-                      const char *src,
-                      int32_t srcLength,
-                      UErrorCode *pErrorCode) {
-     char16_t *pDest = dest;
-     UChar32 ch;
-     int32_t reqLength = 0;
-     /* The input is only read, so keep it const. */
-     const uint8_t *pSrc = (const uint8_t *)src;
- 
-     /* args check */
-     if(U_FAILURE(*pErrorCode)){
-         return nullptr;
-     }
- 
-     if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
-         (destCapacity<0) || (dest == nullptr && destCapacity > 0)
-     ) {
-         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
-         return nullptr;
-     }
- 
-     if(srcLength < 0) {
-         /* Transform a NUL-terminated string. */
-         char16_t *pDestLimit = (dest!=nullptr)?(dest+destCapacity):nullptr;
-         uint8_t t1, t2, t3; /* trail bytes */
- 
-         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
-             if(ch < 0xc0) {
-                 /*
-                  * ASCII, or a trail byte in lead position which is treated like
-                  * a single-byte sequence for better character boundary
-                  * resynchronization after illegal sequences.
-                  */
-                 *pDest++=(char16_t)ch;
-                 ++pSrc;
-                 continue;
-             } else if(ch < 0xe0) { /* U+0080..U+07FF */
-                 if((t1 = pSrc[1]) != 0) {
-                     /* 0x3080 = (0xc0 << 6) + 0x80 */
-                     *pDest++ = (char16_t)((ch << 6) + t1 - 0x3080);
-                     pSrc += 2;
-                     continue;
-                 }
-             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
-                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
-                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
-                     /* 0x2080 = (0x80 << 6) + 0x80 */
-                     *pDest++ = (char16_t)((ch << 12) + (t1 << 6) + t2 - 0x2080);
-                     pSrc += 3;
-                     continue;
-                 }
-             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
-                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
-                     pSrc += 4;
-                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
-                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
-                     *(pDest++) = U16_LEAD(ch);
-                     if(pDest < pDestLimit) {
-                         *(pDest++) = U16_TRAIL(ch);
-                     } else {
-                         /* the trail surrogate did not fit: count it and pre-flight the rest */
-                         reqLength = 1;
-                         break;
-                     }
-                     continue;
-                 }
-             }
- 
-             /* truncated character at the end */
-             *pDest++ = 0xfffd;
-             /* skip to the terminating NUL so that the pre-flight loop below does nothing */
-             while(*++pSrc != 0) {}
-             break;
-         }
- 
-         /* Pre-flight the rest of the string. */
-         while((ch = *pSrc) != 0) {
-             if(ch < 0xc0) {
-                 /*
-                  * ASCII, or a trail byte in lead position which is treated like
-                  * a single-byte sequence for better character boundary
-                  * resynchronization after illegal sequences.
-                  */
-                 ++reqLength;
-                 ++pSrc;
-                 continue;
-             } else if(ch < 0xe0) { /* U+0080..U+07FF */
-                 if(pSrc[1] != 0) {
-                     ++reqLength;
-                     pSrc += 2;
-                     continue;
-                 }
-             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
-                 if(pSrc[1] != 0 && pSrc[2] != 0) {
-                     ++reqLength;
-                     pSrc += 3;
-                     continue;
-                 }
-             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
-                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
-                     reqLength += 2;
-                     pSrc += 4;
-                     continue;
-                 }
-             }
- 
-             /* truncated character at the end */
-             ++reqLength;
-             break;
-         }
-     } else /* srcLength >= 0 */ {
-         const uint8_t *pSrcLimit = (pSrc!=nullptr)?(pSrc + srcLength):nullptr;
- 
-         /*
-          * This function requires that if srcLength is given, then it must be
-          * destCapacity >= srcLength so that we need not check for
-          * destination buffer overflow in the loop.
-          */
-         if(destCapacity < srcLength) {
-             if(pDestLength != nullptr) {
-                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
-             }
-             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
-             return nullptr;
-         }
- 
-         if((pSrcLimit - pSrc) >= 4) {
-             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
- 
-             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
-             do {
-                 ch = *pSrc++;
-                 if(ch < 0xc0) {
-                     /*
-                      * ASCII, or a trail byte in lead position which is treated like
-                      * a single-byte sequence for better character boundary
-                      * resynchronization after illegal sequences.
-                      */
-                     *pDest++=(char16_t)ch;
-                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
-                     /* 0x3080 = (0xc0 << 6) + 0x80 */
-                     *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
-                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
-                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
-                     /* 0x2080 = (0x80 << 6) + 0x80 */
-                     ch = (ch << 12) + (*pSrc++ << 6);
-                     *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
-                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
-                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
-                     ch = (ch << 18) + (*pSrc++ << 12);
-                     ch += *pSrc++ << 6;
-                     ch += *pSrc++ - 0x3c82080;
-                     *(pDest++) = U16_LEAD(ch);
-                     *(pDest++) = U16_TRAIL(ch);
-                 }
-             } while(pSrc < pSrcLimit);
- 
-             pSrcLimit += 3; /* restore original pSrcLimit */
-         }
- 
-         /* At most 3 bytes remain here; check the limit before every trail byte. */
-         while(pSrc < pSrcLimit) {
-             ch = *pSrc++;
-             if(ch < 0xc0) {
-                 /*
-                  * ASCII, or a trail byte in lead position which is treated like
-                  * a single-byte sequence for better character boundary
-                  * resynchronization after illegal sequences.
-                  */
-                 *pDest++=(char16_t)ch;
-                 continue;
-             } else if(ch < 0xe0) { /* U+0080..U+07FF */
-                 if(pSrc < pSrcLimit) {
-                     /* 0x3080 = (0xc0 << 6) + 0x80 */
-                     *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
-                     continue;
-                 }
-             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
-                 if((pSrcLimit - pSrc) >= 2) {
-                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
-                     /* 0x2080 = (0x80 << 6) + 0x80 */
-                     ch = (ch << 12) + (*pSrc++ << 6);
-                     *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
-                     /*
-                      * The *pSrc++ reads above already consumed the whole
-                      * sequence; advancing again would move pSrc past
-                      * pSrcLimit, which is undefined pointer arithmetic.
-                      */
-                     continue;
-                 }
-             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
-                 if((pSrcLimit - pSrc) >= 3) {
-                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
-                     ch = (ch << 18) + (*pSrc++ << 12);
-                     ch += *pSrc++ << 6;
-                     ch += *pSrc++ - 0x3c82080;
-                     *(pDest++) = U16_LEAD(ch);
-                     *(pDest++) = U16_TRAIL(ch);
-                     /* the sequence is fully consumed by the reads above */
-                     continue;
-                 }
-             }
- 
-             /* truncated character at the end */
-             *pDest++ = 0xfffd;
-             break;
-         }
-     }
- 
-     /* total = units written + units only counted */
-     reqLength+=(int32_t)(pDest - dest);
- 
-     if(pDestLength){
-         *pDestLength = reqLength;
-     }
- 
-     /* Terminate the buffer */
-     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
- 
-     return dest;
- }
- /*
-  * Writes the UTF-8 encoding of c at pDest and returns the position just
-  * after the last byte written (1 to 4 bytes further).
-  * No validation is done here: validating callers pass 0<=c<=0x10ffff
-  * that is not a surrogate, and they make sure the bytes fit.
-  */
- static inline uint8_t *
- _appendUTF8(uint8_t *pDest, UChar32 c) {
-     if(c <= 0x7f) {
-         /* one byte */
-         pDest[0] = static_cast<uint8_t>(c);
-         return pDest + 1;
-     }
-     if(c <= 0x7ff) {
-         /* two bytes: 110xxxxx 10xxxxxx */
-         pDest[0] = static_cast<uint8_t>((c >> 6) | 0xc0);
-         pDest[1] = static_cast<uint8_t>((c & 0x3f) | 0x80);
-         return pDest + 2;
-     }
-     if(c <= 0xffff) {
-         /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
-         pDest[0] = static_cast<uint8_t>((c >> 12) | 0xe0);
-         pDest[1] = static_cast<uint8_t>(((c >> 6) & 0x3f) | 0x80);
-         pDest[2] = static_cast<uint8_t>((c & 0x3f) | 0x80);
-         return pDest + 3;
-     }
-     /* everything else, normally up to 0x10ffff: four bytes */
-     pDest[0] = static_cast<uint8_t>((c >> 18) | 0xf0);
-     pDest[1] = static_cast<uint8_t>(((c >> 12) & 0x3f) | 0x80);
-     pDest[2] = static_cast<uint8_t>(((c >> 6) & 0x3f) | 0x80);
-     pDest[3] = static_cast<uint8_t>((c & 0x3f) | 0x80);
-     return pDest + 4;
- }
-
- U_CAPI char* U_EXPORT2 // Converts UTF-16 in pSrc to UTF-8 in dest; unpaired surrogates are replaced by subchar, or are an error if subchar < 0.
- u_strToUTF8WithSub(char *dest, // output buffer; nullptr is accepted only together with destCapacity == 0
- int32_t destCapacity, // capacity of dest in bytes; negative values are rejected
- int32_t *pDestLength, // optional out: number of bytes the complete result needs
- const char16_t *pSrc, // UTF-16 input; nullptr is accepted only together with srcLength == 0
- int32_t srcLength, // number of char16_t units in pSrc, or -1 if pSrc is NUL-terminated
- UChar32 subchar, int32_t *pNumSubstitutions, // replacement code point (negative: none) and optional out: number of replacements
- UErrorCode *pErrorCode){ // in/out status; returns nullptr at once if it already holds a failure
- int32_t reqLength=0; // bytes that were counted but not written
- uint32_t ch=0,ch2=0;
- uint8_t *pDest = (uint8_t *)dest; // next write position
- uint8_t *pDestLimit = (pDest!=nullptr)?(pDest + destCapacity):nullptr; // one past the last writable byte
- int32_t numSubstitutions;
- /* args check */
- if(U_FAILURE(*pErrorCode)){
- return nullptr;
- }
- 
- if( (pSrc==nullptr && srcLength!=0) || srcLength < -1 ||
- (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
- subchar > 0x10ffff || U_IS_SURROGATE(subchar)
- ) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- if(pNumSubstitutions!=nullptr) {
- *pNumSubstitutions=0; // reset early so the early error returns below leave it at 0
- }
- numSubstitutions=0;
- if(srcLength==-1) {
- while((ch=*pSrc)!=0) { // NUL-terminated input: convert until a character does not fit
- ++pSrc;
- if(ch <= 0x7f) {
- if(pDest<pDestLimit) {
- *pDest++ = (uint8_t)ch;
- } else {
- reqLength = 1; // count the character that did not fit, then pre-flight the rest
- break;
- }
- } else if(ch <= 0x7ff) {
- if((pDestLimit - pDest) >= 2) {
- *pDest++=(uint8_t)((ch>>6)|0xc0);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- reqLength = 2;
- break;
- }
- } else if(ch <= 0xd7ff || ch >= 0xe000) {
- if((pDestLimit - pDest) >= 3) {
- *pDest++=(uint8_t)((ch>>12)|0xe0);
- *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- reqLength = 3;
- break;
- }
- } else /* ch is a surrogate */ {
- int32_t length;
- /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
- if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
- ++pSrc;
- ch=U16_GET_SUPPLEMENTARY(ch, ch2);
- } else if(subchar>=0) {
- ch=subchar;
- ++numSubstitutions;
- } else {
- /* Unicode 3.2 forbids surrogate code points in UTF-8 */
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- }
- length = U8_LENGTH(ch);
- if((pDestLimit - pDest) >= length) {
- /* convert and append*/
- pDest=_appendUTF8(pDest, ch);
- } else {
- reqLength = length;
- break;
- }
- }
- }
- while((ch=*pSrc++)!=0) { // pre-flight: only count the bytes for the rest of the string
- if(ch<=0x7f) {
- ++reqLength;
- } else if(ch<=0x7ff) {
- reqLength+=2;
- } else if(!U16_IS_SURROGATE(ch)) {
- reqLength+=3;
- } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
- ++pSrc;
- reqLength+=4;
- } else if(subchar>=0) {
- reqLength+=U8_LENGTH(subchar);
- ++numSubstitutions;
- } else {
- /* Unicode 3.2 forbids surrogate code points in UTF-8 */
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- }
- }
- } else {
- const char16_t *pSrcLimit = (pSrc!=nullptr)?(pSrc+srcLength):nullptr;
- int32_t count;
- /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
- for(;;) {
- /*
- * Each iteration of the inner loop progresses by at most 3 UTF-8
- * bytes and one char16_t, for most characters.
- * For supplementary code points (4 & 2), which are rare,
- * there is an additional adjustment.
- */
- count = (int32_t)((pDestLimit - pDest) / 3); // characters guaranteed writable at 3 bytes each
- srcLength = (int32_t)(pSrcLimit - pSrc); // srcLength is reused as the remaining source length
- if(count > srcLength) {
- count = srcLength; /* min(remaining dest/3, remaining src) */
- }
- if(count < 3) {
- /*
- * Too much overhead if we get near the end of the string,
- * continue with the next loop.
- */
- break;
- }
- do {
- ch=*pSrc++;
- if(ch <= 0x7f) {
- *pDest++ = (uint8_t)ch;
- } else if(ch <= 0x7ff) {
- *pDest++=(uint8_t)((ch>>6)|0xc0);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else if(ch <= 0xd7ff || ch >= 0xe000) {
- *pDest++=(uint8_t)((ch>>12)|0xe0);
- *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else /* ch is a surrogate */ {
- /*
- * We will read two UChars and probably output four bytes,
- * which we didn't account for with computing count,
- * so we adjust it here.
- */
- if(--count == 0) {
- --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
- break; /* recompute count */
- }
- if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { // no pSrcLimit check: count is still positive, so one more source unit is available
- ++pSrc;
- ch=U16_GET_SUPPLEMENTARY(ch, ch2);
- /* writing 4 bytes per 2 UChars is ok */
- *pDest++=(uint8_t)((ch>>18)|0xf0);
- *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
- *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- /* Unicode 3.2 forbids surrogate code points in UTF-8 */
- if(subchar>=0) {
- ch=subchar;
- ++numSubstitutions;
- } else {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- }
- /* convert and append*/
- pDest=_appendUTF8(pDest, ch); // up to 4 bytes; the extra --count above reserved room for them
- }
- }
- } while(--count > 0);
- }
- while(pSrc<pSrcLimit) { // careful loop: checks the remaining capacity before every character
- ch=*pSrc++;
- if(ch <= 0x7f) {
- if(pDest<pDestLimit) {
- *pDest++ = (uint8_t)ch;
- } else {
- reqLength = 1; // count the character that did not fit, then pre-flight the rest
- break;
- }
- } else if(ch <= 0x7ff) {
- if((pDestLimit - pDest) >= 2) {
- *pDest++=(uint8_t)((ch>>6)|0xc0);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- reqLength = 2;
- break;
- }
- } else if(ch <= 0xd7ff || ch >= 0xe000) {
- if((pDestLimit - pDest) >= 3) {
- *pDest++=(uint8_t)((ch>>12)|0xe0);
- *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- reqLength = 3;
- break;
- }
- } else /* ch is a surrogate */ {
- int32_t length;
- if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
- ++pSrc;
- ch=U16_GET_SUPPLEMENTARY(ch, ch2);
- } else if(subchar>=0) {
- ch=subchar;
- ++numSubstitutions;
- } else {
- /* Unicode 3.2 forbids surrogate code points in UTF-8 */
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- }
- length = U8_LENGTH(ch);
- if((pDestLimit - pDest) >= length) {
- /* convert and append*/
- pDest=_appendUTF8(pDest, ch);
- } else {
- reqLength = length;
- break;
- }
- }
- }
- while(pSrc<pSrcLimit) { // pre-flight: only count the bytes for the rest of the string
- ch=*pSrc++;
- if(ch<=0x7f) {
- ++reqLength;
- } else if(ch<=0x7ff) {
- reqLength+=2;
- } else if(!U16_IS_SURROGATE(ch)) {
- reqLength+=3;
- } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
- ++pSrc;
- reqLength+=4;
- } else if(subchar>=0) {
- reqLength+=U8_LENGTH(subchar);
- ++numSubstitutions;
- } else {
- /* Unicode 3.2 forbids surrogate code points in UTF-8 */
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- }
- }
- }
- reqLength+=(int32_t)(pDest - (uint8_t *)dest); // total = bytes written + bytes only counted
- if(pNumSubstitutions!=nullptr) {
- *pNumSubstitutions=numSubstitutions;
- }
- if(pDestLength){
- *pDestLength = reqLength;
- }
- /* Terminate the buffer */
- u_terminateChars(dest, destCapacity, reqLength, pErrorCode); // defined elsewhere; NOTE(review): presumably appends the NUL and reports overflow through pErrorCode - confirm
- return dest;
- }
- /*
-  * Converts UTF-16 to UTF-8 without substitution: forwards to
-  * u_strToUTF8WithSub() with U_SENTINEL as the substitution character
-  * and no substitution counter.
-  */
- U_CAPI char* U_EXPORT2
- u_strToUTF8(char *dest,
-             int32_t destCapacity,
-             int32_t *pDestLength,
-             const char16_t *pSrc,
-             int32_t srcLength,
-             UErrorCode *pErrorCode){
-     char *converted = u_strToUTF8WithSub(dest, destCapacity, pDestLength,
-                                          pSrc, srcLength,
-                                          U_SENTINEL, nullptr,
-                                          pErrorCode);
-     return converted;
- }
- U_CAPI char16_t* U_EXPORT2
- u_strFromJavaModifiedUTF8WithSub(
- char16_t *dest,
- int32_t destCapacity,
- int32_t *pDestLength,
- const char *src,
- int32_t srcLength,
- UChar32 subchar, int32_t *pNumSubstitutions,
- UErrorCode *pErrorCode) {
- /* args check */
- if(U_FAILURE(*pErrorCode)) {
- return nullptr;
- }
- if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
- (dest==nullptr && destCapacity!=0) || destCapacity<0 ||
- subchar > 0x10ffff || U_IS_SURROGATE(subchar)
- ) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- if(pNumSubstitutions!=nullptr) {
- *pNumSubstitutions=0;
- }
- char16_t *pDest = dest;
- char16_t *pDestLimit = dest+destCapacity;
- int32_t reqLength = 0;
- int32_t numSubstitutions=0;
- if(srcLength < 0) {
- /*
- * Transform a NUL-terminated ASCII string.
- * Handle non-ASCII strings with slower code.
- */
- UChar32 c;
- while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
- *pDest++=(char16_t)c;
- ++src;
- }
- if(c == 0) {
- reqLength=(int32_t)(pDest - dest);
- if(pDestLength) {
- *pDestLength = reqLength;
- }
- /* Terminate the buffer */
- u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
- return dest;
- }
- srcLength = static_cast<int32_t>(uprv_strlen(src));
- }
- /* Faster loop without ongoing checking for srcLength and pDestLimit. */
- UChar32 ch;
- uint8_t t1, t2;
- int32_t i = 0;
- for(;;) {
- int32_t count = (int32_t)(pDestLimit - pDest);
- int32_t count2 = srcLength - i;
- if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
- /* fast ASCII loop */
- int32_t start = i;
- uint8_t b;
- while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
- *pDest++=b;
- ++i;
- }
- int32_t delta = i - start;
- count -= delta;
- count2 -= delta;
- }
- /*
- * Each iteration of the inner loop progresses by at most 3 UTF-8
- * bytes and one char16_t.
- */
- if(subchar > 0xFFFF) {
- break;
- }
- count2 /= 3;
- if(count > count2) {
- count = count2; /* min(remaining dest, remaining src/3) */
- }
- if(count < 3) {
- /*
- * Too much overhead if we get near the end of the string,
- * continue with the next loop.
- */
- break;
- }
- do {
- ch = (uint8_t)src[i++];
- if(U8_IS_SINGLE(ch)) {
- *pDest++=(char16_t)ch;
- } else {
- if(ch >= 0xe0) {
- if( /* handle U+0000..U+FFFF inline */
- ch <= 0xef &&
- (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
- *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
- i += 2;
- continue;
- }
- } else {
- if( /* handle U+0000..U+07FF inline */
- ch >= 0xc0 &&
- (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
- ++i;
- continue;
- }
- }
- if(subchar < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- } else if(subchar > 0xffff && --count == 0) {
- /*
- * We need to write two UChars, adjusted count for that,
- * and ran out of space.
- */
- --i; // back out byte ch
- break;
- } else {
- /* function call for error cases */
- utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
- ++numSubstitutions;
- *(pDest++)=(char16_t)subchar;
- }
- }
- } while(--count > 0);
- }
- while(i < srcLength && (pDest < pDestLimit)) {
- ch = (uint8_t)src[i++];
- if(U8_IS_SINGLE(ch)){
- *pDest++=(char16_t)ch;
- } else {
- if(ch >= 0xe0) {
- if( /* handle U+0000..U+FFFF inline */
- ch <= 0xef &&
- (i+1) < srcLength &&
- (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
- *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
- i += 2;
- continue;
- }
- } else {
- if( /* handle U+0000..U+07FF inline */
- ch >= 0xc0 &&
- i < srcLength &&
- (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
- ++i;
- continue;
- }
- }
- if(subchar < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- } else {
- /* function call for error cases */
- utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
- ++numSubstitutions;
- if(subchar<=0xFFFF) {
- *(pDest++)=(char16_t)subchar;
- } else {
- *(pDest++)=U16_LEAD(subchar);
- if(pDest<pDestLimit) {
- *(pDest++)=U16_TRAIL(subchar);
- } else {
- reqLength++;
- break;
- }
- }
- }
- }
- }
- /* Pre-flight the rest of the string. */
- while(i < srcLength) {
- ch = (uint8_t)src[i++];
- if(U8_IS_SINGLE(ch)) {
- reqLength++;
- } else {
- if(ch >= 0xe0) {
- if( /* handle U+0000..U+FFFF inline */
- ch <= 0xef &&
- (i+1) < srcLength &&
- (uint8_t)(src[i] - 0x80) <= 0x3f &&
- (uint8_t)(src[i+1] - 0x80) <= 0x3f
- ) {
- reqLength++;
- i += 2;
- continue;
- }
- } else {
- if( /* handle U+0000..U+07FF inline */
- ch >= 0xc0 &&
- i < srcLength &&
- (uint8_t)(src[i] - 0x80) <= 0x3f
- ) {
- reqLength++;
- ++i;
- continue;
- }
- }
- if(subchar < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return nullptr;
- } else {
- /* function call for error cases */
- utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
- ++numSubstitutions;
- reqLength+=U16_LENGTH(ch);
- }
- }
- }
- if(pNumSubstitutions!=nullptr) {
- *pNumSubstitutions=numSubstitutions;
- }
- reqLength+=(int32_t)(pDest - dest);
- if(pDestLength) {
- *pDestLength = reqLength;
- }
- /* Terminate the buffer */
- u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
- return dest;
- }
- U_CAPI char* U_EXPORT2
- u_strToJavaModifiedUTF8(
- char *dest,
- int32_t destCapacity,
- int32_t *pDestLength,
- const char16_t *src,
- int32_t srcLength,
- UErrorCode *pErrorCode) {
- int32_t reqLength=0;
- uint32_t ch=0;
- const char16_t *pSrcLimit;
- int32_t count;
- /* args check */
- if(U_FAILURE(*pErrorCode)){
- return nullptr;
- }
- if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
- (dest==nullptr && destCapacity!=0) || destCapacity<0
- ) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- uint8_t *pDest = (uint8_t *)dest;
- uint8_t *pDestLimit = pDest + destCapacity;
- if(srcLength==-1) {
- /* Convert NUL-terminated ASCII, then find the string length. */
- while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
- *pDest++ = (uint8_t)ch;
- ++src;
- }
- if(ch == 0) {
- reqLength=(int32_t)(pDest - (uint8_t *)dest);
- if(pDestLength) {
- *pDestLength = reqLength;
- }
- /* Terminate the buffer */
- u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
- return dest;
- }
- srcLength = u_strlen(src);
- }
- /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
- pSrcLimit = (src!=nullptr)?(src+srcLength):nullptr;
- for(;;) {
- count = (int32_t)(pDestLimit - pDest);
- srcLength = (int32_t)(pSrcLimit - src);
- if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
- /* fast ASCII loop */
- const char16_t *prevSrc = src;
- int32_t delta;
- while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
- *pDest++=(uint8_t)ch;
- ++src;
- }
- delta = (int32_t)(src - prevSrc);
- count -= delta;
- srcLength -= delta;
- }
- /*
- * Each iteration of the inner loop progresses by at most 3 UTF-8
- * bytes and one char16_t.
- */
- count /= 3;
- if(count > srcLength) {
- count = srcLength; /* min(remaining dest/3, remaining src) */
- }
- if(count < 3) {
- /*
- * Too much overhead if we get near the end of the string,
- * continue with the next loop.
- */
- break;
- }
- do {
- ch=*src++;
- if(ch <= 0x7f && ch != 0) {
- *pDest++ = (uint8_t)ch;
- } else if(ch <= 0x7ff) {
- *pDest++=(uint8_t)((ch>>6)|0xc0);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- *pDest++=(uint8_t)((ch>>12)|0xe0);
- *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- }
- } while(--count > 0);
- }
- while(src<pSrcLimit) {
- ch=*src++;
- if(ch <= 0x7f && ch != 0) {
- if(pDest<pDestLimit) {
- *pDest++ = (uint8_t)ch;
- } else {
- reqLength = 1;
- break;
- }
- } else if(ch <= 0x7ff) {
- if((pDestLimit - pDest) >= 2) {
- *pDest++=(uint8_t)((ch>>6)|0xc0);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- reqLength = 2;
- break;
- }
- } else {
- if((pDestLimit - pDest) >= 3) {
- *pDest++=(uint8_t)((ch>>12)|0xe0);
- *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
- *pDest++=(uint8_t)((ch&0x3f)|0x80);
- } else {
- reqLength = 3;
- break;
- }
- }
- }
- while(src<pSrcLimit) {
- ch=*src++;
- if(ch <= 0x7f && ch != 0) {
- ++reqLength;
- } else if(ch<=0x7ff) {
- reqLength+=2;
- } else {
- reqLength+=3;
- }
- }
- reqLength+=(int32_t)(pDest - (uint8_t *)dest);
- if(pDestLength){
- *pDestLength = reqLength;
- }
- /* Terminate the buffer */
- u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
- return dest;
- }
|