ustring.cpp 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 1998-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. *
  11. * File ustring.cpp
  12. *
  13. * Modification History:
  14. *
  15. * Date Name Description
  16. * 12/07/98 bertrand Creation.
  17. ******************************************************************************
  18. */
  19. #include "unicode/utypes.h"
  20. #include "unicode/putil.h"
  21. #include "unicode/uchar.h"
  22. #include "unicode/ustring.h"
  23. #include "unicode/utf16.h"
  24. #include "cstring.h"
  25. #include "cwchar.h"
  26. #include "cmemory.h"
  27. #include "ustr_imp.h"
  28. /* ANSI string.h - style functions ------------------------------------------ */
  29. /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit char16_t */
  30. #define U_BMP_MAX 0xffff
  31. /* Forward binary string search functions ----------------------------------- */
  32. /*
  33. * Test if a substring match inside a string is at code point boundaries.
  34. * All pointers refer to the same buffer.
  35. * The limit pointer may be nullptr, all others must be real pointers.
  36. */
  37. static inline UBool
  38. isMatchAtCPBoundary(const char16_t *start, const char16_t *match, const char16_t *matchLimit, const char16_t *limit) {
  39. if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
  40. /* the leading edge of the match is in the middle of a surrogate pair */
  41. return false;
  42. }
  43. if(U16_IS_LEAD(*(matchLimit-1)) && matchLimit!=limit && U16_IS_TRAIL(*matchLimit)) {
  44. /* the trailing edge of the match is in the middle of a surrogate pair */
  45. return false;
  46. }
  47. return true;
  48. }
  49. U_CAPI char16_t * U_EXPORT2
  50. u_strFindFirst(const char16_t *s, int32_t length,
  51. const char16_t *sub, int32_t subLength) {
  52. const char16_t *start, *p, *q, *subLimit;
  53. char16_t c, cs, cq;
  54. if(sub==nullptr || subLength<-1) {
  55. return (char16_t *)s;
  56. }
  57. if(s==nullptr || length<-1) {
  58. return nullptr;
  59. }
  60. start=s;
  61. if(length<0 && subLength<0) {
  62. /* both strings are NUL-terminated */
  63. if((cs=*sub++)==0) {
  64. return (char16_t *)s;
  65. }
  66. if(*sub==0 && !U16_IS_SURROGATE(cs)) {
  67. /* the substring consists of a single, non-surrogate BMP code point */
  68. return u_strchr(s, cs);
  69. }
  70. while((c=*s++)!=0) {
  71. if(c==cs) {
  72. /* found first substring char16_t, compare rest */
  73. p=s;
  74. q=sub;
  75. for(;;) {
  76. if((cq=*q)==0) {
  77. if(isMatchAtCPBoundary(start, s-1, p, nullptr)) {
  78. return (char16_t *)(s-1); /* well-formed match */
  79. } else {
  80. break; /* no match because surrogate pair is split */
  81. }
  82. }
  83. if((c=*p)==0) {
  84. return nullptr; /* no match, and none possible after s */
  85. }
  86. if(c!=cq) {
  87. break; /* no match */
  88. }
  89. ++p;
  90. ++q;
  91. }
  92. }
  93. }
  94. /* not found */
  95. return nullptr;
  96. }
  97. if(subLength<0) {
  98. subLength=u_strlen(sub);
  99. }
  100. if(subLength==0) {
  101. return (char16_t *)s;
  102. }
  103. /* get sub[0] to search for it fast */
  104. cs=*sub++;
  105. --subLength;
  106. subLimit=sub+subLength;
  107. if(subLength==0 && !U16_IS_SURROGATE(cs)) {
  108. /* the substring consists of a single, non-surrogate BMP code point */
  109. return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
  110. }
  111. if(length<0) {
  112. /* s is NUL-terminated */
  113. while((c=*s++)!=0) {
  114. if(c==cs) {
  115. /* found first substring char16_t, compare rest */
  116. p=s;
  117. q=sub;
  118. for(;;) {
  119. if(q==subLimit) {
  120. if(isMatchAtCPBoundary(start, s-1, p, nullptr)) {
  121. return (char16_t *)(s-1); /* well-formed match */
  122. } else {
  123. break; /* no match because surrogate pair is split */
  124. }
  125. }
  126. if((c=*p)==0) {
  127. return nullptr; /* no match, and none possible after s */
  128. }
  129. if(c!=*q) {
  130. break; /* no match */
  131. }
  132. ++p;
  133. ++q;
  134. }
  135. }
  136. }
  137. } else {
  138. const char16_t *limit, *preLimit;
  139. /* subLength was decremented above */
  140. if(length<=subLength) {
  141. return nullptr; /* s is shorter than sub */
  142. }
  143. limit=s+length;
  144. /* the substring must start before preLimit */
  145. preLimit=limit-subLength;
  146. while(s!=preLimit) {
  147. c=*s++;
  148. if(c==cs) {
  149. /* found first substring char16_t, compare rest */
  150. p=s;
  151. q=sub;
  152. for(;;) {
  153. if(q==subLimit) {
  154. if(isMatchAtCPBoundary(start, s-1, p, limit)) {
  155. return (char16_t *)(s-1); /* well-formed match */
  156. } else {
  157. break; /* no match because surrogate pair is split */
  158. }
  159. }
  160. if(*p!=*q) {
  161. break; /* no match */
  162. }
  163. ++p;
  164. ++q;
  165. }
  166. }
  167. }
  168. }
  169. /* not found */
  170. return nullptr;
  171. }
  172. U_CAPI char16_t * U_EXPORT2
  173. u_strstr(const char16_t *s, const char16_t *substring) {
  174. return u_strFindFirst(s, -1, substring, -1);
  175. }
  176. U_CAPI char16_t * U_EXPORT2
  177. u_strchr(const char16_t *s, char16_t c) {
  178. if(U16_IS_SURROGATE(c)) {
  179. /* make sure to not find half of a surrogate pair */
  180. return u_strFindFirst(s, -1, &c, 1);
  181. } else {
  182. char16_t cs;
  183. /* trivial search for a BMP code point */
  184. for(;;) {
  185. if((cs=*s)==c) {
  186. return (char16_t *)s;
  187. }
  188. if(cs==0) {
  189. return nullptr;
  190. }
  191. ++s;
  192. }
  193. }
  194. }
  195. U_CAPI char16_t * U_EXPORT2
  196. u_strchr32(const char16_t *s, UChar32 c) {
  197. if((uint32_t)c<=U_BMP_MAX) {
  198. /* find BMP code point */
  199. return u_strchr(s, (char16_t)c);
  200. } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
  201. /* find supplementary code point as surrogate pair */
  202. char16_t cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
  203. while((cs=*s++)!=0) {
  204. if(cs==lead && *s==trail) {
  205. return (char16_t *)(s-1);
  206. }
  207. }
  208. return nullptr;
  209. } else {
  210. /* not a Unicode code point, not findable */
  211. return nullptr;
  212. }
  213. }
  214. U_CAPI char16_t * U_EXPORT2
  215. u_memchr(const char16_t *s, char16_t c, int32_t count) {
  216. if(count<=0) {
  217. return nullptr; /* no string */
  218. } else if(U16_IS_SURROGATE(c)) {
  219. /* make sure to not find half of a surrogate pair */
  220. return u_strFindFirst(s, count, &c, 1);
  221. } else {
  222. /* trivial search for a BMP code point */
  223. const char16_t *limit=s+count;
  224. do {
  225. if(*s==c) {
  226. return (char16_t *)s;
  227. }
  228. } while(++s!=limit);
  229. return nullptr;
  230. }
  231. }
  232. U_CAPI char16_t * U_EXPORT2
  233. u_memchr32(const char16_t *s, UChar32 c, int32_t count) {
  234. if((uint32_t)c<=U_BMP_MAX) {
  235. /* find BMP code point */
  236. return u_memchr(s, (char16_t)c, count);
  237. } else if(count<2) {
  238. /* too short for a surrogate pair */
  239. return nullptr;
  240. } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
  241. /* find supplementary code point as surrogate pair */
  242. const char16_t *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
  243. char16_t lead=U16_LEAD(c), trail=U16_TRAIL(c);
  244. do {
  245. if(*s==lead && *(s+1)==trail) {
  246. return (char16_t *)s;
  247. }
  248. } while(++s!=limit);
  249. return nullptr;
  250. } else {
  251. /* not a Unicode code point, not findable */
  252. return nullptr;
  253. }
  254. }
  255. /* Backward binary string search functions ---------------------------------- */
  256. U_CAPI char16_t * U_EXPORT2
  257. u_strFindLast(const char16_t *s, int32_t length,
  258. const char16_t *sub, int32_t subLength) {
  259. const char16_t *start, *limit, *p, *q, *subLimit;
  260. char16_t c, cs;
  261. if(sub==nullptr || subLength<-1) {
  262. return (char16_t *)s;
  263. }
  264. if(s==nullptr || length<-1) {
  265. return nullptr;
  266. }
  267. /*
  268. * This implementation is more lazy than the one for u_strFindFirst():
  269. * There is no special search code for NUL-terminated strings.
  270. * It does not seem to be worth it for searching substrings to
  271. * search forward and find all matches like in u_strrchr() and similar.
  272. * Therefore, we simply get both string lengths and search backward.
  273. *
  274. * markus 2002oct23
  275. */
  276. if(subLength<0) {
  277. subLength=u_strlen(sub);
  278. }
  279. if(subLength==0) {
  280. return (char16_t *)s;
  281. }
  282. /* get sub[subLength-1] to search for it fast */
  283. subLimit=sub+subLength;
  284. cs=*(--subLimit);
  285. --subLength;
  286. if(subLength==0 && !U16_IS_SURROGATE(cs)) {
  287. /* the substring consists of a single, non-surrogate BMP code point */
  288. return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
  289. }
  290. if(length<0) {
  291. length=u_strlen(s);
  292. }
  293. /* subLength was decremented above */
  294. if(length<=subLength) {
  295. return nullptr; /* s is shorter than sub */
  296. }
  297. start=s;
  298. limit=s+length;
  299. /* the substring must start no later than s+subLength */
  300. s+=subLength;
  301. while(s!=limit) {
  302. c=*(--limit);
  303. if(c==cs) {
  304. /* found last substring char16_t, compare rest */
  305. p=limit;
  306. q=subLimit;
  307. for(;;) {
  308. if(q==sub) {
  309. if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
  310. return (char16_t *)p; /* well-formed match */
  311. } else {
  312. break; /* no match because surrogate pair is split */
  313. }
  314. }
  315. if(*(--p)!=*(--q)) {
  316. break; /* no match */
  317. }
  318. }
  319. }
  320. }
  321. /* not found */
  322. return nullptr;
  323. }
  324. U_CAPI char16_t * U_EXPORT2
  325. u_strrstr(const char16_t *s, const char16_t *substring) {
  326. return u_strFindLast(s, -1, substring, -1);
  327. }
  328. U_CAPI char16_t * U_EXPORT2
  329. u_strrchr(const char16_t *s, char16_t c) {
  330. if(U16_IS_SURROGATE(c)) {
  331. /* make sure to not find half of a surrogate pair */
  332. return u_strFindLast(s, -1, &c, 1);
  333. } else {
  334. const char16_t *result=nullptr;
  335. char16_t cs;
  336. /* trivial search for a BMP code point */
  337. for(;;) {
  338. if((cs=*s)==c) {
  339. result=s;
  340. }
  341. if(cs==0) {
  342. return (char16_t *)result;
  343. }
  344. ++s;
  345. }
  346. }
  347. }
  348. U_CAPI char16_t * U_EXPORT2
  349. u_strrchr32(const char16_t *s, UChar32 c) {
  350. if((uint32_t)c<=U_BMP_MAX) {
  351. /* find BMP code point */
  352. return u_strrchr(s, (char16_t)c);
  353. } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
  354. /* find supplementary code point as surrogate pair */
  355. const char16_t *result=nullptr;
  356. char16_t cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
  357. while((cs=*s++)!=0) {
  358. if(cs==lead && *s==trail) {
  359. result=s-1;
  360. }
  361. }
  362. return (char16_t *)result;
  363. } else {
  364. /* not a Unicode code point, not findable */
  365. return nullptr;
  366. }
  367. }
  368. U_CAPI char16_t * U_EXPORT2
  369. u_memrchr(const char16_t *s, char16_t c, int32_t count) {
  370. if(count<=0) {
  371. return nullptr; /* no string */
  372. } else if(U16_IS_SURROGATE(c)) {
  373. /* make sure to not find half of a surrogate pair */
  374. return u_strFindLast(s, count, &c, 1);
  375. } else {
  376. /* trivial search for a BMP code point */
  377. const char16_t *limit=s+count;
  378. do {
  379. if(*(--limit)==c) {
  380. return (char16_t *)limit;
  381. }
  382. } while(s!=limit);
  383. return nullptr;
  384. }
  385. }
  386. U_CAPI char16_t * U_EXPORT2
  387. u_memrchr32(const char16_t *s, UChar32 c, int32_t count) {
  388. if((uint32_t)c<=U_BMP_MAX) {
  389. /* find BMP code point */
  390. return u_memrchr(s, (char16_t)c, count);
  391. } else if(count<2) {
  392. /* too short for a surrogate pair */
  393. return nullptr;
  394. } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
  395. /* find supplementary code point as surrogate pair */
  396. const char16_t *limit=s+count-1;
  397. char16_t lead=U16_LEAD(c), trail=U16_TRAIL(c);
  398. do {
  399. if(*limit==trail && *(limit-1)==lead) {
  400. return (char16_t *)(limit-1);
  401. }
  402. } while(s!=--limit);
  403. return nullptr;
  404. } else {
  405. /* not a Unicode code point, not findable */
  406. return nullptr;
  407. }
  408. }
  409. /* Tokenization functions --------------------------------------------------- */
  410. /*
  411. * Match each code point in a string against each code point in the matchSet.
  412. * Return the index of the first string code point that
  413. * is (polarity==true) or is not (false) contained in the matchSet.
  414. * Return -(string length)-1 if there is no such code point.
  415. */
  416. static int32_t
  417. _matchFromSet(const char16_t *string, const char16_t *matchSet, UBool polarity) {
  418. int32_t matchLen, matchBMPLen, strItr, matchItr;
  419. UChar32 stringCh, matchCh;
  420. char16_t c, c2;
  421. /* first part of matchSet contains only BMP code points */
  422. matchBMPLen = 0;
  423. while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
  424. ++matchBMPLen;
  425. }
  426. /* second part of matchSet contains BMP and supplementary code points */
  427. matchLen = matchBMPLen;
  428. while(matchSet[matchLen] != 0) {
  429. ++matchLen;
  430. }
  431. for(strItr = 0; (c = string[strItr]) != 0;) {
  432. ++strItr;
  433. if(U16_IS_SINGLE(c)) {
  434. if(polarity) {
  435. for(matchItr = 0; matchItr < matchLen; ++matchItr) {
  436. if(c == matchSet[matchItr]) {
  437. return strItr - 1; /* one matches */
  438. }
  439. }
  440. } else {
  441. for(matchItr = 0; matchItr < matchLen; ++matchItr) {
  442. if(c == matchSet[matchItr]) {
  443. goto endloop;
  444. }
  445. }
  446. return strItr - 1; /* none matches */
  447. }
  448. } else {
  449. /*
  450. * No need to check for string length before U16_IS_TRAIL
  451. * because c2 could at worst be the terminating NUL.
  452. */
  453. if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
  454. ++strItr;
  455. stringCh = U16_GET_SUPPLEMENTARY(c, c2);
  456. } else {
  457. stringCh = c; /* unpaired trail surrogate */
  458. }
  459. if(polarity) {
  460. for(matchItr = matchBMPLen; matchItr < matchLen;) {
  461. U16_NEXT(matchSet, matchItr, matchLen, matchCh);
  462. if(stringCh == matchCh) {
  463. return strItr - U16_LENGTH(stringCh); /* one matches */
  464. }
  465. }
  466. } else {
  467. for(matchItr = matchBMPLen; matchItr < matchLen;) {
  468. U16_NEXT(matchSet, matchItr, matchLen, matchCh);
  469. if(stringCh == matchCh) {
  470. goto endloop;
  471. }
  472. }
  473. return strItr - U16_LENGTH(stringCh); /* none matches */
  474. }
  475. }
  476. endloop:
  477. /* wish C had continue with labels like Java... */;
  478. }
  479. /* Didn't find it. */
  480. return -strItr-1;
  481. }
  482. /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
  483. U_CAPI char16_t * U_EXPORT2
  484. u_strpbrk(const char16_t *string, const char16_t *matchSet)
  485. {
  486. int32_t idx = _matchFromSet(string, matchSet, true);
  487. if(idx >= 0) {
  488. return (char16_t *)string + idx;
  489. } else {
  490. return nullptr;
  491. }
  492. }
  493. /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
  494. U_CAPI int32_t U_EXPORT2
  495. u_strcspn(const char16_t *string, const char16_t *matchSet)
  496. {
  497. int32_t idx = _matchFromSet(string, matchSet, true);
  498. if(idx >= 0) {
  499. return idx;
  500. } else {
  501. return -idx - 1; /* == u_strlen(string) */
  502. }
  503. }
  504. /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
  505. U_CAPI int32_t U_EXPORT2
  506. u_strspn(const char16_t *string, const char16_t *matchSet)
  507. {
  508. int32_t idx = _matchFromSet(string, matchSet, false);
  509. if(idx >= 0) {
  510. return idx;
  511. } else {
  512. return -idx - 1; /* == u_strlen(string) */
  513. }
  514. }
  515. /* ----- Text manipulation functions --- */
  516. U_CAPI char16_t* U_EXPORT2
  517. u_strtok_r(char16_t *src,
  518. const char16_t *delim,
  519. char16_t **saveState)
  520. {
  521. char16_t *tokSource;
  522. char16_t *nextToken;
  523. uint32_t nonDelimIdx;
  524. /* If saveState is nullptr, the user messed up. */
  525. if (src != nullptr) {
  526. tokSource = src;
  527. *saveState = src; /* Set to "src" in case there are no delimiters */
  528. }
  529. else if (*saveState) {
  530. tokSource = *saveState;
  531. }
  532. else {
  533. /* src == nullptr && *saveState == nullptr */
  534. /* This shouldn't happen. We already finished tokenizing. */
  535. return nullptr;
  536. }
  537. /* Skip initial delimiters */
  538. nonDelimIdx = u_strspn(tokSource, delim);
  539. tokSource = &tokSource[nonDelimIdx];
  540. if (*tokSource) {
  541. nextToken = u_strpbrk(tokSource, delim);
  542. if (nextToken != nullptr) {
  543. /* Create a token */
  544. *(nextToken++) = 0;
  545. *saveState = nextToken;
  546. return tokSource;
  547. }
  548. else if (*saveState) {
  549. /* Return the last token */
  550. *saveState = nullptr;
  551. return tokSource;
  552. }
  553. }
  554. else {
  555. /* No tokens were found. Only delimiters were left. */
  556. *saveState = nullptr;
  557. }
  558. return nullptr;
  559. }
  560. /* Miscellaneous functions -------------------------------------------------- */
  561. U_CAPI char16_t* U_EXPORT2
  562. u_strcat(char16_t *dst,
  563. const char16_t *src)
  564. {
  565. char16_t *anchor = dst; /* save a pointer to start of dst */
  566. while(*dst != 0) { /* To end of first string */
  567. ++dst;
  568. }
  569. while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
  570. }
  571. return anchor;
  572. }
  573. U_CAPI char16_t* U_EXPORT2
  574. u_strncat(char16_t *dst,
  575. const char16_t *src,
  576. int32_t n )
  577. {
  578. if(n > 0) {
  579. char16_t *anchor = dst; /* save a pointer to start of dst */
  580. while(*dst != 0) { /* To end of first string */
  581. ++dst;
  582. }
  583. while((*dst = *src) != 0) { /* copy string 2 over */
  584. ++dst;
  585. if(--n == 0) {
  586. *dst = 0;
  587. break;
  588. }
  589. ++src;
  590. }
  591. return anchor;
  592. } else {
  593. return dst;
  594. }
  595. }
  596. /* ----- Text property functions --- */
  597. U_CAPI int32_t U_EXPORT2
  598. u_strcmp(const char16_t *s1,
  599. const char16_t *s2)
  600. {
  601. char16_t c1, c2;
  602. for(;;) {
  603. c1=*s1++;
  604. c2=*s2++;
  605. if (c1 != c2 || c1 == 0) {
  606. break;
  607. }
  608. }
  609. return (int32_t)c1 - (int32_t)c2;
  610. }
  611. U_CFUNC int32_t U_EXPORT2
  612. uprv_strCompare(const char16_t *s1, int32_t length1,
  613. const char16_t *s2, int32_t length2,
  614. UBool strncmpStyle, UBool codePointOrder) {
  615. const char16_t *start1, *start2, *limit1, *limit2;
  616. char16_t c1, c2;
  617. /* setup for fix-up */
  618. start1=s1;
  619. start2=s2;
  620. /* compare identical prefixes - they do not need to be fixed up */
  621. if(length1<0 && length2<0) {
  622. /* strcmp style, both NUL-terminated */
  623. if(s1==s2) {
  624. return 0;
  625. }
  626. for(;;) {
  627. c1=*s1;
  628. c2=*s2;
  629. if(c1!=c2) {
  630. break;
  631. }
  632. if(c1==0) {
  633. return 0;
  634. }
  635. ++s1;
  636. ++s2;
  637. }
  638. /* setup for fix-up */
  639. limit1=limit2=nullptr;
  640. } else if(strncmpStyle) {
  641. /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
  642. if(s1==s2) {
  643. return 0;
  644. }
  645. limit1=start1+length1;
  646. for(;;) {
  647. /* both lengths are same, check only one limit */
  648. if(s1==limit1) {
  649. return 0;
  650. }
  651. c1=*s1;
  652. c2=*s2;
  653. if(c1!=c2) {
  654. break;
  655. }
  656. if(c1==0) {
  657. return 0;
  658. }
  659. ++s1;
  660. ++s2;
  661. }
  662. /* setup for fix-up */
  663. limit2=start2+length1; /* use length1 here, too, to enforce assumption */
  664. } else {
  665. /* memcmp/UnicodeString style, both length-specified */
  666. int32_t lengthResult;
  667. if(length1<0) {
  668. length1=u_strlen(s1);
  669. }
  670. if(length2<0) {
  671. length2=u_strlen(s2);
  672. }
  673. /* limit1=start1+min(length1, length2) */
  674. if(length1<length2) {
  675. lengthResult=-1;
  676. limit1=start1+length1;
  677. } else if(length1==length2) {
  678. lengthResult=0;
  679. limit1=start1+length1;
  680. } else /* length1>length2 */ {
  681. lengthResult=1;
  682. limit1=start1+length2;
  683. }
  684. if(s1==s2) {
  685. return lengthResult;
  686. }
  687. for(;;) {
  688. /* check pseudo-limit */
  689. if(s1==limit1) {
  690. return lengthResult;
  691. }
  692. c1=*s1;
  693. c2=*s2;
  694. if(c1!=c2) {
  695. break;
  696. }
  697. ++s1;
  698. ++s2;
  699. }
  700. /* setup for fix-up */
  701. limit1=start1+length1;
  702. limit2=start2+length2;
  703. }
  704. /* if both values are in or above the surrogate range, fix them up */
  705. if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
  706. /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
  707. if(
  708. (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
  709. (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
  710. ) {
  711. /* part of a surrogate pair, leave >=d800 */
  712. } else {
  713. /* BMP code point - may be surrogate code point - make <d800 */
  714. c1-=0x2800;
  715. }
  716. if(
  717. (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
  718. (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
  719. ) {
  720. /* part of a surrogate pair, leave >=d800 */
  721. } else {
  722. /* BMP code point - may be surrogate code point - make <d800 */
  723. c2-=0x2800;
  724. }
  725. }
  726. /* now c1 and c2 are in the requested (code unit or code point) order */
  727. return (int32_t)c1-(int32_t)c2;
  728. }
  729. /*
  730. * Compare two strings as presented by UCharIterators.
  731. * Use code unit or code point order.
  732. * When the function returns, it is undefined where the iterators
  733. * have stopped.
  734. */
  735. U_CAPI int32_t U_EXPORT2
  736. u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
  737. UChar32 c1, c2;
  738. /* argument checking */
  739. if(iter1==nullptr || iter2==nullptr) {
  740. return 0; /* bad arguments */
  741. }
  742. if(iter1==iter2) {
  743. return 0; /* identical iterators */
  744. }
  745. /* reset iterators to start? */
  746. iter1->move(iter1, 0, UITER_START);
  747. iter2->move(iter2, 0, UITER_START);
  748. /* compare identical prefixes - they do not need to be fixed up */
  749. for(;;) {
  750. c1=iter1->next(iter1);
  751. c2=iter2->next(iter2);
  752. if(c1!=c2) {
  753. break;
  754. }
  755. if(c1==-1) {
  756. return 0;
  757. }
  758. }
  759. /* if both values are in or above the surrogate range, fix them up */
  760. if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
  761. /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
  762. if(
  763. (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
  764. (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
  765. ) {
  766. /* part of a surrogate pair, leave >=d800 */
  767. } else {
  768. /* BMP code point - may be surrogate code point - make <d800 */
  769. c1-=0x2800;
  770. }
  771. if(
  772. (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
  773. (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
  774. ) {
  775. /* part of a surrogate pair, leave >=d800 */
  776. } else {
  777. /* BMP code point - may be surrogate code point - make <d800 */
  778. c2-=0x2800;
  779. }
  780. }
  781. /* now c1 and c2 are in the requested (code unit or code point) order */
  782. return (int32_t)c1-(int32_t)c2;
  783. }
  784. #if 0
  785. /*
  786. * u_strCompareIter() does not leave the iterators _on_ the different units.
  787. * This is possible but would cost a few extra indirect function calls to back
  788. * up if the last unit (c1 or c2 respectively) was >=0.
  789. *
  790. * Consistently leaving them _behind_ the different units is not an option
  791. * because the current "unit" is the end of the string if that is reached,
  792. * and in such a case the iterator does not move.
  793. * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
  794. * of their strings. Calling previous() on each does not move them to where
  795. * the comparison fails.
  796. *
  797. * So the simplest semantics is to not define where the iterators end up.
  798. *
  799. * The following fragment is part of what would need to be done for backing up.
  800. */
  801. void fragment {
  802. /* iff a surrogate is part of a surrogate pair, leave >=d800 */
  803. if(c1<=0xdbff) {
  804. if(!U16_IS_TRAIL(iter1->current(iter1))) {
  805. /* lead surrogate code point - make <d800 */
  806. c1-=0x2800;
  807. }
  808. } else if(c1<=0xdfff) {
  809. int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
  810. iter1->previous(iter1); /* ==c1 */
  811. if(!U16_IS_LEAD(iter1->previous(iter1))) {
  812. /* trail surrogate code point - make <d800 */
  813. c1-=0x2800;
  814. }
  815. /* go back to behind where the difference is */
  816. iter1->move(iter1, idx, UITER_ZERO);
  817. } else /* 0xe000<=c1<=0xffff */ {
  818. /* BMP code point - make <d800 */
  819. c1-=0x2800;
  820. }
  821. }
  822. #endif
  823. U_CAPI int32_t U_EXPORT2
  824. u_strCompare(const char16_t *s1, int32_t length1,
  825. const char16_t *s2, int32_t length2,
  826. UBool codePointOrder) {
  827. /* argument checking */
  828. if(s1==nullptr || length1<-1 || s2==nullptr || length2<-1) {
  829. return 0;
  830. }
  831. return uprv_strCompare(s1, length1, s2, length2, false, codePointOrder);
  832. }
  833. /* String compare in code point order - u_strcmp() compares in code unit order. */
  834. U_CAPI int32_t U_EXPORT2
  835. u_strcmpCodePointOrder(const char16_t *s1, const char16_t *s2) {
  836. return uprv_strCompare(s1, -1, s2, -1, false, true);
  837. }
  838. U_CAPI int32_t U_EXPORT2
  839. u_strncmp(const char16_t *s1,
  840. const char16_t *s2,
  841. int32_t n)
  842. {
  843. if(n > 0) {
  844. int32_t rc;
  845. for(;;) {
  846. rc = (int32_t)*s1 - (int32_t)*s2;
  847. if(rc != 0 || *s1 == 0 || --n == 0) {
  848. return rc;
  849. }
  850. ++s1;
  851. ++s2;
  852. }
  853. } else {
  854. return 0;
  855. }
  856. }
  857. U_CAPI int32_t U_EXPORT2
  858. u_strncmpCodePointOrder(const char16_t *s1, const char16_t *s2, int32_t n) {
  859. return uprv_strCompare(s1, n, s2, n, true, true);
  860. }
  861. U_CAPI char16_t* U_EXPORT2
  862. u_strcpy(char16_t *dst,
  863. const char16_t *src)
  864. {
  865. char16_t *anchor = dst; /* save a pointer to start of dst */
  866. while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
  867. }
  868. return anchor;
  869. }
  870. U_CAPI char16_t* U_EXPORT2
  871. u_strncpy(char16_t *dst,
  872. const char16_t *src,
  873. int32_t n)
  874. {
  875. char16_t *anchor = dst; /* save a pointer to start of dst */
  876. /* copy string 2 over */
  877. while(n > 0 && (*(dst++) = *(src++)) != 0) {
  878. --n;
  879. }
  880. return anchor;
  881. }
  882. U_CAPI int32_t U_EXPORT2
  883. u_strlen(const char16_t *s)
  884. {
  885. #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
  886. return (int32_t)uprv_wcslen((const wchar_t *)s);
  887. #else
  888. const char16_t *t = s;
  889. while(*t != 0) {
  890. ++t;
  891. }
  892. return t - s;
  893. #endif
  894. }
  895. U_CAPI int32_t U_EXPORT2
  896. u_countChar32(const char16_t *s, int32_t length) {
  897. int32_t count;
  898. if(s==nullptr || length<-1) {
  899. return 0;
  900. }
  901. count=0;
  902. if(length>=0) {
  903. while(length>0) {
  904. ++count;
  905. if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
  906. s+=2;
  907. length-=2;
  908. } else {
  909. ++s;
  910. --length;
  911. }
  912. }
  913. } else /* length==-1 */ {
  914. char16_t c;
  915. for(;;) {
  916. if((c=*s++)==0) {
  917. break;
  918. }
  919. ++count;
  920. /*
  921. * sufficient to look ahead one because of UTF-16;
  922. * safe to look ahead one because at worst that would be the terminating NUL
  923. */
  924. if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
  925. ++s;
  926. }
  927. }
  928. }
  929. return count;
  930. }
  931. U_CAPI UBool U_EXPORT2
  932. u_strHasMoreChar32Than(const char16_t *s, int32_t length, int32_t number) {
  933. if(number<0) {
  934. return true;
  935. }
  936. if(s==nullptr || length<-1) {
  937. return false;
  938. }
  939. if(length==-1) {
  940. /* s is NUL-terminated */
  941. char16_t c;
  942. /* count code points until they exceed */
  943. for(;;) {
  944. if((c=*s++)==0) {
  945. return false;
  946. }
  947. if(number==0) {
  948. return true;
  949. }
  950. if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
  951. ++s;
  952. }
  953. --number;
  954. }
  955. } else {
  956. /* length>=0 known */
  957. const char16_t *limit;
  958. int32_t maxSupplementary;
  959. /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
  960. if(((length+1)/2)>number) {
  961. return true;
  962. }
  963. /* check if s does not even contain enough UChars */
  964. maxSupplementary=length-number;
  965. if(maxSupplementary<=0) {
  966. return false;
  967. }
  968. /* there are maxSupplementary=length-number more UChars than asked-for code points */
  969. /*
  970. * count code points until they exceed and also check that there are
  971. * no more than maxSupplementary supplementary code points (char16_t pairs)
  972. */
  973. limit=s+length;
  974. for(;;) {
  975. if(s==limit) {
  976. return false;
  977. }
  978. if(number==0) {
  979. return true;
  980. }
  981. if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
  982. ++s;
  983. if(--maxSupplementary<=0) {
  984. /* too many pairs - too few code points */
  985. return false;
  986. }
  987. }
  988. --number;
  989. }
  990. }
  991. }
  992. U_CAPI char16_t * U_EXPORT2
  993. u_memcpy(char16_t *dest, const char16_t *src, int32_t count) {
  994. if(count > 0) {
  995. uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
  996. }
  997. return dest;
  998. }
  999. U_CAPI char16_t * U_EXPORT2
  1000. u_memmove(char16_t *dest, const char16_t *src, int32_t count) {
  1001. if(count > 0) {
  1002. uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
  1003. }
  1004. return dest;
  1005. }
  1006. U_CAPI char16_t * U_EXPORT2
  1007. u_memset(char16_t *dest, char16_t c, int32_t count) {
  1008. if(count > 0) {
  1009. char16_t *ptr = dest;
  1010. char16_t *limit = dest + count;
  1011. while (ptr < limit) {
  1012. *(ptr++) = c;
  1013. }
  1014. }
  1015. return dest;
  1016. }
  1017. U_CAPI int32_t U_EXPORT2
  1018. u_memcmp(const char16_t *buf1, const char16_t *buf2, int32_t count) {
  1019. if(count > 0) {
  1020. const char16_t *limit = buf1 + count;
  1021. int32_t result;
  1022. while (buf1 < limit) {
  1023. result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
  1024. if (result != 0) {
  1025. return result;
  1026. }
  1027. buf1++;
  1028. buf2++;
  1029. }
  1030. }
  1031. return 0;
  1032. }
  1033. U_CAPI int32_t U_EXPORT2
  1034. u_memcmpCodePointOrder(const char16_t *s1, const char16_t *s2, int32_t count) {
  1035. return uprv_strCompare(s1, count, s2, count, false, true);
  1036. }
  1037. /* u_unescape & support fns ------------------------------------------------- */
  1038. /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
  1039. static const char16_t UNESCAPE_MAP[] = {
  1040. /*" 0x22, 0x22 */
  1041. /*' 0x27, 0x27 */
  1042. /*? 0x3F, 0x3F */
  1043. /*\ 0x5C, 0x5C */
  1044. /*a*/ 0x61, 0x07,
  1045. /*b*/ 0x62, 0x08,
  1046. /*e*/ 0x65, 0x1b,
  1047. /*f*/ 0x66, 0x0c,
  1048. /*n*/ 0x6E, 0x0a,
  1049. /*r*/ 0x72, 0x0d,
  1050. /*t*/ 0x74, 0x09,
  1051. /*v*/ 0x76, 0x0b
  1052. };
  1053. enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
  1054. /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
  1055. static int32_t _digit8(char16_t c) {
  1056. if (c >= u'0' && c <= u'7') {
  1057. return c - u'0';
  1058. }
  1059. return -1;
  1060. }
  1061. /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
  1062. static int32_t _digit16(char16_t c) {
  1063. if (c >= u'0' && c <= u'9') {
  1064. return c - u'0';
  1065. }
  1066. if (c >= u'A' && c <= u'F') {
  1067. return c - (u'A' - 10);
  1068. }
  1069. if (c >= u'a' && c <= u'f') {
  1070. return c - (u'a' - 10);
  1071. }
  1072. return -1;
  1073. }
  1074. /* Parse a single escape sequence. Although this method deals in
  1075. * UChars, it does not use C++ or UnicodeString. This allows it to
  1076. * be used from C contexts. */
  1077. U_CAPI UChar32 U_EXPORT2
  1078. u_unescapeAt(UNESCAPE_CHAR_AT charAt,
  1079. int32_t *offset,
  1080. int32_t length,
  1081. void *context) {
  1082. int32_t start = *offset;
  1083. UChar32 c;
  1084. UChar32 result = 0;
  1085. int8_t n = 0;
  1086. int8_t minDig = 0;
  1087. int8_t maxDig = 0;
  1088. int8_t bitsPerDigit = 4;
  1089. int32_t dig;
  1090. UBool braces = false;
  1091. /* Check that offset is in range */
  1092. if (*offset < 0 || *offset >= length) {
  1093. goto err;
  1094. }
  1095. /* Fetch first char16_t after '\\' */
  1096. c = charAt((*offset)++, context);
  1097. /* Convert hexadecimal and octal escapes */
  1098. switch (c) {
  1099. case u'u':
  1100. minDig = maxDig = 4;
  1101. break;
  1102. case u'U':
  1103. minDig = maxDig = 8;
  1104. break;
  1105. case u'x':
  1106. minDig = 1;
  1107. if (*offset < length && charAt(*offset, context) == u'{') {
  1108. ++(*offset);
  1109. braces = true;
  1110. maxDig = 8;
  1111. } else {
  1112. maxDig = 2;
  1113. }
  1114. break;
  1115. default:
  1116. dig = _digit8(c);
  1117. if (dig >= 0) {
  1118. minDig = 1;
  1119. maxDig = 3;
  1120. n = 1; /* Already have first octal digit */
  1121. bitsPerDigit = 3;
  1122. result = dig;
  1123. }
  1124. break;
  1125. }
  1126. if (minDig != 0) {
  1127. while (*offset < length && n < maxDig) {
  1128. c = charAt(*offset, context);
  1129. dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
  1130. if (dig < 0) {
  1131. break;
  1132. }
  1133. result = (result << bitsPerDigit) | dig;
  1134. ++(*offset);
  1135. ++n;
  1136. }
  1137. if (n < minDig) {
  1138. goto err;
  1139. }
  1140. if (braces) {
  1141. if (c != u'}') {
  1142. goto err;
  1143. }
  1144. ++(*offset);
  1145. }
  1146. if (result < 0 || result >= 0x110000) {
  1147. goto err;
  1148. }
  1149. /* If an escape sequence specifies a lead surrogate, see if
  1150. * there is a trail surrogate after it, either as an escape or
  1151. * as a literal. If so, join them up into a supplementary.
  1152. */
  1153. if (*offset < length && U16_IS_LEAD(result)) {
  1154. int32_t ahead = *offset + 1;
  1155. c = charAt(*offset, context);
  1156. if (c == u'\\' && ahead < length) {
  1157. // Calling ourselves recursively may cause a stack overflow if
  1158. // we have repeated escaped lead surrogates.
  1159. // Limit the length to 11 ("x{0000DFFF}") after ahead.
  1160. int32_t tailLimit = ahead + 11;
  1161. if (tailLimit > length) {
  1162. tailLimit = length;
  1163. }
  1164. c = u_unescapeAt(charAt, &ahead, tailLimit, context);
  1165. }
  1166. if (U16_IS_TRAIL(c)) {
  1167. *offset = ahead;
  1168. result = U16_GET_SUPPLEMENTARY(result, c);
  1169. }
  1170. }
  1171. return result;
  1172. }
  1173. /* Convert C-style escapes in table */
  1174. for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
  1175. if (c == UNESCAPE_MAP[i]) {
  1176. return UNESCAPE_MAP[i+1];
  1177. } else if (c < UNESCAPE_MAP[i]) {
  1178. break;
  1179. }
  1180. }
  1181. /* Map \cX to control-X: X & 0x1F */
  1182. if (c == u'c' && *offset < length) {
  1183. c = charAt((*offset)++, context);
  1184. if (U16_IS_LEAD(c) && *offset < length) {
  1185. char16_t c2 = charAt(*offset, context);
  1186. if (U16_IS_TRAIL(c2)) {
  1187. ++(*offset);
  1188. c = U16_GET_SUPPLEMENTARY(c, c2);
  1189. }
  1190. }
  1191. return 0x1F & c;
  1192. }
  1193. /* If no special forms are recognized, then consider
  1194. * the backslash to generically escape the next character.
  1195. * Deal with surrogate pairs. */
  1196. if (U16_IS_LEAD(c) && *offset < length) {
  1197. char16_t c2 = charAt(*offset, context);
  1198. if (U16_IS_TRAIL(c2)) {
  1199. ++(*offset);
  1200. return U16_GET_SUPPLEMENTARY(c, c2);
  1201. }
  1202. }
  1203. return c;
  1204. err:
  1205. /* Invalid escape sequence */
  1206. *offset = start; /* Reset to initial value */
  1207. return (UChar32)0xFFFFFFFF;
  1208. }
  1209. /* u_unescapeAt() callback to return a char16_t from a char* */
  1210. static char16_t U_CALLCONV
  1211. _charPtr_charAt(int32_t offset, void *context) {
  1212. char16_t c16;
  1213. /* It would be more efficient to access the invariant tables
  1214. * directly but there is no API for that. */
  1215. u_charsToUChars(static_cast<char*>(context) + offset, &c16, 1);
  1216. return c16;
  1217. }
  1218. /* Append an escape-free segment of the text; used by u_unescape() */
  1219. static void _appendUChars(char16_t *dest, int32_t destCapacity,
  1220. const char *src, int32_t srcLen) {
  1221. if (destCapacity < 0) {
  1222. destCapacity = 0;
  1223. }
  1224. if (srcLen > destCapacity) {
  1225. srcLen = destCapacity;
  1226. }
  1227. u_charsToUChars(src, dest, srcLen);
  1228. }
  1229. /* Do an invariant conversion of char* -> char16_t*, with escape parsing */
  1230. U_CAPI int32_t U_EXPORT2
  1231. u_unescape(const char *src, char16_t *dest, int32_t destCapacity) {
  1232. const char *segment = src;
  1233. int32_t i = 0;
  1234. char c;
  1235. while ((c=*src) != 0) {
  1236. /* '\\' intentionally written as compiler-specific
  1237. * character constant to correspond to compiler-specific
  1238. * char* constants. */
  1239. if (c == '\\') {
  1240. int32_t lenParsed = 0;
  1241. UChar32 c32;
  1242. if (src != segment) {
  1243. if (dest != nullptr) {
  1244. _appendUChars(dest + i, destCapacity - i,
  1245. segment, (int32_t)(src - segment));
  1246. }
  1247. i += (int32_t)(src - segment);
  1248. }
  1249. ++src; /* advance past '\\' */
  1250. c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), const_cast<char*>(src));
  1251. if (lenParsed == 0) {
  1252. goto err;
  1253. }
  1254. src += lenParsed; /* advance past escape seq. */
  1255. if (dest != nullptr && U16_LENGTH(c32) <= (destCapacity - i)) {
  1256. U16_APPEND_UNSAFE(dest, i, c32);
  1257. } else {
  1258. i += U16_LENGTH(c32);
  1259. }
  1260. segment = src;
  1261. } else {
  1262. ++src;
  1263. }
  1264. }
  1265. if (src != segment) {
  1266. if (dest != nullptr) {
  1267. _appendUChars(dest + i, destCapacity - i,
  1268. segment, (int32_t)(src - segment));
  1269. }
  1270. i += (int32_t)(src - segment);
  1271. }
  1272. if (dest != nullptr && i < destCapacity) {
  1273. dest[i] = 0;
  1274. }
  1275. return i;
  1276. err:
  1277. if (dest != nullptr && destCapacity > 0) {
  1278. *dest = 0;
  1279. }
  1280. return 0;
  1281. }
  1282. /* NUL-termination of strings ----------------------------------------------- */
  1283. /**
  1284. * NUL-terminate a string no matter what its type.
  1285. * Set warning and error codes accordingly.
  1286. */
  1287. #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) UPRV_BLOCK_MACRO_BEGIN { \
  1288. if(pErrorCode!=nullptr && U_SUCCESS(*pErrorCode)) { \
  1289. /* not a public function, so no complete argument checking */ \
  1290. \
  1291. if(length<0) { \
  1292. /* assume that the caller handles this */ \
  1293. } else if(length<destCapacity) { \
  1294. /* NUL-terminate the string, the NUL fits */ \
  1295. dest[length]=0; \
  1296. /* unset the not-terminated warning but leave all others */ \
  1297. if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
  1298. *pErrorCode=U_ZERO_ERROR; \
  1299. } \
  1300. } else if(length==destCapacity) { \
  1301. /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
  1302. *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
  1303. } else /* length>destCapacity */ { \
  1304. /* even the string itself did not fit - set an error code */ \
  1305. *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
  1306. } \
  1307. } \
  1308. } UPRV_BLOCK_MACRO_END
  1309. U_CAPI char16_t U_EXPORT2
  1310. u_asciiToUpper(char16_t c) {
  1311. if (u'a' <= c && c <= u'z') {
  1312. c = c + u'A' - u'a';
  1313. }
  1314. return c;
  1315. }
  1316. U_CAPI int32_t U_EXPORT2
  1317. u_terminateUChars(char16_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1318. __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1319. return length;
  1320. }
  1321. U_CAPI int32_t U_EXPORT2
  1322. u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1323. __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1324. return length;
  1325. }
  1326. U_CAPI int32_t U_EXPORT2
  1327. u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1328. __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1329. return length;
  1330. }
  1331. U_CAPI int32_t U_EXPORT2
  1332. u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1333. __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1334. return length;
  1335. }
  1336. // Compute the hash code for a string -------------------------------------- ***
  1337. // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
  1338. // on UHashtable code.
  1339. /*
  1340. Compute the hash by iterating sparsely over about 32 (up to 63)
  1341. characters spaced evenly through the string. For each character,
  1342. multiply the previous hash value by a prime number and add the new
  1343. character in, like a linear congruential random number generator,
  1344. producing a pseudorandom deterministic value well distributed over
  1345. the output range. [LIU]
  1346. */
  1347. #define STRING_HASH(TYPE, STR, STRLEN, DEREF) UPRV_BLOCK_MACRO_BEGIN { \
  1348. uint32_t hash = 0; \
  1349. const TYPE *p = (const TYPE*) STR; \
  1350. if (p != nullptr) { \
  1351. int32_t len = (int32_t)(STRLEN); \
  1352. int32_t inc = ((len - 32) / 32) + 1; \
  1353. const TYPE *limit = p + len; \
  1354. while (p<limit) { \
  1355. hash = (hash * 37) + DEREF; \
  1356. p += inc; \
  1357. } \
  1358. } \
  1359. return static_cast<int32_t>(hash); \
  1360. } UPRV_BLOCK_MACRO_END
  1361. /* Used by UnicodeString to compute its hashcode - Not public API. */
  1362. U_CAPI int32_t U_EXPORT2
  1363. ustr_hashUCharsN(const char16_t *str, int32_t length) {
  1364. STRING_HASH(char16_t, str, length, *p);
  1365. }
  1366. U_CAPI int32_t U_EXPORT2
  1367. ustr_hashCharsN(const char *str, int32_t length) {
  1368. STRING_HASH(uint8_t, str, length, *p);
  1369. }
  1370. U_CAPI int32_t U_EXPORT2
  1371. ustr_hashICharsN(const char *str, int32_t length) {
  1372. STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
  1373. }