slutf8.c 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841
  1. #include "slinclud.h"
  2. #include <string.h>
  3. #include "slang.h"
  4. #include "_slang.h"
  /*
   * Lookup table indexed by the first byte of a sequence; the value is the
   * total number of bytes that byte announces:
   *
   *    0x01 - 0x7F : 1   (single byte)
   *    0x80 - 0xBF : 0   (continuation byte, cannot start a sequence)
   *    0xC0 - 0xDF : 2
   *    0xE0 - 0xEF : 3
   *    0xF0 - 0xF7 : 4
   *    0xF8 - 0xFB : 5
   *    0xFC - 0xFD : 6
   *    0xFE, 0xFF  : 1
   *
   * Entry 0x00 is 0, i.e. it is in the same class as a continuation byte.
   * The table is never written to, so it is declared const.
   */
  static const unsigned char Len_Map[256] =
  {
     0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 31 */
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 63 */
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 95 */
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* - 127 */
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 159 */
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* - 191 */
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* - 223 */
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1   /* - 255 */
  };
  /*
   * The code positions U+D800 to U+DFFF (UTF-16 surrogates) as well as
   * U+FFFE and U+FFFF must not occur in normal UTF-8 or UCS-4 data.  UTF-8
   * decoders should treat them like malformed or overlong sequences for
   * safety reasons.
   *
   * IS_ILLEGAL_UNICODE(w) is non-zero when w is one of those values.  The
   * argument is fully parenthesised so that any expression may be passed,
   * but it is evaluated up to four times: do not pass an expression with
   * side effects.
   */
  #define IS_ILLEGAL_UNICODE(w) \
     ((((w) >= 0xD800) && ((w) <= 0xDFFF)) || ((w) == 0xFFFE) || ((w) == 0xFFFF))
  24. _INLINE_
  25. static int is_invalid_or_overlong_utf8 (SLuchar_Type *u, unsigned int len)
  26. {
  27. unsigned int i;
  28. unsigned char ch, ch1;
  29. /* Check for invalid sequences */
  30. for (i = 1; i < len; i++)
  31. {
  32. if ((u[i] & 0xC0) != 0x80)
  33. return 1;
  34. }
  35. /* Illegal (overlong) sequences */
  36. /* 1100000x (10xxxxxx) */
  37. /* 11100000 100xxxxx (10xxxxxx) */
  38. /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
  39. /* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
  40. /* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */
  41. ch = *u;
  42. if ((ch == 0xC0) || (ch == 0xC1))
  43. return 1;
  44. ch1 = u[1];
  45. if (((ch1 & ch) == 0x80)
  46. && ((ch == 0xE0)
  47. || (ch == 0xF0)
  48. || (ch == 0xF8)
  49. || (ch == 0xFC)))
  50. return 1;
  51. if (len == 3)
  52. {
  53. /* D800 is encoded as 0xED 0xA0 0x80 and DFFF as 0xED 0xBF 0xBF */
  54. if ((ch == 0xED)
  55. && ((ch1 >= 0xA0) && (ch1 <= 0xBF))
  56. && (u[2] >= 0x80) && (u[2] <= 0xBF))
  57. return 1;
  58. /* Now FFFE and FFFF */
  59. if ((ch == 0xEF)
  60. && (ch1 == 0xBF)
  61. && ((u[2] == 0xBE) || (u[2] == 0xBF)))
  62. return 1;
  63. }
  64. return 0;
  65. }
  66. /* This function assumes that the necessary checks have been made to ensure
  67. * a valid UTF-8 encoded character is present.
  68. */
  69. _INLINE_
  70. static SLwchar_Type fast_utf8_decode (SLuchar_Type *u, unsigned int len)
  71. {
  72. static unsigned char masks[7] =
  73. {
  74. 0, 0, 0x1F, 0xF, 0x7, 0x3, 0x1
  75. };
  76. SLuchar_Type *umax;
  77. SLwchar_Type w;
  78. w = (*u & masks[len]);
  79. umax = u + len;
  80. u++;
  81. while (u < umax)
  82. {
  83. w = (w << 6)| (u[0] & 0x3F);
  84. u++;
  85. }
  86. return w;
  87. }
  88. unsigned char *SLutf8_skip_char (unsigned char *s, unsigned char *smax)
  89. {
  90. unsigned int len;
  91. if (s >= smax)
  92. return s;
  93. len = Len_Map[*s];
  94. if (len <= 1)
  95. return s+1;
  96. if (s + len > smax)
  97. return s+1;
  98. if (is_invalid_or_overlong_utf8 (s, len))
  99. return s + 1;
  100. return s + len;
  101. }
  102. SLuchar_Type *SLutf8_skip_chars (SLuchar_Type *s, SLuchar_Type *smax,
  103. unsigned int num, unsigned int *dnum,
  104. int ignore_combining)
  105. {
  106. unsigned int n;
  107. n = 0;
  108. while ((n < num) && (s < smax))
  109. {
  110. unsigned int len = Len_Map[*s];
  111. if (len <= 1)
  112. {
  113. n++;
  114. s++;
  115. continue;
  116. }
  117. if (s + len > smax)
  118. {
  119. s++;
  120. n++;
  121. continue;
  122. }
  123. if (is_invalid_or_overlong_utf8 (s, len))
  124. {
  125. s++;
  126. n++;
  127. continue;
  128. }
  129. if (ignore_combining)
  130. {
  131. SLwchar_Type w = fast_utf8_decode (s, len);
  132. if (0 != SLwchar_wcwidth (w))
  133. n++;
  134. s += len;
  135. continue;
  136. }
  137. n++;
  138. s += len;
  139. }
  140. if (ignore_combining)
  141. {
  142. while (s < smax)
  143. {
  144. SLwchar_Type w;
  145. unsigned int nconsumed;
  146. if (NULL == SLutf8_decode (s, smax, &w, &nconsumed))
  147. break;
  148. if (0 != SLwchar_wcwidth (w))
  149. break;
  150. s += nconsumed;
  151. }
  152. }
  153. if (dnum != NULL)
  154. *dnum = n;
  155. return s;
  156. }
  157. SLuchar_Type *SLutf8_bskip_chars (SLuchar_Type *smin, SLuchar_Type *s,
  158. unsigned int num, unsigned int *dnum,
  159. int ignore_combining)
  160. {
  161. unsigned int n;
  162. SLuchar_Type *smax = s;
  163. n = 0;
  164. while ((n < num) && (s > smin))
  165. {
  166. unsigned char ch;
  167. unsigned int dn;
  168. s--;
  169. ch = *s;
  170. if (ch < 0x80)
  171. {
  172. n++;
  173. smax = s;
  174. continue;
  175. }
  176. dn = 0;
  177. while ((s != smin)
  178. && (Len_Map[ch] == 0)
  179. && (dn < SLUTF8_MAX_MBLEN))
  180. {
  181. s--;
  182. ch = *s;
  183. dn++;
  184. }
  185. if (ch <= 0xBF)
  186. {
  187. /* Invalid sequence */
  188. n++;
  189. smax--;
  190. s = smax;
  191. continue;
  192. }
  193. if (ch > 0xBF)
  194. {
  195. SLwchar_Type w;
  196. SLuchar_Type *s1;
  197. if ((NULL == (s1 = SLutf8_decode (s, smax, &w, NULL)))
  198. || (s1 != smax))
  199. {
  200. /* This means we backed up over an invalid sequence */
  201. dn = (unsigned int) (smax - s);
  202. n++;
  203. smax--;
  204. s = smax;
  205. continue;
  206. }
  207. if ((ignore_combining == 0)
  208. || (0 != SLwchar_wcwidth (w)))
  209. n++;
  210. smax = s;
  211. }
  212. }
  213. if (dnum != NULL)
  214. *dnum = n;
  215. return s;
  216. }
  217. SLuchar_Type *SLutf8_bskip_char (SLuchar_Type *smin, SLuchar_Type *s)
  218. {
  219. if (s > smin)
  220. {
  221. unsigned int dn;
  222. s--;
  223. if (*s >= 0x80)
  224. s = SLutf8_bskip_chars (smin, s+1, 1, &dn, 0);
  225. }
  226. return s;
  227. }
  228. /* This function counts the number of wide characters in a UTF-8 encoded
  229. * string. Each byte in an invalid sequence is counted as a single character.
  230. * If the string contains illegal values, the bytes making up the character is
  231. * counted as 1 character.
  232. */
  233. unsigned int SLutf8_strlen (SLuchar_Type *s, int ignore_combining)
  234. {
  235. unsigned int count, len;
  236. if (s == NULL)
  237. return 0;
  238. len = strlen ((char *)s);
  239. (void) SLutf8_skip_chars (s, s + len, len, &count, ignore_combining);
  240. return count;
  241. }
  242. /*
  243. * This function returns NULL if the input does not correspond to a valid
  244. * UTF-8 sequence, otherwise, it returns the position of the next character
  245. * in the sequence.
  246. */
  247. unsigned char *SLutf8_decode (unsigned char *u, unsigned char *umax,
  248. SLwchar_Type *wp, unsigned int *nconsumedp)
  249. {
  250. unsigned int len;
  251. unsigned char ch;
  252. SLwchar_Type w;
  253. if (u >= umax)
  254. {
  255. *wp = 0;
  256. if (nconsumedp != NULL)
  257. *nconsumedp = 0;
  258. return NULL;
  259. }
  260. *wp = ch = *u;
  261. if (ch < 0x80)
  262. {
  263. if (nconsumedp != NULL) *nconsumedp = 1;
  264. return u+1;
  265. }
  266. len = Len_Map[ch];
  267. if (len < 2)
  268. {
  269. /* should not happen--- code here for completeness */
  270. if (nconsumedp != NULL) *nconsumedp = 1;
  271. return NULL;
  272. }
  273. if (u + len > umax)
  274. {
  275. if (nconsumedp != NULL) *nconsumedp = 1; /* (unsigned int) (umax - u); */
  276. return NULL;
  277. }
  278. if (is_invalid_or_overlong_utf8 (u, len))
  279. {
  280. if (nconsumedp != NULL)
  281. *nconsumedp = 1;
  282. return NULL;
  283. }
  284. if (nconsumedp != NULL)
  285. *nconsumedp = len;
  286. *wp = w = fast_utf8_decode (u, len);
  287. if (IS_ILLEGAL_UNICODE(w))
  288. return NULL;
  289. return u + len;
  290. }
  291. /* Encode the wide character returning a pointer to the end of the
  292. * utf8 of the encoded multi-byte character. This function will also encode
  293. * illegal unicode values. It returns NULL if buflen is too small.
  294. * Otherwise, it returns a pointer at the end of the last encoded byte.
  295. * It does not null terminate the encoded string.
  296. */
  297. SLuchar_Type *SLutf8_encode (SLwchar_Type w, SLuchar_Type *u, unsigned int ulen)
  298. {
  299. SLuchar_Type *umax = u + ulen;
  300. /* U-00000000 - U-0000007F: 0xxxxxxx */
  301. if (w <= 0x7F)
  302. {
  303. if (u >= umax)
  304. return NULL;
  305. *u++ = (unsigned char) w;
  306. return u;
  307. }
  308. /* U-00000080 - U-000007FF: 110xxxxx 10xxxxxx */
  309. if (w <= 0x7FF)
  310. {
  311. if ((u + 1) >= umax)
  312. return NULL;
  313. *u++ = (w >> 6) | 0xC0;
  314. *u++ = (w & 0x3F) | 0x80;
  315. return u;
  316. }
  317. /* First bad character starts at 0xD800 */
  318. /* Allow illegal values to be encoded */
  319. /*
  320. *if (IS_ILLEGAL_UNICODE(w))
  321. * return NULL;
  322. */
  323. /* U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
  324. if (w <= 0xFFFF)
  325. {
  326. if (u+2 >= umax)
  327. return NULL;
  328. *u++ = (w >> 12 ) | 0xE0;
  329. goto finish_2;
  330. }
  331. /* U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  332. if (w <= 0x1FFFFF)
  333. {
  334. if (u+3 >= umax)
  335. return NULL;
  336. *u++ = (w >> 18) | 0xF0;
  337. goto finish_3;
  338. }
  339. /* U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  340. if (w <= 0x3FFFFFF)
  341. {
  342. if (u+4 >= umax)
  343. return NULL;
  344. *u++ = (w >> 24) | 0xF8;
  345. goto finish_4;
  346. }
  347. /* U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  348. if (w <= 0x7FFFFFFF)
  349. {
  350. if (u+5 >= umax)
  351. return NULL;
  352. *u++ = (w >> 30) | 0xFC;
  353. goto finish_5;
  354. }
  355. /* unreached?? */
  356. return NULL;
  357. finish_5: *u++ = ((w >> 24) & 0x3F)|0x80;
  358. finish_4: *u++ = ((w >> 18) & 0x3F)|0x80;
  359. finish_3: *u++ = ((w >> 12) & 0x3F)|0x80;
  360. finish_2: *u++ = ((w >> 6) & 0x3F)|0x80;
  361. *u++ = (w & 0x3F)|0x80;
  362. return u;
  363. }
  364. /* Like SLutf8_encode, but null terminates the result.
  365. * At least SLUTF8_MAX_MBLEN+1 bytes assumed.
  366. */
  367. SLuchar_Type *SLutf8_encode_null_terminate (SLwchar_Type w, SLuchar_Type *u)
  368. {
  369. SLuchar_Type *p;
  370. p = SLutf8_encode (w, u, SLUTF8_MAX_MBLEN);
  371. if (p != NULL)
  372. *p = 0;
  373. return p;
  374. }
  375. #if 0
  376. int SLutf8_decode_bytes (SLuchar_Type *u, SLuchar_Type *umax,
  377. unsigned char *b, unsigned int *np)
  378. {
  379. unsigned char *bmax;
  380. bmax = b;
  381. while (u < umax)
  382. {
  383. SLwchar_Type w;
  384. if (0 == (*u & 0x80))
  385. {
  386. *bmax++ = *u++;
  387. continue;
  388. }
  389. if (NULL == (u = SLutf8_decode (u, umax, &w, NULL)))
  390. return -1; /* FIXME: HANDLE ERROR */
  391. if (w > 0xFF)
  392. {
  393. #if 0
  394. sprintf (bmax, "<U+%04X>", w);
  395. bmax += strlen (bmax);
  396. continue;
  397. #endif
  398. /* FIXME: HANDLE ERROR */
  399. w = w & 0xFF;
  400. }
  401. *bmax++ = w;
  402. }
  403. *np = bmax - b;
  404. *bmax = 0;
  405. return 0;
  406. }
  407. /* UTF-8 Encode the bytes between b and bmax storing the results in the
  408. * buffer defined by u and umax, returning the position following the
  409. * last encoded character. Upon return, *np is set to the number of bytes
  410. * sucessfully encoded.
  411. */
  412. SLuchar_Type *SLutf8_encode_bytes (unsigned char *b, unsigned char *bmax,
  413. SLuchar_Type *u, unsigned int ulen,
  414. unsigned int *np)
  415. {
  416. unsigned char *bstart = b;
  417. SLuchar_Type *umax = u + ulen;
  418. while (b < bmax)
  419. {
  420. SLuchar_Type *u1;
  421. if (0 == (*b & 0x80))
  422. {
  423. if (u >= umax)
  424. break;
  425. *u++ = *b++;
  426. continue;
  427. }
  428. if (NULL == (u1 = SLutf8_encode (*b, u, umax - u)))
  429. break;
  430. u = u1;
  431. b++;
  432. }
  433. *np = b - bstart;
  434. if (u < umax)
  435. *u = 0;
  436. return u;
  437. }
  438. #endif
  439. static SLuchar_Type *xform_utf8 (SLuchar_Type *u, SLuchar_Type *umax,
  440. SLwchar_Type (*fun)(SLwchar_Type))
  441. {
  442. SLuchar_Type *buf, *p;
  443. unsigned int malloced_len, len;
  444. if (umax < u)
  445. return NULL;
  446. len = 0;
  447. p = buf = NULL;
  448. malloced_len = 0;
  449. while (1)
  450. {
  451. SLwchar_Type w;
  452. SLuchar_Type *u1;
  453. unsigned int nconsumed;
  454. if (malloced_len <= len + SLUTF8_MAX_MBLEN)
  455. {
  456. SLuchar_Type *newbuf;
  457. malloced_len += 1 + (umax - u) + SLUTF8_MAX_MBLEN;
  458. newbuf = (SLuchar_Type *)SLrealloc ((char *)buf, malloced_len);
  459. if (newbuf == NULL)
  460. {
  461. SLfree ((char *)buf);
  462. return NULL;
  463. }
  464. buf = newbuf;
  465. p = buf + len;
  466. }
  467. if (u >= umax)
  468. {
  469. *p = 0;
  470. p = (SLuchar_Type *) SLang_create_nslstring ((char *)buf, len);
  471. SLfree ((char *)buf);
  472. return p;
  473. }
  474. if (NULL == (u1 = SLutf8_decode (u, umax, &w, &nconsumed)))
  475. {
  476. /* Invalid sequence */
  477. memcpy ((char *) p, u, nconsumed);
  478. p += nconsumed;
  479. len += nconsumed;
  480. u1 = u + nconsumed;
  481. }
  482. else
  483. {
  484. SLuchar_Type *p1;
  485. p1 = SLutf8_encode ((*fun)(w), p, malloced_len);
  486. if (p1 == NULL)
  487. {
  488. SLfree ((char *)buf);
  489. SLang_verror (SL_INTERNAL_ERROR, "SLutf8_encode returned NULL");
  490. return NULL;
  491. }
  492. len += p1 - p;
  493. p = p1;
  494. }
  495. u = u1;
  496. }
  497. }
  498. /* Returned an uppercased version of an UTF-8 encoded string. Illegal or
  499. * invalid sequences will be returned as-is. This function returns
  500. * an SLstring.
  501. */
  502. SLuchar_Type *SLutf8_strup (SLuchar_Type *u, SLuchar_Type *umax)
  503. {
  504. return xform_utf8 (u, umax, SLwchar_toupper);
  505. }
  506. /* Returned an lowercased version of an UTF-8 encoded string. Illegal or
  507. * invalid sequences will be returned as-is. This function returns
  508. * an SLstring.
  509. */
  510. SLuchar_Type *SLutf8_strlo (SLuchar_Type *u, SLuchar_Type *umax)
  511. {
  512. return xform_utf8 (u, umax, SLwchar_tolower);
  513. }
  514. int SLutf8_compare (SLuchar_Type *a, SLuchar_Type *amax,
  515. SLuchar_Type *b, SLuchar_Type *bmax,
  516. unsigned int nchars,
  517. int cs)
  518. {
  519. while (nchars && (a < amax) && (b < bmax))
  520. {
  521. SLwchar_Type cha, chb;
  522. unsigned int na, nb;
  523. int aok, bok;
  524. if (*a < 0x80)
  525. {
  526. cha = (SLwchar_Type) *a++;
  527. aok = 1;
  528. }
  529. else
  530. {
  531. aok = (NULL != SLutf8_decode (a, amax, &cha, &na));
  532. a += na;
  533. }
  534. if (*b < 0x80)
  535. {
  536. chb = (SLwchar_Type) *b++;
  537. bok = 1;
  538. }
  539. else
  540. {
  541. bok = (NULL != SLutf8_decode (b, bmax, &chb, &nb));
  542. b += nb;
  543. }
  544. nchars--;
  545. if (aok && bok)
  546. {
  547. if (cs == 0)
  548. {
  549. cha = SLwchar_toupper (cha);
  550. chb = SLwchar_toupper (chb);
  551. }
  552. }
  553. else if (aok)
  554. return 1;
  555. else if (bok)
  556. return -1;
  557. if (cha == chb)
  558. continue;
  559. if (cha > chb)
  560. return 1;
  561. return -1;
  562. }
  563. if (nchars == 0)
  564. return 0;
  565. if ((a >= amax) && (b >= bmax))
  566. return 0;
  567. if (b >= bmax)
  568. return 1;
  569. return -1;
  570. }
  571. /* Returns an SLstring */
  572. SLstr_Type *SLutf8_subst_wchar (SLuchar_Type *u, SLuchar_Type *umax,
  573. SLwchar_Type wch, unsigned int pos,
  574. int ignore_combining)
  575. {
  576. SLuchar_Type *a, *a1, *b;
  577. unsigned int dpos;
  578. SLuchar_Type buf[SLUTF8_MAX_MBLEN+1];
  579. SLstr_Type *c;
  580. unsigned int n1, n2, n3, len;
  581. a = SLutf8_skip_chars (u, umax, pos, &dpos, ignore_combining);
  582. if ((dpos != pos) || (a == umax))
  583. {
  584. SLang_verror (SL_INDEX_ERROR, "Specified character position is invalid for string");
  585. return NULL;
  586. }
  587. a1 = SLutf8_skip_chars (a, umax, 1, NULL, ignore_combining);
  588. b = SLutf8_encode (wch, buf, SLUTF8_MAX_MBLEN);
  589. if (b == NULL)
  590. {
  591. SLang_verror (SL_UNICODE_ERROR, "Unable to encode wchar 0x%lX", (unsigned long)wch);
  592. return NULL;
  593. }
  594. n1 = (a-u);
  595. n2 = (b-buf);
  596. n3 = (umax-a1);
  597. len = n1 + n2 + n3;
  598. c = _pSLallocate_slstring (len);
  599. if (c == NULL)
  600. return NULL;
  601. memcpy (c, (char *)u, n1);
  602. memcpy (c+n1, (char *)buf, n2);
  603. memcpy (c+n1+n2, (char *)a1, n3);
  604. c[len] = 0;
  605. /* No need to worry about this failing-- it frees its argument */
  606. return _pSLcreate_via_alloced_slstring (c, len);
  607. }
  608. /* utf8 buffer assumed to be at least SLUTF8_MAX_MBLEN+1 bytes. Result will be
  609. * null terminated. Returns position of NEXT character.
  610. * Analogous to: *p++
  611. */
  612. SLuchar_Type *SLutf8_extract_utf8_char (SLuchar_Type *u,
  613. SLuchar_Type *umax,
  614. SLuchar_Type *utf8)
  615. {
  616. SLuchar_Type *u1;
  617. u1 = SLutf8_skip_char (u, umax);
  618. memcpy ((char *)utf8, u, u1-u);
  619. utf8[u1-u] = 0;
  620. return u1;
  621. }
  622. /* These routines depend upon the value of the _pSLinterp_UTF8_Mode variable.
  623. * They also generate slang errors upon error.
  624. */
  625. SLuchar_Type *_pSLinterp_decode_wchar (SLuchar_Type *u,
  626. SLuchar_Type *umax,
  627. SLwchar_Type *chp)
  628. {
  629. if (_pSLinterp_UTF8_Mode == 0)
  630. {
  631. if (u < umax)
  632. *chp = (SLwchar_Type) *u++;
  633. return u;
  634. }
  635. if (NULL == (u = SLutf8_decode (u, umax, chp, NULL)))
  636. SLang_verror (SL_INVALID_UTF8, "Invalid UTF-8 encoded string");
  637. return u;
  638. }
  639. /* At least SLUTF8_MAX_MBLEN+1 bytes assumed-- null terminates result.
  640. * Upon success, it returns a pointer to the _end_ of the encoded character
  641. */
  642. SLuchar_Type *_pSLinterp_encode_wchar (SLwchar_Type wch, SLuchar_Type *u, unsigned int *encoded_len)
  643. {
  644. SLuchar_Type *u1;
  645. if (_pSLinterp_UTF8_Mode == 0)
  646. {
  647. *encoded_len = 1;
  648. *u++ = (SLuchar_Type) wch;
  649. *u++ = 0;
  650. return u;
  651. }
  652. if (NULL == (u1 = SLutf8_encode_null_terminate (wch, u)))
  653. {
  654. SLang_verror (SL_UNICODE_ERROR, "Unable to encode character 0x%lX", (unsigned long)wch);
  655. return NULL;
  656. }
  657. *encoded_len = (unsigned int) (u1 - u);
  658. return u1;
  659. }
  660. #ifdef REGRESSION
  661. int main (int argc, char **argv)
  662. {
  663. unsigned char *s, *smax;
  664. char **t;
  665. char *ok_tests [] =
  666. {
  667. "퟿",
  668. "",
  669. "�",
  670. "�",
  671. "�",
  672. NULL
  673. };
  674. char *long_tests [] =
  675. {
  676. "À¯",
  677. "à€¯",
  678. "ð€€¯",
  679. "ø€€€¯",
  680. "ü€€€€¯",
  681. NULL
  682. };
  683. t = long_tests;
  684. while ((s = (unsigned char *) *t++) != NULL)
  685. {
  686. smax = s + strlen ((char *)s);
  687. while (s < smax)
  688. {
  689. SLwchar_Type w;
  690. if (NULL == (s = SLutf8_to_wc (s, smax, &w)))
  691. {
  692. fprintf (stderr, "SLutf8_to_wc failed\n");
  693. break;
  694. }
  695. if (w == 0)
  696. break;
  697. fprintf (stdout, " 0x%X", w);
  698. }
  699. fprintf (stdout, "\n");
  700. }
  701. return 0;
  702. }
  703. #endif