strutilutf8.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362
  1. /* UTF-8 strings utilities
  2. Copyright (C) 2007 Free Software Foundation, Inc.
  3. Written 2007 by:
  4. Rostislav Benes
  5. The file_date routine is mostly from GNU's fileutils package,
  6. written by Richard Stallman and David MacKenzie.
  7. This program is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2 of the License, or
  10. (at your option) any later version.
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with this program; if not, write to the Free Software
  17. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <stdio.h>
  22. #include <errno.h>
  23. #include <glib.h>
  24. #include <langinfo.h>
  25. #include <string.h>
  26. #include "lib/global.h"
  27. #include "lib/strutil.h"
  /* UTF-8 string handling implemented on top of glib's Unicode functions */
  /* Bytes EF BF BD (the UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER),
   * stored as a NUL-terminated string. */
  static const char replch[] = { '\xEF', '\xBF', '\xBD', '\0' };
  30. static int
  31. str_unichar_iscombiningmark (gunichar uni)
  32. {
  33. int type = g_unichar_type (uni);
  34. return (type == G_UNICODE_COMBINING_MARK)
  35. || (type == G_UNICODE_ENCLOSING_MARK)
  36. || (type == G_UNICODE_NON_SPACING_MARK);
  37. }
  38. static void
  39. str_utf8_insert_replace_char (GString * buffer)
  40. {
  41. g_string_append (buffer, replch);
  42. }
  43. static int
  44. str_utf8_is_valid_string (const char *text)
  45. {
  46. return g_utf8_validate (text, -1, NULL);
  47. }
  48. static int
  49. str_utf8_is_valid_char (const char *ch, size_t size)
  50. {
  51. switch (g_utf8_get_char_validated (ch, size))
  52. {
  53. case (gunichar) (-2):
  54. return -2;
  55. case (gunichar) (-1):
  56. return -1;
  57. default:
  58. return 1;
  59. }
  60. }
  /* Advance *TEXT to the next character using g_utf8_next_char().
   * No validation is done here. */
  static void
  str_utf8_cnext_char (const char **text)
  {
      const char *next = g_utf8_next_char (*text);

      *text = next;
  }
  /* Move *TEXT back to the previous character using g_utf8_prev_char().
   * No validation or start-of-buffer check is done here. */
  static void
  str_utf8_cprev_char (const char **text)
  {
      const char *prev = g_utf8_prev_char (*text);

      *text = prev;
  }
  /* Advance *TEXT by one character; when the bytes at *TEXT do not form a
   * valid UTF-8 character, advance by a single byte instead. */
  static void
  str_utf8_cnext_char_safe (const char **text)
  {
      const char *pos = *text;

      if (str_utf8_is_valid_char (pos, -1) != 1)
          pos++;
      else
          pos = g_utf8_next_char (pos);

      *text = pos;
  }
  /* Move *TEXT back by one character.  The position found by
   * g_utf8_prev_char() is accepted only if stepping forward from it with
   * str_utf8_cnext_char_safe() lands exactly on the old *TEXT; otherwise
   * *TEXT is moved back by a single byte. */
  static void
  str_utf8_cprev_char_safe (const char **text)
  {
      const char *candidate = g_utf8_prev_char (*text);
      const char *probe = candidate;

      str_utf8_cnext_char_safe (&probe);
      *text = (probe == *text) ? candidate : *text - 1;
  }
  90. static void
  91. str_utf8_fix_string (char *text)
  92. {
  93. gunichar uni;
  94. while (text[0] != '\0')
  95. {
  96. uni = g_utf8_get_char_validated (text, -1);
  97. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  98. {
  99. text = g_utf8_next_char (text);
  100. }
  101. else
  102. {
  103. text[0] = '?';
  104. text++;
  105. }
  106. }
  107. }
  /* Return g_unichar_isspace() for the first character of TEXT.
   * The value of g_utf8_get_char_validated() is passed on unchecked, so an
   * invalid sequence is classified by whatever glib makes of the error value. */
  static int
  str_utf8_isspace (const char *text)
  {
      return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_ispunct() for the first character of TEXT.
   * The value of g_utf8_get_char_validated() is passed on unchecked. */
  static int
  str_utf8_ispunct (const char *text)
  {
      return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_isalnum() for the first character of TEXT.
   * The value of g_utf8_get_char_validated() is passed on unchecked. */
  static int
  str_utf8_isalnum (const char *text)
  {
      return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_isdigit() for the first character of TEXT.
   * The value of g_utf8_get_char_validated() is passed on unchecked. */
  static int
  str_utf8_isdigit (const char *text)
  {
      return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_isprint() for the character starting at CH.
   * The value of g_utf8_get_char_validated() is passed on unchecked. */
  static int
  str_utf8_isprint (const char *ch)
  {
      return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
  }
  /* Return str_unichar_iscombiningmark() for the character starting at CH.
   * The value of g_utf8_get_char_validated() is passed on unchecked. */
  static int
  str_utf8_iscombiningmark (const char *ch)
  {
      return str_unichar_iscombiningmark (g_utf8_get_char_validated (ch, -1));
  }
  /* Advance *TEXT past one character and any combining marks that follow it,
   * stopping at the end of the string.
   * Returns the number of single-character steps taken (0 when *TEXT already
   * points at the terminating NUL). */
  static int
  str_utf8_cnext_noncomb_char (const char **text)
  {
      int steps = 0;

      if ((*text)[0] == '\0')
          return 0;

      do
      {
          str_utf8_cnext_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

      return steps;
  }
  /* Move *TEXT backwards until it points at a character that is not a
   * combining mark, never going before BEGIN.
   * Returns the number of single-character steps taken (0 when *TEXT already
   * equals BEGIN). */
  static int
  str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  {
      int steps = 0;

      if (*text == begin)
          return 0;

      do
      {
          str_utf8_cprev_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && *text != begin);

      return steps;
  }
  170. static int
  171. str_utf8_toupper (const char *text, char **out, size_t * remain)
  172. {
  173. gunichar uni;
  174. size_t left;
  175. uni = g_utf8_get_char_validated (text, -1);
  176. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  177. return 0;
  178. uni = g_unichar_toupper (uni);
  179. left = g_unichar_to_utf8 (uni, NULL);
  180. if (left >= *remain)
  181. return 0;
  182. left = g_unichar_to_utf8 (uni, *out);
  183. (*out) += left;
  184. (*remain) -= left;
  185. return 1;
  186. }
  187. static int
  188. str_utf8_tolower (const char *text, char **out, size_t * remain)
  189. {
  190. gunichar uni;
  191. size_t left;
  192. uni = g_utf8_get_char_validated (text, -1);
  193. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  194. return 0;
  195. uni = g_unichar_tolower (uni);
  196. left = g_unichar_to_utf8 (uni, NULL);
  197. if (left >= *remain)
  198. return 0;
  199. left = g_unichar_to_utf8 (uni, *out);
  200. (*out) += left;
  201. (*remain) -= left;
  202. return 1;
  203. }
  /* Count the characters of TEXT.  Valid stretches are counted with
   * g_utf8_strlen(); each byte at which validation fails counts as one
   * character. */
  static int
  str_utf8_length (const char *text)
  {
      const char *start = text;
      const char *end;
      int result = 0;

      for (;;)
      {
          /* END is set to the first invalid byte, or to the end of the string */
          if (g_utf8_validate (start, -1, &end) || start[0] == '\0')
              break;

          if (end != start)
              result += g_utf8_strlen (start, end - start);
          result++;               /* the invalid byte itself */
          start = end + 1;
      }

      /* nothing invalid was found: count the whole string in one go */
      if (start == text)
          return g_utf8_strlen (text, -1);

      /* valid tail after the last invalid byte */
      if (start[0] != '\0' && start != end)
          result += g_utf8_strlen (start, end - start);

      return result;
  }
  /* Count the characters of TEXT, looking at no more than SIZE bytes.
   * Valid stretches are counted with g_utf8_strlen(); each byte at which
   * validation fails counts as one character while bytes remain.
   * When the loop is never entered, SIZE is handed unchanged to
   * g_utf8_strlen(). */
  static int
  str_utf8_length2 (const char *text, int size)
  {
      const char *start = text;
      const char *end;
      int result = 0;

      for (;;)
      {
          if (g_utf8_validate (start, -1, &end) || start[0] == '\0' || size <= 0)
              break;

          if (end != start)
          {
              result += g_utf8_strlen (start, min (end - start, size));
              size -= end - start;
          }
          /* the invalid byte counts only if it is still inside the limit */
          if (size > 0)
              result++;
          size--;
          start = end + 1;
      }

      if (start == text)
          return g_utf8_strlen (text, size);

      /* valid tail after the last invalid byte */
      if (start[0] != '\0' && start != end && size > 0)
          result += g_utf8_strlen (start, min (end - start, size));

      return result;
  }
  /* Count the steps str_utf8_cnext_noncomb_char() needs to reach the end of
   * TEXT, i.e. characters with their following combining marks counted once. */
  static int
  str_utf8_length_noncomb (const char *text)
  {
      const char *pos;
      int count = 0;

      for (pos = text; pos[0] != '\0'; count++)
          str_utf8_cnext_noncomb_char (&pos);

      return count;
  }
  276. /*
  277. static void
  278. str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
  279. {
  280. char *next = g_utf8_next_char (*string);
  281. (*left) -= next - (*string);
  282. (*string) = next;
  283. g_string_append_c (buffer, '?');
  284. }
  285. */
  286. static gchar *
  287. str_utf8_conv_gerror_message (GError *error, const char *def_msg)
  288. {
  289. if ((error != NULL) && (error->message != NULL))
  290. return g_strdup (error->message);
  291. return g_strdup (def_msg != NULL ? def_msg : "");
  292. }
  293. static estr_t
  294. str_utf8_vfs_convert_to (GIConv coder, const char *string,
  295. int size, GString * buffer)
  296. {
  297. estr_t result;
  298. if (coder == str_cnv_not_convert)
  299. {
  300. g_string_append_len (buffer, string, size);
  301. result = ESTR_SUCCESS;
  302. }
  303. else
  304. result = str_nconvert (coder, (char *) string, size, buffer);
  305. return result;
  306. }
  307. struct term_form
  308. {
  309. char text[BUF_MEDIUM * 6];
  310. size_t width;
  311. int compose;
  312. };
  313. /* utiliti function, that make string valid in utf8 and all characters printable
  314. * return width of string too*/
  315. static const struct term_form *
  316. str_utf8_make_make_term_form (const char *text, size_t length)
  317. {
  318. static struct term_form result;
  319. gunichar uni;
  320. size_t left;
  321. char *actual;
  322. result.text[0] = '\0';
  323. result.width = 0;
  324. result.compose = 0;
  325. actual = result.text;
  326. /* check if text start with combining character,
  327. * add space at begin in this case */
  328. if (length != 0 && text[0] != '\0')
  329. {
  330. uni = g_utf8_get_char_validated (text, -1);
  331. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  332. {
  333. if (str_unichar_iscombiningmark (uni))
  334. {
  335. actual[0] = ' ';
  336. actual++;
  337. result.width++;
  338. result.compose = 1;
  339. }
  340. }
  341. }
  342. while (length != 0 && text[0] != '\0') {
  343. uni = g_utf8_get_char_validated (text, -1);
  344. if ((uni != (gunichar)(-1)) && (uni != (gunichar)(-2))) {
  345. if (g_unichar_isprint(uni)) {
  346. left = g_unichar_to_utf8 (uni, actual);
  347. actual+= left;
  348. if (!str_unichar_iscombiningmark (uni)) {
  349. result.width++;
  350. if (g_unichar_iswide(uni)) result.width++;
  351. } else result.compose = 1;
  352. } else {
  353. actual[0] = '.';
  354. actual++;
  355. result.width++;
  356. }
  357. text = g_utf8_next_char (text);
  358. } else {
  359. text++;
  360. /*actual[0] = '?';*/
  361. memcpy (actual, replch, strlen (replch));
  362. actual+= strlen (replch);
  363. result.width++;
  364. }
  365. if (length != (size_t) (-1)) length--; }
  366. actual[0] = '\0';
  367. return &result;
  368. }
  369. static const char *
  370. str_utf8_term_form (const char *text)
  371. {
  372. static char result[BUF_MEDIUM * 6];
  373. const struct term_form *pre_form;
  374. char *composed;
  375. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  376. if (pre_form->compose)
  377. {
  378. composed =
  379. g_utf8_normalize (pre_form->text, -1,
  380. G_NORMALIZE_DEFAULT_COMPOSE);
  381. g_strlcpy (result, composed, sizeof (result));
  382. g_free (composed);
  383. }
  384. else
  385. {
  386. g_strlcpy (result, pre_form->text, sizeof (result));
  387. }
  388. return result;
  389. }
  /* Working state shared by the utf8_tool_* helpers. */
  struct utf8_tool
  {
      /* write position in the output buffer */
      char *actual;
      /* bytes still free in the output buffer */
      size_t remain;
      /* read position in the input text (field name kept as spelled) */
      const char *cheked;
      /* column counter maintained by the copy and skip helpers */
      int ident;
      /* set when a combining mark was copied */
      int compose;
  };
  398. /* utiliti function, that copy all characters from cheked to actual */
  399. static int
  400. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  401. {
  402. size_t left;
  403. gunichar uni;
  404. tool->compose = 0;
  405. while (tool->cheked[0] != '\0')
  406. {
  407. uni = g_utf8_get_char (tool->cheked);
  408. tool->compose |= str_unichar_iscombiningmark (uni);
  409. left = g_unichar_to_utf8 (uni, NULL);
  410. if (tool->remain <= left)
  411. return 0;
  412. left = g_unichar_to_utf8 (uni, tool->actual);
  413. tool->actual += left;
  414. tool->remain -= left;
  415. tool->cheked = g_utf8_next_char (tool->cheked);
  416. }
  417. return 1;
  418. }
  419. /* utiliti function, that copy characters from cheked to actual until ident is
  420. * smaller than to_ident */
  421. static int
  422. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  423. {
  424. size_t left;
  425. gunichar uni;
  426. int w;
  427. tool->compose = 0;
  428. while (tool->cheked[0] != '\0')
  429. {
  430. uni = g_utf8_get_char (tool->cheked);
  431. if (!str_unichar_iscombiningmark (uni))
  432. {
  433. w = 1;
  434. if (g_unichar_iswide (uni))
  435. w++;
  436. if (tool->ident + w > to_ident)
  437. return 1;
  438. }
  439. else
  440. {
  441. w = 0;
  442. tool->compose = 1;
  443. }
  444. left = g_unichar_to_utf8 (uni, NULL);
  445. if (tool->remain <= left)
  446. return 0;
  447. left = g_unichar_to_utf8 (uni, tool->actual);
  448. tool->actual += left;
  449. tool->remain -= left;
  450. tool->cheked = g_utf8_next_char (tool->cheked);
  451. tool->ident += w;
  452. }
  453. return 1;
  454. }
  455. /* utiliti function, add count spaces to actual */
  456. static int
  457. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  458. {
  459. if (count <= 0)
  460. return 1;
  461. if (tool->remain <= (gsize) count)
  462. return 0;
  463. memset (tool->actual, ' ', count);
  464. tool->actual += count;
  465. tool->remain -= count;
  466. return 1;
  467. }
  468. /* utiliti function, add one characters to actual */
  469. static int
  470. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  471. {
  472. if (tool->remain <= 1)
  473. return 0;
  474. tool->actual[0] = ch;
  475. tool->actual++;
  476. tool->remain--;
  477. return 1;
  478. }
  479. /* utiliti function, thah skip characters from cheked until ident is greater or
  480. * equal to to_ident */
  481. static int
  482. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  483. {
  484. gunichar uni;
  485. while (to_ident > tool->ident && tool->cheked[0] != '\0')
  486. {
  487. uni = g_utf8_get_char (tool->cheked);
  488. if (!str_unichar_iscombiningmark (uni))
  489. {
  490. tool->ident++;
  491. if (g_unichar_iswide (uni))
  492. tool->ident++;
  493. }
  494. tool->cheked = g_utf8_next_char (tool->cheked);
  495. }
  496. uni = g_utf8_get_char (tool->cheked);
  497. while (str_unichar_iscombiningmark (uni))
  498. {
  499. tool->cheked = g_utf8_next_char (tool->cheked);
  500. uni = g_utf8_get_char (tool->cheked);
  501. }
  502. return 1;
  503. }
  504. static void
  505. utf8_tool_compose (char *buffer, size_t size)
  506. {
  507. char *composed =
  508. g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  509. g_strlcpy (buffer, composed, size);
  510. g_free (composed);
  511. }
  512. static const char *
  513. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  514. {
  515. static char result[BUF_MEDIUM * 6];
  516. const struct term_form *pre_form;
  517. struct utf8_tool tool;
  518. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  519. tool.cheked = pre_form->text;
  520. tool.actual = result;
  521. tool.remain = sizeof (result);
  522. tool.compose = 0;
  523. if (pre_form->width <= (gsize)width)
  524. {
  525. tool.ident = 0;
  526. switch (HIDE_FIT (just_mode))
  527. {
  528. case J_CENTER_LEFT:
  529. case J_CENTER:
  530. tool.ident = (width - pre_form->width) / 2;
  531. break;
  532. case J_RIGHT:
  533. tool.ident = width - pre_form->width;
  534. break;
  535. }
  536. utf8_tool_insert_space (&tool, tool.ident);
  537. utf8_tool_copy_chars_to_end (&tool);
  538. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  539. }
  540. else
  541. {
  542. if (IS_FIT (just_mode))
  543. {
  544. tool.ident = 0;
  545. utf8_tool_copy_chars_to (&tool, width / 2);
  546. utf8_tool_insert_char (&tool, '~');
  547. tool.ident = 0;
  548. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  549. utf8_tool_copy_chars_to_end (&tool);
  550. utf8_tool_insert_space (&tool,
  551. width - (pre_form->width - tool.ident +
  552. 1));
  553. }
  554. else
  555. {
  556. tool.ident = 0;
  557. switch (HIDE_FIT (just_mode))
  558. {
  559. case J_CENTER:
  560. tool.ident = (width - pre_form->width) / 2;
  561. break;
  562. case J_RIGHT:
  563. tool.ident = width - pre_form->width;
  564. break;
  565. }
  566. utf8_tool_skip_chars_to (&tool, 0);
  567. utf8_tool_insert_space (&tool, tool.ident);
  568. utf8_tool_copy_chars_to (&tool, width);
  569. utf8_tool_insert_space (&tool, width - tool.ident);
  570. }
  571. }
  572. tool.actual[0] = '\0';
  573. if (tool.compose)
  574. utf8_tool_compose (result, sizeof (result));
  575. return result;
  576. }
  577. static const char *
  578. str_utf8_term_trim (const char *text, int width)
  579. {
  580. static char result[BUF_MEDIUM * 6];
  581. const struct term_form *pre_form;
  582. struct utf8_tool tool;
  583. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  584. tool.cheked = pre_form->text;
  585. tool.actual = result;
  586. tool.remain = sizeof (result);
  587. tool.compose = 0;
  588. if ((gsize)width < pre_form->width)
  589. {
  590. if (width <= 3)
  591. {
  592. memset (tool.actual, '.', width);
  593. tool.actual += width;
  594. tool.remain -= width;
  595. }
  596. else
  597. {
  598. memset (tool.actual, '.', 3);
  599. tool.actual += 3;
  600. tool.remain -= 3;
  601. tool.ident = 0;
  602. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  603. utf8_tool_copy_chars_to_end (&tool);
  604. }
  605. }
  606. else
  607. {
  608. utf8_tool_copy_chars_to_end (&tool);
  609. }
  610. tool.actual[0] = '\0';
  611. if (tool.compose)
  612. utf8_tool_compose (result, sizeof (result));
  613. return result;
  614. }
  615. static int
  616. str_utf8_term_width2 (const char *text, size_t length)
  617. {
  618. const struct term_form *result;
  619. result = str_utf8_make_make_term_form (text, length);
  620. return result->width;
  621. }
  /* Return the terminal width of the whole NUL-terminated TEXT. */
  static int
  str_utf8_term_width1 (const char *text)
  {
      const size_t no_limit = (size_t) (-1);

      return str_utf8_term_width2 (text, no_limit);
  }
  627. static int
  628. str_utf8_term_char_width (const char *text)
  629. {
  630. gunichar uni = g_utf8_get_char_validated (text, -1);
  631. return (str_unichar_iscombiningmark (uni)) ? 0
  632. : ((g_unichar_iswide (uni)) ? 2 : 1);
  633. }
  /* Measure a multi-line message.
   * *LINES receives the number of lines of TEXT (newline count plus one),
   * *COLUMNS the terminal width of its widest line.
   * The work is done on a private copy; TEXT itself is not modified. */
  static void
  str_utf8_msg_term_size (const char *text, int *lines, int *columns)
  {
      char *copy;
      char *line;

      *lines = 1;
      *columns = 0;

      copy = g_strdup (text);
      line = copy;
      for (;;)
      {
          char *newline = strchr (line, '\n');
          int width;

          /* cut the copy at the line end while the line is measured */
          if (newline != NULL)
              *newline = '\0';

          width = str_utf8_term_width1 (line);
          if (width > *columns)
              *columns = width;

          if (newline == NULL)
              break;

          *newline = '\n';
          line = newline + 1;
          (*lines)++;
      }

      g_free (copy);
  }
  664. static const char *
  665. str_utf8_term_substring (const char *text, int start, int width)
  666. {
  667. static char result[BUF_MEDIUM * 6];
  668. const struct term_form *pre_form;
  669. struct utf8_tool tool;
  670. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  671. tool.cheked = pre_form->text;
  672. tool.actual = result;
  673. tool.remain = sizeof (result);
  674. tool.compose = 0;
  675. tool.ident = -start;
  676. utf8_tool_skip_chars_to (&tool, 0);
  677. if (tool.ident < 0)
  678. tool.ident = 0;
  679. utf8_tool_insert_space (&tool, tool.ident);
  680. utf8_tool_copy_chars_to (&tool, width);
  681. utf8_tool_insert_space (&tool, width - tool.ident);
  682. tool.actual[0] = '\0';
  683. if (tool.compose)
  684. utf8_tool_compose (result, sizeof (result));
  685. return result;
  686. }
  687. static const char *
  688. str_utf8_trunc (const char *text, int width)
  689. {
  690. static char result[MC_MAXPATHLEN * 6 * 2];
  691. const struct term_form *pre_form;
  692. struct utf8_tool tool;
  693. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  694. tool.cheked = pre_form->text;
  695. tool.actual = result;
  696. tool.remain = sizeof (result);
  697. tool.compose = 0;
  698. if (pre_form->width > (gsize)width)
  699. {
  700. tool.ident = 0;
  701. utf8_tool_copy_chars_to (&tool, width / 2);
  702. utf8_tool_insert_char (&tool, '~');
  703. tool.ident = 0;
  704. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  705. utf8_tool_copy_chars_to_end (&tool);
  706. }
  707. else
  708. {
  709. utf8_tool_copy_chars_to_end (&tool);
  710. }
  711. tool.actual[0] = '\0';
  712. if (tool.compose)
  713. utf8_tool_compose (result, sizeof (result));
  714. return result;
  715. }
  716. static int
  717. str_utf8_offset_to_pos (const char *text, size_t length)
  718. {
  719. if (str_utf8_is_valid_string (text))
  720. return g_utf8_offset_to_pointer (text, length) - text;
  721. else
  722. {
  723. int result;
  724. GString *buffer = g_string_new (text);
  725. str_utf8_fix_string (buffer->str);
  726. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  727. g_string_free (buffer, TRUE);
  728. return result;
  729. }
  730. }
  731. static int
  732. str_utf8_column_to_pos (const char *text, size_t pos)
  733. {
  734. static int result;
  735. gunichar uni;
  736. int width;
  737. width = 0;
  738. result = 0;
  739. while (text[0] != '\0')
  740. {
  741. uni = g_utf8_get_char_validated (text, 6);
  742. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  743. {
  744. if (g_unichar_isprint (uni))
  745. {
  746. if (!str_unichar_iscombiningmark (uni))
  747. {
  748. width++;
  749. if (g_unichar_iswide (uni))
  750. width++;
  751. }
  752. }
  753. else
  754. {
  755. width++;
  756. }
  757. text = g_utf8_next_char (text);
  758. }
  759. else
  760. {
  761. text++;
  762. width++;
  763. }
  764. if ((gsize)width > pos)
  765. return result;
  766. result++;
  767. }
  768. return result;
  769. }
  770. static char *
  771. str_utf8_create_search_needle (const char *needle, int case_sen)
  772. {
  773. if (needle != NULL)
  774. {
  775. if (case_sen)
  776. {
  777. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  778. }
  779. else
  780. {
  781. char *fold = g_utf8_casefold (needle, -1);
  782. char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  783. g_free (fold);
  784. return result;
  785. }
  786. }
  787. else
  788. return NULL;
  789. }
  /* Free a needle with g_free().  CASE_SEN is unused.
   * A NULL NEEDLE is accepted: g_free() does nothing for NULL. */
  static void
  str_utf8_release_search_needle (char *needle, int case_sen)
  {
      (void) case_sen;

      g_free (needle);
  }
  797. static const char *
  798. str_utf8_search_first (const char *text, const char *search, int case_sen)
  799. {
  800. char *fold_text;
  801. char *deco_text;
  802. const char *match;
  803. const char *result = NULL;
  804. const char *m;
  805. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  806. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  807. match = deco_text;
  808. do
  809. {
  810. match = g_strstr_len (match, -1, search);
  811. if (match != NULL)
  812. {
  813. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  814. !str_utf8_iscombiningmark (match + strlen (search)))
  815. {
  816. result = text;
  817. m = deco_text;
  818. while (m < match)
  819. {
  820. str_utf8_cnext_noncomb_char (&m);
  821. str_utf8_cnext_noncomb_char (&result);
  822. }
  823. }
  824. else
  825. {
  826. str_utf8_cnext_char (&match);
  827. }
  828. }
  829. }
  830. while (match != NULL && result == NULL);
  831. g_free (deco_text);
  832. if (!case_sen)
  833. g_free (fold_text);
  834. return result;
  835. }
  836. static const char *
  837. str_utf8_search_last (const char *text, const char *search, int case_sen)
  838. {
  839. char *fold_text;
  840. char *deco_text;
  841. char *match;
  842. const char *result = NULL;
  843. const char *m;
  844. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  845. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  846. do
  847. {
  848. match = g_strrstr_len (deco_text, -1, search);
  849. if (match != NULL)
  850. {
  851. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  852. !str_utf8_iscombiningmark (match + strlen (search)))
  853. {
  854. result = text;
  855. m = deco_text;
  856. while (m < match)
  857. {
  858. str_utf8_cnext_noncomb_char (&m);
  859. str_utf8_cnext_noncomb_char (&result);
  860. }
  861. }
  862. else
  863. {
  864. match[0] = '\0';
  865. }
  866. }
  867. }
  868. while (match != NULL && result == NULL);
  869. g_free (deco_text);
  870. if (!case_sen)
  871. g_free (fold_text);
  872. return result;
  873. }
  874. static char *
  875. str_utf8_normalize (const char *text)
  876. {
  877. GString *fixed = g_string_new ("");
  878. char *tmp;
  879. char *result;
  880. const char *start;
  881. const char *end;
  882. start = text;
  883. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  884. {
  885. if (start != end)
  886. {
  887. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  888. g_string_append (fixed, tmp);
  889. g_free (tmp);
  890. }
  891. g_string_append_c (fixed, end[0]);
  892. start = end + 1;
  893. }
  894. if (start == text)
  895. {
  896. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  897. }
  898. else
  899. {
  900. if (start[0] != '\0' && start != end)
  901. {
  902. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  903. g_string_append (fixed, tmp);
  904. g_free (tmp);
  905. }
  906. result = g_strdup (fixed->str);
  907. }
  908. g_string_free (fixed, TRUE);
  909. return result;
  910. }
  911. static char *
  912. str_utf8_casefold_normalize (const char *text)
  913. {
  914. GString *fixed = g_string_new ("");
  915. char *tmp, *fold;
  916. char *result;
  917. const char *start;
  918. const char *end;
  919. start = text;
  920. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  921. {
  922. if (start != end)
  923. {
  924. fold = g_utf8_casefold (start, end - start);
  925. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  926. g_string_append (fixed, tmp);
  927. g_free (tmp);
  928. g_free (fold);
  929. }
  930. g_string_append_c (fixed, end[0]);
  931. start = end + 1;
  932. }
  933. if (start == text)
  934. {
  935. fold = g_utf8_casefold (text, -1);
  936. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  937. g_free (fold);
  938. }
  939. else
  940. {
  941. if (start[0] != '\0' && start != end)
  942. {
  943. fold = g_utf8_casefold (start, end - start);
  944. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  945. g_string_append (fixed, tmp);
  946. g_free (tmp);
  947. g_free (fold);
  948. }
  949. result = g_strdup (fixed->str);
  950. }
  951. g_string_free (fixed, TRUE);
  952. return result;
  953. }
  /* Compare T1 and T2 with strcmp() after both have been normalized with
   * str_utf8_normalize().  Returns the strcmp() result. */
  static int
  str_utf8_compare (const char *t1, const char *t2)
  {
      char *norm1 = str_utf8_normalize (t1);
      char *norm2 = str_utf8_normalize (t2);
      const int order = strcmp (norm1, norm2);

      g_free (norm1);
      g_free (norm2);
      return order;
  }
  /* Compare the normalized forms of T1 and T2 over the byte length of the
   * shorter one, so the result is 0 whenever one normalized string is a
   * prefix of the other.  Returns the strncmp() result. */
  static int
  str_utf8_ncompare (const char *t1, const char *t2)
  {
      char *norm1 = str_utf8_normalize (t1);
      char *norm2 = str_utf8_normalize (t2);
      const size_t len1 = strlen (norm1);
      const size_t len2 = strlen (norm2);
      const int order = strncmp (norm1, norm2, (len1 < len2) ? len1 : len2);

      g_free (norm1);
      g_free (norm2);
      return order;
  }
  /* Compare T1 and T2 with strcmp() after both have been converted with
   * str_utf8_casefold_normalize().  Returns the strcmp() result. */
  static int
  str_utf8_casecmp (const char *t1, const char *t2)
  {
      char *norm1 = str_utf8_casefold_normalize (t1);
      char *norm2 = str_utf8_casefold_normalize (t2);
      const int order = strcmp (norm1, norm2);

      g_free (norm1);
      g_free (norm2);
      return order;
  }
  /*
   * Compare the case-folded, normalized forms of two strings over the length
   * of the shorter one (in bytes of the folded text).
   *
   * Returns the strncmp() result, so 0 means one folded string is a byte
   * prefix of the other.
   */
  static int
  str_utf8_ncasecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const size_t lhs_len = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      const size_t common = (lhs_len > rhs_len) ? rhs_len : lhs_len;
      const int order = strncmp (lhs, rhs, common);

      g_free (rhs);
      g_free (lhs);

      return order;
  }
  /*
   * Measure how much of "prefix" matches the beginning of "text".
   *
   * Both arguments are normalized with str_utf8_normalize() and then walked
   * one character at a time (via str_utf8_cnext_char_safe()) until either
   * string ends or the current characters differ in byte length or content.
   *
   * Returns the number of bytes of the normalized prefix that matched.
   */
  static int
  str_utf8_prefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_normalize (text);
      char *norm_prefix = str_utf8_normalize (prefix);
      const char *text_cur = norm_text;
      const char *prefix_cur = norm_prefix;
      const char *text_next = norm_text;
      const char *prefix_next = norm_prefix;
      int matched;

      while (*text_cur != '\0' && *prefix_cur != '\0')
      {
          str_utf8_cnext_char_safe (&text_next);
          str_utf8_cnext_char_safe (&prefix_next);

          /* stop at the first character whose size or bytes differ */
          if (text_next - text_cur != prefix_next - prefix_cur
              || strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
              break;

          text_cur = text_next;
          prefix_cur = prefix_next;
      }

      matched = prefix_cur - norm_prefix;

      g_free (norm_text);
      g_free (norm_prefix);

      return matched;
  }
  /*
   * Case-insensitive variant of the prefix measurement.
   *
   * Both arguments are case-folded and normalized with
   * str_utf8_casefold_normalize() and then walked one character at a time
   * (via str_utf8_cnext_char_safe()) until either string ends or the current
   * characters differ in byte length or content.
   *
   * Returns the number of bytes of the folded prefix that matched.
   */
  static int
  str_utf8_caseprefix (const char *text, const char *prefix)
  {
      char *folded_text = str_utf8_casefold_normalize (text);
      char *folded_prefix = str_utf8_casefold_normalize (prefix);
      const char *text_cur = folded_text;
      const char *prefix_cur = folded_prefix;
      const char *text_next = folded_text;
      const char *prefix_next = folded_prefix;
      int matched;

      while (*text_cur != '\0' && *prefix_cur != '\0')
      {
          str_utf8_cnext_char_safe (&text_next);
          str_utf8_cnext_char_safe (&prefix_next);

          /* stop at the first character whose size or bytes differ */
          if (text_next - text_cur != prefix_next - prefix_cur
              || strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
              break;

          text_cur = text_next;
          prefix_cur = prefix_next;
      }

      matched = prefix_cur - folded_prefix;

      g_free (folded_text);
      g_free (folded_prefix);

      return matched;
  }
  1054. static char *
  1055. str_utf8_create_key_gen (const char *text, int case_sen,
  1056. gchar * (*keygen) (const gchar *, gssize size))
  1057. {
  1058. char *result;
  1059. if (case_sen) {
  1060. result = str_utf8_normalize (text);
  1061. } else {
  1062. const char *start, *end;
  1063. char *fold, *key;
  1064. GString *fixed = g_string_new ("");
  1065. start = text;
  1066. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1067. {
  1068. if (start != end)
  1069. {
  1070. fold = g_utf8_casefold (start, end - start);
  1071. key = keygen (fold, -1);
  1072. g_string_append (fixed, key);
  1073. g_free (key);
  1074. g_free (fold);
  1075. }
  1076. g_string_append_c (fixed, end[0]);
  1077. start = end + 1;
  1078. }
  1079. if (start == text)
  1080. {
  1081. fold = g_utf8_casefold (text, -1);
  1082. result = keygen (fold, -1);
  1083. g_free (fold);
  1084. }
  1085. else
  1086. {
  1087. if (start[0] != '\0' && start != end)
  1088. {
  1089. fold = g_utf8_casefold (start, end - start);
  1090. key = keygen (fold, -1);
  1091. g_string_append (fixed, key);
  1092. g_free (key);
  1093. g_free (fold);
  1094. }
  1095. result = g_strdup (fixed->str);
  1096. }
  1097. g_string_free (fixed, TRUE);
  1098. }
  1099. return result;
  1100. }
  1101. static char *
  1102. str_utf8_create_key (const char *text, int case_sen)
  1103. {
  1104. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1105. }
  #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  /*
   * Create a comparison key for a file name using
   * g_utf8_collate_key_for_filename() as the key generator; see
   * str_utf8_create_key_gen() for the meaning of "case_sen" and the
   * ownership of the returned string.
   */
  static char *
  str_utf8_create_key_for_filename (const char *text, int case_sen)
  {
      char *key;

      key = str_utf8_create_key_gen (text, case_sen,
                                     g_utf8_collate_key_for_filename);

      return key;
  }
  #endif
  /*
   * Order two previously created keys.
   *
   * The keys are compared byte-wise with strcmp(); "case_sen" is accepted
   * but not used here.
   */
  static int
  str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
  {
      const int order = strcmp (t1, t2);

      (void) case_sen;

      return order;
  }
  /*
   * Release a key with g_free(); "case_sen" is accepted but not used here.
   */
  static void
  str_utf8_release_key (char *key, int case_sen)
  {
      g_free (key);

      (void) case_sen;
  }
  1126. struct str_class
  1127. str_utf8_init (void)
  1128. {
  1129. struct str_class result;
  1130. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1131. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1132. result.insert_replace_char = str_utf8_insert_replace_char;
  1133. result.is_valid_string = str_utf8_is_valid_string;
  1134. result.is_valid_char = str_utf8_is_valid_char;
  1135. result.cnext_char = str_utf8_cnext_char;
  1136. result.cprev_char = str_utf8_cprev_char;
  1137. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1138. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1139. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1140. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1141. result.isspace = str_utf8_isspace;
  1142. result.ispunct = str_utf8_ispunct;
  1143. result.isalnum = str_utf8_isalnum;
  1144. result.isdigit = str_utf8_isdigit;
  1145. result.isprint = str_utf8_isprint;
  1146. result.iscombiningmark = str_utf8_iscombiningmark;
  1147. result.toupper = str_utf8_toupper;
  1148. result.tolower = str_utf8_tolower;
  1149. result.length = str_utf8_length;
  1150. result.length2 = str_utf8_length2;
  1151. result.length_noncomb = str_utf8_length_noncomb;
  1152. result.fix_string = str_utf8_fix_string;
  1153. result.term_form = str_utf8_term_form;
  1154. result.fit_to_term = str_utf8_fit_to_term;
  1155. result.term_trim = str_utf8_term_trim;
  1156. result.term_width2 = str_utf8_term_width2;
  1157. result.term_width1 = str_utf8_term_width1;
  1158. result.term_char_width = str_utf8_term_char_width;
  1159. result.msg_term_size = str_utf8_msg_term_size;
  1160. result.term_substring = str_utf8_term_substring;
  1161. result.trunc = str_utf8_trunc;
  1162. result.offset_to_pos = str_utf8_offset_to_pos;
  1163. result.column_to_pos = str_utf8_column_to_pos;
  1164. result.create_search_needle = str_utf8_create_search_needle;
  1165. result.release_search_needle = str_utf8_release_search_needle;
  1166. result.search_first = str_utf8_search_first;
  1167. result.search_last = str_utf8_search_last;
  1168. result.compare = str_utf8_compare;
  1169. result.ncompare = str_utf8_ncompare;
  1170. result.casecmp = str_utf8_casecmp;
  1171. result.ncasecmp = str_utf8_ncasecmp;
  1172. result.prefix = str_utf8_prefix;
  1173. result.caseprefix = str_utf8_caseprefix;
  1174. result.create_key = str_utf8_create_key;
  1175. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1176. /* case insensitive sort files in "a1 a2 a10" order */
  1177. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1178. #else
  1179. /* case insensitive sort files in "a1 a10 a2" order */
  1180. result.create_key_for_filename = str_utf8_create_key;
  1181. #endif
  1182. result.key_collate = str_utf8_key_collate;
  1183. result.release_key = str_utf8_release_key;
  1184. return result;
  1185. }