strutilutf8.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365
  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007, 2011, 2013
  4. The Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. This file is part of the Midnight Commander.
  8. The Midnight Commander is free software: you can redistribute it
  9. and/or modify it under the terms of the GNU General Public License as
  10. published by the Free Software Foundation, either version 3 of the License,
  11. or (at your option) any later version.
  12. The Midnight Commander is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <langinfo.h>
  22. #include <string.h>
  23. #include "lib/global.h"
  24. #include "lib/strutil.h"
  /* using function for utf-8 from glib */
  /* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER); the helpers below emit it
   * in place of bytes that do not form a valid UTF-8 sequence. */
  static const char replch[] = "\xEF\xBF\xBD";
  27. static gboolean
  28. str_unichar_iscombiningmark (gunichar uni)
  29. {
  30. GUnicodeType type;
  31. type = g_unichar_type (uni);
  32. return (type == G_UNICODE_COMBINING_MARK)
  33. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  34. }
  35. static void
  36. str_utf8_insert_replace_char (GString * buffer)
  37. {
  38. g_string_append (buffer, replch);
  39. }
  40. static int
  41. str_utf8_is_valid_string (const char *text)
  42. {
  43. return g_utf8_validate (text, -1, NULL);
  44. }
  45. static int
  46. str_utf8_is_valid_char (const char *ch, size_t size)
  47. {
  48. switch (g_utf8_get_char_validated (ch, size))
  49. {
  50. case (gunichar) (-2):
  51. return -2;
  52. case (gunichar) (-1):
  53. return -1;
  54. default:
  55. return 1;
  56. }
  57. }
  /* Advance *TEXT to the next character using g_utf8_next_char(), which does
   * no validation of the bytes it steps over. */
  static void
  str_utf8_cnext_char (const char **text)
  {
      const char *cursor = *text;

      cursor = g_utf8_next_char (cursor);
      *text = cursor;
  }
  63. static void
  64. str_utf8_cprev_char (const char **text)
  65. {
  66. (*text) = g_utf8_prev_char (*text);
  67. }
  /* Advance *TEXT by one character if a valid one starts there, otherwise by
   * exactly one byte. */
  static void
  str_utf8_cnext_char_safe (const char **text)
  {
      const char *cursor = *text;

      if (str_utf8_is_valid_char (cursor, -1) != 1)
      {
          *text = cursor + 1;
          return;
      }

      *text = g_utf8_next_char (cursor);
  }
  /* Step *TEXT backwards.  The position proposed by g_utf8_prev_char() is
   * accepted only if a safe forward step from it lands exactly on the old
   * position; otherwise *TEXT moves back by a single byte. */
  static void
  str_utf8_cprev_char_safe (const char **text)
  {
      const char *const here = *text;
      const char *const candidate = g_utf8_prev_char (here);
      const char *probe = candidate;

      str_utf8_cnext_char_safe (&probe);
      *text = (probe == here) ? candidate : here - 1;
  }
  87. static void
  88. str_utf8_fix_string (char *text)
  89. {
  90. gunichar uni;
  91. while (text[0] != '\0')
  92. {
  93. uni = g_utf8_get_char_validated (text, -1);
  94. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  95. {
  96. text = g_utf8_next_char (text);
  97. }
  98. else
  99. {
  100. text[0] = '?';
  101. text++;
  102. }
  103. }
  104. }
  /* g_unichar_isspace() applied to the first character of TEXT. */
  static int
  str_utf8_isspace (const char *text)
  {
      return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
  }
  /* g_unichar_ispunct() applied to the first character of TEXT. */
  static int
  str_utf8_ispunct (const char *text)
  {
      return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
  }
  /* g_unichar_isalnum() applied to the first character of TEXT. */
  static int
  str_utf8_isalnum (const char *text)
  {
      return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
  }
  /* g_unichar_isdigit() applied to the first character of TEXT. */
  static int
  str_utf8_isdigit (const char *text)
  {
      return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
  }
  /* g_unichar_isprint() applied to the first character of CH. */
  static int
  str_utf8_isprint (const char *ch)
  {
      return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
  }
  135. static gboolean
  136. str_utf8_iscombiningmark (const char *ch)
  137. {
  138. gunichar uni = g_utf8_get_char_validated (ch, -1);
  139. return str_unichar_iscombiningmark (uni);
  140. }
  /* Advance *TEXT past one character and any combining marks that follow it,
   * stopping at the end of the string.  Returns the number of single steps
   * taken (0 when *TEXT already points at the terminator). */
  static int
  str_utf8_cnext_noncomb_char (const char **text)
  {
      int steps = 0;

      if (**text == '\0')
          return 0;

      do
      {
          str_utf8_cnext_char_safe (text);
          steps++;
      }
      while (**text != '\0' && str_utf8_iscombiningmark (*text));

      return steps;
  }
  /* Move *TEXT backwards until it rests on a character that is not a combining
   * mark, never going before BEGIN.  Returns the number of single steps taken
   * (0 when *TEXT already equals BEGIN). */
  static int
  str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  {
      int steps = 0;

      if (*text == begin)
          return 0;

      do
      {
          str_utf8_cprev_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && *text != begin);

      return steps;
  }
  167. static int
  168. str_utf8_toupper (const char *text, char **out, size_t * remain)
  169. {
  170. gunichar uni;
  171. size_t left;
  172. uni = g_utf8_get_char_validated (text, -1);
  173. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  174. return 0;
  175. uni = g_unichar_toupper (uni);
  176. left = g_unichar_to_utf8 (uni, NULL);
  177. if (left >= *remain)
  178. return 0;
  179. left = g_unichar_to_utf8 (uni, *out);
  180. (*out) += left;
  181. (*remain) -= left;
  182. return 1;
  183. }
  184. static int
  185. str_utf8_tolower (const char *text, char **out, size_t * remain)
  186. {
  187. gunichar uni;
  188. size_t left;
  189. uni = g_utf8_get_char_validated (text, -1);
  190. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  191. return 0;
  192. uni = g_unichar_tolower (uni);
  193. left = g_unichar_to_utf8 (uni, NULL);
  194. if (left >= *remain)
  195. return 0;
  196. left = g_unichar_to_utf8 (uni, *out);
  197. (*out) += left;
  198. (*remain) -= left;
  199. return 1;
  200. }
  /* Count the characters of TEXT.  Each byte that breaks UTF-8 validity counts
   * as one character; valid stretches are counted with g_utf8_strlen(). */
  static int
  str_utf8_length (const char *text)
  {
      const char *start = text;
      const char *end;
      int count = 0;

      for (;;)
      {
          /* validation must run first: it is what sets END */
          if (g_utf8_validate (start, -1, &end) || start[0] == '\0')
              break;

          if (start != end)
              count += g_utf8_strlen (start, end - start);

          /* the offending byte itself */
          count++;
          start = end + 1;
      }

      /* nothing invalid was found at all */
      if (start == text)
          return g_utf8_strlen (text, -1);

      /* valid tail after the last invalid byte */
      if (start[0] != '\0' && start != end)
          count += g_utf8_strlen (start, end - start);

      return count;
  }
  /* Like str_utf8_length(), but SIZE is passed to g_utf8_strlen() as the byte
   * limit and is consumed as the scan moves past invalid bytes. */
  static int
  str_utf8_length2 (const char *text, int size)
  {
      const char *start = text;
      const char *end;
      int count = 0;

      for (;;)
      {
          /* validation must run first: it is what sets END */
          if (g_utf8_validate (start, -1, &end))
              break;
          if (start[0] == '\0' || size <= 0)
              break;

          if (start != end)
          {
              count += g_utf8_strlen (start, min (end - start, size));
              size -= end - start;
          }

          /* the invalid byte counts only if it is still inside the limit */
          if (size > 0)
              count++;
          size--;
          start = end + 1;
      }

      /* no invalid byte was consumed */
      if (start == text)
          return g_utf8_strlen (text, size);

      /* valid tail after the last invalid byte */
      if (start[0] != '\0' && start != end && size > 0)
          count += g_utf8_strlen (start, min (end - start, size));

      return count;
  }
  /* Count the characters of TEXT, treating a character together with the
   * combining marks that follow it as one. */
  static int
  str_utf8_length_noncomb (const char *text)
  {
      const char *cursor;
      int count = 0;

      for (cursor = text; cursor[0] != '\0'; count++)
          str_utf8_cnext_noncomb_char (&cursor);

      return count;
  }
  273. /*
  274. static void
  275. str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
  276. {
  277. char *next = g_utf8_next_char (*string);
  278. (*left) -= next - (*string);
  279. (*string) = next;
  280. g_string_append_c (buffer, '?');
  281. }
  282. */
  283. static gchar *
  284. str_utf8_conv_gerror_message (GError * error, const char *def_msg)
  285. {
  286. if ((error != NULL) && (error->message != NULL))
  287. return g_strdup (error->message);
  288. return g_strdup (def_msg != NULL ? def_msg : "");
  289. }
  290. static estr_t
  291. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
  292. {
  293. estr_t result;
  294. if (coder == str_cnv_not_convert)
  295. {
  296. g_string_append_len (buffer, string, size);
  297. result = ESTR_SUCCESS;
  298. }
  299. else
  300. result = str_nconvert (coder, (char *) string, size, buffer);
  301. return result;
  302. }
  /* Result of str_utf8_make_make_term_form(): a display-ready copy of a string
   * plus what was learned while building it. */
  struct term_form
  {
      /* NUL-terminated, valid UTF-8 text in which non-printable characters were
       * replaced by '.' and invalid bytes by the replacement character */
      char text[BUF_MEDIUM * 6];
      /* number of terminal columns the text occupies (wide characters count 2,
       * combining marks 0) */
      size_t width;
      /* TRUE when the text contains at least one combining mark */
      gboolean compose;
  };
  309. /* utiliti function, that make string valid in utf8 and all characters printable
  310. * return width of string too*/
  311. static const struct term_form *
  312. str_utf8_make_make_term_form (const char *text, size_t length)
  313. {
  314. static struct term_form result;
  315. gunichar uni;
  316. size_t left;
  317. char *actual;
  318. result.text[0] = '\0';
  319. result.width = 0;
  320. result.compose = FALSE;
  321. actual = result.text;
  322. /* check if text start with combining character,
  323. * add space at begin in this case */
  324. if (length != 0 && text[0] != '\0')
  325. {
  326. uni = g_utf8_get_char_validated (text, -1);
  327. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  328. {
  329. if (str_unichar_iscombiningmark (uni))
  330. {
  331. actual[0] = ' ';
  332. actual++;
  333. result.width++;
  334. result.compose = TRUE;
  335. }
  336. }
  337. }
  338. while (length != 0 && text[0] != '\0')
  339. {
  340. uni = g_utf8_get_char_validated (text, -1);
  341. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  342. {
  343. if (g_unichar_isprint (uni))
  344. {
  345. left = g_unichar_to_utf8 (uni, actual);
  346. actual += left;
  347. if (str_unichar_iscombiningmark (uni))
  348. result.compose = TRUE;
  349. else
  350. {
  351. result.width++;
  352. if (g_unichar_iswide (uni))
  353. result.width++;
  354. }
  355. }
  356. else
  357. {
  358. actual[0] = '.';
  359. actual++;
  360. result.width++;
  361. }
  362. text = g_utf8_next_char (text);
  363. }
  364. else
  365. {
  366. text++;
  367. /*actual[0] = '?'; */
  368. memcpy (actual, replch, strlen (replch));
  369. actual += strlen (replch);
  370. result.width++;
  371. }
  372. if (length != (size_t) (-1))
  373. length--;
  374. }
  375. actual[0] = '\0';
  376. return &result;
  377. }
  378. static const char *
  379. str_utf8_term_form (const char *text)
  380. {
  381. static char result[BUF_MEDIUM * 6];
  382. const struct term_form *pre_form;
  383. char *composed;
  384. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  385. if (pre_form->compose)
  386. {
  387. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  388. g_strlcpy (result, composed, sizeof (result));
  389. g_free (composed);
  390. }
  391. else
  392. {
  393. g_strlcpy (result, pre_form->text, sizeof (result));
  394. }
  395. return result;
  396. }
  /* Cursor state shared by the utf8_tool_* helpers while they build an output
   * string from an input string. */
  struct utf8_tool
  {
      /* next write position in the output buffer */
      char *actual;
      /* bytes still available at actual; the helpers refuse a write that would
       * use all of them, so one byte always stays free */
      size_t remain;
      /* next unread position in the input string */
      const char *cheked;
      /* running column counter, advanced by the width of each character that is
       * copied or skipped */
      int ident;
      /* set by the copy helpers when a combining mark was copied */
      gboolean compose;
  };
  405. /* utiliti function, that copy all characters from cheked to actual */
  406. static gboolean
  407. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  408. {
  409. size_t left;
  410. gunichar uni;
  411. tool->compose = FALSE;
  412. while (tool->cheked[0] != '\0')
  413. {
  414. uni = g_utf8_get_char (tool->cheked);
  415. tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
  416. left = g_unichar_to_utf8 (uni, NULL);
  417. if (tool->remain <= left)
  418. return FALSE;
  419. left = g_unichar_to_utf8 (uni, tool->actual);
  420. tool->actual += left;
  421. tool->remain -= left;
  422. tool->cheked = g_utf8_next_char (tool->cheked);
  423. }
  424. return TRUE;
  425. }
  426. /* utiliti function, that copy characters from cheked to actual until ident is
  427. * smaller than to_ident */
  428. static gboolean
  429. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  430. {
  431. size_t left;
  432. gunichar uni;
  433. int w;
  434. tool->compose = FALSE;
  435. while (tool->cheked[0] != '\0')
  436. {
  437. uni = g_utf8_get_char (tool->cheked);
  438. if (!str_unichar_iscombiningmark (uni))
  439. {
  440. w = 1;
  441. if (g_unichar_iswide (uni))
  442. w++;
  443. if (tool->ident + w > to_ident)
  444. return TRUE;
  445. }
  446. else
  447. {
  448. w = 0;
  449. tool->compose = TRUE;
  450. }
  451. left = g_unichar_to_utf8 (uni, NULL);
  452. if (tool->remain <= left)
  453. return FALSE;
  454. left = g_unichar_to_utf8 (uni, tool->actual);
  455. tool->actual += left;
  456. tool->remain -= left;
  457. tool->cheked = g_utf8_next_char (tool->cheked);
  458. tool->ident += w;
  459. }
  460. return TRUE;
  461. }
  462. /* utiliti function, add count spaces to actual */
  463. static int
  464. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  465. {
  466. if (count <= 0)
  467. return 1;
  468. if (tool->remain <= (gsize) count)
  469. return 0;
  470. memset (tool->actual, ' ', count);
  471. tool->actual += count;
  472. tool->remain -= count;
  473. return 1;
  474. }
  475. /* utiliti function, add one characters to actual */
  476. static int
  477. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  478. {
  479. if (tool->remain <= 1)
  480. return 0;
  481. tool->actual[0] = ch;
  482. tool->actual++;
  483. tool->remain--;
  484. return 1;
  485. }
  486. /* utiliti function, thah skip characters from cheked until ident is greater or
  487. * equal to to_ident */
  488. static gboolean
  489. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  490. {
  491. gunichar uni;
  492. while (to_ident > tool->ident && tool->cheked[0] != '\0')
  493. {
  494. uni = g_utf8_get_char (tool->cheked);
  495. if (!str_unichar_iscombiningmark (uni))
  496. {
  497. tool->ident++;
  498. if (g_unichar_iswide (uni))
  499. tool->ident++;
  500. }
  501. tool->cheked = g_utf8_next_char (tool->cheked);
  502. }
  503. uni = g_utf8_get_char (tool->cheked);
  504. while (str_unichar_iscombiningmark (uni))
  505. {
  506. tool->cheked = g_utf8_next_char (tool->cheked);
  507. uni = g_utf8_get_char (tool->cheked);
  508. }
  509. return TRUE;
  510. }
  511. static void
  512. utf8_tool_compose (char *buffer, size_t size)
  513. {
  514. char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  515. g_strlcpy (buffer, composed, size);
  516. g_free (composed);
  517. }
  518. static const char *
  519. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  520. {
  521. static char result[BUF_MEDIUM * 6];
  522. const struct term_form *pre_form;
  523. struct utf8_tool tool;
  524. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  525. tool.cheked = pre_form->text;
  526. tool.actual = result;
  527. tool.remain = sizeof (result);
  528. tool.compose = FALSE;
  529. if (pre_form->width <= (gsize) width)
  530. {
  531. tool.ident = 0;
  532. switch (HIDE_FIT (just_mode))
  533. {
  534. case J_CENTER_LEFT:
  535. case J_CENTER:
  536. tool.ident = (width - pre_form->width) / 2;
  537. break;
  538. case J_RIGHT:
  539. tool.ident = width - pre_form->width;
  540. break;
  541. }
  542. utf8_tool_insert_space (&tool, tool.ident);
  543. utf8_tool_copy_chars_to_end (&tool);
  544. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  545. }
  546. else
  547. {
  548. if (IS_FIT (just_mode))
  549. {
  550. tool.ident = 0;
  551. utf8_tool_copy_chars_to (&tool, width / 2);
  552. utf8_tool_insert_char (&tool, '~');
  553. tool.ident = 0;
  554. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  555. utf8_tool_copy_chars_to_end (&tool);
  556. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  557. }
  558. else
  559. {
  560. tool.ident = 0;
  561. switch (HIDE_FIT (just_mode))
  562. {
  563. case J_CENTER:
  564. tool.ident = (width - pre_form->width) / 2;
  565. break;
  566. case J_RIGHT:
  567. tool.ident = width - pre_form->width;
  568. break;
  569. }
  570. utf8_tool_skip_chars_to (&tool, 0);
  571. utf8_tool_insert_space (&tool, tool.ident);
  572. utf8_tool_copy_chars_to (&tool, width);
  573. utf8_tool_insert_space (&tool, width - tool.ident);
  574. }
  575. }
  576. tool.actual[0] = '\0';
  577. if (tool.compose)
  578. utf8_tool_compose (result, sizeof (result));
  579. return result;
  580. }
  581. static const char *
  582. str_utf8_term_trim (const char *text, int width)
  583. {
  584. static char result[BUF_MEDIUM * 6];
  585. const struct term_form *pre_form;
  586. struct utf8_tool tool;
  587. if (width < 1)
  588. {
  589. result[0] = '\0';
  590. return result;
  591. }
  592. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  593. tool.cheked = pre_form->text;
  594. tool.actual = result;
  595. tool.remain = sizeof (result);
  596. tool.compose = FALSE;
  597. if ((gsize) width < pre_form->width)
  598. {
  599. if (width <= 3)
  600. {
  601. memset (tool.actual, '.', width);
  602. tool.actual += width;
  603. tool.remain -= width;
  604. }
  605. else
  606. {
  607. memset (tool.actual, '.', 3);
  608. tool.actual += 3;
  609. tool.remain -= 3;
  610. tool.ident = 0;
  611. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  612. utf8_tool_copy_chars_to_end (&tool);
  613. }
  614. }
  615. else
  616. {
  617. utf8_tool_copy_chars_to_end (&tool);
  618. }
  619. tool.actual[0] = '\0';
  620. if (tool.compose)
  621. utf8_tool_compose (result, sizeof (result));
  622. return result;
  623. }
  624. static int
  625. str_utf8_term_width2 (const char *text, size_t length)
  626. {
  627. const struct term_form *result;
  628. result = str_utf8_make_make_term_form (text, length);
  629. return result->width;
  630. }
  /* Terminal width, in columns, of the whole NUL-terminated TEXT. */
  static int
  str_utf8_term_width1 (const char *text)
  {
      const size_t whole_string = (size_t) (-1);

      return str_utf8_term_width2 (text, whole_string);
  }
  636. static int
  637. str_utf8_term_char_width (const char *text)
  638. {
  639. gunichar uni = g_utf8_get_char_validated (text, -1);
  640. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  641. }
  642. static const char *
  643. str_utf8_term_substring (const char *text, int start, int width)
  644. {
  645. static char result[BUF_MEDIUM * 6];
  646. const struct term_form *pre_form;
  647. struct utf8_tool tool;
  648. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  649. tool.cheked = pre_form->text;
  650. tool.actual = result;
  651. tool.remain = sizeof (result);
  652. tool.compose = FALSE;
  653. tool.ident = -start;
  654. utf8_tool_skip_chars_to (&tool, 0);
  655. if (tool.ident < 0)
  656. tool.ident = 0;
  657. utf8_tool_insert_space (&tool, tool.ident);
  658. utf8_tool_copy_chars_to (&tool, width);
  659. utf8_tool_insert_space (&tool, width - tool.ident);
  660. tool.actual[0] = '\0';
  661. if (tool.compose)
  662. utf8_tool_compose (result, sizeof (result));
  663. return result;
  664. }
  665. static const char *
  666. str_utf8_trunc (const char *text, int width)
  667. {
  668. static char result[MC_MAXPATHLEN * 6 * 2];
  669. const struct term_form *pre_form;
  670. struct utf8_tool tool;
  671. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  672. tool.cheked = pre_form->text;
  673. tool.actual = result;
  674. tool.remain = sizeof (result);
  675. tool.compose = FALSE;
  676. if (pre_form->width > (gsize) width)
  677. {
  678. tool.ident = 0;
  679. utf8_tool_copy_chars_to (&tool, width / 2);
  680. utf8_tool_insert_char (&tool, '~');
  681. tool.ident = 0;
  682. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  683. utf8_tool_copy_chars_to_end (&tool);
  684. }
  685. else
  686. {
  687. utf8_tool_copy_chars_to_end (&tool);
  688. }
  689. tool.actual[0] = '\0';
  690. if (tool.compose)
  691. utf8_tool_compose (result, sizeof (result));
  692. return result;
  693. }
  694. static int
  695. str_utf8_offset_to_pos (const char *text, size_t length)
  696. {
  697. if (str_utf8_is_valid_string (text))
  698. return g_utf8_offset_to_pointer (text, length) - text;
  699. else
  700. {
  701. int result;
  702. GString *buffer = g_string_new (text);
  703. str_utf8_fix_string (buffer->str);
  704. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  705. g_string_free (buffer, TRUE);
  706. return result;
  707. }
  708. }
  709. static int
  710. str_utf8_column_to_pos (const char *text, size_t pos)
  711. {
  712. static int result;
  713. gunichar uni;
  714. int width;
  715. width = 0;
  716. result = 0;
  717. while (text[0] != '\0')
  718. {
  719. uni = g_utf8_get_char_validated (text, 6);
  720. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  721. {
  722. if (g_unichar_isprint (uni))
  723. {
  724. if (!str_unichar_iscombiningmark (uni))
  725. {
  726. width++;
  727. if (g_unichar_iswide (uni))
  728. width++;
  729. }
  730. }
  731. else
  732. {
  733. width++;
  734. }
  735. text = g_utf8_next_char (text);
  736. }
  737. else
  738. {
  739. text++;
  740. width++;
  741. }
  742. if ((gsize) width > pos)
  743. return result;
  744. result++;
  745. }
  746. return result;
  747. }
  748. static char *
  749. str_utf8_create_search_needle (const char *needle, int case_sen)
  750. {
  751. if (needle != NULL)
  752. {
  753. if (case_sen)
  754. {
  755. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  756. }
  757. else
  758. {
  759. char *fold = g_utf8_casefold (needle, -1);
  760. char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  761. g_free (fold);
  762. return result;
  763. }
  764. }
  765. else
  766. return NULL;
  767. }
  /* Free a needle obtained from str_utf8_create_search_needle().  CASE_SEN is
   * unused; NEEDLE may be NULL. */
  static void
  str_utf8_release_search_needle (char *needle, int case_sen)
  {
      (void) case_sen;

      /* g_free() accepts NULL, so no guard is needed */
      g_free (needle);
  }
  775. static const char *
  776. str_utf8_search_first (const char *text, const char *search, int case_sen)
  777. {
  778. char *fold_text;
  779. char *deco_text;
  780. const char *match;
  781. const char *result = NULL;
  782. const char *m;
  783. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  784. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  785. match = deco_text;
  786. do
  787. {
  788. match = g_strstr_len (match, -1, search);
  789. if (match != NULL)
  790. {
  791. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  792. !str_utf8_iscombiningmark (match + strlen (search)))
  793. {
  794. result = text;
  795. m = deco_text;
  796. while (m < match)
  797. {
  798. str_utf8_cnext_noncomb_char (&m);
  799. str_utf8_cnext_noncomb_char (&result);
  800. }
  801. }
  802. else
  803. {
  804. str_utf8_cnext_char (&match);
  805. }
  806. }
  807. }
  808. while (match != NULL && result == NULL);
  809. g_free (deco_text);
  810. if (!case_sen)
  811. g_free (fold_text);
  812. return result;
  813. }
  814. static const char *
  815. str_utf8_search_last (const char *text, const char *search, int case_sen)
  816. {
  817. char *fold_text;
  818. char *deco_text;
  819. char *match;
  820. const char *result = NULL;
  821. const char *m;
  822. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  823. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  824. do
  825. {
  826. match = g_strrstr_len (deco_text, -1, search);
  827. if (match != NULL)
  828. {
  829. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  830. !str_utf8_iscombiningmark (match + strlen (search)))
  831. {
  832. result = text;
  833. m = deco_text;
  834. while (m < match)
  835. {
  836. str_utf8_cnext_noncomb_char (&m);
  837. str_utf8_cnext_noncomb_char (&result);
  838. }
  839. }
  840. else
  841. {
  842. match[0] = '\0';
  843. }
  844. }
  845. }
  846. while (match != NULL && result == NULL);
  847. g_free (deco_text);
  848. if (!case_sen)
  849. g_free (fold_text);
  850. return result;
  851. }
  852. static char *
  853. str_utf8_normalize (const char *text)
  854. {
  855. GString *fixed;
  856. char *tmp;
  857. char *result;
  858. const char *start;
  859. const char *end;
  860. fixed = g_string_sized_new (4);
  861. start = text;
  862. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  863. {
  864. if (start != end)
  865. {
  866. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  867. g_string_append (fixed, tmp);
  868. g_free (tmp);
  869. }
  870. g_string_append_c (fixed, end[0]);
  871. start = end + 1;
  872. }
  873. if (start == text)
  874. {
  875. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  876. g_string_free (fixed, TRUE);
  877. }
  878. else
  879. {
  880. if (start[0] != '\0' && start != end)
  881. {
  882. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  883. g_string_append (fixed, tmp);
  884. g_free (tmp);
  885. }
  886. result = g_string_free (fixed, FALSE);
  887. }
  888. return result;
  889. }
  890. static char *
  891. str_utf8_casefold_normalize (const char *text)
  892. {
  893. GString *fixed;
  894. char *tmp, *fold;
  895. char *result;
  896. const char *start;
  897. const char *end;
  898. fixed = g_string_sized_new (4);
  899. start = text;
  900. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  901. {
  902. if (start != end)
  903. {
  904. fold = g_utf8_casefold (start, end - start);
  905. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  906. g_string_append (fixed, tmp);
  907. g_free (tmp);
  908. g_free (fold);
  909. }
  910. g_string_append_c (fixed, end[0]);
  911. start = end + 1;
  912. }
  913. if (start == text)
  914. {
  915. fold = g_utf8_casefold (text, -1);
  916. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  917. g_free (fold);
  918. g_string_free (fixed, TRUE);
  919. }
  920. else
  921. {
  922. if (start[0] != '\0' && start != end)
  923. {
  924. fold = g_utf8_casefold (start, end - start);
  925. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  926. g_string_append (fixed, tmp);
  927. g_free (tmp);
  928. g_free (fold);
  929. }
  930. result = g_string_free (fixed, FALSE);
  931. }
  932. return result;
  933. }
  /* strcmp() of the normalized forms of T1 and T2. */
  static int
  str_utf8_compare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  /* Compare the normalized forms of T1 and T2 over the length of the shorter
   * of the two. */
  static int
  str_utf8_ncompare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const int order = strncmp (lhs, rhs, min (strlen (lhs), strlen (rhs)));

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  /* strcmp() of the case-folded, normalized forms of T1 and T2. */
  static int
  str_utf8_casecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  /* Compare the case-folded, normalized forms of T1 and T2 over the length of
   * the shorter of the two. */
  static int
  str_utf8_ncasecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const int order = strncmp (lhs, rhs, min (strlen (lhs), strlen (rhs)));

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  /**
   * Measure how much of PREFIX matches the beginning of TEXT.
   *
   * Both strings are normalized with str_utf8_normalize() and then walked one
   * character at a time (str_utf8_cnext_char_safe()).  The walk stops at the
   * first pair of characters that differ in byte length or content, or when
   * either string is exhausted.
   *
   * @param text   string to look into
   * @param prefix string expected at the beginning of TEXT
   *
   * @return number of matching bytes, counted in the normalized form of PREFIX
   */
  static int
  str_utf8_prefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_normalize (text);
      char *norm_prefix = str_utf8_normalize (prefix);
      const char *tcur = norm_text;
      const char *pcur = norm_prefix;
      int matched;

      for (;;)
      {
          const char *tnext = tcur;
          const char *pnext = pcur;
          size_t tlen, plen;

          if (tcur[0] == '\0' || pcur[0] == '\0')
              break;

          /* look ahead by one character in each string */
          str_utf8_cnext_char_safe (&tnext);
          str_utf8_cnext_char_safe (&pnext);
          tlen = (size_t) (tnext - tcur);
          plen = (size_t) (pnext - pcur);

          /* characters of different byte length cannot be equal */
          if (tlen != plen || strncmp (tcur, pcur, tlen) != 0)
              break;

          tcur = tnext;
          pcur = pnext;
      }

      matched = pcur - norm_prefix;

      g_free (norm_text);
      g_free (norm_prefix);

      return matched;
  }
  /**
   * Measure how much of PREFIX matches the beginning of TEXT, ignoring case.
   *
   * Both strings are converted with str_utf8_casefold_normalize() and then
   * walked one character at a time (str_utf8_cnext_char_safe()).  The walk
   * stops at the first pair of characters that differ in byte length or
   * content, or when either string is exhausted.
   *
   * @param text   string to look into
   * @param prefix string expected at the beginning of TEXT
   *
   * @return number of matching bytes, counted in the folded form of PREFIX
   */
  static int
  str_utf8_caseprefix (const char *text, const char *prefix)
  {
      char *fold_text = str_utf8_casefold_normalize (text);
      char *fold_prefix = str_utf8_casefold_normalize (prefix);
      const char *tcur = fold_text;
      const char *pcur = fold_prefix;
      int matched;

      for (;;)
      {
          const char *tnext = tcur;
          const char *pnext = pcur;
          size_t tlen, plen;

          if (tcur[0] == '\0' || pcur[0] == '\0')
              break;

          /* look ahead by one character in each string */
          str_utf8_cnext_char_safe (&tnext);
          str_utf8_cnext_char_safe (&pnext);
          tlen = (size_t) (tnext - tcur);
          plen = (size_t) (pnext - pcur);

          /* characters of different byte length cannot be equal */
          if (tlen != plen || strncmp (tcur, pcur, tlen) != 0)
              break;

          tcur = tnext;
          pcur = pnext;
      }

      matched = pcur - fold_prefix;

      g_free (fold_text);
      g_free (fold_prefix);

      return matched;
  }
  1034. static char *
  1035. str_utf8_create_key_gen (const char *text, int case_sen,
  1036. gchar * (*keygen) (const gchar * text, gssize size))
  1037. {
  1038. char *result;
  1039. if (case_sen)
  1040. {
  1041. result = str_utf8_normalize (text);
  1042. }
  1043. else
  1044. {
  1045. gboolean dot;
  1046. GString *fixed;
  1047. const char *start, *end;
  1048. char *fold, *key;
  1049. dot = text[0] == '.';
  1050. fixed = g_string_sized_new (16);
  1051. if (!dot)
  1052. start = text;
  1053. else
  1054. {
  1055. start = text + 1;
  1056. g_string_append_c (fixed, '.');
  1057. }
  1058. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1059. {
  1060. if (start != end)
  1061. {
  1062. fold = g_utf8_casefold (start, end - start);
  1063. key = keygen (fold, -1);
  1064. g_string_append (fixed, key);
  1065. g_free (key);
  1066. g_free (fold);
  1067. }
  1068. g_string_append_c (fixed, end[0]);
  1069. start = end + 1;
  1070. }
  1071. if (start == text)
  1072. {
  1073. fold = g_utf8_casefold (start, -1);
  1074. result = keygen (fold, -1);
  1075. g_free (fold);
  1076. g_string_free (fixed, TRUE);
  1077. }
  1078. else if (dot && (start == text + 1))
  1079. {
  1080. fold = g_utf8_casefold (start, -1);
  1081. key = keygen (fold, -1);
  1082. g_string_append (fixed, key);
  1083. g_free (key);
  1084. g_free (fold);
  1085. result = g_string_free (fixed, FALSE);
  1086. }
  1087. else
  1088. {
  1089. if (start[0] != '\0' && start != end)
  1090. {
  1091. fold = g_utf8_casefold (start, end - start);
  1092. key = keygen (fold, -1);
  1093. g_string_append (fixed, key);
  1094. g_free (key);
  1095. g_free (fold);
  1096. }
  1097. result = g_string_free (fixed, FALSE);
  1098. }
  1099. }
  1100. return result;
  1101. }
  1102. static char *
  1103. str_utf8_create_key (const char *text, int case_sen)
  1104. {
  1105. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1106. }
  #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  /**
   * Build a key for TEXT using g_utf8_collate_key_for_filename() as the key
   * generator.  Compiled only when MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is
   * defined.
   *
   * @param text     string to build the key for
   * @param case_sen forwarded unchanged to str_utf8_create_key_gen()
   *
   * @return the key produced by str_utf8_create_key_gen()
   */
  static char *
  str_utf8_create_key_for_filename (const char *text, int case_sen)
  {
      char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

      return key;
  }
  #endif
  /**
   * Compare two keys byte-wise with strcmp().
   *
   * @param t1       first key
   * @param t2       second key
   * @param case_sen unused
   *
   * @return the strcmp() result for the two keys
   */
  static int
  str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
  {
      const int order = strcmp (t1, t2);

      (void) case_sen;

      return order;
  }
  /**
   * Free a key with g_free().
   *
   * @param key      key to free
   * @param case_sen unused
   */
  static void
  str_utf8_release_key (char *key, int case_sen)
  {
      g_free (key);

      (void) case_sen;
  }
  1126. struct str_class
  1127. str_utf8_init (void)
  1128. {
  1129. struct str_class result;
  1130. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1131. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1132. result.insert_replace_char = str_utf8_insert_replace_char;
  1133. result.is_valid_string = str_utf8_is_valid_string;
  1134. result.is_valid_char = str_utf8_is_valid_char;
  1135. result.cnext_char = str_utf8_cnext_char;
  1136. result.cprev_char = str_utf8_cprev_char;
  1137. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1138. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1139. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1140. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1141. result.char_isspace = str_utf8_isspace;
  1142. result.char_ispunct = str_utf8_ispunct;
  1143. result.char_isalnum = str_utf8_isalnum;
  1144. result.char_isdigit = str_utf8_isdigit;
  1145. result.char_isprint = str_utf8_isprint;
  1146. result.char_iscombiningmark = str_utf8_iscombiningmark;
  1147. result.char_toupper = str_utf8_toupper;
  1148. result.char_tolower = str_utf8_tolower;
  1149. result.length = str_utf8_length;
  1150. result.length2 = str_utf8_length2;
  1151. result.length_noncomb = str_utf8_length_noncomb;
  1152. result.fix_string = str_utf8_fix_string;
  1153. result.term_form = str_utf8_term_form;
  1154. result.fit_to_term = str_utf8_fit_to_term;
  1155. result.term_trim = str_utf8_term_trim;
  1156. result.term_width2 = str_utf8_term_width2;
  1157. result.term_width1 = str_utf8_term_width1;
  1158. result.term_char_width = str_utf8_term_char_width;
  1159. result.term_substring = str_utf8_term_substring;
  1160. result.trunc = str_utf8_trunc;
  1161. result.offset_to_pos = str_utf8_offset_to_pos;
  1162. result.column_to_pos = str_utf8_column_to_pos;
  1163. result.create_search_needle = str_utf8_create_search_needle;
  1164. result.release_search_needle = str_utf8_release_search_needle;
  1165. result.search_first = str_utf8_search_first;
  1166. result.search_last = str_utf8_search_last;
  1167. result.compare = str_utf8_compare;
  1168. result.ncompare = str_utf8_ncompare;
  1169. result.casecmp = str_utf8_casecmp;
  1170. result.ncasecmp = str_utf8_ncasecmp;
  1171. result.prefix = str_utf8_prefix;
  1172. result.caseprefix = str_utf8_caseprefix;
  1173. result.create_key = str_utf8_create_key;
  1174. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1175. /* case insensitive sort files in "a1 a2 a10" order */
  1176. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1177. #else
  1178. /* case insensitive sort files in "a1 a10 a2" order */
  1179. result.create_key_for_filename = str_utf8_create_key;
  1180. #endif
  1181. result.key_collate = str_utf8_key_collate;
  1182. result.release_key = str_utf8_release_key;
  1183. return result;
  1184. }