strutilutf8.c 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518
  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007-2021
  4. Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. This file is part of the Midnight Commander.
  8. The Midnight Commander is free software: you can redistribute it
  9. and/or modify it under the terms of the GNU General Public License as
  10. published by the Free Software Foundation, either version 3 of the License,
  11. or (at your option) any later version.
  12. The Midnight Commander is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <langinfo.h>
  22. #include <string.h>
  23. #include "lib/global.h"
  24. #include "lib/strutil.h"
  25. /* using function for utf-8 from glib */
  26. /*** global variables ****************************************************************************/
  27. /*** file scope macro definitions ****************************************************************/
  28. /*** file scope type declarations ****************************************************************/
/* Cursor state shared by the utf8_tool_* helpers while they build an output buffer. */
struct utf8_tool
{
    char *actual;               /* current write position in the output buffer */
    size_t remain;              /* bytes still available at 'actual' */
    const char *checked;        /* current read position in the source text */
    int ident;                  /* column counter compared against the to_ident targets */
    gboolean compose;           /* set when a combining mark has been copied */
};
/* Result of str_utf8_make_make_term_form(): displayable text plus its measurements. */
struct term_form
{
    char text[BUF_MEDIUM * 6];  /* NUL-terminated text produced for the terminal */
    size_t width;               /* number of terminal columns counted for 'text' */
    gboolean compose;           /* TRUE if a combining mark was seen in 'text' */
};
  43. /*** file scope variables ************************************************************************/
/* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER), emitted in place of invalid input */
static const char replch[] = "\xEF\xBF\xBD";
  45. /* --------------------------------------------------------------------------------------------- */
  46. /*** file scope functions ************************************************************************/
  47. /* --------------------------------------------------------------------------------------------- */
  48. static gboolean
  49. str_unichar_iscombiningmark (gunichar uni)
  50. {
  51. GUnicodeType type;
  52. type = g_unichar_type (uni);
  53. return (type == G_UNICODE_SPACING_MARK)
  54. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  55. }
  56. /* --------------------------------------------------------------------------------------------- */
  57. static void
  58. str_utf8_insert_replace_char (GString * buffer)
  59. {
  60. g_string_append (buffer, replch);
  61. }
  62. /* --------------------------------------------------------------------------------------------- */
  63. static gboolean
  64. str_utf8_is_valid_string (const char *text)
  65. {
  66. return g_utf8_validate (text, -1, NULL);
  67. }
  68. /* --------------------------------------------------------------------------------------------- */
  69. static int
  70. str_utf8_is_valid_char (const char *ch, size_t size)
  71. {
  72. switch (g_utf8_get_char_validated (ch, size))
  73. {
  74. case (gunichar) (-2):
  75. return (-2);
  76. case (gunichar) (-1):
  77. return (-1);
  78. default:
  79. return 1;
  80. }
  81. }
  82. /* --------------------------------------------------------------------------------------------- */
  83. static void
  84. str_utf8_cnext_char (const char **text)
  85. {
  86. (*text) = g_utf8_next_char (*text);
  87. }
  88. /* --------------------------------------------------------------------------------------------- */
  89. static void
  90. str_utf8_cprev_char (const char **text)
  91. {
  92. (*text) = g_utf8_prev_char (*text);
  93. }
  94. /* --------------------------------------------------------------------------------------------- */
  95. static void
  96. str_utf8_cnext_char_safe (const char **text)
  97. {
  98. if (str_utf8_is_valid_char (*text, -1) == 1)
  99. (*text) = g_utf8_next_char (*text);
  100. else
  101. (*text)++;
  102. }
  103. /* --------------------------------------------------------------------------------------------- */
  104. static void
  105. str_utf8_cprev_char_safe (const char **text)
  106. {
  107. const char *result, *t;
  108. result = g_utf8_prev_char (*text);
  109. t = result;
  110. str_utf8_cnext_char_safe (&t);
  111. if (t == *text)
  112. (*text) = result;
  113. else
  114. (*text)--;
  115. }
  116. /* --------------------------------------------------------------------------------------------- */
  117. static void
  118. str_utf8_fix_string (char *text)
  119. {
  120. while (text[0] != '\0')
  121. {
  122. gunichar uni;
  123. uni = g_utf8_get_char_validated (text, -1);
  124. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  125. text = g_utf8_next_char (text);
  126. else
  127. {
  128. text[0] = '?';
  129. text++;
  130. }
  131. }
  132. }
  133. /* --------------------------------------------------------------------------------------------- */
  134. static gboolean
  135. str_utf8_isspace (const char *text)
  136. {
  137. gunichar uni;
  138. uni = g_utf8_get_char_validated (text, -1);
  139. return g_unichar_isspace (uni);
  140. }
  141. /* --------------------------------------------------------------------------------------------- */
  142. static gboolean
  143. str_utf8_ispunct (const char *text)
  144. {
  145. gunichar uni;
  146. uni = g_utf8_get_char_validated (text, -1);
  147. return g_unichar_ispunct (uni);
  148. }
  149. /* --------------------------------------------------------------------------------------------- */
  150. static gboolean
  151. str_utf8_isalnum (const char *text)
  152. {
  153. gunichar uni;
  154. uni = g_utf8_get_char_validated (text, -1);
  155. return g_unichar_isalnum (uni);
  156. }
  157. /* --------------------------------------------------------------------------------------------- */
  158. static gboolean
  159. str_utf8_isdigit (const char *text)
  160. {
  161. gunichar uni;
  162. uni = g_utf8_get_char_validated (text, -1);
  163. return g_unichar_isdigit (uni);
  164. }
  165. /* --------------------------------------------------------------------------------------------- */
  166. static gboolean
  167. str_utf8_isprint (const char *ch)
  168. {
  169. gunichar uni;
  170. uni = g_utf8_get_char_validated (ch, -1);
  171. return g_unichar_isprint (uni);
  172. }
  173. /* --------------------------------------------------------------------------------------------- */
  174. static gboolean
  175. str_utf8_iscombiningmark (const char *ch)
  176. {
  177. gunichar uni;
  178. uni = g_utf8_get_char_validated (ch, -1);
  179. return str_unichar_iscombiningmark (uni);
  180. }
  181. /* --------------------------------------------------------------------------------------------- */
  182. static int
  183. str_utf8_cnext_noncomb_char (const char **text)
  184. {
  185. int count = 0;
  186. while ((*text)[0] != '\0')
  187. {
  188. str_utf8_cnext_char_safe (text);
  189. count++;
  190. if (!str_utf8_iscombiningmark (*text))
  191. break;
  192. }
  193. return count;
  194. }
  195. /* --------------------------------------------------------------------------------------------- */
  196. static int
  197. str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  198. {
  199. int count = 0;
  200. while ((*text) != begin)
  201. {
  202. str_utf8_cprev_char_safe (text);
  203. count++;
  204. if (!str_utf8_iscombiningmark (*text))
  205. break;
  206. }
  207. return count;
  208. }
  209. /* --------------------------------------------------------------------------------------------- */
  210. static gboolean
  211. str_utf8_toupper (const char *text, char **out, size_t * remain)
  212. {
  213. gunichar uni;
  214. size_t left;
  215. uni = g_utf8_get_char_validated (text, -1);
  216. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  217. return FALSE;
  218. uni = g_unichar_toupper (uni);
  219. left = g_unichar_to_utf8 (uni, NULL);
  220. if (left >= *remain)
  221. return FALSE;
  222. left = g_unichar_to_utf8 (uni, *out);
  223. (*out) += left;
  224. (*remain) -= left;
  225. return TRUE;
  226. }
  227. /* --------------------------------------------------------------------------------------------- */
  228. static gboolean
  229. str_utf8_tolower (const char *text, char **out, size_t * remain)
  230. {
  231. gunichar uni;
  232. size_t left;
  233. uni = g_utf8_get_char_validated (text, -1);
  234. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  235. return FALSE;
  236. uni = g_unichar_tolower (uni);
  237. left = g_unichar_to_utf8 (uni, NULL);
  238. if (left >= *remain)
  239. return FALSE;
  240. left = g_unichar_to_utf8 (uni, *out);
  241. (*out) += left;
  242. (*remain) -= left;
  243. return TRUE;
  244. }
  245. /* --------------------------------------------------------------------------------------------- */
/* Count the characters in text. Valid stretches are counted with g_utf8_strlen();
 * every byte at which validation stops is counted as one character. */
static int
str_utf8_length (const char *text)
{
    int result = 0;
    const char *start;
    const char *end;

    start = text;
    /* each pass handles one offending byte: 'end' is where validation stopped */
    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
    {
        /* count the valid run in front of the offending byte */
        if (start != end)
            result += g_utf8_strlen (start, end - start);
        /* the offending byte itself counts as one character */
        result++;
        /* resume validation right behind it */
        start = end + 1;
    }

    if (start == text)
        /* the loop never ran, so the whole string validated in one go */
        result = g_utf8_strlen (text, -1);
    else if (start[0] != '\0' && start != end)
        /* add the valid tail that follows the last offending byte */
        result += g_utf8_strlen (start, end - start);

    return result;
}
  266. /* --------------------------------------------------------------------------------------------- */
/* Variant of str_utf8_length() limited by size, which is consumed in bytes:
 * it is reduced by the byte length of every valid run and by one for every
 * offending byte, and is handed to g_utf8_strlen() as its byte limit. */
static int
str_utf8_length2 (const char *text, int size)
{
    int result = 0;
    const char *start;
    const char *end;

    start = text;
    /* each pass handles one offending byte while budget remains */
    while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
    {
        if (start != end)
        {
            /* count the valid run, but no further than the remaining budget */
            result += g_utf8_strlen (start, MIN (end - start, size));
            size -= end - start;
        }
        /* the offending byte counts only if the budget still covers it */
        result += (size > 0);
        size--;
        start = end + 1;
    }

    if (start == text)
        /* the loop never ran: count the string directly under the given limit */
        result = g_utf8_strlen (text, size);
    else if (start[0] != '\0' && start != end && size > 0)
        /* add the valid tail, again capped by the remaining budget */
        result += g_utf8_strlen (start, MIN (end - start, size));

    return result;
}
  291. /* --------------------------------------------------------------------------------------------- */
  292. static int
  293. str_utf8_length_noncomb (const char *text)
  294. {
  295. int result = 0;
  296. const char *t = text;
  297. while (t[0] != '\0')
  298. {
  299. str_utf8_cnext_noncomb_char (&t);
  300. result++;
  301. }
  302. return result;
  303. }
  304. /* --------------------------------------------------------------------------------------------- */
#if 0
/* Disabled code (compiled out by the surrounding #if 0).
 * Skips one character of *string, reduces *left by the bytes skipped
 * and appends '?' to buffer. */
static void
str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
{
    char *next;

    next = g_utf8_next_char (*string);
    (*left) -= next - (*string);
    (*string) = next;
    g_string_append_c (buffer, '?');
}
#endif
  316. /* --------------------------------------------------------------------------------------------- */
  317. static gchar *
  318. str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
  319. {
  320. if (mcerror != NULL)
  321. return g_strdup (mcerror->message);
  322. return g_strdup (def_msg != NULL ? def_msg : "");
  323. }
  324. /* --------------------------------------------------------------------------------------------- */
  325. static estr_t
  326. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
  327. {
  328. estr_t result = ESTR_SUCCESS;
  329. if (coder == str_cnv_not_convert)
  330. g_string_append_len (buffer, string, size);
  331. else
  332. result = str_nconvert (coder, string, size, buffer);
  333. return result;
  334. }
  335. /* --------------------------------------------------------------------------------------------- */
  336. /* utility function, that makes string valid in utf8 and all characters printable
  337. * return width of string too */
  338. static const struct term_form *
  339. str_utf8_make_make_term_form (const char *text, size_t length)
  340. {
  341. static struct term_form result;
  342. gunichar uni;
  343. size_t left;
  344. char *actual;
  345. result.text[0] = '\0';
  346. result.width = 0;
  347. result.compose = FALSE;
  348. actual = result.text;
  349. /* check if text start with combining character,
  350. * add space at begin in this case */
  351. if (length != 0 && text[0] != '\0')
  352. {
  353. uni = g_utf8_get_char_validated (text, -1);
  354. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
  355. && str_unichar_iscombiningmark (uni))
  356. {
  357. actual[0] = ' ';
  358. actual++;
  359. result.width++;
  360. result.compose = TRUE;
  361. }
  362. }
  363. while (length != 0 && text[0] != '\0')
  364. {
  365. uni = g_utf8_get_char_validated (text, -1);
  366. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  367. {
  368. if (g_unichar_isprint (uni))
  369. {
  370. left = g_unichar_to_utf8 (uni, actual);
  371. actual += left;
  372. if (str_unichar_iscombiningmark (uni))
  373. result.compose = TRUE;
  374. else
  375. {
  376. result.width++;
  377. if (g_unichar_iswide (uni))
  378. result.width++;
  379. }
  380. }
  381. else
  382. {
  383. actual[0] = '.';
  384. actual++;
  385. result.width++;
  386. }
  387. text = g_utf8_next_char (text);
  388. }
  389. else
  390. {
  391. text++;
  392. /*actual[0] = '?'; */
  393. memcpy (actual, replch, strlen (replch));
  394. actual += strlen (replch);
  395. result.width++;
  396. }
  397. if (length != (size_t) (-1))
  398. length--;
  399. }
  400. actual[0] = '\0';
  401. return &result;
  402. }
  403. /* --------------------------------------------------------------------------------------------- */
  404. static const char *
  405. str_utf8_term_form (const char *text)
  406. {
  407. static char result[BUF_MEDIUM * 6];
  408. const struct term_form *pre_form;
  409. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  410. if (pre_form->compose)
  411. {
  412. char *composed;
  413. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  414. g_strlcpy (result, composed, sizeof (result));
  415. g_free (composed);
  416. }
  417. else
  418. g_strlcpy (result, pre_form->text, sizeof (result));
  419. return result;
  420. }
  421. /* --------------------------------------------------------------------------------------------- */
  422. /* utility function, that copies all characters from checked to actual */
  423. static gboolean
  424. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  425. {
  426. tool->compose = FALSE;
  427. while (tool->checked[0] != '\0')
  428. {
  429. gunichar uni;
  430. size_t left;
  431. uni = g_utf8_get_char (tool->checked);
  432. tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
  433. left = g_unichar_to_utf8 (uni, NULL);
  434. if (tool->remain <= left)
  435. return FALSE;
  436. left = g_unichar_to_utf8 (uni, tool->actual);
  437. tool->actual += left;
  438. tool->remain -= left;
  439. tool->checked = g_utf8_next_char (tool->checked);
  440. }
  441. return TRUE;
  442. }
  443. /* --------------------------------------------------------------------------------------------- */
  444. /* utility function, that copies characters from checked to actual until ident is
  445. * smaller than to_ident */
  446. static gboolean
  447. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  448. {
  449. tool->compose = FALSE;
  450. while (tool->checked[0] != '\0')
  451. {
  452. gunichar uni;
  453. size_t left;
  454. int w = 0;
  455. uni = g_utf8_get_char (tool->checked);
  456. if (str_unichar_iscombiningmark (uni))
  457. tool->compose = TRUE;
  458. else
  459. {
  460. w = 1;
  461. if (g_unichar_iswide (uni))
  462. w++;
  463. if (tool->ident + w > to_ident)
  464. return TRUE;
  465. }
  466. left = g_unichar_to_utf8 (uni, NULL);
  467. if (tool->remain <= left)
  468. return FALSE;
  469. left = g_unichar_to_utf8 (uni, tool->actual);
  470. tool->actual += left;
  471. tool->remain -= left;
  472. tool->checked = g_utf8_next_char (tool->checked);
  473. tool->ident += w;
  474. }
  475. return TRUE;
  476. }
  477. /* --------------------------------------------------------------------------------------------- */
  478. /* utility function, adds count spaces to actual */
  479. static int
  480. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  481. {
  482. if (count <= 0)
  483. return 1;
  484. if (tool->remain <= (gsize) count)
  485. return 0;
  486. memset (tool->actual, ' ', count);
  487. tool->actual += count;
  488. tool->remain -= count;
  489. return 1;
  490. }
  491. /* --------------------------------------------------------------------------------------------- */
  492. /* utility function, adds one characters to actual */
  493. static int
  494. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  495. {
  496. if (tool->remain <= 1)
  497. return 0;
  498. tool->actual[0] = ch;
  499. tool->actual++;
  500. tool->remain--;
  501. return 1;
  502. }
  503. /* --------------------------------------------------------------------------------------------- */
  504. /* utility function, thah skips characters from checked until ident is greater or
  505. * equal to to_ident */
  506. static gboolean
  507. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  508. {
  509. gunichar uni;
  510. while (to_ident > tool->ident && tool->checked[0] != '\0')
  511. {
  512. uni = g_utf8_get_char (tool->checked);
  513. if (!str_unichar_iscombiningmark (uni))
  514. {
  515. tool->ident++;
  516. if (g_unichar_iswide (uni))
  517. tool->ident++;
  518. }
  519. tool->checked = g_utf8_next_char (tool->checked);
  520. }
  521. uni = g_utf8_get_char (tool->checked);
  522. while (str_unichar_iscombiningmark (uni))
  523. {
  524. tool->checked = g_utf8_next_char (tool->checked);
  525. uni = g_utf8_get_char (tool->checked);
  526. }
  527. return TRUE;
  528. }
  529. /* --------------------------------------------------------------------------------------------- */
  530. static void
  531. utf8_tool_compose (char *buffer, size_t size)
  532. {
  533. char *composed;
  534. composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  535. g_strlcpy (buffer, composed, size);
  536. g_free (composed);
  537. }
  538. /* --------------------------------------------------------------------------------------------- */
  539. static const char *
  540. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  541. {
  542. static char result[BUF_MEDIUM * 6];
  543. const struct term_form *pre_form;
  544. struct utf8_tool tool;
  545. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  546. tool.checked = pre_form->text;
  547. tool.actual = result;
  548. tool.remain = sizeof (result);
  549. tool.compose = FALSE;
  550. if (pre_form->width <= (gsize) width)
  551. {
  552. switch (HIDE_FIT (just_mode))
  553. {
  554. case J_CENTER_LEFT:
  555. case J_CENTER:
  556. tool.ident = (width - pre_form->width) / 2;
  557. break;
  558. case J_RIGHT:
  559. tool.ident = width - pre_form->width;
  560. break;
  561. default:
  562. tool.ident = 0;
  563. break;
  564. }
  565. utf8_tool_insert_space (&tool, tool.ident);
  566. utf8_tool_copy_chars_to_end (&tool);
  567. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  568. }
  569. else if (IS_FIT (just_mode))
  570. {
  571. tool.ident = 0;
  572. utf8_tool_copy_chars_to (&tool, width / 2);
  573. utf8_tool_insert_char (&tool, '~');
  574. tool.ident = 0;
  575. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  576. utf8_tool_copy_chars_to_end (&tool);
  577. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  578. }
  579. else
  580. {
  581. switch (HIDE_FIT (just_mode))
  582. {
  583. case J_CENTER:
  584. tool.ident = (width - pre_form->width) / 2;
  585. break;
  586. case J_RIGHT:
  587. tool.ident = width - pre_form->width;
  588. break;
  589. default:
  590. tool.ident = 0;
  591. break;
  592. }
  593. utf8_tool_skip_chars_to (&tool, 0);
  594. utf8_tool_insert_space (&tool, tool.ident);
  595. utf8_tool_copy_chars_to (&tool, width);
  596. utf8_tool_insert_space (&tool, width - tool.ident);
  597. }
  598. tool.actual[0] = '\0';
  599. if (tool.compose)
  600. utf8_tool_compose (result, sizeof (result));
  601. return result;
  602. }
  603. /* --------------------------------------------------------------------------------------------- */
  604. static const char *
  605. str_utf8_term_trim (const char *text, int width)
  606. {
  607. static char result[BUF_MEDIUM * 6];
  608. const struct term_form *pre_form;
  609. struct utf8_tool tool;
  610. if (width < 1)
  611. {
  612. result[0] = '\0';
  613. return result;
  614. }
  615. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  616. tool.checked = pre_form->text;
  617. tool.actual = result;
  618. tool.remain = sizeof (result);
  619. tool.compose = FALSE;
  620. if ((gsize) width >= pre_form->width)
  621. utf8_tool_copy_chars_to_end (&tool);
  622. else if (width <= 3)
  623. {
  624. memset (tool.actual, '.', width);
  625. tool.actual += width;
  626. tool.remain -= width;
  627. }
  628. else
  629. {
  630. memset (tool.actual, '.', 3);
  631. tool.actual += 3;
  632. tool.remain -= 3;
  633. tool.ident = 0;
  634. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  635. utf8_tool_copy_chars_to_end (&tool);
  636. }
  637. tool.actual[0] = '\0';
  638. if (tool.compose)
  639. utf8_tool_compose (result, sizeof (result));
  640. return result;
  641. }
  642. /* --------------------------------------------------------------------------------------------- */
  643. static int
  644. str_utf8_term_width2 (const char *text, size_t length)
  645. {
  646. const struct term_form *result;
  647. result = str_utf8_make_make_term_form (text, length);
  648. return result->width;
  649. }
  650. /* --------------------------------------------------------------------------------------------- */
  651. static int
  652. str_utf8_term_width1 (const char *text)
  653. {
  654. return str_utf8_term_width2 (text, (size_t) (-1));
  655. }
  656. /* --------------------------------------------------------------------------------------------- */
  657. static int
  658. str_utf8_term_char_width (const char *text)
  659. {
  660. gunichar uni;
  661. uni = g_utf8_get_char_validated (text, -1);
  662. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  663. }
  664. /* --------------------------------------------------------------------------------------------- */
  665. static const char *
  666. str_utf8_term_substring (const char *text, int start, int width)
  667. {
  668. static char result[BUF_MEDIUM * 6];
  669. const struct term_form *pre_form;
  670. struct utf8_tool tool;
  671. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  672. tool.checked = pre_form->text;
  673. tool.actual = result;
  674. tool.remain = sizeof (result);
  675. tool.compose = FALSE;
  676. tool.ident = -start;
  677. utf8_tool_skip_chars_to (&tool, 0);
  678. if (tool.ident < 0)
  679. tool.ident = 0;
  680. utf8_tool_insert_space (&tool, tool.ident);
  681. utf8_tool_copy_chars_to (&tool, width);
  682. utf8_tool_insert_space (&tool, width - tool.ident);
  683. tool.actual[0] = '\0';
  684. if (tool.compose)
  685. utf8_tool_compose (result, sizeof (result));
  686. return result;
  687. }
  688. /* --------------------------------------------------------------------------------------------- */
  689. static const char *
  690. str_utf8_trunc (const char *text, int width)
  691. {
  692. static char result[MC_MAXPATHLEN * 6 * 2];
  693. const struct term_form *pre_form;
  694. struct utf8_tool tool;
  695. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  696. tool.checked = pre_form->text;
  697. tool.actual = result;
  698. tool.remain = sizeof (result);
  699. tool.compose = FALSE;
  700. if (pre_form->width <= (gsize) width)
  701. utf8_tool_copy_chars_to_end (&tool);
  702. else
  703. {
  704. tool.ident = 0;
  705. utf8_tool_copy_chars_to (&tool, width / 2);
  706. utf8_tool_insert_char (&tool, '~');
  707. tool.ident = 0;
  708. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  709. utf8_tool_copy_chars_to_end (&tool);
  710. }
  711. tool.actual[0] = '\0';
  712. if (tool.compose)
  713. utf8_tool_compose (result, sizeof (result));
  714. return result;
  715. }
  716. /* --------------------------------------------------------------------------------------------- */
  717. static int
  718. str_utf8_offset_to_pos (const char *text, size_t length)
  719. {
  720. if (str_utf8_is_valid_string (text))
  721. return g_utf8_offset_to_pointer (text, length) - text;
  722. else
  723. {
  724. int result;
  725. GString *buffer;
  726. buffer = g_string_new (text);
  727. str_utf8_fix_string (buffer->str);
  728. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  729. g_string_free (buffer, TRUE);
  730. return result;
  731. }
  732. }
  733. /* --------------------------------------------------------------------------------------------- */
  734. static int
  735. str_utf8_column_to_pos (const char *text, size_t pos)
  736. {
  737. int result = 0;
  738. int width = 0;
  739. while (text[0] != '\0')
  740. {
  741. gunichar uni;
  742. uni = g_utf8_get_char_validated (text, 6);
  743. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  744. {
  745. if (g_unichar_isprint (uni))
  746. {
  747. if (!str_unichar_iscombiningmark (uni))
  748. {
  749. width++;
  750. if (g_unichar_iswide (uni))
  751. width++;
  752. }
  753. }
  754. else
  755. {
  756. width++;
  757. }
  758. text = g_utf8_next_char (text);
  759. }
  760. else
  761. {
  762. text++;
  763. width++;
  764. }
  765. if ((gsize) width > pos)
  766. return result;
  767. result++;
  768. }
  769. return result;
  770. }
  771. /* --------------------------------------------------------------------------------------------- */
  772. static char *
  773. str_utf8_create_search_needle (const char *needle, gboolean case_sen)
  774. {
  775. char *fold, *result;
  776. if (needle == NULL)
  777. return NULL;
  778. if (case_sen)
  779. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  780. fold = g_utf8_casefold (needle, -1);
  781. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  782. g_free (fold);
  783. return result;
  784. }
  785. /* --------------------------------------------------------------------------------------------- */
  786. static void
  787. str_utf8_release_search_needle (char *needle, gboolean case_sen)
  788. {
  789. (void) case_sen;
  790. g_free (needle);
  791. }
  792. /* --------------------------------------------------------------------------------------------- */
  793. static const char *
  794. str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
  795. {
  796. char *fold_text;
  797. char *deco_text;
  798. const char *match;
  799. const char *result = NULL;
  800. const char *m;
  801. fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
  802. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  803. match = deco_text;
  804. do
  805. {
  806. match = g_strstr_len (match, -1, search);
  807. if (match != NULL)
  808. {
  809. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  810. !str_utf8_iscombiningmark (match + strlen (search)))
  811. {
  812. result = text;
  813. m = deco_text;
  814. while (m < match)
  815. {
  816. str_utf8_cnext_noncomb_char (&m);
  817. str_utf8_cnext_noncomb_char (&result);
  818. }
  819. }
  820. else
  821. str_utf8_cnext_char (&match);
  822. }
  823. }
  824. while (match != NULL && result == NULL);
  825. g_free (deco_text);
  826. if (!case_sen)
  827. g_free (fold_text);
  828. return result;
  829. }
  830. /* --------------------------------------------------------------------------------------------- */
  831. static const char *
  832. str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
  833. {
  834. char *fold_text;
  835. char *deco_text;
  836. char *match;
  837. const char *result = NULL;
  838. const char *m;
  839. fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
  840. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  841. do
  842. {
  843. match = g_strrstr_len (deco_text, -1, search);
  844. if (match != NULL)
  845. {
  846. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  847. !str_utf8_iscombiningmark (match + strlen (search)))
  848. {
  849. result = text;
  850. m = deco_text;
  851. while (m < match)
  852. {
  853. str_utf8_cnext_noncomb_char (&m);
  854. str_utf8_cnext_noncomb_char (&result);
  855. }
  856. }
  857. else
  858. match[0] = '\0';
  859. }
  860. }
  861. while (match != NULL && result == NULL);
  862. g_free (deco_text);
  863. if (!case_sen)
  864. g_free (fold_text);
  865. return result;
  866. }
  867. /* --------------------------------------------------------------------------------------------- */
  868. static char *
  869. str_utf8_normalize (const char *text)
  870. {
  871. GString *fixed;
  872. char *tmp;
  873. char *result;
  874. const char *start;
  875. const char *end;
  876. /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
  877. * does the normalization and then converts UCS-4 back into UTF-8.
  878. * Since file names are composed of ASCII characters in most cases, we can speed up
  879. * utf8 normalization by checking if the heavyweight Unicode normalization is actually
  880. * needed. Normalization of ASCII string is no-op.
  881. */
  882. /* find out whether text is ASCII only */
  883. for (end = text; *end != '\0'; end++)
  884. if ((*end & 0x80) != 0)
  885. {
  886. /* found 2nd byte of utf8-encoded symbol */
  887. break;
  888. }
  889. /* if text is ASCII-only, return copy, normalize otherwise */
  890. if (*end == '\0')
  891. return g_strndup (text, end - text);
  892. fixed = g_string_sized_new (4);
  893. start = text;
  894. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  895. {
  896. if (start != end)
  897. {
  898. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  899. g_string_append (fixed, tmp);
  900. g_free (tmp);
  901. }
  902. g_string_append_c (fixed, end[0]);
  903. start = end + 1;
  904. }
  905. if (start == text)
  906. {
  907. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  908. g_string_free (fixed, TRUE);
  909. }
  910. else
  911. {
  912. if (start[0] != '\0' && start != end)
  913. {
  914. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  915. g_string_append (fixed, tmp);
  916. g_free (tmp);
  917. }
  918. result = g_string_free (fixed, FALSE);
  919. }
  920. return result;
  921. }
  922. /* --------------------------------------------------------------------------------------------- */
  923. static char *
  924. str_utf8_casefold_normalize (const char *text)
  925. {
  926. GString *fixed;
  927. char *tmp, *fold;
  928. char *result;
  929. const char *start;
  930. const char *end;
  931. fixed = g_string_sized_new (4);
  932. start = text;
  933. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  934. {
  935. if (start != end)
  936. {
  937. fold = g_utf8_casefold (start, end - start);
  938. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  939. g_string_append (fixed, tmp);
  940. g_free (tmp);
  941. g_free (fold);
  942. }
  943. g_string_append_c (fixed, end[0]);
  944. start = end + 1;
  945. }
  946. if (start == text)
  947. {
  948. fold = g_utf8_casefold (text, -1);
  949. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  950. g_free (fold);
  951. g_string_free (fixed, TRUE);
  952. }
  953. else
  954. {
  955. if (start[0] != '\0' && start != end)
  956. {
  957. fold = g_utf8_casefold (start, end - start);
  958. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  959. g_string_append (fixed, tmp);
  960. g_free (tmp);
  961. g_free (fold);
  962. }
  963. result = g_string_free (fixed, FALSE);
  964. }
  965. return result;
  966. }
  967. /* --------------------------------------------------------------------------------------------- */
  968. static int
  969. str_utf8_compare (const char *t1, const char *t2)
  970. {
  971. char *n1, *n2;
  972. int result;
  973. n1 = str_utf8_normalize (t1);
  974. n2 = str_utf8_normalize (t2);
  975. result = strcmp (n1, n2);
  976. g_free (n1);
  977. g_free (n2);
  978. return result;
  979. }
  980. /* --------------------------------------------------------------------------------------------- */
  981. static int
  982. str_utf8_ncompare (const char *t1, const char *t2)
  983. {
  984. char *n1, *n2;
  985. size_t l1, l2;
  986. int result;
  987. n1 = str_utf8_normalize (t1);
  988. n2 = str_utf8_normalize (t2);
  989. l1 = strlen (n1);
  990. l2 = strlen (n2);
  991. result = strncmp (n1, n2, MIN (l1, l2));
  992. g_free (n1);
  993. g_free (n2);
  994. return result;
  995. }
  996. /* --------------------------------------------------------------------------------------------- */
  997. static int
  998. str_utf8_casecmp (const char *t1, const char *t2)
  999. {
  1000. char *n1, *n2;
  1001. int result;
  1002. n1 = str_utf8_casefold_normalize (t1);
  1003. n2 = str_utf8_casefold_normalize (t2);
  1004. result = strcmp (n1, n2);
  1005. g_free (n1);
  1006. g_free (n2);
  1007. return result;
  1008. }
  1009. /* --------------------------------------------------------------------------------------------- */
  1010. static int
  1011. str_utf8_ncasecmp (const char *t1, const char *t2)
  1012. {
  1013. char *n1, *n2;
  1014. size_t l1, l2;
  1015. int result;
  1016. n1 = str_utf8_casefold_normalize (t1);
  1017. n2 = str_utf8_casefold_normalize (t2);
  1018. l1 = strlen (n1);
  1019. l2 = strlen (n2);
  1020. result = strncmp (n1, n2, MIN (l1, l2));
  1021. g_free (n1);
  1022. g_free (n2);
  1023. return result;
  1024. }
  1025. /* --------------------------------------------------------------------------------------------- */
  1026. static int
  1027. str_utf8_prefix (const char *text, const char *prefix)
  1028. {
  1029. char *t, *p;
  1030. const char *nt, *np;
  1031. const char *nnt, *nnp;
  1032. int result;
  1033. t = str_utf8_normalize (text);
  1034. p = str_utf8_normalize (prefix);
  1035. nt = t;
  1036. np = p;
  1037. nnt = t;
  1038. nnp = p;
  1039. while (nt[0] != '\0' && np[0] != '\0')
  1040. {
  1041. str_utf8_cnext_char_safe (&nnt);
  1042. str_utf8_cnext_char_safe (&nnp);
  1043. if (nnt - nt != nnp - np)
  1044. break;
  1045. if (strncmp (nt, np, nnt - nt) != 0)
  1046. break;
  1047. nt = nnt;
  1048. np = nnp;
  1049. }
  1050. result = np - p;
  1051. g_free (t);
  1052. g_free (p);
  1053. return result;
  1054. }
  1055. /* --------------------------------------------------------------------------------------------- */
  1056. static int
  1057. str_utf8_caseprefix (const char *text, const char *prefix)
  1058. {
  1059. char *t, *p;
  1060. const char *nt, *np;
  1061. const char *nnt, *nnp;
  1062. int result;
  1063. t = str_utf8_casefold_normalize (text);
  1064. p = str_utf8_casefold_normalize (prefix);
  1065. nt = t;
  1066. np = p;
  1067. nnt = t;
  1068. nnp = p;
  1069. while (nt[0] != '\0' && np[0] != '\0')
  1070. {
  1071. str_utf8_cnext_char_safe (&nnt);
  1072. str_utf8_cnext_char_safe (&nnp);
  1073. if (nnt - nt != nnp - np)
  1074. break;
  1075. if (strncmp (nt, np, nnt - nt) != 0)
  1076. break;
  1077. nt = nnt;
  1078. np = nnp;
  1079. }
  1080. result = np - p;
  1081. g_free (t);
  1082. g_free (p);
  1083. return result;
  1084. }
  1085. /* --------------------------------------------------------------------------------------------- */
  1086. static char *
  1087. str_utf8_create_key_gen (const char *text, gboolean case_sen,
  1088. gchar * (*keygen) (const gchar * text, gssize size))
  1089. {
  1090. char *result;
  1091. if (case_sen)
  1092. result = str_utf8_normalize (text);
  1093. else
  1094. {
  1095. gboolean dot;
  1096. GString *fixed;
  1097. const char *start, *end;
  1098. char *fold, *key;
  1099. dot = text[0] == '.';
  1100. fixed = g_string_sized_new (16);
  1101. if (!dot)
  1102. start = text;
  1103. else
  1104. {
  1105. start = text + 1;
  1106. g_string_append_c (fixed, '.');
  1107. }
  1108. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1109. {
  1110. if (start != end)
  1111. {
  1112. fold = g_utf8_casefold (start, end - start);
  1113. key = keygen (fold, -1);
  1114. g_string_append (fixed, key);
  1115. g_free (key);
  1116. g_free (fold);
  1117. }
  1118. g_string_append_c (fixed, end[0]);
  1119. start = end + 1;
  1120. }
  1121. if (start == text)
  1122. {
  1123. fold = g_utf8_casefold (start, -1);
  1124. result = keygen (fold, -1);
  1125. g_free (fold);
  1126. g_string_free (fixed, TRUE);
  1127. }
  1128. else if (dot && (start == text + 1))
  1129. {
  1130. fold = g_utf8_casefold (start, -1);
  1131. key = keygen (fold, -1);
  1132. g_string_append (fixed, key);
  1133. g_free (key);
  1134. g_free (fold);
  1135. result = g_string_free (fixed, FALSE);
  1136. }
  1137. else
  1138. {
  1139. if (start[0] != '\0' && start != end)
  1140. {
  1141. fold = g_utf8_casefold (start, end - start);
  1142. key = keygen (fold, -1);
  1143. g_string_append (fixed, key);
  1144. g_free (key);
  1145. g_free (fold);
  1146. }
  1147. result = g_string_free (fixed, FALSE);
  1148. }
  1149. }
  1150. return result;
  1151. }
  1152. /* --------------------------------------------------------------------------------------------- */
  1153. static char *
  1154. str_utf8_create_key (const char *text, gboolean case_sen)
  1155. {
  1156. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1157. }
  1158. /* --------------------------------------------------------------------------------------------- */
  1159. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1160. static char *
  1161. str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
  1162. {
  1163. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
  1164. }
  1165. #endif
  1166. /* --------------------------------------------------------------------------------------------- */
  1167. static int
  1168. str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
  1169. {
  1170. (void) case_sen;
  1171. return strcmp (t1, t2);
  1172. }
  1173. /* --------------------------------------------------------------------------------------------- */
  1174. static void
  1175. str_utf8_release_key (char *key, gboolean case_sen)
  1176. {
  1177. (void) case_sen;
  1178. g_free (key);
  1179. }
  1180. /* --------------------------------------------------------------------------------------------- */
  1181. /*** public functions ****************************************************************************/
  1182. /* --------------------------------------------------------------------------------------------- */
  1183. struct str_class
  1184. str_utf8_init (void)
  1185. {
  1186. struct str_class result;
  1187. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1188. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1189. result.insert_replace_char = str_utf8_insert_replace_char;
  1190. result.is_valid_string = str_utf8_is_valid_string;
  1191. result.is_valid_char = str_utf8_is_valid_char;
  1192. result.cnext_char = str_utf8_cnext_char;
  1193. result.cprev_char = str_utf8_cprev_char;
  1194. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1195. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1196. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1197. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1198. result.char_isspace = str_utf8_isspace;
  1199. result.char_ispunct = str_utf8_ispunct;
  1200. result.char_isalnum = str_utf8_isalnum;
  1201. result.char_isdigit = str_utf8_isdigit;
  1202. result.char_isprint = str_utf8_isprint;
  1203. result.char_iscombiningmark = str_utf8_iscombiningmark;
  1204. result.char_toupper = str_utf8_toupper;
  1205. result.char_tolower = str_utf8_tolower;
  1206. result.length = str_utf8_length;
  1207. result.length2 = str_utf8_length2;
  1208. result.length_noncomb = str_utf8_length_noncomb;
  1209. result.fix_string = str_utf8_fix_string;
  1210. result.term_form = str_utf8_term_form;
  1211. result.fit_to_term = str_utf8_fit_to_term;
  1212. result.term_trim = str_utf8_term_trim;
  1213. result.term_width2 = str_utf8_term_width2;
  1214. result.term_width1 = str_utf8_term_width1;
  1215. result.term_char_width = str_utf8_term_char_width;
  1216. result.term_substring = str_utf8_term_substring;
  1217. result.trunc = str_utf8_trunc;
  1218. result.offset_to_pos = str_utf8_offset_to_pos;
  1219. result.column_to_pos = str_utf8_column_to_pos;
  1220. result.create_search_needle = str_utf8_create_search_needle;
  1221. result.release_search_needle = str_utf8_release_search_needle;
  1222. result.search_first = str_utf8_search_first;
  1223. result.search_last = str_utf8_search_last;
  1224. result.compare = str_utf8_compare;
  1225. result.ncompare = str_utf8_ncompare;
  1226. result.casecmp = str_utf8_casecmp;
  1227. result.ncasecmp = str_utf8_ncasecmp;
  1228. result.prefix = str_utf8_prefix;
  1229. result.caseprefix = str_utf8_caseprefix;
  1230. result.create_key = str_utf8_create_key;
  1231. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1232. /* case insensitive sort files in "a1 a2 a10" order */
  1233. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1234. #else
  1235. /* case insensitive sort files in "a1 a10 a2" order */
  1236. result.create_key_for_filename = str_utf8_create_key;
  1237. #endif
  1238. result.key_collate = str_utf8_key_collate;
  1239. result.release_key = str_utf8_release_key;
  1240. return result;
  1241. }
  1242. /* --------------------------------------------------------------------------------------------- */