strutilutf8.c 40 KB


  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007-2024
  4. Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. This file is part of the Midnight Commander.
  8. The Midnight Commander is free software: you can redistribute it
  9. and/or modify it under the terms of the GNU General Public License as
  10. published by the Free Software Foundation, either version 3 of the License,
  11. or (at your option) any later version.
  12. The Midnight Commander is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <langinfo.h>
  22. #include <limits.h> /* MB_LEN_MAX */
  23. #include <string.h>
  24. #include "lib/global.h"
  25. #include "lib/strutil.h"
  26. /* uses the UTF-8 functions provided by GLib */
  27. /*** global variables ****************************************************************************/
  28. /*** file scope macro definitions ****************************************************************/
  29. /*** file scope type declarations ****************************************************************/
  30. struct utf8_tool
  31. {
  32. char *actual;
  33. size_t remain;
  34. const char *checked;
  35. int ident;
  36. gboolean compose;
  37. };
  38. struct term_form
  39. {
  40. char text[BUF_MEDIUM * MB_LEN_MAX];
  41. size_t width;
  42. gboolean compose;
  43. };
  44. /*** forward declarations (file scope functions) *************************************************/
  45. /*** file scope variables ************************************************************************/
  /* UTF-8 encoding of U+FFFD, written in place of bytes that are not valid UTF-8 */
  static const char replch[] = "\xEF\xBF\xBD";
  47. /* --------------------------------------------------------------------------------------------- */
  48. /*** file scope functions ************************************************************************/
  49. /* --------------------------------------------------------------------------------------------- */
  50. static gboolean
  51. str_unichar_iscombiningmark (gunichar uni)
  52. {
  53. GUnicodeType type;
  54. type = g_unichar_type (uni);
  55. return (type == G_UNICODE_SPACING_MARK) || (type == G_UNICODE_ENCLOSING_MARK)
  56. || (type == G_UNICODE_NON_SPACING_MARK);
  57. }
  58. /* --------------------------------------------------------------------------------------------- */
  59. static void
  60. str_utf8_insert_replace_char (GString *buffer)
  61. {
  62. g_string_append (buffer, replch);
  63. }
  64. /* --------------------------------------------------------------------------------------------- */
  65. static gboolean
  66. str_utf8_is_valid_string (const char *text)
  67. {
  68. return g_utf8_validate (text, -1, NULL);
  69. }
  70. /* --------------------------------------------------------------------------------------------- */
  71. static int
  72. str_utf8_is_valid_char (const char *ch, size_t size)
  73. {
  74. switch (g_utf8_get_char_validated (ch, size))
  75. {
  76. case (gunichar) (-2):
  77. return (-2);
  78. case (gunichar) (-1):
  79. return (-1);
  80. default:
  81. return 1;
  82. }
  83. }
  84. /* --------------------------------------------------------------------------------------------- */
  /* Advance *TEXT by one UTF-8 character; no validation is performed here. */
  static void
  str_utf8_cnext_char (const char **text)
  {
      const char *here = *text;

      *text = g_utf8_next_char (here);
  }
  90. /* --------------------------------------------------------------------------------------------- */
  /* Move *TEXT back by one UTF-8 character; no validation is performed here. */
  static void
  str_utf8_cprev_char (const char **text)
  {
      const char *here = *text;

      *text = g_utf8_prev_char (here);
  }
  96. /* --------------------------------------------------------------------------------------------- */
  /* Advance *TEXT by one character if a valid one starts there, otherwise by a single byte. */
  static void
  str_utf8_cnext_char_safe (const char **text)
  {
      const char *here = *text;

      if (str_utf8_is_valid_char (here, -1) != 1)
          *text = here + 1;
      else
          *text = g_utf8_next_char (here);
  }
  105. /* --------------------------------------------------------------------------------------------- */
  /* Step *TEXT backwards by one character.
   *
   * The candidate found by g_utf8_prev_char() is accepted only if stepping forward from it
   * with str_utf8_cnext_char_safe() lands exactly on the original position; otherwise the
   * pointer is moved back by a single byte.
   */
  static void
  str_utf8_cprev_char_safe (const char **text)
  {
      const char *candidate = g_utf8_prev_char (*text);
      const char *probe = candidate;

      str_utf8_cnext_char_safe (&probe);
      *text = (probe == *text) ? candidate : *text - 1;
  }
  118. /* --------------------------------------------------------------------------------------------- */
  119. static void
  120. str_utf8_fix_string (char *text)
  121. {
  122. while (text[0] != '\0')
  123. {
  124. gunichar uni;
  125. uni = g_utf8_get_char_validated (text, -1);
  126. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  127. text = g_utf8_next_char (text);
  128. else
  129. {
  130. text[0] = '?';
  131. text++;
  132. }
  133. }
  134. }
  135. /* --------------------------------------------------------------------------------------------- */
  136. static gboolean
  137. str_utf8_isspace (const char *text)
  138. {
  139. gunichar uni;
  140. uni = g_utf8_get_char_validated (text, -1);
  141. return g_unichar_isspace (uni);
  142. }
  143. /* --------------------------------------------------------------------------------------------- */
  144. static gboolean
  145. str_utf8_ispunct (const char *text)
  146. {
  147. gunichar uni;
  148. uni = g_utf8_get_char_validated (text, -1);
  149. return g_unichar_ispunct (uni);
  150. }
  151. /* --------------------------------------------------------------------------------------------- */
  152. static gboolean
  153. str_utf8_isalnum (const char *text)
  154. {
  155. gunichar uni;
  156. uni = g_utf8_get_char_validated (text, -1);
  157. return g_unichar_isalnum (uni);
  158. }
  159. /* --------------------------------------------------------------------------------------------- */
  160. static gboolean
  161. str_utf8_isdigit (const char *text)
  162. {
  163. gunichar uni;
  164. uni = g_utf8_get_char_validated (text, -1);
  165. return g_unichar_isdigit (uni);
  166. }
  167. /* --------------------------------------------------------------------------------------------- */
  168. static gboolean
  169. str_utf8_isprint (const char *ch)
  170. {
  171. gunichar uni;
  172. uni = g_utf8_get_char_validated (ch, -1);
  173. return g_unichar_isprint (uni);
  174. }
  175. /* --------------------------------------------------------------------------------------------- */
  176. static gboolean
  177. str_utf8_iscombiningmark (const char *ch)
  178. {
  179. gunichar uni;
  180. uni = g_utf8_get_char_validated (ch, -1);
  181. return str_unichar_iscombiningmark (uni);
  182. }
  183. /* --------------------------------------------------------------------------------------------- */
  /* Advance *TEXT past one character and any combining marks that follow it.
   *
   * Stops at the end of the string. Returns the number of single steps taken
   * (0 when *TEXT already points at the terminating NUL).
   */
  static int
  str_utf8_cnext_noncomb_char (const char **text)
  {
      int steps = 0;

      if ((*text)[0] == '\0')
          return 0;

      do
      {
          str_utf8_cnext_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

      return steps;
  }
  197. /* --------------------------------------------------------------------------------------------- */
  /* Move *TEXT backwards until it rests on a character that is not a combining mark.
   *
   * Never moves before BEGIN. Returns the number of single steps taken
   * (0 when *TEXT already equals BEGIN).
   */
  static int
  str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  {
      int steps = 0;

      if (*text == begin)
          return 0;

      do
      {
          str_utf8_cprev_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && *text != begin);

      return steps;
  }
  211. /* --------------------------------------------------------------------------------------------- */
  212. static gboolean
  213. str_utf8_toupper (const char *text, char **out, size_t *remain)
  214. {
  215. gunichar uni;
  216. size_t left;
  217. uni = g_utf8_get_char_validated (text, -1);
  218. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  219. return FALSE;
  220. uni = g_unichar_toupper (uni);
  221. left = g_unichar_to_utf8 (uni, NULL);
  222. if (left >= *remain)
  223. return FALSE;
  224. left = g_unichar_to_utf8 (uni, *out);
  225. (*out) += left;
  226. (*remain) -= left;
  227. return TRUE;
  228. }
  229. /* --------------------------------------------------------------------------------------------- */
  230. static gboolean
  231. str_utf8_tolower (const char *text, char **out, size_t *remain)
  232. {
  233. gunichar uni;
  234. size_t left;
  235. uni = g_utf8_get_char_validated (text, -1);
  236. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  237. return FALSE;
  238. uni = g_unichar_tolower (uni);
  239. left = g_unichar_to_utf8 (uni, NULL);
  240. if (left >= *remain)
  241. return FALSE;
  242. left = g_unichar_to_utf8 (uni, *out);
  243. (*out) += left;
  244. (*remain) -= left;
  245. return TRUE;
  246. }
  247. /* --------------------------------------------------------------------------------------------- */
  /* Count the characters of TEXT, tolerating invalid UTF-8.
   *
   * Each valid run is measured with g_utf8_strlen(); every byte at which validation
   * stops is counted as one character.
   */
  static int
  str_utf8_length (const char *text)
  {
      int count = 0;
      const char *chunk = text;
      const char *stop;

      while (!g_utf8_validate (chunk, -1, &stop) && chunk[0] != '\0')
      {
          /* valid prefix of this chunk, if any */
          if (chunk != stop)
              count += g_utf8_strlen (chunk, stop - chunk);

          /* the offending byte itself */
          count++;
          chunk = stop + 1;
      }

      /* nothing invalid was met: measure the whole string in one go */
      if (chunk == text)
          return g_utf8_strlen (text, -1);

      /* valid tail after the last invalid byte */
      if (chunk[0] != '\0' && chunk != stop)
          count += g_utf8_strlen (chunk, stop - chunk);

      return count;
  }
  268. /* --------------------------------------------------------------------------------------------- */
  /* Count the characters of TEXT like str_utf8_length(), but with a budget of SIZE bytes.
   *
   * Valid runs are measured with g_utf8_strlen() limited to the remaining budget;
   * an invalid byte is counted as one character while budget is left.
   */
  static int
  str_utf8_length2 (const char *text, int size)
  {
      int count = 0;
      const char *chunk = text;
      const char *stop;

      while (!g_utf8_validate (chunk, -1, &stop) && chunk[0] != '\0' && size > 0)
      {
          if (chunk != stop)
          {
              count += g_utf8_strlen (chunk, MIN (stop - chunk, size));
              size -= stop - chunk;
          }

          /* the invalid byte counts only if the budget is not used up yet */
          if (size > 0)
              count++;
          size--;
          chunk = stop + 1;
      }

      /* the loop body never ran: measure from the start with the given limit */
      if (chunk == text)
          return g_utf8_strlen (text, size);

      /* valid tail after the last invalid byte */
      if (chunk[0] != '\0' && chunk != stop && size > 0)
          count += g_utf8_strlen (chunk, MIN (stop - chunk, size));

      return count;
  }
  293. /* --------------------------------------------------------------------------------------------- */
  /* Count the characters of TEXT, treating a character plus its trailing combining marks as one. */
  static int
  str_utf8_length_noncomb (const char *text)
  {
      const char *cursor;
      int count;

      for (cursor = text, count = 0; cursor[0] != '\0'; count++)
          str_utf8_cnext_noncomb_char (&cursor);

      return count;
  }
  306. /* --------------------------------------------------------------------------------------------- */
  #if 0
  /* Compiled out. Skips one character of *STRING, reduces *LEFT by its byte length
   * and appends '?' to BUFFER. */
  static void
  str_utf8_questmark_sustb (char **string, size_t *left, GString *buffer)
  {
      char *const following = g_utf8_next_char (*string);

      *left -= following - *string;
      *string = following;
      g_string_append_c (buffer, '?');
  }
  #endif
  318. /* --------------------------------------------------------------------------------------------- */
  319. static gchar *
  320. str_utf8_conv_gerror_message (GError *mcerror, const char *def_msg)
  321. {
  322. if (mcerror != NULL)
  323. return g_strdup (mcerror->message);
  324. return g_strdup (def_msg != NULL ? def_msg : "");
  325. }
  326. /* --------------------------------------------------------------------------------------------- */
  327. static estr_t
  328. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString *buffer)
  329. {
  330. estr_t result = ESTR_SUCCESS;
  331. if (coder == str_cnv_not_convert)
  332. g_string_append_len (buffer, string, size);
  333. else
  334. result = str_nconvert (coder, string, size, buffer);
  335. return result;
  336. }
  337. /* --------------------------------------------------------------------------------------------- */
  338. /* utility function, that makes string valid in utf8 and all characters printable
  339. * return width of string too */
  340. static const struct term_form *
  341. str_utf8_make_make_term_form (const char *text, size_t length)
  342. {
  343. static struct term_form result;
  344. gunichar uni;
  345. size_t left;
  346. char *actual;
  347. result.text[0] = '\0';
  348. result.width = 0;
  349. result.compose = FALSE;
  350. actual = result.text;
  351. /* check if text start with combining character,
  352. * add space at begin in this case */
  353. if (length != 0 && text[0] != '\0')
  354. {
  355. uni = g_utf8_get_char_validated (text, -1);
  356. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
  357. && str_unichar_iscombiningmark (uni))
  358. {
  359. actual[0] = ' ';
  360. actual++;
  361. result.width++;
  362. result.compose = TRUE;
  363. }
  364. }
  365. while (length != 0 && text[0] != '\0')
  366. {
  367. uni = g_utf8_get_char_validated (text, -1);
  368. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  369. {
  370. if (g_unichar_isprint (uni))
  371. {
  372. left = g_unichar_to_utf8 (uni, actual);
  373. actual += left;
  374. if (str_unichar_iscombiningmark (uni))
  375. result.compose = TRUE;
  376. else
  377. {
  378. result.width++;
  379. if (g_unichar_iswide (uni))
  380. result.width++;
  381. }
  382. }
  383. else
  384. {
  385. actual[0] = '.';
  386. actual++;
  387. result.width++;
  388. }
  389. text = g_utf8_next_char (text);
  390. }
  391. else
  392. {
  393. text++;
  394. /*actual[0] = '?'; */
  395. memcpy (actual, replch, strlen (replch));
  396. actual += strlen (replch);
  397. result.width++;
  398. }
  399. if (length != (size_t) (-1))
  400. length--;
  401. }
  402. actual[0] = '\0';
  403. return &result;
  404. }
  405. /* --------------------------------------------------------------------------------------------- */
  406. static const char *
  407. str_utf8_term_form (const char *text)
  408. {
  409. static char result[BUF_MEDIUM * MB_LEN_MAX];
  410. const struct term_form *pre_form;
  411. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  412. if (pre_form->compose)
  413. {
  414. char *composed;
  415. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  416. g_strlcpy (result, composed, sizeof (result));
  417. g_free (composed);
  418. }
  419. else
  420. g_strlcpy (result, pre_form->text, sizeof (result));
  421. return result;
  422. }
  423. /* --------------------------------------------------------------------------------------------- */
  424. /* utility function, that copies all characters from checked to actual */
  425. static gboolean
  426. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  427. {
  428. tool->compose = FALSE;
  429. while (tool->checked[0] != '\0')
  430. {
  431. gunichar uni;
  432. size_t left;
  433. uni = g_utf8_get_char (tool->checked);
  434. tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
  435. left = g_unichar_to_utf8 (uni, NULL);
  436. if (tool->remain <= left)
  437. return FALSE;
  438. left = g_unichar_to_utf8 (uni, tool->actual);
  439. tool->actual += left;
  440. tool->remain -= left;
  441. tool->checked = g_utf8_next_char (tool->checked);
  442. }
  443. return TRUE;
  444. }
  445. /* --------------------------------------------------------------------------------------------- */
  446. /* utility function, that copies characters from checked to actual until ident is
  447. * smaller than to_ident */
  448. static gboolean
  449. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  450. {
  451. tool->compose = FALSE;
  452. while (tool->checked[0] != '\0')
  453. {
  454. gunichar uni;
  455. size_t left;
  456. int w = 0;
  457. uni = g_utf8_get_char (tool->checked);
  458. if (str_unichar_iscombiningmark (uni))
  459. tool->compose = TRUE;
  460. else
  461. {
  462. w = 1;
  463. if (g_unichar_iswide (uni))
  464. w++;
  465. if (tool->ident + w > to_ident)
  466. return TRUE;
  467. }
  468. left = g_unichar_to_utf8 (uni, NULL);
  469. if (tool->remain <= left)
  470. return FALSE;
  471. left = g_unichar_to_utf8 (uni, tool->actual);
  472. tool->actual += left;
  473. tool->remain -= left;
  474. tool->checked = g_utf8_next_char (tool->checked);
  475. tool->ident += w;
  476. }
  477. return TRUE;
  478. }
  479. /* --------------------------------------------------------------------------------------------- */
  480. /* utility function, adds count spaces to actual */
  481. static int
  482. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  483. {
  484. if (count <= 0)
  485. return 1;
  486. if (tool->remain <= (gsize) count)
  487. return 0;
  488. memset (tool->actual, ' ', count);
  489. tool->actual += count;
  490. tool->remain -= count;
  491. return 1;
  492. }
  493. /* --------------------------------------------------------------------------------------------- */
  494. /* utility function, adds one characters to actual */
  495. static int
  496. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  497. {
  498. if (tool->remain <= 1)
  499. return 0;
  500. tool->actual[0] = ch;
  501. tool->actual++;
  502. tool->remain--;
  503. return 1;
  504. }
  505. /* --------------------------------------------------------------------------------------------- */
  506. /* utility function, thah skips characters from checked until ident is greater or
  507. * equal to to_ident */
  508. static gboolean
  509. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  510. {
  511. gunichar uni;
  512. while (to_ident > tool->ident && tool->checked[0] != '\0')
  513. {
  514. uni = g_utf8_get_char (tool->checked);
  515. if (!str_unichar_iscombiningmark (uni))
  516. {
  517. tool->ident++;
  518. if (g_unichar_iswide (uni))
  519. tool->ident++;
  520. }
  521. tool->checked = g_utf8_next_char (tool->checked);
  522. }
  523. uni = g_utf8_get_char (tool->checked);
  524. while (str_unichar_iscombiningmark (uni))
  525. {
  526. tool->checked = g_utf8_next_char (tool->checked);
  527. uni = g_utf8_get_char (tool->checked);
  528. }
  529. return TRUE;
  530. }
  531. /* --------------------------------------------------------------------------------------------- */
  532. static void
  533. utf8_tool_compose (char *buffer, size_t size)
  534. {
  535. char *composed;
  536. composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  537. g_strlcpy (buffer, composed, size);
  538. g_free (composed);
  539. }
  540. /* --------------------------------------------------------------------------------------------- */
  541. static const char *
  542. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  543. {
  544. static char result[BUF_MEDIUM * MB_LEN_MAX];
  545. const struct term_form *pre_form;
  546. struct utf8_tool tool;
  547. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  548. tool.checked = pre_form->text;
  549. tool.actual = result;
  550. tool.remain = sizeof (result);
  551. tool.compose = FALSE;
  552. if (pre_form->width <= (gsize) width)
  553. {
  554. switch (HIDE_FIT (just_mode))
  555. {
  556. case J_CENTER_LEFT:
  557. case J_CENTER:
  558. tool.ident = (width - pre_form->width) / 2;
  559. break;
  560. case J_RIGHT:
  561. tool.ident = width - pre_form->width;
  562. break;
  563. default:
  564. tool.ident = 0;
  565. break;
  566. }
  567. utf8_tool_insert_space (&tool, tool.ident);
  568. utf8_tool_copy_chars_to_end (&tool);
  569. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  570. }
  571. else if (IS_FIT (just_mode))
  572. {
  573. tool.ident = 0;
  574. utf8_tool_copy_chars_to (&tool, width / 2);
  575. utf8_tool_insert_char (&tool, '~');
  576. tool.ident = 0;
  577. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  578. utf8_tool_copy_chars_to_end (&tool);
  579. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  580. }
  581. else
  582. {
  583. switch (HIDE_FIT (just_mode))
  584. {
  585. case J_CENTER:
  586. tool.ident = (width - pre_form->width) / 2;
  587. break;
  588. case J_RIGHT:
  589. tool.ident = width - pre_form->width;
  590. break;
  591. default:
  592. tool.ident = 0;
  593. break;
  594. }
  595. utf8_tool_skip_chars_to (&tool, 0);
  596. utf8_tool_insert_space (&tool, tool.ident);
  597. utf8_tool_copy_chars_to (&tool, width);
  598. utf8_tool_insert_space (&tool, width - tool.ident);
  599. }
  600. tool.actual[0] = '\0';
  601. if (tool.compose)
  602. utf8_tool_compose (result, sizeof (result));
  603. return result;
  604. }
  605. /* --------------------------------------------------------------------------------------------- */
  606. static const char *
  607. str_utf8_term_trim (const char *text, int width)
  608. {
  609. static char result[BUF_MEDIUM * MB_LEN_MAX];
  610. const struct term_form *pre_form;
  611. struct utf8_tool tool;
  612. if (width < 1)
  613. {
  614. result[0] = '\0';
  615. return result;
  616. }
  617. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  618. tool.checked = pre_form->text;
  619. tool.actual = result;
  620. tool.remain = sizeof (result);
  621. tool.compose = FALSE;
  622. if ((gsize) width >= pre_form->width)
  623. utf8_tool_copy_chars_to_end (&tool);
  624. else if (width <= 3)
  625. {
  626. memset (tool.actual, '.', width);
  627. tool.actual += width;
  628. tool.remain -= width;
  629. }
  630. else
  631. {
  632. memset (tool.actual, '.', 3);
  633. tool.actual += 3;
  634. tool.remain -= 3;
  635. tool.ident = 0;
  636. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  637. utf8_tool_copy_chars_to_end (&tool);
  638. }
  639. tool.actual[0] = '\0';
  640. if (tool.compose)
  641. utf8_tool_compose (result, sizeof (result));
  642. return result;
  643. }
  644. /* --------------------------------------------------------------------------------------------- */
  645. static int
  646. str_utf8_term_width2 (const char *text, size_t length)
  647. {
  648. const struct term_form *result;
  649. result = str_utf8_make_make_term_form (text, length);
  650. return result->width;
  651. }
  652. /* --------------------------------------------------------------------------------------------- */
  /* Return the terminal width of the whole of TEXT. */
  static int
  str_utf8_term_width1 (const char *text)
  {
      const size_t no_limit = (size_t) (-1);

      return str_utf8_term_width2 (text, no_limit);
  }
  658. /* --------------------------------------------------------------------------------------------- */
  659. static int
  660. str_utf8_term_char_width (const char *text)
  661. {
  662. gunichar uni;
  663. uni = g_utf8_get_char_validated (text, -1);
  664. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  665. }
  666. /* --------------------------------------------------------------------------------------------- */
  667. static const char *
  668. str_utf8_term_substring (const char *text, int start, int width)
  669. {
  670. static char result[BUF_MEDIUM * MB_LEN_MAX];
  671. const struct term_form *pre_form;
  672. struct utf8_tool tool;
  673. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  674. tool.checked = pre_form->text;
  675. tool.actual = result;
  676. tool.remain = sizeof (result);
  677. tool.compose = FALSE;
  678. tool.ident = -start;
  679. utf8_tool_skip_chars_to (&tool, 0);
  680. if (tool.ident < 0)
  681. tool.ident = 0;
  682. utf8_tool_insert_space (&tool, tool.ident);
  683. utf8_tool_copy_chars_to (&tool, width);
  684. utf8_tool_insert_space (&tool, width - tool.ident);
  685. tool.actual[0] = '\0';
  686. if (tool.compose)
  687. utf8_tool_compose (result, sizeof (result));
  688. return result;
  689. }
  690. /* --------------------------------------------------------------------------------------------- */
  691. static const char *
  692. str_utf8_trunc (const char *text, int width)
  693. {
  694. static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
  695. const struct term_form *pre_form;
  696. struct utf8_tool tool;
  697. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  698. tool.checked = pre_form->text;
  699. tool.actual = result;
  700. tool.remain = sizeof (result);
  701. tool.compose = FALSE;
  702. if (pre_form->width <= (gsize) width)
  703. utf8_tool_copy_chars_to_end (&tool);
  704. else
  705. {
  706. tool.ident = 0;
  707. utf8_tool_copy_chars_to (&tool, width / 2);
  708. utf8_tool_insert_char (&tool, '~');
  709. tool.ident = 0;
  710. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  711. utf8_tool_copy_chars_to_end (&tool);
  712. }
  713. tool.actual[0] = '\0';
  714. if (tool.compose)
  715. utf8_tool_compose (result, sizeof (result));
  716. return result;
  717. }
  718. /* --------------------------------------------------------------------------------------------- */
  719. static int
  720. str_utf8_offset_to_pos (const char *text, size_t length)
  721. {
  722. if (str_utf8_is_valid_string (text))
  723. return g_utf8_offset_to_pointer (text, length) - text;
  724. else
  725. {
  726. int result;
  727. char *buffer;
  728. buffer = g_strdup (text);
  729. str_utf8_fix_string (buffer);
  730. result = g_utf8_offset_to_pointer (buffer, length) - buffer;
  731. g_free (buffer);
  732. return result;
  733. }
  734. }
  735. /* --------------------------------------------------------------------------------------------- */
  736. static int
  737. str_utf8_column_to_pos (const char *text, size_t pos)
  738. {
  739. int result = 0;
  740. int width = 0;
  741. while (text[0] != '\0')
  742. {
  743. gunichar uni;
  744. uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
  745. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  746. {
  747. if (g_unichar_isprint (uni))
  748. {
  749. if (!str_unichar_iscombiningmark (uni))
  750. {
  751. width++;
  752. if (g_unichar_iswide (uni))
  753. width++;
  754. }
  755. }
  756. else
  757. {
  758. width++;
  759. }
  760. text = g_utf8_next_char (text);
  761. }
  762. else
  763. {
  764. text++;
  765. width++;
  766. }
  767. if ((gsize) width > pos)
  768. return result;
  769. result++;
  770. }
  771. return result;
  772. }
  773. /* --------------------------------------------------------------------------------------------- */
  774. static char *
  775. str_utf8_create_search_needle (const char *needle, gboolean case_sen)
  776. {
  777. char *fold, *result;
  778. if (needle == NULL)
  779. return NULL;
  780. if (case_sen)
  781. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  782. fold = g_utf8_casefold (needle, -1);
  783. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  784. g_free (fold);
  785. return result;
  786. }
  787. /* --------------------------------------------------------------------------------------------- */
  788. static void
  789. str_utf8_release_search_needle (char *needle, gboolean case_sen)
  790. {
  791. (void) case_sen;
  792. g_free (needle);
  793. }
  794. /* --------------------------------------------------------------------------------------------- */
  795. static const char *
  796. str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
  797. {
  798. char *fold_text;
  799. char *deco_text;
  800. const char *match;
  801. const char *result = NULL;
  802. const char *m;
  803. fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
  804. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  805. match = deco_text;
  806. do
  807. {
  808. match = g_strstr_len (match, -1, search);
  809. if (match != NULL)
  810. {
  811. if ((!str_utf8_iscombiningmark (match) || (match == deco_text))
  812. && !str_utf8_iscombiningmark (match + strlen (search)))
  813. {
  814. result = text;
  815. m = deco_text;
  816. while (m < match)
  817. {
  818. str_utf8_cnext_noncomb_char (&m);
  819. str_utf8_cnext_noncomb_char (&result);
  820. }
  821. }
  822. else
  823. str_utf8_cnext_char (&match);
  824. }
  825. }
  826. while (match != NULL && result == NULL);
  827. g_free (deco_text);
  828. if (!case_sen)
  829. g_free (fold_text);
  830. return result;
  831. }
  832. /* --------------------------------------------------------------------------------------------- */
  833. static const char *
  834. str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
  835. {
  836. char *fold_text;
  837. char *deco_text;
  838. char *match;
  839. const char *result = NULL;
  840. const char *m;
  841. fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
  842. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  843. do
  844. {
  845. match = g_strrstr_len (deco_text, -1, search);
  846. if (match != NULL)
  847. {
  848. if ((!str_utf8_iscombiningmark (match) || (match == deco_text))
  849. && !str_utf8_iscombiningmark (match + strlen (search)))
  850. {
  851. result = text;
  852. m = deco_text;
  853. while (m < match)
  854. {
  855. str_utf8_cnext_noncomb_char (&m);
  856. str_utf8_cnext_noncomb_char (&result);
  857. }
  858. }
  859. else
  860. match[0] = '\0';
  861. }
  862. }
  863. while (match != NULL && result == NULL);
  864. g_free (deco_text);
  865. if (!case_sen)
  866. g_free (fold_text);
  867. return result;
  868. }
  869. /* --------------------------------------------------------------------------------------------- */
  870. static char *
  871. str_utf8_normalize (const char *text)
  872. {
  873. GString *fixed;
  874. char *tmp;
  875. char *result;
  876. const char *start;
  877. const char *end;
  878. /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
  879. * does the normalization and then converts UCS-4 back into UTF-8.
  880. * Since file names are composed of ASCII characters in most cases, we can speed up
  881. * utf8 normalization by checking if the heavyweight Unicode normalization is actually
  882. * needed. Normalization of ASCII string is no-op.
  883. */
  884. /* find out whether text is ASCII only */
  885. for (end = text; *end != '\0'; end++)
  886. if ((*end & 0x80) != 0)
  887. {
  888. /* found 2nd byte of utf8-encoded symbol */
  889. break;
  890. }
  891. /* if text is ASCII-only, return copy, normalize otherwise */
  892. if (*end == '\0')
  893. return g_strndup (text, end - text);
  894. fixed = g_string_sized_new (4);
  895. start = text;
  896. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  897. {
  898. if (start != end)
  899. {
  900. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  901. g_string_append (fixed, tmp);
  902. g_free (tmp);
  903. }
  904. g_string_append_c (fixed, end[0]);
  905. start = end + 1;
  906. }
  907. if (start == text)
  908. {
  909. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  910. g_string_free (fixed, TRUE);
  911. }
  912. else
  913. {
  914. if (start[0] != '\0' && start != end)
  915. {
  916. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  917. g_string_append (fixed, tmp);
  918. g_free (tmp);
  919. }
  920. result = g_string_free (fixed, FALSE);
  921. }
  922. return result;
  923. }
  924. /* --------------------------------------------------------------------------------------------- */
  925. static char *
  926. str_utf8_casefold_normalize (const char *text)
  927. {
  928. GString *fixed;
  929. char *tmp, *fold;
  930. char *result;
  931. const char *start;
  932. const char *end;
  933. fixed = g_string_sized_new (4);
  934. start = text;
  935. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  936. {
  937. if (start != end)
  938. {
  939. fold = g_utf8_casefold (start, end - start);
  940. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  941. g_string_append (fixed, tmp);
  942. g_free (tmp);
  943. g_free (fold);
  944. }
  945. g_string_append_c (fixed, end[0]);
  946. start = end + 1;
  947. }
  948. if (start == text)
  949. {
  950. fold = g_utf8_casefold (text, -1);
  951. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  952. g_free (fold);
  953. g_string_free (fixed, TRUE);
  954. }
  955. else
  956. {
  957. if (start[0] != '\0' && start != end)
  958. {
  959. fold = g_utf8_casefold (start, end - start);
  960. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  961. g_string_append (fixed, tmp);
  962. g_free (tmp);
  963. g_free (fold);
  964. }
  965. result = g_string_free (fixed, FALSE);
  966. }
  967. return result;
  968. }
  969. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-sensitive comparison: both arguments are passed through str_utf8_normalize()
   * and the results are compared bytewise with strcmp().
   *
   * Returns the strcmp() result for the two normalized strings.
   */
  static int
  str_utf8_compare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (rhs);
      g_free (lhs);

      return order;
  }
  982. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-sensitive comparison limited to the length of the shorter argument.
   *
   * Both arguments are normalized with str_utf8_normalize(); only as many bytes as
   * the shorter normalized string holds are compared, so the result is 0 whenever
   * one normalized string is a byte prefix of the other.
   *
   * Returns the strncmp() result over that common length.
   */
  static int
  str_utf8_ncompare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const size_t lhs_len = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      const size_t common = (lhs_len < rhs_len) ? lhs_len : rhs_len;
      const int order = strncmp (lhs, rhs, common);

      g_free (rhs);
      g_free (lhs);

      return order;
  }
  998. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-insensitive comparison: both arguments are passed through
   * str_utf8_casefold_normalize() and the results are compared bytewise with strcmp().
   *
   * Returns the strcmp() result for the two folded strings.
   */
  static int
  str_utf8_casecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (rhs);
      g_free (lhs);

      return order;
  }
  1011. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-insensitive comparison limited to the length of the shorter argument.
   *
   * Both arguments go through str_utf8_casefold_normalize(); only as many bytes as
   * the shorter folded string holds are compared, so the result is 0 whenever one
   * folded string is a byte prefix of the other.
   *
   * Returns the strncmp() result over that common length.
   */
  static int
  str_utf8_ncasecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const size_t lhs_len = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      const size_t common = (lhs_len < rhs_len) ? lhs_len : rhs_len;
      const int order = strncmp (lhs, rhs, common);

      g_free (rhs);
      g_free (lhs);

      return order;
  }
  1027. /* --------------------------------------------------------------------------------------------- */
  /*
   * Measure how far PREFIX matches the beginning of TEXT, case-sensitively.
   *
   * Both strings are normalized with str_utf8_normalize() and then walked in
   * lock-step, one character at a time (str_utf8_cnext_char_safe()).  The walk stops
   * at the first pair of characters that differ in byte length or in content, or
   * when either string is exhausted.
   *
   * Returns the number of bytes of the *normalized* prefix that matched.
   */
  static int
  str_utf8_prefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_normalize (text);
      char *norm_prefix = str_utf8_normalize (prefix);
      const char *cur_t = norm_text;
      const char *cur_p = norm_prefix;
      const char *next_t = norm_text;
      const char *next_p = norm_prefix;
      int matched;

      for (; cur_t[0] != '\0' && cur_p[0] != '\0'; cur_t = next_t, cur_p = next_p)
      {
          size_t width_t, width_p;

          str_utf8_cnext_char_safe (&next_t);
          str_utf8_cnext_char_safe (&next_p);

          width_t = (size_t) (next_t - cur_t);
          width_p = (size_t) (next_p - cur_p);

          /* characters differ either in encoded length or in their bytes */
          if (width_t != width_p || strncmp (cur_t, cur_p, width_t) != 0)
              break;
      }

      matched = (int) (cur_p - norm_prefix);

      g_free (norm_text);
      g_free (norm_prefix);

      return matched;
  }
  1057. /* --------------------------------------------------------------------------------------------- */
  /*
   * Measure how far PREFIX matches the beginning of TEXT, ignoring case.
   *
   * Both strings go through str_utf8_casefold_normalize() and are then walked in
   * lock-step, one character at a time (str_utf8_cnext_char_safe()).  The walk stops
   * at the first pair of characters that differ in byte length or in content, or
   * when either string is exhausted.
   *
   * Returns the number of bytes of the *folded* prefix that matched.
   */
  static int
  str_utf8_caseprefix (const char *text, const char *prefix)
  {
      char *fold_text = str_utf8_casefold_normalize (text);
      char *fold_prefix = str_utf8_casefold_normalize (prefix);
      const char *cur_t = fold_text;
      const char *cur_p = fold_prefix;
      const char *next_t = fold_text;
      const char *next_p = fold_prefix;
      int matched;

      for (; cur_t[0] != '\0' && cur_p[0] != '\0'; cur_t = next_t, cur_p = next_p)
      {
          size_t width_t, width_p;

          str_utf8_cnext_char_safe (&next_t);
          str_utf8_cnext_char_safe (&next_p);

          width_t = (size_t) (next_t - cur_t);
          width_p = (size_t) (next_p - cur_p);

          /* characters differ either in encoded length or in their bytes */
          if (width_t != width_p || strncmp (cur_t, cur_p, width_t) != 0)
              break;
      }

      matched = (int) (cur_p - fold_prefix);

      g_free (fold_text);
      g_free (fold_prefix);

      return matched;
  }
  1087. /* --------------------------------------------------------------------------------------------- */
  1088. static char *
  1089. str_utf8_create_key_gen (const char *text, gboolean case_sen,
  1090. gchar *(*keygen) (const gchar *text, gssize size))
  1091. {
  1092. char *result;
  1093. if (case_sen)
  1094. result = str_utf8_normalize (text);
  1095. else
  1096. {
  1097. gboolean dot;
  1098. GString *fixed;
  1099. const char *start, *end;
  1100. char *fold, *key;
  1101. dot = text[0] == '.';
  1102. fixed = g_string_sized_new (16);
  1103. if (!dot)
  1104. start = text;
  1105. else
  1106. {
  1107. start = text + 1;
  1108. g_string_append_c (fixed, '.');
  1109. }
  1110. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1111. {
  1112. if (start != end)
  1113. {
  1114. fold = g_utf8_casefold (start, end - start);
  1115. key = keygen (fold, -1);
  1116. g_string_append (fixed, key);
  1117. g_free (key);
  1118. g_free (fold);
  1119. }
  1120. g_string_append_c (fixed, end[0]);
  1121. start = end + 1;
  1122. }
  1123. if (start == text)
  1124. {
  1125. fold = g_utf8_casefold (start, -1);
  1126. result = keygen (fold, -1);
  1127. g_free (fold);
  1128. g_string_free (fixed, TRUE);
  1129. }
  1130. else if (dot && (start == text + 1))
  1131. {
  1132. fold = g_utf8_casefold (start, -1);
  1133. key = keygen (fold, -1);
  1134. g_string_append (fixed, key);
  1135. g_free (key);
  1136. g_free (fold);
  1137. result = g_string_free (fixed, FALSE);
  1138. }
  1139. else
  1140. {
  1141. if (start[0] != '\0' && start != end)
  1142. {
  1143. fold = g_utf8_casefold (start, end - start);
  1144. key = keygen (fold, -1);
  1145. g_string_append (fixed, key);
  1146. g_free (key);
  1147. g_free (fold);
  1148. }
  1149. result = g_string_free (fixed, FALSE);
  1150. }
  1151. }
  1152. return result;
  1153. }
  1154. /* --------------------------------------------------------------------------------------------- */
  1155. static char *
  1156. str_utf8_create_key (const char *text, gboolean case_sen)
  1157. {
  1158. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1159. }
  1160. /* --------------------------------------------------------------------------------------------- */
  #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  /*
   * Create a sort key for the file name TEXT: str_utf8_create_key_gen() with
   * g_utf8_collate_key_for_filename() as the key generator.
   */
  static char *
  str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
  {
      char *key;

      key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

      return key;
  }
  #endif
  1168. /* --------------------------------------------------------------------------------------------- */
  1169. static int
  1170. str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
  1171. {
  1172. (void) case_sen;
  1173. return strcmp (t1, t2);
  1174. }
  1175. /* --------------------------------------------------------------------------------------------- */
  1176. static void
  1177. str_utf8_release_key (char *key, gboolean case_sen)
  1178. {
  1179. (void) case_sen;
  1180. g_free (key);
  1181. }
  1182. /* --------------------------------------------------------------------------------------------- */
  1183. /*** public functions ****************************************************************************/
  1184. /* --------------------------------------------------------------------------------------------- */
  1185. struct str_class
  1186. str_utf8_init (void)
  1187. {
  1188. struct str_class result;
  1189. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1190. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1191. result.insert_replace_char = str_utf8_insert_replace_char;
  1192. result.is_valid_string = str_utf8_is_valid_string;
  1193. result.is_valid_char = str_utf8_is_valid_char;
  1194. result.cnext_char = str_utf8_cnext_char;
  1195. result.cprev_char = str_utf8_cprev_char;
  1196. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1197. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1198. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1199. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1200. result.char_isspace = str_utf8_isspace;
  1201. result.char_ispunct = str_utf8_ispunct;
  1202. result.char_isalnum = str_utf8_isalnum;
  1203. result.char_isdigit = str_utf8_isdigit;
  1204. result.char_isprint = str_utf8_isprint;
  1205. result.char_iscombiningmark = str_utf8_iscombiningmark;
  1206. result.char_toupper = str_utf8_toupper;
  1207. result.char_tolower = str_utf8_tolower;
  1208. result.length = str_utf8_length;
  1209. result.length2 = str_utf8_length2;
  1210. result.length_noncomb = str_utf8_length_noncomb;
  1211. result.fix_string = str_utf8_fix_string;
  1212. result.term_form = str_utf8_term_form;
  1213. result.fit_to_term = str_utf8_fit_to_term;
  1214. result.term_trim = str_utf8_term_trim;
  1215. result.term_width2 = str_utf8_term_width2;
  1216. result.term_width1 = str_utf8_term_width1;
  1217. result.term_char_width = str_utf8_term_char_width;
  1218. result.term_substring = str_utf8_term_substring;
  1219. result.trunc = str_utf8_trunc;
  1220. result.offset_to_pos = str_utf8_offset_to_pos;
  1221. result.column_to_pos = str_utf8_column_to_pos;
  1222. result.create_search_needle = str_utf8_create_search_needle;
  1223. result.release_search_needle = str_utf8_release_search_needle;
  1224. result.search_first = str_utf8_search_first;
  1225. result.search_last = str_utf8_search_last;
  1226. result.compare = str_utf8_compare;
  1227. result.ncompare = str_utf8_ncompare;
  1228. result.casecmp = str_utf8_casecmp;
  1229. result.ncasecmp = str_utf8_ncasecmp;
  1230. result.prefix = str_utf8_prefix;
  1231. result.caseprefix = str_utf8_caseprefix;
  1232. result.create_key = str_utf8_create_key;
  1233. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1234. /* case insensitive sort files in "a1 a2 a10" order */
  1235. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1236. #else
  1237. /* case insensitive sort files in "a1 a10 a2" order */
  1238. result.create_key_for_filename = str_utf8_create_key;
  1239. #endif
  1240. result.key_collate = str_utf8_key_collate;
  1241. result.release_key = str_utf8_release_key;
  1242. return result;
  1243. }
  1244. /* --------------------------------------------------------------------------------------------- */