strutilutf8.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540
  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007-2025
  4. Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. This file is part of the Midnight Commander.
  8. The Midnight Commander is free software: you can redistribute it
  9. and/or modify it under the terms of the GNU General Public License as
  10. published by the Free Software Foundation, either version 3 of the License,
  11. or (at your option) any later version.
  12. The Midnight Commander is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <langinfo.h>
  22. #include <limits.h> /* MB_LEN_MAX */
  23. #include <string.h>
  24. #include "lib/global.h"
  25. #include "lib/strutil.h"
  26. /* using function for utf-8 from glib */
  27. /*** global variables ****************************************************************************/
  28. /*** file scope macro definitions ****************************************************************/
  29. /*** file scope type declarations ****************************************************************/
  30. struct utf8_tool
  31. {
  32. char *actual;
  33. size_t remain;
  34. const char *checked;
  35. int ident;
  36. gboolean compose;
  37. };
  38. struct term_form
  39. {
  40. char text[BUF_MEDIUM * MB_LEN_MAX];
  41. size_t width;
  42. gboolean compose;
  43. };
  44. /*** forward declarations (file scope functions) *************************************************/
  45. /*** file scope variables ************************************************************************/
  /* UTF-8 encoding of U+FFFD, written in place of bytes that are not valid UTF-8 */
  static const char replch[] = "\xEF\xBF\xBD";
  47. /* --------------------------------------------------------------------------------------------- */
  48. /*** file scope functions ************************************************************************/
  49. /* --------------------------------------------------------------------------------------------- */
  50. static gboolean
  51. str_unichar_iscombiningmark (gunichar uni)
  52. {
  53. GUnicodeType type;
  54. type = g_unichar_type (uni);
  55. return (type == G_UNICODE_SPACING_MARK)
  56. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  57. }
  58. /* --------------------------------------------------------------------------------------------- */
  59. static void
  60. str_utf8_insert_replace_char (GString *buffer)
  61. {
  62. g_string_append (buffer, replch);
  63. }
  64. /* --------------------------------------------------------------------------------------------- */
  65. static gboolean
  66. str_utf8_is_valid_string (const char *text)
  67. {
  68. return g_utf8_validate (text, -1, NULL);
  69. }
  70. /* --------------------------------------------------------------------------------------------- */
  71. static int
  72. str_utf8_is_valid_char (const char *ch, size_t size)
  73. {
  74. switch (g_utf8_get_char_validated (ch, size))
  75. {
  76. case (gunichar) (-2):
  77. return (-2);
  78. case (gunichar) (-1):
  79. return (-1);
  80. default:
  81. return 1;
  82. }
  83. }
  84. /* --------------------------------------------------------------------------------------------- */
  /* Advance *text to the next UTF-8 character (no validity check). */
  static void
  str_utf8_cnext_char (const char **text)
  {
      const char *cur = *text;

      *text = g_utf8_next_char (cur);
  }
  90. /* --------------------------------------------------------------------------------------------- */
  /* Move *text back to the previous UTF-8 character (no validity check). */
  static void
  str_utf8_cprev_char (const char **text)
  {
      const char *cur = *text;

      *text = g_utf8_prev_char (cur);
  }
  96. /* --------------------------------------------------------------------------------------------- */
  /* Advance *text by one character; when the bytes at *text do not form a valid
   * character, advance by a single byte instead. */
  static void
  str_utf8_cnext_char_safe (const char **text)
  {
      const char *cur = *text;

      if (str_utf8_is_valid_char (cur, -1) != 1)
      {
          *text = cur + 1;
          return;
      }

      *text = g_utf8_next_char (cur);
  }
  105. /* --------------------------------------------------------------------------------------------- */
  /* Move *text back by one character. The candidate found by g_utf8_prev_char()
   * is accepted only if stepping forward from it lands exactly on the old
   * position; otherwise *text moves back by a single byte. */
  static void
  str_utf8_cprev_char_safe (const char **text)
  {
      const char *cur = *text;
      const char *prev = g_utf8_prev_char (cur);
      const char *probe = prev;

      str_utf8_cnext_char_safe (&probe);
      *text = (probe == cur) ? prev : cur - 1;
  }
  118. /* --------------------------------------------------------------------------------------------- */
  119. static void
  120. str_utf8_fix_string (char *text)
  121. {
  122. while (text[0] != '\0')
  123. {
  124. gunichar uni;
  125. uni = g_utf8_get_char_validated (text, -1);
  126. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  127. text = g_utf8_next_char (text);
  128. else
  129. {
  130. text[0] = '?';
  131. text++;
  132. }
  133. }
  134. }
  135. /* --------------------------------------------------------------------------------------------- */
  136. static gboolean
  137. str_utf8_isspace (const char *text)
  138. {
  139. gunichar uni;
  140. uni = g_utf8_get_char_validated (text, -1);
  141. return g_unichar_isspace (uni);
  142. }
  143. /* --------------------------------------------------------------------------------------------- */
  144. static gboolean
  145. str_utf8_ispunct (const char *text)
  146. {
  147. gunichar uni;
  148. uni = g_utf8_get_char_validated (text, -1);
  149. return g_unichar_ispunct (uni);
  150. }
  151. /* --------------------------------------------------------------------------------------------- */
  152. static gboolean
  153. str_utf8_isalnum (const char *text)
  154. {
  155. gunichar uni;
  156. uni = g_utf8_get_char_validated (text, -1);
  157. return g_unichar_isalnum (uni);
  158. }
  159. /* --------------------------------------------------------------------------------------------- */
  160. static gboolean
  161. str_utf8_isdigit (const char *text)
  162. {
  163. gunichar uni;
  164. uni = g_utf8_get_char_validated (text, -1);
  165. return g_unichar_isdigit (uni);
  166. }
  167. /* --------------------------------------------------------------------------------------------- */
  168. static gboolean
  169. str_utf8_isprint (const char *ch)
  170. {
  171. gunichar uni;
  172. uni = g_utf8_get_char_validated (ch, -1);
  173. return g_unichar_isprint (uni);
  174. }
  175. /* --------------------------------------------------------------------------------------------- */
  176. static gboolean
  177. str_utf8_iscombiningmark (const char *ch)
  178. {
  179. gunichar uni;
  180. uni = g_utf8_get_char_validated (ch, -1);
  181. return str_unichar_iscombiningmark (uni);
  182. }
  183. /* --------------------------------------------------------------------------------------------- */
  /* Advance *text past one character and all combining marks that follow it.
   * Returns the number of single-character steps taken (0 at end of string). */
  static int
  str_utf8_cnext_noncomb_char (const char **text)
  {
      int steps = 0;

      if ((*text)[0] == '\0')
          return 0;

      do
      {
          str_utf8_cnext_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

      return steps;
  }
  197. /* --------------------------------------------------------------------------------------------- */
  /* Move *text backwards, never past 'begin', until it points at a character
   * that is not a combining mark. Returns the number of single-character steps
   * taken (0 when *text already equals 'begin'). */
  static int
  str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  {
      int steps = 0;

      if (*text == begin)
          return 0;

      do
      {
          str_utf8_cprev_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && *text != begin);

      return steps;
  }
  211. /* --------------------------------------------------------------------------------------------- */
  212. static gboolean
  213. str_utf8_toupper (const char *text, char **out, size_t *remain)
  214. {
  215. gunichar uni;
  216. size_t left;
  217. uni = g_utf8_get_char_validated (text, -1);
  218. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  219. return FALSE;
  220. uni = g_unichar_toupper (uni);
  221. left = g_unichar_to_utf8 (uni, NULL);
  222. if (left >= *remain)
  223. return FALSE;
  224. left = g_unichar_to_utf8 (uni, *out);
  225. (*out) += left;
  226. (*remain) -= left;
  227. return TRUE;
  228. }
  229. /* --------------------------------------------------------------------------------------------- */
  230. static gboolean
  231. str_utf8_tolower (const char *text, char **out, size_t *remain)
  232. {
  233. gunichar uni;
  234. size_t left;
  235. uni = g_utf8_get_char_validated (text, -1);
  236. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  237. return FALSE;
  238. uni = g_unichar_tolower (uni);
  239. left = g_unichar_to_utf8 (uni, NULL);
  240. if (left >= *remain)
  241. return FALSE;
  242. left = g_unichar_to_utf8 (uni, *out);
  243. (*out) += left;
  244. (*remain) -= left;
  245. return TRUE;
  246. }
  247. /* --------------------------------------------------------------------------------------------- */
  /* Count the characters of 'text'. Valid runs are counted with g_utf8_strlen();
   * each byte at which validation stops counts as one character. */
  static int
  str_utf8_length (const char *text)
  {
      const char *chunk = text;
      const char *stop;
      int chars = 0;

      for (;;)
      {
          /* g_utf8_validate() stores in 'stop' where the valid prefix of 'chunk' ends */
          if (g_utf8_validate (chunk, -1, &stop) || chunk[0] == '\0')
              break;

          if (chunk != stop)
              chars += g_utf8_strlen (chunk, stop - chunk);

          /* the offending byte itself */
          chars++;
          chunk = stop + 1;
      }

      if (chunk == text)
          return g_utf8_strlen (text, -1);

      if (chunk[0] != '\0' && chunk != stop)
          chars += g_utf8_strlen (chunk, stop - chunk);

      return chars;
  }
  268. /* --------------------------------------------------------------------------------------------- */
  /* Like str_utf8_length(), but looks at no more than 'size' bytes of 'text'. */
  static int
  str_utf8_length2 (const char *text, int size)
  {
      const char *chunk = text;
      const char *stop;
      int chars = 0;

      for (;;)
      {
          /* g_utf8_validate() stores in 'stop' where the valid prefix of 'chunk' ends */
          if (g_utf8_validate (chunk, -1, &stop) || chunk[0] == '\0' || size <= 0)
              break;

          if (chunk != stop)
          {
              chars += g_utf8_strlen (chunk, MIN (stop - chunk, size));
              size -= stop - chunk;
          }

          /* the offending byte counts only while the byte budget is not used up */
          if (size > 0)
              chars++;
          size--;
          chunk = stop + 1;
      }

      if (chunk == text)
          return g_utf8_strlen (text, size);

      if (chunk[0] != '\0' && chunk != stop && size > 0)
          chars += g_utf8_strlen (chunk, MIN (stop - chunk, size));

      return chars;
  }
  293. /* --------------------------------------------------------------------------------------------- */
  /* Count the characters of 'text', treating a character together with the
   * combining marks that follow it as one. */
  static int
  str_utf8_length_noncomb (const char *text)
  {
      const char *pos;
      int count = 0;

      for (pos = text; pos[0] != '\0'; count++)
          str_utf8_cnext_noncomb_char (&pos);

      return count;
  }
  306. /* --------------------------------------------------------------------------------------------- */
  #if 0
  /* Disabled code: skip one character of *string, reduce *left by its byte
   * length and append '?' to 'buffer'. */
  static void
  str_utf8_questmark_sustb (char **string, size_t *left, GString *buffer)
  {
      char *after = g_utf8_next_char (*string);

      *left -= after - *string;
      *string = after;
      g_string_append_c (buffer, '?');
  }
  #endif
  318. /* --------------------------------------------------------------------------------------------- */
  319. static gchar *
  320. str_utf8_conv_gerror_message (GError *mcerror, const char *def_msg)
  321. {
  322. if (mcerror != NULL)
  323. return g_strdup (mcerror->message);
  324. return g_strdup (def_msg != NULL ? def_msg : "");
  325. }
  326. /* --------------------------------------------------------------------------------------------- */
  327. static estr_t
  328. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString *buffer)
  329. {
  330. estr_t result = ESTR_SUCCESS;
  331. if (coder == str_cnv_not_convert)
  332. g_string_append_len (buffer, string, size);
  333. else
  334. result = str_nconvert (coder, string, size, buffer);
  335. return result;
  336. }
  337. /* --------------------------------------------------------------------------------------------- */
  338. /* utility function, that makes string valid in utf8 and all characters printable
  339. * return width of string too */
  340. static const struct term_form *
  341. str_utf8_make_make_term_form (const char *text, size_t length)
  342. {
  343. static struct term_form result;
  344. gunichar uni;
  345. size_t left;
  346. char *actual;
  347. result.text[0] = '\0';
  348. result.width = 0;
  349. result.compose = FALSE;
  350. actual = result.text;
  351. /* check if text start with combining character,
  352. * add space at begin in this case */
  353. if (length != 0 && text[0] != '\0')
  354. {
  355. uni = g_utf8_get_char_validated (text, -1);
  356. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
  357. && str_unichar_iscombiningmark (uni))
  358. {
  359. actual[0] = ' ';
  360. actual++;
  361. result.width++;
  362. result.compose = TRUE;
  363. }
  364. }
  365. while (length != 0 && text[0] != '\0')
  366. {
  367. uni = g_utf8_get_char_validated (text, -1);
  368. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  369. {
  370. if (g_unichar_isprint (uni))
  371. {
  372. left = g_unichar_to_utf8 (uni, actual);
  373. actual += left;
  374. if (str_unichar_iscombiningmark (uni))
  375. result.compose = TRUE;
  376. else
  377. {
  378. result.width++;
  379. if (g_unichar_iswide (uni))
  380. result.width++;
  381. }
  382. }
  383. else
  384. {
  385. actual[0] = '.';
  386. actual++;
  387. result.width++;
  388. }
  389. text = g_utf8_next_char (text);
  390. }
  391. else
  392. {
  393. size_t repl_len;
  394. text++;
  395. /*actual[0] = '?'; */
  396. repl_len = strlen (replch);
  397. memcpy (actual, replch, repl_len);
  398. actual += repl_len;
  399. result.width++;
  400. }
  401. if (length != (size_t) (-1))
  402. length--;
  403. }
  404. actual[0] = '\0';
  405. return &result;
  406. }
  407. /* --------------------------------------------------------------------------------------------- */
  408. static const char *
  409. str_utf8_term_form (const char *text)
  410. {
  411. static char result[BUF_MEDIUM * MB_LEN_MAX];
  412. const struct term_form *pre_form;
  413. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  414. if (pre_form->compose)
  415. {
  416. char *composed;
  417. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  418. g_strlcpy (result, composed, sizeof (result));
  419. g_free (composed);
  420. }
  421. else
  422. g_strlcpy (result, pre_form->text, sizeof (result));
  423. return result;
  424. }
  425. /* --------------------------------------------------------------------------------------------- */
  426. /* utility function, that copies all characters from checked to actual */
  427. static gboolean
  428. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  429. {
  430. tool->compose = FALSE;
  431. while (tool->checked[0] != '\0')
  432. {
  433. gunichar uni;
  434. size_t left;
  435. uni = g_utf8_get_char (tool->checked);
  436. tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
  437. left = g_unichar_to_utf8 (uni, NULL);
  438. if (tool->remain <= left)
  439. return FALSE;
  440. left = g_unichar_to_utf8 (uni, tool->actual);
  441. tool->actual += left;
  442. tool->remain -= left;
  443. tool->checked = g_utf8_next_char (tool->checked);
  444. }
  445. return TRUE;
  446. }
  447. /* --------------------------------------------------------------------------------------------- */
  448. /* utility function, that copies characters from checked to actual until ident is
  449. * smaller than to_ident */
  450. static gboolean
  451. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  452. {
  453. tool->compose = FALSE;
  454. while (tool->checked[0] != '\0')
  455. {
  456. gunichar uni;
  457. size_t left;
  458. int w = 0;
  459. uni = g_utf8_get_char (tool->checked);
  460. if (str_unichar_iscombiningmark (uni))
  461. tool->compose = TRUE;
  462. else
  463. {
  464. w = 1;
  465. if (g_unichar_iswide (uni))
  466. w++;
  467. if (tool->ident + w > to_ident)
  468. return TRUE;
  469. }
  470. left = g_unichar_to_utf8 (uni, NULL);
  471. if (tool->remain <= left)
  472. return FALSE;
  473. left = g_unichar_to_utf8 (uni, tool->actual);
  474. tool->actual += left;
  475. tool->remain -= left;
  476. tool->checked = g_utf8_next_char (tool->checked);
  477. tool->ident += w;
  478. }
  479. return TRUE;
  480. }
  481. /* --------------------------------------------------------------------------------------------- */
  482. /* utility function, adds count spaces to actual */
  483. static int
  484. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  485. {
  486. if (count <= 0)
  487. return 1;
  488. if (tool->remain <= (gsize) count)
  489. return 0;
  490. memset (tool->actual, ' ', count);
  491. tool->actual += count;
  492. tool->remain -= count;
  493. return 1;
  494. }
  495. /* --------------------------------------------------------------------------------------------- */
  496. /* utility function, adds one characters to actual */
  497. static int
  498. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  499. {
  500. if (tool->remain <= 1)
  501. return 0;
  502. tool->actual[0] = ch;
  503. tool->actual++;
  504. tool->remain--;
  505. return 1;
  506. }
  507. /* --------------------------------------------------------------------------------------------- */
  508. /* utility function, thah skips characters from checked until ident is greater or
  509. * equal to to_ident */
  510. static gboolean
  511. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  512. {
  513. gunichar uni;
  514. while (to_ident > tool->ident && tool->checked[0] != '\0')
  515. {
  516. uni = g_utf8_get_char (tool->checked);
  517. if (!str_unichar_iscombiningmark (uni))
  518. {
  519. tool->ident++;
  520. if (g_unichar_iswide (uni))
  521. tool->ident++;
  522. }
  523. tool->checked = g_utf8_next_char (tool->checked);
  524. }
  525. uni = g_utf8_get_char (tool->checked);
  526. while (str_unichar_iscombiningmark (uni))
  527. {
  528. tool->checked = g_utf8_next_char (tool->checked);
  529. uni = g_utf8_get_char (tool->checked);
  530. }
  531. return TRUE;
  532. }
  533. /* --------------------------------------------------------------------------------------------- */
  534. static void
  535. utf8_tool_compose (char *buffer, size_t size)
  536. {
  537. char *composed;
  538. composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  539. g_strlcpy (buffer, composed, size);
  540. g_free (composed);
  541. }
  542. /* --------------------------------------------------------------------------------------------- */
  543. static const char *
  544. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  545. {
  546. static char result[BUF_MEDIUM * MB_LEN_MAX];
  547. const struct term_form *pre_form;
  548. struct utf8_tool tool;
  549. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  550. tool.checked = pre_form->text;
  551. tool.actual = result;
  552. tool.remain = sizeof (result);
  553. tool.compose = FALSE;
  554. if (pre_form->width <= (gsize) width)
  555. {
  556. switch (HIDE_FIT (just_mode))
  557. {
  558. case J_CENTER_LEFT:
  559. case J_CENTER:
  560. tool.ident = (width - pre_form->width) / 2;
  561. break;
  562. case J_RIGHT:
  563. tool.ident = width - pre_form->width;
  564. break;
  565. default:
  566. tool.ident = 0;
  567. break;
  568. }
  569. utf8_tool_insert_space (&tool, tool.ident);
  570. utf8_tool_copy_chars_to_end (&tool);
  571. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  572. }
  573. else if (IS_FIT (just_mode))
  574. {
  575. tool.ident = 0;
  576. utf8_tool_copy_chars_to (&tool, width / 2);
  577. utf8_tool_insert_char (&tool, '~');
  578. tool.ident = 0;
  579. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  580. utf8_tool_copy_chars_to_end (&tool);
  581. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  582. }
  583. else
  584. {
  585. switch (HIDE_FIT (just_mode))
  586. {
  587. case J_CENTER:
  588. tool.ident = (width - pre_form->width) / 2;
  589. break;
  590. case J_RIGHT:
  591. tool.ident = width - pre_form->width;
  592. break;
  593. default:
  594. tool.ident = 0;
  595. break;
  596. }
  597. utf8_tool_skip_chars_to (&tool, 0);
  598. utf8_tool_insert_space (&tool, tool.ident);
  599. utf8_tool_copy_chars_to (&tool, width);
  600. utf8_tool_insert_space (&tool, width - tool.ident);
  601. }
  602. tool.actual[0] = '\0';
  603. if (tool.compose)
  604. utf8_tool_compose (result, sizeof (result));
  605. return result;
  606. }
  607. /* --------------------------------------------------------------------------------------------- */
  608. static const char *
  609. str_utf8_term_trim (const char *text, int width)
  610. {
  611. static char result[BUF_MEDIUM * MB_LEN_MAX];
  612. const struct term_form *pre_form;
  613. struct utf8_tool tool;
  614. if (width < 1)
  615. {
  616. result[0] = '\0';
  617. return result;
  618. }
  619. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  620. tool.checked = pre_form->text;
  621. tool.actual = result;
  622. tool.remain = sizeof (result);
  623. tool.compose = FALSE;
  624. if ((gsize) width >= pre_form->width)
  625. utf8_tool_copy_chars_to_end (&tool);
  626. else if (width <= 3)
  627. {
  628. memset (tool.actual, '.', width);
  629. tool.actual += width;
  630. tool.remain -= width;
  631. }
  632. else
  633. {
  634. memset (tool.actual, '.', 3);
  635. tool.actual += 3;
  636. tool.remain -= 3;
  637. tool.ident = 0;
  638. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  639. utf8_tool_copy_chars_to_end (&tool);
  640. }
  641. tool.actual[0] = '\0';
  642. if (tool.compose)
  643. utf8_tool_compose (result, sizeof (result));
  644. return result;
  645. }
  646. /* --------------------------------------------------------------------------------------------- */
  647. static int
  648. str_utf8_term_width2 (const char *text, size_t length)
  649. {
  650. const struct term_form *result;
  651. result = str_utf8_make_make_term_form (text, length);
  652. return result->width;
  653. }
  654. /* --------------------------------------------------------------------------------------------- */
  /* Return the terminal width of the whole of 'text'. */
  static int
  str_utf8_term_width1 (const char *text)
  {
      const size_t no_limit = (size_t) (-1);

      return str_utf8_term_width2 (text, no_limit);
  }
  660. /* --------------------------------------------------------------------------------------------- */
  661. static int
  662. str_utf8_term_char_width (const char *text)
  663. {
  664. gunichar uni;
  665. uni = g_utf8_get_char_validated (text, -1);
  666. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  667. }
  668. /* --------------------------------------------------------------------------------------------- */
  669. static const char *
  670. str_utf8_term_substring (const char *text, int start, int width)
  671. {
  672. static char result[BUF_MEDIUM * MB_LEN_MAX];
  673. const struct term_form *pre_form;
  674. struct utf8_tool tool;
  675. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  676. tool.checked = pre_form->text;
  677. tool.actual = result;
  678. tool.remain = sizeof (result);
  679. tool.compose = FALSE;
  680. tool.ident = -start;
  681. utf8_tool_skip_chars_to (&tool, 0);
  682. if (tool.ident < 0)
  683. tool.ident = 0;
  684. utf8_tool_insert_space (&tool, tool.ident);
  685. utf8_tool_copy_chars_to (&tool, width);
  686. utf8_tool_insert_space (&tool, width - tool.ident);
  687. tool.actual[0] = '\0';
  688. if (tool.compose)
  689. utf8_tool_compose (result, sizeof (result));
  690. return result;
  691. }
  692. /* --------------------------------------------------------------------------------------------- */
  693. static const char *
  694. str_utf8_trunc (const char *text, int width)
  695. {
  696. static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
  697. const struct term_form *pre_form;
  698. struct utf8_tool tool;
  699. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  700. tool.checked = pre_form->text;
  701. tool.actual = result;
  702. tool.remain = sizeof (result);
  703. tool.compose = FALSE;
  704. if (pre_form->width <= (gsize) width)
  705. utf8_tool_copy_chars_to_end (&tool);
  706. else
  707. {
  708. tool.ident = 0;
  709. utf8_tool_copy_chars_to (&tool, width / 2);
  710. utf8_tool_insert_char (&tool, '~');
  711. tool.ident = 0;
  712. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  713. utf8_tool_copy_chars_to_end (&tool);
  714. }
  715. tool.actual[0] = '\0';
  716. if (tool.compose)
  717. utf8_tool_compose (result, sizeof (result));
  718. return result;
  719. }
  720. /* --------------------------------------------------------------------------------------------- */
  /* Convert a character offset into a byte offset.
   *
   * text   - NUL-terminated string; bytes that are not valid UTF-8 count as one
   *          character each
   * length - offset in characters
   *
   * Returns the byte offset of the character number 'length'. An offset beyond
   * the end of the string yields the byte length of the string instead of
   * reading past the terminating NUL. */
  static int
  str_utf8_offset_to_pos (const char *text, size_t length)
  {
      char *fixed = NULL;
      const char *base = text;
      const char *walk;
      int result;

      if (!str_utf8_is_valid_string (text))
      {
          /* work on a copy in which every invalid byte is a one-byte '?' */
          fixed = g_strdup (text);
          str_utf8_fix_string (fixed);
          base = fixed;
      }

      /* g_utf8_offset_to_pointer() does not stop at the end of the string,
       * so walk by hand */
      for (walk = base; length != 0 && walk[0] != '\0'; length--)
          walk = g_utf8_next_char (walk);

      result = walk - base;
      g_free (fixed);

      return result;
  }
  737. /* --------------------------------------------------------------------------------------------- */
  738. static int
  739. str_utf8_column_to_pos (const char *text, size_t pos)
  740. {
  741. int result = 0;
  742. int width = 0;
  743. while (text[0] != '\0')
  744. {
  745. gunichar uni;
  746. uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
  747. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  748. {
  749. if (g_unichar_isprint (uni))
  750. {
  751. if (!str_unichar_iscombiningmark (uni))
  752. {
  753. width++;
  754. if (g_unichar_iswide (uni))
  755. width++;
  756. }
  757. }
  758. else
  759. {
  760. width++;
  761. }
  762. text = g_utf8_next_char (text);
  763. }
  764. else
  765. {
  766. text++;
  767. width++;
  768. }
  769. if ((gsize) width > pos)
  770. return result;
  771. result++;
  772. }
  773. return result;
  774. }
  775. /* --------------------------------------------------------------------------------------------- */
  776. static char *
  777. str_utf8_create_search_needle (const char *needle, gboolean case_sen)
  778. {
  779. char *fold, *result;
  780. if (needle == NULL)
  781. return NULL;
  782. if (case_sen)
  783. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  784. fold = g_utf8_casefold (needle, -1);
  785. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  786. g_free (fold);
  787. return result;
  788. }
  789. /* --------------------------------------------------------------------------------------------- */
  790. static void
  791. str_utf8_release_search_needle (char *needle, gboolean case_sen)
  792. {
  793. (void) case_sen;
  794. g_free (needle);
  795. }
  796. /* --------------------------------------------------------------------------------------------- */
  797. static const char *
  798. str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
  799. {
  800. char *deco_text;
  801. const char *match;
  802. const char *result = NULL;
  803. size_t search_len;
  804. if (case_sen)
  805. deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  806. else
  807. {
  808. char *fold_text;
  809. fold_text = g_utf8_casefold (text, -1);
  810. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  811. g_free (fold_text);
  812. }
  813. search_len = strlen (search);
  814. match = deco_text;
  815. do
  816. {
  817. match = g_strstr_len (match, -1, search);
  818. if (match != NULL)
  819. {
  820. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  821. !str_utf8_iscombiningmark (match + search_len))
  822. {
  823. const char *m = deco_text;
  824. result = text;
  825. while (m < match)
  826. {
  827. str_utf8_cnext_noncomb_char (&m);
  828. str_utf8_cnext_noncomb_char (&result);
  829. }
  830. }
  831. else
  832. str_utf8_cnext_char (&match);
  833. }
  834. }
  835. while (match != NULL && result == NULL);
  836. g_free (deco_text);
  837. return result;
  838. }
  839. /* --------------------------------------------------------------------------------------------- */
  840. static const char *
  841. str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
  842. {
  843. char *deco_text;
  844. char *match;
  845. const char *result = NULL;
  846. size_t search_len;
  847. if (case_sen)
  848. deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  849. else
  850. {
  851. char *fold_text;
  852. fold_text = g_utf8_casefold (text, -1);
  853. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  854. g_free (fold_text);
  855. }
  856. search_len = strlen (search);
  857. do
  858. {
  859. match = g_strrstr_len (deco_text, -1, search);
  860. if (match != NULL)
  861. {
  862. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  863. !str_utf8_iscombiningmark (match + search_len))
  864. {
  865. const char *m = deco_text;
  866. result = text;
  867. while (m < match)
  868. {
  869. str_utf8_cnext_noncomb_char (&m);
  870. str_utf8_cnext_noncomb_char (&result);
  871. }
  872. }
  873. else
  874. match[0] = '\0';
  875. }
  876. }
  877. while (match != NULL && result == NULL);
  878. g_free (deco_text);
  879. return result;
  880. }
  881. /* --------------------------------------------------------------------------------------------- */
  882. static char *
  883. str_utf8_normalize (const char *text)
  884. {
  885. GString *fixed;
  886. char *tmp;
  887. char *result;
  888. const char *start;
  889. const char *end;
  890. /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
  891. * does the normalization and then converts UCS-4 back into UTF-8.
  892. * Since file names are composed of ASCII characters in most cases, we can speed up
  893. * utf8 normalization by checking if the heavyweight Unicode normalization is actually
  894. * needed. Normalization of ASCII string is no-op.
  895. */
  896. /* find out whether text is ASCII only */
  897. for (end = text; *end != '\0'; end++)
  898. if ((*end & 0x80) != 0)
  899. {
  900. /* found 2nd byte of utf8-encoded symbol */
  901. break;
  902. }
  903. /* if text is ASCII-only, return copy, normalize otherwise */
  904. if (*end == '\0')
  905. return g_strndup (text, end - text);
  906. fixed = g_string_sized_new (4);
  907. start = text;
  908. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  909. {
  910. if (start != end)
  911. {
  912. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  913. g_string_append (fixed, tmp);
  914. g_free (tmp);
  915. }
  916. g_string_append_c (fixed, end[0]);
  917. start = end + 1;
  918. }
  919. if (start == text)
  920. {
  921. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  922. g_string_free (fixed, TRUE);
  923. }
  924. else
  925. {
  926. if (start[0] != '\0' && start != end)
  927. {
  928. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  929. g_string_append (fixed, tmp);
  930. g_free (tmp);
  931. }
  932. result = g_string_free (fixed, FALSE);
  933. }
  934. return result;
  935. }
  936. /* --------------------------------------------------------------------------------------------- */
  937. static char *
  938. str_utf8_casefold_normalize (const char *text)
  939. {
  940. GString *fixed;
  941. char *tmp, *fold;
  942. char *result;
  943. const char *start;
  944. const char *end;
  945. fixed = g_string_sized_new (4);
  946. start = text;
  947. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  948. {
  949. if (start != end)
  950. {
  951. fold = g_utf8_casefold (start, end - start);
  952. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  953. g_string_append (fixed, tmp);
  954. g_free (tmp);
  955. g_free (fold);
  956. }
  957. g_string_append_c (fixed, end[0]);
  958. start = end + 1;
  959. }
  960. if (start == text)
  961. {
  962. fold = g_utf8_casefold (text, -1);
  963. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  964. g_free (fold);
  965. g_string_free (fixed, TRUE);
  966. }
  967. else
  968. {
  969. if (start[0] != '\0' && start != end)
  970. {
  971. fold = g_utf8_casefold (start, end - start);
  972. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  973. g_string_append (fixed, tmp);
  974. g_free (tmp);
  975. g_free (fold);
  976. }
  977. result = g_string_free (fixed, FALSE);
  978. }
  979. return result;
  980. }
  981. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-sensitive comparison of two strings.
   *
   * Both arguments are passed through str_utf8_normalize() and the results
   * are compared with strcmp(); the strcmp() result is returned.
   */
  static int
  str_utf8_compare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  994. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-sensitive comparison limited to the length of the shorter string.
   *
   * Both arguments are passed through str_utf8_normalize(); the results are
   * compared with strncmp() over the byte length of the shorter normalized
   * string, so the result is 0 when one of them is a prefix of the other.
   */
  static int
  str_utf8_ncompare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      size_t limit = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      int order;

      /* compare no further than the shorter of the two */
      if (rhs_len < limit)
          limit = rhs_len;

      order = strncmp (lhs, rhs, limit);

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  1010. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-insensitive comparison of two strings.
   *
   * Both arguments are passed through str_utf8_casefold_normalize() and the
   * results are compared with strcmp(); the strcmp() result is returned.
   */
  static int
  str_utf8_casecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  1023. /* --------------------------------------------------------------------------------------------- */
  /*
   * Case-insensitive comparison limited to the length of the shorter string.
   *
   * Both arguments are passed through str_utf8_casefold_normalize(); the
   * results are compared with strncmp() over the byte length of the shorter
   * converted string, so the result is 0 when one of them is a prefix of the
   * other.
   */
  static int
  str_utf8_ncasecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      size_t limit = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      int order;

      /* compare no further than the shorter of the two */
      if (rhs_len < limit)
          limit = rhs_len;

      order = strncmp (lhs, rhs, limit);

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  1039. /* --------------------------------------------------------------------------------------------- */
  /*
   * Measure how much of PREFIX matches the beginning of TEXT (case-sensitive).
   *
   * Both strings are passed through str_utf8_normalize() and then walked in
   * parallel, one character at a time, until either ends or the characters
   * differ.
   *
   * Returns the number of bytes of the normalized PREFIX that matched.
   */
  static int
  str_utf8_prefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_normalize (text);
      char *norm_prefix = str_utf8_normalize (prefix);
      const char *tcur = norm_text;
      const char *pcur = norm_prefix;
      const char *tnext = norm_text;
      const char *pnext = norm_prefix;
      int matched;

      while (*tcur != '\0' && *pcur != '\0')
      {
          str_utf8_cnext_char_safe (&tnext);
          str_utf8_cnext_char_safe (&pnext);

          /* characters differ if their byte lengths or their bytes differ */
          if ((tnext - tcur != pnext - pcur) || strncmp (tcur, pcur, tnext - tcur) != 0)
              break;

          tcur = tnext;
          pcur = pnext;
      }

      matched = pcur - norm_prefix;

      g_free (norm_text);
      g_free (norm_prefix);

      return matched;
  }
  1069. /* --------------------------------------------------------------------------------------------- */
  /*
   * Measure how much of PREFIX matches the beginning of TEXT (case-insensitive).
   *
   * Both strings are passed through str_utf8_casefold_normalize() and then
   * walked in parallel, one character at a time, until either ends or the
   * characters differ.
   *
   * Returns the number of bytes of the converted PREFIX that matched.
   */
  static int
  str_utf8_caseprefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_casefold_normalize (text);
      char *norm_prefix = str_utf8_casefold_normalize (prefix);
      const char *tcur = norm_text;
      const char *pcur = norm_prefix;
      const char *tnext = norm_text;
      const char *pnext = norm_prefix;
      int matched;

      while (*tcur != '\0' && *pcur != '\0')
      {
          str_utf8_cnext_char_safe (&tnext);
          str_utf8_cnext_char_safe (&pnext);

          /* characters differ if their byte lengths or their bytes differ */
          if ((tnext - tcur != pnext - pcur) || strncmp (tcur, pcur, tnext - tcur) != 0)
              break;

          tcur = tnext;
          pcur = pnext;
      }

      matched = pcur - norm_prefix;

      g_free (norm_text);
      g_free (norm_prefix);

      return matched;
  }
  1099. /* --------------------------------------------------------------------------------------------- */
  1100. static char *
  1101. str_utf8_create_key_gen (const char *text, gboolean case_sen,
  1102. gchar *(*keygen) (const gchar *text, gssize size))
  1103. {
  1104. char *result;
  1105. if (case_sen)
  1106. result = str_utf8_normalize (text);
  1107. else
  1108. {
  1109. gboolean dot;
  1110. GString *fixed;
  1111. const char *start, *end;
  1112. char *fold, *key;
  1113. dot = text[0] == '.';
  1114. fixed = g_string_sized_new (16);
  1115. if (!dot)
  1116. start = text;
  1117. else
  1118. {
  1119. start = text + 1;
  1120. g_string_append_c (fixed, '.');
  1121. }
  1122. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1123. {
  1124. if (start != end)
  1125. {
  1126. fold = g_utf8_casefold (start, end - start);
  1127. key = keygen (fold, -1);
  1128. g_string_append (fixed, key);
  1129. g_free (key);
  1130. g_free (fold);
  1131. }
  1132. g_string_append_c (fixed, end[0]);
  1133. start = end + 1;
  1134. }
  1135. if (start == text)
  1136. {
  1137. fold = g_utf8_casefold (start, -1);
  1138. result = keygen (fold, -1);
  1139. g_free (fold);
  1140. g_string_free (fixed, TRUE);
  1141. }
  1142. else if (dot && (start == text + 1))
  1143. {
  1144. fold = g_utf8_casefold (start, -1);
  1145. key = keygen (fold, -1);
  1146. g_string_append (fixed, key);
  1147. g_free (key);
  1148. g_free (fold);
  1149. result = g_string_free (fixed, FALSE);
  1150. }
  1151. else
  1152. {
  1153. if (start[0] != '\0' && start != end)
  1154. {
  1155. fold = g_utf8_casefold (start, end - start);
  1156. key = keygen (fold, -1);
  1157. g_string_append (fixed, key);
  1158. g_free (key);
  1159. g_free (fold);
  1160. }
  1161. result = g_string_free (fixed, FALSE);
  1162. }
  1163. }
  1164. return result;
  1165. }
  1166. /* --------------------------------------------------------------------------------------------- */
  1167. static char *
  1168. str_utf8_create_key (const char *text, gboolean case_sen)
  1169. {
  1170. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1171. }
  1172. /* --------------------------------------------------------------------------------------------- */
  #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  /*
   * Build a sort key for the file name TEXT using
   * g_utf8_collate_key_for_filename() as the key generator.
   * See str_utf8_create_key_gen() for the details.
   */
  static char *
  str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
  {
      char *key;

      key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

      return key;
  }
  #endif
  1180. /* --------------------------------------------------------------------------------------------- */
  1181. static int
  1182. str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
  1183. {
  1184. (void) case_sen;
  1185. return strcmp (t1, t2);
  1186. }
  1187. /* --------------------------------------------------------------------------------------------- */
  1188. static void
  1189. str_utf8_release_key (char *key, gboolean case_sen)
  1190. {
  1191. (void) case_sen;
  1192. g_free (key);
  1193. }
  1194. /* --------------------------------------------------------------------------------------------- */
  1195. /*** public functions ****************************************************************************/
  1196. /* --------------------------------------------------------------------------------------------- */
  1197. struct str_class
  1198. str_utf8_init (void)
  1199. {
  1200. struct str_class result;
  1201. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1202. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1203. result.insert_replace_char = str_utf8_insert_replace_char;
  1204. result.is_valid_string = str_utf8_is_valid_string;
  1205. result.is_valid_char = str_utf8_is_valid_char;
  1206. result.cnext_char = str_utf8_cnext_char;
  1207. result.cprev_char = str_utf8_cprev_char;
  1208. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1209. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1210. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1211. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1212. result.char_isspace = str_utf8_isspace;
  1213. result.char_ispunct = str_utf8_ispunct;
  1214. result.char_isalnum = str_utf8_isalnum;
  1215. result.char_isdigit = str_utf8_isdigit;
  1216. result.char_isprint = str_utf8_isprint;
  1217. result.char_iscombiningmark = str_utf8_iscombiningmark;
  1218. result.char_toupper = str_utf8_toupper;
  1219. result.char_tolower = str_utf8_tolower;
  1220. result.length = str_utf8_length;
  1221. result.length2 = str_utf8_length2;
  1222. result.length_noncomb = str_utf8_length_noncomb;
  1223. result.fix_string = str_utf8_fix_string;
  1224. result.term_form = str_utf8_term_form;
  1225. result.fit_to_term = str_utf8_fit_to_term;
  1226. result.term_trim = str_utf8_term_trim;
  1227. result.term_width2 = str_utf8_term_width2;
  1228. result.term_width1 = str_utf8_term_width1;
  1229. result.term_char_width = str_utf8_term_char_width;
  1230. result.term_substring = str_utf8_term_substring;
  1231. result.trunc = str_utf8_trunc;
  1232. result.offset_to_pos = str_utf8_offset_to_pos;
  1233. result.column_to_pos = str_utf8_column_to_pos;
  1234. result.create_search_needle = str_utf8_create_search_needle;
  1235. result.release_search_needle = str_utf8_release_search_needle;
  1236. result.search_first = str_utf8_search_first;
  1237. result.search_last = str_utf8_search_last;
  1238. result.compare = str_utf8_compare;
  1239. result.ncompare = str_utf8_ncompare;
  1240. result.casecmp = str_utf8_casecmp;
  1241. result.ncasecmp = str_utf8_ncasecmp;
  1242. result.prefix = str_utf8_prefix;
  1243. result.caseprefix = str_utf8_caseprefix;
  1244. result.create_key = str_utf8_create_key;
  1245. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1246. /* case insensitive sort files in "a1 a2 a10" order */
  1247. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1248. #else
  1249. /* case insensitive sort files in "a1 a10 a2" order */
  1250. result.create_key_for_filename = str_utf8_create_key;
  1251. #endif
  1252. result.key_collate = str_utf8_key_collate;
  1253. result.release_key = str_utf8_release_key;
  1254. return result;
  1255. }
  1256. /* --------------------------------------------------------------------------------------------- */