strutilutf8.c 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519
  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007-2023
  4. Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. This file is part of the Midnight Commander.
  8. The Midnight Commander is free software: you can redistribute it
  9. and/or modify it under the terms of the GNU General Public License as
  10. published by the Free Software Foundation, either version 3 of the License,
  11. or (at your option) any later version.
  12. The Midnight Commander is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <langinfo.h>
  22. #include <limits.h> /* MB_LEN_MAX */
  23. #include <string.h>
  24. #include "lib/global.h"
  25. #include "lib/strutil.h"
  26. /* using function for utf-8 from glib */
  27. /*** global variables ****************************************************************************/
  28. /*** file scope macro definitions ****************************************************************/
  29. /*** file scope type declarations ****************************************************************/
/* Cursor state shared by the utf8_tool_* helpers while they build an output string. */
struct utf8_tool
{
    char *actual;               /* next write position in the output buffer */
    size_t remain;              /* bytes still available starting at 'actual' */
    const char *checked;        /* next read position in the source text */
    int ident;                  /* running column counter, compared with the to_ident arguments */
    gboolean compose;           /* set to TRUE once a combining mark has been copied */
};
/* Result produced by str_utf8_make_make_term_form(). */
struct term_form
{
    char text[BUF_MEDIUM * MB_LEN_MAX]; /* NUL-terminated converted text */
    size_t width;               /* columns counted for 'text' (wide characters count twice) */
    gboolean compose;           /* TRUE when 'text' contains a combining mark */
};
  44. /*** file scope variables ************************************************************************/
/* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER), emitted in place of invalid input bytes */
static const char replch[] = "\xEF\xBF\xBD";
  46. /* --------------------------------------------------------------------------------------------- */
  47. /*** file scope functions ************************************************************************/
  48. /* --------------------------------------------------------------------------------------------- */
  49. static gboolean
  50. str_unichar_iscombiningmark (gunichar uni)
  51. {
  52. GUnicodeType type;
  53. type = g_unichar_type (uni);
  54. return (type == G_UNICODE_SPACING_MARK)
  55. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  56. }
  57. /* --------------------------------------------------------------------------------------------- */
  58. static void
  59. str_utf8_insert_replace_char (GString * buffer)
  60. {
  61. g_string_append (buffer, replch);
  62. }
  63. /* --------------------------------------------------------------------------------------------- */
  64. static gboolean
  65. str_utf8_is_valid_string (const char *text)
  66. {
  67. return g_utf8_validate (text, -1, NULL);
  68. }
  69. /* --------------------------------------------------------------------------------------------- */
  70. static int
  71. str_utf8_is_valid_char (const char *ch, size_t size)
  72. {
  73. switch (g_utf8_get_char_validated (ch, size))
  74. {
  75. case (gunichar) (-2):
  76. return (-2);
  77. case (gunichar) (-1):
  78. return (-1);
  79. default:
  80. return 1;
  81. }
  82. }
  83. /* --------------------------------------------------------------------------------------------- */
  84. static void
  85. str_utf8_cnext_char (const char **text)
  86. {
  87. (*text) = g_utf8_next_char (*text);
  88. }
  89. /* --------------------------------------------------------------------------------------------- */
  90. static void
  91. str_utf8_cprev_char (const char **text)
  92. {
  93. (*text) = g_utf8_prev_char (*text);
  94. }
  95. /* --------------------------------------------------------------------------------------------- */
  96. static void
  97. str_utf8_cnext_char_safe (const char **text)
  98. {
  99. if (str_utf8_is_valid_char (*text, -1) == 1)
  100. (*text) = g_utf8_next_char (*text);
  101. else
  102. (*text)++;
  103. }
  104. /* --------------------------------------------------------------------------------------------- */
  105. static void
  106. str_utf8_cprev_char_safe (const char **text)
  107. {
  108. const char *result, *t;
  109. result = g_utf8_prev_char (*text);
  110. t = result;
  111. str_utf8_cnext_char_safe (&t);
  112. if (t == *text)
  113. (*text) = result;
  114. else
  115. (*text)--;
  116. }
  117. /* --------------------------------------------------------------------------------------------- */
  118. static void
  119. str_utf8_fix_string (char *text)
  120. {
  121. while (text[0] != '\0')
  122. {
  123. gunichar uni;
  124. uni = g_utf8_get_char_validated (text, -1);
  125. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  126. text = g_utf8_next_char (text);
  127. else
  128. {
  129. text[0] = '?';
  130. text++;
  131. }
  132. }
  133. }
  134. /* --------------------------------------------------------------------------------------------- */
  135. static gboolean
  136. str_utf8_isspace (const char *text)
  137. {
  138. gunichar uni;
  139. uni = g_utf8_get_char_validated (text, -1);
  140. return g_unichar_isspace (uni);
  141. }
  142. /* --------------------------------------------------------------------------------------------- */
  143. static gboolean
  144. str_utf8_ispunct (const char *text)
  145. {
  146. gunichar uni;
  147. uni = g_utf8_get_char_validated (text, -1);
  148. return g_unichar_ispunct (uni);
  149. }
  150. /* --------------------------------------------------------------------------------------------- */
  151. static gboolean
  152. str_utf8_isalnum (const char *text)
  153. {
  154. gunichar uni;
  155. uni = g_utf8_get_char_validated (text, -1);
  156. return g_unichar_isalnum (uni);
  157. }
  158. /* --------------------------------------------------------------------------------------------- */
  159. static gboolean
  160. str_utf8_isdigit (const char *text)
  161. {
  162. gunichar uni;
  163. uni = g_utf8_get_char_validated (text, -1);
  164. return g_unichar_isdigit (uni);
  165. }
  166. /* --------------------------------------------------------------------------------------------- */
  167. static gboolean
  168. str_utf8_isprint (const char *ch)
  169. {
  170. gunichar uni;
  171. uni = g_utf8_get_char_validated (ch, -1);
  172. return g_unichar_isprint (uni);
  173. }
  174. /* --------------------------------------------------------------------------------------------- */
  175. static gboolean
  176. str_utf8_iscombiningmark (const char *ch)
  177. {
  178. gunichar uni;
  179. uni = g_utf8_get_char_validated (ch, -1);
  180. return str_unichar_iscombiningmark (uni);
  181. }
  182. /* --------------------------------------------------------------------------------------------- */
  183. static int
  184. str_utf8_cnext_noncomb_char (const char **text)
  185. {
  186. int count = 0;
  187. while ((*text)[0] != '\0')
  188. {
  189. str_utf8_cnext_char_safe (text);
  190. count++;
  191. if (!str_utf8_iscombiningmark (*text))
  192. break;
  193. }
  194. return count;
  195. }
  196. /* --------------------------------------------------------------------------------------------- */
  197. static int
  198. str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  199. {
  200. int count = 0;
  201. while ((*text) != begin)
  202. {
  203. str_utf8_cprev_char_safe (text);
  204. count++;
  205. if (!str_utf8_iscombiningmark (*text))
  206. break;
  207. }
  208. return count;
  209. }
  210. /* --------------------------------------------------------------------------------------------- */
  211. static gboolean
  212. str_utf8_toupper (const char *text, char **out, size_t * remain)
  213. {
  214. gunichar uni;
  215. size_t left;
  216. uni = g_utf8_get_char_validated (text, -1);
  217. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  218. return FALSE;
  219. uni = g_unichar_toupper (uni);
  220. left = g_unichar_to_utf8 (uni, NULL);
  221. if (left >= *remain)
  222. return FALSE;
  223. left = g_unichar_to_utf8 (uni, *out);
  224. (*out) += left;
  225. (*remain) -= left;
  226. return TRUE;
  227. }
  228. /* --------------------------------------------------------------------------------------------- */
  229. static gboolean
  230. str_utf8_tolower (const char *text, char **out, size_t * remain)
  231. {
  232. gunichar uni;
  233. size_t left;
  234. uni = g_utf8_get_char_validated (text, -1);
  235. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  236. return FALSE;
  237. uni = g_unichar_tolower (uni);
  238. left = g_unichar_to_utf8 (uni, NULL);
  239. if (left >= *remain)
  240. return FALSE;
  241. left = g_unichar_to_utf8 (uni, *out);
  242. (*out) += left;
  243. (*remain) -= left;
  244. return TRUE;
  245. }
  246. /* --------------------------------------------------------------------------------------------- */
  247. static int
  248. str_utf8_length (const char *text)
  249. {
  250. int result = 0;
  251. const char *start;
  252. const char *end;
  253. start = text;
  254. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  255. {
  256. if (start != end)
  257. result += g_utf8_strlen (start, end - start);
  258. result++;
  259. start = end + 1;
  260. }
  261. if (start == text)
  262. result = g_utf8_strlen (text, -1);
  263. else if (start[0] != '\0' && start != end)
  264. result += g_utf8_strlen (start, end - start);
  265. return result;
  266. }
  267. /* --------------------------------------------------------------------------------------------- */
  268. static int
  269. str_utf8_length2 (const char *text, int size)
  270. {
  271. int result = 0;
  272. const char *start;
  273. const char *end;
  274. start = text;
  275. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
  276. {
  277. if (start != end)
  278. {
  279. result += g_utf8_strlen (start, MIN (end - start, size));
  280. size -= end - start;
  281. }
  282. result += (size > 0);
  283. size--;
  284. start = end + 1;
  285. }
  286. if (start == text)
  287. result = g_utf8_strlen (text, size);
  288. else if (start[0] != '\0' && start != end && size > 0)
  289. result += g_utf8_strlen (start, MIN (end - start, size));
  290. return result;
  291. }
  292. /* --------------------------------------------------------------------------------------------- */
  293. static int
  294. str_utf8_length_noncomb (const char *text)
  295. {
  296. int result = 0;
  297. const char *t = text;
  298. while (t[0] != '\0')
  299. {
  300. str_utf8_cnext_noncomb_char (&t);
  301. result++;
  302. }
  303. return result;
  304. }
  305. /* --------------------------------------------------------------------------------------------- */
  306. #if 0
  307. static void
  308. str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
  309. {
  310. char *next;
  311. next = g_utf8_next_char (*string);
  312. (*left) -= next - (*string);
  313. (*string) = next;
  314. g_string_append_c (buffer, '?');
  315. }
  316. #endif
  317. /* --------------------------------------------------------------------------------------------- */
  318. static gchar *
  319. str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
  320. {
  321. if (mcerror != NULL)
  322. return g_strdup (mcerror->message);
  323. return g_strdup (def_msg != NULL ? def_msg : "");
  324. }
  325. /* --------------------------------------------------------------------------------------------- */
  326. static estr_t
  327. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
  328. {
  329. estr_t result = ESTR_SUCCESS;
  330. if (coder == str_cnv_not_convert)
  331. g_string_append_len (buffer, string, size);
  332. else
  333. result = str_nconvert (coder, string, size, buffer);
  334. return result;
  335. }
  336. /* --------------------------------------------------------------------------------------------- */
  337. /* utility function, that makes string valid in utf8 and all characters printable
  338. * return width of string too */
  339. static const struct term_form *
  340. str_utf8_make_make_term_form (const char *text, size_t length)
  341. {
  342. static struct term_form result;
  343. gunichar uni;
  344. size_t left;
  345. char *actual;
  346. result.text[0] = '\0';
  347. result.width = 0;
  348. result.compose = FALSE;
  349. actual = result.text;
  350. /* check if text start with combining character,
  351. * add space at begin in this case */
  352. if (length != 0 && text[0] != '\0')
  353. {
  354. uni = g_utf8_get_char_validated (text, -1);
  355. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
  356. && str_unichar_iscombiningmark (uni))
  357. {
  358. actual[0] = ' ';
  359. actual++;
  360. result.width++;
  361. result.compose = TRUE;
  362. }
  363. }
  364. while (length != 0 && text[0] != '\0')
  365. {
  366. uni = g_utf8_get_char_validated (text, -1);
  367. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  368. {
  369. if (g_unichar_isprint (uni))
  370. {
  371. left = g_unichar_to_utf8 (uni, actual);
  372. actual += left;
  373. if (str_unichar_iscombiningmark (uni))
  374. result.compose = TRUE;
  375. else
  376. {
  377. result.width++;
  378. if (g_unichar_iswide (uni))
  379. result.width++;
  380. }
  381. }
  382. else
  383. {
  384. actual[0] = '.';
  385. actual++;
  386. result.width++;
  387. }
  388. text = g_utf8_next_char (text);
  389. }
  390. else
  391. {
  392. text++;
  393. /*actual[0] = '?'; */
  394. memcpy (actual, replch, strlen (replch));
  395. actual += strlen (replch);
  396. result.width++;
  397. }
  398. if (length != (size_t) (-1))
  399. length--;
  400. }
  401. actual[0] = '\0';
  402. return &result;
  403. }
  404. /* --------------------------------------------------------------------------------------------- */
  405. static const char *
  406. str_utf8_term_form (const char *text)
  407. {
  408. static char result[BUF_MEDIUM * MB_LEN_MAX];
  409. const struct term_form *pre_form;
  410. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  411. if (pre_form->compose)
  412. {
  413. char *composed;
  414. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  415. g_strlcpy (result, composed, sizeof (result));
  416. g_free (composed);
  417. }
  418. else
  419. g_strlcpy (result, pre_form->text, sizeof (result));
  420. return result;
  421. }
  422. /* --------------------------------------------------------------------------------------------- */
  423. /* utility function, that copies all characters from checked to actual */
  424. static gboolean
  425. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  426. {
  427. tool->compose = FALSE;
  428. while (tool->checked[0] != '\0')
  429. {
  430. gunichar uni;
  431. size_t left;
  432. uni = g_utf8_get_char (tool->checked);
  433. tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
  434. left = g_unichar_to_utf8 (uni, NULL);
  435. if (tool->remain <= left)
  436. return FALSE;
  437. left = g_unichar_to_utf8 (uni, tool->actual);
  438. tool->actual += left;
  439. tool->remain -= left;
  440. tool->checked = g_utf8_next_char (tool->checked);
  441. }
  442. return TRUE;
  443. }
  444. /* --------------------------------------------------------------------------------------------- */
  445. /* utility function, that copies characters from checked to actual until ident is
  446. * smaller than to_ident */
  447. static gboolean
  448. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  449. {
  450. tool->compose = FALSE;
  451. while (tool->checked[0] != '\0')
  452. {
  453. gunichar uni;
  454. size_t left;
  455. int w = 0;
  456. uni = g_utf8_get_char (tool->checked);
  457. if (str_unichar_iscombiningmark (uni))
  458. tool->compose = TRUE;
  459. else
  460. {
  461. w = 1;
  462. if (g_unichar_iswide (uni))
  463. w++;
  464. if (tool->ident + w > to_ident)
  465. return TRUE;
  466. }
  467. left = g_unichar_to_utf8 (uni, NULL);
  468. if (tool->remain <= left)
  469. return FALSE;
  470. left = g_unichar_to_utf8 (uni, tool->actual);
  471. tool->actual += left;
  472. tool->remain -= left;
  473. tool->checked = g_utf8_next_char (tool->checked);
  474. tool->ident += w;
  475. }
  476. return TRUE;
  477. }
  478. /* --------------------------------------------------------------------------------------------- */
  479. /* utility function, adds count spaces to actual */
  480. static int
  481. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  482. {
  483. if (count <= 0)
  484. return 1;
  485. if (tool->remain <= (gsize) count)
  486. return 0;
  487. memset (tool->actual, ' ', count);
  488. tool->actual += count;
  489. tool->remain -= count;
  490. return 1;
  491. }
  492. /* --------------------------------------------------------------------------------------------- */
  493. /* utility function, adds one characters to actual */
  494. static int
  495. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  496. {
  497. if (tool->remain <= 1)
  498. return 0;
  499. tool->actual[0] = ch;
  500. tool->actual++;
  501. tool->remain--;
  502. return 1;
  503. }
  504. /* --------------------------------------------------------------------------------------------- */
  505. /* utility function, thah skips characters from checked until ident is greater or
  506. * equal to to_ident */
  507. static gboolean
  508. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  509. {
  510. gunichar uni;
  511. while (to_ident > tool->ident && tool->checked[0] != '\0')
  512. {
  513. uni = g_utf8_get_char (tool->checked);
  514. if (!str_unichar_iscombiningmark (uni))
  515. {
  516. tool->ident++;
  517. if (g_unichar_iswide (uni))
  518. tool->ident++;
  519. }
  520. tool->checked = g_utf8_next_char (tool->checked);
  521. }
  522. uni = g_utf8_get_char (tool->checked);
  523. while (str_unichar_iscombiningmark (uni))
  524. {
  525. tool->checked = g_utf8_next_char (tool->checked);
  526. uni = g_utf8_get_char (tool->checked);
  527. }
  528. return TRUE;
  529. }
  530. /* --------------------------------------------------------------------------------------------- */
  531. static void
  532. utf8_tool_compose (char *buffer, size_t size)
  533. {
  534. char *composed;
  535. composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  536. g_strlcpy (buffer, composed, size);
  537. g_free (composed);
  538. }
  539. /* --------------------------------------------------------------------------------------------- */
  540. static const char *
  541. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  542. {
  543. static char result[BUF_MEDIUM * MB_LEN_MAX];
  544. const struct term_form *pre_form;
  545. struct utf8_tool tool;
  546. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  547. tool.checked = pre_form->text;
  548. tool.actual = result;
  549. tool.remain = sizeof (result);
  550. tool.compose = FALSE;
  551. if (pre_form->width <= (gsize) width)
  552. {
  553. switch (HIDE_FIT (just_mode))
  554. {
  555. case J_CENTER_LEFT:
  556. case J_CENTER:
  557. tool.ident = (width - pre_form->width) / 2;
  558. break;
  559. case J_RIGHT:
  560. tool.ident = width - pre_form->width;
  561. break;
  562. default:
  563. tool.ident = 0;
  564. break;
  565. }
  566. utf8_tool_insert_space (&tool, tool.ident);
  567. utf8_tool_copy_chars_to_end (&tool);
  568. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  569. }
  570. else if (IS_FIT (just_mode))
  571. {
  572. tool.ident = 0;
  573. utf8_tool_copy_chars_to (&tool, width / 2);
  574. utf8_tool_insert_char (&tool, '~');
  575. tool.ident = 0;
  576. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  577. utf8_tool_copy_chars_to_end (&tool);
  578. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  579. }
  580. else
  581. {
  582. switch (HIDE_FIT (just_mode))
  583. {
  584. case J_CENTER:
  585. tool.ident = (width - pre_form->width) / 2;
  586. break;
  587. case J_RIGHT:
  588. tool.ident = width - pre_form->width;
  589. break;
  590. default:
  591. tool.ident = 0;
  592. break;
  593. }
  594. utf8_tool_skip_chars_to (&tool, 0);
  595. utf8_tool_insert_space (&tool, tool.ident);
  596. utf8_tool_copy_chars_to (&tool, width);
  597. utf8_tool_insert_space (&tool, width - tool.ident);
  598. }
  599. tool.actual[0] = '\0';
  600. if (tool.compose)
  601. utf8_tool_compose (result, sizeof (result));
  602. return result;
  603. }
  604. /* --------------------------------------------------------------------------------------------- */
  605. static const char *
  606. str_utf8_term_trim (const char *text, int width)
  607. {
  608. static char result[BUF_MEDIUM * MB_LEN_MAX];
  609. const struct term_form *pre_form;
  610. struct utf8_tool tool;
  611. if (width < 1)
  612. {
  613. result[0] = '\0';
  614. return result;
  615. }
  616. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  617. tool.checked = pre_form->text;
  618. tool.actual = result;
  619. tool.remain = sizeof (result);
  620. tool.compose = FALSE;
  621. if ((gsize) width >= pre_form->width)
  622. utf8_tool_copy_chars_to_end (&tool);
  623. else if (width <= 3)
  624. {
  625. memset (tool.actual, '.', width);
  626. tool.actual += width;
  627. tool.remain -= width;
  628. }
  629. else
  630. {
  631. memset (tool.actual, '.', 3);
  632. tool.actual += 3;
  633. tool.remain -= 3;
  634. tool.ident = 0;
  635. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  636. utf8_tool_copy_chars_to_end (&tool);
  637. }
  638. tool.actual[0] = '\0';
  639. if (tool.compose)
  640. utf8_tool_compose (result, sizeof (result));
  641. return result;
  642. }
  643. /* --------------------------------------------------------------------------------------------- */
  644. static int
  645. str_utf8_term_width2 (const char *text, size_t length)
  646. {
  647. const struct term_form *result;
  648. result = str_utf8_make_make_term_form (text, length);
  649. return result->width;
  650. }
  651. /* --------------------------------------------------------------------------------------------- */
  652. static int
  653. str_utf8_term_width1 (const char *text)
  654. {
  655. return str_utf8_term_width2 (text, (size_t) (-1));
  656. }
  657. /* --------------------------------------------------------------------------------------------- */
  658. static int
  659. str_utf8_term_char_width (const char *text)
  660. {
  661. gunichar uni;
  662. uni = g_utf8_get_char_validated (text, -1);
  663. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  664. }
  665. /* --------------------------------------------------------------------------------------------- */
  666. static const char *
  667. str_utf8_term_substring (const char *text, int start, int width)
  668. {
  669. static char result[BUF_MEDIUM * MB_LEN_MAX];
  670. const struct term_form *pre_form;
  671. struct utf8_tool tool;
  672. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  673. tool.checked = pre_form->text;
  674. tool.actual = result;
  675. tool.remain = sizeof (result);
  676. tool.compose = FALSE;
  677. tool.ident = -start;
  678. utf8_tool_skip_chars_to (&tool, 0);
  679. if (tool.ident < 0)
  680. tool.ident = 0;
  681. utf8_tool_insert_space (&tool, tool.ident);
  682. utf8_tool_copy_chars_to (&tool, width);
  683. utf8_tool_insert_space (&tool, width - tool.ident);
  684. tool.actual[0] = '\0';
  685. if (tool.compose)
  686. utf8_tool_compose (result, sizeof (result));
  687. return result;
  688. }
  689. /* --------------------------------------------------------------------------------------------- */
  690. static const char *
  691. str_utf8_trunc (const char *text, int width)
  692. {
  693. static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
  694. const struct term_form *pre_form;
  695. struct utf8_tool tool;
  696. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  697. tool.checked = pre_form->text;
  698. tool.actual = result;
  699. tool.remain = sizeof (result);
  700. tool.compose = FALSE;
  701. if (pre_form->width <= (gsize) width)
  702. utf8_tool_copy_chars_to_end (&tool);
  703. else
  704. {
  705. tool.ident = 0;
  706. utf8_tool_copy_chars_to (&tool, width / 2);
  707. utf8_tool_insert_char (&tool, '~');
  708. tool.ident = 0;
  709. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  710. utf8_tool_copy_chars_to_end (&tool);
  711. }
  712. tool.actual[0] = '\0';
  713. if (tool.compose)
  714. utf8_tool_compose (result, sizeof (result));
  715. return result;
  716. }
  717. /* --------------------------------------------------------------------------------------------- */
  718. static int
  719. str_utf8_offset_to_pos (const char *text, size_t length)
  720. {
  721. if (str_utf8_is_valid_string (text))
  722. return g_utf8_offset_to_pointer (text, length) - text;
  723. else
  724. {
  725. int result;
  726. GString *buffer;
  727. buffer = g_string_new (text);
  728. str_utf8_fix_string (buffer->str);
  729. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  730. g_string_free (buffer, TRUE);
  731. return result;
  732. }
  733. }
  734. /* --------------------------------------------------------------------------------------------- */
  735. static int
  736. str_utf8_column_to_pos (const char *text, size_t pos)
  737. {
  738. int result = 0;
  739. int width = 0;
  740. while (text[0] != '\0')
  741. {
  742. gunichar uni;
  743. uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
  744. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  745. {
  746. if (g_unichar_isprint (uni))
  747. {
  748. if (!str_unichar_iscombiningmark (uni))
  749. {
  750. width++;
  751. if (g_unichar_iswide (uni))
  752. width++;
  753. }
  754. }
  755. else
  756. {
  757. width++;
  758. }
  759. text = g_utf8_next_char (text);
  760. }
  761. else
  762. {
  763. text++;
  764. width++;
  765. }
  766. if ((gsize) width > pos)
  767. return result;
  768. result++;
  769. }
  770. return result;
  771. }
  772. /* --------------------------------------------------------------------------------------------- */
  773. static char *
  774. str_utf8_create_search_needle (const char *needle, gboolean case_sen)
  775. {
  776. char *fold, *result;
  777. if (needle == NULL)
  778. return NULL;
  779. if (case_sen)
  780. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  781. fold = g_utf8_casefold (needle, -1);
  782. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  783. g_free (fold);
  784. return result;
  785. }
  786. /* --------------------------------------------------------------------------------------------- */
  787. static void
  788. str_utf8_release_search_needle (char *needle, gboolean case_sen)
  789. {
  790. (void) case_sen;
  791. g_free (needle);
  792. }
  793. /* --------------------------------------------------------------------------------------------- */
  794. static const char *
  795. str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
  796. {
  797. char *fold_text;
  798. char *deco_text;
  799. const char *match;
  800. const char *result = NULL;
  801. const char *m;
  802. fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
  803. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  804. match = deco_text;
  805. do
  806. {
  807. match = g_strstr_len (match, -1, search);
  808. if (match != NULL)
  809. {
  810. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  811. !str_utf8_iscombiningmark (match + strlen (search)))
  812. {
  813. result = text;
  814. m = deco_text;
  815. while (m < match)
  816. {
  817. str_utf8_cnext_noncomb_char (&m);
  818. str_utf8_cnext_noncomb_char (&result);
  819. }
  820. }
  821. else
  822. str_utf8_cnext_char (&match);
  823. }
  824. }
  825. while (match != NULL && result == NULL);
  826. g_free (deco_text);
  827. if (!case_sen)
  828. g_free (fold_text);
  829. return result;
  830. }
  831. /* --------------------------------------------------------------------------------------------- */
  832. static const char *
  833. str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
  834. {
  835. char *fold_text;
  836. char *deco_text;
  837. char *match;
  838. const char *result = NULL;
  839. const char *m;
  840. fold_text = case_sen ? (char *) text : g_utf8_casefold (text, -1);
  841. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  842. do
  843. {
  844. match = g_strrstr_len (deco_text, -1, search);
  845. if (match != NULL)
  846. {
  847. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  848. !str_utf8_iscombiningmark (match + strlen (search)))
  849. {
  850. result = text;
  851. m = deco_text;
  852. while (m < match)
  853. {
  854. str_utf8_cnext_noncomb_char (&m);
  855. str_utf8_cnext_noncomb_char (&result);
  856. }
  857. }
  858. else
  859. match[0] = '\0';
  860. }
  861. }
  862. while (match != NULL && result == NULL);
  863. g_free (deco_text);
  864. if (!case_sen)
  865. g_free (fold_text);
  866. return result;
  867. }
  868. /* --------------------------------------------------------------------------------------------- */
  869. static char *
  870. str_utf8_normalize (const char *text)
  871. {
  872. GString *fixed;
  873. char *tmp;
  874. char *result;
  875. const char *start;
  876. const char *end;
  877. /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
  878. * does the normalization and then converts UCS-4 back into UTF-8.
  879. * Since file names are composed of ASCII characters in most cases, we can speed up
  880. * utf8 normalization by checking if the heavyweight Unicode normalization is actually
  881. * needed. Normalization of ASCII string is no-op.
  882. */
  883. /* find out whether text is ASCII only */
  884. for (end = text; *end != '\0'; end++)
  885. if ((*end & 0x80) != 0)
  886. {
  887. /* found 2nd byte of utf8-encoded symbol */
  888. break;
  889. }
  890. /* if text is ASCII-only, return copy, normalize otherwise */
  891. if (*end == '\0')
  892. return g_strndup (text, end - text);
  893. fixed = g_string_sized_new (4);
  894. start = text;
  895. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  896. {
  897. if (start != end)
  898. {
  899. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  900. g_string_append (fixed, tmp);
  901. g_free (tmp);
  902. }
  903. g_string_append_c (fixed, end[0]);
  904. start = end + 1;
  905. }
  906. if (start == text)
  907. {
  908. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  909. g_string_free (fixed, TRUE);
  910. }
  911. else
  912. {
  913. if (start[0] != '\0' && start != end)
  914. {
  915. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  916. g_string_append (fixed, tmp);
  917. g_free (tmp);
  918. }
  919. result = g_string_free (fixed, FALSE);
  920. }
  921. return result;
  922. }
  923. /* --------------------------------------------------------------------------------------------- */
  924. static char *
  925. str_utf8_casefold_normalize (const char *text)
  926. {
  927. GString *fixed;
  928. char *tmp, *fold;
  929. char *result;
  930. const char *start;
  931. const char *end;
  932. fixed = g_string_sized_new (4);
  933. start = text;
  934. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  935. {
  936. if (start != end)
  937. {
  938. fold = g_utf8_casefold (start, end - start);
  939. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  940. g_string_append (fixed, tmp);
  941. g_free (tmp);
  942. g_free (fold);
  943. }
  944. g_string_append_c (fixed, end[0]);
  945. start = end + 1;
  946. }
  947. if (start == text)
  948. {
  949. fold = g_utf8_casefold (text, -1);
  950. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  951. g_free (fold);
  952. g_string_free (fixed, TRUE);
  953. }
  954. else
  955. {
  956. if (start[0] != '\0' && start != end)
  957. {
  958. fold = g_utf8_casefold (start, end - start);
  959. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  960. g_string_append (fixed, tmp);
  961. g_free (tmp);
  962. g_free (fold);
  963. }
  964. result = g_string_free (fixed, FALSE);
  965. }
  966. return result;
  967. }
  968. /* --------------------------------------------------------------------------------------------- */
  969. static int
  970. str_utf8_compare (const char *t1, const char *t2)
  971. {
  972. char *n1, *n2;
  973. int result;
  974. n1 = str_utf8_normalize (t1);
  975. n2 = str_utf8_normalize (t2);
  976. result = strcmp (n1, n2);
  977. g_free (n1);
  978. g_free (n2);
  979. return result;
  980. }
  981. /* --------------------------------------------------------------------------------------------- */
  982. static int
  983. str_utf8_ncompare (const char *t1, const char *t2)
  984. {
  985. char *n1, *n2;
  986. size_t l1, l2;
  987. int result;
  988. n1 = str_utf8_normalize (t1);
  989. n2 = str_utf8_normalize (t2);
  990. l1 = strlen (n1);
  991. l2 = strlen (n2);
  992. result = strncmp (n1, n2, MIN (l1, l2));
  993. g_free (n1);
  994. g_free (n2);
  995. return result;
  996. }
  997. /* --------------------------------------------------------------------------------------------- */
  998. static int
  999. str_utf8_casecmp (const char *t1, const char *t2)
  1000. {
  1001. char *n1, *n2;
  1002. int result;
  1003. n1 = str_utf8_casefold_normalize (t1);
  1004. n2 = str_utf8_casefold_normalize (t2);
  1005. result = strcmp (n1, n2);
  1006. g_free (n1);
  1007. g_free (n2);
  1008. return result;
  1009. }
  1010. /* --------------------------------------------------------------------------------------------- */
  1011. static int
  1012. str_utf8_ncasecmp (const char *t1, const char *t2)
  1013. {
  1014. char *n1, *n2;
  1015. size_t l1, l2;
  1016. int result;
  1017. n1 = str_utf8_casefold_normalize (t1);
  1018. n2 = str_utf8_casefold_normalize (t2);
  1019. l1 = strlen (n1);
  1020. l2 = strlen (n2);
  1021. result = strncmp (n1, n2, MIN (l1, l2));
  1022. g_free (n1);
  1023. g_free (n2);
  1024. return result;
  1025. }
  1026. /* --------------------------------------------------------------------------------------------- */
  1027. static int
  1028. str_utf8_prefix (const char *text, const char *prefix)
  1029. {
  1030. char *t, *p;
  1031. const char *nt, *np;
  1032. const char *nnt, *nnp;
  1033. int result;
  1034. t = str_utf8_normalize (text);
  1035. p = str_utf8_normalize (prefix);
  1036. nt = t;
  1037. np = p;
  1038. nnt = t;
  1039. nnp = p;
  1040. while (nt[0] != '\0' && np[0] != '\0')
  1041. {
  1042. str_utf8_cnext_char_safe (&nnt);
  1043. str_utf8_cnext_char_safe (&nnp);
  1044. if (nnt - nt != nnp - np)
  1045. break;
  1046. if (strncmp (nt, np, nnt - nt) != 0)
  1047. break;
  1048. nt = nnt;
  1049. np = nnp;
  1050. }
  1051. result = np - p;
  1052. g_free (t);
  1053. g_free (p);
  1054. return result;
  1055. }
  1056. /* --------------------------------------------------------------------------------------------- */
  1057. static int
  1058. str_utf8_caseprefix (const char *text, const char *prefix)
  1059. {
  1060. char *t, *p;
  1061. const char *nt, *np;
  1062. const char *nnt, *nnp;
  1063. int result;
  1064. t = str_utf8_casefold_normalize (text);
  1065. p = str_utf8_casefold_normalize (prefix);
  1066. nt = t;
  1067. np = p;
  1068. nnt = t;
  1069. nnp = p;
  1070. while (nt[0] != '\0' && np[0] != '\0')
  1071. {
  1072. str_utf8_cnext_char_safe (&nnt);
  1073. str_utf8_cnext_char_safe (&nnp);
  1074. if (nnt - nt != nnp - np)
  1075. break;
  1076. if (strncmp (nt, np, nnt - nt) != 0)
  1077. break;
  1078. nt = nnt;
  1079. np = nnp;
  1080. }
  1081. result = np - p;
  1082. g_free (t);
  1083. g_free (p);
  1084. return result;
  1085. }
  1086. /* --------------------------------------------------------------------------------------------- */
  1087. static char *
  1088. str_utf8_create_key_gen (const char *text, gboolean case_sen,
  1089. gchar * (*keygen) (const gchar * text, gssize size))
  1090. {
  1091. char *result;
  1092. if (case_sen)
  1093. result = str_utf8_normalize (text);
  1094. else
  1095. {
  1096. gboolean dot;
  1097. GString *fixed;
  1098. const char *start, *end;
  1099. char *fold, *key;
  1100. dot = text[0] == '.';
  1101. fixed = g_string_sized_new (16);
  1102. if (!dot)
  1103. start = text;
  1104. else
  1105. {
  1106. start = text + 1;
  1107. g_string_append_c (fixed, '.');
  1108. }
  1109. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1110. {
  1111. if (start != end)
  1112. {
  1113. fold = g_utf8_casefold (start, end - start);
  1114. key = keygen (fold, -1);
  1115. g_string_append (fixed, key);
  1116. g_free (key);
  1117. g_free (fold);
  1118. }
  1119. g_string_append_c (fixed, end[0]);
  1120. start = end + 1;
  1121. }
  1122. if (start == text)
  1123. {
  1124. fold = g_utf8_casefold (start, -1);
  1125. result = keygen (fold, -1);
  1126. g_free (fold);
  1127. g_string_free (fixed, TRUE);
  1128. }
  1129. else if (dot && (start == text + 1))
  1130. {
  1131. fold = g_utf8_casefold (start, -1);
  1132. key = keygen (fold, -1);
  1133. g_string_append (fixed, key);
  1134. g_free (key);
  1135. g_free (fold);
  1136. result = g_string_free (fixed, FALSE);
  1137. }
  1138. else
  1139. {
  1140. if (start[0] != '\0' && start != end)
  1141. {
  1142. fold = g_utf8_casefold (start, end - start);
  1143. key = keygen (fold, -1);
  1144. g_string_append (fixed, key);
  1145. g_free (key);
  1146. g_free (fold);
  1147. }
  1148. result = g_string_free (fixed, FALSE);
  1149. }
  1150. }
  1151. return result;
  1152. }
  1153. /* --------------------------------------------------------------------------------------------- */
  1154. static char *
  1155. str_utf8_create_key (const char *text, gboolean case_sen)
  1156. {
  1157. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1158. }
  1159. /* --------------------------------------------------------------------------------------------- */
  1160. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1161. static char *
  1162. str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
  1163. {
  1164. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
  1165. }
  1166. #endif
  1167. /* --------------------------------------------------------------------------------------------- */
  1168. static int
  1169. str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
  1170. {
  1171. (void) case_sen;
  1172. return strcmp (t1, t2);
  1173. }
  1174. /* --------------------------------------------------------------------------------------------- */
  1175. static void
  1176. str_utf8_release_key (char *key, gboolean case_sen)
  1177. {
  1178. (void) case_sen;
  1179. g_free (key);
  1180. }
  1181. /* --------------------------------------------------------------------------------------------- */
  1182. /*** public functions ****************************************************************************/
  1183. /* --------------------------------------------------------------------------------------------- */
  1184. struct str_class
  1185. str_utf8_init (void)
  1186. {
  1187. struct str_class result;
  1188. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1189. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1190. result.insert_replace_char = str_utf8_insert_replace_char;
  1191. result.is_valid_string = str_utf8_is_valid_string;
  1192. result.is_valid_char = str_utf8_is_valid_char;
  1193. result.cnext_char = str_utf8_cnext_char;
  1194. result.cprev_char = str_utf8_cprev_char;
  1195. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1196. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1197. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1198. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1199. result.char_isspace = str_utf8_isspace;
  1200. result.char_ispunct = str_utf8_ispunct;
  1201. result.char_isalnum = str_utf8_isalnum;
  1202. result.char_isdigit = str_utf8_isdigit;
  1203. result.char_isprint = str_utf8_isprint;
  1204. result.char_iscombiningmark = str_utf8_iscombiningmark;
  1205. result.char_toupper = str_utf8_toupper;
  1206. result.char_tolower = str_utf8_tolower;
  1207. result.length = str_utf8_length;
  1208. result.length2 = str_utf8_length2;
  1209. result.length_noncomb = str_utf8_length_noncomb;
  1210. result.fix_string = str_utf8_fix_string;
  1211. result.term_form = str_utf8_term_form;
  1212. result.fit_to_term = str_utf8_fit_to_term;
  1213. result.term_trim = str_utf8_term_trim;
  1214. result.term_width2 = str_utf8_term_width2;
  1215. result.term_width1 = str_utf8_term_width1;
  1216. result.term_char_width = str_utf8_term_char_width;
  1217. result.term_substring = str_utf8_term_substring;
  1218. result.trunc = str_utf8_trunc;
  1219. result.offset_to_pos = str_utf8_offset_to_pos;
  1220. result.column_to_pos = str_utf8_column_to_pos;
  1221. result.create_search_needle = str_utf8_create_search_needle;
  1222. result.release_search_needle = str_utf8_release_search_needle;
  1223. result.search_first = str_utf8_search_first;
  1224. result.search_last = str_utf8_search_last;
  1225. result.compare = str_utf8_compare;
  1226. result.ncompare = str_utf8_ncompare;
  1227. result.casecmp = str_utf8_casecmp;
  1228. result.ncasecmp = str_utf8_ncasecmp;
  1229. result.prefix = str_utf8_prefix;
  1230. result.caseprefix = str_utf8_caseprefix;
  1231. result.create_key = str_utf8_create_key;
  1232. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1233. /* case insensitive sort files in "a1 a2 a10" order */
  1234. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1235. #else
  1236. /* case insensitive sort files in "a1 a10 a2" order */
  1237. result.create_key_for_filename = str_utf8_create_key;
  1238. #endif
  1239. result.key_collate = str_utf8_key_collate;
  1240. result.release_key = str_utf8_release_key;
  1241. return result;
  1242. }
  1243. /* --------------------------------------------------------------------------------------------- */