strutilutf8.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360
  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007-2015
  4. Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. This file is part of the Midnight Commander.
  8. The Midnight Commander is free software: you can redistribute it
  9. and/or modify it under the terms of the GNU General Public License as
  10. published by the Free Software Foundation, either version 3 of the License,
  11. or (at your option) any later version.
  12. The Midnight Commander is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <langinfo.h>
  22. #include <string.h>
  23. #include "lib/global.h"
  24. #include "lib/strutil.h"
  25. /* The UTF-8 helpers in this file are built on GLib's Unicode functions. */
  /* Replacement marker for undecodable input: the bytes EF BF BD, i.e. the
   * UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER). */
  26. static const char replch[] = "\xEF\xBF\xBD";
  27. static gboolean
  28. str_unichar_iscombiningmark (gunichar uni)
  29. {
  30. GUnicodeType type;
  31. type = g_unichar_type (uni);
  32. return (type == G_UNICODE_COMBINING_MARK)
  33. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  34. }
  35. static void
  36. str_utf8_insert_replace_char (GString * buffer)
  37. {
  38. g_string_append (buffer, replch);
  39. }
  40. static int
  41. str_utf8_is_valid_string (const char *text)
  42. {
  43. return g_utf8_validate (text, -1, NULL);
  44. }
  45. static int
  46. str_utf8_is_valid_char (const char *ch, size_t size)
  47. {
  48. switch (g_utf8_get_char_validated (ch, size))
  49. {
  50. case (gunichar) (-2):
  51. return (-2);
  52. case (gunichar) (-1):
  53. return (-1);
  54. default:
  55. return 1;
  56. }
  57. }
  /* Move *text forward by one UTF-8 character using g_utf8_next_char();
   * the input is not validated here. */
  static void
  str_utf8_cnext_char (const char **text)
  {
      const char *following = g_utf8_next_char (*text);

      *text = following;
  }
  /* Move *text back by one UTF-8 character using g_utf8_prev_char();
   * the input is not validated here. */
  static void
  str_utf8_cprev_char (const char **text)
  {
      const char *preceding = g_utf8_prev_char (*text);

      *text = preceding;
  }
  /* Move *text forward by one character; when no valid character starts at
   * *text, advance by a single byte instead. */
  static void
  str_utf8_cnext_char_safe (const char **text)
  {
      const char *here = *text;

      *text = (str_utf8_is_valid_char (here, -1) == 1) ? g_utf8_next_char (here) : here + 1;
  }
  /* Move *text back by one character; when the bytes before *text do not form
   * a character that ends exactly at *text, step back a single byte instead. */
  static void
  str_utf8_cprev_char_safe (const char **text)
  {
      const char *candidate = g_utf8_prev_char (*text);
      const char *probe = candidate;

      /* walk forward again from the candidate start: only if that lands on the
       * original position is the candidate the start of the previous character */
      str_utf8_cnext_char_safe (&probe);
      *text = (probe == *text) ? candidate : *text - 1;
  }
  88. static void
  89. str_utf8_fix_string (char *text)
  90. {
  91. while (text[0] != '\0')
  92. {
  93. gunichar uni;
  94. uni = g_utf8_get_char_validated (text, -1);
  95. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  96. text = g_utf8_next_char (text);
  97. else
  98. {
  99. text[0] = '?';
  100. text++;
  101. }
  102. }
  103. }
  /* Apply g_unichar_isspace() to the character decoded at text. */
  static int
  str_utf8_isspace (const char *text)
  {
      return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
  }
  /* Apply g_unichar_ispunct() to the character decoded at text. */
  static int
  str_utf8_ispunct (const char *text)
  {
      return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
  }
  /* Apply g_unichar_isalnum() to the character decoded at text. */
  static int
  str_utf8_isalnum (const char *text)
  {
      return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
  }
  /* Apply g_unichar_isdigit() to the character decoded at text. */
  static int
  str_utf8_isdigit (const char *text)
  {
      return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
  }
  /* Apply g_unichar_isprint() to the character decoded at ch. */
  static int
  str_utf8_isprint (const char *ch)
  {
      return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
  }
  139. static gboolean
  140. str_utf8_iscombiningmark (const char *ch)
  141. {
  142. gunichar uni;
  143. uni = g_utf8_get_char_validated (ch, -1);
  144. return str_unichar_iscombiningmark (uni);
  145. }
  /* Advance *text past the current character and any combining marks that
   * follow it, stopping at the end of the string.
   *
   * Returns the number of characters stepped over (0 when *text is already at
   * the terminating NUL). */
  static int
  str_utf8_cnext_noncomb_char (const char **text)
  {
      int steps = 0;

      if (**text == '\0')
          return 0;

      do
      {
          str_utf8_cnext_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && **text != '\0');

      return steps;
  }
  /* Move *text backwards until it rests on a character that is not a
   * combining mark, never going past begin.
   *
   * Returns the number of characters stepped over (0 when *text already
   * equals begin). */
  static int
  str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  {
      int steps = 0;

      if (*text == begin)
          return 0;

      do
      {
          str_utf8_cprev_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && *text != begin);

      return steps;
  }
  172. static int
  173. str_utf8_toupper (const char *text, char **out, size_t * remain)
  174. {
  175. gunichar uni;
  176. size_t left;
  177. uni = g_utf8_get_char_validated (text, -1);
  178. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  179. return 0;
  180. uni = g_unichar_toupper (uni);
  181. left = g_unichar_to_utf8 (uni, NULL);
  182. if (left >= *remain)
  183. return 0;
  184. left = g_unichar_to_utf8 (uni, *out);
  185. (*out) += left;
  186. (*remain) -= left;
  187. return 1;
  188. }
  189. static int
  190. str_utf8_tolower (const char *text, char **out, size_t * remain)
  191. {
  192. gunichar uni;
  193. size_t left;
  194. uni = g_utf8_get_char_validated (text, -1);
  195. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  196. return 0;
  197. uni = g_unichar_tolower (uni);
  198. left = g_unichar_to_utf8 (uni, NULL);
  199. if (left >= *remain)
  200. return 0;
  201. left = g_unichar_to_utf8 (uni, *out);
  202. (*out) += left;
  203. (*remain) -= left;
  204. return 1;
  205. }
  /* Count the characters in text, tolerating invalid UTF-8: each byte at which
   * validation fails counts as one character, and the valid runs between such
   * bytes are measured with g_utf8_strlen(). */
  static int
  str_utf8_length (const char *text)
  {
      const char *run = text;     /* start of the run currently examined */
      const char *stop;           /* where g_utf8_validate() stopped */
      int chars = 0;

      while (!g_utf8_validate (run, -1, &stop) && run[0] != '\0')
      {
          if (stop != run)
              chars += g_utf8_strlen (run, stop - run);
          /* the offending byte itself */
          chars++;
          run = stop + 1;
      }

      /* run never moved: the whole string validated on the first attempt */
      if (run == text)
          return g_utf8_strlen (text, -1);

      if (run[0] != '\0' && run != stop)
          chars += g_utf8_strlen (run, stop - run);
      return chars;
  }
  /* Like str_utf8_length(), but only the first size bytes of text are taken
   * into account.  Bytes at which validation fails count as one character
   * each while the byte budget lasts. */
  static int
  str_utf8_length2 (const char *text, int size)
  {
      const char *run = text;     /* start of the run currently examined */
      const char *stop;           /* where g_utf8_validate() stopped */
      int chars = 0;

      while (!g_utf8_validate (run, -1, &stop) && run[0] != '\0' && size > 0)
      {
          if (stop != run)
          {
              chars += g_utf8_strlen (run, min (stop - run, size));
              size -= stop - run;
          }
          /* the offending byte counts only if it is still inside the budget */
          if (size > 0)
              chars++;
          size--;
          run = stop + 1;
      }

      /* the loop body never ran: measure the string in one call */
      if (run == text)
          return g_utf8_strlen (text, size);

      if (run[0] != '\0' && run != stop && size > 0)
          chars += g_utf8_strlen (run, min (stop - run, size));
      return chars;
  }
  /* Count the characters of text, treating a character together with the
   * combining marks that follow it as a single unit. */
  static int
  str_utf8_length_noncomb (const char *text)
  {
      const char *cursor;
      int units = 0;

      for (cursor = text; *cursor != '\0'; units++)
          str_utf8_cnext_noncomb_char (&cursor);

      return units;
  }
  262. /*
  263. static void
  264. str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
  265. {
  266. char *next = g_utf8_next_char (*string);
  267. (*left) -= next - (*string);
  268. (*string) = next;
  269. g_string_append_c (buffer, '?');
  270. }
  271. */
  272. static gchar *
  273. str_utf8_conv_gerror_message (GError * mcerror, const char *def_msg)
  274. {
  275. if (mcerror != NULL)
  276. return g_strdup (mcerror->message);
  277. return g_strdup (def_msg != NULL ? def_msg : "");
  278. }
  279. static estr_t
  280. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
  281. {
  282. estr_t result = ESTR_SUCCESS;
  283. if (coder == str_cnv_not_convert)
  284. g_string_append_len (buffer, string, size);
  285. else
  286. result = str_nconvert (coder, (char *) string, size, buffer);
  287. return result;
  288. }
  /* Result of str_utf8_make_make_term_form(): a sanitized copy of a string
   * together with what was learned while building it. */
  289. struct term_form
  290. {
  /* NUL-terminated sanitized text */
  291. char text[BUF_MEDIUM * 6];
  /* columns counted for text: combining marks add 0, wide characters 2,
   * everything else 1 */
  292. size_t width;
  /* TRUE when text contains at least one combining mark */
  293. gboolean compose;
  294. };
  295. /* utiliti function, that make string valid in utf8 and all characters printable
  296. * return width of string too*/
  297. static const struct term_form *
  298. str_utf8_make_make_term_form (const char *text, size_t length)
  299. {
  300. static struct term_form result;
  301. gunichar uni;
  302. size_t left;
  303. char *actual;
  304. result.text[0] = '\0';
  305. result.width = 0;
  306. result.compose = FALSE;
  307. actual = result.text;
  308. /* check if text start with combining character,
  309. * add space at begin in this case */
  310. if (length != 0 && text[0] != '\0')
  311. {
  312. uni = g_utf8_get_char_validated (text, -1);
  313. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
  314. && str_unichar_iscombiningmark (uni))
  315. {
  316. actual[0] = ' ';
  317. actual++;
  318. result.width++;
  319. result.compose = TRUE;
  320. }
  321. }
  322. while (length != 0 && text[0] != '\0')
  323. {
  324. uni = g_utf8_get_char_validated (text, -1);
  325. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  326. {
  327. if (g_unichar_isprint (uni))
  328. {
  329. left = g_unichar_to_utf8 (uni, actual);
  330. actual += left;
  331. if (str_unichar_iscombiningmark (uni))
  332. result.compose = TRUE;
  333. else
  334. {
  335. result.width++;
  336. if (g_unichar_iswide (uni))
  337. result.width++;
  338. }
  339. }
  340. else
  341. {
  342. actual[0] = '.';
  343. actual++;
  344. result.width++;
  345. }
  346. text = g_utf8_next_char (text);
  347. }
  348. else
  349. {
  350. text++;
  351. /*actual[0] = '?'; */
  352. memcpy (actual, replch, strlen (replch));
  353. actual += strlen (replch);
  354. result.width++;
  355. }
  356. if (length != (size_t) (-1))
  357. length--;
  358. }
  359. actual[0] = '\0';
  360. return &result;
  361. }
  362. static const char *
  363. str_utf8_term_form (const char *text)
  364. {
  365. static char result[BUF_MEDIUM * 6];
  366. const struct term_form *pre_form;
  367. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  368. if (pre_form->compose)
  369. {
  370. char *composed;
  371. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  372. g_strlcpy (result, composed, sizeof (result));
  373. g_free (composed);
  374. }
  375. else
  376. g_strlcpy (result, pre_form->text, sizeof (result));
  377. return result;
  378. }
  /* Working state shared by the utf8_tool_* helpers while they assemble an
   * output string from a source string. */
  379. struct utf8_tool
  380. {
  /* write position in the output buffer */
  381. char *actual;
  /* bytes still available at actual; the helpers refuse a write unless
   * remain stays greater than the number of bytes written */
  382. size_t remain;
  /* read position in the source text */
  383. const char *cheked;
  /* running column counter, compared against the to_ident arguments */
  384. int ident;
  /* set to TRUE by the copy helpers when a combining mark was copied */
  385. gboolean compose;
  386. };
  387. /* utiliti function, that copy all characters from cheked to actual */
  388. static gboolean
  389. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  390. {
  391. tool->compose = FALSE;
  392. while (tool->cheked[0] != '\0')
  393. {
  394. gunichar uni;
  395. size_t left;
  396. uni = g_utf8_get_char (tool->cheked);
  397. tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
  398. left = g_unichar_to_utf8 (uni, NULL);
  399. if (tool->remain <= left)
  400. return FALSE;
  401. left = g_unichar_to_utf8 (uni, tool->actual);
  402. tool->actual += left;
  403. tool->remain -= left;
  404. tool->cheked = g_utf8_next_char (tool->cheked);
  405. }
  406. return TRUE;
  407. }
  408. /* utiliti function, that copy characters from cheked to actual until ident is
  409. * smaller than to_ident */
  410. static gboolean
  411. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  412. {
  413. tool->compose = FALSE;
  414. while (tool->cheked[0] != '\0')
  415. {
  416. gunichar uni;
  417. size_t left;
  418. int w = 0;
  419. uni = g_utf8_get_char (tool->cheked);
  420. if (str_unichar_iscombiningmark (uni))
  421. tool->compose = TRUE;
  422. else
  423. {
  424. w = 1;
  425. if (g_unichar_iswide (uni))
  426. w++;
  427. if (tool->ident + w > to_ident)
  428. return TRUE;
  429. }
  430. left = g_unichar_to_utf8 (uni, NULL);
  431. if (tool->remain <= left)
  432. return FALSE;
  433. left = g_unichar_to_utf8 (uni, tool->actual);
  434. tool->actual += left;
  435. tool->remain -= left;
  436. tool->cheked = g_utf8_next_char (tool->cheked);
  437. tool->ident += w;
  438. }
  439. return TRUE;
  440. }
  441. /* utiliti function, add count spaces to actual */
  442. static int
  443. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  444. {
  445. if (count <= 0)
  446. return 1;
  447. if (tool->remain <= (gsize) count)
  448. return 0;
  449. memset (tool->actual, ' ', count);
  450. tool->actual += count;
  451. tool->remain -= count;
  452. return 1;
  453. }
  454. /* utiliti function, add one characters to actual */
  455. static int
  456. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  457. {
  458. if (tool->remain <= 1)
  459. return 0;
  460. tool->actual[0] = ch;
  461. tool->actual++;
  462. tool->remain--;
  463. return 1;
  464. }
  465. /* utiliti function, thah skip characters from cheked until ident is greater or
  466. * equal to to_ident */
  467. static gboolean
  468. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  469. {
  470. gunichar uni;
  471. while (to_ident > tool->ident && tool->cheked[0] != '\0')
  472. {
  473. uni = g_utf8_get_char (tool->cheked);
  474. if (!str_unichar_iscombiningmark (uni))
  475. {
  476. tool->ident++;
  477. if (g_unichar_iswide (uni))
  478. tool->ident++;
  479. }
  480. tool->cheked = g_utf8_next_char (tool->cheked);
  481. }
  482. uni = g_utf8_get_char (tool->cheked);
  483. while (str_unichar_iscombiningmark (uni))
  484. {
  485. tool->cheked = g_utf8_next_char (tool->cheked);
  486. uni = g_utf8_get_char (tool->cheked);
  487. }
  488. return TRUE;
  489. }
  490. static void
  491. utf8_tool_compose (char *buffer, size_t size)
  492. {
  493. char *composed;
  494. composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  495. g_strlcpy (buffer, composed, size);
  496. g_free (composed);
  497. }
  /* Lay text out for a terminal field of width columns according to
   * just_mode.
   *
   * The text is first sanitized with str_utf8_make_make_term_form().  Three
   * cases follow:
   *   - it fits: it is copied whole and padded with spaces on the left and/or
   *     right depending on the justification;
   *   - it does not fit and IS_FIT (just_mode) holds: the first width / 2
   *     columns are kept, a '~' is inserted, and the tail of the text follows;
   *   - otherwise: a window of the text is copied, positioned by the
   *     justification, and padded with spaces.
   *
   * Returns a pointer to a static buffer that the next call overwrites.
   * NOTE(review): HIDE_FIT / IS_FIT are defined outside this file; presumably
   * they split just_mode into an alignment and a "fit" flag - confirm. */
  498. static const char *
  499. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  500. {
  501. static char result[BUF_MEDIUM * 6];
  502. const struct term_form *pre_form;
  503. struct utf8_tool tool;
  504. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  505. tool.cheked = pre_form->text;
  506. tool.actual = result;
  507. tool.remain = sizeof (result);
  508. tool.compose = FALSE;
  /* case 1: the whole text fits; only padding is needed */
  509. if (pre_form->width <= (gsize) width)
  510. {
  511. tool.ident = 0;
  512. switch (HIDE_FIT (just_mode))
  513. {
  514. case J_CENTER_LEFT:
  515. case J_CENTER:
  516. tool.ident = (width - pre_form->width) / 2;
  517. break;
  518. case J_RIGHT:
  519. tool.ident = width - pre_form->width;
  520. break;
  521. }
  /* left padding, the text itself, then whatever is left of width */
  522. utf8_tool_insert_space (&tool, tool.ident);
  523. utf8_tool_copy_chars_to_end (&tool);
  524. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  525. }
  /* case 2: too long, shorten in the middle with '~' */
  526. else if (IS_FIT (just_mode))
  527. {
  528. tool.ident = 0;
  529. utf8_tool_copy_chars_to (&tool, width / 2);
  530. utf8_tool_insert_char (&tool, '~');
  /* restart the column counter so the skip below is measured from here */
  531. tool.ident = 0;
  532. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  /* NOTE(review): utf8_tool_copy_chars_to_end() resets tool.compose, so a
   * combining mark copied before the '~' is no longer reflected in the flag
   * checked at the end of this function - confirm this is intended. */
  533. utf8_tool_copy_chars_to_end (&tool);
  534. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  535. }
  /* case 3: too long, show only a window of the text */
  536. else
  537. {
  538. tool.ident = 0;
  539. switch (HIDE_FIT (just_mode))
  540. {
  541. case J_CENTER:
  /* NOTE(review): pre_form->width is a size_t, so this subtraction is done
   * in unsigned arithmetic and wraps (the text is wider than width here);
   * halving the wrapped value before converting to int may not give the
   * intended negative offset on every platform - confirm. */
  542. tool.ident = (width - pre_form->width) / 2;
  543. break;
  544. case J_RIGHT:
  545. tool.ident = width - pre_form->width;
  546. break;
  547. }
  /* a negative ident makes the skip drop leading columns until it reaches 0 */
  548. utf8_tool_skip_chars_to (&tool, 0);
  549. utf8_tool_insert_space (&tool, tool.ident);
  550. utf8_tool_copy_chars_to (&tool, width);
  551. utf8_tool_insert_space (&tool, width - tool.ident);
  552. }
  553. tool.actual[0] = '\0';
  /* recompose only when the last copy helper saw a combining mark */
  554. if (tool.compose)
  555. utf8_tool_compose (result, sizeof (result));
  556. return result;
  557. }
  558. static const char *
  559. str_utf8_term_trim (const char *text, int width)
  560. {
  561. static char result[BUF_MEDIUM * 6];
  562. const struct term_form *pre_form;
  563. struct utf8_tool tool;
  564. if (width < 1)
  565. {
  566. result[0] = '\0';
  567. return result;
  568. }
  569. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  570. tool.cheked = pre_form->text;
  571. tool.actual = result;
  572. tool.remain = sizeof (result);
  573. tool.compose = FALSE;
  574. if ((gsize) width >= pre_form->width)
  575. utf8_tool_copy_chars_to_end (&tool);
  576. else if (width <= 3)
  577. {
  578. memset (tool.actual, '.', width);
  579. tool.actual += width;
  580. tool.remain -= width;
  581. }
  582. else
  583. {
  584. memset (tool.actual, '.', 3);
  585. tool.actual += 3;
  586. tool.remain -= 3;
  587. tool.ident = 0;
  588. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  589. utf8_tool_copy_chars_to_end (&tool);
  590. }
  591. tool.actual[0] = '\0';
  592. if (tool.compose)
  593. utf8_tool_compose (result, sizeof (result));
  594. return result;
  595. }
  596. static int
  597. str_utf8_term_width2 (const char *text, size_t length)
  598. {
  599. const struct term_form *result;
  600. result = str_utf8_make_make_term_form (text, length);
  601. return result->width;
  602. }
  603. static int
  604. str_utf8_term_width1 (const char *text)
  605. {
  606. return str_utf8_term_width2 (text, (size_t) (-1));
  607. }
  608. static int
  609. str_utf8_term_char_width (const char *text)
  610. {
  611. gunichar uni;
  612. uni = g_utf8_get_char_validated (text, -1);
  613. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  614. }
  615. static const char *
  616. str_utf8_term_substring (const char *text, int start, int width)
  617. {
  618. static char result[BUF_MEDIUM * 6];
  619. const struct term_form *pre_form;
  620. struct utf8_tool tool;
  621. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  622. tool.cheked = pre_form->text;
  623. tool.actual = result;
  624. tool.remain = sizeof (result);
  625. tool.compose = FALSE;
  626. tool.ident = -start;
  627. utf8_tool_skip_chars_to (&tool, 0);
  628. if (tool.ident < 0)
  629. tool.ident = 0;
  630. utf8_tool_insert_space (&tool, tool.ident);
  631. utf8_tool_copy_chars_to (&tool, width);
  632. utf8_tool_insert_space (&tool, width - tool.ident);
  633. tool.actual[0] = '\0';
  634. if (tool.compose)
  635. utf8_tool_compose (result, sizeof (result));
  636. return result;
  637. }
  638. static const char *
  639. str_utf8_trunc (const char *text, int width)
  640. {
  641. static char result[MC_MAXPATHLEN * 6 * 2];
  642. const struct term_form *pre_form;
  643. struct utf8_tool tool;
  644. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  645. tool.cheked = pre_form->text;
  646. tool.actual = result;
  647. tool.remain = sizeof (result);
  648. tool.compose = FALSE;
  649. if (pre_form->width <= (gsize) width)
  650. utf8_tool_copy_chars_to_end (&tool);
  651. else
  652. {
  653. tool.ident = 0;
  654. utf8_tool_copy_chars_to (&tool, width / 2);
  655. utf8_tool_insert_char (&tool, '~');
  656. tool.ident = 0;
  657. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  658. utf8_tool_copy_chars_to_end (&tool);
  659. }
  660. tool.actual[0] = '\0';
  661. if (tool.compose)
  662. utf8_tool_compose (result, sizeof (result));
  663. return result;
  664. }
  665. static int
  666. str_utf8_offset_to_pos (const char *text, size_t length)
  667. {
  668. if (str_utf8_is_valid_string (text))
  669. return g_utf8_offset_to_pointer (text, length) - text;
  670. else
  671. {
  672. int result;
  673. GString *buffer;
  674. buffer = g_string_new (text);
  675. str_utf8_fix_string (buffer->str);
  676. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  677. g_string_free (buffer, TRUE);
  678. return result;
  679. }
  680. }
  681. static int
  682. str_utf8_column_to_pos (const char *text, size_t pos)
  683. {
  684. int result = 0;
  685. int width = 0;
  686. while (text[0] != '\0')
  687. {
  688. gunichar uni;
  689. uni = g_utf8_get_char_validated (text, 6);
  690. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  691. {
  692. if (g_unichar_isprint (uni))
  693. {
  694. if (!str_unichar_iscombiningmark (uni))
  695. {
  696. width++;
  697. if (g_unichar_iswide (uni))
  698. width++;
  699. }
  700. }
  701. else
  702. {
  703. width++;
  704. }
  705. text = g_utf8_next_char (text);
  706. }
  707. else
  708. {
  709. text++;
  710. width++;
  711. }
  712. if ((gsize) width > pos)
  713. return result;
  714. result++;
  715. }
  716. return result;
  717. }
  718. static char *
  719. str_utf8_create_search_needle (const char *needle, int case_sen)
  720. {
  721. char *fold, *result;
  722. if (needle == NULL)
  723. return NULL;
  724. if (case_sen)
  725. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  726. fold = g_utf8_casefold (needle, -1);
  727. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  728. g_free (fold);
  729. return result;
  730. }
  /* Free a needle with g_free().  case_sen is accepted but not used. */
  static void
  str_utf8_release_search_needle (char *needle, int case_sen)
  {
      g_free (needle);
      (void) case_sen;
  }
  /* Find the first occurrence of search in text.
   *
   * text is normalized with G_NORMALIZE_ALL (after case folding when case_sen
   * is zero) and search is looked up byte-wise in that normalized copy.
   * Returns a pointer into the original text, or NULL when nothing matches.
   * NOTE(review): search is used as given; presumably it was prepared by
   * str_utf8_create_search_needle() with the same case_sen - verify against
   * callers. */
  737. static const char *
  738. str_utf8_search_first (const char *text, const char *search, int case_sen)
  739. {
  740. char *fold_text;
  741. char *deco_text;
  742. const char *match;
  743. const char *result = NULL;
  744. const char *m;
  /* in the case-sensitive path fold_text merely aliases text and is not freed */
  745. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  746. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  747. match = deco_text;
  748. do
  749. {
  750. match = g_strstr_len (match, -1, search);
  751. if (match != NULL)
  752. {
  /* accept the hit only if it neither starts on a combining mark (unless it
   * is at the very beginning) nor is immediately followed by one */
  753. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  754. !str_utf8_iscombiningmark (match + strlen (search)))
  755. {
  /* translate the position in deco_text into a position in text by walking
   * both strings in steps of one base character plus its marks */
  756. result = text;
  757. m = deco_text;
  758. while (m < match)
  759. {
  760. str_utf8_cnext_noncomb_char (&m);
  761. str_utf8_cnext_noncomb_char (&result);
  762. }
  763. }
  764. else
  /* rejected hit: resume the search one character further on */
  765. str_utf8_cnext_char (&match);
  766. }
  767. }
  768. while (match != NULL && result == NULL);
  769. g_free (deco_text);
  770. if (!case_sen)
  771. g_free (fold_text);
  772. return result;
  773. }
  /* Find the last occurrence of search in text.
   *
   * text is normalized with G_NORMALIZE_ALL (after case folding when case_sen
   * is zero) and search is looked up byte-wise, from the end, in that
   * normalized copy.  Returns a pointer into the original text, or NULL when
   * nothing matches.
   * NOTE(review): search is used as given; presumably it was prepared by
   * str_utf8_create_search_needle() with the same case_sen - verify against
   * callers. */
  774. static const char *
  775. str_utf8_search_last (const char *text, const char *search, int case_sen)
  776. {
  777. char *fold_text;
  778. char *deco_text;
  779. char *match;
  780. const char *result = NULL;
  781. const char *m;
  /* in the case-sensitive path fold_text merely aliases text and is not freed */
  782. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  783. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  784. do
  785. {
  786. match = g_strrstr_len (deco_text, -1, search);
  787. if (match != NULL)
  788. {
  /* accept the hit only if it neither starts on a combining mark (unless it
   * is at the very beginning) nor is immediately followed by one */
  789. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  790. !str_utf8_iscombiningmark (match + strlen (search)))
  791. {
  /* translate the position in deco_text into a position in text by walking
   * both strings in steps of one base character plus its marks */
  792. result = text;
  793. m = deco_text;
  794. while (m < match)
  795. {
  796. str_utf8_cnext_noncomb_char (&m);
  797. str_utf8_cnext_noncomb_char (&result);
  798. }
  799. }
  800. else
  /* rejected hit: cut deco_text off here so the next pass looks further left */
  801. match[0] = '\0';
  802. }
  803. }
  804. while (match != NULL && result == NULL);
  805. g_free (deco_text);
  806. if (!case_sen)
  807. g_free (fold_text);
  808. return result;
  809. }
  810. static char *
  811. str_utf8_normalize (const char *text)
  812. {
  813. GString *fixed;
  814. char *tmp;
  815. char *result;
  816. const char *start;
  817. const char *end;
  818. fixed = g_string_sized_new (4);
  819. start = text;
  820. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  821. {
  822. if (start != end)
  823. {
  824. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  825. g_string_append (fixed, tmp);
  826. g_free (tmp);
  827. }
  828. g_string_append_c (fixed, end[0]);
  829. start = end + 1;
  830. }
  831. if (start == text)
  832. {
  833. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  834. g_string_free (fixed, TRUE);
  835. }
  836. else
  837. {
  838. if (start[0] != '\0' && start != end)
  839. {
  840. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  841. g_string_append (fixed, tmp);
  842. g_free (tmp);
  843. }
  844. result = g_string_free (fixed, FALSE);
  845. }
  846. return result;
  847. }
  848. static char *
  849. str_utf8_casefold_normalize (const char *text)
  850. {
  851. GString *fixed;
  852. char *tmp, *fold;
  853. char *result;
  854. const char *start;
  855. const char *end;
  856. fixed = g_string_sized_new (4);
  857. start = text;
  858. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  859. {
  860. if (start != end)
  861. {
  862. fold = g_utf8_casefold (start, end - start);
  863. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  864. g_string_append (fixed, tmp);
  865. g_free (tmp);
  866. g_free (fold);
  867. }
  868. g_string_append_c (fixed, end[0]);
  869. start = end + 1;
  870. }
  871. if (start == text)
  872. {
  873. fold = g_utf8_casefold (text, -1);
  874. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  875. g_free (fold);
  876. g_string_free (fixed, TRUE);
  877. }
  878. else
  879. {
  880. if (start[0] != '\0' && start != end)
  881. {
  882. fold = g_utf8_casefold (start, end - start);
  883. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  884. g_string_append (fixed, tmp);
  885. g_free (tmp);
  886. g_free (fold);
  887. }
  888. result = g_string_free (fixed, FALSE);
  889. }
  890. return result;
  891. }
  /* Compare t1 and t2 with strcmp() after normalizing both with
   * str_utf8_normalize(). */
  static int
  str_utf8_compare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);
      return order;
  }
  /* Compare the normalized forms of t1 and t2 over the byte length of the
   * shorter of the two, so the result is 0 whenever one normalized string is
   * a prefix of the other. */
  static int
  str_utf8_ncompare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const size_t lhs_len = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      const size_t common = (lhs_len < rhs_len) ? lhs_len : rhs_len;
      const int order = strncmp (lhs, rhs, common);

      g_free (lhs);
      g_free (rhs);
      return order;
  }
  /* Compare t1 and t2 with strcmp() after case folding and normalizing both
   * with str_utf8_casefold_normalize(). */
  static int
  str_utf8_casecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);
      return order;
  }
  /* Compare the case-folded, normalized forms of t1 and t2 over the byte
   * length of the shorter of the two, so the result is 0 whenever one of
   * them is a prefix of the other. */
  static int
  str_utf8_ncasecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const size_t lhs_len = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      const size_t common = (lhs_len < rhs_len) ? lhs_len : rhs_len;
      const int order = strncmp (lhs, rhs, common);

      g_free (lhs);
      g_free (rhs);
      return order;
  }
  /* Measure how much of prefix matches the beginning of text.
   *
   * Both strings are normalized with str_utf8_normalize() and compared one
   * character at a time.  Returns the number of bytes of the normalized
   * prefix that matched before the first difference or the end of either
   * string. */
  static int
  str_utf8_prefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_normalize (text);
      char *norm_prefix = str_utf8_normalize (prefix);
      const char *cur_t = norm_text, *next_t = norm_text;
      const char *cur_p = norm_prefix, *next_p = norm_prefix;
      int matched;

      while (*cur_t != '\0' && *cur_p != '\0')
      {
          str_utf8_cnext_char_safe (&next_t);
          str_utf8_cnext_char_safe (&next_p);

          /* characters differ if their byte lengths or their bytes differ */
          if (next_t - cur_t != next_p - cur_p
              || strncmp (cur_t, cur_p, next_t - cur_t) != 0)
              break;

          cur_t = next_t;
          cur_p = next_p;
      }

      matched = cur_p - norm_prefix;
      g_free (norm_text);
      g_free (norm_prefix);
      return matched;
  }
  975. static int
  976. str_utf8_caseprefix (const char *text, const char *prefix)
  977. {
  978. char *t, *p;
  979. const char *nt, *np;
  980. const char *nnt, *nnp;
  981. int result;
  982. t = str_utf8_casefold_normalize (text);
  983. p = str_utf8_casefold_normalize (prefix);
  984. nt = t;
  985. np = p;
  986. nnt = t;
  987. nnp = p;
  988. while (nt[0] != '\0' && np[0] != '\0')
  989. {
  990. str_utf8_cnext_char_safe (&nnt);
  991. str_utf8_cnext_char_safe (&nnp);
  992. if (nnt - nt != nnp - np)
  993. break;
  994. if (strncmp (nt, np, nnt - nt) != 0)
  995. break;
  996. nt = nnt;
  997. np = nnp;
  998. }
  999. result = np - p;
  1000. g_free (t);
  1001. g_free (p);
  1002. return result;
  1003. }
  1004. static char *
  1005. str_utf8_create_key_gen (const char *text, int case_sen,
  1006. gchar * (*keygen) (const gchar * text, gssize size))
  1007. {
  1008. char *result;
  1009. if (case_sen)
  1010. result = str_utf8_normalize (text);
  1011. else
  1012. {
  1013. gboolean dot;
  1014. GString *fixed;
  1015. const char *start, *end;
  1016. char *fold, *key;
  1017. dot = text[0] == '.';
  1018. fixed = g_string_sized_new (16);
  1019. if (!dot)
  1020. start = text;
  1021. else
  1022. {
  1023. start = text + 1;
  1024. g_string_append_c (fixed, '.');
  1025. }
  1026. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1027. {
  1028. if (start != end)
  1029. {
  1030. fold = g_utf8_casefold (start, end - start);
  1031. key = keygen (fold, -1);
  1032. g_string_append (fixed, key);
  1033. g_free (key);
  1034. g_free (fold);
  1035. }
  1036. g_string_append_c (fixed, end[0]);
  1037. start = end + 1;
  1038. }
  1039. if (start == text)
  1040. {
  1041. fold = g_utf8_casefold (start, -1);
  1042. result = keygen (fold, -1);
  1043. g_free (fold);
  1044. g_string_free (fixed, TRUE);
  1045. }
  1046. else if (dot && (start == text + 1))
  1047. {
  1048. fold = g_utf8_casefold (start, -1);
  1049. key = keygen (fold, -1);
  1050. g_string_append (fixed, key);
  1051. g_free (key);
  1052. g_free (fold);
  1053. result = g_string_free (fixed, FALSE);
  1054. }
  1055. else
  1056. {
  1057. if (start[0] != '\0' && start != end)
  1058. {
  1059. fold = g_utf8_casefold (start, end - start);
  1060. key = keygen (fold, -1);
  1061. g_string_append (fixed, key);
  1062. g_free (key);
  1063. g_free (fold);
  1064. }
  1065. result = g_string_free (fixed, FALSE);
  1066. }
  1067. }
  1068. return result;
  1069. }
  1070. static char *
  1071. str_utf8_create_key (const char *text, int case_sen)
  1072. {
  1073. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1074. }
  1075. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1076. static char *
  1077. str_utf8_create_key_for_filename (const char *text, int case_sen)
  1078. {
  1079. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
  1080. }
  1081. #endif
  1082. static int
  1083. str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
  1084. {
  1085. (void) case_sen;
  1086. return strcmp (t1, t2);
  1087. }
  1088. static void
  1089. str_utf8_release_key (char *key, int case_sen)
  1090. {
  1091. (void) case_sen;
  1092. g_free (key);
  1093. }
  1094. struct str_class
  1095. str_utf8_init (void)
  1096. {
  1097. struct str_class result;
  1098. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1099. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1100. result.insert_replace_char = str_utf8_insert_replace_char;
  1101. result.is_valid_string = str_utf8_is_valid_string;
  1102. result.is_valid_char = str_utf8_is_valid_char;
  1103. result.cnext_char = str_utf8_cnext_char;
  1104. result.cprev_char = str_utf8_cprev_char;
  1105. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1106. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1107. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1108. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1109. result.char_isspace = str_utf8_isspace;
  1110. result.char_ispunct = str_utf8_ispunct;
  1111. result.char_isalnum = str_utf8_isalnum;
  1112. result.char_isdigit = str_utf8_isdigit;
  1113. result.char_isprint = str_utf8_isprint;
  1114. result.char_iscombiningmark = str_utf8_iscombiningmark;
  1115. result.char_toupper = str_utf8_toupper;
  1116. result.char_tolower = str_utf8_tolower;
  1117. result.length = str_utf8_length;
  1118. result.length2 = str_utf8_length2;
  1119. result.length_noncomb = str_utf8_length_noncomb;
  1120. result.fix_string = str_utf8_fix_string;
  1121. result.term_form = str_utf8_term_form;
  1122. result.fit_to_term = str_utf8_fit_to_term;
  1123. result.term_trim = str_utf8_term_trim;
  1124. result.term_width2 = str_utf8_term_width2;
  1125. result.term_width1 = str_utf8_term_width1;
  1126. result.term_char_width = str_utf8_term_char_width;
  1127. result.term_substring = str_utf8_term_substring;
  1128. result.trunc = str_utf8_trunc;
  1129. result.offset_to_pos = str_utf8_offset_to_pos;
  1130. result.column_to_pos = str_utf8_column_to_pos;
  1131. result.create_search_needle = str_utf8_create_search_needle;
  1132. result.release_search_needle = str_utf8_release_search_needle;
  1133. result.search_first = str_utf8_search_first;
  1134. result.search_last = str_utf8_search_last;
  1135. result.compare = str_utf8_compare;
  1136. result.ncompare = str_utf8_ncompare;
  1137. result.casecmp = str_utf8_casecmp;
  1138. result.ncasecmp = str_utf8_ncasecmp;
  1139. result.prefix = str_utf8_prefix;
  1140. result.caseprefix = str_utf8_caseprefix;
  1141. result.create_key = str_utf8_create_key;
  1142. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1143. /* case insensitive sort files in "a1 a2 a10" order */
  1144. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1145. #else
  1146. /* case insensitive sort files in "a1 a10 a2" order */
  1147. result.create_key_for_filename = str_utf8_create_key;
  1148. #endif
  1149. result.key_collate = str_utf8_key_collate;
  1150. result.release_key = str_utf8_release_key;
  1151. return result;
  1152. }