strutilutf8.c 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390
  1. /* UTF-8 strings utilities
  2. Copyright (C) 2007 Free Software Foundation, Inc.
  3. Written 2007 by:
  4. Rostislav Benes
  5. The file_date routine is mostly from GNU's fileutils package,
  6. written by Richard Stallman and David MacKenzie.
  7. This program is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2 of the License, or
  10. (at your option) any later version.
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with this program; if not, write to the Free Software
  17. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. */
  19. #include <config.h>
  20. #include <stdlib.h>
  21. #include <stdio.h>
  22. #include <errno.h>
  23. #include <glib.h>
  24. #include <langinfo.h>
  25. #include <string.h>
  26. #include "lib/global.h"
  27. #include "lib/strutil.h"
  /* The UTF-8 handling in this file is built on glib's Unicode functions. */

  /* UTF-8 encoding (EF BF BD) of U+FFFD REPLACEMENT CHARACTER; it is emitted
   * in place of bytes that do not form a valid UTF-8 sequence. */
  static const char replch[] = "\357\277\275";
  30. static int
  31. str_unichar_iscombiningmark (gunichar uni)
  32. {
  33. int type = g_unichar_type (uni);
  34. return (type == G_UNICODE_COMBINING_MARK)
  35. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  36. }
  37. static void
  38. str_utf8_insert_replace_char (GString * buffer)
  39. {
  40. g_string_append (buffer, replch);
  41. }
  42. static int
  43. str_utf8_is_valid_string (const char *text)
  44. {
  45. return g_utf8_validate (text, -1, NULL);
  46. }
  47. static int
  48. str_utf8_is_valid_char (const char *ch, size_t size)
  49. {
  50. switch (g_utf8_get_char_validated (ch, size))
  51. {
  52. case (gunichar) (-2):
  53. return -2;
  54. case (gunichar) (-1):
  55. return -1;
  56. default:
  57. return 1;
  58. }
  59. }
  /* Advance *TEXT to the next character using g_utf8_next_char ().
   * No validation is performed here. */
  static void
  str_utf8_cnext_char (const char **text)
  {
      const char *cursor = *text;

      *text = g_utf8_next_char (cursor);
  }
  /* Move *TEXT back to the previous character using g_utf8_prev_char ().
   * No validation is performed here. */
  static void
  str_utf8_cprev_char (const char **text)
  {
      const char *cursor = *text;

      *text = g_utf8_prev_char (cursor);
  }
  /* Advance *TEXT by one character when it starts with a valid UTF-8 sequence,
   * or by exactly one byte when it does not. */
  static void
  str_utf8_cnext_char_safe (const char **text)
  {
      const char *cursor = *text;

      if (str_utf8_is_valid_char (cursor, -1) != 1)
          cursor++;
      else
          cursor = g_utf8_next_char (cursor);

      *text = cursor;
  }
  /* Move *TEXT back by one character, or by one byte when the bytes before it
   * do not form a character that ends exactly at *TEXT. */
  static void
  str_utf8_cprev_char_safe (const char **text)
  {
      const char *const candidate = g_utf8_prev_char (*text);
      const char *probe = candidate;

      /* the candidate is accepted only if a safe forward step from it lands
       * back on the original position */
      str_utf8_cnext_char_safe (&probe);
      *text = (probe == *text) ? candidate : (*text) - 1;
  }
  89. static void
  90. str_utf8_fix_string (char *text)
  91. {
  92. gunichar uni;
  93. while (text[0] != '\0')
  94. {
  95. uni = g_utf8_get_char_validated (text, -1);
  96. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  97. {
  98. text = g_utf8_next_char (text);
  99. }
  100. else
  101. {
  102. text[0] = '?';
  103. text++;
  104. }
  105. }
  106. }
  /* Return g_unichar_isspace () for the first character of TEXT. */
  static int
  str_utf8_isspace (const char *text)
  {
      return g_unichar_isspace (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_ispunct () for the first character of TEXT. */
  static int
  str_utf8_ispunct (const char *text)
  {
      return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_isalnum () for the first character of TEXT. */
  static int
  str_utf8_isalnum (const char *text)
  {
      return g_unichar_isalnum (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_isdigit () for the first character of TEXT. */
  static int
  str_utf8_isdigit (const char *text)
  {
      return g_unichar_isdigit (g_utf8_get_char_validated (text, -1));
  }
  /* Return g_unichar_isprint () for the first character of CH. */
  static int
  str_utf8_isprint (const char *ch)
  {
      return g_unichar_isprint (g_utf8_get_char_validated (ch, -1));
  }
  /* Return non-zero when the first character of CH is a combining mark
   * (see str_unichar_iscombiningmark ()). */
  static int
  str_utf8_iscombiningmark (const char *ch)
  {
      return str_unichar_iscombiningmark (g_utf8_get_char_validated (ch, -1));
  }
  /* Advance *TEXT past the current character and any combining marks that
   * follow it, stopping at the end of the string.
   *
   * Returns the number of characters (or invalid bytes) stepped over; 0 when
   * *TEXT is already at the terminating NUL. */
  static int
  str_utf8_cnext_noncomb_char (const char **text)
  {
      int steps = 0;

      if ((*text)[0] == '\0')
          return 0;

      do
      {
          str_utf8_cnext_char_safe (text);
          steps++;
      }
      while ((*text)[0] != '\0' && str_utf8_iscombiningmark (*text));

      return steps;
  }
  /* Move *TEXT backwards until it rests on a character that is not a combining
   * mark, never going before BEGIN.
   *
   * Returns the number of backward steps taken; 0 when *TEXT equals BEGIN. */
  static int
  str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  {
      int steps = 0;

      if (*text == begin)
          return 0;

      do
      {
          str_utf8_cprev_char_safe (text);
          steps++;
      }
      while (str_utf8_iscombiningmark (*text) && *text != begin);

      return steps;
  }
  169. static int
  170. str_utf8_toupper (const char *text, char **out, size_t * remain)
  171. {
  172. gunichar uni;
  173. size_t left;
  174. uni = g_utf8_get_char_validated (text, -1);
  175. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  176. return 0;
  177. uni = g_unichar_toupper (uni);
  178. left = g_unichar_to_utf8 (uni, NULL);
  179. if (left >= *remain)
  180. return 0;
  181. left = g_unichar_to_utf8 (uni, *out);
  182. (*out) += left;
  183. (*remain) -= left;
  184. return 1;
  185. }
  186. static int
  187. str_utf8_tolower (const char *text, char **out, size_t * remain)
  188. {
  189. gunichar uni;
  190. size_t left;
  191. uni = g_utf8_get_char_validated (text, -1);
  192. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  193. return 0;
  194. uni = g_unichar_tolower (uni);
  195. left = g_unichar_to_utf8 (uni, NULL);
  196. if (left >= *remain)
  197. return 0;
  198. left = g_unichar_to_utf8 (uni, *out);
  199. (*out) += left;
  200. (*remain) -= left;
  201. return 1;
  202. }
  /* Count the characters of TEXT; every byte that breaks UTF-8 validity is
   * counted as one character of its own. */
  static int
  str_utf8_length (const char *text)
  {
      const char *segment = text;
      const char *bad;
      int total = 0;

      /* each pass accounts for one valid run plus the invalid byte ending it */
      while (!g_utf8_validate (segment, -1, &bad) && segment[0] != '\0')
      {
          if (bad != segment)
              total += g_utf8_strlen (segment, bad - segment);
          total++;
          segment = bad + 1;
      }

      /* the loop never ran: the whole string is valid */
      if (segment == text)
          return g_utf8_strlen (text, -1);

      /* valid tail after the last invalid byte */
      if (segment[0] != '\0' && segment != bad)
          total += g_utf8_strlen (segment, bad - segment);

      return total;
  }
  /* Count the characters in at most SIZE bytes of TEXT; every byte that breaks
   * UTF-8 validity is counted as one character of its own. */
  static int
  str_utf8_length2 (const char *text, int size)
  {
      const char *segment = text;
      const char *bad;
      int total = 0;

      while (!g_utf8_validate (segment, -1, &bad) && segment[0] != '\0' && size > 0)
      {
          if (bad != segment)
          {
              total += g_utf8_strlen (segment, min (bad - segment, size));
              size -= bad - segment;
          }

          /* the invalid byte counts only if it is still inside the budget */
          if (size > 0)
              total++;
          size--;
          segment = bad + 1;
      }

      /* the loop never ran: let glib count within SIZE bytes directly */
      if (segment == text)
          return g_utf8_strlen (text, size);

      if (segment[0] != '\0' && segment != bad && size > 0)
          total += g_utf8_strlen (segment, min (bad - segment, size));

      return total;
  }
  /* Count the steps str_utf8_cnext_noncomb_char () needs to reach the end of
   * TEXT, i.e. characters with their trailing combining marks folded in. */
  static int
  str_utf8_length_noncomb (const char *text)
  {
      const char *cursor;
      int clusters = 0;

      for (cursor = text; cursor[0] != '\0'; clusters++)
          str_utf8_cnext_noncomb_char (&cursor);

      return clusters;
  }
  275. /*
  276. static void
  277. str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
  278. {
  279. char *next = g_utf8_next_char (*string);
  280. (*left) -= next - (*string);
  281. (*string) = next;
  282. g_string_append_c (buffer, '?');
  283. }
  284. */
  285. static gchar *
  286. str_utf8_conv_gerror_message (GError * error, const char *def_msg)
  287. {
  288. if ((error != NULL) && (error->message != NULL))
  289. return g_strdup (error->message);
  290. return g_strdup (def_msg != NULL ? def_msg : "");
  291. }
  292. static estr_t
  293. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
  294. {
  295. estr_t result;
  296. if (coder == str_cnv_not_convert)
  297. {
  298. g_string_append_len (buffer, string, size);
  299. result = ESTR_SUCCESS;
  300. }
  301. else
  302. result = str_nconvert (coder, (char *) string, size, buffer);
  303. return result;
  304. }
  /* Output of str_utf8_make_make_term_form (): text made valid and printable,
   * together with what was learned while building it. */
  struct term_form
  {
      /* NUL-terminated UTF-8 text */
      char text[BUF_MEDIUM * 6];
      /* column count: 1 per character, 2 for wide ones, 0 for combining marks */
      size_t width;
      /* set to 1 when text contains at least one combining mark */
      int compose;
  };
  311. /* utiliti function, that make string valid in utf8 and all characters printable
  312. * return width of string too*/
  313. static const struct term_form *
  314. str_utf8_make_make_term_form (const char *text, size_t length)
  315. {
  316. static struct term_form result;
  317. gunichar uni;
  318. size_t left;
  319. char *actual;
  320. result.text[0] = '\0';
  321. result.width = 0;
  322. result.compose = 0;
  323. actual = result.text;
  324. /* check if text start with combining character,
  325. * add space at begin in this case */
  326. if (length != 0 && text[0] != '\0')
  327. {
  328. uni = g_utf8_get_char_validated (text, -1);
  329. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  330. {
  331. if (str_unichar_iscombiningmark (uni))
  332. {
  333. actual[0] = ' ';
  334. actual++;
  335. result.width++;
  336. result.compose = 1;
  337. }
  338. }
  339. }
  340. while (length != 0 && text[0] != '\0')
  341. {
  342. uni = g_utf8_get_char_validated (text, -1);
  343. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  344. {
  345. if (g_unichar_isprint (uni))
  346. {
  347. left = g_unichar_to_utf8 (uni, actual);
  348. actual += left;
  349. if (!str_unichar_iscombiningmark (uni))
  350. {
  351. result.width++;
  352. if (g_unichar_iswide (uni))
  353. result.width++;
  354. }
  355. else
  356. result.compose = 1;
  357. }
  358. else
  359. {
  360. actual[0] = '.';
  361. actual++;
  362. result.width++;
  363. }
  364. text = g_utf8_next_char (text);
  365. }
  366. else
  367. {
  368. text++;
  369. /*actual[0] = '?'; */
  370. memcpy (actual, replch, strlen (replch));
  371. actual += strlen (replch);
  372. result.width++;
  373. }
  374. if (length != (size_t) (-1))
  375. length--;
  376. }
  377. actual[0] = '\0';
  378. return &result;
  379. }
  380. static const char *
  381. str_utf8_term_form (const char *text)
  382. {
  383. static char result[BUF_MEDIUM * 6];
  384. const struct term_form *pre_form;
  385. char *composed;
  386. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  387. if (pre_form->compose)
  388. {
  389. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  390. g_strlcpy (result, composed, sizeof (result));
  391. g_free (composed);
  392. }
  393. else
  394. {
  395. g_strlcpy (result, pre_form->text, sizeof (result));
  396. }
  397. return result;
  398. }
  /* Cursor state shared by the utf8_tool_* helpers while they assemble an
   * output string from a source string. */
  struct utf8_tool
  {
      /* write position in the output buffer */
      char *actual;
      /* bytes still available at actual; helpers refuse a write that would not
       * leave at least one byte spare */
      size_t remain;
      /* read position in the source text */
      const char *cheked;
      /* column counter, advanced by the width of each non-combining character
       * that is copied or skipped */
      int ident;
      /* set when a combining mark has been copied */
      int compose;
  };
  407. /* utiliti function, that copy all characters from cheked to actual */
  408. static int
  409. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  410. {
  411. size_t left;
  412. gunichar uni;
  413. tool->compose = 0;
  414. while (tool->cheked[0] != '\0')
  415. {
  416. uni = g_utf8_get_char (tool->cheked);
  417. tool->compose |= str_unichar_iscombiningmark (uni);
  418. left = g_unichar_to_utf8 (uni, NULL);
  419. if (tool->remain <= left)
  420. return 0;
  421. left = g_unichar_to_utf8 (uni, tool->actual);
  422. tool->actual += left;
  423. tool->remain -= left;
  424. tool->cheked = g_utf8_next_char (tool->cheked);
  425. }
  426. return 1;
  427. }
  428. /* utiliti function, that copy characters from cheked to actual until ident is
  429. * smaller than to_ident */
  430. static int
  431. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  432. {
  433. size_t left;
  434. gunichar uni;
  435. int w;
  436. tool->compose = 0;
  437. while (tool->cheked[0] != '\0')
  438. {
  439. uni = g_utf8_get_char (tool->cheked);
  440. if (!str_unichar_iscombiningmark (uni))
  441. {
  442. w = 1;
  443. if (g_unichar_iswide (uni))
  444. w++;
  445. if (tool->ident + w > to_ident)
  446. return 1;
  447. }
  448. else
  449. {
  450. w = 0;
  451. tool->compose = 1;
  452. }
  453. left = g_unichar_to_utf8 (uni, NULL);
  454. if (tool->remain <= left)
  455. return 0;
  456. left = g_unichar_to_utf8 (uni, tool->actual);
  457. tool->actual += left;
  458. tool->remain -= left;
  459. tool->cheked = g_utf8_next_char (tool->cheked);
  460. tool->ident += w;
  461. }
  462. return 1;
  463. }
  464. /* utiliti function, add count spaces to actual */
  465. static int
  466. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  467. {
  468. if (count <= 0)
  469. return 1;
  470. if (tool->remain <= (gsize) count)
  471. return 0;
  472. memset (tool->actual, ' ', count);
  473. tool->actual += count;
  474. tool->remain -= count;
  475. return 1;
  476. }
  477. /* utiliti function, add one characters to actual */
  478. static int
  479. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  480. {
  481. if (tool->remain <= 1)
  482. return 0;
  483. tool->actual[0] = ch;
  484. tool->actual++;
  485. tool->remain--;
  486. return 1;
  487. }
  488. /* utiliti function, thah skip characters from cheked until ident is greater or
  489. * equal to to_ident */
  490. static int
  491. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  492. {
  493. gunichar uni;
  494. while (to_ident > tool->ident && tool->cheked[0] != '\0')
  495. {
  496. uni = g_utf8_get_char (tool->cheked);
  497. if (!str_unichar_iscombiningmark (uni))
  498. {
  499. tool->ident++;
  500. if (g_unichar_iswide (uni))
  501. tool->ident++;
  502. }
  503. tool->cheked = g_utf8_next_char (tool->cheked);
  504. }
  505. uni = g_utf8_get_char (tool->cheked);
  506. while (str_unichar_iscombiningmark (uni))
  507. {
  508. tool->cheked = g_utf8_next_char (tool->cheked);
  509. uni = g_utf8_get_char (tool->cheked);
  510. }
  511. return 1;
  512. }
  513. static void
  514. utf8_tool_compose (char *buffer, size_t size)
  515. {
  516. char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  517. g_strlcpy (buffer, composed, size);
  518. g_free (composed);
  519. }
  520. static const char *
  521. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  522. {
  523. static char result[BUF_MEDIUM * 6];
  524. const struct term_form *pre_form;
  525. struct utf8_tool tool;
  526. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  527. tool.cheked = pre_form->text;
  528. tool.actual = result;
  529. tool.remain = sizeof (result);
  530. tool.compose = 0;
  531. if (pre_form->width <= (gsize) width)
  532. {
  533. tool.ident = 0;
  534. switch (HIDE_FIT (just_mode))
  535. {
  536. case J_CENTER_LEFT:
  537. case J_CENTER:
  538. tool.ident = (width - pre_form->width) / 2;
  539. break;
  540. case J_RIGHT:
  541. tool.ident = width - pre_form->width;
  542. break;
  543. }
  544. utf8_tool_insert_space (&tool, tool.ident);
  545. utf8_tool_copy_chars_to_end (&tool);
  546. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  547. }
  548. else
  549. {
  550. if (IS_FIT (just_mode))
  551. {
  552. tool.ident = 0;
  553. utf8_tool_copy_chars_to (&tool, width / 2);
  554. utf8_tool_insert_char (&tool, '~');
  555. tool.ident = 0;
  556. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  557. utf8_tool_copy_chars_to_end (&tool);
  558. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  559. }
  560. else
  561. {
  562. tool.ident = 0;
  563. switch (HIDE_FIT (just_mode))
  564. {
  565. case J_CENTER:
  566. tool.ident = (width - pre_form->width) / 2;
  567. break;
  568. case J_RIGHT:
  569. tool.ident = width - pre_form->width;
  570. break;
  571. }
  572. utf8_tool_skip_chars_to (&tool, 0);
  573. utf8_tool_insert_space (&tool, tool.ident);
  574. utf8_tool_copy_chars_to (&tool, width);
  575. utf8_tool_insert_space (&tool, width - tool.ident);
  576. }
  577. }
  578. tool.actual[0] = '\0';
  579. if (tool.compose)
  580. utf8_tool_compose (result, sizeof (result));
  581. return result;
  582. }
  583. static const char *
  584. str_utf8_term_trim (const char *text, int width)
  585. {
  586. static char result[BUF_MEDIUM * 6];
  587. const struct term_form *pre_form;
  588. struct utf8_tool tool;
  589. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  590. tool.cheked = pre_form->text;
  591. tool.actual = result;
  592. tool.remain = sizeof (result);
  593. tool.compose = 0;
  594. if ((gsize) width < pre_form->width)
  595. {
  596. if (width <= 3)
  597. {
  598. memset (tool.actual, '.', width);
  599. tool.actual += width;
  600. tool.remain -= width;
  601. }
  602. else
  603. {
  604. memset (tool.actual, '.', 3);
  605. tool.actual += 3;
  606. tool.remain -= 3;
  607. tool.ident = 0;
  608. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  609. utf8_tool_copy_chars_to_end (&tool);
  610. }
  611. }
  612. else
  613. {
  614. utf8_tool_copy_chars_to_end (&tool);
  615. }
  616. tool.actual[0] = '\0';
  617. if (tool.compose)
  618. utf8_tool_compose (result, sizeof (result));
  619. return result;
  620. }
  621. static int
  622. str_utf8_term_width2 (const char *text, size_t length)
  623. {
  624. const struct term_form *result;
  625. result = str_utf8_make_make_term_form (text, length);
  626. return result->width;
  627. }
  /* Return the terminal width of the whole NUL-terminated TEXT. */
  static int
  str_utf8_term_width1 (const char *text)
  {
      const size_t whole_string = (size_t) (-1);

      return str_utf8_term_width2 (text, whole_string);
  }
  633. static int
  634. str_utf8_term_char_width (const char *text)
  635. {
  636. gunichar uni = g_utf8_get_char_validated (text, -1);
  637. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  638. }
  /* Measure a multi-line message: *LINES receives the number of lines
   * (newline count plus one) and *COLUMNS the terminal width of the widest
   * line. */
  static void
  str_utf8_msg_term_size (const char *text, int *lines, int *columns)
  {
      char *copy;
      char *line;
      char *eol;
      int line_width;

      *lines = 1;
      *columns = 0;

      /* work on a copy so that each line can be NUL-terminated temporarily */
      copy = g_strdup (text);
      for (line = copy;; line = eol + 1, (*lines)++)
      {
          eol = strchr (line, '\n');
          if (eol != NULL)
              *eol = '\0';

          line_width = str_utf8_term_width1 (line);
          if (*columns < line_width)
              *columns = line_width;

          if (eol == NULL)
              break;
          *eol = '\n';
      }
      g_free (copy);
  }
  669. static const char *
  670. str_utf8_term_substring (const char *text, int start, int width)
  671. {
  672. static char result[BUF_MEDIUM * 6];
  673. const struct term_form *pre_form;
  674. struct utf8_tool tool;
  675. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  676. tool.cheked = pre_form->text;
  677. tool.actual = result;
  678. tool.remain = sizeof (result);
  679. tool.compose = 0;
  680. tool.ident = -start;
  681. utf8_tool_skip_chars_to (&tool, 0);
  682. if (tool.ident < 0)
  683. tool.ident = 0;
  684. utf8_tool_insert_space (&tool, tool.ident);
  685. utf8_tool_copy_chars_to (&tool, width);
  686. utf8_tool_insert_space (&tool, width - tool.ident);
  687. tool.actual[0] = '\0';
  688. if (tool.compose)
  689. utf8_tool_compose (result, sizeof (result));
  690. return result;
  691. }
  692. static const char *
  693. str_utf8_trunc (const char *text, int width)
  694. {
  695. static char result[MC_MAXPATHLEN * 6 * 2];
  696. const struct term_form *pre_form;
  697. struct utf8_tool tool;
  698. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  699. tool.cheked = pre_form->text;
  700. tool.actual = result;
  701. tool.remain = sizeof (result);
  702. tool.compose = 0;
  703. if (pre_form->width > (gsize) width)
  704. {
  705. tool.ident = 0;
  706. utf8_tool_copy_chars_to (&tool, width / 2);
  707. utf8_tool_insert_char (&tool, '~');
  708. tool.ident = 0;
  709. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  710. utf8_tool_copy_chars_to_end (&tool);
  711. }
  712. else
  713. {
  714. utf8_tool_copy_chars_to_end (&tool);
  715. }
  716. tool.actual[0] = '\0';
  717. if (tool.compose)
  718. utf8_tool_compose (result, sizeof (result));
  719. return result;
  720. }
  721. static int
  722. str_utf8_offset_to_pos (const char *text, size_t length)
  723. {
  724. if (str_utf8_is_valid_string (text))
  725. return g_utf8_offset_to_pointer (text, length) - text;
  726. else
  727. {
  728. int result;
  729. GString *buffer = g_string_new (text);
  730. str_utf8_fix_string (buffer->str);
  731. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  732. g_string_free (buffer, TRUE);
  733. return result;
  734. }
  735. }
  736. static int
  737. str_utf8_column_to_pos (const char *text, size_t pos)
  738. {
  739. static int result;
  740. gunichar uni;
  741. int width;
  742. width = 0;
  743. result = 0;
  744. while (text[0] != '\0')
  745. {
  746. uni = g_utf8_get_char_validated (text, 6);
  747. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  748. {
  749. if (g_unichar_isprint (uni))
  750. {
  751. if (!str_unichar_iscombiningmark (uni))
  752. {
  753. width++;
  754. if (g_unichar_iswide (uni))
  755. width++;
  756. }
  757. }
  758. else
  759. {
  760. width++;
  761. }
  762. text = g_utf8_next_char (text);
  763. }
  764. else
  765. {
  766. text++;
  767. width++;
  768. }
  769. if ((gsize) width > pos)
  770. return result;
  771. result++;
  772. }
  773. return result;
  774. }
  775. static char *
  776. str_utf8_create_search_needle (const char *needle, int case_sen)
  777. {
  778. if (needle != NULL)
  779. {
  780. if (case_sen)
  781. {
  782. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  783. }
  784. else
  785. {
  786. char *fold = g_utf8_casefold (needle, -1);
  787. char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  788. g_free (fold);
  789. return result;
  790. }
  791. }
  792. else
  793. return NULL;
  794. }
  /* Free a needle created by str_utf8_create_search_needle ().
   * CASE_SEN is unused; a NULL NEEDLE is accepted. */
  static void
  str_utf8_release_search_needle (char *needle, int case_sen)
  {
      (void) case_sen;

      /* g_free () ignores NULL, so no guard is needed */
      g_free (needle);
  }
  802. static const char *
  803. str_utf8_search_first (const char *text, const char *search, int case_sen)
  804. {
  805. char *fold_text;
  806. char *deco_text;
  807. const char *match;
  808. const char *result = NULL;
  809. const char *m;
  810. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  811. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  812. match = deco_text;
  813. do
  814. {
  815. match = g_strstr_len (match, -1, search);
  816. if (match != NULL)
  817. {
  818. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  819. !str_utf8_iscombiningmark (match + strlen (search)))
  820. {
  821. result = text;
  822. m = deco_text;
  823. while (m < match)
  824. {
  825. str_utf8_cnext_noncomb_char (&m);
  826. str_utf8_cnext_noncomb_char (&result);
  827. }
  828. }
  829. else
  830. {
  831. str_utf8_cnext_char (&match);
  832. }
  833. }
  834. }
  835. while (match != NULL && result == NULL);
  836. g_free (deco_text);
  837. if (!case_sen)
  838. g_free (fold_text);
  839. return result;
  840. }
  841. static const char *
  842. str_utf8_search_last (const char *text, const char *search, int case_sen)
  843. {
  844. char *fold_text;
  845. char *deco_text;
  846. char *match;
  847. const char *result = NULL;
  848. const char *m;
  849. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  850. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  851. do
  852. {
  853. match = g_strrstr_len (deco_text, -1, search);
  854. if (match != NULL)
  855. {
  856. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  857. !str_utf8_iscombiningmark (match + strlen (search)))
  858. {
  859. result = text;
  860. m = deco_text;
  861. while (m < match)
  862. {
  863. str_utf8_cnext_noncomb_char (&m);
  864. str_utf8_cnext_noncomb_char (&result);
  865. }
  866. }
  867. else
  868. {
  869. match[0] = '\0';
  870. }
  871. }
  872. }
  873. while (match != NULL && result == NULL);
  874. g_free (deco_text);
  875. if (!case_sen)
  876. g_free (fold_text);
  877. return result;
  878. }
  879. static char *
  880. str_utf8_normalize (const char *text)
  881. {
  882. GString *fixed = g_string_new ("");
  883. char *tmp;
  884. char *result;
  885. const char *start;
  886. const char *end;
  887. start = text;
  888. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  889. {
  890. if (start != end)
  891. {
  892. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  893. g_string_append (fixed, tmp);
  894. g_free (tmp);
  895. }
  896. g_string_append_c (fixed, end[0]);
  897. start = end + 1;
  898. }
  899. if (start == text)
  900. {
  901. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  902. }
  903. else
  904. {
  905. if (start[0] != '\0' && start != end)
  906. {
  907. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  908. g_string_append (fixed, tmp);
  909. g_free (tmp);
  910. }
  911. result = g_strdup (fixed->str);
  912. }
  913. g_string_free (fixed, TRUE);
  914. return result;
  915. }
  916. static char *
  917. str_utf8_casefold_normalize (const char *text)
  918. {
  919. GString *fixed = g_string_new ("");
  920. char *tmp, *fold;
  921. char *result;
  922. const char *start;
  923. const char *end;
  924. start = text;
  925. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  926. {
  927. if (start != end)
  928. {
  929. fold = g_utf8_casefold (start, end - start);
  930. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  931. g_string_append (fixed, tmp);
  932. g_free (tmp);
  933. g_free (fold);
  934. }
  935. g_string_append_c (fixed, end[0]);
  936. start = end + 1;
  937. }
  938. if (start == text)
  939. {
  940. fold = g_utf8_casefold (text, -1);
  941. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  942. g_free (fold);
  943. }
  944. else
  945. {
  946. if (start[0] != '\0' && start != end)
  947. {
  948. fold = g_utf8_casefold (start, end - start);
  949. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  950. g_string_append (fixed, tmp);
  951. g_free (tmp);
  952. g_free (fold);
  953. }
  954. result = g_strdup (fixed->str);
  955. }
  956. g_string_free (fixed, TRUE);
  957. return result;
  958. }
  /* Compare T1 and T2 bytewise (strcmp) after normalizing both with
   * str_utf8_normalize (). */
  static int
  str_utf8_compare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);
      return order;
  }
  /* Compare the normalized forms of T1 and T2 over the length of the shorter
   * one, so the result is 0 when either is a prefix of the other. */
  static int
  str_utf8_ncompare (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_normalize (t1);
      char *rhs = str_utf8_normalize (t2);
      const size_t lhs_len = strlen (lhs);
      const size_t rhs_len = strlen (rhs);
      const int order = strncmp (lhs, rhs, (lhs_len < rhs_len) ? lhs_len : rhs_len);

      g_free (lhs);
      g_free (rhs);
      return order;
  }
  /*
   * Compare T1 and T2 after passing both through
   * str_utf8_casefold_normalize().
   *
   * Returns the strcmp() result for the two folded and normalized strings.
   * The temporary copies are released before returning.
   */
  static int
  str_utf8_casecmp (const char *t1, const char *t2)
  {
      char *lhs = str_utf8_casefold_normalize (t1);
      char *rhs = str_utf8_casefold_normalize (t2);
      const int order = strcmp (lhs, rhs);

      g_free (lhs);
      g_free (rhs);

      return order;
  }
  /*
   * Compare the case-folded, normalized forms of T1 and T2 over the length of
   * the shorter one.
   *
   * Both inputs go through str_utf8_casefold_normalize(); only as many bytes
   * as the shorter converted string holds are compared, so a string that is a
   * prefix of the other compares equal.  Returns the strncmp() result.
   */
  static int
  str_utf8_ncasecmp (const char *t1, const char *t2)
  {
      char *n1, *n2;
      size_t l1, l2;
      int result;

      n1 = str_utf8_casefold_normalize (t1);
      n2 = str_utf8_casefold_normalize (t2);

      /* Each length is measured exactly once and kept in a local, so no
         strlen() call is repeated while choosing the shorter one. */
      l1 = strlen (n1);
      l2 = strlen (n2);
      result = strncmp (n1, n2, (l1 < l2) ? l1 : l2);

      g_free (n1);
      g_free (n2);

      return result;
  }
  /*
   * Measure how much of PREFIX matches the beginning of TEXT.
   *
   * Both arguments are first converted with str_utf8_normalize().  The two
   * normalized strings are then walked one character at a time (stepping with
   * str_utf8_cnext_char_safe()) until either string ends or the current
   * characters differ in byte length or content.
   *
   * Returns the byte offset reached inside the normalized PREFIX.
   */
  static int
  str_utf8_prefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_normalize (text);
      char *norm_prefix = str_utf8_normalize (prefix);
      const char *text_cur = norm_text;
      const char *prefix_cur = norm_prefix;
      const char *text_next = norm_text;
      const char *prefix_next = norm_prefix;
      int matched;

      for (;;)
      {
          if (text_cur[0] == '\0' || prefix_cur[0] == '\0')
              break;

          /* look ahead to the end of the current character on both sides */
          str_utf8_cnext_char_safe (&text_next);
          str_utf8_cnext_char_safe (&prefix_next);

          /* characters of different byte length cannot be equal */
          if (text_next - text_cur != prefix_next - prefix_cur)
              break;
          if (strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
              break;

          text_cur = text_next;
          prefix_cur = prefix_next;
      }

      matched = prefix_cur - norm_prefix;

      g_free (norm_text);
      g_free (norm_prefix);

      return matched;
  }
  /*
   * Measure how much of PREFIX matches the beginning of TEXT, ignoring case.
   *
   * Both arguments are first converted with str_utf8_casefold_normalize().
   * The two converted strings are then walked one character at a time
   * (stepping with str_utf8_cnext_char_safe()) until either string ends or
   * the current characters differ in byte length or content.
   *
   * Returns the byte offset reached inside the converted PREFIX.
   */
  static int
  str_utf8_caseprefix (const char *text, const char *prefix)
  {
      char *fold_text = str_utf8_casefold_normalize (text);
      char *fold_prefix = str_utf8_casefold_normalize (prefix);
      const char *text_cur = fold_text;
      const char *prefix_cur = fold_prefix;
      const char *text_next = fold_text;
      const char *prefix_next = fold_prefix;
      int matched;

      for (;;)
      {
          if (text_cur[0] == '\0' || prefix_cur[0] == '\0')
              break;

          /* look ahead to the end of the current character on both sides */
          str_utf8_cnext_char_safe (&text_next);
          str_utf8_cnext_char_safe (&prefix_next);

          /* characters of different byte length cannot be equal */
          if (text_next - text_cur != prefix_next - prefix_cur)
              break;
          if (strncmp (text_cur, prefix_cur, text_next - text_cur) != 0)
              break;

          text_cur = text_next;
          prefix_cur = prefix_next;
      }

      matched = prefix_cur - fold_prefix;

      g_free (fold_text);
      g_free (fold_prefix);

      return matched;
  }
  1059. static char *
  1060. str_utf8_create_key_gen (const char *text, int case_sen,
  1061. gchar * (*keygen) (const gchar * text, gssize size))
  1062. {
  1063. char *result;
  1064. if (case_sen)
  1065. {
  1066. result = str_utf8_normalize (text);
  1067. }
  1068. else
  1069. {
  1070. gboolean dot;
  1071. GString *fixed;
  1072. const char *start, *end;
  1073. char *fold, *key;
  1074. dot = text[0] == '.';
  1075. fixed = g_string_sized_new (16);
  1076. if (!dot)
  1077. start = text;
  1078. else
  1079. {
  1080. start = text + 1;
  1081. g_string_append_c (fixed, '.');
  1082. }
  1083. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1084. {
  1085. if (start != end)
  1086. {
  1087. fold = g_utf8_casefold (start, end - start);
  1088. key = keygen (fold, -1);
  1089. g_string_append (fixed, key);
  1090. g_free (key);
  1091. g_free (fold);
  1092. }
  1093. g_string_append_c (fixed, end[0]);
  1094. start = end + 1;
  1095. }
  1096. if (start == text)
  1097. {
  1098. fold = g_utf8_casefold (start, -1);
  1099. result = keygen (fold, -1);
  1100. g_free (fold);
  1101. g_string_free (fixed, TRUE);
  1102. }
  1103. else if (dot && (start == text + 1))
  1104. {
  1105. fold = g_utf8_casefold (start, -1);
  1106. key = keygen (fold, -1);
  1107. g_string_append (fixed, key);
  1108. g_free (key);
  1109. g_free (fold);
  1110. result = g_string_free (fixed, FALSE);
  1111. }
  1112. else
  1113. {
  1114. if (start[0] != '\0' && start != end)
  1115. {
  1116. fold = g_utf8_casefold (start, end - start);
  1117. key = keygen (fold, -1);
  1118. g_string_append (fixed, key);
  1119. g_free (key);
  1120. g_free (fold);
  1121. }
  1122. result = g_string_free (fixed, FALSE);
  1123. }
  1124. }
  1125. return result;
  1126. }
  1127. static char *
  1128. str_utf8_create_key (const char *text, int case_sen)
  1129. {
  1130. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1131. }
  #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  /*
   * Create a comparison key for the file name TEXT by calling
   * str_utf8_create_key_gen() with g_utf8_collate_key_for_filename() as the
   * key generator.  Only compiled when
   * MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is defined.
   */
  static char *
  str_utf8_create_key_for_filename (const char *text, int case_sen)
  {
      char *key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

      return key;
  }
  #endif
  /*
   * Order two keys.  The keys are compared bytewise with strcmp() and its
   * result is returned; CASE_SEN is accepted but not used.
   */
  static int
  str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
  {
      int order;

      (void) case_sen;

      order = strcmp (t1, t2);
      return order;
  }
  /*
   * Dispose of a key: KEY is handed to g_free().  CASE_SEN is accepted but
   * not used.
   */
  static void
  str_utf8_release_key (char *key, int case_sen)
  {
      g_free (key);

      (void) case_sen;
  }
  1151. struct str_class
  1152. str_utf8_init (void)
  1153. {
  1154. struct str_class result;
  1155. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1156. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1157. result.insert_replace_char = str_utf8_insert_replace_char;
  1158. result.is_valid_string = str_utf8_is_valid_string;
  1159. result.is_valid_char = str_utf8_is_valid_char;
  1160. result.cnext_char = str_utf8_cnext_char;
  1161. result.cprev_char = str_utf8_cprev_char;
  1162. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1163. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1164. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1165. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1166. result.isspace = str_utf8_isspace;
  1167. result.ispunct = str_utf8_ispunct;
  1168. result.isalnum = str_utf8_isalnum;
  1169. result.isdigit = str_utf8_isdigit;
  1170. result.isprint = str_utf8_isprint;
  1171. result.iscombiningmark = str_utf8_iscombiningmark;
  1172. result.toupper = str_utf8_toupper;
  1173. result.tolower = str_utf8_tolower;
  1174. result.length = str_utf8_length;
  1175. result.length2 = str_utf8_length2;
  1176. result.length_noncomb = str_utf8_length_noncomb;
  1177. result.fix_string = str_utf8_fix_string;
  1178. result.term_form = str_utf8_term_form;
  1179. result.fit_to_term = str_utf8_fit_to_term;
  1180. result.term_trim = str_utf8_term_trim;
  1181. result.term_width2 = str_utf8_term_width2;
  1182. result.term_width1 = str_utf8_term_width1;
  1183. result.term_char_width = str_utf8_term_char_width;
  1184. result.msg_term_size = str_utf8_msg_term_size;
  1185. result.term_substring = str_utf8_term_substring;
  1186. result.trunc = str_utf8_trunc;
  1187. result.offset_to_pos = str_utf8_offset_to_pos;
  1188. result.column_to_pos = str_utf8_column_to_pos;
  1189. result.create_search_needle = str_utf8_create_search_needle;
  1190. result.release_search_needle = str_utf8_release_search_needle;
  1191. result.search_first = str_utf8_search_first;
  1192. result.search_last = str_utf8_search_last;
  1193. result.compare = str_utf8_compare;
  1194. result.ncompare = str_utf8_ncompare;
  1195. result.casecmp = str_utf8_casecmp;
  1196. result.ncasecmp = str_utf8_ncasecmp;
  1197. result.prefix = str_utf8_prefix;
  1198. result.caseprefix = str_utf8_caseprefix;
  1199. result.create_key = str_utf8_create_key;
  1200. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1201. /* case insensitive sort files in "a1 a2 a10" order */
  1202. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1203. #else
  1204. /* case insensitive sort files in "a1 a10 a2" order */
  1205. result.create_key_for_filename = str_utf8_create_key;
  1206. #endif
  1207. result.key_collate = str_utf8_key_collate;
  1208. result.release_key = str_utf8_release_key;
  1209. return result;
  1210. }