strutilutf8.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364
  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007, 2011
  4. The Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. The file_date routine is mostly from GNU's fileutils package,
  8. written by Richard Stallman and David MacKenzie.
  9. This file is part of the Midnight Commander.
  10. The Midnight Commander is free software: you can redistribute it
  11. and/or modify it under the terms of the GNU General Public License as
  12. published by the Free Software Foundation, either version 3 of the License,
  13. or (at your option) any later version.
  14. The Midnight Commander is distributed in the hope that it will be useful,
  15. but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. GNU General Public License for more details.
  18. You should have received a copy of the GNU General Public License
  19. along with this program. If not, see <http://www.gnu.org/licenses/>.
  20. */
  21. #include <config.h>
  22. #include <stdlib.h>
  23. #include <stdio.h>
  24. #include <errno.h>
  25. #include <glib.h>
  26. #include <langinfo.h>
  27. #include <string.h>
  28. #include "lib/global.h"
  29. #include "lib/strutil.h"
/* The UTF-8 handling below is built on the Unicode functions provided by GLib. */
  31. static const char replch[] = "\xEF\xBF\xBD";
  32. static int
  33. str_unichar_iscombiningmark (gunichar uni)
  34. {
  35. int type = g_unichar_type (uni);
  36. return (type == G_UNICODE_COMBINING_MARK)
  37. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  38. }
  39. static void
  40. str_utf8_insert_replace_char (GString * buffer)
  41. {
  42. g_string_append (buffer, replch);
  43. }
  44. static int
  45. str_utf8_is_valid_string (const char *text)
  46. {
  47. return g_utf8_validate (text, -1, NULL);
  48. }
  49. static int
  50. str_utf8_is_valid_char (const char *ch, size_t size)
  51. {
  52. switch (g_utf8_get_char_validated (ch, size))
  53. {
  54. case (gunichar) (-2):
  55. return -2;
  56. case (gunichar) (-1):
  57. return -1;
  58. default:
  59. return 1;
  60. }
  61. }
  62. static void
  63. str_utf8_cnext_char (const char **text)
  64. {
  65. (*text) = g_utf8_next_char (*text);
  66. }
  67. static void
  68. str_utf8_cprev_char (const char **text)
  69. {
  70. (*text) = g_utf8_prev_char (*text);
  71. }
  72. static void
  73. str_utf8_cnext_char_safe (const char **text)
  74. {
  75. if (str_utf8_is_valid_char (*text, -1) == 1)
  76. (*text) = g_utf8_next_char (*text);
  77. else
  78. (*text)++;
  79. }
  80. static void
  81. str_utf8_cprev_char_safe (const char **text)
  82. {
  83. const char *result = g_utf8_prev_char (*text);
  84. const char *t = result;
  85. str_utf8_cnext_char_safe (&t);
  86. if (t == *text)
  87. (*text) = result;
  88. else
  89. (*text)--;
  90. }
  91. static void
  92. str_utf8_fix_string (char *text)
  93. {
  94. gunichar uni;
  95. while (text[0] != '\0')
  96. {
  97. uni = g_utf8_get_char_validated (text, -1);
  98. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  99. {
  100. text = g_utf8_next_char (text);
  101. }
  102. else
  103. {
  104. text[0] = '?';
  105. text++;
  106. }
  107. }
  108. }
  109. static int
  110. str_utf8_isspace (const char *text)
  111. {
  112. gunichar uni = g_utf8_get_char_validated (text, -1);
  113. return g_unichar_isspace (uni);
  114. }
  115. static int
  116. str_utf8_ispunct (const char *text)
  117. {
  118. gunichar uni = g_utf8_get_char_validated (text, -1);
  119. return g_unichar_ispunct (uni);
  120. }
  121. static int
  122. str_utf8_isalnum (const char *text)
  123. {
  124. gunichar uni = g_utf8_get_char_validated (text, -1);
  125. return g_unichar_isalnum (uni);
  126. }
  127. static int
  128. str_utf8_isdigit (const char *text)
  129. {
  130. gunichar uni = g_utf8_get_char_validated (text, -1);
  131. return g_unichar_isdigit (uni);
  132. }
  133. static int
  134. str_utf8_isprint (const char *ch)
  135. {
  136. gunichar uni = g_utf8_get_char_validated (ch, -1);
  137. return g_unichar_isprint (uni);
  138. }
  139. static int
  140. str_utf8_iscombiningmark (const char *ch)
  141. {
  142. gunichar uni = g_utf8_get_char_validated (ch, -1);
  143. return str_unichar_iscombiningmark (uni);
  144. }
  145. static int
  146. str_utf8_cnext_noncomb_char (const char **text)
  147. {
  148. int count = 0;
  149. while ((*text)[0] != '\0')
  150. {
  151. str_utf8_cnext_char_safe (text);
  152. count++;
  153. if (!str_utf8_iscombiningmark (*text))
  154. break;
  155. }
  156. return count;
  157. }
  158. static int
  159. str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  160. {
  161. int count = 0;
  162. while ((*text) != begin)
  163. {
  164. str_utf8_cprev_char_safe (text);
  165. count++;
  166. if (!str_utf8_iscombiningmark (*text))
  167. break;
  168. }
  169. return count;
  170. }
  171. static int
  172. str_utf8_toupper (const char *text, char **out, size_t * remain)
  173. {
  174. gunichar uni;
  175. size_t left;
  176. uni = g_utf8_get_char_validated (text, -1);
  177. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  178. return 0;
  179. uni = g_unichar_toupper (uni);
  180. left = g_unichar_to_utf8 (uni, NULL);
  181. if (left >= *remain)
  182. return 0;
  183. left = g_unichar_to_utf8 (uni, *out);
  184. (*out) += left;
  185. (*remain) -= left;
  186. return 1;
  187. }
  188. static int
  189. str_utf8_tolower (const char *text, char **out, size_t * remain)
  190. {
  191. gunichar uni;
  192. size_t left;
  193. uni = g_utf8_get_char_validated (text, -1);
  194. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  195. return 0;
  196. uni = g_unichar_tolower (uni);
  197. left = g_unichar_to_utf8 (uni, NULL);
  198. if (left >= *remain)
  199. return 0;
  200. left = g_unichar_to_utf8 (uni, *out);
  201. (*out) += left;
  202. (*remain) -= left;
  203. return 1;
  204. }
  205. static int
  206. str_utf8_length (const char *text)
  207. {
  208. int result = 0;
  209. const char *start;
  210. const char *end;
  211. start = text;
  212. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  213. {
  214. if (start != end)
  215. {
  216. result += g_utf8_strlen (start, end - start);
  217. }
  218. result++;
  219. start = end + 1;
  220. }
  221. if (start == text)
  222. {
  223. result = g_utf8_strlen (text, -1);
  224. }
  225. else
  226. {
  227. if (start[0] != '\0' && start != end)
  228. {
  229. result += g_utf8_strlen (start, end - start);
  230. }
  231. }
  232. return result;
  233. }
  234. static int
  235. str_utf8_length2 (const char *text, int size)
  236. {
  237. int result = 0;
  238. const char *start;
  239. const char *end;
  240. start = text;
  241. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
  242. {
  243. if (start != end)
  244. {
  245. result += g_utf8_strlen (start, min (end - start, size));
  246. size -= end - start;
  247. }
  248. result += (size > 0);
  249. size--;
  250. start = end + 1;
  251. }
  252. if (start == text)
  253. {
  254. result = g_utf8_strlen (text, size);
  255. }
  256. else
  257. {
  258. if (start[0] != '\0' && start != end && size > 0)
  259. {
  260. result += g_utf8_strlen (start, min (end - start, size));
  261. }
  262. }
  263. return result;
  264. }
  265. static int
  266. str_utf8_length_noncomb (const char *text)
  267. {
  268. int result = 0;
  269. const char *t = text;
  270. while (t[0] != '\0')
  271. {
  272. str_utf8_cnext_noncomb_char (&t);
  273. result++;
  274. }
  275. return result;
  276. }
  277. /*
  278. static void
  279. str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
  280. {
  281. char *next = g_utf8_next_char (*string);
  282. (*left) -= next - (*string);
  283. (*string) = next;
  284. g_string_append_c (buffer, '?');
  285. }
  286. */
  287. static gchar *
  288. str_utf8_conv_gerror_message (GError * error, const char *def_msg)
  289. {
  290. if ((error != NULL) && (error->message != NULL))
  291. return g_strdup (error->message);
  292. return g_strdup (def_msg != NULL ? def_msg : "");
  293. }
  294. static estr_t
  295. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
  296. {
  297. estr_t result;
  298. if (coder == str_cnv_not_convert)
  299. {
  300. g_string_append_len (buffer, string, size);
  301. result = ESTR_SUCCESS;
  302. }
  303. else
  304. result = str_nconvert (coder, (char *) string, size, buffer);
  305. return result;
  306. }
  307. struct term_form
  308. {
  309. char text[BUF_MEDIUM * 6];
  310. size_t width;
  311. int compose;
  312. };
  313. /* utiliti function, that make string valid in utf8 and all characters printable
  314. * return width of string too*/
  315. static const struct term_form *
  316. str_utf8_make_make_term_form (const char *text, size_t length)
  317. {
  318. static struct term_form result;
  319. gunichar uni;
  320. size_t left;
  321. char *actual;
  322. result.text[0] = '\0';
  323. result.width = 0;
  324. result.compose = 0;
  325. actual = result.text;
  326. /* check if text start with combining character,
  327. * add space at begin in this case */
  328. if (length != 0 && text[0] != '\0')
  329. {
  330. uni = g_utf8_get_char_validated (text, -1);
  331. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  332. {
  333. if (str_unichar_iscombiningmark (uni))
  334. {
  335. actual[0] = ' ';
  336. actual++;
  337. result.width++;
  338. result.compose = 1;
  339. }
  340. }
  341. }
  342. while (length != 0 && text[0] != '\0')
  343. {
  344. uni = g_utf8_get_char_validated (text, -1);
  345. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  346. {
  347. if (g_unichar_isprint (uni))
  348. {
  349. left = g_unichar_to_utf8 (uni, actual);
  350. actual += left;
  351. if (!str_unichar_iscombiningmark (uni))
  352. {
  353. result.width++;
  354. if (g_unichar_iswide (uni))
  355. result.width++;
  356. }
  357. else
  358. result.compose = 1;
  359. }
  360. else
  361. {
  362. actual[0] = '.';
  363. actual++;
  364. result.width++;
  365. }
  366. text = g_utf8_next_char (text);
  367. }
  368. else
  369. {
  370. text++;
  371. /*actual[0] = '?'; */
  372. memcpy (actual, replch, strlen (replch));
  373. actual += strlen (replch);
  374. result.width++;
  375. }
  376. if (length != (size_t) (-1))
  377. length--;
  378. }
  379. actual[0] = '\0';
  380. return &result;
  381. }
  382. static const char *
  383. str_utf8_term_form (const char *text)
  384. {
  385. static char result[BUF_MEDIUM * 6];
  386. const struct term_form *pre_form;
  387. char *composed;
  388. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  389. if (pre_form->compose)
  390. {
  391. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  392. g_strlcpy (result, composed, sizeof (result));
  393. g_free (composed);
  394. }
  395. else
  396. {
  397. g_strlcpy (result, pre_form->text, sizeof (result));
  398. }
  399. return result;
  400. }
  401. struct utf8_tool
  402. {
  403. char *actual;
  404. size_t remain;
  405. const char *cheked;
  406. int ident;
  407. int compose;
  408. };
  409. /* utiliti function, that copy all characters from cheked to actual */
  410. static int
  411. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  412. {
  413. size_t left;
  414. gunichar uni;
  415. tool->compose = 0;
  416. while (tool->cheked[0] != '\0')
  417. {
  418. uni = g_utf8_get_char (tool->cheked);
  419. tool->compose |= str_unichar_iscombiningmark (uni);
  420. left = g_unichar_to_utf8 (uni, NULL);
  421. if (tool->remain <= left)
  422. return 0;
  423. left = g_unichar_to_utf8 (uni, tool->actual);
  424. tool->actual += left;
  425. tool->remain -= left;
  426. tool->cheked = g_utf8_next_char (tool->cheked);
  427. }
  428. return 1;
  429. }
  430. /* utiliti function, that copy characters from cheked to actual until ident is
  431. * smaller than to_ident */
  432. static int
  433. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  434. {
  435. size_t left;
  436. gunichar uni;
  437. int w;
  438. tool->compose = 0;
  439. while (tool->cheked[0] != '\0')
  440. {
  441. uni = g_utf8_get_char (tool->cheked);
  442. if (!str_unichar_iscombiningmark (uni))
  443. {
  444. w = 1;
  445. if (g_unichar_iswide (uni))
  446. w++;
  447. if (tool->ident + w > to_ident)
  448. return 1;
  449. }
  450. else
  451. {
  452. w = 0;
  453. tool->compose = 1;
  454. }
  455. left = g_unichar_to_utf8 (uni, NULL);
  456. if (tool->remain <= left)
  457. return 0;
  458. left = g_unichar_to_utf8 (uni, tool->actual);
  459. tool->actual += left;
  460. tool->remain -= left;
  461. tool->cheked = g_utf8_next_char (tool->cheked);
  462. tool->ident += w;
  463. }
  464. return 1;
  465. }
  466. /* utiliti function, add count spaces to actual */
  467. static int
  468. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  469. {
  470. if (count <= 0)
  471. return 1;
  472. if (tool->remain <= (gsize) count)
  473. return 0;
  474. memset (tool->actual, ' ', count);
  475. tool->actual += count;
  476. tool->remain -= count;
  477. return 1;
  478. }
  479. /* utiliti function, add one characters to actual */
  480. static int
  481. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  482. {
  483. if (tool->remain <= 1)
  484. return 0;
  485. tool->actual[0] = ch;
  486. tool->actual++;
  487. tool->remain--;
  488. return 1;
  489. }
  490. /* utiliti function, thah skip characters from cheked until ident is greater or
  491. * equal to to_ident */
  492. static int
  493. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  494. {
  495. gunichar uni;
  496. while (to_ident > tool->ident && tool->cheked[0] != '\0')
  497. {
  498. uni = g_utf8_get_char (tool->cheked);
  499. if (!str_unichar_iscombiningmark (uni))
  500. {
  501. tool->ident++;
  502. if (g_unichar_iswide (uni))
  503. tool->ident++;
  504. }
  505. tool->cheked = g_utf8_next_char (tool->cheked);
  506. }
  507. uni = g_utf8_get_char (tool->cheked);
  508. while (str_unichar_iscombiningmark (uni))
  509. {
  510. tool->cheked = g_utf8_next_char (tool->cheked);
  511. uni = g_utf8_get_char (tool->cheked);
  512. }
  513. return 1;
  514. }
  515. static void
  516. utf8_tool_compose (char *buffer, size_t size)
  517. {
  518. char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  519. g_strlcpy (buffer, composed, size);
  520. g_free (composed);
  521. }
  522. static const char *
  523. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  524. {
  525. static char result[BUF_MEDIUM * 6];
  526. const struct term_form *pre_form;
  527. struct utf8_tool tool;
  528. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  529. tool.cheked = pre_form->text;
  530. tool.actual = result;
  531. tool.remain = sizeof (result);
  532. tool.compose = 0;
  533. if (pre_form->width <= (gsize) width)
  534. {
  535. tool.ident = 0;
  536. switch (HIDE_FIT (just_mode))
  537. {
  538. case J_CENTER_LEFT:
  539. case J_CENTER:
  540. tool.ident = (width - pre_form->width) / 2;
  541. break;
  542. case J_RIGHT:
  543. tool.ident = width - pre_form->width;
  544. break;
  545. }
  546. utf8_tool_insert_space (&tool, tool.ident);
  547. utf8_tool_copy_chars_to_end (&tool);
  548. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  549. }
  550. else
  551. {
  552. if (IS_FIT (just_mode))
  553. {
  554. tool.ident = 0;
  555. utf8_tool_copy_chars_to (&tool, width / 2);
  556. utf8_tool_insert_char (&tool, '~');
  557. tool.ident = 0;
  558. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  559. utf8_tool_copy_chars_to_end (&tool);
  560. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  561. }
  562. else
  563. {
  564. tool.ident = 0;
  565. switch (HIDE_FIT (just_mode))
  566. {
  567. case J_CENTER:
  568. tool.ident = (width - pre_form->width) / 2;
  569. break;
  570. case J_RIGHT:
  571. tool.ident = width - pre_form->width;
  572. break;
  573. }
  574. utf8_tool_skip_chars_to (&tool, 0);
  575. utf8_tool_insert_space (&tool, tool.ident);
  576. utf8_tool_copy_chars_to (&tool, width);
  577. utf8_tool_insert_space (&tool, width - tool.ident);
  578. }
  579. }
  580. tool.actual[0] = '\0';
  581. if (tool.compose)
  582. utf8_tool_compose (result, sizeof (result));
  583. return result;
  584. }
  585. static const char *
  586. str_utf8_term_trim (const char *text, int width)
  587. {
  588. static char result[BUF_MEDIUM * 6];
  589. const struct term_form *pre_form;
  590. struct utf8_tool tool;
  591. if (width < 1)
  592. {
  593. result [0] = '\0';
  594. return result;
  595. }
  596. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  597. tool.cheked = pre_form->text;
  598. tool.actual = result;
  599. tool.remain = sizeof (result);
  600. tool.compose = 0;
  601. if ((gsize) width < pre_form->width)
  602. {
  603. if (width <= 3)
  604. {
  605. memset (tool.actual, '.', width);
  606. tool.actual += width;
  607. tool.remain -= width;
  608. }
  609. else
  610. {
  611. memset (tool.actual, '.', 3);
  612. tool.actual += 3;
  613. tool.remain -= 3;
  614. tool.ident = 0;
  615. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  616. utf8_tool_copy_chars_to_end (&tool);
  617. }
  618. }
  619. else
  620. {
  621. utf8_tool_copy_chars_to_end (&tool);
  622. }
  623. tool.actual[0] = '\0';
  624. if (tool.compose)
  625. utf8_tool_compose (result, sizeof (result));
  626. return result;
  627. }
  628. static int
  629. str_utf8_term_width2 (const char *text, size_t length)
  630. {
  631. const struct term_form *result;
  632. result = str_utf8_make_make_term_form (text, length);
  633. return result->width;
  634. }
  635. static int
  636. str_utf8_term_width1 (const char *text)
  637. {
  638. return str_utf8_term_width2 (text, (size_t) (-1));
  639. }
  640. static int
  641. str_utf8_term_char_width (const char *text)
  642. {
  643. gunichar uni = g_utf8_get_char_validated (text, -1);
  644. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  645. }
  646. static const char *
  647. str_utf8_term_substring (const char *text, int start, int width)
  648. {
  649. static char result[BUF_MEDIUM * 6];
  650. const struct term_form *pre_form;
  651. struct utf8_tool tool;
  652. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  653. tool.cheked = pre_form->text;
  654. tool.actual = result;
  655. tool.remain = sizeof (result);
  656. tool.compose = 0;
  657. tool.ident = -start;
  658. utf8_tool_skip_chars_to (&tool, 0);
  659. if (tool.ident < 0)
  660. tool.ident = 0;
  661. utf8_tool_insert_space (&tool, tool.ident);
  662. utf8_tool_copy_chars_to (&tool, width);
  663. utf8_tool_insert_space (&tool, width - tool.ident);
  664. tool.actual[0] = '\0';
  665. if (tool.compose)
  666. utf8_tool_compose (result, sizeof (result));
  667. return result;
  668. }
  669. static const char *
  670. str_utf8_trunc (const char *text, int width)
  671. {
  672. static char result[MC_MAXPATHLEN * 6 * 2];
  673. const struct term_form *pre_form;
  674. struct utf8_tool tool;
  675. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  676. tool.cheked = pre_form->text;
  677. tool.actual = result;
  678. tool.remain = sizeof (result);
  679. tool.compose = 0;
  680. if (pre_form->width > (gsize) width)
  681. {
  682. tool.ident = 0;
  683. utf8_tool_copy_chars_to (&tool, width / 2);
  684. utf8_tool_insert_char (&tool, '~');
  685. tool.ident = 0;
  686. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  687. utf8_tool_copy_chars_to_end (&tool);
  688. }
  689. else
  690. {
  691. utf8_tool_copy_chars_to_end (&tool);
  692. }
  693. tool.actual[0] = '\0';
  694. if (tool.compose)
  695. utf8_tool_compose (result, sizeof (result));
  696. return result;
  697. }
  698. static int
  699. str_utf8_offset_to_pos (const char *text, size_t length)
  700. {
  701. if (str_utf8_is_valid_string (text))
  702. return g_utf8_offset_to_pointer (text, length) - text;
  703. else
  704. {
  705. int result;
  706. GString *buffer = g_string_new (text);
  707. str_utf8_fix_string (buffer->str);
  708. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  709. g_string_free (buffer, TRUE);
  710. return result;
  711. }
  712. }
  713. static int
  714. str_utf8_column_to_pos (const char *text, size_t pos)
  715. {
  716. static int result;
  717. gunichar uni;
  718. int width;
  719. width = 0;
  720. result = 0;
  721. while (text[0] != '\0')
  722. {
  723. uni = g_utf8_get_char_validated (text, 6);
  724. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  725. {
  726. if (g_unichar_isprint (uni))
  727. {
  728. if (!str_unichar_iscombiningmark (uni))
  729. {
  730. width++;
  731. if (g_unichar_iswide (uni))
  732. width++;
  733. }
  734. }
  735. else
  736. {
  737. width++;
  738. }
  739. text = g_utf8_next_char (text);
  740. }
  741. else
  742. {
  743. text++;
  744. width++;
  745. }
  746. if ((gsize) width > pos)
  747. return result;
  748. result++;
  749. }
  750. return result;
  751. }
  752. static char *
  753. str_utf8_create_search_needle (const char *needle, int case_sen)
  754. {
  755. if (needle != NULL)
  756. {
  757. if (case_sen)
  758. {
  759. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  760. }
  761. else
  762. {
  763. char *fold = g_utf8_casefold (needle, -1);
  764. char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  765. g_free (fold);
  766. return result;
  767. }
  768. }
  769. else
  770. return NULL;
  771. }
  772. static void
  773. str_utf8_release_search_needle (char *needle, int case_sen)
  774. {
  775. (void) case_sen;
  776. if (needle != NULL)
  777. g_free (needle);
  778. }
  779. static const char *
  780. str_utf8_search_first (const char *text, const char *search, int case_sen)
  781. {
  782. char *fold_text;
  783. char *deco_text;
  784. const char *match;
  785. const char *result = NULL;
  786. const char *m;
  787. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  788. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  789. match = deco_text;
  790. do
  791. {
  792. match = g_strstr_len (match, -1, search);
  793. if (match != NULL)
  794. {
  795. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  796. !str_utf8_iscombiningmark (match + strlen (search)))
  797. {
  798. result = text;
  799. m = deco_text;
  800. while (m < match)
  801. {
  802. str_utf8_cnext_noncomb_char (&m);
  803. str_utf8_cnext_noncomb_char (&result);
  804. }
  805. }
  806. else
  807. {
  808. str_utf8_cnext_char (&match);
  809. }
  810. }
  811. }
  812. while (match != NULL && result == NULL);
  813. g_free (deco_text);
  814. if (!case_sen)
  815. g_free (fold_text);
  816. return result;
  817. }
  818. static const char *
  819. str_utf8_search_last (const char *text, const char *search, int case_sen)
  820. {
  821. char *fold_text;
  822. char *deco_text;
  823. char *match;
  824. const char *result = NULL;
  825. const char *m;
  826. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  827. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  828. do
  829. {
  830. match = g_strrstr_len (deco_text, -1, search);
  831. if (match != NULL)
  832. {
  833. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  834. !str_utf8_iscombiningmark (match + strlen (search)))
  835. {
  836. result = text;
  837. m = deco_text;
  838. while (m < match)
  839. {
  840. str_utf8_cnext_noncomb_char (&m);
  841. str_utf8_cnext_noncomb_char (&result);
  842. }
  843. }
  844. else
  845. {
  846. match[0] = '\0';
  847. }
  848. }
  849. }
  850. while (match != NULL && result == NULL);
  851. g_free (deco_text);
  852. if (!case_sen)
  853. g_free (fold_text);
  854. return result;
  855. }
  856. static char *
  857. str_utf8_normalize (const char *text)
  858. {
  859. GString *fixed = g_string_new ("");
  860. char *tmp;
  861. char *result;
  862. const char *start;
  863. const char *end;
  864. start = text;
  865. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  866. {
  867. if (start != end)
  868. {
  869. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  870. g_string_append (fixed, tmp);
  871. g_free (tmp);
  872. }
  873. g_string_append_c (fixed, end[0]);
  874. start = end + 1;
  875. }
  876. if (start == text)
  877. {
  878. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  879. }
  880. else
  881. {
  882. if (start[0] != '\0' && start != end)
  883. {
  884. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  885. g_string_append (fixed, tmp);
  886. g_free (tmp);
  887. }
  888. result = g_strdup (fixed->str);
  889. }
  890. g_string_free (fixed, TRUE);
  891. return result;
  892. }
  893. static char *
  894. str_utf8_casefold_normalize (const char *text)
  895. {
  896. GString *fixed = g_string_new ("");
  897. char *tmp, *fold;
  898. char *result;
  899. const char *start;
  900. const char *end;
  901. start = text;
  902. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  903. {
  904. if (start != end)
  905. {
  906. fold = g_utf8_casefold (start, end - start);
  907. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  908. g_string_append (fixed, tmp);
  909. g_free (tmp);
  910. g_free (fold);
  911. }
  912. g_string_append_c (fixed, end[0]);
  913. start = end + 1;
  914. }
  915. if (start == text)
  916. {
  917. fold = g_utf8_casefold (text, -1);
  918. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  919. g_free (fold);
  920. }
  921. else
  922. {
  923. if (start[0] != '\0' && start != end)
  924. {
  925. fold = g_utf8_casefold (start, end - start);
  926. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  927. g_string_append (fixed, tmp);
  928. g_free (tmp);
  929. g_free (fold);
  930. }
  931. result = g_strdup (fixed->str);
  932. }
  933. g_string_free (fixed, TRUE);
  934. return result;
  935. }
  936. static int
  937. str_utf8_compare (const char *t1, const char *t2)
  938. {
  939. char *n1, *n2;
  940. int result;
  941. n1 = str_utf8_normalize (t1);
  942. n2 = str_utf8_normalize (t2);
  943. result = strcmp (n1, n2);
  944. g_free (n1);
  945. g_free (n2);
  946. return result;
  947. }
  948. static int
  949. str_utf8_ncompare (const char *t1, const char *t2)
  950. {
  951. char *n1, *n2;
  952. int result;
  953. n1 = str_utf8_normalize (t1);
  954. n2 = str_utf8_normalize (t2);
  955. result = strncmp (n1, n2, min (strlen (n1), strlen (n2)));
  956. g_free (n1);
  957. g_free (n2);
  958. return result;
  959. }
  960. static int
  961. str_utf8_casecmp (const char *t1, const char *t2)
  962. {
  963. char *n1, *n2;
  964. int result;
  965. n1 = str_utf8_casefold_normalize (t1);
  966. n2 = str_utf8_casefold_normalize (t2);
  967. result = strcmp (n1, n2);
  968. g_free (n1);
  969. g_free (n2);
  970. return result;
  971. }
  972. static int
  973. str_utf8_ncasecmp (const char *t1, const char *t2)
  974. {
  975. char *n1, *n2;
  976. int result;
  977. n1 = str_utf8_casefold_normalize (t1);
  978. n2 = str_utf8_casefold_normalize (t2);
  979. result = strncmp (n1, n2, min (strlen (n1), strlen (n2)));
  980. g_free (n1);
  981. g_free (n2);
  982. return result;
  983. }
  984. static int
  985. str_utf8_prefix (const char *text, const char *prefix)
  986. {
  987. char *t = str_utf8_normalize (text);
  988. char *p = str_utf8_normalize (prefix);
  989. const char *nt = t;
  990. const char *np = p;
  991. const char *nnt = t;
  992. const char *nnp = p;
  993. int result;
  994. while (nt[0] != '\0' && np[0] != '\0')
  995. {
  996. str_utf8_cnext_char_safe (&nnt);
  997. str_utf8_cnext_char_safe (&nnp);
  998. if (nnt - nt != nnp - np)
  999. break;
  1000. if (strncmp (nt, np, nnt - nt) != 0)
  1001. break;
  1002. nt = nnt;
  1003. np = nnp;
  1004. }
  1005. result = np - p;
  1006. g_free (t);
  1007. g_free (p);
  1008. return result;
  1009. }
  1010. static int
  1011. str_utf8_caseprefix (const char *text, const char *prefix)
  1012. {
  1013. char *t = str_utf8_casefold_normalize (text);
  1014. char *p = str_utf8_casefold_normalize (prefix);
  1015. const char *nt = t;
  1016. const char *np = p;
  1017. const char *nnt = t;
  1018. const char *nnp = p;
  1019. int result;
  1020. while (nt[0] != '\0' && np[0] != '\0')
  1021. {
  1022. str_utf8_cnext_char_safe (&nnt);
  1023. str_utf8_cnext_char_safe (&nnp);
  1024. if (nnt - nt != nnp - np)
  1025. break;
  1026. if (strncmp (nt, np, nnt - nt) != 0)
  1027. break;
  1028. nt = nnt;
  1029. np = nnp;
  1030. }
  1031. result = np - p;
  1032. g_free (t);
  1033. g_free (p);
  1034. return result;
  1035. }
  1036. static char *
  1037. str_utf8_create_key_gen (const char *text, int case_sen,
  1038. gchar * (*keygen) (const gchar * text, gssize size))
  1039. {
  1040. char *result;
  1041. if (case_sen)
  1042. {
  1043. result = str_utf8_normalize (text);
  1044. }
  1045. else
  1046. {
  1047. gboolean dot;
  1048. GString *fixed;
  1049. const char *start, *end;
  1050. char *fold, *key;
  1051. dot = text[0] == '.';
  1052. fixed = g_string_sized_new (16);
  1053. if (!dot)
  1054. start = text;
  1055. else
  1056. {
  1057. start = text + 1;
  1058. g_string_append_c (fixed, '.');
  1059. }
  1060. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1061. {
  1062. if (start != end)
  1063. {
  1064. fold = g_utf8_casefold (start, end - start);
  1065. key = keygen (fold, -1);
  1066. g_string_append (fixed, key);
  1067. g_free (key);
  1068. g_free (fold);
  1069. }
  1070. g_string_append_c (fixed, end[0]);
  1071. start = end + 1;
  1072. }
  1073. if (start == text)
  1074. {
  1075. fold = g_utf8_casefold (start, -1);
  1076. result = keygen (fold, -1);
  1077. g_free (fold);
  1078. g_string_free (fixed, TRUE);
  1079. }
  1080. else if (dot && (start == text + 1))
  1081. {
  1082. fold = g_utf8_casefold (start, -1);
  1083. key = keygen (fold, -1);
  1084. g_string_append (fixed, key);
  1085. g_free (key);
  1086. g_free (fold);
  1087. result = g_string_free (fixed, FALSE);
  1088. }
  1089. else
  1090. {
  1091. if (start[0] != '\0' && start != end)
  1092. {
  1093. fold = g_utf8_casefold (start, end - start);
  1094. key = keygen (fold, -1);
  1095. g_string_append (fixed, key);
  1096. g_free (key);
  1097. g_free (fold);
  1098. }
  1099. result = g_string_free (fixed, FALSE);
  1100. }
  1101. }
  1102. return result;
  1103. }
  1104. static char *
  1105. str_utf8_create_key (const char *text, int case_sen)
  1106. {
  1107. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1108. }
  1109. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1110. static char *
  1111. str_utf8_create_key_for_filename (const char *text, int case_sen)
  1112. {
  1113. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);
  1114. }
  1115. #endif
  1116. static int
  1117. str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
  1118. {
  1119. (void) case_sen;
  1120. return strcmp (t1, t2);
  1121. }
  1122. static void
  1123. str_utf8_release_key (char *key, int case_sen)
  1124. {
  1125. (void) case_sen;
  1126. g_free (key);
  1127. }
  1128. struct str_class
  1129. str_utf8_init (void)
  1130. {
  1131. struct str_class result;
  1132. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1133. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1134. result.insert_replace_char = str_utf8_insert_replace_char;
  1135. result.is_valid_string = str_utf8_is_valid_string;
  1136. result.is_valid_char = str_utf8_is_valid_char;
  1137. result.cnext_char = str_utf8_cnext_char;
  1138. result.cprev_char = str_utf8_cprev_char;
  1139. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1140. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1141. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1142. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1143. result.isspace = str_utf8_isspace;
  1144. result.ispunct = str_utf8_ispunct;
  1145. result.isalnum = str_utf8_isalnum;
  1146. result.isdigit = str_utf8_isdigit;
  1147. result.isprint = str_utf8_isprint;
  1148. result.iscombiningmark = str_utf8_iscombiningmark;
  1149. result.toupper = str_utf8_toupper;
  1150. result.tolower = str_utf8_tolower;
  1151. result.length = str_utf8_length;
  1152. result.length2 = str_utf8_length2;
  1153. result.length_noncomb = str_utf8_length_noncomb;
  1154. result.fix_string = str_utf8_fix_string;
  1155. result.term_form = str_utf8_term_form;
  1156. result.fit_to_term = str_utf8_fit_to_term;
  1157. result.term_trim = str_utf8_term_trim;
  1158. result.term_width2 = str_utf8_term_width2;
  1159. result.term_width1 = str_utf8_term_width1;
  1160. result.term_char_width = str_utf8_term_char_width;
  1161. result.term_substring = str_utf8_term_substring;
  1162. result.trunc = str_utf8_trunc;
  1163. result.offset_to_pos = str_utf8_offset_to_pos;
  1164. result.column_to_pos = str_utf8_column_to_pos;
  1165. result.create_search_needle = str_utf8_create_search_needle;
  1166. result.release_search_needle = str_utf8_release_search_needle;
  1167. result.search_first = str_utf8_search_first;
  1168. result.search_last = str_utf8_search_last;
  1169. result.compare = str_utf8_compare;
  1170. result.ncompare = str_utf8_ncompare;
  1171. result.casecmp = str_utf8_casecmp;
  1172. result.ncasecmp = str_utf8_ncasecmp;
  1173. result.prefix = str_utf8_prefix;
  1174. result.caseprefix = str_utf8_caseprefix;
  1175. result.create_key = str_utf8_create_key;
  1176. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1177. /* case insensitive sort files in "a1 a2 a10" order */
  1178. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1179. #else
  1180. /* case insensitive sort files in "a1 a10 a2" order */
  1181. result.create_key_for_filename = str_utf8_create_key;
  1182. #endif
  1183. result.key_collate = str_utf8_key_collate;
  1184. result.release_key = str_utf8_release_key;
  1185. return result;
  1186. }