strutilutf8.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370
  1. /*
  2. UTF-8 strings utilities
  3. Copyright (C) 2007, 2011
  4. The Free Software Foundation, Inc.
  5. Written by:
  6. Rostislav Benes, 2007
  7. The file_date routine is mostly from GNU's fileutils package,
  8. written by Richard Stallman and David MacKenzie.
  9. This file is part of the Midnight Commander.
  10. The Midnight Commander is free software: you can redistribute it
  11. and/or modify it under the terms of the GNU General Public License as
  12. published by the Free Software Foundation, either version 3 of the License,
  13. or (at your option) any later version.
  14. The Midnight Commander is distributed in the hope that it will be useful,
  15. but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. GNU General Public License for more details.
  18. You should have received a copy of the GNU General Public License
  19. along with this program. If not, see <http://www.gnu.org/licenses/>.
  20. */
  21. #include <config.h>
  22. #include <stdlib.h>
  23. #include <stdio.h>
  24. #include <errno.h>
  25. #include <glib.h>
  26. #include <langinfo.h>
  27. #include <string.h>
  28. #include "lib/global.h"
  29. #include "lib/strutil.h"
/* using function for utf-8 from glib */

/* UTF-8 byte sequence EF BF BD, i.e. U+FFFD REPLACEMENT CHARACTER; it is
 * emitted in place of bytes that do not form a valid UTF-8 character. */
static const char replch[] = "\xEF\xBF\xBD";
  32. static gboolean
  33. str_unichar_iscombiningmark (gunichar uni)
  34. {
  35. GUnicodeType type;
  36. type = g_unichar_type (uni);
  37. return (type == G_UNICODE_COMBINING_MARK)
  38. || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  39. }
  40. static void
  41. str_utf8_insert_replace_char (GString * buffer)
  42. {
  43. g_string_append (buffer, replch);
  44. }
  45. static int
  46. str_utf8_is_valid_string (const char *text)
  47. {
  48. return g_utf8_validate (text, -1, NULL);
  49. }
  50. static int
  51. str_utf8_is_valid_char (const char *ch, size_t size)
  52. {
  53. switch (g_utf8_get_char_validated (ch, size))
  54. {
  55. case (gunichar) (-2):
  56. return -2;
  57. case (gunichar) (-1):
  58. return -1;
  59. default:
  60. return 1;
  61. }
  62. }
  63. static void
  64. str_utf8_cnext_char (const char **text)
  65. {
  66. (*text) = g_utf8_next_char (*text);
  67. }
  68. static void
  69. str_utf8_cprev_char (const char **text)
  70. {
  71. (*text) = g_utf8_prev_char (*text);
  72. }
  73. static void
  74. str_utf8_cnext_char_safe (const char **text)
  75. {
  76. if (str_utf8_is_valid_char (*text, -1) == 1)
  77. (*text) = g_utf8_next_char (*text);
  78. else
  79. (*text)++;
  80. }
  81. static void
  82. str_utf8_cprev_char_safe (const char **text)
  83. {
  84. const char *result = g_utf8_prev_char (*text);
  85. const char *t = result;
  86. str_utf8_cnext_char_safe (&t);
  87. if (t == *text)
  88. (*text) = result;
  89. else
  90. (*text)--;
  91. }
  92. static void
  93. str_utf8_fix_string (char *text)
  94. {
  95. gunichar uni;
  96. while (text[0] != '\0')
  97. {
  98. uni = g_utf8_get_char_validated (text, -1);
  99. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  100. {
  101. text = g_utf8_next_char (text);
  102. }
  103. else
  104. {
  105. text[0] = '?';
  106. text++;
  107. }
  108. }
  109. }
  110. static int
  111. str_utf8_isspace (const char *text)
  112. {
  113. gunichar uni = g_utf8_get_char_validated (text, -1);
  114. return g_unichar_isspace (uni);
  115. }
  116. static int
  117. str_utf8_ispunct (const char *text)
  118. {
  119. gunichar uni = g_utf8_get_char_validated (text, -1);
  120. return g_unichar_ispunct (uni);
  121. }
  122. static int
  123. str_utf8_isalnum (const char *text)
  124. {
  125. gunichar uni = g_utf8_get_char_validated (text, -1);
  126. return g_unichar_isalnum (uni);
  127. }
  128. static int
  129. str_utf8_isdigit (const char *text)
  130. {
  131. gunichar uni = g_utf8_get_char_validated (text, -1);
  132. return g_unichar_isdigit (uni);
  133. }
  134. static int
  135. str_utf8_isprint (const char *ch)
  136. {
  137. gunichar uni = g_utf8_get_char_validated (ch, -1);
  138. return g_unichar_isprint (uni);
  139. }
  140. static gboolean
  141. str_utf8_iscombiningmark (const char *ch)
  142. {
  143. gunichar uni = g_utf8_get_char_validated (ch, -1);
  144. return str_unichar_iscombiningmark (uni);
  145. }
  146. static int
  147. str_utf8_cnext_noncomb_char (const char **text)
  148. {
  149. int count = 0;
  150. while ((*text)[0] != '\0')
  151. {
  152. str_utf8_cnext_char_safe (text);
  153. count++;
  154. if (!str_utf8_iscombiningmark (*text))
  155. break;
  156. }
  157. return count;
  158. }
  159. static int
  160. str_utf8_cprev_noncomb_char (const char **text, const char *begin)
  161. {
  162. int count = 0;
  163. while ((*text) != begin)
  164. {
  165. str_utf8_cprev_char_safe (text);
  166. count++;
  167. if (!str_utf8_iscombiningmark (*text))
  168. break;
  169. }
  170. return count;
  171. }
  172. static int
  173. str_utf8_toupper (const char *text, char **out, size_t * remain)
  174. {
  175. gunichar uni;
  176. size_t left;
  177. uni = g_utf8_get_char_validated (text, -1);
  178. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  179. return 0;
  180. uni = g_unichar_toupper (uni);
  181. left = g_unichar_to_utf8 (uni, NULL);
  182. if (left >= *remain)
  183. return 0;
  184. left = g_unichar_to_utf8 (uni, *out);
  185. (*out) += left;
  186. (*remain) -= left;
  187. return 1;
  188. }
  189. static int
  190. str_utf8_tolower (const char *text, char **out, size_t * remain)
  191. {
  192. gunichar uni;
  193. size_t left;
  194. uni = g_utf8_get_char_validated (text, -1);
  195. if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
  196. return 0;
  197. uni = g_unichar_tolower (uni);
  198. left = g_unichar_to_utf8 (uni, NULL);
  199. if (left >= *remain)
  200. return 0;
  201. left = g_unichar_to_utf8 (uni, *out);
  202. (*out) += left;
  203. (*remain) -= left;
  204. return 1;
  205. }
  206. static int
  207. str_utf8_length (const char *text)
  208. {
  209. int result = 0;
  210. const char *start;
  211. const char *end;
  212. start = text;
  213. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  214. {
  215. if (start != end)
  216. {
  217. result += g_utf8_strlen (start, end - start);
  218. }
  219. result++;
  220. start = end + 1;
  221. }
  222. if (start == text)
  223. {
  224. result = g_utf8_strlen (text, -1);
  225. }
  226. else
  227. {
  228. if (start[0] != '\0' && start != end)
  229. {
  230. result += g_utf8_strlen (start, end - start);
  231. }
  232. }
  233. return result;
  234. }
  235. static int
  236. str_utf8_length2 (const char *text, int size)
  237. {
  238. int result = 0;
  239. const char *start;
  240. const char *end;
  241. start = text;
  242. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
  243. {
  244. if (start != end)
  245. {
  246. result += g_utf8_strlen (start, min (end - start, size));
  247. size -= end - start;
  248. }
  249. result += (size > 0);
  250. size--;
  251. start = end + 1;
  252. }
  253. if (start == text)
  254. {
  255. result = g_utf8_strlen (text, size);
  256. }
  257. else
  258. {
  259. if (start[0] != '\0' && start != end && size > 0)
  260. {
  261. result += g_utf8_strlen (start, min (end - start, size));
  262. }
  263. }
  264. return result;
  265. }
  266. static int
  267. str_utf8_length_noncomb (const char *text)
  268. {
  269. int result = 0;
  270. const char *t = text;
  271. while (t[0] != '\0')
  272. {
  273. str_utf8_cnext_noncomb_char (&t);
  274. result++;
  275. }
  276. return result;
  277. }
  278. /*
  279. static void
  280. str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
  281. {
  282. char *next = g_utf8_next_char (*string);
  283. (*left) -= next - (*string);
  284. (*string) = next;
  285. g_string_append_c (buffer, '?');
  286. }
  287. */
  288. static gchar *
  289. str_utf8_conv_gerror_message (GError * error, const char *def_msg)
  290. {
  291. if ((error != NULL) && (error->message != NULL))
  292. return g_strdup (error->message);
  293. return g_strdup (def_msg != NULL ? def_msg : "");
  294. }
  295. static estr_t
  296. str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString * buffer)
  297. {
  298. estr_t result;
  299. if (coder == str_cnv_not_convert)
  300. {
  301. g_string_append_len (buffer, string, size);
  302. result = ESTR_SUCCESS;
  303. }
  304. else
  305. result = str_nconvert (coder, (char *) string, size, buffer);
  306. return result;
  307. }
/* Result of str_utf8_make_make_term_form(): a printable, valid UTF-8 rendition
 * of a string together with its display metrics. */
struct term_form
{
    /* converted text, NUL-terminated */
    char text[BUF_MEDIUM * 6];
    /* number of terminal columns accumulated while building text */
    size_t width;
    /* TRUE when text contains a combining mark */
    gboolean compose;
};
  314. /* utiliti function, that make string valid in utf8 and all characters printable
  315. * return width of string too*/
  316. static const struct term_form *
  317. str_utf8_make_make_term_form (const char *text, size_t length)
  318. {
  319. static struct term_form result;
  320. gunichar uni;
  321. size_t left;
  322. char *actual;
  323. result.text[0] = '\0';
  324. result.width = 0;
  325. result.compose = FALSE;
  326. actual = result.text;
  327. /* check if text start with combining character,
  328. * add space at begin in this case */
  329. if (length != 0 && text[0] != '\0')
  330. {
  331. uni = g_utf8_get_char_validated (text, -1);
  332. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  333. {
  334. if (str_unichar_iscombiningmark (uni))
  335. {
  336. actual[0] = ' ';
  337. actual++;
  338. result.width++;
  339. result.compose = TRUE;
  340. }
  341. }
  342. }
  343. while (length != 0 && text[0] != '\0')
  344. {
  345. uni = g_utf8_get_char_validated (text, -1);
  346. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  347. {
  348. if (g_unichar_isprint (uni))
  349. {
  350. left = g_unichar_to_utf8 (uni, actual);
  351. actual += left;
  352. if (str_unichar_iscombiningmark (uni))
  353. result.compose = TRUE;
  354. else
  355. {
  356. result.width++;
  357. if (g_unichar_iswide (uni))
  358. result.width++;
  359. }
  360. }
  361. else
  362. {
  363. actual[0] = '.';
  364. actual++;
  365. result.width++;
  366. }
  367. text = g_utf8_next_char (text);
  368. }
  369. else
  370. {
  371. text++;
  372. /*actual[0] = '?'; */
  373. memcpy (actual, replch, strlen (replch));
  374. actual += strlen (replch);
  375. result.width++;
  376. }
  377. if (length != (size_t) (-1))
  378. length--;
  379. }
  380. actual[0] = '\0';
  381. return &result;
  382. }
  383. static const char *
  384. str_utf8_term_form (const char *text)
  385. {
  386. static char result[BUF_MEDIUM * 6];
  387. const struct term_form *pre_form;
  388. char *composed;
  389. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  390. if (pre_form->compose)
  391. {
  392. composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  393. g_strlcpy (result, composed, sizeof (result));
  394. g_free (composed);
  395. }
  396. else
  397. {
  398. g_strlcpy (result, pre_form->text, sizeof (result));
  399. }
  400. return result;
  401. }
/* Working state shared by the utf8_tool_* helpers while an output string is
 * assembled from already converted text. */
struct utf8_tool
{
    /* current write position in the output buffer */
    char *actual;
    /* bytes still available at actual */
    size_t remain;
    /* current read position in the source text */
    const char *cheked;
    /* column counter maintained by the copy/skip helpers */
    int ident;
    /* set to TRUE by the copy helpers when a combining mark is met */
    gboolean compose;
};
  410. /* utiliti function, that copy all characters from cheked to actual */
  411. static gboolean
  412. utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
  413. {
  414. size_t left;
  415. gunichar uni;
  416. tool->compose = FALSE;
  417. while (tool->cheked[0] != '\0')
  418. {
  419. uni = g_utf8_get_char (tool->cheked);
  420. tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
  421. left = g_unichar_to_utf8 (uni, NULL);
  422. if (tool->remain <= left)
  423. return FALSE;
  424. left = g_unichar_to_utf8 (uni, tool->actual);
  425. tool->actual += left;
  426. tool->remain -= left;
  427. tool->cheked = g_utf8_next_char (tool->cheked);
  428. }
  429. return TRUE;
  430. }
  431. /* utiliti function, that copy characters from cheked to actual until ident is
  432. * smaller than to_ident */
  433. static gboolean
  434. utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
  435. {
  436. size_t left;
  437. gunichar uni;
  438. int w;
  439. tool->compose = FALSE;
  440. while (tool->cheked[0] != '\0')
  441. {
  442. uni = g_utf8_get_char (tool->cheked);
  443. if (!str_unichar_iscombiningmark (uni))
  444. {
  445. w = 1;
  446. if (g_unichar_iswide (uni))
  447. w++;
  448. if (tool->ident + w > to_ident)
  449. return TRUE;
  450. }
  451. else
  452. {
  453. w = 0;
  454. tool->compose = TRUE;
  455. }
  456. left = g_unichar_to_utf8 (uni, NULL);
  457. if (tool->remain <= left)
  458. return FALSE;
  459. left = g_unichar_to_utf8 (uni, tool->actual);
  460. tool->actual += left;
  461. tool->remain -= left;
  462. tool->cheked = g_utf8_next_char (tool->cheked);
  463. tool->ident += w;
  464. }
  465. return TRUE;
  466. }
  467. /* utiliti function, add count spaces to actual */
  468. static int
  469. utf8_tool_insert_space (struct utf8_tool *tool, int count)
  470. {
  471. if (count <= 0)
  472. return 1;
  473. if (tool->remain <= (gsize) count)
  474. return 0;
  475. memset (tool->actual, ' ', count);
  476. tool->actual += count;
  477. tool->remain -= count;
  478. return 1;
  479. }
  480. /* utiliti function, add one characters to actual */
  481. static int
  482. utf8_tool_insert_char (struct utf8_tool *tool, char ch)
  483. {
  484. if (tool->remain <= 1)
  485. return 0;
  486. tool->actual[0] = ch;
  487. tool->actual++;
  488. tool->remain--;
  489. return 1;
  490. }
  491. /* utiliti function, thah skip characters from cheked until ident is greater or
  492. * equal to to_ident */
  493. static gboolean
  494. utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
  495. {
  496. gunichar uni;
  497. while (to_ident > tool->ident && tool->cheked[0] != '\0')
  498. {
  499. uni = g_utf8_get_char (tool->cheked);
  500. if (!str_unichar_iscombiningmark (uni))
  501. {
  502. tool->ident++;
  503. if (g_unichar_iswide (uni))
  504. tool->ident++;
  505. }
  506. tool->cheked = g_utf8_next_char (tool->cheked);
  507. }
  508. uni = g_utf8_get_char (tool->cheked);
  509. while (str_unichar_iscombiningmark (uni))
  510. {
  511. tool->cheked = g_utf8_next_char (tool->cheked);
  512. uni = g_utf8_get_char (tool->cheked);
  513. }
  514. return TRUE;
  515. }
  516. static void
  517. utf8_tool_compose (char *buffer, size_t size)
  518. {
  519. char *composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
  520. g_strlcpy (buffer, composed, size);
  521. g_free (composed);
  522. }
  523. static const char *
  524. str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
  525. {
  526. static char result[BUF_MEDIUM * 6];
  527. const struct term_form *pre_form;
  528. struct utf8_tool tool;
  529. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  530. tool.cheked = pre_form->text;
  531. tool.actual = result;
  532. tool.remain = sizeof (result);
  533. tool.compose = FALSE;
  534. if (pre_form->width <= (gsize) width)
  535. {
  536. tool.ident = 0;
  537. switch (HIDE_FIT (just_mode))
  538. {
  539. case J_CENTER_LEFT:
  540. case J_CENTER:
  541. tool.ident = (width - pre_form->width) / 2;
  542. break;
  543. case J_RIGHT:
  544. tool.ident = width - pre_form->width;
  545. break;
  546. }
  547. utf8_tool_insert_space (&tool, tool.ident);
  548. utf8_tool_copy_chars_to_end (&tool);
  549. utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
  550. }
  551. else
  552. {
  553. if (IS_FIT (just_mode))
  554. {
  555. tool.ident = 0;
  556. utf8_tool_copy_chars_to (&tool, width / 2);
  557. utf8_tool_insert_char (&tool, '~');
  558. tool.ident = 0;
  559. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  560. utf8_tool_copy_chars_to_end (&tool);
  561. utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
  562. }
  563. else
  564. {
  565. tool.ident = 0;
  566. switch (HIDE_FIT (just_mode))
  567. {
  568. case J_CENTER:
  569. tool.ident = (width - pre_form->width) / 2;
  570. break;
  571. case J_RIGHT:
  572. tool.ident = width - pre_form->width;
  573. break;
  574. }
  575. utf8_tool_skip_chars_to (&tool, 0);
  576. utf8_tool_insert_space (&tool, tool.ident);
  577. utf8_tool_copy_chars_to (&tool, width);
  578. utf8_tool_insert_space (&tool, width - tool.ident);
  579. }
  580. }
  581. tool.actual[0] = '\0';
  582. if (tool.compose)
  583. utf8_tool_compose (result, sizeof (result));
  584. return result;
  585. }
  586. static const char *
  587. str_utf8_term_trim (const char *text, int width)
  588. {
  589. static char result[BUF_MEDIUM * 6];
  590. const struct term_form *pre_form;
  591. struct utf8_tool tool;
  592. if (width < 1)
  593. {
  594. result[0] = '\0';
  595. return result;
  596. }
  597. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  598. tool.cheked = pre_form->text;
  599. tool.actual = result;
  600. tool.remain = sizeof (result);
  601. tool.compose = FALSE;
  602. if ((gsize) width < pre_form->width)
  603. {
  604. if (width <= 3)
  605. {
  606. memset (tool.actual, '.', width);
  607. tool.actual += width;
  608. tool.remain -= width;
  609. }
  610. else
  611. {
  612. memset (tool.actual, '.', 3);
  613. tool.actual += 3;
  614. tool.remain -= 3;
  615. tool.ident = 0;
  616. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
  617. utf8_tool_copy_chars_to_end (&tool);
  618. }
  619. }
  620. else
  621. {
  622. utf8_tool_copy_chars_to_end (&tool);
  623. }
  624. tool.actual[0] = '\0';
  625. if (tool.compose)
  626. utf8_tool_compose (result, sizeof (result));
  627. return result;
  628. }
  629. static int
  630. str_utf8_term_width2 (const char *text, size_t length)
  631. {
  632. const struct term_form *result;
  633. result = str_utf8_make_make_term_form (text, length);
  634. return result->width;
  635. }
  636. static int
  637. str_utf8_term_width1 (const char *text)
  638. {
  639. return str_utf8_term_width2 (text, (size_t) (-1));
  640. }
  641. static int
  642. str_utf8_term_char_width (const char *text)
  643. {
  644. gunichar uni = g_utf8_get_char_validated (text, -1);
  645. return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
  646. }
  647. static const char *
  648. str_utf8_term_substring (const char *text, int start, int width)
  649. {
  650. static char result[BUF_MEDIUM * 6];
  651. const struct term_form *pre_form;
  652. struct utf8_tool tool;
  653. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  654. tool.cheked = pre_form->text;
  655. tool.actual = result;
  656. tool.remain = sizeof (result);
  657. tool.compose = FALSE;
  658. tool.ident = -start;
  659. utf8_tool_skip_chars_to (&tool, 0);
  660. if (tool.ident < 0)
  661. tool.ident = 0;
  662. utf8_tool_insert_space (&tool, tool.ident);
  663. utf8_tool_copy_chars_to (&tool, width);
  664. utf8_tool_insert_space (&tool, width - tool.ident);
  665. tool.actual[0] = '\0';
  666. if (tool.compose)
  667. utf8_tool_compose (result, sizeof (result));
  668. return result;
  669. }
  670. static const char *
  671. str_utf8_trunc (const char *text, int width)
  672. {
  673. static char result[MC_MAXPATHLEN * 6 * 2];
  674. const struct term_form *pre_form;
  675. struct utf8_tool tool;
  676. pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
  677. tool.cheked = pre_form->text;
  678. tool.actual = result;
  679. tool.remain = sizeof (result);
  680. tool.compose = FALSE;
  681. if (pre_form->width > (gsize) width)
  682. {
  683. tool.ident = 0;
  684. utf8_tool_copy_chars_to (&tool, width / 2);
  685. utf8_tool_insert_char (&tool, '~');
  686. tool.ident = 0;
  687. utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
  688. utf8_tool_copy_chars_to_end (&tool);
  689. }
  690. else
  691. {
  692. utf8_tool_copy_chars_to_end (&tool);
  693. }
  694. tool.actual[0] = '\0';
  695. if (tool.compose)
  696. utf8_tool_compose (result, sizeof (result));
  697. return result;
  698. }
  699. static int
  700. str_utf8_offset_to_pos (const char *text, size_t length)
  701. {
  702. if (str_utf8_is_valid_string (text))
  703. return g_utf8_offset_to_pointer (text, length) - text;
  704. else
  705. {
  706. int result;
  707. GString *buffer = g_string_new (text);
  708. str_utf8_fix_string (buffer->str);
  709. result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
  710. g_string_free (buffer, TRUE);
  711. return result;
  712. }
  713. }
  714. static int
  715. str_utf8_column_to_pos (const char *text, size_t pos)
  716. {
  717. static int result;
  718. gunichar uni;
  719. int width;
  720. width = 0;
  721. result = 0;
  722. while (text[0] != '\0')
  723. {
  724. uni = g_utf8_get_char_validated (text, 6);
  725. if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
  726. {
  727. if (g_unichar_isprint (uni))
  728. {
  729. if (!str_unichar_iscombiningmark (uni))
  730. {
  731. width++;
  732. if (g_unichar_iswide (uni))
  733. width++;
  734. }
  735. }
  736. else
  737. {
  738. width++;
  739. }
  740. text = g_utf8_next_char (text);
  741. }
  742. else
  743. {
  744. text++;
  745. width++;
  746. }
  747. if ((gsize) width > pos)
  748. return result;
  749. result++;
  750. }
  751. return result;
  752. }
  753. static char *
  754. str_utf8_create_search_needle (const char *needle, int case_sen)
  755. {
  756. if (needle != NULL)
  757. {
  758. if (case_sen)
  759. {
  760. return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
  761. }
  762. else
  763. {
  764. char *fold = g_utf8_casefold (needle, -1);
  765. char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  766. g_free (fold);
  767. return result;
  768. }
  769. }
  770. else
  771. return NULL;
  772. }
  773. static void
  774. str_utf8_release_search_needle (char *needle, int case_sen)
  775. {
  776. (void) case_sen;
  777. if (needle != NULL)
  778. g_free (needle);
  779. }
  780. static const char *
  781. str_utf8_search_first (const char *text, const char *search, int case_sen)
  782. {
  783. char *fold_text;
  784. char *deco_text;
  785. const char *match;
  786. const char *result = NULL;
  787. const char *m;
  788. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  789. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  790. match = deco_text;
  791. do
  792. {
  793. match = g_strstr_len (match, -1, search);
  794. if (match != NULL)
  795. {
  796. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  797. !str_utf8_iscombiningmark (match + strlen (search)))
  798. {
  799. result = text;
  800. m = deco_text;
  801. while (m < match)
  802. {
  803. str_utf8_cnext_noncomb_char (&m);
  804. str_utf8_cnext_noncomb_char (&result);
  805. }
  806. }
  807. else
  808. {
  809. str_utf8_cnext_char (&match);
  810. }
  811. }
  812. }
  813. while (match != NULL && result == NULL);
  814. g_free (deco_text);
  815. if (!case_sen)
  816. g_free (fold_text);
  817. return result;
  818. }
  819. static const char *
  820. str_utf8_search_last (const char *text, const char *search, int case_sen)
  821. {
  822. char *fold_text;
  823. char *deco_text;
  824. char *match;
  825. const char *result = NULL;
  826. const char *m;
  827. fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
  828. deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
  829. do
  830. {
  831. match = g_strrstr_len (deco_text, -1, search);
  832. if (match != NULL)
  833. {
  834. if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
  835. !str_utf8_iscombiningmark (match + strlen (search)))
  836. {
  837. result = text;
  838. m = deco_text;
  839. while (m < match)
  840. {
  841. str_utf8_cnext_noncomb_char (&m);
  842. str_utf8_cnext_noncomb_char (&result);
  843. }
  844. }
  845. else
  846. {
  847. match[0] = '\0';
  848. }
  849. }
  850. }
  851. while (match != NULL && result == NULL);
  852. g_free (deco_text);
  853. if (!case_sen)
  854. g_free (fold_text);
  855. return result;
  856. }
  857. static char *
  858. str_utf8_normalize (const char *text)
  859. {
  860. GString *fixed;
  861. char *tmp;
  862. char *result;
  863. const char *start;
  864. const char *end;
  865. fixed = g_string_sized_new (4);
  866. start = text;
  867. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  868. {
  869. if (start != end)
  870. {
  871. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  872. g_string_append (fixed, tmp);
  873. g_free (tmp);
  874. }
  875. g_string_append_c (fixed, end[0]);
  876. start = end + 1;
  877. }
  878. if (start == text)
  879. {
  880. result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
  881. g_string_free (fixed, TRUE);
  882. }
  883. else
  884. {
  885. if (start[0] != '\0' && start != end)
  886. {
  887. tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
  888. g_string_append (fixed, tmp);
  889. g_free (tmp);
  890. }
  891. result = g_string_free (fixed, FALSE);
  892. }
  893. return result;
  894. }
  895. static char *
  896. str_utf8_casefold_normalize (const char *text)
  897. {
  898. GString *fixed;
  899. char *tmp, *fold;
  900. char *result;
  901. const char *start;
  902. const char *end;
  903. fixed = g_string_sized_new (4);
  904. start = text;
  905. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  906. {
  907. if (start != end)
  908. {
  909. fold = g_utf8_casefold (start, end - start);
  910. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  911. g_string_append (fixed, tmp);
  912. g_free (tmp);
  913. g_free (fold);
  914. }
  915. g_string_append_c (fixed, end[0]);
  916. start = end + 1;
  917. }
  918. if (start == text)
  919. {
  920. fold = g_utf8_casefold (text, -1);
  921. result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  922. g_free (fold);
  923. g_string_free (fixed, TRUE);
  924. }
  925. else
  926. {
  927. if (start[0] != '\0' && start != end)
  928. {
  929. fold = g_utf8_casefold (start, end - start);
  930. tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
  931. g_string_append (fixed, tmp);
  932. g_free (tmp);
  933. g_free (fold);
  934. }
  935. result = g_string_free (fixed, FALSE);
  936. }
  937. return result;
  938. }
  939. static int
  940. str_utf8_compare (const char *t1, const char *t2)
  941. {
  942. char *n1, *n2;
  943. int result;
  944. n1 = str_utf8_normalize (t1);
  945. n2 = str_utf8_normalize (t2);
  946. result = strcmp (n1, n2);
  947. g_free (n1);
  948. g_free (n2);
  949. return result;
  950. }
  951. static int
  952. str_utf8_ncompare (const char *t1, const char *t2)
  953. {
  954. char *n1, *n2;
  955. int result;
  956. n1 = str_utf8_normalize (t1);
  957. n2 = str_utf8_normalize (t2);
  958. result = strncmp (n1, n2, min (strlen (n1), strlen (n2)));
  959. g_free (n1);
  960. g_free (n2);
  961. return result;
  962. }
  963. static int
  964. str_utf8_casecmp (const char *t1, const char *t2)
  965. {
  966. char *n1, *n2;
  967. int result;
  968. n1 = str_utf8_casefold_normalize (t1);
  969. n2 = str_utf8_casefold_normalize (t2);
  970. result = strcmp (n1, n2);
  971. g_free (n1);
  972. g_free (n2);
  973. return result;
  974. }
  /**
   * Compare the common leading part of two strings after passing each
   * through str_utf8_casefold_normalize().
   *
   * Only as many bytes as the shorter folded string holds are compared.
   *
   * @param t1 first string
   * @param t2 second string
   *
   * @return the strncmp() result over the shorter folded length
   */
  static int
  str_utf8_ncasecmp (const char *t1, const char *t2)
  {
      char *left = str_utf8_casefold_normalize (t1);
      char *right = str_utf8_casefold_normalize (t2);
      const size_t left_len = strlen (left);
      const size_t right_len = strlen (right);
      /* limit the comparison to the shorter of the two folded strings */
      const size_t common = (left_len > right_len) ? right_len : left_len;
      const int order = strncmp (left, right, common);

      g_free (left);
      g_free (right);

      return order;
  }
  /**
   * Measure how much of PREFIX matches the beginning of TEXT.
   *
   * Both strings are first passed through str_utf8_normalize().  The copies
   * are then walked in lock-step, one str_utf8_cnext_char_safe() step at a
   * time, until either copy ends or the current steps differ in byte length
   * or content.
   *
   * @param text   string to look into
   * @param prefix string expected at the start of TEXT
   *
   * @return number of bytes of the normalized prefix that matched
   */
  static int
  str_utf8_prefix (const char *text, const char *prefix)
  {
      char *norm_text = str_utf8_normalize (text);
      char *norm_prefix = str_utf8_normalize (prefix);
      const char *text_pos = norm_text;
      const char *prefix_pos = norm_prefix;
      const char *text_next = norm_text;
      const char *prefix_next = norm_prefix;
      int matched;

      while (*text_pos != '\0' && *prefix_pos != '\0')
      {
          /* move the look-ahead cursors past the current step of each string */
          str_utf8_cnext_char_safe (&text_next);
          str_utf8_cnext_char_safe (&prefix_next);

          /* stop at the first step that differs in width or in bytes */
          if ((text_next - text_pos) != (prefix_next - prefix_pos)
              || strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
              break;

          text_pos = text_next;
          prefix_pos = prefix_next;
      }

      matched = prefix_pos - norm_prefix;

      g_free (norm_text);
      g_free (norm_prefix);

      return matched;
  }
  /**
   * Measure how much of PREFIX matches the beginning of TEXT, using the
   * copies produced by str_utf8_casefold_normalize().
   *
   * The copies are walked in lock-step, one str_utf8_cnext_char_safe() step
   * at a time, until either copy ends or the current steps differ in byte
   * length or content.
   *
   * @param text   string to look into
   * @param prefix string expected at the start of TEXT
   *
   * @return number of bytes of the folded prefix that matched
   */
  static int
  str_utf8_caseprefix (const char *text, const char *prefix)
  {
      char *fold_text = str_utf8_casefold_normalize (text);
      char *fold_prefix = str_utf8_casefold_normalize (prefix);
      const char *text_pos = fold_text;
      const char *prefix_pos = fold_prefix;
      const char *text_next = fold_text;
      const char *prefix_next = fold_prefix;
      int matched;

      while (*text_pos != '\0' && *prefix_pos != '\0')
      {
          /* move the look-ahead cursors past the current step of each string */
          str_utf8_cnext_char_safe (&text_next);
          str_utf8_cnext_char_safe (&prefix_next);

          /* stop at the first step that differs in width or in bytes */
          if ((text_next - text_pos) != (prefix_next - prefix_pos)
              || strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
              break;

          text_pos = text_next;
          prefix_pos = prefix_next;
      }

      matched = prefix_pos - fold_prefix;

      g_free (fold_text);
      g_free (fold_prefix);

      return matched;
  }
  1039. static char *
  1040. str_utf8_create_key_gen (const char *text, int case_sen,
  1041. gchar * (*keygen) (const gchar * text, gssize size))
  1042. {
  1043. char *result;
  1044. if (case_sen)
  1045. {
  1046. result = str_utf8_normalize (text);
  1047. }
  1048. else
  1049. {
  1050. gboolean dot;
  1051. GString *fixed;
  1052. const char *start, *end;
  1053. char *fold, *key;
  1054. dot = text[0] == '.';
  1055. fixed = g_string_sized_new (16);
  1056. if (!dot)
  1057. start = text;
  1058. else
  1059. {
  1060. start = text + 1;
  1061. g_string_append_c (fixed, '.');
  1062. }
  1063. while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
  1064. {
  1065. if (start != end)
  1066. {
  1067. fold = g_utf8_casefold (start, end - start);
  1068. key = keygen (fold, -1);
  1069. g_string_append (fixed, key);
  1070. g_free (key);
  1071. g_free (fold);
  1072. }
  1073. g_string_append_c (fixed, end[0]);
  1074. start = end + 1;
  1075. }
  1076. if (start == text)
  1077. {
  1078. fold = g_utf8_casefold (start, -1);
  1079. result = keygen (fold, -1);
  1080. g_free (fold);
  1081. g_string_free (fixed, TRUE);
  1082. }
  1083. else if (dot && (start == text + 1))
  1084. {
  1085. fold = g_utf8_casefold (start, -1);
  1086. key = keygen (fold, -1);
  1087. g_string_append (fixed, key);
  1088. g_free (key);
  1089. g_free (fold);
  1090. result = g_string_free (fixed, FALSE);
  1091. }
  1092. else
  1093. {
  1094. if (start[0] != '\0' && start != end)
  1095. {
  1096. fold = g_utf8_casefold (start, end - start);
  1097. key = keygen (fold, -1);
  1098. g_string_append (fixed, key);
  1099. g_free (key);
  1100. g_free (fold);
  1101. }
  1102. result = g_string_free (fixed, FALSE);
  1103. }
  1104. }
  1105. return result;
  1106. }
  1107. static char *
  1108. str_utf8_create_key (const char *text, int case_sen)
  1109. {
  1110. return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
  1111. }
  #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  /**
   * Create a sort key for the file name TEXT using
   * g_utf8_collate_key_for_filename() as the generator.
   *
   * @param text     file name to create the key for
   * @param case_sen non-zero for a case-sensitive key
   *
   * @return the key returned by str_utf8_create_key_gen()
   */
  static char *
  str_utf8_create_key_for_filename (const char *text, int case_sen)
  {
      char *key;

      key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

      return key;
  }
  #endif
  /**
   * Order two keys with a plain bytewise comparison.
   *
   * @param t1       first key
   * @param t2       second key
   * @param case_sen ignored by this implementation
   *
   * @return the strcmp() result for the two keys
   */
  static int
  str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
  {
      const int order = strcmp (t1, t2);

      (void) case_sen;

      return order;
  }
  /**
   * Release a key.
   *
   * @param key      key to free; it is handed to g_free()
   * @param case_sen ignored by this implementation
   */
  static void
  str_utf8_release_key (char *key, int case_sen)
  {
      /* the key is freed the same way whatever the case-sensitivity flag says */
      g_free (key);
      (void) case_sen;
  }
  1131. struct str_class
  1132. str_utf8_init (void)
  1133. {
  1134. struct str_class result;
  1135. result.conv_gerror_message = str_utf8_conv_gerror_message;
  1136. result.vfs_convert_to = str_utf8_vfs_convert_to;
  1137. result.insert_replace_char = str_utf8_insert_replace_char;
  1138. result.is_valid_string = str_utf8_is_valid_string;
  1139. result.is_valid_char = str_utf8_is_valid_char;
  1140. result.cnext_char = str_utf8_cnext_char;
  1141. result.cprev_char = str_utf8_cprev_char;
  1142. result.cnext_char_safe = str_utf8_cnext_char_safe;
  1143. result.cprev_char_safe = str_utf8_cprev_char_safe;
  1144. result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
  1145. result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
  1146. result.char_isspace = str_utf8_isspace;
  1147. result.char_ispunct = str_utf8_ispunct;
  1148. result.char_isalnum = str_utf8_isalnum;
  1149. result.char_isdigit = str_utf8_isdigit;
  1150. result.char_isprint = str_utf8_isprint;
  1151. result.char_iscombiningmark = str_utf8_iscombiningmark;
  1152. result.char_toupper = str_utf8_toupper;
  1153. result.char_tolower = str_utf8_tolower;
  1154. result.length = str_utf8_length;
  1155. result.length2 = str_utf8_length2;
  1156. result.length_noncomb = str_utf8_length_noncomb;
  1157. result.fix_string = str_utf8_fix_string;
  1158. result.term_form = str_utf8_term_form;
  1159. result.fit_to_term = str_utf8_fit_to_term;
  1160. result.term_trim = str_utf8_term_trim;
  1161. result.term_width2 = str_utf8_term_width2;
  1162. result.term_width1 = str_utf8_term_width1;
  1163. result.term_char_width = str_utf8_term_char_width;
  1164. result.term_substring = str_utf8_term_substring;
  1165. result.trunc = str_utf8_trunc;
  1166. result.offset_to_pos = str_utf8_offset_to_pos;
  1167. result.column_to_pos = str_utf8_column_to_pos;
  1168. result.create_search_needle = str_utf8_create_search_needle;
  1169. result.release_search_needle = str_utf8_release_search_needle;
  1170. result.search_first = str_utf8_search_first;
  1171. result.search_last = str_utf8_search_last;
  1172. result.compare = str_utf8_compare;
  1173. result.ncompare = str_utf8_ncompare;
  1174. result.casecmp = str_utf8_casecmp;
  1175. result.ncasecmp = str_utf8_ncasecmp;
  1176. result.prefix = str_utf8_prefix;
  1177. result.caseprefix = str_utf8_caseprefix;
  1178. result.create_key = str_utf8_create_key;
  1179. #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
  1180. /* case insensitive sort files in "a1 a2 a10" order */
  1181. result.create_key_for_filename = str_utf8_create_key_for_filename;
  1182. #else
  1183. /* case insensitive sort files in "a1 a10 a2" order */
  1184. result.create_key_for_filename = str_utf8_create_key;
  1185. #endif
  1186. result.key_collate = str_utf8_key_collate;
  1187. result.release_key = str_utf8_release_key;
  1188. return result;
  1189. }