12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540 |
- /*
- UTF-8 strings utilities
- Copyright (C) 2007-2025
- Free Software Foundation, Inc.
- Written by:
- Rostislav Benes, 2007
- This file is part of the Midnight Commander.
- The Midnight Commander is free software: you can redistribute it
- and/or modify it under the terms of the GNU General Public License as
- published by the Free Software Foundation, either version 3 of the License,
- or (at your option) any later version.
- The Midnight Commander is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
- #include <config.h>
- #include <stdlib.h>
- #include <langinfo.h>
- #include <limits.h> /* MB_LEN_MAX */
- #include <string.h>
- #include "lib/global.h"
- #include "lib/strutil.h"
- /* using function for utf-8 from glib */
- /*** global variables ****************************************************************************/
- /*** file scope macro definitions ****************************************************************/
- /*** file scope type declarations ****************************************************************/
- struct utf8_tool
- {
- char *actual;
- size_t remain;
- const char *checked;
- int ident;
- gboolean compose;
- };
- struct term_form
- {
- char text[BUF_MEDIUM * MB_LEN_MAX];
- size_t width;
- gboolean compose;
- };
- /*** forward declarations (file scope functions) *************************************************/
- /*** file scope variables ************************************************************************/
/* UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER), emitted in place of invalid input bytes */
static const char replch[] = "\xEF\xBF\xBD";
- /* --------------------------------------------------------------------------------------------- */
- /*** file scope functions ************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_unichar_iscombiningmark (gunichar uni)
- {
- GUnicodeType type;
- type = g_unichar_type (uni);
- return (type == G_UNICODE_SPACING_MARK)
- || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
- }
- /* --------------------------------------------------------------------------------------------- */
- static void
- str_utf8_insert_replace_char (GString *buffer)
- {
- g_string_append (buffer, replch);
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_is_valid_string (const char *text)
- {
- return g_utf8_validate (text, -1, NULL);
- }
- /* --------------------------------------------------------------------------------------------- */
- static int
- str_utf8_is_valid_char (const char *ch, size_t size)
- {
- switch (g_utf8_get_char_validated (ch, size))
- {
- case (gunichar) (-2):
- return (-2);
- case (gunichar) (-1):
- return (-1);
- default:
- return 1;
- }
- }
- /* --------------------------------------------------------------------------------------------- */
/* Move *text forward by one UTF-8 character; no validation is performed. */
static void
str_utf8_cnext_char (const char **text)
{
    const char *cur = *text;

    *text = g_utf8_next_char (cur);
}
- /* --------------------------------------------------------------------------------------------- */
/* Move *text back by one UTF-8 character; no validation is performed. */
static void
str_utf8_cprev_char (const char **text)
{
    const char *cur = *text;

    *text = g_utf8_prev_char (cur);
}
- /* --------------------------------------------------------------------------------------------- */
/* Move *text forward by one character when a valid one starts there,
 * otherwise by a single byte. */
static void
str_utf8_cnext_char_safe (const char **text)
{
    const char *cur = *text;

    if (str_utf8_is_valid_char (cur, -1) != 1)
    {
        /* not a decodable sequence: step over one raw byte only */
        *text = cur + 1;
        return;
    }

    *text = g_utf8_next_char (cur);
}
- /* --------------------------------------------------------------------------------------------- */
/* Move *text back by one character when the preceding bytes form a character that ends
 * exactly at *text, otherwise by a single byte. */
static void
str_utf8_cprev_char_safe (const char **text)
{
    const char *const here = *text;
    const char *candidate;
    const char *probe;

    candidate = g_utf8_prev_char (here);

    /* walk forward again from the candidate; it is accepted only if we land back on 'here' */
    probe = candidate;
    str_utf8_cnext_char_safe (&probe);

    *text = (probe == here) ? candidate : here - 1;
}
- /* --------------------------------------------------------------------------------------------- */
- static void
- str_utf8_fix_string (char *text)
- {
- while (text[0] != '\0')
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (text, -1);
- if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
- text = g_utf8_next_char (text);
- else
- {
- text[0] = '?';
- text++;
- }
- }
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_isspace (const char *text)
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (text, -1);
- return g_unichar_isspace (uni);
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_ispunct (const char *text)
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (text, -1);
- return g_unichar_ispunct (uni);
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_isalnum (const char *text)
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (text, -1);
- return g_unichar_isalnum (uni);
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_isdigit (const char *text)
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (text, -1);
- return g_unichar_isdigit (uni);
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_isprint (const char *ch)
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (ch, -1);
- return g_unichar_isprint (uni);
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_iscombiningmark (const char *ch)
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (ch, -1);
- return str_unichar_iscombiningmark (uni);
- }
- /* --------------------------------------------------------------------------------------------- */
/* Advance *text over one character and over any combining marks that follow it.
 * Returns the number of single steps taken (0 when *text already points at the NUL). */
static int
str_utf8_cnext_noncomb_char (const char **text)
{
    int steps = 0;

    if ((*text)[0] == '\0')
        return 0;

    do
    {
        str_utf8_cnext_char_safe (text);
        steps++;
    }
    /* keep going only while we sit on a combining mark and have not hit the end */
    while (str_utf8_iscombiningmark (*text) && (*text)[0] != '\0');

    return steps;
}
- /* --------------------------------------------------------------------------------------------- */
/* Move *text backwards until it rests on a character that is not a combining mark,
 * never going before begin. Returns the number of single steps taken. */
static int
str_utf8_cprev_noncomb_char (const char **text, const char *begin)
{
    int steps = 0;

    if (*text == begin)
        return 0;

    do
    {
        str_utf8_cprev_char_safe (text);
        steps++;
    }
    /* keep going only while we sit on a combining mark and have not reached 'begin' */
    while (str_utf8_iscombiningmark (*text) && *text != begin);

    return steps;
}
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_toupper (const char *text, char **out, size_t *remain)
- {
- gunichar uni;
- size_t left;
- uni = g_utf8_get_char_validated (text, -1);
- if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
- return FALSE;
- uni = g_unichar_toupper (uni);
- left = g_unichar_to_utf8 (uni, NULL);
- if (left >= *remain)
- return FALSE;
- left = g_unichar_to_utf8 (uni, *out);
- (*out) += left;
- (*remain) -= left;
- return TRUE;
- }
- /* --------------------------------------------------------------------------------------------- */
- static gboolean
- str_utf8_tolower (const char *text, char **out, size_t *remain)
- {
- gunichar uni;
- size_t left;
- uni = g_utf8_get_char_validated (text, -1);
- if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
- return FALSE;
- uni = g_unichar_tolower (uni);
- left = g_unichar_to_utf8 (uni, NULL);
- if (left >= *remain)
- return FALSE;
- left = g_unichar_to_utf8 (uni, *out);
- (*out) += left;
- (*remain) -= left;
- return TRUE;
- }
- /* --------------------------------------------------------------------------------------------- */
/* Count the characters of text. Each byte at which validation fails is counted
 * as one character; the valid runs between such bytes are measured with g_utf8_strlen(). */
static int
str_utf8_length (const char *text)
{
    const char *chunk = text;
    const char *bad;
    int count = 0;

    for (;;)
    {
        /* on failure 'bad' points at the first invalid byte, on success at the end of text */
        if (g_utf8_validate (chunk, -1, &bad) || chunk[0] == '\0')
            break;

        if (chunk != bad)
            count += g_utf8_strlen (chunk, bad - chunk);
        count++;
        chunk = bad + 1;
    }

    /* nothing invalid was ever found: measure the whole string in one go */
    if (chunk == text)
        return g_utf8_strlen (text, -1);

    /* valid tail after the last invalid byte */
    if (chunk[0] != '\0' && chunk != bad)
        count += g_utf8_strlen (chunk, bad - chunk);

    return count;
}
- /* --------------------------------------------------------------------------------------------- */
/* Like str_utf8_length(), but size is passed on as the length limit of g_utf8_strlen()
 * and is consumed while walking over valid runs and invalid bytes. */
static int
str_utf8_length2 (const char *text, int size)
{
    const char *chunk = text;
    const char *bad;
    int count = 0;

    while (!g_utf8_validate (chunk, -1, &bad) && chunk[0] != '\0' && size > 0)
    {
        if (chunk != bad)
        {
            count += g_utf8_strlen (chunk, MIN (bad - chunk, size));
            size -= bad - chunk;
        }

        /* the invalid byte itself counts only if the budget is not used up yet */
        if (size > 0)
            count++;
        size--;
        chunk = bad + 1;
    }

    /* nothing invalid was ever found: measure the string in one go */
    if (chunk == text)
        return g_utf8_strlen (text, size);

    /* valid tail after the last invalid byte */
    if (chunk[0] != '\0' && chunk != bad && size > 0)
        count += g_utf8_strlen (chunk, MIN (bad - chunk, size));

    return count;
}
- /* --------------------------------------------------------------------------------------------- */
/* Count the characters of text, treating a character together with the combining
 * marks that follow it as one unit. */
static int
str_utf8_length_noncomb (const char *text)
{
    const char *p;
    int count = 0;

    for (p = text; p[0] != '\0'; count++)
        str_utf8_cnext_noncomb_char (&p);

    return count;
}
- /* --------------------------------------------------------------------------------------------- */
#if 0
/* Disabled code: skip one character of *string, reduce *left by its byte length
 * and append '?' to buffer. */
static void
str_utf8_questmark_sustb (char **string, size_t *left, GString *buffer)
{
    char *const after = g_utf8_next_char (*string);

    *left -= after - *string;
    *string = after;
    g_string_append_c (buffer, '?');
}
#endif
- /* --------------------------------------------------------------------------------------------- */
- static gchar *
- str_utf8_conv_gerror_message (GError *mcerror, const char *def_msg)
- {
- if (mcerror != NULL)
- return g_strdup (mcerror->message);
- return g_strdup (def_msg != NULL ? def_msg : "");
- }
- /* --------------------------------------------------------------------------------------------- */
- static estr_t
- str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString *buffer)
- {
- estr_t result = ESTR_SUCCESS;
- if (coder == str_cnv_not_convert)
- g_string_append_len (buffer, string, size);
- else
- result = str_nconvert (coder, string, size, buffer);
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- /* utility function, that makes string valid in utf8 and all characters printable
- * return width of string too */
- static const struct term_form *
- str_utf8_make_make_term_form (const char *text, size_t length)
- {
- static struct term_form result;
- gunichar uni;
- size_t left;
- char *actual;
- result.text[0] = '\0';
- result.width = 0;
- result.compose = FALSE;
- actual = result.text;
- /* check if text start with combining character,
- * add space at begin in this case */
- if (length != 0 && text[0] != '\0')
- {
- uni = g_utf8_get_char_validated (text, -1);
- if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
- && str_unichar_iscombiningmark (uni))
- {
- actual[0] = ' ';
- actual++;
- result.width++;
- result.compose = TRUE;
- }
- }
- while (length != 0 && text[0] != '\0')
- {
- uni = g_utf8_get_char_validated (text, -1);
- if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
- {
- if (g_unichar_isprint (uni))
- {
- left = g_unichar_to_utf8 (uni, actual);
- actual += left;
- if (str_unichar_iscombiningmark (uni))
- result.compose = TRUE;
- else
- {
- result.width++;
- if (g_unichar_iswide (uni))
- result.width++;
- }
- }
- else
- {
- actual[0] = '.';
- actual++;
- result.width++;
- }
- text = g_utf8_next_char (text);
- }
- else
- {
- size_t repl_len;
- text++;
- /*actual[0] = '?'; */
- repl_len = strlen (replch);
- memcpy (actual, replch, repl_len);
- actual += repl_len;
- result.width++;
- }
- if (length != (size_t) (-1))
- length--;
- }
- actual[0] = '\0';
- return &result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static const char *
- str_utf8_term_form (const char *text)
- {
- static char result[BUF_MEDIUM * MB_LEN_MAX];
- const struct term_form *pre_form;
- pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
- if (pre_form->compose)
- {
- char *composed;
- composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
- g_strlcpy (result, composed, sizeof (result));
- g_free (composed);
- }
- else
- g_strlcpy (result, pre_form->text, sizeof (result));
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- /* utility function, that copies all characters from checked to actual */
- static gboolean
- utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
- {
- tool->compose = FALSE;
- while (tool->checked[0] != '\0')
- {
- gunichar uni;
- size_t left;
- uni = g_utf8_get_char (tool->checked);
- tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
- left = g_unichar_to_utf8 (uni, NULL);
- if (tool->remain <= left)
- return FALSE;
- left = g_unichar_to_utf8 (uni, tool->actual);
- tool->actual += left;
- tool->remain -= left;
- tool->checked = g_utf8_next_char (tool->checked);
- }
- return TRUE;
- }
- /* --------------------------------------------------------------------------------------------- */
- /* utility function, that copies characters from checked to actual until ident is
- * smaller than to_ident */
- static gboolean
- utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
- {
- tool->compose = FALSE;
- while (tool->checked[0] != '\0')
- {
- gunichar uni;
- size_t left;
- int w = 0;
- uni = g_utf8_get_char (tool->checked);
- if (str_unichar_iscombiningmark (uni))
- tool->compose = TRUE;
- else
- {
- w = 1;
- if (g_unichar_iswide (uni))
- w++;
- if (tool->ident + w > to_ident)
- return TRUE;
- }
- left = g_unichar_to_utf8 (uni, NULL);
- if (tool->remain <= left)
- return FALSE;
- left = g_unichar_to_utf8 (uni, tool->actual);
- tool->actual += left;
- tool->remain -= left;
- tool->checked = g_utf8_next_char (tool->checked);
- tool->ident += w;
- }
- return TRUE;
- }
- /* --------------------------------------------------------------------------------------------- */
- /* utility function, adds count spaces to actual */
- static int
- utf8_tool_insert_space (struct utf8_tool *tool, int count)
- {
- if (count <= 0)
- return 1;
- if (tool->remain <= (gsize) count)
- return 0;
- memset (tool->actual, ' ', count);
- tool->actual += count;
- tool->remain -= count;
- return 1;
- }
- /* --------------------------------------------------------------------------------------------- */
- /* utility function, adds one characters to actual */
- static int
- utf8_tool_insert_char (struct utf8_tool *tool, char ch)
- {
- if (tool->remain <= 1)
- return 0;
- tool->actual[0] = ch;
- tool->actual++;
- tool->remain--;
- return 1;
- }
- /* --------------------------------------------------------------------------------------------- */
- /* utility function, thah skips characters from checked until ident is greater or
- * equal to to_ident */
- static gboolean
- utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
- {
- gunichar uni;
- while (to_ident > tool->ident && tool->checked[0] != '\0')
- {
- uni = g_utf8_get_char (tool->checked);
- if (!str_unichar_iscombiningmark (uni))
- {
- tool->ident++;
- if (g_unichar_iswide (uni))
- tool->ident++;
- }
- tool->checked = g_utf8_next_char (tool->checked);
- }
- uni = g_utf8_get_char (tool->checked);
- while (str_unichar_iscombiningmark (uni))
- {
- tool->checked = g_utf8_next_char (tool->checked);
- uni = g_utf8_get_char (tool->checked);
- }
- return TRUE;
- }
- /* --------------------------------------------------------------------------------------------- */
- static void
- utf8_tool_compose (char *buffer, size_t size)
- {
- char *composed;
- composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
- g_strlcpy (buffer, composed, size);
- g_free (composed);
- }
- /* --------------------------------------------------------------------------------------------- */
- static const char *
- str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
- {
- static char result[BUF_MEDIUM * MB_LEN_MAX];
- const struct term_form *pre_form;
- struct utf8_tool tool;
- pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
- tool.checked = pre_form->text;
- tool.actual = result;
- tool.remain = sizeof (result);
- tool.compose = FALSE;
- if (pre_form->width <= (gsize) width)
- {
- switch (HIDE_FIT (just_mode))
- {
- case J_CENTER_LEFT:
- case J_CENTER:
- tool.ident = (width - pre_form->width) / 2;
- break;
- case J_RIGHT:
- tool.ident = width - pre_form->width;
- break;
- default:
- tool.ident = 0;
- break;
- }
- utf8_tool_insert_space (&tool, tool.ident);
- utf8_tool_copy_chars_to_end (&tool);
- utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
- }
- else if (IS_FIT (just_mode))
- {
- tool.ident = 0;
- utf8_tool_copy_chars_to (&tool, width / 2);
- utf8_tool_insert_char (&tool, '~');
- tool.ident = 0;
- utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
- utf8_tool_copy_chars_to_end (&tool);
- utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
- }
- else
- {
- switch (HIDE_FIT (just_mode))
- {
- case J_CENTER:
- tool.ident = (width - pre_form->width) / 2;
- break;
- case J_RIGHT:
- tool.ident = width - pre_form->width;
- break;
- default:
- tool.ident = 0;
- break;
- }
- utf8_tool_skip_chars_to (&tool, 0);
- utf8_tool_insert_space (&tool, tool.ident);
- utf8_tool_copy_chars_to (&tool, width);
- utf8_tool_insert_space (&tool, width - tool.ident);
- }
- tool.actual[0] = '\0';
- if (tool.compose)
- utf8_tool_compose (result, sizeof (result));
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static const char *
- str_utf8_term_trim (const char *text, int width)
- {
- static char result[BUF_MEDIUM * MB_LEN_MAX];
- const struct term_form *pre_form;
- struct utf8_tool tool;
- if (width < 1)
- {
- result[0] = '\0';
- return result;
- }
- pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
- tool.checked = pre_form->text;
- tool.actual = result;
- tool.remain = sizeof (result);
- tool.compose = FALSE;
- if ((gsize) width >= pre_form->width)
- utf8_tool_copy_chars_to_end (&tool);
- else if (width <= 3)
- {
- memset (tool.actual, '.', width);
- tool.actual += width;
- tool.remain -= width;
- }
- else
- {
- memset (tool.actual, '.', 3);
- tool.actual += 3;
- tool.remain -= 3;
- tool.ident = 0;
- utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
- utf8_tool_copy_chars_to_end (&tool);
- }
- tool.actual[0] = '\0';
- if (tool.compose)
- utf8_tool_compose (result, sizeof (result));
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static int
- str_utf8_term_width2 (const char *text, size_t length)
- {
- const struct term_form *result;
- result = str_utf8_make_make_term_form (text, length);
- return result->width;
- }
- /* --------------------------------------------------------------------------------------------- */
/* Return the column width of the whole text (no length limit). */
static int
str_utf8_term_width1 (const char *text)
{
    const size_t no_limit = (size_t) (-1);

    return str_utf8_term_width2 (text, no_limit);
}
- /* --------------------------------------------------------------------------------------------- */
- static int
- str_utf8_term_char_width (const char *text)
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (text, -1);
- return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
- }
- /* --------------------------------------------------------------------------------------------- */
- static const char *
- str_utf8_term_substring (const char *text, int start, int width)
- {
- static char result[BUF_MEDIUM * MB_LEN_MAX];
- const struct term_form *pre_form;
- struct utf8_tool tool;
- pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
- tool.checked = pre_form->text;
- tool.actual = result;
- tool.remain = sizeof (result);
- tool.compose = FALSE;
- tool.ident = -start;
- utf8_tool_skip_chars_to (&tool, 0);
- if (tool.ident < 0)
- tool.ident = 0;
- utf8_tool_insert_space (&tool, tool.ident);
- utf8_tool_copy_chars_to (&tool, width);
- utf8_tool_insert_space (&tool, width - tool.ident);
- tool.actual[0] = '\0';
- if (tool.compose)
- utf8_tool_compose (result, sizeof (result));
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static const char *
- str_utf8_trunc (const char *text, int width)
- {
- static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
- const struct term_form *pre_form;
- struct utf8_tool tool;
- pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
- tool.checked = pre_form->text;
- tool.actual = result;
- tool.remain = sizeof (result);
- tool.compose = FALSE;
- if (pre_form->width <= (gsize) width)
- utf8_tool_copy_chars_to_end (&tool);
- else
- {
- tool.ident = 0;
- utf8_tool_copy_chars_to (&tool, width / 2);
- utf8_tool_insert_char (&tool, '~');
- tool.ident = 0;
- utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
- utf8_tool_copy_chars_to_end (&tool);
- }
- tool.actual[0] = '\0';
- if (tool.compose)
- utf8_tool_compose (result, sizeof (result));
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static int
- str_utf8_offset_to_pos (const char *text, size_t length)
- {
- if (str_utf8_is_valid_string (text))
- return g_utf8_offset_to_pointer (text, length) - text;
- else
- {
- int result;
- char *buffer;
- buffer = g_strdup (text);
- str_utf8_fix_string (buffer);
- result = g_utf8_offset_to_pointer (buffer, length) - buffer;
- g_free (buffer);
- return result;
- }
- }
- /* --------------------------------------------------------------------------------------------- */
- static int
- str_utf8_column_to_pos (const char *text, size_t pos)
- {
- int result = 0;
- int width = 0;
- while (text[0] != '\0')
- {
- gunichar uni;
- uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
- if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
- {
- if (g_unichar_isprint (uni))
- {
- if (!str_unichar_iscombiningmark (uni))
- {
- width++;
- if (g_unichar_iswide (uni))
- width++;
- }
- }
- else
- {
- width++;
- }
- text = g_utf8_next_char (text);
- }
- else
- {
- text++;
- width++;
- }
- if ((gsize) width > pos)
- return result;
- result++;
- }
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static char *
- str_utf8_create_search_needle (const char *needle, gboolean case_sen)
- {
- char *fold, *result;
- if (needle == NULL)
- return NULL;
- if (case_sen)
- return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
- fold = g_utf8_casefold (needle, -1);
- result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
- g_free (fold);
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static void
- str_utf8_release_search_needle (char *needle, gboolean case_sen)
- {
- (void) case_sen;
- g_free (needle);
- }
- /* --------------------------------------------------------------------------------------------- */
- static const char *
- str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
- {
- char *deco_text;
- const char *match;
- const char *result = NULL;
- size_t search_len;
- if (case_sen)
- deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
- else
- {
- char *fold_text;
- fold_text = g_utf8_casefold (text, -1);
- deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
- g_free (fold_text);
- }
- search_len = strlen (search);
- match = deco_text;
- do
- {
- match = g_strstr_len (match, -1, search);
- if (match != NULL)
- {
- if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
- !str_utf8_iscombiningmark (match + search_len))
- {
- const char *m = deco_text;
- result = text;
- while (m < match)
- {
- str_utf8_cnext_noncomb_char (&m);
- str_utf8_cnext_noncomb_char (&result);
- }
- }
- else
- str_utf8_cnext_char (&match);
- }
- }
- while (match != NULL && result == NULL);
- g_free (deco_text);
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static const char *
- str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
- {
- char *deco_text;
- char *match;
- const char *result = NULL;
- size_t search_len;
- if (case_sen)
- deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
- else
- {
- char *fold_text;
- fold_text = g_utf8_casefold (text, -1);
- deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
- g_free (fold_text);
- }
- search_len = strlen (search);
- do
- {
- match = g_strrstr_len (deco_text, -1, search);
- if (match != NULL)
- {
- if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
- !str_utf8_iscombiningmark (match + search_len))
- {
- const char *m = deco_text;
- result = text;
- while (m < match)
- {
- str_utf8_cnext_noncomb_char (&m);
- str_utf8_cnext_noncomb_char (&result);
- }
- }
- else
- match[0] = '\0';
- }
- }
- while (match != NULL && result == NULL);
- g_free (deco_text);
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static char *
- str_utf8_normalize (const char *text)
- {
- GString *fixed;
- char *tmp;
- char *result;
- const char *start;
- const char *end;
- /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
- * does the normalization and then converts UCS-4 back into UTF-8.
- * Since file names are composed of ASCII characters in most cases, we can speed up
- * utf8 normalization by checking if the heavyweight Unicode normalization is actually
- * needed. Normalization of ASCII string is no-op.
- */
- /* find out whether text is ASCII only */
- for (end = text; *end != '\0'; end++)
- if ((*end & 0x80) != 0)
- {
- /* found 2nd byte of utf8-encoded symbol */
- break;
- }
- /* if text is ASCII-only, return copy, normalize otherwise */
- if (*end == '\0')
- return g_strndup (text, end - text);
- fixed = g_string_sized_new (4);
- start = text;
- while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
- {
- if (start != end)
- {
- tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
- g_string_append (fixed, tmp);
- g_free (tmp);
- }
- g_string_append_c (fixed, end[0]);
- start = end + 1;
- }
- if (start == text)
- {
- result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
- g_string_free (fixed, TRUE);
- }
- else
- {
- if (start[0] != '\0' && start != end)
- {
- tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
- g_string_append (fixed, tmp);
- g_free (tmp);
- }
- result = g_string_free (fixed, FALSE);
- }
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static char *
- str_utf8_casefold_normalize (const char *text)
- {
- GString *fixed;
- char *tmp, *fold;
- char *result;
- const char *start;
- const char *end;
- fixed = g_string_sized_new (4);
- start = text;
- while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
- {
- if (start != end)
- {
- fold = g_utf8_casefold (start, end - start);
- tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
- g_string_append (fixed, tmp);
- g_free (tmp);
- g_free (fold);
- }
- g_string_append_c (fixed, end[0]);
- start = end + 1;
- }
- if (start == text)
- {
- fold = g_utf8_casefold (text, -1);
- result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
- g_free (fold);
- g_string_free (fixed, TRUE);
- }
- else
- {
- if (start[0] != '\0' && start != end)
- {
- fold = g_utf8_casefold (start, end - start);
- tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
- g_string_append (fixed, tmp);
- g_free (tmp);
- g_free (fold);
- }
- result = g_string_free (fixed, FALSE);
- }
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
/**
 * Case-sensitive comparison of two strings after normalization.
 *
 * Both arguments are passed through str_utf8_normalize() and the results are
 * compared bytewise with strcmp().
 *
 * @return value of strcmp() applied to the normalized strings
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (right);
    g_free (left);

    return order;
}
- /* --------------------------------------------------------------------------------------------- */
/**
 * Case-sensitive comparison limited to the length of the shorter string.
 *
 * Both arguments are normalized with str_utf8_normalize(); only the first
 * min (len1, len2) bytes of the normalized forms are compared.
 *
 * @return value of strncmp() applied to the normalized strings
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    /* compare no further than the end of the shorter normalized string */
    const size_t limit = (left_len < right_len) ? left_len : right_len;
    const int order = strncmp (left, right, limit);

    g_free (right);
    g_free (left);

    return order;
}
- /* --------------------------------------------------------------------------------------------- */
/**
 * Case-insensitive comparison of two strings.
 *
 * Both arguments are passed through str_utf8_casefold_normalize() and the
 * results are compared bytewise with strcmp().
 *
 * @return value of strcmp() applied to the folded and normalized strings
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (right);
    g_free (left);

    return order;
}
- /* --------------------------------------------------------------------------------------------- */
/**
 * Case-insensitive comparison limited to the length of the shorter string.
 *
 * Both arguments are passed through str_utf8_casefold_normalize(); only the
 * first min (len1, len2) bytes of the resulting strings are compared.
 *
 * @return value of strncmp() applied to the folded and normalized strings
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    /* compare no further than the end of the shorter string */
    const size_t limit = (left_len < right_len) ? left_len : right_len;
    const int order = strncmp (left, right, limit);

    g_free (right);
    g_free (left);

    return order;
}
- /* --------------------------------------------------------------------------------------------- */
/**
 * Length of the common prefix of @text and @prefix, case-sensitive.
 *
 * Both strings are normalized with str_utf8_normalize() and then walked
 * character by character; the walk stops at the first character that differs
 * or at the end of either string.
 *
 * @return number of bytes of the normalized @prefix that matched
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *text_norm = str_utf8_normalize (text);
    char *prefix_norm = str_utf8_normalize (prefix);
    const char *text_pos = text_norm;
    const char *prefix_pos = prefix_norm;
    int matched;

    while (*text_pos != '\0' && *prefix_pos != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* step both look-ahead cursors over one character */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ when their byte lengths or their bytes differ */
        if (text_next - text_pos != prefix_next - prefix_pos
            || strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - prefix_norm;

    g_free (text_norm);
    g_free (prefix_norm);

    return matched;
}
- /* --------------------------------------------------------------------------------------------- */
/**
 * Length of the common prefix of @text and @prefix, case-insensitive.
 *
 * Both strings are passed through str_utf8_casefold_normalize() and then
 * walked character by character; the walk stops at the first character that
 * differs or at the end of either string.
 *
 * @return number of bytes of the folded and normalized @prefix that matched
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *text_fold = str_utf8_casefold_normalize (text);
    char *prefix_fold = str_utf8_casefold_normalize (prefix);
    const char *text_pos = text_fold;
    const char *prefix_pos = prefix_fold;
    int matched;

    while (*text_pos != '\0' && *prefix_pos != '\0')
    {
        const char *text_next = text_pos;
        const char *prefix_next = prefix_pos;

        /* step both look-ahead cursors over one character */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        /* characters differ when their byte lengths or their bytes differ */
        if (text_next - text_pos != prefix_next - prefix_pos
            || strncmp (text_pos, prefix_pos, text_next - text_pos) != 0)
            break;

        text_pos = text_next;
        prefix_pos = prefix_next;
    }

    matched = prefix_pos - prefix_fold;

    g_free (text_fold);
    g_free (prefix_fold);

    return matched;
}
- /* --------------------------------------------------------------------------------------------- */
- static char *
- str_utf8_create_key_gen (const char *text, gboolean case_sen,
- gchar *(*keygen) (const gchar *text, gssize size))
- {
- char *result;
- if (case_sen)
- result = str_utf8_normalize (text);
- else
- {
- gboolean dot;
- GString *fixed;
- const char *start, *end;
- char *fold, *key;
- dot = text[0] == '.';
- fixed = g_string_sized_new (16);
- if (!dot)
- start = text;
- else
- {
- start = text + 1;
- g_string_append_c (fixed, '.');
- }
- while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
- {
- if (start != end)
- {
- fold = g_utf8_casefold (start, end - start);
- key = keygen (fold, -1);
- g_string_append (fixed, key);
- g_free (key);
- g_free (fold);
- }
- g_string_append_c (fixed, end[0]);
- start = end + 1;
- }
- if (start == text)
- {
- fold = g_utf8_casefold (start, -1);
- result = keygen (fold, -1);
- g_free (fold);
- g_string_free (fixed, TRUE);
- }
- else if (dot && (start == text + 1))
- {
- fold = g_utf8_casefold (start, -1);
- key = keygen (fold, -1);
- g_string_append (fixed, key);
- g_free (key);
- g_free (fold);
- result = g_string_free (fixed, FALSE);
- }
- else
- {
- if (start[0] != '\0' && start != end)
- {
- fold = g_utf8_casefold (start, end - start);
- key = keygen (fold, -1);
- g_string_append (fixed, key);
- g_free (key);
- g_free (fold);
- }
- result = g_string_free (fixed, FALSE);
- }
- }
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
- static char *
- str_utf8_create_key (const char *text, gboolean case_sen)
- {
- return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
- }
- /* --------------------------------------------------------------------------------------------- */
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Build a comparison key for the file name @text using
 * g_utf8_collate_key_for_filename() as the key generator of
 * str_utf8_create_key_gen().
 *
 * @return key produced by str_utf8_create_key_gen()
 */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
- /* --------------------------------------------------------------------------------------------- */
- static int
- str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
- {
- (void) case_sen;
- return strcmp (t1, t2);
- }
- /* --------------------------------------------------------------------------------------------- */
- static void
- str_utf8_release_key (char *key, gboolean case_sen)
- {
- (void) case_sen;
- g_free (key);
- }
- /* --------------------------------------------------------------------------------------------- */
- /*** public functions ****************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- struct str_class
- str_utf8_init (void)
- {
- struct str_class result;
- result.conv_gerror_message = str_utf8_conv_gerror_message;
- result.vfs_convert_to = str_utf8_vfs_convert_to;
- result.insert_replace_char = str_utf8_insert_replace_char;
- result.is_valid_string = str_utf8_is_valid_string;
- result.is_valid_char = str_utf8_is_valid_char;
- result.cnext_char = str_utf8_cnext_char;
- result.cprev_char = str_utf8_cprev_char;
- result.cnext_char_safe = str_utf8_cnext_char_safe;
- result.cprev_char_safe = str_utf8_cprev_char_safe;
- result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
- result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
- result.char_isspace = str_utf8_isspace;
- result.char_ispunct = str_utf8_ispunct;
- result.char_isalnum = str_utf8_isalnum;
- result.char_isdigit = str_utf8_isdigit;
- result.char_isprint = str_utf8_isprint;
- result.char_iscombiningmark = str_utf8_iscombiningmark;
- result.char_toupper = str_utf8_toupper;
- result.char_tolower = str_utf8_tolower;
- result.length = str_utf8_length;
- result.length2 = str_utf8_length2;
- result.length_noncomb = str_utf8_length_noncomb;
- result.fix_string = str_utf8_fix_string;
- result.term_form = str_utf8_term_form;
- result.fit_to_term = str_utf8_fit_to_term;
- result.term_trim = str_utf8_term_trim;
- result.term_width2 = str_utf8_term_width2;
- result.term_width1 = str_utf8_term_width1;
- result.term_char_width = str_utf8_term_char_width;
- result.term_substring = str_utf8_term_substring;
- result.trunc = str_utf8_trunc;
- result.offset_to_pos = str_utf8_offset_to_pos;
- result.column_to_pos = str_utf8_column_to_pos;
- result.create_search_needle = str_utf8_create_search_needle;
- result.release_search_needle = str_utf8_release_search_needle;
- result.search_first = str_utf8_search_first;
- result.search_last = str_utf8_search_last;
- result.compare = str_utf8_compare;
- result.ncompare = str_utf8_ncompare;
- result.casecmp = str_utf8_casecmp;
- result.ncasecmp = str_utf8_ncasecmp;
- result.prefix = str_utf8_prefix;
- result.caseprefix = str_utf8_caseprefix;
- result.create_key = str_utf8_create_key;
- #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
- /* case insensitive sort files in "a1 a2 a10" order */
- result.create_key_for_filename = str_utf8_create_key_for_filename;
- #else
- /* case insensitive sort files in "a1 a10 a2" order */
- result.create_key_for_filename = str_utf8_create_key;
- #endif
- result.key_collate = str_utf8_key_collate;
- result.release_key = str_utf8_release_key;
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
|