123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
- /*
- Search text engine.
- HEX-style pattern matching
- Copyright (C) 2009-2025
- Free Software Foundation, Inc.
- Written by:
- Slava Zanko <slavazanko@gmail.com>, 2009.
- This file is part of the Midnight Commander.
- The Midnight Commander is free software: you can redistribute it
- and/or modify it under the terms of the GNU General Public License as
- published by the Free Software Foundation, either version 3 of the License,
- or (at your option) any later version.
- The Midnight Commander is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
- #include <config.h>
- #include <stdio.h>
- #include "lib/global.h"
- #include "lib/strutil.h"
- #include "lib/search.h"
- #include "internal.h"
- /*** global variables ****************************************************************************/
- /*** file scope macro definitions ****************************************************************/
- typedef enum
- {
- MC_SEARCH_HEX_E_OK,
- MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE,
- MC_SEARCH_HEX_E_INVALID_CHARACTER,
- MC_SEARCH_HEX_E_UNMATCHED_QUOTES
- } mc_search_hex_parse_error_t;
- /*** file scope type declarations ****************************************************************/
- /*** forward declarations (file scope functions) *************************************************/
- /*** file scope variables ************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- /*** file scope functions ************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- static GString *
- mc_search__hex_translate_to_regex (const GString *astr, mc_search_hex_parse_error_t *error_ptr,
- int *error_pos_ptr)
- {
- GString *buff;
- const char *str;
- gsize str_len;
- gsize loop = 0;
- mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
- buff = g_string_sized_new (64);
- str = astr->str;
- str_len = astr->len;
- while (loop < str_len && error == MC_SEARCH_HEX_E_OK)
- {
- unsigned int val;
- int ptr;
- if (g_ascii_isspace (str[loop]))
- {
- /* Eat-up whitespace between tokens. */
- while (g_ascii_isspace (str[loop]))
- loop++;
- }
- /* cppcheck-suppress invalidscanf */
- else if (sscanf (str + loop, "%x%n", &val, &ptr) == 1)
- {
- if (val > 255)
- error = MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE;
- else
- {
- g_string_append_printf (buff, "\\x%02X", val);
- loop += ptr;
- }
- }
- else if (str[loop] == '"')
- {
- gsize loop2;
- loop2 = loop + 1;
- while (loop2 < str_len)
- {
- if (str[loop2] == '"')
- break;
- if (str[loop2] == '\\' && loop2 + 1 < str_len)
- loop2++;
- g_string_append_c (buff, str[loop2]);
- loop2++;
- }
- if (str[loop2] == '\0')
- error = MC_SEARCH_HEX_E_UNMATCHED_QUOTES;
- else
- loop = loop2 + 1;
- }
- else
- error = MC_SEARCH_HEX_E_INVALID_CHARACTER;
- }
- if (error != MC_SEARCH_HEX_E_OK)
- {
- g_string_free (buff, TRUE);
- if (error_ptr != NULL)
- *error_ptr = error;
- if (error_pos_ptr != NULL)
- *error_pos_ptr = loop;
- return NULL;
- }
- return buff;
- }
- /* --------------------------------------------------------------------------------------------- */
- /*** public functions ****************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- void
- mc_search__cond_struct_new_init_hex (const char *charset, mc_search_t *lc_mc_search,
- mc_search_cond_t *mc_search_cond)
- {
- GString *tmp;
- mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
- int error_pos = 0;
- /*
- * We may be searching in binary data, which is often invalid UTF-8.
- *
- * We have to create a non UTF-8 regex (that is, G_REGEX_RAW) or else, as
- * the data is invalid UTF-8, both GLib's PCRE and our
- * mc_search__g_regex_match_full_safe() are going to fail us. The former by
- * not finding all bytes, the latter by overwriting the supposedly invalid
- * UTF-8 with NULs.
- *
- * To do this, we specify "ASCII" as the charset.
- *
- * In fact, we can specify any charset other than "UTF-8": any such charset
- * will trigger G_REGEX_RAW (see [1]). The output of [2] will be the same
- * for all charsets because it skips the \xXX symbols
- * mc_search__hex_translate_to_regex() outputs.
- *
- * But "ASCII" is the best choice because a hex pattern may contain a
- * quoted string: this way we know [2] will ignore any characters outside
- * ASCII letters range (these ignored chars will be copied verbatim to the
- * output and will match as-is; in other words, in a case-sensitive manner;
- * If the user is interested in case-insensitive searches of international
- * text, he shouldn't be using hex search in the first place.)
- *
- * Switching out of UTF-8 has another advantage:
- *
- * When doing case-insensitive searches, GLib treats \xXX symbols as normal
- * letters and therefore matches both "a" and "A" for the hex pattern
- * "0x61". When we switch out of UTF-8, we're switching to using [2], which
- * doesn't have this issue.
- *
- * [1] mc_search__cond_struct_new_init_regex
- * [2] mc_search__cond_struct_new_regex_ci_str
- */
- if (str_isutf8 (charset))
- charset = "ASCII";
- tmp = mc_search__hex_translate_to_regex (mc_search_cond->str, &error, &error_pos);
- if (tmp != NULL)
- {
- g_string_free (mc_search_cond->str, TRUE);
- mc_search_cond->str = tmp;
- mc_search__cond_struct_new_init_regex (charset, lc_mc_search, mc_search_cond);
- }
- else
- {
- const char *desc;
- switch (error)
- {
- case MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE:
- desc =
- _
- ("Number out of range (should be in byte range, 0 <= n <= 0xFF, expressed in hex)");
- break;
- case MC_SEARCH_HEX_E_INVALID_CHARACTER:
- desc = _("Invalid character");
- break;
- case MC_SEARCH_HEX_E_UNMATCHED_QUOTES:
- desc = _("Unmatched quotes character");
- break;
- default:
- desc = "";
- }
- lc_mc_search->error = MC_SEARCH_E_INPUT;
- lc_mc_search->error_str =
- g_strdup_printf (_("Hex pattern error at position %d:\n%s."), error_pos + 1, desc);
- }
- }
- /* --------------------------------------------------------------------------------------------- */
- gboolean
- mc_search__run_hex (mc_search_t *lc_mc_search, const void *user_data,
- off_t start_search, off_t end_search, gsize *found_len)
- {
- return mc_search__run_regex (lc_mc_search, user_data, start_search, end_search, found_len);
- }
- /* --------------------------------------------------------------------------------------------- */
- GString *
- mc_search_hex_prepare_replace_str (mc_search_t *lc_mc_search, GString *replace_str)
- {
- (void) lc_mc_search;
- return mc_g_string_dup (replace_str);
- }
- /* --------------------------------------------------------------------------------------------- */
|