hex.c 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. /*
  2. Search text engine.
  3. HEX-style pattern matching
  4. Copyright (C) 2009-2019
  5. Free Software Foundation, Inc.
  6. Written by:
  7. Slava Zanko <slavazanko@gmail.com>, 2009.
  8. This file is part of the Midnight Commander.
  9. The Midnight Commander is free software: you can redistribute it
  10. and/or modify it under the terms of the GNU General Public License as
  11. published by the Free Software Foundation, either version 3 of the License,
  12. or (at your option) any later version.
  13. The Midnight Commander is distributed in the hope that it will be useful,
  14. but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. GNU General Public License for more details.
  17. You should have received a copy of the GNU General Public License
  18. along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. */
#include <config.h>

#include <stdio.h>
#include <stdlib.h>             /* strtoul() */

#include "lib/global.h"
#include "lib/strutil.h"
#include "lib/search.h"
#include "lib/strescape.h"

#include "internal.h"
  27. /*** global variables ****************************************************************************/
  28. /*** file scope macro definitions ****************************************************************/
  29. typedef enum
  30. {
  31. MC_SEARCH_HEX_E_OK,
  32. MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE,
  33. MC_SEARCH_HEX_E_INVALID_CHARACTER,
  34. MC_SEARCH_HEX_E_UNMATCHED_QUOTES
  35. } mc_search_hex_parse_error_t;
  36. /*** file scope type declarations ****************************************************************/
  37. /*** file scope variables ************************************************************************/
  38. /*** file scope functions ************************************************************************/
  39. static GString *
  40. mc_search__hex_translate_to_regex (const GString * astr, mc_search_hex_parse_error_t * error_ptr,
  41. int *error_pos_ptr)
  42. {
  43. GString *buff;
  44. const char *str;
  45. gsize str_len;
  46. gsize loop = 0;
  47. mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
  48. buff = g_string_sized_new (64);
  49. str = astr->str;
  50. str_len = astr->len;
  51. while (loop < str_len && error == MC_SEARCH_HEX_E_OK)
  52. {
  53. unsigned int val;
  54. int ptr;
  55. if (g_ascii_isspace (str[loop]))
  56. {
  57. /* Eat-up whitespace between tokens. */
  58. while (g_ascii_isspace (str[loop]))
  59. loop++;
  60. }
  61. /* cppcheck-suppress invalidscanf */
  62. else if (sscanf (str + loop, "%x%n", &val, &ptr) == 1)
  63. {
  64. if (val > 255)
  65. error = MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE;
  66. else
  67. {
  68. g_string_append_printf (buff, "\\x%02X", val);
  69. loop += ptr;
  70. }
  71. }
  72. else if (str[loop] == '"')
  73. {
  74. gsize loop2;
  75. loop2 = loop + 1;
  76. while (loop2 < str_len)
  77. {
  78. if (str[loop2] == '"')
  79. break;
  80. if (str[loop2] == '\\' && loop2 + 1 < str_len)
  81. loop2++;
  82. g_string_append_c (buff, str[loop2]);
  83. loop2++;
  84. }
  85. if (str[loop2] == '\0')
  86. error = MC_SEARCH_HEX_E_UNMATCHED_QUOTES;
  87. else
  88. loop = loop2 + 1;
  89. }
  90. else
  91. error = MC_SEARCH_HEX_E_INVALID_CHARACTER;
  92. }
  93. if (error != MC_SEARCH_HEX_E_OK)
  94. {
  95. g_string_free (buff, TRUE);
  96. if (error_ptr != NULL)
  97. *error_ptr = error;
  98. if (error_pos_ptr != NULL)
  99. *error_pos_ptr = loop;
  100. return NULL;
  101. }
  102. return buff;
  103. }
  104. /*** public functions ****************************************************************************/
  105. void
  106. mc_search__cond_struct_new_init_hex (const char *charset, mc_search_t * lc_mc_search,
  107. mc_search_cond_t * mc_search_cond)
  108. {
  109. GString *tmp;
  110. mc_search_hex_parse_error_t error = MC_SEARCH_HEX_E_OK;
  111. int error_pos = 0;
  112. /*
  113. * We may be searching in binary data, which is often invalid UTF-8.
  114. *
  115. * We have to create a non UTF-8 regex (that is, G_REGEX_RAW) or else, as
  116. * the data is invalid UTF-8, both GLib's PCRE and our
  117. * mc_search__g_regex_match_full_safe() are going to fail us. The former by
  118. * not finding all bytes, the latter by overwriting the supposedly invalid
  119. * UTF-8 with NULs.
  120. *
  121. * To do this, we specify "ASCII" as the charset.
  122. *
  123. * In fact, we can specify any charset other than "UTF-8": any such charset
  124. * will trigger G_REGEX_RAW (see [1]). The output of [2] will be the same
  125. * for all charsets because it skips the \xXX symbols
  126. * mc_search__hex_translate_to_regex() outputs.
  127. *
  128. * But "ASCII" is the best choice because a hex pattern may contain a
  129. * quoted string: this way we know [2] will ignore any characters outside
  130. * ASCII letters range (these ignored chars will be copied verbatim to the
  131. * output and will match as-is; in other words, in a case-sensitive manner;
  132. * If the user is interested in case-insensitive searches of international
  133. * text, he shouldn't be using hex search in the first place.)
  134. *
  135. * Switching out of UTF-8 has another advantage:
  136. *
  137. * When doing case-insensitive searches, GLib treats \xXX symbols as normal
  138. * letters and therefore matches both "a" and "A" for the hex pattern
  139. * "0x61". When we switch out of UTF-8, we're switching to using [2], which
  140. * doesn't have this issue.
  141. *
  142. * [1] mc_search__cond_struct_new_init_regex
  143. * [2] mc_search__cond_struct_new_regex_ci_str
  144. */
  145. if (str_isutf8 (charset))
  146. charset = "ASCII";
  147. tmp = mc_search__hex_translate_to_regex (mc_search_cond->str, &error, &error_pos);
  148. if (tmp != NULL)
  149. {
  150. g_string_free (mc_search_cond->str, TRUE);
  151. mc_search_cond->str = tmp;
  152. mc_search__cond_struct_new_init_regex (charset, lc_mc_search, mc_search_cond);
  153. }
  154. else
  155. {
  156. const char *desc;
  157. switch (error)
  158. {
  159. case MC_SEARCH_HEX_E_NUM_OUT_OF_RANGE:
  160. desc =
  161. _
  162. ("Number out of range (should be in byte range, 0 <= n <= 0xFF, expressed in hex)");
  163. break;
  164. case MC_SEARCH_HEX_E_INVALID_CHARACTER:
  165. desc = _("Invalid character");
  166. break;
  167. case MC_SEARCH_HEX_E_UNMATCHED_QUOTES:
  168. desc = _("Unmatched quotes character");
  169. break;
  170. default:
  171. desc = "";
  172. }
  173. lc_mc_search->error = MC_SEARCH_E_INPUT;
  174. lc_mc_search->error_str =
  175. g_strdup_printf (_("Hex pattern error at position %d:\n%s."), error_pos + 1, desc);
  176. }
  177. }
  178. /* --------------------------------------------------------------------------------------------- */
  179. gboolean
  180. mc_search__run_hex (mc_search_t * lc_mc_search, const void *user_data,
  181. gsize start_search, gsize end_search, gsize * found_len)
  182. {
  183. return mc_search__run_regex (lc_mc_search, user_data, start_search, end_search, found_len);
  184. }
  185. /* --------------------------------------------------------------------------------------------- */
  186. GString *
  187. mc_search_hex_prepare_replace_str (mc_search_t * lc_mc_search, GString * replace_str)
  188. {
  189. (void) lc_mc_search;
  190. return g_string_new_len (replace_str->str, replace_str->len);
  191. }
  192. /* --------------------------------------------------------------------------------------------- */