regex.c 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121
  1. /*
  2. Search text engine.
  3. Regex search
  4. Copyright (C) 2009-2023
  5. Free Software Foundation, Inc.
  6. Written by:
  7. Slava Zanko <slavazanko@gmail.com>, 2009, 2010, 2011, 2013
  8. Vitaliy Filippov <vitalif@yourcmc.ru>, 2011
  9. Andrew Borodin <aborodin@vmail.ru>, 2013-2015
  10. This file is part of the Midnight Commander.
  11. The Midnight Commander is free software: you can redistribute it
  12. and/or modify it under the terms of the GNU General Public License as
  13. published by the Free Software Foundation, either version 3 of the License,
  14. or (at your option) any later version.
  15. The Midnight Commander is distributed in the hope that it will be useful,
  16. but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. GNU General Public License for more details.
  19. You should have received a copy of the GNU General Public License
  20. along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. */
  22. #include <config.h>
  23. #include <stdlib.h>
  24. #include "lib/global.h"
  25. #include "lib/strutil.h"
  26. #include "lib/search.h"
  27. #include "lib/strescape.h"
  28. #include "lib/util.h" /* MC_PTR_FREE */
  29. #include "internal.h"
  30. /*** global variables ****************************************************************************/
  31. /*** file scope macro definitions ****************************************************************/
/* Negative result codes of mc_search_regex__process_replace_str(); a result >= 0 is a
 * capture buffer index instead. */
/* Nothing to interpret at the inspected position. */
#define REPLACE_PREPARE_T_NOTHING_SPECIAL -1
/* A case-transformation flag (\U, \u, \L, \l or \E) was consumed. */
#define REPLACE_PREPARE_T_REPLACE_FLAG -2
/* An escape sequence (octal, hex or a control character such as \n) was recognised. */
#define REPLACE_PREPARE_T_ESCAPE_SEQ -3
  35. /*** file scope type declarations ****************************************************************/
/* Case transformations applied while building a replacement string.
 * The values are bit flags and are combined/cleared with bitwise operators. */
typedef enum
{
    /* no transformation is active (set by \E) */
    REPLACE_T_NO_TRANSFORM = 0,
    /* \u: upper-case the next symbol only; cleared after it is applied */
    REPLACE_T_UPP_TRANSFORM_CHAR = 1,
    /* \l: lower-case the next symbol only; cleared after it is applied */
    REPLACE_T_LOW_TRANSFORM_CHAR = 2,
    /* \U: upper-case everything that follows; clears REPLACE_T_LOW_TRANSFORM */
    REPLACE_T_UPP_TRANSFORM = 4,
    /* \L: lower-case everything that follows; clears REPLACE_T_UPP_TRANSFORM */
    REPLACE_T_LOW_TRANSFORM = 8
} replace_transform_type_t;
  44. /*** forward declarations (file scope functions) *************************************************/
  45. /*** file scope variables ************************************************************************/
  46. /* --------------------------------------------------------------------------------------------- */
  47. /*** file scope functions ************************************************************************/
  48. /* --------------------------------------------------------------------------------------------- */
  49. static gboolean
  50. mc_search__regex_str_append_if_special (GString * copy_to, const GString * regex_str,
  51. gsize * offset)
  52. {
  53. const char *special_chars[] = {
  54. "\\s", "\\S",
  55. "\\d", "\\D",
  56. "\\b", "\\B",
  57. "\\w", "\\W",
  58. "\\t", "\\n",
  59. "\\r", "\\f",
  60. "\\a", "\\e",
  61. "\\x", "\\X",
  62. "\\c", "\\C",
  63. "\\l", "\\L",
  64. "\\u", "\\U",
  65. "\\E", "\\Q",
  66. NULL
  67. };
  68. char *tmp_regex_str;
  69. const char **spec_chr;
  70. tmp_regex_str = &(regex_str->str[*offset]);
  71. for (spec_chr = special_chars; *spec_chr != NULL; spec_chr++)
  72. {
  73. gsize spec_chr_len;
  74. spec_chr_len = strlen (*spec_chr);
  75. if (strncmp (tmp_regex_str, *spec_chr, spec_chr_len) == 0
  76. && !strutils_is_char_escaped (regex_str->str, tmp_regex_str))
  77. {
  78. if (strncmp ("\\x", *spec_chr, spec_chr_len) == 0)
  79. {
  80. if (tmp_regex_str[spec_chr_len] != '{')
  81. spec_chr_len += 2;
  82. else
  83. {
  84. while ((spec_chr_len < regex_str->len - *offset)
  85. && tmp_regex_str[spec_chr_len] != '}')
  86. spec_chr_len++;
  87. if (tmp_regex_str[spec_chr_len] == '}')
  88. spec_chr_len++;
  89. }
  90. }
  91. g_string_append_len (copy_to, tmp_regex_str, spec_chr_len);
  92. *offset += spec_chr_len;
  93. return TRUE;
  94. }
  95. }
  96. return FALSE;
  97. }
  98. /* --------------------------------------------------------------------------------------------- */
  99. static void
  100. mc_search__cond_struct_new_regex_hex_add (const char *charset, GString * str_to,
  101. const GString * one_char)
  102. {
  103. GString *upp, *low;
  104. gsize loop;
  105. upp = mc_search__toupper_case_str (charset, one_char);
  106. low = mc_search__tolower_case_str (charset, one_char);
  107. for (loop = 0; loop < upp->len; loop++)
  108. {
  109. gchar tmp_str[10 + 1]; /* longest content is "[\\x%02X\\x%02X]" */
  110. gint tmp_len;
  111. if (loop >= low->len || upp->str[loop] == low->str[loop])
  112. tmp_len =
  113. g_snprintf (tmp_str, sizeof (tmp_str), "\\x%02X", (unsigned char) upp->str[loop]);
  114. else
  115. tmp_len =
  116. g_snprintf (tmp_str, sizeof (tmp_str), "[\\x%02X\\x%02X]",
  117. (unsigned char) upp->str[loop], (unsigned char) low->str[loop]);
  118. g_string_append_len (str_to, tmp_str, tmp_len);
  119. }
  120. g_string_free (upp, TRUE);
  121. g_string_free (low, TRUE);
  122. }
  123. /* --------------------------------------------------------------------------------------------- */
  124. static void
  125. mc_search__cond_struct_new_regex_accum_append (const char *charset, GString * str_to,
  126. GString * str_from)
  127. {
  128. GString *recoded_part;
  129. gsize loop = 0;
  130. recoded_part = g_string_sized_new (32);
  131. while (loop < str_from->len)
  132. {
  133. GString *one_char;
  134. gboolean just_letters;
  135. one_char =
  136. mc_search__get_one_symbol (charset, str_from->str + loop,
  137. MIN (str_from->len - loop, 6), &just_letters);
  138. if (one_char->len == 0)
  139. loop++;
  140. else
  141. {
  142. loop += one_char->len;
  143. if (just_letters)
  144. mc_search__cond_struct_new_regex_hex_add (charset, recoded_part, one_char);
  145. else
  146. g_string_append_len (recoded_part, one_char->str, one_char->len);
  147. }
  148. g_string_free (one_char, TRUE);
  149. }
  150. g_string_append_len (str_to, recoded_part->str, recoded_part->len);
  151. g_string_free (recoded_part, TRUE);
  152. g_string_set_size (str_from, 0);
  153. }
  154. /* --------------------------------------------------------------------------------------------- */
  155. /**
  156. * Creates a case-insensitive version of a regex pattern.
  157. *
  158. * For example (assuming ASCII charset): given "\\bHello!\\xAB", returns
  159. * "\\b[Hh][Ee][Ll][Ll][Oo]!\\xAB" (this example is for easier reading; in
  160. * reality hex codes are used instead of letters).
  161. *
  162. * This function knows not to ruin special regex symbols.
  163. *
  164. * This function is used when working with non-UTF-8 charsets: GLib's
  165. * regex engine doesn't understand such charsets and therefore can't do
  166. * this job itself.
  167. */
  168. static GString *
  169. mc_search__cond_struct_new_regex_ci_str (const char *charset, const GString * astr)
  170. {
  171. GString *accumulator, *spec_char, *ret_str;
  172. gsize loop;
  173. ret_str = g_string_sized_new (64);
  174. accumulator = g_string_sized_new (64);
  175. spec_char = g_string_sized_new (64);
  176. loop = 0;
  177. while (loop < astr->len)
  178. {
  179. if (mc_search__regex_str_append_if_special (spec_char, astr, &loop))
  180. {
  181. mc_search__cond_struct_new_regex_accum_append (charset, ret_str, accumulator);
  182. g_string_append_len (ret_str, spec_char->str, spec_char->len);
  183. g_string_set_size (spec_char, 0);
  184. continue;
  185. }
  186. if (astr->str[loop] == '[' && !strutils_is_char_escaped (astr->str, &(astr->str[loop])))
  187. {
  188. mc_search__cond_struct_new_regex_accum_append (charset, ret_str, accumulator);
  189. while (loop < astr->len && !(astr->str[loop] == ']'
  190. && !strutils_is_char_escaped (astr->str,
  191. &(astr->str[loop]))))
  192. {
  193. g_string_append_c (ret_str, astr->str[loop]);
  194. loop++;
  195. }
  196. g_string_append_c (ret_str, astr->str[loop]);
  197. loop++;
  198. continue;
  199. }
  200. /*
  201. TODO: handle [ and ]
  202. */
  203. g_string_append_c (accumulator, astr->str[loop]);
  204. loop++;
  205. }
  206. mc_search__cond_struct_new_regex_accum_append (charset, ret_str, accumulator);
  207. g_string_free (accumulator, TRUE);
  208. g_string_free (spec_char, TRUE);
  209. return ret_str;
  210. }
  211. /* --------------------------------------------------------------------------------------------- */
  212. #ifdef SEARCH_TYPE_GLIB
  213. /* A thin wrapper above g_regex_match_full that makes sure the string passed
  214. * to it is valid UTF-8 (unless G_REGEX_RAW compile flag was set), as it is a
  215. * requirement by glib and it might crash otherwise. See: mc ticket 3449.
  216. * Be careful: there might be embedded NULs in the strings. */
  217. static gboolean
  218. mc_search__g_regex_match_full_safe (const GRegex * regex,
  219. const gchar * string,
  220. gssize string_len,
  221. gint start_position,
  222. GRegexMatchFlags match_options,
  223. GMatchInfo ** match_info, GError ** error)
  224. {
  225. char *string_safe, *p, *end;
  226. gboolean ret;
  227. if (string_len < 0)
  228. string_len = strlen (string);
  229. if ((g_regex_get_compile_flags (regex) & G_REGEX_RAW)
  230. || g_utf8_validate (string, string_len, NULL))
  231. {
  232. return g_regex_match_full (regex, string, string_len, start_position, match_options,
  233. match_info, error);
  234. }
  235. /* Correctly handle embedded NULs while copying */
  236. p = string_safe = g_malloc (string_len + 1);
  237. memcpy (string_safe, string, string_len);
  238. string_safe[string_len] = '\0';
  239. end = p + string_len;
  240. while (p < end)
  241. {
  242. gunichar c = g_utf8_get_char_validated (p, -1);
  243. if (c != (gunichar) (-1) && c != (gunichar) (-2))
  244. {
  245. p = g_utf8_next_char (p);
  246. }
  247. else
  248. {
  249. /* U+FFFD would be the proper choice, but then we'd have to
  250. maintain mapping between old and new offsets.
  251. So rather do a byte by byte replacement. */
  252. *p++ = '\0';
  253. }
  254. }
  255. ret =
  256. g_regex_match_full (regex, string_safe, string_len, start_position, match_options,
  257. match_info, error);
  258. g_free (string_safe);
  259. return ret;
  260. }
  261. #endif /* SEARCH_TYPE_GLIB */
  262. /* --------------------------------------------------------------------------------------------- */
  263. static mc_search__found_cond_t
  264. mc_search__regex_found_cond_one (mc_search_t * lc_mc_search, mc_search_regex_t * regex,
  265. GString * search_str)
  266. {
  267. #ifdef SEARCH_TYPE_GLIB
  268. GError *mcerror = NULL;
  269. if (!mc_search__g_regex_match_full_safe
  270. (regex, search_str->str, search_str->len, 0, G_REGEX_MATCH_NEWLINE_ANY,
  271. &lc_mc_search->regex_match_info, &mcerror))
  272. {
  273. g_match_info_free (lc_mc_search->regex_match_info);
  274. lc_mc_search->regex_match_info = NULL;
  275. if (mcerror != NULL)
  276. {
  277. lc_mc_search->error = MC_SEARCH_E_REGEX;
  278. g_free (lc_mc_search->error_str);
  279. lc_mc_search->error_str =
  280. str_conv_gerror_message (mcerror, _("Regular expression error"));
  281. g_error_free (mcerror);
  282. return COND__FOUND_ERROR;
  283. }
  284. return COND__NOT_FOUND;
  285. }
  286. lc_mc_search->num_results = g_match_info_get_match_count (lc_mc_search->regex_match_info);
  287. #else /* SEARCH_TYPE_GLIB */
  288. lc_mc_search->num_results =
  289. #ifdef HAVE_PCRE2
  290. pcre2_match (regex, (unsigned char *) search_str->str, search_str->len, 0, 0,
  291. lc_mc_search->regex_match_info, NULL);
  292. #else
  293. pcre_exec (regex, lc_mc_search->regex_match_info, search_str->str, search_str->len, 0, 0,
  294. lc_mc_search->iovector, MC_SEARCH__NUM_REPLACE_ARGS);
  295. #endif
  296. if (lc_mc_search->num_results < 0)
  297. {
  298. return COND__NOT_FOUND;
  299. }
  300. #endif /* SEARCH_TYPE_GLIB */
  301. return COND__FOUND_OK;
  302. }
  303. /* --------------------------------------------------------------------------------------------- */
  304. static mc_search__found_cond_t
  305. mc_search__regex_found_cond (mc_search_t * lc_mc_search, GString * search_str)
  306. {
  307. gsize loop1;
  308. for (loop1 = 0; loop1 < lc_mc_search->prepared.conditions->len; loop1++)
  309. {
  310. mc_search_cond_t *mc_search_cond;
  311. mc_search__found_cond_t ret;
  312. mc_search_cond =
  313. (mc_search_cond_t *) g_ptr_array_index (lc_mc_search->prepared.conditions, loop1);
  314. if (!mc_search_cond->regex_handle)
  315. continue;
  316. ret =
  317. mc_search__regex_found_cond_one (lc_mc_search, mc_search_cond->regex_handle,
  318. search_str);
  319. if (ret != COND__NOT_FOUND)
  320. return ret;
  321. }
  322. return COND__NOT_ALL_FOUND;
  323. }
  324. /* --------------------------------------------------------------------------------------------- */
  325. static int
  326. mc_search_regex__get_max_num_of_replace_tokens (const gchar * str, gsize len)
  327. {
  328. int max_token = 0;
  329. gsize loop;
  330. for (loop = 0; loop < len - 1; loop++)
  331. {
  332. if (str[loop] == '\\' && g_ascii_isdigit (str[loop + 1]))
  333. {
  334. if (strutils_is_char_escaped (str, &str[loop]))
  335. continue;
  336. if (max_token < str[loop + 1] - '0')
  337. max_token = str[loop + 1] - '0';
  338. continue;
  339. }
  340. if (str[loop] == '$' && str[loop + 1] == '{')
  341. {
  342. gsize tmp_len;
  343. if (strutils_is_char_escaped (str, &str[loop]))
  344. continue;
  345. for (tmp_len = 0;
  346. loop + tmp_len + 2 < len && (str[loop + 2 + tmp_len] & (char) 0xf0) == 0x30;
  347. tmp_len++);
  348. if (str[loop + 2 + tmp_len] == '}')
  349. {
  350. int tmp_token;
  351. char *tmp_str;
  352. tmp_str = g_strndup (&str[loop + 2], tmp_len);
  353. tmp_token = atoi (tmp_str);
  354. if (max_token < tmp_token)
  355. max_token = tmp_token;
  356. g_free (tmp_str);
  357. }
  358. }
  359. }
  360. return max_token;
  361. }
  362. /* --------------------------------------------------------------------------------------------- */
  363. static char *
  364. mc_search_regex__get_token_by_num (const mc_search_t * lc_mc_search, gsize lc_index)
  365. {
  366. int fnd_start = 0, fnd_end = 0;
  367. #ifdef SEARCH_TYPE_GLIB
  368. g_match_info_fetch_pos (lc_mc_search->regex_match_info, lc_index, &fnd_start, &fnd_end);
  369. #else /* SEARCH_TYPE_GLIB */
  370. fnd_start = lc_mc_search->iovector[lc_index * 2 + 0];
  371. fnd_end = lc_mc_search->iovector[lc_index * 2 + 1];
  372. #endif /* SEARCH_TYPE_GLIB */
  373. if (fnd_end == fnd_start)
  374. return g_strdup ("");
  375. return g_strndup (lc_mc_search->regex_buffer->str + fnd_start, fnd_end - fnd_start);
  376. }
  377. /* --------------------------------------------------------------------------------------------- */
  378. static gboolean
  379. mc_search_regex__replace_handle_esc_seq (const GString * replace_str, const gsize current_pos,
  380. gsize * skip_len, int *ret)
  381. {
  382. char *curr_str = &(replace_str->str[current_pos]);
  383. char c = curr_str[1];
  384. if (replace_str->len > current_pos + 2)
  385. {
  386. if (c == '{')
  387. {
  388. for (*skip_len = 2; /* \{ */
  389. current_pos + *skip_len < replace_str->len && curr_str[*skip_len] >= '0'
  390. && curr_str[*skip_len] <= '7'; (*skip_len)++)
  391. ;
  392. if (current_pos + *skip_len < replace_str->len && curr_str[*skip_len] == '}')
  393. {
  394. (*skip_len)++;
  395. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  396. return FALSE;
  397. }
  398. else
  399. {
  400. *ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  401. return TRUE;
  402. }
  403. }
  404. if (c == 'x')
  405. {
  406. *skip_len = 2; /* \x */
  407. c = curr_str[2];
  408. if (c == '{')
  409. {
  410. for (*skip_len = 3; /* \x{ */
  411. current_pos + *skip_len < replace_str->len
  412. && g_ascii_isxdigit ((guchar) curr_str[*skip_len]); (*skip_len)++)
  413. ;
  414. if (current_pos + *skip_len < replace_str->len && curr_str[*skip_len] == '}')
  415. {
  416. (*skip_len)++;
  417. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  418. return FALSE;
  419. }
  420. else
  421. {
  422. *ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  423. return TRUE;
  424. }
  425. }
  426. else if (!g_ascii_isxdigit ((guchar) c))
  427. {
  428. *skip_len = 2; /* \x without number behind */
  429. *ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  430. return FALSE;
  431. }
  432. else
  433. {
  434. c = curr_str[3];
  435. if (!g_ascii_isxdigit ((guchar) c))
  436. *skip_len = 3; /* \xH */
  437. else
  438. *skip_len = 4; /* \xHH */
  439. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  440. return FALSE;
  441. }
  442. }
  443. }
  444. if (strchr ("ntvbrfa", c) != NULL)
  445. {
  446. *skip_len = 2;
  447. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  448. return FALSE;
  449. }
  450. return TRUE;
  451. }
  452. /* --------------------------------------------------------------------------------------------- */
  453. static int
  454. mc_search_regex__process_replace_str (const GString * replace_str, const gsize current_pos,
  455. gsize * skip_len, replace_transform_type_t * replace_flags)
  456. {
  457. int ret = -1;
  458. const char *curr_str = &(replace_str->str[current_pos]);
  459. if (current_pos > replace_str->len)
  460. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  461. *skip_len = 0;
  462. if (replace_str->len > current_pos + 2 && curr_str[0] == '$' && curr_str[1] == '{'
  463. && (curr_str[2] & (char) 0xf0) == 0x30)
  464. {
  465. char *tmp_str;
  466. if (strutils_is_char_escaped (replace_str->str, curr_str))
  467. {
  468. *skip_len = 1;
  469. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  470. }
  471. for (*skip_len = 0;
  472. current_pos + *skip_len + 2 < replace_str->len
  473. && (curr_str[2 + *skip_len] & (char) 0xf0) == 0x30; (*skip_len)++)
  474. ;
  475. if (curr_str[2 + *skip_len] != '}')
  476. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  477. tmp_str = g_strndup (curr_str + 2, *skip_len);
  478. if (tmp_str == NULL)
  479. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  480. ret = atoi (tmp_str);
  481. g_free (tmp_str);
  482. *skip_len += 3; /* ${} */
  483. return ret; /* capture buffer index >= 0 */
  484. }
  485. if (curr_str[0] == '\\' && replace_str->len > current_pos + 1)
  486. {
  487. if (strutils_is_char_escaped (replace_str->str, curr_str))
  488. {
  489. *skip_len = 1;
  490. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  491. }
  492. if (g_ascii_isdigit (curr_str[1]))
  493. {
  494. ret = g_ascii_digit_value (curr_str[1]); /* capture buffer index >= 0 */
  495. *skip_len = 2; /* \\ and one digit */
  496. return ret;
  497. }
  498. if (!mc_search_regex__replace_handle_esc_seq (replace_str, current_pos, skip_len, &ret))
  499. return ret;
  500. ret = REPLACE_PREPARE_T_REPLACE_FLAG;
  501. *skip_len += 2;
  502. switch (curr_str[1])
  503. {
  504. case 'U':
  505. *replace_flags |= REPLACE_T_UPP_TRANSFORM;
  506. *replace_flags &= ~REPLACE_T_LOW_TRANSFORM;
  507. break;
  508. case 'u':
  509. *replace_flags |= REPLACE_T_UPP_TRANSFORM_CHAR;
  510. break;
  511. case 'L':
  512. *replace_flags |= REPLACE_T_LOW_TRANSFORM;
  513. *replace_flags &= ~REPLACE_T_UPP_TRANSFORM;
  514. break;
  515. case 'l':
  516. *replace_flags |= REPLACE_T_LOW_TRANSFORM_CHAR;
  517. break;
  518. case 'E':
  519. *replace_flags = REPLACE_T_NO_TRANSFORM;
  520. break;
  521. default:
  522. ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  523. break;
  524. }
  525. }
  526. return ret;
  527. }
  528. /* --------------------------------------------------------------------------------------------- */
  529. static void
  530. mc_search_regex__process_append_str (GString * dest_str, const char *from, gsize len,
  531. replace_transform_type_t * replace_flags)
  532. {
  533. gsize loop;
  534. gsize char_len;
  535. if (len == (gsize) (-1))
  536. len = strlen (from);
  537. if (*replace_flags == REPLACE_T_NO_TRANSFORM)
  538. {
  539. g_string_append_len (dest_str, from, len);
  540. return;
  541. }
  542. for (loop = 0; loop < len; loop += char_len)
  543. {
  544. GString *tmp_string = NULL;
  545. GString *s;
  546. s = mc_search__get_one_symbol (NULL, from + loop, len - loop, NULL);
  547. char_len = s->len;
  548. if ((*replace_flags & REPLACE_T_UPP_TRANSFORM_CHAR) != 0)
  549. {
  550. *replace_flags &= ~REPLACE_T_UPP_TRANSFORM_CHAR;
  551. tmp_string = mc_search__toupper_case_str (NULL, s);
  552. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  553. }
  554. else if ((*replace_flags & REPLACE_T_LOW_TRANSFORM_CHAR) != 0)
  555. {
  556. *replace_flags &= ~REPLACE_T_LOW_TRANSFORM_CHAR;
  557. tmp_string = mc_search__tolower_case_str (NULL, s);
  558. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  559. }
  560. else if ((*replace_flags & REPLACE_T_UPP_TRANSFORM) != 0)
  561. {
  562. tmp_string = mc_search__toupper_case_str (NULL, s);
  563. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  564. }
  565. else if ((*replace_flags & REPLACE_T_LOW_TRANSFORM) != 0)
  566. {
  567. tmp_string = mc_search__tolower_case_str (NULL, s);
  568. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  569. }
  570. g_string_free (s, TRUE);
  571. if (tmp_string != NULL)
  572. g_string_free (tmp_string, TRUE);
  573. }
  574. }
  575. /* --------------------------------------------------------------------------------------------- */
  576. static void
  577. mc_search_regex__process_escape_sequence (GString * dest_str, const char *from, gsize len,
  578. replace_transform_type_t * replace_flags,
  579. gboolean is_utf8)
  580. {
  581. gsize i = 0;
  582. unsigned int c = 0;
  583. char b;
  584. if (len == (gsize) (-1))
  585. len = strlen (from);
  586. if (len == 0)
  587. return;
  588. if (from[i] == '{')
  589. i++;
  590. if (i >= len)
  591. return;
  592. if (from[i] == 'x')
  593. {
  594. i++;
  595. if (i < len && from[i] == '{')
  596. i++;
  597. for (; i < len; i++)
  598. {
  599. if (from[i] >= '0' && from[i] <= '9')
  600. c = c * 16 + from[i] - '0';
  601. else if (from[i] >= 'a' && from[i] <= 'f')
  602. c = c * 16 + 10 + from[i] - 'a';
  603. else if (from[i] >= 'A' && from[i] <= 'F')
  604. c = c * 16 + 10 + from[i] - 'A';
  605. else
  606. break;
  607. }
  608. }
  609. else if (from[i] >= '0' && from[i] <= '7')
  610. for (; i < len && from[i] >= '0' && from[i] <= '7'; i++)
  611. c = c * 8 + from[i] - '0';
  612. else
  613. {
  614. switch (from[i])
  615. {
  616. case 'n':
  617. c = '\n';
  618. break;
  619. case 't':
  620. c = '\t';
  621. break;
  622. case 'v':
  623. c = '\v';
  624. break;
  625. case 'b':
  626. c = '\b';
  627. break;
  628. case 'r':
  629. c = '\r';
  630. break;
  631. case 'f':
  632. c = '\f';
  633. break;
  634. case 'a':
  635. c = '\a';
  636. break;
  637. default:
  638. mc_search_regex__process_append_str (dest_str, from, len, replace_flags);
  639. return;
  640. }
  641. }
  642. if (c < 0x80 || !is_utf8)
  643. g_string_append_c (dest_str, (char) c);
  644. else if (c < 0x800)
  645. {
  646. b = 0xC0 | (c >> 6);
  647. g_string_append_c (dest_str, b);
  648. b = 0x80 | (c & 0x3F);
  649. g_string_append_c (dest_str, b);
  650. }
  651. else if (c < 0x10000)
  652. {
  653. b = 0xE0 | (c >> 12);
  654. g_string_append_c (dest_str, b);
  655. b = 0x80 | ((c >> 6) & 0x3F);
  656. g_string_append_c (dest_str, b);
  657. b = 0x80 | (c & 0x3F);
  658. g_string_append_c (dest_str, b);
  659. }
  660. else if (c < 0x10FFFF)
  661. {
  662. b = 0xF0 | (c >> 16);
  663. g_string_append_c (dest_str, b);
  664. b = 0x80 | ((c >> 12) & 0x3F);
  665. g_string_append_c (dest_str, b);
  666. b = 0x80 | ((c >> 6) & 0x3F);
  667. g_string_append_c (dest_str, b);
  668. b = 0x80 | (c & 0x3F);
  669. g_string_append_c (dest_str, b);
  670. }
  671. }
  672. /* --------------------------------------------------------------------------------------------- */
  673. /*** public functions ****************************************************************************/
  674. /* --------------------------------------------------------------------------------------------- */
  675. void
  676. mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_search,
  677. mc_search_cond_t * mc_search_cond)
  678. {
  679. if (lc_mc_search->whole_words && !lc_mc_search->is_entire_line)
  680. {
  681. /* NOTE: \b as word boundary doesn't allow search
  682. * whole words with non-ASCII symbols.
  683. * Update: Is it still true nowadays? Probably not. #2396, #3524 */
  684. g_string_prepend (mc_search_cond->str, "(?<![\\p{L}\\p{N}_])");
  685. g_string_append (mc_search_cond->str, "(?![\\p{L}\\p{N}_])");
  686. }
  687. {
  688. #ifdef SEARCH_TYPE_GLIB
  689. GError *mcerror = NULL;
  690. GRegexCompileFlags g_regex_options = G_REGEX_OPTIMIZE | G_REGEX_DOTALL;
  691. if (str_isutf8 (charset) && mc_global.utf8_display)
  692. {
  693. if (!lc_mc_search->is_case_sensitive)
  694. g_regex_options |= G_REGEX_CASELESS;
  695. }
  696. else
  697. {
  698. g_regex_options |= G_REGEX_RAW;
  699. if (!lc_mc_search->is_case_sensitive)
  700. {
  701. GString *tmp;
  702. tmp = mc_search_cond->str;
  703. mc_search_cond->str = mc_search__cond_struct_new_regex_ci_str (charset, tmp);
  704. g_string_free (tmp, TRUE);
  705. }
  706. }
  707. mc_search_cond->regex_handle =
  708. g_regex_new (mc_search_cond->str->str, g_regex_options, 0, &mcerror);
  709. if (mcerror != NULL)
  710. {
  711. lc_mc_search->error = MC_SEARCH_E_REGEX_COMPILE;
  712. g_free (lc_mc_search->error_str);
  713. lc_mc_search->error_str =
  714. str_conv_gerror_message (mcerror, _("Regular expression error"));
  715. g_error_free (mcerror);
  716. return;
  717. }
  718. #else /* SEARCH_TYPE_GLIB */
  719. #ifdef HAVE_PCRE2
  720. int errcode;
  721. char error[BUF_SMALL];
  722. size_t erroffset;
  723. int pcre_options = PCRE2_MULTILINE;
  724. #else
  725. const char *error;
  726. int erroffset;
  727. int pcre_options = PCRE_EXTRA | PCRE_MULTILINE;
  728. #endif
  729. if (str_isutf8 (charset) && mc_global.utf8_display)
  730. {
  731. #ifdef HAVE_PCRE2
  732. pcre_options |= PCRE2_UTF;
  733. if (!lc_mc_search->is_case_sensitive)
  734. pcre_options |= PCRE2_CASELESS;
  735. #else
  736. pcre_options |= PCRE_UTF8;
  737. if (!lc_mc_search->is_case_sensitive)
  738. pcre_options |= PCRE_CASELESS;
  739. #endif
  740. }
  741. else if (!lc_mc_search->is_case_sensitive)
  742. {
  743. GString *tmp;
  744. tmp = mc_search_cond->str;
  745. mc_search_cond->str = mc_search__cond_struct_new_regex_ci_str (charset, tmp);
  746. g_string_free (tmp, TRUE);
  747. }
  748. mc_search_cond->regex_handle =
  749. #ifdef HAVE_PCRE2
  750. pcre2_compile ((unsigned char *) mc_search_cond->str->str, PCRE2_ZERO_TERMINATED,
  751. pcre_options, &errcode, &erroffset, NULL);
  752. #else
  753. pcre_compile (mc_search_cond->str->str, pcre_options, &error, &erroffset, NULL);
  754. #endif
  755. if (mc_search_cond->regex_handle == NULL)
  756. {
  757. #ifdef HAVE_PCRE2
  758. pcre2_get_error_message (errcode, (unsigned char *) error, sizeof (error));
  759. #endif
  760. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_COMPILE, "%s", error);
  761. return;
  762. }
  763. #ifdef HAVE_PCRE2
  764. if (pcre2_jit_compile (mc_search_cond->regex_handle, PCRE2_JIT_COMPLETE) && *error != '\0')
  765. #else
  766. lc_mc_search->regex_match_info = pcre_study (mc_search_cond->regex_handle, 0, &error);
  767. if (lc_mc_search->regex_match_info == NULL && error != NULL)
  768. #endif
  769. {
  770. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_COMPILE, "%s", error);
  771. MC_PTR_FREE (mc_search_cond->regex_handle);
  772. return;
  773. }
  774. #endif /* SEARCH_TYPE_GLIB */
  775. }
  776. lc_mc_search->is_utf8 = str_isutf8 (charset);
  777. }
  778. /* --------------------------------------------------------------------------------------------- */
  779. gboolean
  780. mc_search__run_regex (mc_search_t * lc_mc_search, const void *user_data,
  781. gsize start_search, gsize end_search, gsize * found_len)
  782. {
  783. mc_search_cbret_t ret = MC_SEARCH_CB_NOTFOUND;
  784. gsize current_pos, virtual_pos;
  785. gint start_pos;
  786. gint end_pos;
  787. if (lc_mc_search->regex_buffer != NULL)
  788. g_string_set_size (lc_mc_search->regex_buffer, 0);
  789. else
  790. lc_mc_search->regex_buffer = g_string_sized_new (64);
  791. virtual_pos = current_pos = start_search;
  792. while (virtual_pos <= end_search)
  793. {
  794. g_string_set_size (lc_mc_search->regex_buffer, 0);
  795. lc_mc_search->start_buffer = current_pos;
  796. if (lc_mc_search->search_fn != NULL)
  797. {
  798. while (TRUE)
  799. {
  800. int current_chr = '\n'; /* stop search symbol */
  801. ret = lc_mc_search->search_fn (user_data, current_pos, &current_chr);
  802. if (ret == MC_SEARCH_CB_ABORT)
  803. break;
  804. if (ret == MC_SEARCH_CB_INVALID)
  805. continue;
  806. current_pos++;
  807. if (ret == MC_SEARCH_CB_SKIP)
  808. continue;
  809. virtual_pos++;
  810. g_string_append_c (lc_mc_search->regex_buffer, (char) current_chr);
  811. if ((char) current_chr == '\n' || virtual_pos > end_search)
  812. break;
  813. }
  814. }
  815. else
  816. {
  817. /* optimization for standard case (for search from file manager)
  818. * where there is no MC_SEARCH_CB_INVALID or MC_SEARCH_CB_SKIP
  819. * return codes, so we can copy line at regex buffer all at once
  820. */
  821. while (TRUE)
  822. {
  823. const char current_chr = ((const char *) user_data)[current_pos];
  824. if (current_chr == '\0')
  825. break;
  826. current_pos++;
  827. if (current_chr == '\n' || current_pos > end_search)
  828. break;
  829. }
  830. /* use virtual_pos as index of start of current chunk */
  831. g_string_append_len (lc_mc_search->regex_buffer, (const char *) user_data + virtual_pos,
  832. current_pos - virtual_pos);
  833. virtual_pos = current_pos;
  834. }
  835. switch (mc_search__regex_found_cond (lc_mc_search, lc_mc_search->regex_buffer))
  836. {
  837. case COND__FOUND_OK:
  838. #ifdef SEARCH_TYPE_GLIB
  839. g_match_info_fetch_pos (lc_mc_search->regex_match_info, 0, &start_pos, &end_pos);
  840. #else /* SEARCH_TYPE_GLIB */
  841. start_pos = lc_mc_search->iovector[0];
  842. end_pos = lc_mc_search->iovector[1];
  843. #endif /* SEARCH_TYPE_GLIB */
  844. if (found_len != NULL)
  845. *found_len = end_pos - start_pos;
  846. lc_mc_search->normal_offset = lc_mc_search->start_buffer + start_pos;
  847. return TRUE;
  848. case COND__NOT_ALL_FOUND:
  849. break;
  850. default:
  851. g_string_free (lc_mc_search->regex_buffer, TRUE);
  852. lc_mc_search->regex_buffer = NULL;
  853. return FALSE;
  854. }
  855. if ((lc_mc_search->update_fn != NULL) &&
  856. ((lc_mc_search->update_fn) (user_data, current_pos) == MC_SEARCH_CB_ABORT))
  857. ret = MC_SEARCH_CB_ABORT;
  858. if (ret == MC_SEARCH_CB_ABORT || ret == MC_SEARCH_CB_NOTFOUND)
  859. break;
  860. }
  861. g_string_free (lc_mc_search->regex_buffer, TRUE);
  862. lc_mc_search->regex_buffer = NULL;
  863. MC_PTR_FREE (lc_mc_search->error_str);
  864. lc_mc_search->error = ret == MC_SEARCH_CB_ABORT ? MC_SEARCH_E_ABORT : MC_SEARCH_E_NOTFOUND;
  865. return FALSE;
  866. }
  867. /* --------------------------------------------------------------------------------------------- */
  868. GString *
  869. mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * replace_str)
  870. {
  871. GString *ret;
  872. int num_replace_tokens;
  873. gsize loop;
  874. gsize prev = 0;
  875. replace_transform_type_t replace_flags = REPLACE_T_NO_TRANSFORM;
  876. num_replace_tokens =
  877. mc_search_regex__get_max_num_of_replace_tokens (replace_str->str, replace_str->len);
  878. if (lc_mc_search->num_results < 0)
  879. return mc_g_string_dup (replace_str);
  880. if (num_replace_tokens > lc_mc_search->num_results - 1
  881. || num_replace_tokens > MC_SEARCH__NUM_REPLACE_ARGS)
  882. {
  883. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_REPLACE, "%s",
  884. _(STR_E_RPL_NOT_EQ_TO_FOUND));
  885. return NULL;
  886. }
  887. ret = g_string_sized_new (64);
  888. for (loop = 0; loop < replace_str->len - 1; loop++)
  889. {
  890. int lc_index;
  891. gchar *tmp_str;
  892. gsize len = 0;
  893. lc_index = mc_search_regex__process_replace_str (replace_str, loop, &len, &replace_flags);
  894. if (lc_index == REPLACE_PREPARE_T_NOTHING_SPECIAL)
  895. {
  896. if (len != 0)
  897. {
  898. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  899. &replace_flags);
  900. mc_search_regex__process_append_str (ret, replace_str->str + loop + 1, len - 1,
  901. &replace_flags);
  902. prev = loop + len;
  903. loop = prev - 1; /* prepare to loop++ */
  904. }
  905. continue;
  906. }
  907. if (lc_index == REPLACE_PREPARE_T_REPLACE_FLAG)
  908. {
  909. if (loop != 0)
  910. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  911. &replace_flags);
  912. prev = loop + len;
  913. loop = prev - 1; /* prepare to loop++ */
  914. continue;
  915. }
  916. /* escape sequence */
  917. if (lc_index == REPLACE_PREPARE_T_ESCAPE_SEQ)
  918. {
  919. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  920. &replace_flags);
  921. /* call process_escape_sequence without starting '\\' */
  922. mc_search_regex__process_escape_sequence (ret, replace_str->str + loop + 1, len - 1,
  923. &replace_flags, lc_mc_search->is_utf8);
  924. prev = loop + len;
  925. loop = prev - 1; /* prepare to loop++ */
  926. continue;
  927. }
  928. /* invalid capture buffer number */
  929. if (lc_index > lc_mc_search->num_results)
  930. {
  931. g_string_free (ret, TRUE);
  932. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_REPLACE,
  933. _(STR_E_RPL_INVALID_TOKEN), lc_index);
  934. return NULL;
  935. }
  936. tmp_str = mc_search_regex__get_token_by_num (lc_mc_search, lc_index);
  937. if (loop != 0)
  938. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  939. &replace_flags);
  940. mc_search_regex__process_append_str (ret, tmp_str, -1, &replace_flags);
  941. g_free (tmp_str);
  942. prev = loop + len;
  943. loop = prev - 1; /* prepare to loop++ */
  944. }
  945. mc_search_regex__process_append_str (ret, replace_str->str + prev, replace_str->len - prev,
  946. &replace_flags);
  947. return ret;
  948. }