/*
   Search text engine.
   Regex search

   Copyright (C) 2009-2021
   Free Software Foundation, Inc.

   Written by:
   Slava Zanko <slavazanko@gmail.com>, 2009, 2010, 2011, 2013
   Vitaliy Filippov <vitalif@yourcmc.ru>, 2011
   Andrew Borodin <aborodin@vmail.ru>, 2013-2015

   This file is part of the Midnight Commander.

   The Midnight Commander is free software: you can redistribute it
   and/or modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation, either version 3 of the License,
   or (at your option) any later version.

   The Midnight Commander is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
  22. #include <config.h>
  23. #include <stdlib.h>
  24. #include "lib/global.h"
  25. #include "lib/strutil.h"
  26. #include "lib/search.h"
  27. #include "lib/strescape.h"
  28. #include "lib/util.h" /* MC_PTR_FREE */
  29. #include "internal.h"
  30. /*** global variables ****************************************************************************/
  31. /*** file scope macro definitions ****************************************************************/
  /*
   * Negative result codes of mc_search_regex__process_replace_str().
   * Non-negative results of that function are capture buffer indexes.
   * The expansions are parenthesized so that they stay a single operand
   * wherever the macros are used in an expression.
   */
  #define REPLACE_PREPARE_T_NOTHING_SPECIAL (-1)  /* plain text at this position */
  #define REPLACE_PREPARE_T_REPLACE_FLAG    (-2)  /* case-transform flag: \U \u \L \l \E */
  #define REPLACE_PREPARE_T_ESCAPE_SEQ      (-3)  /* character escape: \n, \xHH, \{ooo}, ... */
  35. /*** file scope type declarations ****************************************************************/
  /*
   * Case transformations that can be active while a replacement string is
   * expanded.  The values are independent bits and are combined with | and &.
   */
  typedef enum
  {
      REPLACE_T_NO_TRANSFORM = 0,             /* copy text unchanged */
      REPLACE_T_UPP_TRANSFORM_CHAR = 1 << 0,  /* \u: upper-case the next character only */
      REPLACE_T_LOW_TRANSFORM_CHAR = 1 << 1,  /* \l: lower-case the next character only */
      REPLACE_T_UPP_TRANSFORM = 1 << 2,       /* \U: upper-case until switched off */
      REPLACE_T_LOW_TRANSFORM = 1 << 3        /* \L: lower-case until switched off */
  } replace_transform_type_t;
  44. /*** file scope variables ************************************************************************/
  45. /*** file scope functions ************************************************************************/
  46. static gboolean
  47. mc_search__regex_str_append_if_special (GString * copy_to, const GString * regex_str,
  48. gsize * offset)
  49. {
  50. const char *special_chars[] = {
  51. "\\s", "\\S",
  52. "\\d", "\\D",
  53. "\\b", "\\B",
  54. "\\w", "\\W",
  55. "\\t", "\\n",
  56. "\\r", "\\f",
  57. "\\a", "\\e",
  58. "\\x", "\\X",
  59. "\\c", "\\C",
  60. "\\l", "\\L",
  61. "\\u", "\\U",
  62. "\\E", "\\Q",
  63. NULL
  64. };
  65. char *tmp_regex_str;
  66. const char **spec_chr;
  67. tmp_regex_str = &(regex_str->str[*offset]);
  68. for (spec_chr = special_chars; *spec_chr != NULL; spec_chr++)
  69. {
  70. gsize spec_chr_len;
  71. spec_chr_len = strlen (*spec_chr);
  72. if (strncmp (tmp_regex_str, *spec_chr, spec_chr_len) == 0
  73. && !strutils_is_char_escaped (regex_str->str, tmp_regex_str))
  74. {
  75. if (strncmp ("\\x", *spec_chr, spec_chr_len) == 0)
  76. {
  77. if (tmp_regex_str[spec_chr_len] != '{')
  78. spec_chr_len += 2;
  79. else
  80. {
  81. while ((spec_chr_len < regex_str->len - *offset)
  82. && tmp_regex_str[spec_chr_len] != '}')
  83. spec_chr_len++;
  84. if (tmp_regex_str[spec_chr_len] == '}')
  85. spec_chr_len++;
  86. }
  87. }
  88. g_string_append_len (copy_to, tmp_regex_str, spec_chr_len);
  89. *offset += spec_chr_len;
  90. return TRUE;
  91. }
  92. }
  93. return FALSE;
  94. }
  95. /* --------------------------------------------------------------------------------------------- */
  96. static void
  97. mc_search__cond_struct_new_regex_hex_add (const char *charset, GString * str_to,
  98. const char *one_char, gsize str_len)
  99. {
  100. GString *upp, *low;
  101. gsize loop;
  102. upp = mc_search__toupper_case_str (charset, one_char, str_len);
  103. low = mc_search__tolower_case_str (charset, one_char, str_len);
  104. for (loop = 0; loop < upp->len; loop++)
  105. {
  106. gchar tmp_str[10 + 1]; /* longest content is "[\\x%02X\\x%02X]" */
  107. gint tmp_len;
  108. if (loop >= low->len || upp->str[loop] == low->str[loop])
  109. tmp_len =
  110. g_snprintf (tmp_str, sizeof (tmp_str), "\\x%02X", (unsigned char) upp->str[loop]);
  111. else
  112. tmp_len =
  113. g_snprintf (tmp_str, sizeof (tmp_str), "[\\x%02X\\x%02X]",
  114. (unsigned char) upp->str[loop], (unsigned char) low->str[loop]);
  115. g_string_append_len (str_to, tmp_str, tmp_len);
  116. }
  117. g_string_free (upp, TRUE);
  118. g_string_free (low, TRUE);
  119. }
  120. /* --------------------------------------------------------------------------------------------- */
  121. static void
  122. mc_search__cond_struct_new_regex_accum_append (const char *charset, GString * str_to,
  123. GString * str_from)
  124. {
  125. GString *recoded_part;
  126. gsize loop = 0;
  127. recoded_part = g_string_sized_new (32);
  128. while (loop < str_from->len)
  129. {
  130. gchar *one_char;
  131. gsize one_char_len;
  132. gboolean just_letters;
  133. one_char =
  134. mc_search__get_one_symbol (charset, &(str_from->str[loop]),
  135. MIN (str_from->len - loop, 6), &just_letters);
  136. one_char_len = strlen (one_char);
  137. if (one_char_len == 0)
  138. loop++;
  139. else
  140. {
  141. loop += one_char_len;
  142. if (just_letters)
  143. mc_search__cond_struct_new_regex_hex_add (charset, recoded_part, one_char,
  144. one_char_len);
  145. else
  146. g_string_append_len (recoded_part, one_char, one_char_len);
  147. }
  148. g_free (one_char);
  149. }
  150. g_string_append_len (str_to, recoded_part->str, recoded_part->len);
  151. g_string_free (recoded_part, TRUE);
  152. g_string_set_size (str_from, 0);
  153. }
  154. /* --------------------------------------------------------------------------------------------- */
  155. /**
  156. * Creates a case-insensitive version of a regex pattern.
  157. *
  158. * For example (assuming ASCII charset): given "\\bHello!\\xAB", returns
  159. * "\\b[Hh][Ee][Ll][Ll][Oo]!\\xAB" (this example is for easier reading; in
  160. * reality hex codes are used instead of letters).
  161. *
  162. * This function knows not to ruin special regex symbols.
  163. *
  164. * This function is used when working with non-UTF-8 charsets: GLib's
  165. * regex engine doesn't understand such charsets and therefore can't do
  166. * this job itself.
  167. */
  168. static GString *
  169. mc_search__cond_struct_new_regex_ci_str (const char *charset, const GString * astr)
  170. {
  171. GString *accumulator, *spec_char, *ret_str;
  172. gsize loop;
  173. ret_str = g_string_sized_new (64);
  174. accumulator = g_string_sized_new (64);
  175. spec_char = g_string_sized_new (64);
  176. loop = 0;
  177. while (loop <= astr->len)
  178. {
  179. if (mc_search__regex_str_append_if_special (spec_char, astr, &loop))
  180. {
  181. mc_search__cond_struct_new_regex_accum_append (charset, ret_str, accumulator);
  182. g_string_append_len (ret_str, spec_char->str, spec_char->len);
  183. g_string_set_size (spec_char, 0);
  184. continue;
  185. }
  186. if (astr->str[loop] == '[' && !strutils_is_char_escaped (astr->str, &(astr->str[loop])))
  187. {
  188. mc_search__cond_struct_new_regex_accum_append (charset, ret_str, accumulator);
  189. while (loop < astr->len && !(astr->str[loop] == ']'
  190. && !strutils_is_char_escaped (astr->str,
  191. &(astr->str[loop]))))
  192. {
  193. g_string_append_c (ret_str, astr->str[loop]);
  194. loop++;
  195. }
  196. g_string_append_c (ret_str, astr->str[loop]);
  197. loop++;
  198. continue;
  199. }
  200. /*
  201. TODO: handle [ and ]
  202. */
  203. g_string_append_c (accumulator, astr->str[loop]);
  204. loop++;
  205. }
  206. mc_search__cond_struct_new_regex_accum_append (charset, ret_str, accumulator);
  207. g_string_free (accumulator, TRUE);
  208. g_string_free (spec_char, TRUE);
  209. return ret_str;
  210. }
  211. /* --------------------------------------------------------------------------------------------- */
  #ifdef SEARCH_TYPE_GLIB
  /* A thin wrapper above g_regex_match_full that makes sure the string passed
   * to it is valid UTF-8 (unless G_REGEX_RAW compile flag was set), as it is a
   * requirement by glib and it might crash otherwise. See: mc ticket 3449.
   * Be careful: there might be embedded NULs in the strings.
   *
   * Parameters and return value are those of g_regex_match_full().  When the
   * input is not valid UTF-8, matching runs on a private copy in which every
   * byte that does not start a valid character is replaced by NUL, so all
   * offsets stay the same as in the original string. */
  static gboolean
  mc_search__g_regex_match_full_safe (const GRegex * regex,
                                      const gchar * string,
                                      gssize string_len,
                                      gint start_position,
                                      GRegexMatchFlags match_options,
                                      GMatchInfo ** match_info, GError ** error)
  {
      char *string_safe, *cur;
      const char *stop;
      gboolean ret;

      if (string_len < 0)
          string_len = strlen (string);

      /* raw regex or already valid input: no copy needed */
      if ((g_regex_get_compile_flags (regex) & G_REGEX_RAW) != 0
          || g_utf8_validate (string, string_len, NULL))
          return g_regex_match_full (regex, string, string_len, start_position, match_options,
                                     match_info, error);

      /* Correctly handle embedded NULs while copying */
      string_safe = g_malloc (string_len + 1);
      memcpy (string_safe, string, string_len);
      string_safe[string_len] = '\0';
      stop = string_safe + string_len;

      for (cur = string_safe; cur < stop;)
      {
          const gunichar c = g_utf8_get_char_validated (cur, -1);

          if (c == (gunichar) (-1) || c == (gunichar) (-2))
          {
              /* U+FFFD would be the proper choice, but then we'd have to
                 maintain mapping between old and new offsets.
                 So rather do a byte by byte replacement. */
              *cur++ = '\0';
          }
          else
              cur = g_utf8_next_char (cur);
      }

      ret = g_regex_match_full (regex, string_safe, string_len, start_position, match_options,
                                match_info, error);
      g_free (string_safe);

      return ret;
  }
  #endif /* SEARCH_TYPE_GLIB */
  262. /* --------------------------------------------------------------------------------------------- */
  263. static mc_search__found_cond_t
  264. mc_search__regex_found_cond_one (mc_search_t * lc_mc_search, mc_search_regex_t * regex,
  265. GString * search_str)
  266. {
  267. #ifdef SEARCH_TYPE_GLIB
  268. GError *mcerror = NULL;
  269. if (!mc_search__g_regex_match_full_safe
  270. (regex, search_str->str, search_str->len, 0, G_REGEX_MATCH_NEWLINE_ANY,
  271. &lc_mc_search->regex_match_info, &mcerror))
  272. {
  273. g_match_info_free (lc_mc_search->regex_match_info);
  274. lc_mc_search->regex_match_info = NULL;
  275. if (mcerror != NULL)
  276. {
  277. lc_mc_search->error = MC_SEARCH_E_REGEX;
  278. g_free (lc_mc_search->error_str);
  279. lc_mc_search->error_str =
  280. str_conv_gerror_message (mcerror, _("Regular expression error"));
  281. g_error_free (mcerror);
  282. return COND__FOUND_ERROR;
  283. }
  284. return COND__NOT_FOUND;
  285. }
  286. lc_mc_search->num_results = g_match_info_get_match_count (lc_mc_search->regex_match_info);
  287. #else /* SEARCH_TYPE_GLIB */
  288. lc_mc_search->num_results = pcre_exec (regex, lc_mc_search->regex_match_info,
  289. search_str->str, search_str->len, 0, 0,
  290. lc_mc_search->iovector, MC_SEARCH__NUM_REPLACE_ARGS);
  291. if (lc_mc_search->num_results < 0)
  292. {
  293. return COND__NOT_FOUND;
  294. }
  295. #endif /* SEARCH_TYPE_GLIB */
  296. return COND__FOUND_OK;
  297. }
  298. /* --------------------------------------------------------------------------------------------- */
  299. static mc_search__found_cond_t
  300. mc_search__regex_found_cond (mc_search_t * lc_mc_search, GString * search_str)
  301. {
  302. gsize loop1;
  303. for (loop1 = 0; loop1 < lc_mc_search->conditions->len; loop1++)
  304. {
  305. mc_search_cond_t *mc_search_cond;
  306. mc_search__found_cond_t ret;
  307. mc_search_cond = (mc_search_cond_t *) g_ptr_array_index (lc_mc_search->conditions, loop1);
  308. if (!mc_search_cond->regex_handle)
  309. continue;
  310. ret =
  311. mc_search__regex_found_cond_one (lc_mc_search, mc_search_cond->regex_handle,
  312. search_str);
  313. if (ret != COND__NOT_FOUND)
  314. return ret;
  315. }
  316. return COND__NOT_ALL_FOUND;
  317. }
  318. /* --------------------------------------------------------------------------------------------- */
  319. static int
  320. mc_search_regex__get_max_num_of_replace_tokens (const gchar * str, gsize len)
  321. {
  322. int max_token = 0;
  323. gsize loop;
  324. for (loop = 0; loop < len - 1; loop++)
  325. {
  326. if (str[loop] == '\\' && g_ascii_isdigit (str[loop + 1]))
  327. {
  328. if (strutils_is_char_escaped (str, &str[loop]))
  329. continue;
  330. if (max_token < str[loop + 1] - '0')
  331. max_token = str[loop + 1] - '0';
  332. continue;
  333. }
  334. if (str[loop] == '$' && str[loop + 1] == '{')
  335. {
  336. gsize tmp_len;
  337. if (strutils_is_char_escaped (str, &str[loop]))
  338. continue;
  339. for (tmp_len = 0;
  340. loop + tmp_len + 2 < len && (str[loop + 2 + tmp_len] & (char) 0xf0) == 0x30;
  341. tmp_len++);
  342. if (str[loop + 2 + tmp_len] == '}')
  343. {
  344. int tmp_token;
  345. char *tmp_str;
  346. tmp_str = g_strndup (&str[loop + 2], tmp_len);
  347. tmp_token = atoi (tmp_str);
  348. if (max_token < tmp_token)
  349. max_token = tmp_token;
  350. g_free (tmp_str);
  351. }
  352. }
  353. }
  354. return max_token;
  355. }
  356. /* --------------------------------------------------------------------------------------------- */
  357. static char *
  358. mc_search_regex__get_token_by_num (const mc_search_t * lc_mc_search, gsize lc_index)
  359. {
  360. int fnd_start = 0, fnd_end = 0;
  361. #ifdef SEARCH_TYPE_GLIB
  362. g_match_info_fetch_pos (lc_mc_search->regex_match_info, lc_index, &fnd_start, &fnd_end);
  363. #else /* SEARCH_TYPE_GLIB */
  364. fnd_start = lc_mc_search->iovector[lc_index * 2 + 0];
  365. fnd_end = lc_mc_search->iovector[lc_index * 2 + 1];
  366. #endif /* SEARCH_TYPE_GLIB */
  367. if (fnd_end == fnd_start)
  368. return g_strdup ("");
  369. return g_strndup (lc_mc_search->regex_buffer->str + fnd_start, fnd_end - fnd_start);
  370. }
  371. /* --------------------------------------------------------------------------------------------- */
  372. static gboolean
  373. mc_search_regex__replace_handle_esc_seq (const GString * replace_str, const gsize current_pos,
  374. gsize * skip_len, int *ret)
  375. {
  376. char *curr_str = &(replace_str->str[current_pos]);
  377. char c = curr_str[1];
  378. if (replace_str->len > current_pos + 2)
  379. {
  380. if (c == '{')
  381. {
  382. for (*skip_len = 2; /* \{ */
  383. current_pos + *skip_len < replace_str->len && curr_str[*skip_len] >= '0'
  384. && curr_str[*skip_len] <= '7'; (*skip_len)++)
  385. ;
  386. if (current_pos + *skip_len < replace_str->len && curr_str[*skip_len] == '}')
  387. {
  388. (*skip_len)++;
  389. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  390. return FALSE;
  391. }
  392. else
  393. {
  394. *ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  395. return TRUE;
  396. }
  397. }
  398. if (c == 'x')
  399. {
  400. *skip_len = 2; /* \x */
  401. c = curr_str[2];
  402. if (c == '{')
  403. {
  404. for (*skip_len = 3; /* \x{ */
  405. current_pos + *skip_len < replace_str->len
  406. && g_ascii_isxdigit ((guchar) curr_str[*skip_len]); (*skip_len)++)
  407. ;
  408. if (current_pos + *skip_len < replace_str->len && curr_str[*skip_len] == '}')
  409. {
  410. (*skip_len)++;
  411. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  412. return FALSE;
  413. }
  414. else
  415. {
  416. *ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  417. return TRUE;
  418. }
  419. }
  420. else if (!g_ascii_isxdigit ((guchar) c))
  421. {
  422. *skip_len = 2; /* \x without number behind */
  423. *ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  424. return FALSE;
  425. }
  426. else
  427. {
  428. c = curr_str[3];
  429. if (!g_ascii_isxdigit ((guchar) c))
  430. *skip_len = 3; /* \xH */
  431. else
  432. *skip_len = 4; /* \xHH */
  433. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  434. return FALSE;
  435. }
  436. }
  437. }
  438. if (strchr ("ntvbrfa", c) != NULL)
  439. {
  440. *skip_len = 2;
  441. *ret = REPLACE_PREPARE_T_ESCAPE_SEQ;
  442. return FALSE;
  443. }
  444. return TRUE;
  445. }
  446. /* --------------------------------------------------------------------------------------------- */
  447. static int
  448. mc_search_regex__process_replace_str (const GString * replace_str, const gsize current_pos,
  449. gsize * skip_len, replace_transform_type_t * replace_flags)
  450. {
  451. int ret = -1;
  452. const char *curr_str = &(replace_str->str[current_pos]);
  453. if (current_pos > replace_str->len)
  454. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  455. *skip_len = 0;
  456. if (replace_str->len > current_pos + 2 && curr_str[0] == '$' && curr_str[1] == '{'
  457. && (curr_str[2] & (char) 0xf0) == 0x30)
  458. {
  459. char *tmp_str;
  460. if (strutils_is_char_escaped (replace_str->str, curr_str))
  461. {
  462. *skip_len = 1;
  463. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  464. }
  465. for (*skip_len = 0;
  466. current_pos + *skip_len + 2 < replace_str->len
  467. && (curr_str[2 + *skip_len] & (char) 0xf0) == 0x30; (*skip_len)++)
  468. ;
  469. if (curr_str[2 + *skip_len] != '}')
  470. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  471. tmp_str = g_strndup (curr_str + 2, *skip_len);
  472. if (tmp_str == NULL)
  473. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  474. ret = atoi (tmp_str);
  475. g_free (tmp_str);
  476. *skip_len += 3; /* ${} */
  477. return ret; /* capture buffer index >= 0 */
  478. }
  479. if (curr_str[0] == '\\' && replace_str->len > current_pos + 1)
  480. {
  481. if (strutils_is_char_escaped (replace_str->str, curr_str))
  482. {
  483. *skip_len = 1;
  484. return REPLACE_PREPARE_T_NOTHING_SPECIAL;
  485. }
  486. if (g_ascii_isdigit (curr_str[1]))
  487. {
  488. ret = g_ascii_digit_value (curr_str[1]); /* capture buffer index >= 0 */
  489. *skip_len = 2; /* \\ and one digit */
  490. return ret;
  491. }
  492. if (!mc_search_regex__replace_handle_esc_seq (replace_str, current_pos, skip_len, &ret))
  493. return ret;
  494. ret = REPLACE_PREPARE_T_REPLACE_FLAG;
  495. *skip_len += 2;
  496. switch (curr_str[1])
  497. {
  498. case 'U':
  499. *replace_flags |= REPLACE_T_UPP_TRANSFORM;
  500. *replace_flags &= ~REPLACE_T_LOW_TRANSFORM;
  501. break;
  502. case 'u':
  503. *replace_flags |= REPLACE_T_UPP_TRANSFORM_CHAR;
  504. break;
  505. case 'L':
  506. *replace_flags |= REPLACE_T_LOW_TRANSFORM;
  507. *replace_flags &= ~REPLACE_T_UPP_TRANSFORM;
  508. break;
  509. case 'l':
  510. *replace_flags |= REPLACE_T_LOW_TRANSFORM_CHAR;
  511. break;
  512. case 'E':
  513. *replace_flags = REPLACE_T_NO_TRANSFORM;
  514. break;
  515. default:
  516. ret = REPLACE_PREPARE_T_NOTHING_SPECIAL;
  517. break;
  518. }
  519. }
  520. return ret;
  521. }
  522. /* --------------------------------------------------------------------------------------------- */
  523. static void
  524. mc_search_regex__process_append_str (GString * dest_str, const char *from, gsize len,
  525. replace_transform_type_t * replace_flags)
  526. {
  527. gsize loop;
  528. gsize char_len;
  529. if (len == (gsize) (-1))
  530. len = strlen (from);
  531. if (*replace_flags == REPLACE_T_NO_TRANSFORM)
  532. {
  533. g_string_append_len (dest_str, from, len);
  534. return;
  535. }
  536. for (loop = 0; loop < len; loop += char_len)
  537. {
  538. GString *tmp_string = NULL;
  539. char *tmp_str;
  540. tmp_str = mc_search__get_one_symbol (NULL, from + loop, len - loop, NULL);
  541. char_len = strlen (tmp_str);
  542. if ((*replace_flags & REPLACE_T_UPP_TRANSFORM_CHAR) != 0)
  543. {
  544. *replace_flags &= ~REPLACE_T_UPP_TRANSFORM_CHAR;
  545. tmp_string = mc_search__toupper_case_str (NULL, tmp_str, char_len);
  546. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  547. g_string_free (tmp_string, TRUE);
  548. }
  549. else if ((*replace_flags & REPLACE_T_LOW_TRANSFORM_CHAR) != 0)
  550. {
  551. *replace_flags &= ~REPLACE_T_LOW_TRANSFORM_CHAR;
  552. tmp_string = mc_search__tolower_case_str (NULL, tmp_str, char_len);
  553. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  554. g_string_free (tmp_string, TRUE);
  555. }
  556. else if ((*replace_flags & REPLACE_T_UPP_TRANSFORM) != 0)
  557. {
  558. tmp_string = mc_search__toupper_case_str (NULL, tmp_str, char_len);
  559. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  560. g_string_free (tmp_string, TRUE);
  561. }
  562. else if ((*replace_flags & REPLACE_T_LOW_TRANSFORM) != 0)
  563. {
  564. tmp_string = mc_search__tolower_case_str (NULL, tmp_str, char_len);
  565. g_string_append_len (dest_str, tmp_string->str, tmp_string->len);
  566. g_string_free (tmp_string, TRUE);
  567. }
  568. g_free (tmp_str);
  569. }
  570. }
  571. /* --------------------------------------------------------------------------------------------- */
  572. static void
  573. mc_search_regex__process_escape_sequence (GString * dest_str, const char *from, gsize len,
  574. replace_transform_type_t * replace_flags,
  575. gboolean is_utf8)
  576. {
  577. gsize i = 0;
  578. unsigned int c = 0;
  579. char b;
  580. if (len == (gsize) (-1))
  581. len = strlen (from);
  582. if (len == 0)
  583. return;
  584. if (from[i] == '{')
  585. i++;
  586. if (i >= len)
  587. return;
  588. if (from[i] == 'x')
  589. {
  590. i++;
  591. if (i < len && from[i] == '{')
  592. i++;
  593. for (; i < len; i++)
  594. {
  595. if (from[i] >= '0' && from[i] <= '9')
  596. c = c * 16 + from[i] - '0';
  597. else if (from[i] >= 'a' && from[i] <= 'f')
  598. c = c * 16 + 10 + from[i] - 'a';
  599. else if (from[i] >= 'A' && from[i] <= 'F')
  600. c = c * 16 + 10 + from[i] - 'A';
  601. else
  602. break;
  603. }
  604. }
  605. else if (from[i] >= '0' && from[i] <= '7')
  606. for (; i < len && from[i] >= '0' && from[i] <= '7'; i++)
  607. c = c * 8 + from[i] - '0';
  608. else
  609. {
  610. switch (from[i])
  611. {
  612. case 'n':
  613. c = '\n';
  614. break;
  615. case 't':
  616. c = '\t';
  617. break;
  618. case 'v':
  619. c = '\v';
  620. break;
  621. case 'b':
  622. c = '\b';
  623. break;
  624. case 'r':
  625. c = '\r';
  626. break;
  627. case 'f':
  628. c = '\f';
  629. break;
  630. case 'a':
  631. c = '\a';
  632. break;
  633. default:
  634. mc_search_regex__process_append_str (dest_str, from, len, replace_flags);
  635. return;
  636. }
  637. }
  638. if (c < 0x80 || !is_utf8)
  639. g_string_append_c (dest_str, (char) c);
  640. else if (c < 0x800)
  641. {
  642. b = 0xC0 | (c >> 6);
  643. g_string_append_c (dest_str, b);
  644. b = 0x80 | (c & 0x3F);
  645. g_string_append_c (dest_str, b);
  646. }
  647. else if (c < 0x10000)
  648. {
  649. b = 0xE0 | (c >> 12);
  650. g_string_append_c (dest_str, b);
  651. b = 0x80 | ((c >> 6) & 0x3F);
  652. g_string_append_c (dest_str, b);
  653. b = 0x80 | (c & 0x3F);
  654. g_string_append_c (dest_str, b);
  655. }
  656. else if (c < 0x10FFFF)
  657. {
  658. b = 0xF0 | (c >> 16);
  659. g_string_append_c (dest_str, b);
  660. b = 0x80 | ((c >> 12) & 0x3F);
  661. g_string_append_c (dest_str, b);
  662. b = 0x80 | ((c >> 6) & 0x3F);
  663. g_string_append_c (dest_str, b);
  664. b = 0x80 | (c & 0x3F);
  665. g_string_append_c (dest_str, b);
  666. }
  667. }
  668. /* --------------------------------------------------------------------------------------------- */
  669. /*** public functions ****************************************************************************/
  670. /* --------------------------------------------------------------------------------------------- */
  671. void
  672. mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_search,
  673. mc_search_cond_t * mc_search_cond)
  674. {
  675. if (lc_mc_search->whole_words && !lc_mc_search->is_entire_line)
  676. {
  677. /* NOTE: \b as word boundary doesn't allow search
  678. * whole words with non-ASCII symbols.
  679. * Update: Is it still true nowadays? Probably not. #2396, #3524 */
  680. g_string_prepend (mc_search_cond->str, "(?<![\\p{L}\\p{N}_])");
  681. g_string_append (mc_search_cond->str, "(?![\\p{L}\\p{N}_])");
  682. }
  683. {
  684. #ifdef SEARCH_TYPE_GLIB
  685. GError *mcerror = NULL;
  686. GRegexCompileFlags g_regex_options = G_REGEX_OPTIMIZE | G_REGEX_DOTALL;
  687. if (str_isutf8 (charset) && mc_global.utf8_display)
  688. {
  689. if (!lc_mc_search->is_case_sensitive)
  690. g_regex_options |= G_REGEX_CASELESS;
  691. }
  692. else
  693. {
  694. g_regex_options |= G_REGEX_RAW;
  695. if (!lc_mc_search->is_case_sensitive)
  696. {
  697. GString *tmp;
  698. tmp = mc_search_cond->str;
  699. mc_search_cond->str = mc_search__cond_struct_new_regex_ci_str (charset, tmp);
  700. g_string_free (tmp, TRUE);
  701. }
  702. }
  703. mc_search_cond->regex_handle =
  704. g_regex_new (mc_search_cond->str->str, g_regex_options, 0, &mcerror);
  705. if (mcerror != NULL)
  706. {
  707. lc_mc_search->error = MC_SEARCH_E_REGEX_COMPILE;
  708. g_free (lc_mc_search->error_str);
  709. lc_mc_search->error_str =
  710. str_conv_gerror_message (mcerror, _("Regular expression error"));
  711. g_error_free (mcerror);
  712. return;
  713. }
  714. #else /* SEARCH_TYPE_GLIB */
  715. const char *error;
  716. int erroffset;
  717. int pcre_options = PCRE_EXTRA | PCRE_MULTILINE;
  718. if (str_isutf8 (charset) && mc_global.utf8_display)
  719. {
  720. pcre_options |= PCRE_UTF8;
  721. if (!lc_mc_search->is_case_sensitive)
  722. pcre_options |= PCRE_CASELESS;
  723. }
  724. else
  725. {
  726. if (!lc_mc_search->is_case_sensitive)
  727. {
  728. GString *tmp;
  729. tmp = mc_search_cond->str;
  730. mc_search_cond->str = mc_search__cond_struct_new_regex_ci_str (charset, tmp);
  731. g_string_free (tmp, TRUE);
  732. }
  733. }
  734. mc_search_cond->regex_handle =
  735. pcre_compile (mc_search_cond->str->str, pcre_options, &error, &erroffset, NULL);
  736. if (mc_search_cond->regex_handle == NULL)
  737. {
  738. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_COMPILE, "%s", error);
  739. return;
  740. }
  741. lc_mc_search->regex_match_info = pcre_study (mc_search_cond->regex_handle, 0, &error);
  742. if (lc_mc_search->regex_match_info == NULL && error != NULL)
  743. {
  744. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_COMPILE, "%s", error);
  745. MC_PTR_FREE (mc_search_cond->regex_handle);
  746. return;
  747. }
  748. #endif /* SEARCH_TYPE_GLIB */
  749. }
  750. lc_mc_search->is_utf8 = str_isutf8 (charset);
  751. }
  752. /* --------------------------------------------------------------------------------------------- */
  753. gboolean
  754. mc_search__run_regex (mc_search_t * lc_mc_search, const void *user_data,
  755. gsize start_search, gsize end_search, gsize * found_len)
  756. {
  757. mc_search_cbret_t ret = MC_SEARCH_CB_NOTFOUND;
  758. gsize current_pos, virtual_pos;
  759. gint start_pos;
  760. gint end_pos;
  761. if (lc_mc_search->regex_buffer != NULL)
  762. g_string_set_size (lc_mc_search->regex_buffer, 0);
  763. else
  764. lc_mc_search->regex_buffer = g_string_sized_new (64);
  765. virtual_pos = current_pos = start_search;
  766. while (virtual_pos <= end_search)
  767. {
  768. g_string_set_size (lc_mc_search->regex_buffer, 0);
  769. lc_mc_search->start_buffer = current_pos;
  770. if (lc_mc_search->search_fn != NULL)
  771. {
  772. while (TRUE)
  773. {
  774. int current_chr = '\n'; /* stop search symbol */
  775. ret = lc_mc_search->search_fn (user_data, current_pos, &current_chr);
  776. if (ret == MC_SEARCH_CB_ABORT)
  777. break;
  778. if (ret == MC_SEARCH_CB_INVALID)
  779. continue;
  780. current_pos++;
  781. if (ret == MC_SEARCH_CB_SKIP)
  782. continue;
  783. virtual_pos++;
  784. g_string_append_c (lc_mc_search->regex_buffer, (char) current_chr);
  785. if ((char) current_chr == '\n' || virtual_pos > end_search)
  786. break;
  787. }
  788. }
  789. else
  790. {
  791. /* optimization for standard case (for search from file manager)
  792. * where there is no MC_SEARCH_CB_INVALID or MC_SEARCH_CB_SKIP
  793. * return codes, so we can copy line at regex buffer all at once
  794. */
  795. while (TRUE)
  796. {
  797. const char current_chr = ((const char *) user_data)[current_pos];
  798. if (current_chr == '\0')
  799. break;
  800. current_pos++;
  801. if (current_chr == '\n' || current_pos > end_search)
  802. break;
  803. }
  804. /* use virtual_pos as index of start of current chunk */
  805. g_string_append_len (lc_mc_search->regex_buffer, (const char *) user_data + virtual_pos,
  806. current_pos - virtual_pos);
  807. virtual_pos = current_pos;
  808. }
  809. switch (mc_search__regex_found_cond (lc_mc_search, lc_mc_search->regex_buffer))
  810. {
  811. case COND__FOUND_OK:
  812. #ifdef SEARCH_TYPE_GLIB
  813. g_match_info_fetch_pos (lc_mc_search->regex_match_info, 0, &start_pos, &end_pos);
  814. #else /* SEARCH_TYPE_GLIB */
  815. start_pos = lc_mc_search->iovector[0];
  816. end_pos = lc_mc_search->iovector[1];
  817. #endif /* SEARCH_TYPE_GLIB */
  818. if (found_len != NULL)
  819. *found_len = end_pos - start_pos;
  820. lc_mc_search->normal_offset = lc_mc_search->start_buffer + start_pos;
  821. return TRUE;
  822. case COND__NOT_ALL_FOUND:
  823. break;
  824. default:
  825. g_string_free (lc_mc_search->regex_buffer, TRUE);
  826. lc_mc_search->regex_buffer = NULL;
  827. return FALSE;
  828. }
  829. if ((lc_mc_search->update_fn != NULL) &&
  830. ((lc_mc_search->update_fn) (user_data, current_pos) == MC_SEARCH_CB_ABORT))
  831. ret = MC_SEARCH_CB_ABORT;
  832. if (ret == MC_SEARCH_CB_ABORT || ret == MC_SEARCH_CB_NOTFOUND)
  833. break;
  834. }
  835. g_string_free (lc_mc_search->regex_buffer, TRUE);
  836. lc_mc_search->regex_buffer = NULL;
  837. MC_PTR_FREE (lc_mc_search->error_str);
  838. lc_mc_search->error = ret == MC_SEARCH_CB_ABORT ? MC_SEARCH_E_ABORT : MC_SEARCH_E_NOTFOUND;
  839. return FALSE;
  840. }
  841. /* --------------------------------------------------------------------------------------------- */
  842. GString *
  843. mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * replace_str)
  844. {
  845. GString *ret;
  846. int num_replace_tokens;
  847. gsize loop;
  848. gsize prev = 0;
  849. replace_transform_type_t replace_flags = REPLACE_T_NO_TRANSFORM;
  850. num_replace_tokens =
  851. mc_search_regex__get_max_num_of_replace_tokens (replace_str->str, replace_str->len);
  852. if (lc_mc_search->num_results < 0)
  853. return mc_g_string_dup (replace_str);
  854. if (num_replace_tokens > lc_mc_search->num_results - 1
  855. || num_replace_tokens > MC_SEARCH__NUM_REPLACE_ARGS)
  856. {
  857. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_REPLACE, "%s",
  858. _(STR_E_RPL_NOT_EQ_TO_FOUND));
  859. return NULL;
  860. }
  861. ret = g_string_sized_new (64);
  862. for (loop = 0; loop < replace_str->len - 1; loop++)
  863. {
  864. int lc_index;
  865. gchar *tmp_str;
  866. gsize len = 0;
  867. lc_index = mc_search_regex__process_replace_str (replace_str, loop, &len, &replace_flags);
  868. if (lc_index == REPLACE_PREPARE_T_NOTHING_SPECIAL)
  869. {
  870. if (len != 0)
  871. {
  872. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  873. &replace_flags);
  874. mc_search_regex__process_append_str (ret, replace_str->str + loop + 1, len - 1,
  875. &replace_flags);
  876. prev = loop + len;
  877. loop = prev - 1; /* prepare to loop++ */
  878. }
  879. continue;
  880. }
  881. if (lc_index == REPLACE_PREPARE_T_REPLACE_FLAG)
  882. {
  883. if (loop != 0)
  884. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  885. &replace_flags);
  886. prev = loop + len;
  887. loop = prev - 1; /* prepare to loop++ */
  888. continue;
  889. }
  890. /* escape sequence */
  891. if (lc_index == REPLACE_PREPARE_T_ESCAPE_SEQ)
  892. {
  893. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  894. &replace_flags);
  895. /* call process_escape_sequence without starting '\\' */
  896. mc_search_regex__process_escape_sequence (ret, replace_str->str + loop + 1, len - 1,
  897. &replace_flags, lc_mc_search->is_utf8);
  898. prev = loop + len;
  899. loop = prev - 1; /* prepare to loop++ */
  900. continue;
  901. }
  902. /* invalid capture buffer number */
  903. if (lc_index > lc_mc_search->num_results)
  904. {
  905. g_string_free (ret, TRUE);
  906. mc_search_set_error (lc_mc_search, MC_SEARCH_E_REGEX_REPLACE,
  907. _(STR_E_RPL_INVALID_TOKEN), lc_index);
  908. return NULL;
  909. }
  910. tmp_str = mc_search_regex__get_token_by_num (lc_mc_search, lc_index);
  911. if (loop != 0)
  912. mc_search_regex__process_append_str (ret, replace_str->str + prev, loop - prev,
  913. &replace_flags);
  914. mc_search_regex__process_append_str (ret, tmp_str, -1, &replace_flags);
  915. g_free (tmp_str);
  916. prev = loop + len;
  917. loop = prev - 1; /* prepare to loop++ */
  918. }
  919. mc_search_regex__process_append_str (ret, replace_str->str + prev, replace_str->len - prev,
  920. &replace_flags);
  921. return ret;
  922. }