tokenize.c 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. /*
  2. Parse string into tokens.
  3. Copyright (C) 2024
  4. Free Software Foundation, Inc.
  5. Written by:
  6. Andrew Borodin <aborodin@vmail.ru> 2010-2024
  7. The str_tokenize() and str_tokenize_word routines are mostly from
  8. GNU readline-8.2.
  9. This file is part of the Midnight Commander.
  10. The Midnight Commander is free software: you can redistribute it
  11. and/or modify it under the terms of the GNU General Public License as
  12. published by the Free Software Foundation, either version 3 of the License,
  13. or (at your option) any later version.
  14. The Midnight Commander is distributed in the hope that it will be useful,
  15. but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. GNU General Public License for more details.
  18. You should have received a copy of the GNU General Public License
  19. along with this program. If not, see <http://www.gnu.org/licenses/>.
  20. */
  21. /** \file tokenize.c
  22. * \brief Source: parse string into tokens.
  23. */
  24. #include <config.h>
  25. #include <stdlib.h>
  26. #include <string.h>
  27. #include "lib/global.h"
  28. #include "lib/util.h" /* whiteness() */
  29. #include "lib/strutil.h"
  30. /*** global variables ****************************************************************************/
  31. /*** file scope macro definitions ****************************************************************/
  32. #define WORD_DELIMITERS " \t\n;&()|<>"
  33. #define QUOTE_CHARACTERS "\"'`"
  34. #define slashify_in_quotes "\\`\"$"
  35. #define member(c, s) ((c != '\0') ? (strchr ((s), (c)) != NULL) : FALSE)
  36. /*** file scope type declarations ****************************************************************/
  37. /*** forward declarations (file scope functions) *************************************************/
  38. /*** file scope variables ************************************************************************/
  39. /* --------------------------------------------------------------------------------------------- */
  40. /*** file scope functions ************************************************************************/
  41. /* --------------------------------------------------------------------------------------------- */
  42. /*
  43. * Based on history_tokenize_word() from GNU readline-8.2
  44. */
  45. static int
  46. str_tokenize_word (const char *string, int start)
  47. {
  48. int i = start;
  49. char delimiter = '\0';
  50. char delimopen = '\0';
  51. int nestdelim = 0;
  52. if (member (string[i], "()\n")) /* XXX - included \n, but why? been here forever */
  53. return (i + 1);
  54. if (g_ascii_isdigit (string[i]))
  55. {
  56. int j;
  57. for (j = i; string[j] != '\0' && g_ascii_isdigit (string[j]); j++)
  58. ;
  59. if (string[j] == '\0')
  60. return j;
  61. if (string[j] == '<' || string[j] == '>')
  62. i = j; /* digit sequence is a file descriptor */
  63. else
  64. {
  65. i = j; /* digit sequence is part of a word */
  66. goto get_word;
  67. }
  68. }
  69. if (member (string[i], "<>;&|"))
  70. {
  71. char peek = string[i + 1];
  72. if (peek == string[i])
  73. {
  74. if (peek == '<' && (string[i + 2] == '-' || string[i + 2] == '<'))
  75. i++;
  76. return (i + 2);
  77. }
  78. if (peek == '&' && (string[i] == '>' || string[i] == '<'))
  79. {
  80. int j;
  81. /* file descriptor */
  82. for (j = i + 2; string[j] != '\0' && g_ascii_isdigit (string[j]); j++)
  83. ;
  84. if (string[j] == '-') /* <&[digits]-, >&[digits]- */
  85. j++;
  86. return j;
  87. }
  88. if ((peek == '>' && string[i] == '&') || (peek == '|' && string[i] == '>'))
  89. return (i + 2);
  90. /* XXX - process substitution -- separated out for later -- bash-4.2 */
  91. if (peek == '(' && (string[i] == '>' || string[i] == '<'))
  92. {
  93. /* ) */
  94. i += 2;
  95. delimopen = '(';
  96. delimiter = ')';
  97. nestdelim = 1;
  98. goto get_word;
  99. }
  100. return (i + 1);
  101. }
  102. get_word:
  103. /* Get word from string + i; */
  104. if (delimiter == '\0' && member (string[i], QUOTE_CHARACTERS))
  105. {
  106. delimiter = string[i];
  107. i++;
  108. }
  109. for (; string[i] != '\0'; i++)
  110. {
  111. if (string[i] == '\\' && string[i + 1] == '\n')
  112. {
  113. i++;
  114. continue;
  115. }
  116. if (string[i] == '\\' && delimiter != '\''
  117. && (delimiter != '"' || member (string[i], slashify_in_quotes)))
  118. {
  119. i++;
  120. continue;
  121. }
  122. /* delimiter must be set and set to something other than a quote if
  123. nestdelim is set, so these tests are safe. */
  124. if (nestdelim != 0 && string[i] == delimopen)
  125. {
  126. nestdelim++;
  127. continue;
  128. }
  129. if (nestdelim != 0 && string[i] == delimiter)
  130. {
  131. nestdelim--;
  132. if (nestdelim == 0)
  133. delimiter = '\0';
  134. continue;
  135. }
  136. if (delimiter != '\0' && string[i] == delimiter)
  137. {
  138. delimiter = '\0';
  139. continue;
  140. }
  141. /* Command and process substitution; shell extended globbing patterns */
  142. if (nestdelim == 0 && delimiter == '\0' && member (string[i], "<>$!@?+*")
  143. && string[i + 1] == '(')
  144. {
  145. /* ) */
  146. i += 2;
  147. delimopen = '(';
  148. delimiter = ')';
  149. nestdelim = 1;
  150. continue;
  151. }
  152. if (delimiter == '\0' && member (string[i], WORD_DELIMITERS))
  153. break;
  154. if (delimiter == '\0' && member (string[i], QUOTE_CHARACTERS))
  155. delimiter = string[i];
  156. }
  157. return i;
  158. }
  159. /* --------------------------------------------------------------------------------------------- */
  160. /*** public functions ****************************************************************************/
  161. /* --------------------------------------------------------------------------------------------- */
  162. /* Parse string into tokens.
  163. *
  164. * Based on history_tokenize_internal() from GNU readline-8.2
  165. */
  166. GPtrArray *
  167. str_tokenize (const char *string)
  168. {
  169. GPtrArray *result = NULL;
  170. int i = 0;
  171. /* Get a token, and stuff it into RESULT. The tokens are split
  172. exactly where the shell would split them. */
  173. while (string[i] != '\0')
  174. {
  175. int start;
  176. /* Skip leading whitespace */
  177. for (; string[i] != '\0' && whiteness (string[i]); i++)
  178. ;
  179. if (string[i] == '\0')
  180. return result;
  181. start = i;
  182. i = str_tokenize_word (string, start);
  183. /* If we have a non-whitespace delimiter character (which would not be
  184. skipped by the loop above), use it and any adjacent delimiters to
  185. make a separate field. Any adjacent white space will be skipped the
  186. next time through the loop. */
  187. if (i == start)
  188. for (i++; string[i] != '\0' && member (string[i], WORD_DELIMITERS); i++)
  189. ;
  190. if (result == NULL)
  191. result = g_ptr_array_new ();
  192. g_ptr_array_add (result, g_strndup (string + start, i - start));
  193. }
  194. return result;
  195. }
  196. /* --------------------------------------------------------------------------------------------- */