123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251 |
- /*
- Parse string into tokens.
- Copyright (C) 2024-2025
- Free Software Foundation, Inc.
- Written by:
- Andrew Borodin <aborodin@vmail.ru> 2010-2024
- The str_tokenize() and str_tokenize_word() routines are mostly from
- GNU readline-8.2.
- This file is part of the Midnight Commander.
- The Midnight Commander is free software: you can redistribute it
- and/or modify it under the terms of the GNU General Public License as
- published by the Free Software Foundation, either version 3 of the License,
- or (at your option) any later version.
- The Midnight Commander is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
- /** \file tokenize.c
- * \brief Source: parse string into tokens.
- */
- #include <config.h>
- #include <stdlib.h>
- #include <string.h>
- #include "lib/global.h"
- #include "lib/util.h" /* whiteness() */
- #include "lib/strutil.h"
- /*** global variables ****************************************************************************/
- /*** file scope macro definitions ****************************************************************/
/* Characters that end an unquoted word */
#define WORD_DELIMITERS " \t\n;&()|<>"

/* Characters that open a quoted region; the same character closes it */
#define QUOTE_CHARACTERS "\"'`"

/* Character set consulted when a backslash is met inside double quotes */
#define slashify_in_quotes "\\`\"$"

/* Non-zero if character C is not NUL and occurs in string S.
   Both arguments are fully parenthesized so that any expression may be passed.
   C is evaluated twice: do not pass an expression with side effects. */
#define member(c, s) (((c) != '\0') ? (strchr ((s), (c)) != NULL) : FALSE)
- /*** file scope type declarations ****************************************************************/
- /*** forward declarations (file scope functions) *************************************************/
- /*** file scope variables ************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- /*** file scope functions ************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- /*
- * Based on history_tokenize_word() from GNU readline-8.2
- */
- static int
- str_tokenize_word (const char *string, int start)
- {
- int i = start;
- char delimiter = '\0';
- char delimopen = '\0';
- int nestdelim = 0;
- if (member (string[i], "()\n")) /* XXX - included \n, but why? been here forever */
- return (i + 1);
- if (g_ascii_isdigit (string[i]))
- {
- int j;
- for (j = i; string[j] != '\0' && g_ascii_isdigit (string[j]); j++)
- ;
- if (string[j] == '\0')
- return j;
- if (string[j] == '<' || string[j] == '>')
- i = j; /* digit sequence is a file descriptor */
- else
- {
- i = j; /* digit sequence is part of a word */
- goto get_word;
- }
- }
- if (member (string[i], "<>;&|"))
- {
- char peek = string[i + 1];
- if (peek == string[i])
- {
- if (peek == '<' && (string[i + 2] == '-' || string[i + 2] == '<'))
- i++;
- return (i + 2);
- }
- if (peek == '&' && (string[i] == '>' || string[i] == '<'))
- {
- int j;
- /* file descriptor */
- for (j = i + 2; string[j] != '\0' && g_ascii_isdigit (string[j]); j++)
- ;
- if (string[j] == '-') /* <&[digits]-, >&[digits]- */
- j++;
- return j;
- }
- if ((peek == '>' && string[i] == '&') || (peek == '|' && string[i] == '>'))
- return (i + 2);
- /* XXX - process substitution -- separated out for later -- bash-4.2 */
- if (peek == '(' && (string[i] == '>' || string[i] == '<'))
- {
- /* ) */
- i += 2;
- delimopen = '(';
- delimiter = ')';
- nestdelim = 1;
- goto get_word;
- }
- return (i + 1);
- }
- get_word:
- /* Get word from string + i; */
- if (delimiter == '\0' && member (string[i], QUOTE_CHARACTERS))
- {
- delimiter = string[i];
- i++;
- }
- for (; string[i] != '\0'; i++)
- {
- if (string[i] == '\\' && string[i + 1] == '\n')
- {
- i++;
- continue;
- }
- if (string[i] == '\\' && delimiter != '\'' &&
- (delimiter != '"' || member (string[i], slashify_in_quotes)))
- {
- i++;
- continue;
- }
- /* delimiter must be set and set to something other than a quote if
- nestdelim is set, so these tests are safe. */
- if (nestdelim != 0 && string[i] == delimopen)
- {
- nestdelim++;
- continue;
- }
- if (nestdelim != 0 && string[i] == delimiter)
- {
- nestdelim--;
- if (nestdelim == 0)
- delimiter = '\0';
- continue;
- }
- if (delimiter != '\0' && string[i] == delimiter)
- {
- delimiter = '\0';
- continue;
- }
- /* Command and process substitution; shell extended globbing patterns */
- if (nestdelim == 0 && delimiter == '\0' && member (string[i], "<>$!@?+*")
- && string[i + 1] == '(')
- {
- /* ) */
- i += 2;
- delimopen = '(';
- delimiter = ')';
- nestdelim = 1;
- continue;
- }
- if (delimiter == '\0' && member (string[i], WORD_DELIMITERS))
- break;
- if (delimiter == '\0' && member (string[i], QUOTE_CHARACTERS))
- delimiter = string[i];
- }
- return i;
- }
- /* --------------------------------------------------------------------------------------------- */
- /*** public functions ****************************************************************************/
- /* --------------------------------------------------------------------------------------------- */
- /* Parse string into tokens.
- *
- * Based on history_tokenize_internal() from GNU readline-8.2
- */
- GPtrArray *
- str_tokenize (const char *string)
- {
- GPtrArray *result = NULL;
- int i = 0;
- /* Get a token, and stuff it into RESULT. The tokens are split
- exactly where the shell would split them. */
- while (string[i] != '\0')
- {
- int start;
- /* Skip leading whitespace */
- for (; string[i] != '\0' && whiteness (string[i]); i++)
- ;
- if (string[i] == '\0')
- return result;
- start = i;
- i = str_tokenize_word (string, start);
- /* If we have a non-whitespace delimiter character (which would not be
- skipped by the loop above), use it and any adjacent delimiters to
- make a separate field. Any adjacent white space will be skipped the
- next time through the loop. */
- if (i == start)
- for (i++; string[i] != '\0' && member (string[i], WORD_DELIMITERS); i++)
- ;
- if (result == NULL)
- result = g_ptr_array_new ();
- g_ptr_array_add (result, g_strndup (string + start, i - start));
- }
- return result;
- }
- /* --------------------------------------------------------------------------------------------- */
|