str_split.h 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. //
  2. // Copyright 2017 The Abseil Authors.
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // https://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. //
  16. // -----------------------------------------------------------------------------
  17. // File: str_split.h
  18. // -----------------------------------------------------------------------------
  19. //
  20. // This file contains functions for splitting strings. It defines the main
  21. // `StrSplit()` function, several delimiters for determining the boundaries on
  22. // which to split the string, and predicates for filtering delimited results.
  23. // `StrSplit()` adapts the returned collection to the type specified by the
  24. // caller.
  25. //
  26. // Example:
  27. //
  28. // // Splits the given string on commas. Returns the results in a
  29. // // vector of strings.
  30. // std::vector<std::string> v = absl::StrSplit("a,b,c", ',');
  31. // // Can also use ","
  32. // // v[0] == "a", v[1] == "b", v[2] == "c"
  33. //
  34. // See StrSplit() below for more information.
  35. #ifndef ABSL_STRINGS_STR_SPLIT_H_
  36. #define ABSL_STRINGS_STR_SPLIT_H_
  37. #include <algorithm>
  38. #include <cstddef>
  39. #include <map>
  40. #include <set>
  41. #include <string>
  42. #include <utility>
  43. #include <vector>
  44. #include "absl/base/internal/raw_logging.h"
  45. #include "absl/base/macros.h"
  46. #include "absl/strings/internal/str_split_internal.h"
  47. #include "absl/strings/string_view.h"
  48. #include "absl/strings/strip.h"
  49. namespace absl {
  50. ABSL_NAMESPACE_BEGIN
  51. //------------------------------------------------------------------------------
  52. // Delimiters
  53. //------------------------------------------------------------------------------
  54. //
  55. // `StrSplit()` uses delimiters to define the boundaries between elements in the
  56. // provided input. Several `Delimiter` types are defined below. If a string
  57. // (`const char*`, `std::string`, or `absl::string_view`) is passed in place of
  58. // an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it
  59. // were passed a `ByString` delimiter.
  60. //
  61. // A `Delimiter` is an object with a `Find()` function that knows how to find
  62. // the first occurrence of itself in a given `absl::string_view`.
  63. //
  64. // The following `Delimiter` types are available for use within `StrSplit()`:
  65. //
  66. // - `ByString` (default for string arguments)
  67. // - `ByChar` (default for a char argument)
  68. // - `ByAnyChar`
  69. // - `ByLength`
  70. // - `MaxSplits`
  71. //
  72. // A Delimiter's `Find()` member function will be passed an input `text` that is
  73. // to be split and a position (`pos`) to begin searching for the next delimiter
  74. // in `text`. The returned absl::string_view should refer to the next occurrence
  75. // (after `pos`) of the represented delimiter; this returned absl::string_view
  76. // represents the next location where the input `text` should be broken.
  77. //
  78. // The returned absl::string_view may be zero-length if the Delimiter does not
  79. // represent a part of the string (e.g., a fixed-length delimiter). If no
  80. // delimiter is found in the input `text`, a zero-length absl::string_view
  81. // referring to `text.end()` should be returned (e.g.,
  82. // `text.substr(text.size())`). It is important that the returned
  83. // absl::string_view always be within the bounds of the input `text` given as an
  84. // argument--it must not refer to a string that is physically located outside of
  85. // the given string.
  86. //
  87. // The following example is a simple Delimiter object that is created with a
  88. // single char and will look for that char in the text passed to the `Find()`
  89. // function:
  90. //
  91. // struct SimpleDelimiter {
  92. // const char c_;
  93. // explicit SimpleDelimiter(char c) : c_(c) {}
  94. // absl::string_view Find(absl::string_view text, size_t pos) {
  95. // auto found = text.find(c_, pos);
  96. // if (found == absl::string_view::npos)
  97. // return text.substr(text.size());
  98. //
  99. // return text.substr(found, 1);
  100. // }
  101. // };
  102. // ByString
  103. //
  104. // A sub-string delimiter. If `StrSplit()` is passed a string in place of a
  105. // `Delimiter` object, the string will be implicitly converted into a
  106. // `ByString` delimiter.
  107. //
  108. // Example:
  109. //
  110. // // Because a string literal is converted to an `absl::ByString`,
  111. // // the following two splits are equivalent.
  112. //
  113. // std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", ");
  114. //
  115. // using absl::ByString;
  116. // std::vector<std::string> v2 = absl::StrSplit("a, b, c",
  117. // ByString(", "));
  118. // // v[0] == "a", v[1] == "b", v[2] == "c"
  119. class ByString {
  120. public:
  121. explicit ByString(absl::string_view sp);
  122. absl::string_view Find(absl::string_view text, size_t pos) const;
  123. private:
  124. const std::string delimiter_;
  125. };
  126. // ByAsciiWhitespace
  127. //
  128. // A sub-string delimiter that splits by ASCII whitespace
  129. // (space, tab, vertical tab, formfeed, linefeed, or carriage return).
  130. // Note: you probably want to use absl::SkipEmpty() as well!
  131. //
  132. // This class is equivalent to ByAnyChar with ASCII whitespace chars.
  133. //
  134. // Example:
  135. //
  136. // std::vector<std::string> v = absl::StrSplit(
  137. // "a b\tc\n d \n", absl::ByAsciiWhitespace(), absl::SkipEmpty());
  138. // // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d"
  139. class ByAsciiWhitespace {
  140. public:
  141. absl::string_view Find(absl::string_view text, size_t pos) const;
  142. };
  143. // ByChar
  144. //
  145. // A single character delimiter. `ByChar` is functionally equivalent to a
  146. // 1-char string within a `ByString` delimiter, but slightly more efficient.
  147. //
  148. // Example:
  149. //
  150. // // Because a char literal is converted to an `absl::ByChar`,
  151. // // the following two splits are equivalent.
  152. // std::vector<std::string> v1 = absl::StrSplit("a,b,c", ',');
  153. // using absl::ByChar;
  154. // std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(','));
  155. // // v[0] == "a", v[1] == "b", v[2] == "c"
  156. //
  157. // `ByChar` is also the default delimiter if a single character is given
  158. // as the delimiter to `StrSplit()`. For example, the following calls are
  159. // equivalent:
  160. //
  161. // std::vector<std::string> v = absl::StrSplit("a-b", '-');
  162. //
  163. // using absl::ByChar;
  164. // std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-'));
  165. //
  166. class ByChar {
  167. public:
  168. explicit ByChar(char c) : c_(c) {}
  169. absl::string_view Find(absl::string_view text, size_t pos) const;
  170. private:
  171. char c_;
  172. };
  173. // ByAnyChar
  174. //
  175. // A delimiter that will match any of the given byte-sized characters within
  176. // its provided string.
  177. //
  178. // Note: this delimiter works with single-byte string data, but does not work
  179. // with variable-width encodings, such as UTF-8.
  180. //
  181. // Example:
  182. //
  183. // using absl::ByAnyChar;
  184. // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
  185. // // v[0] == "a", v[1] == "b", v[2] == "c"
  186. //
  187. // If `ByAnyChar` is given the empty string, it behaves exactly like
  188. // `ByString` given the empty string: it matches each individual character
  188. // in the input string.
  189. //
  190. class ByAnyChar {
  191. public:
  192. explicit ByAnyChar(absl::string_view sp);
  193. absl::string_view Find(absl::string_view text, size_t pos) const;
  194. private:
  195. const std::string delimiters_;
  196. };
  197. // ByLength
  198. //
  199. // A delimiter for splitting into equal-length strings. The length argument to
  200. // the constructor must be greater than 0.
  201. //
  202. // Note: this delimiter works with single-byte string data, but does not work
  203. // with variable-width encodings, such as UTF-8.
  204. //
  205. // Example:
  206. //
  207. // using absl::ByLength;
  208. // std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3));
  209. // // v[0] == "123", v[1] == "456", v[2] == "789"
  210. //
  211. // Note that the string does not have to be a multiple of the fixed split
  212. // length. In such a case, the last substring will be shorter.
  213. //
  214. // using absl::ByLength;
  215. // std::vector<std::string> v = absl::StrSplit("12345", ByLength(2));
  216. //
  217. // // v[0] == "12", v[1] == "34", v[2] == "5"
  218. class ByLength {
  219. public:
  220. explicit ByLength(ptrdiff_t length);
  221. absl::string_view Find(absl::string_view text, size_t pos) const;
  222. private:
  223. const ptrdiff_t length_;
  224. };
  225. namespace strings_internal {
  226. // A traits-like metafunction for selecting the default Delimiter object type
  227. // for a particular Delimiter type. The base case simply exposes type Delimiter
  228. // itself as the delimiter's Type. However, there are specializations that
  229. // map string-like objects to the ByString delimiter and `char` to ByChar.
  230. // This allows functions like absl::StrSplit() and absl::MaxSplits() to accept
  231. // plain strings or characters (e.g., "," or ',') as delimiter arguments; they
  232. // are treated as if a ByString or ByChar delimiter, respectively, was given.
  // Base case: a type that is already a Delimiter is used unchanged.
  template <typename Delimiter>
  struct SelectDelimiter { using type = Delimiter; };
  237. template <>
  238. struct SelectDelimiter<char> {
  239. using type = ByChar;
  240. };
  241. template <>
  242. struct SelectDelimiter<char*> {
  243. using type = ByString;
  244. };
  245. template <>
  246. struct SelectDelimiter<const char*> {
  247. using type = ByString;
  248. };
  249. template <>
  250. struct SelectDelimiter<absl::string_view> {
  251. using type = ByString;
  252. };
  253. template <>
  254. struct SelectDelimiter<std::string> {
  255. using type = ByString;
  256. };
  257. // Wraps another delimiter and sets a max number of matches for that delimiter.
  258. template <typename Delimiter>
  259. class MaxSplitsImpl {
  260. public:
  261. MaxSplitsImpl(Delimiter delimiter, int limit)
  262. : delimiter_(delimiter), limit_(limit), count_(0) {}
  263. absl::string_view Find(absl::string_view text, size_t pos) {
  264. if (count_++ == limit_) {
  265. return absl::string_view(text.data() + text.size(),
  266. 0); // No more matches.
  267. }
  268. return delimiter_.Find(text, pos);
  269. }
  270. private:
  271. Delimiter delimiter_;
  272. const int limit_;
  273. int count_;
  274. };
  275. } // namespace strings_internal
  276. // MaxSplits()
  277. //
  278. // A delimiter that limits the number of matches which can occur to the passed
  279. // `limit`. The last element in the returned collection will contain all
  280. // remaining unsplit pieces, which may contain instances of the delimiter.
  281. // The collection will contain at most `limit` + 1 elements.
  282. // Example:
  283. //
  284. // using absl::MaxSplits;
  285. // std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1));
  286. //
  287. // // v[0] == "a", v[1] == "b,c"
  288. template <typename Delimiter>
  289. inline strings_internal::MaxSplitsImpl<
  290. typename strings_internal::SelectDelimiter<Delimiter>::type>
  291. MaxSplits(Delimiter delimiter, int limit) {
  292. typedef
  293. typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType;
  294. return strings_internal::MaxSplitsImpl<DelimiterType>(
  295. DelimiterType(delimiter), limit);
  296. }
  297. //------------------------------------------------------------------------------
  298. // Predicates
  299. //------------------------------------------------------------------------------
  300. //
  301. // Predicates filter the results of a `StrSplit()` by determining whether or not
  302. // a resultant element is included in the result set. A predicate may be passed
  303. // as an optional third argument to the `StrSplit()` function.
  304. //
  305. // Predicates are unary functions (or functors) that take a single
  306. // `absl::string_view` argument and return a bool indicating whether the
  307. // argument should be included (`true`) or excluded (`false`).
  308. //
  309. // Predicates are useful when filtering out empty substrings. By default, empty
  310. // substrings may be returned by `StrSplit()`, which is similar to the way split
  311. // functions work in other programming languages.
  312. // AllowEmpty()
  313. //
  314. // Always returns `true`, indicating that all strings--including empty
  315. // strings--should be included in the split output. This predicate is not
  316. // strictly needed because this is the default behavior of `StrSplit()`;
  317. // however, it might be useful at some call sites to make the intent explicit.
  318. //
  319. // Example:
  320. //
  321. // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty());
  322. //
  323. // // v[0] == " a ", v[1] == " ", v[2] == "", v[3] == "b", v[4] == ""
  324. struct AllowEmpty {
  325. bool operator()(absl::string_view) const { return true; }
  326. };
  327. // SkipEmpty()
  328. //
  329. // Returns `false` if the given `absl::string_view` is empty, indicating that
  330. // `StrSplit()` should omit the empty string.
  331. //
  332. // Example:
  333. //
  334. // std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty());
  335. //
  336. // // v[0] == "a", v[1] == "b"
  337. //
  338. // Note: `SkipEmpty()` does not consider a string containing only whitespace
  339. // to be empty. To skip such whitespace as well, use the `SkipWhitespace()`
  340. // predicate.
  341. struct SkipEmpty {
  342. bool operator()(absl::string_view sp) const { return !sp.empty(); }
  343. };
  344. // SkipWhitespace()
  345. //
  346. // Returns `false` if the given `absl::string_view` is empty *or* contains only
  347. // whitespace, indicating that `StrSplit()` should omit the string.
  348. //
  349. // Example:
  350. //
  351. // std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
  352. // ',', SkipWhitespace());
  353. // // v[0] == " a ", v[1] == "b"
  354. //
  355. // // SkipEmpty() would return whitespace elements
  356. // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty());
  357. // // v[0] == " a ", v[1] == " ", v[2] == "b"
  358. struct SkipWhitespace {
  359. bool operator()(absl::string_view sp) const {
  360. sp = absl::StripAsciiWhitespace(sp);
  361. return !sp.empty();
  362. }
  363. };
  // SFINAE helper: names `int` when `T` is exactly `std::string` or
  // `const std::string`, and is ill-formed for every other type (including
  // references and volatile-qualified strings).
  template <typename T>
  using EnableSplitIfString = typename std::enable_if<
      std::is_same<typename std::remove_const<T>::type, std::string>::value,
      int>::type;
  369. //------------------------------------------------------------------------------
  370. // StrSplit()
  371. //------------------------------------------------------------------------------
  372. // StrSplit()
  373. //
  374. // Splits a given string based on the provided `Delimiter` object, returning the
  375. // elements within the type specified by the caller. Optionally, you may pass a
  376. // `Predicate` to `StrSplit()` indicating whether to include or exclude the
  377. // resulting element within the final result set. (See the overviews for
  378. // Delimiters and Predicates above.)
  379. //
  380. // Example:
  381. //
  382. // std::vector<std::string> v = absl::StrSplit("a,b,c,d", ',');
  383. // // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d"
  384. //
  385. // You can also provide an explicit `Delimiter` object:
  386. //
  387. // Example:
  388. //
  389. // using absl::ByAnyChar;
  390. // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
  391. // // v[0] == "a", v[1] == "b", v[2] == "c"
  392. //
  393. // See above for more information on delimiters.
  394. //
  395. // By default, empty strings are included in the result set. You can optionally
  396. // include a third `Predicate` argument to apply a test for whether the
  397. // resultant element should be included in the result set:
  398. //
  399. // Example:
  400. //
  401. // std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
  402. // ',', SkipWhitespace());
  403. // // v[0] == " a ", v[1] == "b"
  404. //
  405. // See above for more information on predicates.
  406. //
  407. //------------------------------------------------------------------------------
  408. // StrSplit() Return Types
  409. //------------------------------------------------------------------------------
  410. //
  411. // The `StrSplit()` function adapts the returned collection to the collection
  412. // specified by the caller (e.g. `std::vector` above). The returned collections
  413. // may contain `std::string`, `absl::string_view` (in which case the original
  414. // string being split must ensure that it outlives the collection), or any
  415. // object that can be explicitly created from an `absl::string_view`. This
  416. // behavior works for:
  417. //
  418. // 1) All standard STL containers including `std::vector`, `std::list`,
  419. // `std::deque`, `std::set`, `std::multiset`, `std::map`, and `std::multimap`
  420. // 2) `std::pair` (which is not actually a container). See below.
  421. //
  422. // Example:
  423. //
  424. // // The results are returned as `absl::string_view` objects. Note that we
  425. // // have to ensure that the input string outlives any results.
  426. // std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ',');
  427. //
  428. // // Stores results in a std::set<std::string>, which also performs
  429. // // de-duplication and orders the elements in ascending order.
  430. // std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ',');
  431. // // a contains "a", "b", and "c", in that order
  432. //
  433. // // `StrSplit()` can be used within a range-based for loop, in which case
  434. // // each element will be of type `absl::string_view`.
  435. // std::vector<std::string> v;
  436. // for (const auto sv : absl::StrSplit("a,b,c", ',')) {
  437. // if (sv != "b") v.emplace_back(sv);
  438. // }
  439. // // v[0] == "a", v[1] == "c"
  440. //
  441. // // Stores results in a map. The map implementation assumes that the input
  442. // // is provided as a series of key/value pairs. For example, the 0th element
  443. // // resulting from the split will be stored as a key to the 1st element. If
  444. // // an odd number of elements are resolved, the last element is paired with
  445. // // a default-constructed value (e.g., empty string).
  446. // std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ',');
  447. // // m["a"] == "b", m["c"] == "" // last component value equals ""
  448. //
  449. // Splitting to `std::pair` is an interesting case because it can hold only two
  450. // elements and is not a collection type. When splitting to a `std::pair` the
  451. // first two split strings become the `std::pair` `.first` and `.second`
  452. // members, respectively. The remaining split substrings are discarded. If there
  453. // are fewer than two split substrings, the empty string is used for the
  454. // corresponding `std::pair` member.
  455. //
  456. // Example:
  457. //
  458. // // Stores first two split strings as the members in a std::pair.
  459. // std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ',');
  460. // // p.first == "a", p.second == "b" // "c" is omitted.
  461. //
  462. // The `StrSplit()` function can be used multiple times to perform more
  463. // complicated splitting logic, such as intelligently parsing key-value pairs.
  464. //
  465. // Example:
  466. //
  467. // // The input string "a=b=c,d=e,f=,g" becomes
  468. // // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" }
  469. // std::map<std::string, std::string> m;
  470. // for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) {
  471. // m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1)));
  472. // }
  473. // EXPECT_EQ("b=c", m.find("a")->second);
  474. // EXPECT_EQ("e", m.find("d")->second);
  475. // EXPECT_EQ("", m.find("f")->second);
  476. // EXPECT_EQ("", m.find("g")->second);
  477. //
  478. // WARNING: Due to a legacy bug that is maintained for backward compatibility,
  479. // splitting the following empty string_views produces different results:
  480. //
  481. // absl::StrSplit(absl::string_view(""), '-'); // {""}
  482. // absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""}
  483. //
  484. // Try not to depend on this distinction because the bug may one day be fixed.
  485. template <typename Delimiter>
  486. strings_internal::Splitter<
  487. typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty,
  488. absl::string_view>
  489. StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) {
  490. using DelimiterType =
  491. typename strings_internal::SelectDelimiter<Delimiter>::type;
  492. return strings_internal::Splitter<DelimiterType, AllowEmpty,
  493. absl::string_view>(
  494. text.value(), DelimiterType(d), AllowEmpty());
  495. }
  496. template <typename Delimiter, typename StringType,
  497. EnableSplitIfString<StringType> = 0>
  498. strings_internal::Splitter<
  499. typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty,
  500. std::string>
  501. StrSplit(StringType&& text, Delimiter d) {
  502. using DelimiterType =
  503. typename strings_internal::SelectDelimiter<Delimiter>::type;
  504. return strings_internal::Splitter<DelimiterType, AllowEmpty, std::string>(
  505. std::move(text), DelimiterType(d), AllowEmpty());
  506. }
  507. template <typename Delimiter, typename Predicate>
  508. strings_internal::Splitter<
  509. typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate,
  510. absl::string_view>
  511. StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d,
  512. Predicate p) {
  513. using DelimiterType =
  514. typename strings_internal::SelectDelimiter<Delimiter>::type;
  515. return strings_internal::Splitter<DelimiterType, Predicate,
  516. absl::string_view>(
  517. text.value(), DelimiterType(d), std::move(p));
  518. }
  519. template <typename Delimiter, typename Predicate, typename StringType,
  520. EnableSplitIfString<StringType> = 0>
  521. strings_internal::Splitter<
  522. typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate,
  523. std::string>
  524. StrSplit(StringType&& text, Delimiter d, Predicate p) {
  525. using DelimiterType =
  526. typename strings_internal::SelectDelimiter<Delimiter>::type;
  527. return strings_internal::Splitter<DelimiterType, Predicate, std::string>(
  528. std::move(text), DelimiterType(d), std::move(p));
  529. }
  530. ABSL_NAMESPACE_END
  531. } // namespace absl
  532. #endif // ABSL_STRINGS_STR_SPLIT_H_