charset.h 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. // Copyright 2022 The Abseil Authors.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // https://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // -----------------------------------------------------------------------------
  16. // File: charset.h
  17. // -----------------------------------------------------------------------------
  18. //
  19. // This file contains absl::CharSet, a fast, bit-vector set of 8-bit unsigned
  20. // characters.
  21. //
  22. // Instances can be initialized as constexpr constants. For example:
  23. //
  24. // constexpr absl::CharSet kJustX = absl::CharSet::Char('x');
  25. // constexpr absl::CharSet kMySymbols = absl::CharSet("$@!");
  26. // constexpr absl::CharSet kLetters = absl::CharSet::Range('a', 'z');
  27. //
  28. // Multiple instances can be combined into a single constexpr expression.
  29. // For example:
  30. //
  31. // constexpr absl::CharSet kLettersAndNumbers =
  32. // absl::CharSet::Range('a', 'z') | absl::CharSet::Range('0', '9');
  33. //
  34. // Several pre-defined character classes are available that mirror the methods
  35. // from <cctype>. For example:
  36. //
  37. // constexpr absl::CharSet kLettersAndWhitespace =
  38. // absl::CharSet::AsciiAlphabet() | absl::CharSet::AsciiWhitespace();
  39. //
  40. // To check membership, use the .contains method, e.g.
  41. //
  42. // absl::CharSet hex_letters("abcdef");
  43. // hex_letters.contains('a'); // true
  44. // hex_letters.contains('g'); // false
  45. #ifndef ABSL_STRINGS_CHARSET_H_
  46. #define ABSL_STRINGS_CHARSET_H_
  47. #include <cstddef>
  48. #include <cstdint>
  49. #include <cstring>
  50. #include "absl/base/macros.h"
  51. #include "absl/base/port.h"
  52. #include "absl/strings/string_view.h"
  53. namespace absl {
  54. class CharSet {
  55. public:
  56. constexpr CharSet() : m_() {}
  57. // Initializes with a given string_view.
  58. constexpr explicit CharSet(absl::string_view str) : m_() {
  59. for (char c : str) {
  60. SetChar(static_cast<unsigned char>(c));
  61. }
  62. }
  63. constexpr bool contains(char c) const {
  64. return ((m_[static_cast<unsigned char>(c) / 64] >>
  65. (static_cast<unsigned char>(c) % 64)) &
  66. 0x1) == 0x1;
  67. }
  68. constexpr bool empty() const {
  69. for (uint64_t c : m_) {
  70. if (c != 0) return false;
  71. }
  72. return true;
  73. }
  74. // Containing only a single specified char.
  75. static constexpr CharSet Char(char x) {
  76. return CharSet(CharMaskForWord(x, 0), CharMaskForWord(x, 1),
  77. CharMaskForWord(x, 2), CharMaskForWord(x, 3));
  78. }
  79. // Containing all the chars in the closed interval [lo,hi].
  80. static constexpr CharSet Range(char lo, char hi) {
  81. return CharSet(RangeForWord(lo, hi, 0), RangeForWord(lo, hi, 1),
  82. RangeForWord(lo, hi, 2), RangeForWord(lo, hi, 3));
  83. }
  84. friend constexpr CharSet operator&(const CharSet& a, const CharSet& b) {
  85. return CharSet(a.m_[0] & b.m_[0], a.m_[1] & b.m_[1], a.m_[2] & b.m_[2],
  86. a.m_[3] & b.m_[3]);
  87. }
  88. friend constexpr CharSet operator|(const CharSet& a, const CharSet& b) {
  89. return CharSet(a.m_[0] | b.m_[0], a.m_[1] | b.m_[1], a.m_[2] | b.m_[2],
  90. a.m_[3] | b.m_[3]);
  91. }
  92. friend constexpr CharSet operator~(const CharSet& a) {
  93. return CharSet(~a.m_[0], ~a.m_[1], ~a.m_[2], ~a.m_[3]);
  94. }
  95. // Mirrors the char-classifying predicates in <cctype>.
  96. static constexpr CharSet AsciiUppercase() { return CharSet::Range('A', 'Z'); }
  97. static constexpr CharSet AsciiLowercase() { return CharSet::Range('a', 'z'); }
  98. static constexpr CharSet AsciiDigits() { return CharSet::Range('0', '9'); }
  99. static constexpr CharSet AsciiAlphabet() {
  100. return AsciiLowercase() | AsciiUppercase();
  101. }
  102. static constexpr CharSet AsciiAlphanumerics() {
  103. return AsciiDigits() | AsciiAlphabet();
  104. }
  105. static constexpr CharSet AsciiHexDigits() {
  106. return AsciiDigits() | CharSet::Range('A', 'F') | CharSet::Range('a', 'f');
  107. }
  108. static constexpr CharSet AsciiPrintable() {
  109. return CharSet::Range(0x20, 0x7e);
  110. }
  111. static constexpr CharSet AsciiWhitespace() { return CharSet("\t\n\v\f\r "); }
  112. static constexpr CharSet AsciiPunctuation() {
  113. return AsciiPrintable() & ~AsciiWhitespace() & ~AsciiAlphanumerics();
  114. }
  115. private:
  116. constexpr CharSet(uint64_t b0, uint64_t b1, uint64_t b2, uint64_t b3)
  117. : m_{b0, b1, b2, b3} {}
  118. static constexpr uint64_t RangeForWord(char lo, char hi, uint64_t word) {
  119. return OpenRangeFromZeroForWord(static_cast<unsigned char>(hi) + 1, word) &
  120. ~OpenRangeFromZeroForWord(static_cast<unsigned char>(lo), word);
  121. }
  122. // All the chars in the specified word of the range [0, upper).
  123. static constexpr uint64_t OpenRangeFromZeroForWord(uint64_t upper,
  124. uint64_t word) {
  125. return (upper <= 64 * word) ? 0
  126. : (upper >= 64 * (word + 1))
  127. ? ~static_cast<uint64_t>(0)
  128. : (~static_cast<uint64_t>(0) >> (64 - upper % 64));
  129. }
  130. static constexpr uint64_t CharMaskForWord(char x, uint64_t word) {
  131. return (static_cast<unsigned char>(x) / 64 == word)
  132. ? (static_cast<uint64_t>(1)
  133. << (static_cast<unsigned char>(x) % 64))
  134. : 0;
  135. }
  136. constexpr void SetChar(unsigned char c) {
  137. m_[c / 64] |= static_cast<uint64_t>(1) << (c % 64);
  138. }
  139. uint64_t m_[4];
  140. };
  141. } // namespace absl
  142. #endif // ABSL_STRINGS_CHARSET_H_