charset.h 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. // Copyright 2022 The Abseil Authors.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // https://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // -----------------------------------------------------------------------------
  16. // File: charset.h
  17. // -----------------------------------------------------------------------------
  18. //
  19. // This file contains absl::CharSet, a fast, bit-vector set of 8-bit unsigned
  20. // characters.
  21. //
  22. // Instances can be initialized as constexpr constants. For example:
  23. //
  24. // constexpr absl::CharSet kJustX = absl::CharSet::Char('x');
  25. // constexpr absl::CharSet kMySymbols = absl::CharSet("$@!");
  26. // constexpr absl::CharSet kLetters = absl::CharSet::Range('a', 'z');
  27. //
// Multiple instances can be combined, and the result is still a constexpr
// expression. For example:
  30. //
  31. // constexpr absl::CharSet kLettersAndNumbers =
  32. // absl::CharSet::Range('a', 'z') | absl::CharSet::Range('0', '9');
  33. //
  34. // Several pre-defined character classes are available that mirror the methods
  35. // from <cctype>. For example:
  36. //
  37. // constexpr absl::CharSet kLettersAndWhitespace =
  38. // absl::CharSet::AsciiAlphabet() | absl::CharSet::AsciiWhitespace();
  39. //
  40. // To check membership, use the .contains method, e.g.
  41. //
  42. // absl::CharSet hex_letters("abcdef");
  43. // hex_letters.contains('a'); // true
  44. // hex_letters.contains('g'); // false
  45. #ifndef ABSL_STRINGS_CHARSET_H_
  46. #define ABSL_STRINGS_CHARSET_H_
  47. #include <cstdint>
  48. #include "absl/base/config.h"
  49. #include "absl/strings/string_view.h"
  50. namespace absl {
  51. ABSL_NAMESPACE_BEGIN
  52. class CharSet {
  53. public:
  54. constexpr CharSet() : m_() {}
  55. // Initializes with a given string_view.
  56. constexpr explicit CharSet(absl::string_view str) : m_() {
  57. for (char c : str) {
  58. SetChar(static_cast<unsigned char>(c));
  59. }
  60. }
  61. constexpr bool contains(char c) const {
  62. return ((m_[static_cast<unsigned char>(c) / 64] >>
  63. (static_cast<unsigned char>(c) % 64)) &
  64. 0x1) == 0x1;
  65. }
  66. constexpr bool empty() const {
  67. for (uint64_t c : m_) {
  68. if (c != 0) return false;
  69. }
  70. return true;
  71. }
  72. // Containing only a single specified char.
  73. static constexpr CharSet Char(char x) {
  74. return CharSet(CharMaskForWord(x, 0), CharMaskForWord(x, 1),
  75. CharMaskForWord(x, 2), CharMaskForWord(x, 3));
  76. }
  77. // Containing all the chars in the closed interval [lo,hi].
  78. static constexpr CharSet Range(char lo, char hi) {
  79. return CharSet(RangeForWord(lo, hi, 0), RangeForWord(lo, hi, 1),
  80. RangeForWord(lo, hi, 2), RangeForWord(lo, hi, 3));
  81. }
  82. friend constexpr CharSet operator&(const CharSet& a, const CharSet& b) {
  83. return CharSet(a.m_[0] & b.m_[0], a.m_[1] & b.m_[1], a.m_[2] & b.m_[2],
  84. a.m_[3] & b.m_[3]);
  85. }
  86. friend constexpr CharSet operator|(const CharSet& a, const CharSet& b) {
  87. return CharSet(a.m_[0] | b.m_[0], a.m_[1] | b.m_[1], a.m_[2] | b.m_[2],
  88. a.m_[3] | b.m_[3]);
  89. }
  90. friend constexpr CharSet operator~(const CharSet& a) {
  91. return CharSet(~a.m_[0], ~a.m_[1], ~a.m_[2], ~a.m_[3]);
  92. }
  93. // Mirrors the char-classifying predicates in <cctype>.
  94. static constexpr CharSet AsciiUppercase() { return CharSet::Range('A', 'Z'); }
  95. static constexpr CharSet AsciiLowercase() { return CharSet::Range('a', 'z'); }
  96. static constexpr CharSet AsciiDigits() { return CharSet::Range('0', '9'); }
  97. static constexpr CharSet AsciiAlphabet() {
  98. return AsciiLowercase() | AsciiUppercase();
  99. }
  100. static constexpr CharSet AsciiAlphanumerics() {
  101. return AsciiDigits() | AsciiAlphabet();
  102. }
  103. static constexpr CharSet AsciiHexDigits() {
  104. return AsciiDigits() | CharSet::Range('A', 'F') | CharSet::Range('a', 'f');
  105. }
  106. static constexpr CharSet AsciiPrintable() {
  107. return CharSet::Range(0x20, 0x7e);
  108. }
  109. static constexpr CharSet AsciiWhitespace() { return CharSet("\t\n\v\f\r "); }
  110. static constexpr CharSet AsciiPunctuation() {
  111. return AsciiPrintable() & ~AsciiWhitespace() & ~AsciiAlphanumerics();
  112. }
  113. private:
  114. constexpr CharSet(uint64_t b0, uint64_t b1, uint64_t b2, uint64_t b3)
  115. : m_{b0, b1, b2, b3} {}
  116. static constexpr uint64_t RangeForWord(char lo, char hi, uint64_t word) {
  117. return OpenRangeFromZeroForWord(static_cast<unsigned char>(hi) + 1, word) &
  118. ~OpenRangeFromZeroForWord(static_cast<unsigned char>(lo), word);
  119. }
  120. // All the chars in the specified word of the range [0, upper).
  121. static constexpr uint64_t OpenRangeFromZeroForWord(uint64_t upper,
  122. uint64_t word) {
  123. return (upper <= 64 * word) ? 0
  124. : (upper >= 64 * (word + 1))
  125. ? ~static_cast<uint64_t>(0)
  126. : (~static_cast<uint64_t>(0) >> (64 - upper % 64));
  127. }
  128. static constexpr uint64_t CharMaskForWord(char x, uint64_t word) {
  129. return (static_cast<unsigned char>(x) / 64 == word)
  130. ? (static_cast<uint64_t>(1)
  131. << (static_cast<unsigned char>(x) % 64))
  132. : 0;
  133. }
  134. constexpr void SetChar(unsigned char c) {
  135. m_[c / 64] |= static_cast<uint64_t>(1) << (c % 64);
  136. }
  137. uint64_t m_[4];
  138. };
  139. ABSL_NAMESPACE_END
  140. } // namespace absl
  141. #endif // ABSL_STRINGS_CHARSET_H_