StringRef.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614
  1. //===-- StringRef.cpp - Lightweight String References ---------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "llvm/ADT/StringRef.h"
  9. #include "llvm/ADT/APFloat.h"
  10. #include "llvm/ADT/APInt.h"
  11. #include "llvm/ADT/Hashing.h"
  12. #include "llvm/ADT/StringExtras.h"
  13. #include "llvm/ADT/edit_distance.h"
  14. #include "llvm/Support/Error.h"
  15. #include <bitset>
  16. using namespace llvm;
  17. // MSVC emits references to this into the translation units which reference it.
  18. #ifndef _MSC_VER
  19. constexpr size_t StringRef::npos;
  20. #endif
  21. // strncasecmp() is not available on non-POSIX systems, so define an
  22. // alternative function here.
  23. static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
  24. for (size_t I = 0; I < Length; ++I) {
  25. unsigned char LHC = toLower(LHS[I]);
  26. unsigned char RHC = toLower(RHS[I]);
  27. if (LHC != RHC)
  28. return LHC < RHC ? -1 : 1;
  29. }
  30. return 0;
  31. }
  32. int StringRef::compare_insensitive(StringRef RHS) const {
  33. if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
  34. return Res;
  35. if (Length == RHS.Length)
  36. return 0;
  37. return Length < RHS.Length ? -1 : 1;
  38. }
  39. bool StringRef::starts_with_insensitive(StringRef Prefix) const {
  40. return Length >= Prefix.Length &&
  41. ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
  42. }
  43. bool StringRef::ends_with_insensitive(StringRef Suffix) const {
  44. return Length >= Suffix.Length &&
  45. ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
  46. }
  47. size_t StringRef::find_insensitive(char C, size_t From) const {
  48. char L = toLower(C);
  49. return find_if([L](char D) { return toLower(D) == L; }, From);
  50. }
  51. /// compare_numeric - Compare strings, handle embedded numbers.
  52. int StringRef::compare_numeric(StringRef RHS) const {
  53. for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
  54. // Check for sequences of digits.
  55. if (isDigit(Data[I]) && isDigit(RHS.Data[I])) {
  56. // The longer sequence of numbers is considered larger.
  57. // This doesn't really handle prefixed zeros well.
  58. size_t J;
  59. for (J = I + 1; J != E + 1; ++J) {
  60. bool ld = J < Length && isDigit(Data[J]);
  61. bool rd = J < RHS.Length && isDigit(RHS.Data[J]);
  62. if (ld != rd)
  63. return rd ? -1 : 1;
  64. if (!rd)
  65. break;
  66. }
  67. // The two number sequences have the same length (J-I), just memcmp them.
  68. if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
  69. return Res < 0 ? -1 : 1;
  70. // Identical number sequences, continue search after the numbers.
  71. I = J - 1;
  72. continue;
  73. }
  74. if (Data[I] != RHS.Data[I])
  75. return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
  76. }
  77. if (Length == RHS.Length)
  78. return 0;
  79. return Length < RHS.Length ? -1 : 1;
  80. }
  81. // Compute the edit distance between the two given strings.
  82. unsigned StringRef::edit_distance(llvm::StringRef Other,
  83. bool AllowReplacements,
  84. unsigned MaxEditDistance) const {
  85. return llvm::ComputeEditDistance(ArrayRef(data(), size()),
  86. ArrayRef(Other.data(), Other.size()),
  87. AllowReplacements, MaxEditDistance);
  88. }
  89. unsigned llvm::StringRef::edit_distance_insensitive(
  90. StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const {
  91. return llvm::ComputeMappedEditDistance(
  92. ArrayRef(data(), size()), ArrayRef(Other.data(), Other.size()),
  93. llvm::toLower, AllowReplacements, MaxEditDistance);
  94. }
  95. //===----------------------------------------------------------------------===//
  96. // String Operations
  97. //===----------------------------------------------------------------------===//
  98. std::string StringRef::lower() const {
  99. return std::string(map_iterator(begin(), toLower),
  100. map_iterator(end(), toLower));
  101. }
  102. std::string StringRef::upper() const {
  103. return std::string(map_iterator(begin(), toUpper),
  104. map_iterator(end(), toUpper));
  105. }
  106. //===----------------------------------------------------------------------===//
  107. // String Searching
  108. //===----------------------------------------------------------------------===//
  109. /// find - Search for the first string \arg Str in the string.
  110. ///
  111. /// \return - The index of the first occurrence of \arg Str, or npos if not
  112. /// found.
  113. size_t StringRef::find(StringRef Str, size_t From) const {
  114. if (From > Length)
  115. return npos;
  116. const char *Start = Data + From;
  117. size_t Size = Length - From;
  118. const char *Needle = Str.data();
  119. size_t N = Str.size();
  120. if (N == 0)
  121. return From;
  122. if (Size < N)
  123. return npos;
  124. if (N == 1) {
  125. const char *Ptr = (const char *)::memchr(Start, Needle[0], Size);
  126. return Ptr == nullptr ? npos : Ptr - Data;
  127. }
  128. const char *Stop = Start + (Size - N + 1);
  129. if (N == 2) {
  130. // Provide a fast path for newline finding (CRLF case) in InclusionRewriter.
  131. // Not the most optimized strategy, but getting memcmp inlined should be
  132. // good enough.
  133. do {
  134. if (std::memcmp(Start, Needle, 2) == 0)
  135. return Start - Data;
  136. ++Start;
  137. } while (Start < Stop);
  138. return npos;
  139. }
  140. // For short haystacks or unsupported needles fall back to the naive algorithm
  141. if (Size < 16 || N > 255) {
  142. do {
  143. if (std::memcmp(Start, Needle, N) == 0)
  144. return Start - Data;
  145. ++Start;
  146. } while (Start < Stop);
  147. return npos;
  148. }
  149. // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
  150. uint8_t BadCharSkip[256];
  151. std::memset(BadCharSkip, N, 256);
  152. for (unsigned i = 0; i != N-1; ++i)
  153. BadCharSkip[(uint8_t)Str[i]] = N-1-i;
  154. do {
  155. uint8_t Last = Start[N - 1];
  156. if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1]))
  157. if (std::memcmp(Start, Needle, N - 1) == 0)
  158. return Start - Data;
  159. // Otherwise skip the appropriate number of bytes.
  160. Start += BadCharSkip[Last];
  161. } while (Start < Stop);
  162. return npos;
  163. }
  164. size_t StringRef::find_insensitive(StringRef Str, size_t From) const {
  165. StringRef This = substr(From);
  166. while (This.size() >= Str.size()) {
  167. if (This.startswith_insensitive(Str))
  168. return From;
  169. This = This.drop_front();
  170. ++From;
  171. }
  172. return npos;
  173. }
  174. size_t StringRef::rfind_insensitive(char C, size_t From) const {
  175. From = std::min(From, Length);
  176. size_t i = From;
  177. while (i != 0) {
  178. --i;
  179. if (toLower(Data[i]) == toLower(C))
  180. return i;
  181. }
  182. return npos;
  183. }
  184. /// rfind - Search for the last string \arg Str in the string.
  185. ///
  186. /// \return - The index of the last occurrence of \arg Str, or npos if not
  187. /// found.
  188. size_t StringRef::rfind(StringRef Str) const {
  189. return std::string_view(*this).rfind(Str);
  190. }
  191. size_t StringRef::rfind_insensitive(StringRef Str) const {
  192. size_t N = Str.size();
  193. if (N > Length)
  194. return npos;
  195. for (size_t i = Length - N + 1, e = 0; i != e;) {
  196. --i;
  197. if (substr(i, N).equals_insensitive(Str))
  198. return i;
  199. }
  200. return npos;
  201. }
  202. /// find_first_of - Find the first character in the string that is in \arg
  203. /// Chars, or npos if not found.
  204. ///
  205. /// Note: O(size() + Chars.size())
  206. StringRef::size_type StringRef::find_first_of(StringRef Chars,
  207. size_t From) const {
  208. std::bitset<1 << CHAR_BIT> CharBits;
  209. for (char C : Chars)
  210. CharBits.set((unsigned char)C);
  211. for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
  212. if (CharBits.test((unsigned char)Data[i]))
  213. return i;
  214. return npos;
  215. }
  216. /// find_first_not_of - Find the first character in the string that is not
  217. /// \arg C or npos if not found.
  218. StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
  219. return std::string_view(*this).find_first_not_of(C, From);
  220. }
  221. /// find_first_not_of - Find the first character in the string that is not
  222. /// in the string \arg Chars, or npos if not found.
  223. ///
  224. /// Note: O(size() + Chars.size())
  225. StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
  226. size_t From) const {
  227. std::bitset<1 << CHAR_BIT> CharBits;
  228. for (char C : Chars)
  229. CharBits.set((unsigned char)C);
  230. for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
  231. if (!CharBits.test((unsigned char)Data[i]))
  232. return i;
  233. return npos;
  234. }
  235. /// find_last_of - Find the last character in the string that is in \arg C,
  236. /// or npos if not found.
  237. ///
  238. /// Note: O(size() + Chars.size())
  239. StringRef::size_type StringRef::find_last_of(StringRef Chars,
  240. size_t From) const {
  241. std::bitset<1 << CHAR_BIT> CharBits;
  242. for (char C : Chars)
  243. CharBits.set((unsigned char)C);
  244. for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
  245. if (CharBits.test((unsigned char)Data[i]))
  246. return i;
  247. return npos;
  248. }
  249. /// find_last_not_of - Find the last character in the string that is not
  250. /// \arg C, or npos if not found.
  251. StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
  252. for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
  253. if (Data[i] != C)
  254. return i;
  255. return npos;
  256. }
  257. /// find_last_not_of - Find the last character in the string that is not in
  258. /// \arg Chars, or npos if not found.
  259. ///
  260. /// Note: O(size() + Chars.size())
  261. StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
  262. size_t From) const {
  263. std::bitset<1 << CHAR_BIT> CharBits;
  264. for (char C : Chars)
  265. CharBits.set((unsigned char)C);
  266. for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
  267. if (!CharBits.test((unsigned char)Data[i]))
  268. return i;
  269. return npos;
  270. }
  271. void StringRef::split(SmallVectorImpl<StringRef> &A,
  272. StringRef Separator, int MaxSplit,
  273. bool KeepEmpty) const {
  274. StringRef S = *this;
  275. // Count down from MaxSplit. When MaxSplit is -1, this will just split
  276. // "forever". This doesn't support splitting more than 2^31 times
  277. // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
  278. // but that seems unlikely to be useful.
  279. while (MaxSplit-- != 0) {
  280. size_t Idx = S.find(Separator);
  281. if (Idx == npos)
  282. break;
  283. // Push this split.
  284. if (KeepEmpty || Idx > 0)
  285. A.push_back(S.slice(0, Idx));
  286. // Jump forward.
  287. S = S.slice(Idx + Separator.size(), npos);
  288. }
  289. // Push the tail.
  290. if (KeepEmpty || !S.empty())
  291. A.push_back(S);
  292. }
  293. void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
  294. int MaxSplit, bool KeepEmpty) const {
  295. StringRef S = *this;
  296. // Count down from MaxSplit. When MaxSplit is -1, this will just split
  297. // "forever". This doesn't support splitting more than 2^31 times
  298. // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
  299. // but that seems unlikely to be useful.
  300. while (MaxSplit-- != 0) {
  301. size_t Idx = S.find(Separator);
  302. if (Idx == npos)
  303. break;
  304. // Push this split.
  305. if (KeepEmpty || Idx > 0)
  306. A.push_back(S.slice(0, Idx));
  307. // Jump forward.
  308. S = S.slice(Idx + 1, npos);
  309. }
  310. // Push the tail.
  311. if (KeepEmpty || !S.empty())
  312. A.push_back(S);
  313. }
  314. //===----------------------------------------------------------------------===//
  315. // Helpful Algorithms
  316. //===----------------------------------------------------------------------===//
  317. /// count - Return the number of non-overlapped occurrences of \arg Str in
  318. /// the string.
  319. size_t StringRef::count(StringRef Str) const {
  320. size_t Count = 0;
  321. size_t Pos = 0;
  322. size_t N = Str.size();
  323. // TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing
  324. // this to `Length + 1` which is more in-line with the function
  325. // description.
  326. if (!N)
  327. return 0;
  328. while ((Pos = find(Str, Pos)) != npos) {
  329. ++Count;
  330. Pos += N;
  331. }
  332. return Count;
  333. }
  334. static unsigned GetAutoSenseRadix(StringRef &Str) {
  335. if (Str.empty())
  336. return 10;
  337. if (Str.startswith("0x") || Str.startswith("0X")) {
  338. Str = Str.substr(2);
  339. return 16;
  340. }
  341. if (Str.startswith("0b") || Str.startswith("0B")) {
  342. Str = Str.substr(2);
  343. return 2;
  344. }
  345. if (Str.startswith("0o")) {
  346. Str = Str.substr(2);
  347. return 8;
  348. }
  349. if (Str[0] == '0' && Str.size() > 1 && isDigit(Str[1])) {
  350. Str = Str.substr(1);
  351. return 8;
  352. }
  353. return 10;
  354. }
  355. bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
  356. unsigned long long &Result) {
  357. // Autosense radix if not specified.
  358. if (Radix == 0)
  359. Radix = GetAutoSenseRadix(Str);
  360. // Empty strings (after the radix autosense) are invalid.
  361. if (Str.empty()) return true;
  362. // Parse all the bytes of the string given this radix. Watch for overflow.
  363. StringRef Str2 = Str;
  364. Result = 0;
  365. while (!Str2.empty()) {
  366. unsigned CharVal;
  367. if (Str2[0] >= '0' && Str2[0] <= '9')
  368. CharVal = Str2[0] - '0';
  369. else if (Str2[0] >= 'a' && Str2[0] <= 'z')
  370. CharVal = Str2[0] - 'a' + 10;
  371. else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
  372. CharVal = Str2[0] - 'A' + 10;
  373. else
  374. break;
  375. // If the parsed value is larger than the integer radix, we cannot
  376. // consume any more characters.
  377. if (CharVal >= Radix)
  378. break;
  379. // Add in this character.
  380. unsigned long long PrevResult = Result;
  381. Result = Result * Radix + CharVal;
  382. // Check for overflow by shifting back and seeing if bits were lost.
  383. if (Result / Radix < PrevResult)
  384. return true;
  385. Str2 = Str2.substr(1);
  386. }
  387. // We consider the operation a failure if no characters were consumed
  388. // successfully.
  389. if (Str.size() == Str2.size())
  390. return true;
  391. Str = Str2;
  392. return false;
  393. }
  394. bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
  395. long long &Result) {
  396. unsigned long long ULLVal;
  397. // Handle positive strings first.
  398. if (Str.empty() || Str.front() != '-') {
  399. if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
  400. // Check for value so large it overflows a signed value.
  401. (long long)ULLVal < 0)
  402. return true;
  403. Result = ULLVal;
  404. return false;
  405. }
  406. // Get the positive part of the value.
  407. StringRef Str2 = Str.drop_front(1);
  408. if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
  409. // Reject values so large they'd overflow as negative signed, but allow
  410. // "-0". This negates the unsigned so that the negative isn't undefined
  411. // on signed overflow.
  412. (long long)-ULLVal > 0)
  413. return true;
  414. Str = Str2;
  415. Result = -ULLVal;
  416. return false;
  417. }
  418. /// GetAsUnsignedInteger - Workhorse method that converts a integer character
  419. /// sequence of radix up to 36 to an unsigned long long value.
  420. bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
  421. unsigned long long &Result) {
  422. if (consumeUnsignedInteger(Str, Radix, Result))
  423. return true;
  424. // For getAsUnsignedInteger, we require the whole string to be consumed or
  425. // else we consider it a failure.
  426. return !Str.empty();
  427. }
  428. bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
  429. long long &Result) {
  430. if (consumeSignedInteger(Str, Radix, Result))
  431. return true;
  432. // For getAsSignedInteger, we require the whole string to be consumed or else
  433. // we consider it a failure.
  434. return !Str.empty();
  435. }
  436. bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
  437. StringRef Str = *this;
  438. // Autosense radix if not specified.
  439. if (Radix == 0)
  440. Radix = GetAutoSenseRadix(Str);
  441. assert(Radix > 1 && Radix <= 36);
  442. // Empty strings (after the radix autosense) are invalid.
  443. if (Str.empty()) return true;
  444. // Skip leading zeroes. This can be a significant improvement if
  445. // it means we don't need > 64 bits.
  446. while (!Str.empty() && Str.front() == '0')
  447. Str = Str.substr(1);
  448. // If it was nothing but zeroes....
  449. if (Str.empty()) {
  450. Result = APInt(64, 0);
  451. return false;
  452. }
  453. // (Over-)estimate the required number of bits.
  454. unsigned Log2Radix = 0;
  455. while ((1U << Log2Radix) < Radix) Log2Radix++;
  456. bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
  457. unsigned BitWidth = Log2Radix * Str.size();
  458. if (BitWidth < Result.getBitWidth())
  459. BitWidth = Result.getBitWidth(); // don't shrink the result
  460. else if (BitWidth > Result.getBitWidth())
  461. Result = Result.zext(BitWidth);
  462. APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
  463. if (!IsPowerOf2Radix) {
  464. // These must have the same bit-width as Result.
  465. RadixAP = APInt(BitWidth, Radix);
  466. CharAP = APInt(BitWidth, 0);
  467. }
  468. // Parse all the bytes of the string given this radix.
  469. Result = 0;
  470. while (!Str.empty()) {
  471. unsigned CharVal;
  472. if (Str[0] >= '0' && Str[0] <= '9')
  473. CharVal = Str[0]-'0';
  474. else if (Str[0] >= 'a' && Str[0] <= 'z')
  475. CharVal = Str[0]-'a'+10;
  476. else if (Str[0] >= 'A' && Str[0] <= 'Z')
  477. CharVal = Str[0]-'A'+10;
  478. else
  479. return true;
  480. // If the parsed value is larger than the integer radix, the string is
  481. // invalid.
  482. if (CharVal >= Radix)
  483. return true;
  484. // Add in this character.
  485. if (IsPowerOf2Radix) {
  486. Result <<= Log2Radix;
  487. Result |= CharVal;
  488. } else {
  489. Result *= RadixAP;
  490. CharAP = CharVal;
  491. Result += CharAP;
  492. }
  493. Str = Str.substr(1);
  494. }
  495. return false;
  496. }
  497. bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
  498. APFloat F(0.0);
  499. auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven);
  500. if (errorToBool(StatusOrErr.takeError()))
  501. return true;
  502. APFloat::opStatus Status = *StatusOrErr;
  503. if (Status != APFloat::opOK) {
  504. if (!AllowInexact || !(Status & APFloat::opInexact))
  505. return true;
  506. }
  507. Result = F.convertToDouble();
  508. return false;
  509. }
  510. // Implementation of StringRef hashing.
  511. hash_code llvm::hash_value(StringRef S) {
  512. return hash_combine_range(S.begin(), S.end());
  513. }
  514. unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) {
  515. assert(Val.data() != getEmptyKey().data() &&
  516. "Cannot hash the empty key!");
  517. assert(Val.data() != getTombstoneKey().data() &&
  518. "Cannot hash the tombstone key!");
  519. return (unsigned)(hash_value(Val));
  520. }