DJB.cpp 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. //===-- Support/DJB.cpp ---DJB Hash -----------------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file contains support for the DJ Bernstein hash function.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #include "llvm/Support/DJB.h"
  13. #include "llvm/ADT/ArrayRef.h"
  14. #include "llvm/Support/Compiler.h"
  15. #include "llvm/Support/ConvertUTF.h"
  16. #include "llvm/Support/Unicode.h"
  17. using namespace llvm;
  18. static UTF32 chopOneUTF32(StringRef &Buffer) {
  19. UTF32 C;
  20. const UTF8 *const Begin8Const =
  21. reinterpret_cast<const UTF8 *>(Buffer.begin());
  22. const UTF8 *Begin8 = Begin8Const;
  23. UTF32 *Begin32 = &C;
  24. // In lenient mode we will always end up with a "reasonable" value in C for
  25. // non-empty input.
  26. assert(!Buffer.empty());
  27. ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()),
  28. &Begin32, &C + 1, lenientConversion);
  29. Buffer = Buffer.drop_front(Begin8 - Begin8Const);
  30. return C;
  31. }
  32. static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
  33. const UTF32 *Begin32 = &C;
  34. UTF8 *Begin8 = Storage.begin();
  35. // The case-folded output should always be a valid unicode character, so use
  36. // strict mode here.
  37. ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
  38. Storage.end(), strictConversion);
  39. assert(CR == conversionOK && "Case folding produced invalid char?");
  40. (void)CR;
  41. return StringRef(reinterpret_cast<char *>(Storage.begin()),
  42. Begin8 - Storage.begin());
  43. }
  44. static UTF32 foldCharDwarf(UTF32 C) {
  45. // DWARF v5 addition to the unicode folding rules.
  46. // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
  47. // Above" into "i".
  48. if (C == 0x130 || C == 0x131)
  49. return 'i';
  50. return sys::unicode::foldCharSimple(C);
  51. }
  52. static Optional<uint32_t> fastCaseFoldingDjbHash(StringRef Buffer, uint32_t H) {
  53. bool AllASCII = true;
  54. for (unsigned char C : Buffer) {
  55. H = H * 33 + ('A' <= C && C <= 'Z' ? C - 'A' + 'a' : C);
  56. AllASCII &= C <= 0x7f;
  57. }
  58. if (AllASCII)
  59. return H;
  60. return None;
  61. }
  62. uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
  63. if (Optional<uint32_t> Result = fastCaseFoldingDjbHash(Buffer, H))
  64. return *Result;
  65. std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage;
  66. while (!Buffer.empty()) {
  67. UTF32 C = foldCharDwarf(chopOneUTF32(Buffer));
  68. StringRef Folded = toUTF8(C, Storage);
  69. H = djbHash(Folded, H);
  70. }
  71. return H;
  72. }