uniquecharstr.h 3.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. // © 2020 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // uniquecharstr.h
  4. // created: 2020sep01 Frank Yung-Fong Tang
  5. #ifndef __UNIQUECHARSTR_H__
  6. #define __UNIQUECHARSTR_H__
  7. #include "charstr.h"
  8. #include "uassert.h"
  9. #include "uhash.h"
  10. U_NAMESPACE_BEGIN
  11. /**
  12. * Stores NUL-terminated strings with duplicate elimination.
  13. * Checks for unique UTF-16 string pointers and converts to invariant characters.
  14. *
  15. * Intended to be stack-allocated. Add strings, get a unique number for each,
  16. * freeze the object, get a char * pointer for each string,
  17. * call orphanCharStrings() to capture the string storage, and let this object go out of scope.
  18. */
  19. class UniqueCharStrings {
  20. public:
  21. UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) {
  22. // Note: We hash on string contents but store stable char16_t * pointers.
  23. // If the strings are stored in resource bundles which should be built with
  24. // duplicate elimination, then we should be able to hash on just the pointer values.
  25. uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode);
  26. if (U_FAILURE(errorCode)) { return; }
  27. strings = new CharString();
  28. if (strings == nullptr) {
  29. errorCode = U_MEMORY_ALLOCATION_ERROR;
  30. }
  31. }
  32. ~UniqueCharStrings() {
  33. uhash_close(&map);
  34. delete strings;
  35. }
  36. /** Returns/orphans the CharString that contains all strings. */
  37. CharString *orphanCharStrings() {
  38. CharString *result = strings;
  39. strings = nullptr;
  40. return result;
  41. }
  42. /**
  43. * Adds a string and returns a unique number for it.
  44. * The string's buffer contents must not change, nor move around in memory,
  45. * while this UniqueCharStrings is in use.
  46. * The string contents must be NUL-terminated exactly at s.length().
  47. *
  48. * Best used with read-only-alias UnicodeString objects that point to
  49. * stable storage, such as strings returned by resource bundle functions.
  50. */
  51. int32_t add(const UnicodeString &s, UErrorCode &errorCode) {
  52. if (U_FAILURE(errorCode)) { return 0; }
  53. if (isFrozen) {
  54. errorCode = U_NO_WRITE_PERMISSION;
  55. return 0;
  56. }
  57. // The string points into the resource bundle.
  58. const char16_t *p = s.getBuffer();
  59. int32_t oldIndex = uhash_geti(&map, p);
  60. if (oldIndex != 0) { // found duplicate
  61. return oldIndex;
  62. }
  63. // Explicit NUL terminator for the previous string.
  64. // The strings object is also terminated with one implicit NUL.
  65. strings->append(0, errorCode);
  66. int32_t newIndex = strings->length();
  67. strings->appendInvariantChars(s, errorCode);
  68. uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode);
  69. return newIndex;
  70. }
  71. void freeze() { isFrozen = true; }
  72. /**
  73. * Returns a string pointer for its unique number, if this object is frozen.
  74. * Otherwise nullptr.
  75. */
  76. const char *get(int32_t i) const {
  77. U_ASSERT(isFrozen);
  78. return isFrozen && i > 0 ? strings->data() + i : nullptr;
  79. }
  80. private:
  81. UHashtable map;
  82. CharString *strings;
  83. bool isFrozen = false;
  84. };
  85. U_NAMESPACE_END
  86. #endif // __UNIQUECHARSTR_H__