comptrie_impl.h 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. #pragma once
  2. #include <util/stream/output.h>
  3. #ifndef COMPTRIE_DATA_CHECK
  4. #define COMPTRIE_DATA_CHECK 1
  5. #endif
  6. // NCompactTrie
  7. namespace NCompactTrie {
  8. const char MT_FINAL = '\x80';
  9. const char MT_NEXT = '\x40';
  10. const char MT_SIZEMASK = '\x07';
  11. const size_t MT_LEFTSHIFT = 3;
  12. const size_t MT_RIGHTSHIFT = 0;
  13. Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len);
  14. size_t MeasureOffset(size_t offset);
  15. size_t PackOffset(char* buffer, size_t offset);
  16. static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os);
  17. Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label);
  18. template <class T>
  19. inline static size_t ExtraBits() {
  20. return (sizeof(T) - 1) * 8;
  21. }
  22. static inline bool IsEpsilonLink(const char flags) {
  23. return !(flags & (MT_FINAL | MT_NEXT));
  24. }
  25. static inline void TraverseEpsilon(const char*& datapos) {
  26. const char flags = *datapos;
  27. if (!IsEpsilonLink(flags)) {
  28. return;
  29. }
  30. const size_t offsetlength = flags & MT_SIZEMASK;
  31. const size_t offset = UnpackOffset(datapos + 1, offsetlength);
  32. Y_ASSERT(offset);
  33. datapos += offset;
  34. }
  35. static inline size_t LeftOffsetLen(const char flags) {
  36. return (flags >> MT_LEFTSHIFT) & MT_SIZEMASK;
  37. }
  38. static inline size_t RightOffsetLen(const char flags) {
  39. return flags & MT_SIZEMASK;
  40. }
  41. void ShowProgress(size_t n); // just print dots
  42. }
  43. namespace NCompTriePrivate {
  44. template <typename TChar>
  45. struct TStringForChar {
  46. };
  47. template <>
  48. struct TStringForChar<char> {
  49. typedef TString TResult;
  50. };
  51. template <>
  52. struct TStringForChar<wchar16> {
  53. typedef TUtf16String TResult;
  54. };
  55. template <>
  56. struct TStringForChar<wchar32> {
  57. typedef TUtf32String TResult;
  58. };
  59. }
  60. namespace NCompTriePrivate {
  61. struct TCmp {
  62. template <class T>
  63. inline bool operator()(const T& l, const T& r) {
  64. return (unsigned char)(l.Label[0]) < (unsigned char)(r.Label[0]);
  65. }
  66. template <class T>
  67. inline bool operator()(const T& l, char r) {
  68. return (unsigned char)(l.Label[0]) < (unsigned char)r;
  69. }
  70. };
  71. }
  72. namespace NCompactTrie {
  73. static inline ui64 ArcSaveOffset(size_t offset, IOutputStream& os) {
  74. using namespace NCompactTrie;
  75. if (!offset)
  76. return 0;
  77. char buf[16];
  78. size_t len = PackOffset(buf, offset);
  79. os.Write(buf, len);
  80. return len;
  81. }
  82. // Unpack the offset to the next node. The encoding scheme can store offsets
  83. // up to 7 bytes; whether they fit into size_t is another issue.
  84. Y_FORCE_INLINE size_t UnpackOffset(const char* p, size_t len) {
  85. size_t result = 0;
  86. while (len--)
  87. result = ((result << 8) | (*(p++) & 0xFF));
  88. return result;
  89. }
  90. // Auxiliary function: consumes one character from the input. Advances the data pointer
  91. // to the position immediately preceding the value for the link just traversed (if any);
  92. // returns flags associated with the link. If no arc with the required label is present,
  93. // zeroes the data pointer.
  94. Y_FORCE_INLINE char LeapByte(const char*& datapos, const char* dataend, char label) {
  95. while (datapos < dataend) {
  96. size_t offsetlength, offset;
  97. const char* startpos = datapos;
  98. char flags = *(datapos++);
  99. if (IsEpsilonLink(flags)) {
  100. // Epsilon link - jump to the specified offset without further checks.
  101. // These links are created during minimization: original uncompressed
  102. // tree does not need them. (If we find a way to package 3 offset lengths
  103. // into 1 byte, we could get rid of them; but it looks like they do no harm.
  104. Y_ASSERT(datapos < dataend);
  105. offsetlength = flags & MT_SIZEMASK;
  106. offset = UnpackOffset(datapos, offsetlength);
  107. if (!offset)
  108. break;
  109. datapos = startpos + offset;
  110. continue;
  111. }
  112. char ch = *(datapos++);
  113. // Left branch
  114. offsetlength = LeftOffsetLen(flags);
  115. if ((unsigned char)label < (unsigned char)ch) {
  116. offset = UnpackOffset(datapos, offsetlength);
  117. if (!offset)
  118. break;
  119. datapos = startpos + offset;
  120. continue;
  121. }
  122. datapos += offsetlength;
  123. // Right branch
  124. offsetlength = RightOffsetLen(flags);
  125. if ((unsigned char)label > (unsigned char)ch) {
  126. offset = UnpackOffset(datapos, offsetlength);
  127. if (!offset)
  128. break;
  129. datapos = startpos + offset;
  130. continue;
  131. }
  132. // Got a match; return position right before the contents for the label
  133. datapos += offsetlength;
  134. return flags;
  135. }
  136. // if we got here, we're past the dataend - bail out ASAP
  137. datapos = nullptr;
  138. return 0;
  139. }
  140. // Auxiliary function: consumes one (multibyte) symbol from the input.
  141. // Advances the data pointer to the root of the subtrie beginning after the symbol,
  142. // zeroes it if this subtrie is empty.
  143. // If there is a value associated with the symbol, makes the value pointer point to it,
  144. // otherwise sets it to nullptr.
  145. // Returns true if the symbol was succesfully found in the trie, false otherwise.
  146. template <typename TSymbol, class TPacker>
  147. Y_FORCE_INLINE bool Advance(const char*& datapos, const char* const dataend, const char*& value,
  148. TSymbol label, TPacker packer) {
  149. Y_ASSERT(datapos < dataend);
  150. char flags = MT_NEXT;
  151. for (int i = (int)ExtraBits<TSymbol>(); i >= 0; i -= 8) {
  152. flags = LeapByte(datapos, dataend, (char)(label >> i));
  153. if (!datapos) {
  154. return false; // no such arc
  155. }
  156. value = nullptr;
  157. Y_ASSERT(datapos <= dataend);
  158. if ((flags & MT_FINAL)) {
  159. value = datapos;
  160. datapos += packer.SkipLeaf(datapos);
  161. }
  162. if (!(flags & MT_NEXT)) {
  163. if (i == 0) {
  164. datapos = nullptr;
  165. return true;
  166. }
  167. return false; // no further way
  168. }
  169. TraverseEpsilon(datapos);
  170. if (i == 0) { // last byte, and got a match
  171. return true;
  172. }
  173. }
  174. return false;
  175. }
  176. }