sanitizer_lzw.h 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // Lempel–Ziv–Welch encoding/decoding
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #ifndef SANITIZER_LZW_H
  13. #define SANITIZER_LZW_H
  14. #include "sanitizer_dense_map.h"
  15. namespace __sanitizer {
  16. using LzwCodeType = u32;
  17. template <class T, class ItIn, class ItOut>
  18. ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
  19. using Substring =
  20. detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
  21. // Sentinel value for substrings of len 1.
  22. static constexpr LzwCodeType kNoPrefix =
  23. Min(DenseMapInfo<Substring>::getEmptyKey().first,
  24. DenseMapInfo<Substring>::getTombstoneKey().first) -
  25. 1;
  26. DenseMap<Substring, LzwCodeType> prefix_to_code;
  27. {
  28. // Add all substring of len 1 as initial dictionary.
  29. InternalMmapVector<T> dict_len1;
  30. for (auto it = begin; it != end; ++it)
  31. if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
  32. dict_len1.push_back(*it);
  33. // Slightly helps with later delta encoding.
  34. Sort(dict_len1.data(), dict_len1.size());
  35. // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
  36. // just generate them.
  37. *out = dict_len1.size();
  38. ++out;
  39. for (uptr i = 0; i != dict_len1.size(); ++i) {
  40. // Remap after the Sort.
  41. prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
  42. *out = dict_len1[i];
  43. ++out;
  44. }
  45. CHECK_EQ(prefix_to_code.size(), dict_len1.size());
  46. }
  47. if (begin == end)
  48. return out;
  49. // Main LZW encoding loop.
  50. LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
  51. ++begin;
  52. for (auto it = begin; it != end; ++it) {
  53. // Extend match with the new item.
  54. auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
  55. if (ins.second) {
  56. // This is a new substring, but emit the code for the current match
  57. // (before extend). This allows LZW decoder to recover the dictionary.
  58. *out = match;
  59. ++out;
  60. // Reset the match to a single item, which must be already in the map.
  61. match = prefix_to_code.find({kNoPrefix, *it})->second;
  62. } else {
  63. // Already known, use as the current match.
  64. match = ins.first->second;
  65. }
  66. }
  67. *out = match;
  68. ++out;
  69. return out;
  70. }
  71. template <class T, class ItIn, class ItOut>
  72. ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
  73. if (begin == end)
  74. return out;
  75. // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
  76. InternalMmapVector<T> dict_len1(*begin);
  77. ++begin;
  78. if (begin == end)
  79. return out;
  80. for (auto& v : dict_len1) {
  81. v = *begin;
  82. ++begin;
  83. }
  84. // Substrings of len 2 and up. Indexes are shifted because [0,
  85. // dict_len1.size()) stored in dict_len1. Substings get here after being
  86. // emitted to the output, so we can use output position.
  87. InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
  88. code_to_substr;
  89. // Copies already emitted substrings into the output again.
  90. auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
  91. if (code < dict_len1.size()) {
  92. *out = dict_len1[code];
  93. ++out;
  94. return out;
  95. }
  96. const auto& s = code_to_substr[code - dict_len1.size()];
  97. for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
  98. return out;
  99. };
  100. // Returns lens of the substring with the given code.
  101. auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
  102. if (code < dict_len1.size())
  103. return 1;
  104. const auto& s = code_to_substr[code - dict_len1.size()];
  105. return s.second - s.first;
  106. };
  107. // Main LZW decoding loop.
  108. LzwCodeType prev_code = *begin;
  109. ++begin;
  110. out = copy(prev_code, out);
  111. for (auto it = begin; it != end; ++it) {
  112. LzwCodeType code = *it;
  113. auto start = out;
  114. if (code == dict_len1.size() + code_to_substr.size()) {
  115. // Special LZW case. The code is not in the dictionary yet. This is
  116. // possible only when the new substring is the same as previous one plus
  117. // the first item of the previous substring. We can emit that in two
  118. // steps.
  119. out = copy(prev_code, out);
  120. *out = *start;
  121. ++out;
  122. } else {
  123. out = copy(code, out);
  124. }
  125. // Every time encoded emits the code, it also creates substing of len + 1
  126. // including the first item of the just emmited substring. Do the same here.
  127. uptr len = code_to_len(prev_code);
  128. code_to_substr.push_back({start - len, start + 1});
  129. prev_code = code;
  130. }
  131. return out;
  132. }
  133. } // namespace __sanitizer
  134. #endif