123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // Lempel–Ziv–Welch encoding/decoding
- //
- //===----------------------------------------------------------------------===//
- #ifndef SANITIZER_LZW_H
- #define SANITIZER_LZW_H
- #include "sanitizer_dense_map.h"
- namespace __sanitizer {
- using LzwCodeType = u32;
- template <class T, class ItIn, class ItOut>
- ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
- using Substring =
- detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
- // Sentinel value for substrings of len 1.
- static constexpr LzwCodeType kNoPrefix =
- Min(DenseMapInfo<Substring>::getEmptyKey().first,
- DenseMapInfo<Substring>::getTombstoneKey().first) -
- 1;
- DenseMap<Substring, LzwCodeType> prefix_to_code;
- {
- // Add all substring of len 1 as initial dictionary.
- InternalMmapVector<T> dict_len1;
- for (auto it = begin; it != end; ++it)
- if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
- dict_len1.push_back(*it);
- // Slightly helps with later delta encoding.
- Sort(dict_len1.data(), dict_len1.size());
- // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
- // just generate them.
- *out = dict_len1.size();
- ++out;
- for (uptr i = 0; i != dict_len1.size(); ++i) {
- // Remap after the Sort.
- prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
- *out = dict_len1[i];
- ++out;
- }
- CHECK_EQ(prefix_to_code.size(), dict_len1.size());
- }
- if (begin == end)
- return out;
- // Main LZW encoding loop.
- LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
- ++begin;
- for (auto it = begin; it != end; ++it) {
- // Extend match with the new item.
- auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
- if (ins.second) {
- // This is a new substring, but emit the code for the current match
- // (before extend). This allows LZW decoder to recover the dictionary.
- *out = match;
- ++out;
- // Reset the match to a single item, which must be already in the map.
- match = prefix_to_code.find({kNoPrefix, *it})->second;
- } else {
- // Already known, use as the current match.
- match = ins.first->second;
- }
- }
- *out = match;
- ++out;
- return out;
- }
- template <class T, class ItIn, class ItOut>
- ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
- if (begin == end)
- return out;
- // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
- InternalMmapVector<T> dict_len1(*begin);
- ++begin;
- if (begin == end)
- return out;
- for (auto& v : dict_len1) {
- v = *begin;
- ++begin;
- }
- // Substrings of len 2 and up. Indexes are shifted because [0,
- // dict_len1.size()) stored in dict_len1. Substings get here after being
- // emitted to the output, so we can use output position.
- InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
- code_to_substr;
- // Copies already emitted substrings into the output again.
- auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
- if (code < dict_len1.size()) {
- *out = dict_len1[code];
- ++out;
- return out;
- }
- const auto& s = code_to_substr[code - dict_len1.size()];
- for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
- return out;
- };
- // Returns lens of the substring with the given code.
- auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
- if (code < dict_len1.size())
- return 1;
- const auto& s = code_to_substr[code - dict_len1.size()];
- return s.second - s.first;
- };
- // Main LZW decoding loop.
- LzwCodeType prev_code = *begin;
- ++begin;
- out = copy(prev_code, out);
- for (auto it = begin; it != end; ++it) {
- LzwCodeType code = *it;
- auto start = out;
- if (code == dict_len1.size() + code_to_substr.size()) {
- // Special LZW case. The code is not in the dictionary yet. This is
- // possible only when the new substring is the same as previous one plus
- // the first item of the previous substring. We can emit that in two
- // steps.
- out = copy(prev_code, out);
- *out = *start;
- ++out;
- } else {
- out = copy(code, out);
- }
- // Every time encoded emits the code, it also creates substing of len + 1
- // including the first item of the just emmited substring. Do the same here.
- uptr len = code_to_len(prev_code);
- code_to_substr.push_back({start - len, start + 1});
- prev_code = code;
- }
- return out;
- }
- } // namespace __sanitizer
- #endif
|