FuzzedDataProvider.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. //===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. // A single header library providing an utility class to break up an array of
  9. // bytes. Whenever run on the same input, provides the same output, as long as
  10. // its methods are called in the same order, with the same arguments.
  11. //===----------------------------------------------------------------------===//
  12. #ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
  13. #define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_
  14. #include <algorithm>
  15. #include <array>
  16. #include <climits>
  17. #include <cstddef>
  18. #include <cstdint>
  19. #include <cstring>
  20. #include <initializer_list>
  21. #include <limits>
  22. #include <string>
  23. #include <type_traits>
  24. #include <utility>
  25. #include <vector>
  26. // In addition to the comments below, the API is also briefly documented at
  27. // https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider
  28. class FuzzedDataProvider {
  29. public:
  30. // |data| is an array of length |size| that the FuzzedDataProvider wraps to
  31. // provide more granular access. |data| must outlive the FuzzedDataProvider.
  32. FuzzedDataProvider(const uint8_t *data, size_t size)
  33. : data_ptr_(data), remaining_bytes_(size) {}
  34. ~FuzzedDataProvider() = default;
  35. // See the implementation below (after the class definition) for more verbose
  36. // comments for each of the methods.
  37. // Methods returning std::vector of bytes. These are the most popular choice
  38. // when splitting fuzzing input into pieces, as every piece is put into a
  39. // separate buffer (i.e. ASan would catch any under-/overflow) and the memory
  40. // will be released automatically.
  41. template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes);
  42. template <typename T>
  43. std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0);
  44. template <typename T> std::vector<T> ConsumeRemainingBytes();
  45. // Methods returning strings. Use only when you need a std::string or a null
  46. // terminated C-string. Otherwise, prefer the methods returning std::vector.
  47. std::string ConsumeBytesAsString(size_t num_bytes);
  48. std::string ConsumeRandomLengthString(size_t max_length);
  49. std::string ConsumeRandomLengthString();
  50. std::string ConsumeRemainingBytesAsString();
  51. // Methods returning integer values.
  52. template <typename T> T ConsumeIntegral();
  53. template <typename T> T ConsumeIntegralInRange(T min, T max);
  54. // Methods returning floating point values.
  55. template <typename T> T ConsumeFloatingPoint();
  56. template <typename T> T ConsumeFloatingPointInRange(T min, T max);
  57. // 0 <= return value <= 1.
  58. template <typename T> T ConsumeProbability();
  59. bool ConsumeBool();
  60. // Returns a value chosen from the given enum.
  61. template <typename T> T ConsumeEnum();
  62. // Returns a value from the given array.
  63. template <typename T, size_t size> T PickValueInArray(const T (&array)[size]);
  64. template <typename T, size_t size>
  65. T PickValueInArray(const std::array<T, size> &array);
  66. template <typename T> T PickValueInArray(std::initializer_list<const T> list);
  67. // Writes data to the given destination and returns number of bytes written.
  68. size_t ConsumeData(void *destination, size_t num_bytes);
  69. // Reports the remaining bytes available for fuzzed input.
  70. size_t remaining_bytes() { return remaining_bytes_; }
  71. private:
  72. FuzzedDataProvider(const FuzzedDataProvider &) = delete;
  73. FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete;
  74. void CopyAndAdvance(void *destination, size_t num_bytes);
  75. void Advance(size_t num_bytes);
  76. template <typename T>
  77. std::vector<T> ConsumeBytes(size_t size, size_t num_bytes);
  78. template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value);
  79. const uint8_t *data_ptr_;
  80. size_t remaining_bytes_;
  81. };
  82. // Returns a std::vector containing |num_bytes| of input data. If fewer than
  83. // |num_bytes| of data remain, returns a shorter std::vector containing all
  84. // of the data that's left. Can be used with any byte sized type, such as
  85. // char, unsigned char, uint8_t, etc.
  86. template <typename T>
  87. std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t num_bytes) {
  88. num_bytes = std::min(num_bytes, remaining_bytes_);
  89. return ConsumeBytes<T>(num_bytes, num_bytes);
  90. }
  91. // Similar to |ConsumeBytes|, but also appends the terminator value at the end
  92. // of the resulting vector. Useful, when a mutable null-terminated C-string is
  93. // needed, for example. But that is a rare case. Better avoid it, if possible,
  94. // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods.
  95. template <typename T>
  96. std::vector<T> FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes,
  97. T terminator) {
  98. num_bytes = std::min(num_bytes, remaining_bytes_);
  99. std::vector<T> result = ConsumeBytes<T>(num_bytes + 1, num_bytes);
  100. result.back() = terminator;
  101. return result;
  102. }
  103. // Returns a std::vector containing all remaining bytes of the input data.
  104. template <typename T>
  105. std::vector<T> FuzzedDataProvider::ConsumeRemainingBytes() {
  106. return ConsumeBytes<T>(remaining_bytes_);
  107. }
  108. // Returns a std::string containing |num_bytes| of input data. Using this and
  109. // |.c_str()| on the resulting string is the best way to get an immutable
  110. // null-terminated C string. If fewer than |num_bytes| of data remain, returns
  111. // a shorter std::string containing all of the data that's left.
  112. inline std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) {
  113. static_assert(sizeof(std::string::value_type) == sizeof(uint8_t),
  114. "ConsumeBytesAsString cannot convert the data to a string.");
  115. num_bytes = std::min(num_bytes, remaining_bytes_);
  116. std::string result(
  117. reinterpret_cast<const std::string::value_type *>(data_ptr_), num_bytes);
  118. Advance(num_bytes);
  119. return result;
  120. }
  121. // Returns a std::string of length from 0 to |max_length|. When it runs out of
  122. // input data, returns what remains of the input. Designed to be more stable
  123. // with respect to a fuzzer inserting characters than just picking a random
  124. // length and then consuming that many bytes with |ConsumeBytes|.
  125. inline std::string
  126. FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) {
  127. // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\"
  128. // followed by anything else to the end of the string. As a result of this
  129. // logic, a fuzzer can insert characters into the string, and the string
  130. // will be lengthened to include those new characters, resulting in a more
  131. // stable fuzzer than picking the length of a string independently from
  132. // picking its contents.
  133. std::string result;
  134. // Reserve the anticipated capacity to prevent several reallocations.
  135. result.reserve(std::min(max_length, remaining_bytes_));
  136. for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) {
  137. char next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
  138. Advance(1);
  139. if (next == '\\' && remaining_bytes_ != 0) {
  140. next = ConvertUnsignedToSigned<char>(data_ptr_[0]);
  141. Advance(1);
  142. if (next != '\\')
  143. break;
  144. }
  145. result += next;
  146. }
  147. result.shrink_to_fit();
  148. return result;
  149. }
  150. // Returns a std::string of length from 0 to |remaining_bytes_|.
  151. inline std::string FuzzedDataProvider::ConsumeRandomLengthString() {
  152. return ConsumeRandomLengthString(remaining_bytes_);
  153. }
  154. // Returns a std::string containing all remaining bytes of the input data.
  155. // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string
  156. // object.
  157. inline std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() {
  158. return ConsumeBytesAsString(remaining_bytes_);
  159. }
  160. // Returns a number in the range [Type's min, Type's max]. The value might
  161. // not be uniformly distributed in the given range. If there's no input data
  162. // left, always returns |min|.
  163. template <typename T> T FuzzedDataProvider::ConsumeIntegral() {
  164. return ConsumeIntegralInRange(std::numeric_limits<T>::min(),
  165. std::numeric_limits<T>::max());
  166. }
  167. // Returns a number in the range [min, max] by consuming bytes from the
  168. // input data. The value might not be uniformly distributed in the given
  169. // range. If there's no input data left, always returns |min|. |min| must
  170. // be less than or equal to |max|.
  171. template <typename T>
  172. T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) {
  173. static_assert(std::is_integral<T>::value, "An integral type is required.");
  174. static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type.");
  175. if (min > max)
  176. abort();
  177. // Use the biggest type possible to hold the range and the result.
  178. uint64_t range = static_cast<uint64_t>(max) - static_cast<uint64_t>(min);
  179. uint64_t result = 0;
  180. size_t offset = 0;
  181. while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 &&
  182. remaining_bytes_ != 0) {
  183. // Pull bytes off the end of the seed data. Experimentally, this seems to
  184. // allow the fuzzer to more easily explore the input space. This makes
  185. // sense, since it works by modifying inputs that caused new code to run,
  186. // and this data is often used to encode length of data read by
  187. // |ConsumeBytes|. Separating out read lengths makes it easier modify the
  188. // contents of the data that is actually read.
  189. --remaining_bytes_;
  190. result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_];
  191. offset += CHAR_BIT;
  192. }
  193. // Avoid division by 0, in case |range + 1| results in overflow.
  194. if (range != std::numeric_limits<decltype(range)>::max())
  195. result = result % (range + 1);
  196. return static_cast<T>(static_cast<uint64_t>(min) + result);
  197. }
  198. // Returns a floating point value in the range [Type's lowest, Type's max] by
  199. // consuming bytes from the input data. If there's no input data left, always
  200. // returns approximately 0.
  201. template <typename T> T FuzzedDataProvider::ConsumeFloatingPoint() {
  202. return ConsumeFloatingPointInRange<T>(std::numeric_limits<T>::lowest(),
  203. std::numeric_limits<T>::max());
  204. }
  205. // Returns a floating point value in the given range by consuming bytes from
  206. // the input data. If there's no input data left, returns |min|. Note that
  207. // |min| must be less than or equal to |max|.
  208. template <typename T>
  209. T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) {
  210. if (min > max)
  211. abort();
  212. T range = .0;
  213. T result = min;
  214. constexpr T zero(.0);
  215. if (max > zero && min < zero && max > min + std::numeric_limits<T>::max()) {
  216. // The diff |max - min| would overflow the given floating point type. Use
  217. // the half of the diff as the range and consume a bool to decide whether
  218. // the result is in the first of the second part of the diff.
  219. range = (max / 2.0) - (min / 2.0);
  220. if (ConsumeBool()) {
  221. result += range;
  222. }
  223. } else {
  224. range = max - min;
  225. }
  226. return result + range * ConsumeProbability<T>();
  227. }
  228. // Returns a floating point number in the range [0.0, 1.0]. If there's no
  229. // input data left, always returns 0.
  230. template <typename T> T FuzzedDataProvider::ConsumeProbability() {
  231. static_assert(std::is_floating_point<T>::value,
  232. "A floating point type is required.");
  233. // Use different integral types for different floating point types in order
  234. // to provide better density of the resulting values.
  235. using IntegralType =
  236. typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t,
  237. uint64_t>::type;
  238. T result = static_cast<T>(ConsumeIntegral<IntegralType>());
  239. result /= static_cast<T>(std::numeric_limits<IntegralType>::max());
  240. return result;
  241. }
  242. // Reads one byte and returns a bool, or false when no data remains.
  243. inline bool FuzzedDataProvider::ConsumeBool() {
  244. return 1 & ConsumeIntegral<uint8_t>();
  245. }
  246. // Returns an enum value. The enum must start at 0 and be contiguous. It must
  247. // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as:
  248. // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue };
  249. template <typename T> T FuzzedDataProvider::ConsumeEnum() {
  250. static_assert(std::is_enum<T>::value, "|T| must be an enum type.");
  251. return static_cast<T>(
  252. ConsumeIntegralInRange<uint32_t>(0, static_cast<uint32_t>(T::kMaxValue)));
  253. }
  254. // Returns a copy of the value selected from the given fixed-size |array|.
  255. template <typename T, size_t size>
  256. T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) {
  257. static_assert(size > 0, "The array must be non empty.");
  258. return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
  259. }
  260. template <typename T, size_t size>
  261. T FuzzedDataProvider::PickValueInArray(const std::array<T, size> &array) {
  262. static_assert(size > 0, "The array must be non empty.");
  263. return array[ConsumeIntegralInRange<size_t>(0, size - 1)];
  264. }
  265. template <typename T>
  266. T FuzzedDataProvider::PickValueInArray(std::initializer_list<const T> list) {
  267. // TODO(Dor1s): switch to static_assert once C++14 is allowed.
  268. if (!list.size())
  269. abort();
  270. return *(list.begin() + ConsumeIntegralInRange<size_t>(0, list.size() - 1));
  271. }
  272. // Writes |num_bytes| of input data to the given destination pointer. If there
  273. // is not enough data left, writes all remaining bytes. Return value is the
  274. // number of bytes written.
  275. // In general, it's better to avoid using this function, but it may be useful
  276. // in cases when it's necessary to fill a certain buffer or object with
  277. // fuzzing data.
  278. inline size_t FuzzedDataProvider::ConsumeData(void *destination,
  279. size_t num_bytes) {
  280. num_bytes = std::min(num_bytes, remaining_bytes_);
  281. CopyAndAdvance(destination, num_bytes);
  282. return num_bytes;
  283. }
  284. // Private methods.
  285. inline void FuzzedDataProvider::CopyAndAdvance(void *destination,
  286. size_t num_bytes) {
  287. std::memcpy(destination, data_ptr_, num_bytes);
  288. Advance(num_bytes);
  289. }
  290. inline void FuzzedDataProvider::Advance(size_t num_bytes) {
  291. if (num_bytes > remaining_bytes_)
  292. abort();
  293. data_ptr_ += num_bytes;
  294. remaining_bytes_ -= num_bytes;
  295. }
  296. template <typename T>
  297. std::vector<T> FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) {
  298. static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type.");
  299. // The point of using the size-based constructor below is to increase the
  300. // odds of having a vector object with capacity being equal to the length.
  301. // That part is always implementation specific, but at least both libc++ and
  302. // libstdc++ allocate the requested number of bytes in that constructor,
  303. // which seems to be a natural choice for other implementations as well.
  304. // To increase the odds even more, we also call |shrink_to_fit| below.
  305. std::vector<T> result(size);
  306. if (size == 0) {
  307. if (num_bytes != 0)
  308. abort();
  309. return result;
  310. }
  311. CopyAndAdvance(result.data(), num_bytes);
  312. // Even though |shrink_to_fit| is also implementation specific, we expect it
  313. // to provide an additional assurance in case vector's constructor allocated
  314. // a buffer which is larger than the actual amount of data we put inside it.
  315. result.shrink_to_fit();
  316. return result;
  317. }
  318. template <typename TS, typename TU>
  319. TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) {
  320. static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types.");
  321. static_assert(!std::numeric_limits<TU>::is_signed,
  322. "Source type must be unsigned.");
  323. // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream.
  324. if (std::numeric_limits<TS>::is_modulo)
  325. return static_cast<TS>(value);
  326. // Avoid using implementation-defined unsigned to signed conversions.
  327. // To learn more, see https://stackoverflow.com/questions/13150449.
  328. if (value <= std::numeric_limits<TS>::max()) {
  329. return static_cast<TS>(value);
  330. } else {
  331. constexpr auto TS_min = std::numeric_limits<TS>::min();
  332. return TS_min + static_cast<TS>(value - TS_min);
  333. }
  334. }
  335. #endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_