// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
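///
/// A usage sketch (the buffer contents follow from the documented
/// behavior above):
/// \code
///     uint8_t buf[4];
///     write32le(buf, UINT32_C(0x11223344)); // buf = 44 33 22 11
///     uint32_t be = read32be(buf);          // be == 0x44332211
///     uint32_t le = read32le(buf);          // le == 0x11223344
/// \endcode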
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
# include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
# include <intrin.h>
#endif

///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
// GCC >= 4.8 and Clang
# define byteswap16(num) __builtin_bswap16(num)
# define byteswap32(num) __builtin_bswap32(num)
# define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
// glibc, uClibc, dietlibc
# include <byteswap.h>
# ifdef HAVE_BSWAP_16
#  define byteswap16(num) bswap_16(num)
# endif
# ifdef HAVE_BSWAP_32
#  define byteswap32(num) bswap_32(num)
# endif
# ifdef HAVE_BSWAP_64
#  define byteswap64(num) bswap_64(num)
# endif

#elif defined(HAVE_SYS_ENDIAN_H)
// *BSDs and Darwin
# include <sys/endian.h>
# define byteswap16(num) bswap16(num)
# define byteswap32(num) bswap32(num)
# define byteswap64(num) bswap64(num)

#elif defined(HAVE_SYS_BYTEORDER_H)
// Solaris
# include <sys/byteorder.h>
# ifdef BSWAP_16
#  define byteswap16(num) BSWAP_16(num)
# endif
# ifdef BSWAP_32
#  define byteswap32(num) BSWAP_32(num)
# endif
# ifdef BSWAP_64
#  define byteswap64(num) BSWAP_64(num)
# endif
# ifdef BE_16
#  define conv16be(num) BE_16(num)
# endif
# ifdef BE_32
#  define conv32be(num) BE_32(num)
# endif
# ifdef BE_64
#  define conv64be(num) BE_64(num)
# endif
# ifdef LE_16
#  define conv16le(num) LE_16(num)
# endif
# ifdef LE_32
#  define conv32le(num) LE_32(num)
# endif
# ifdef LE_64
#  define conv64le(num) LE_64(num)
# endif
#endif

#ifndef byteswap16
# define byteswap16(n) (uint16_t)( \
          (((n) & 0x00FFU) << 8) \
        | (((n) & 0xFF00U) >> 8) \
    )
#endif

#ifndef byteswap32
# define byteswap32(n) (uint32_t)( \
          (((n) & UINT32_C(0x000000FF)) << 24) \
        | (((n) & UINT32_C(0x0000FF00)) << 8) \
        | (((n) & UINT32_C(0x00FF0000)) >> 8) \
        | (((n) & UINT32_C(0xFF000000)) >> 24) \
    )
#endif

#ifndef byteswap64
# define byteswap64(n) (uint64_t)( \
          (((n) & UINT64_C(0x00000000000000FF)) << 56) \
        | (((n) & UINT64_C(0x000000000000FF00)) << 40) \
        | (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
        | (((n) & UINT64_C(0x00000000FF000000)) << 8) \
        | (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
        | (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
        | (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
        | (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
    )
#endif
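
// A quick sanity check of the byte swapping macros above (a sketch, not
// part of the API; it is meant as a standalone program whose translation
// unit includes this header):
/*
#include <assert.h>
#include <stdint.h>

int main(void)
{
        assert(byteswap16(0x1122) == 0x2211);
        assert(byteswap32(UINT32_C(0x11223344)) == UINT32_C(0x44332211));
        assert(byteswap64(UINT64_C(0x1122334455667788))
                        == UINT64_C(0x8877665544332211));
        return 0;
}
*/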

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
# ifndef conv16be
#  define conv16be(num) ((uint16_t)(num))
# endif
# ifndef conv32be
#  define conv32be(num) ((uint32_t)(num))
# endif
# ifndef conv64be
#  define conv64be(num) ((uint64_t)(num))
# endif
# ifndef conv16le
#  define conv16le(num) byteswap16(num)
# endif
# ifndef conv32le
#  define conv32le(num) byteswap32(num)
# endif
# ifndef conv64le
#  define conv64le(num) byteswap64(num)
# endif
#else
# ifndef conv16be
#  define conv16be(num) byteswap16(num)
# endif
# ifndef conv32be
#  define conv32be(num) byteswap32(num)
# endif
# ifndef conv64be
#  define conv64be(num) byteswap64(num)
# endif
# ifndef conv16le
#  define conv16le(num) ((uint16_t)(num))
# endif
# ifndef conv32le
#  define conv32le(num) ((uint32_t)(num))
# endif
# ifndef conv64le
#  define conv64le(num) ((uint64_t)(num))
# endif
#endif
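
// The conversions are involutions: on either byte order, one of the pair
// is an identity and the other is a byte swap, so applying the same
// conversion twice gives back the original value. A standalone sanity
// check (a sketch; it assumes this header has been included):
/*
#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t x = UINT32_C(0x11223344);
        assert(conv32be(conv32be(x)) == x);
        assert(conv32le(conv32le(x)) == x);
        return 0;
}
*/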

////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load, producing code
// identical to the memcpy() method. In other cases Clang and GCC produce
// either the same or better code with memcpy(). For example, Clang 9 on
// x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x up to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//   (1) The value may be constructed on the stack using four 8-bit loads,
//       four 8-bit stores to the stack, and finally one 32-bit load from
//       the stack.
//
//   (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or when -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few cases with a minor difference: the memcpy()
// version was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
        return (uint32_t)b[0]
                | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
        uint16_t v;
        memcpy(&v, b, sizeof(v));
        return v;
}

uint32_t bytes32(const uint8_t *b)
{
        return (uint32_t)b[0]
                | ((uint32_t)b[1] << 8)
                | ((uint32_t)b[2] << 16)
                | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
        uint32_t v;
        memcpy(&v, b, sizeof(v));
        return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
        b[0] = (uint8_t)v;
        b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
        memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
        b[0] = (uint8_t)v;
        b[1] = (uint8_t)(v >> 8);
        b[2] = (uint8_t)(v >> 16);
        b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
        memcpy(b, &v, sizeof(v));
}
*/

#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        return *(const uint16_t *)buf;
#else
        uint16_t num;
        memcpy(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        return *(const uint32_t *)buf;
#else
        uint32_t num;
        memcpy(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        return *(const uint64_t *)buf;
#else
        uint64_t num;
        memcpy(&num, buf, sizeof(num));
        return num;
#endif
}

static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint16_t *)buf = num;
#else
        memcpy(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint32_t *)buf = num;
#else
        memcpy(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint64_t *)buf = num;
#else
        memcpy(buf, &num, sizeof(num));
#endif
        return;
}

static inline uint16_t
read16be(const uint8_t *buf)
{
        uint16_t num = read16ne(buf);
        return conv16be(num);
}

static inline uint16_t
read16le(const uint8_t *buf)
{
        uint16_t num = read16ne(buf);
        return conv16le(num);
}

static inline uint32_t
read32be(const uint8_t *buf)
{
        uint32_t num = read32ne(buf);
        return conv32be(num);
}

static inline uint32_t
read32le(const uint8_t *buf)
{
        uint32_t num = read32ne(buf);
        return conv32le(num);
}

static inline uint64_t
read64be(const uint8_t *buf)
{
        uint64_t num = read64ne(buf);
        return conv64be(num);
}

static inline uint64_t
read64le(const uint8_t *buf)
{
        uint64_t num = read64ne(buf);
        return conv64le(num);
}

// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))

#else

#ifdef WORDS_BIGENDIAN
# define read16ne read16be
# define read32ne read32be
# define read64ne read64be
# define write16ne write16be
# define write32ne write32be
# define write64ne write64be
#else
# define read16ne read16le
# define read32ne read32le
# define read64ne read64le
# define write16ne write16le
# define write32ne write32le
# define write64ne write64le
#endif

static inline uint16_t
read16be(const uint8_t *buf)
{
        uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
        return num;
}

static inline uint16_t
read16le(const uint8_t *buf)
{
        uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
        return num;
}

static inline uint32_t
read32be(const uint8_t *buf)
{
        uint32_t num = (uint32_t)buf[0] << 24;
        num |= (uint32_t)buf[1] << 16;
        num |= (uint32_t)buf[2] << 8;
        num |= (uint32_t)buf[3];
        return num;
}

static inline uint32_t
read32le(const uint8_t *buf)
{
        uint32_t num = (uint32_t)buf[0];
        num |= (uint32_t)buf[1] << 8;
        num |= (uint32_t)buf[2] << 16;
        num |= (uint32_t)buf[3] << 24;
        return num;
}

static inline uint64_t
read64be(const uint8_t *buf)
{
        uint64_t num = (uint64_t)buf[0] << 56;
        num |= (uint64_t)buf[1] << 48;
        num |= (uint64_t)buf[2] << 40;
        num |= (uint64_t)buf[3] << 32;
        num |= (uint64_t)buf[4] << 24;
        num |= (uint64_t)buf[5] << 16;
        num |= (uint64_t)buf[6] << 8;
        num |= (uint64_t)buf[7];
        return num;
}

static inline uint64_t
read64le(const uint8_t *buf)
{
        uint64_t num = (uint64_t)buf[0];
        num |= (uint64_t)buf[1] << 8;
        num |= (uint64_t)buf[2] << 16;
        num |= (uint64_t)buf[3] << 24;
        num |= (uint64_t)buf[4] << 32;
        num |= (uint64_t)buf[5] << 40;
        num |= (uint64_t)buf[6] << 48;
        num |= (uint64_t)buf[7] << 56;
        return num;
}

static inline void
write16be(uint8_t *buf, uint16_t num)
{
        buf[0] = (uint8_t)(num >> 8);
        buf[1] = (uint8_t)num;
        return;
}

static inline void
write16le(uint8_t *buf, uint16_t num)
{
        buf[0] = (uint8_t)num;
        buf[1] = (uint8_t)(num >> 8);
        return;
}

static inline void
write32be(uint8_t *buf, uint32_t num)
{
        buf[0] = (uint8_t)(num >> 24);
        buf[1] = (uint8_t)(num >> 16);
        buf[2] = (uint8_t)(num >> 8);
        buf[3] = (uint8_t)num;
        return;
}

static inline void
write32le(uint8_t *buf, uint32_t num)
{
        buf[0] = (uint8_t)num;
        buf[1] = (uint8_t)(num >> 8);
        buf[2] = (uint8_t)(num >> 16);
        buf[3] = (uint8_t)(num >> 24);
        return;
}

static inline void
write64be(uint8_t *buf, uint64_t num)
{
        buf[0] = (uint8_t)(num >> 56);
        buf[1] = (uint8_t)(num >> 48);
        buf[2] = (uint8_t)(num >> 40);
        buf[3] = (uint8_t)(num >> 32);
        buf[4] = (uint8_t)(num >> 24);
        buf[5] = (uint8_t)(num >> 16);
        buf[6] = (uint8_t)(num >> 8);
        buf[7] = (uint8_t)num;
        return;
}

static inline void
write64le(uint8_t *buf, uint64_t num)
{
        buf[0] = (uint8_t)num;
        buf[1] = (uint8_t)(num >> 8);
        buf[2] = (uint8_t)(num >> 16);
        buf[3] = (uint8_t)(num >> 24);
        buf[4] = (uint8_t)(num >> 32);
        buf[5] = (uint8_t)(num >> 40);
        buf[6] = (uint8_t)(num >> 48);
        buf[7] = (uint8_t)(num >> 56);
        return;
}

#endif
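
// Either way, the writeXXbe()/writeXXle() names above store the same bytes
// on any byte order; in the macro variant a constant argument lets the
// compiler do the byte swap at compile time. A sketch (the value is
// arbitrary):
/*
        uint8_t header[4];
        write32be(header, UINT32_C(0x11223344));
        // header now holds the bytes 11 22 33 44.
*/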

//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
# define tuklib_memcpy_aligned(dest, src, size) \
        memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
# define tuklib_memcpy_aligned(dest, src, size) \
        memcpy(dest, src, size)
# ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#  define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
# endif
#endif

static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
                || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
        return *(const uint16_t *)buf;
#else
        uint16_t num;
        tuklib_memcpy_aligned(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
                || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
        return *(const uint32_t *)buf;
#else
        uint32_t num;
        tuklib_memcpy_aligned(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
                || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
        return *(const uint64_t *)buf;
#else
        uint64_t num;
        tuklib_memcpy_aligned(&num, buf, sizeof(num));
        return num;
#endif
}

static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint16_t *)buf = num;
#else
        tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint32_t *)buf = num;
#else
        tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint64_t *)buf = num;
#else
        tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
        return;
}

static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
        uint16_t num = aligned_read16ne(buf);
        return conv16be(num);
}

static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
        uint16_t num = aligned_read16ne(buf);
        return conv16le(num);
}

static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
        uint32_t num = aligned_read32ne(buf);
        return conv32be(num);
}

static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
        uint32_t num = aligned_read32ne(buf);
        return conv32le(num);
}

static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
        uint64_t num = aligned_read64ne(buf);
        return conv64be(num);
}

static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
        uint64_t num = aligned_read64ne(buf);
        return conv64le(num);
}

// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
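
// A usage sketch for the aligned readers: summing 32-bit little endian
// words from a buffer whose alignment the caller guarantees (e.g. memory
// from malloc()). The function sum32le() is illustrative only, not part
// of this header:
/*
#include <stddef.h>

static uint32_t
sum32le(const uint8_t *buf, size_t size)
{
        // Requires: buf is 4-byte aligned and size is a multiple of 4.
        uint32_t sum = 0;
        for (size_t i = 0; i < size; i += 4)
                sum += aligned_read32le(buf + i);

        return sum;
}
*/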

////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
        // Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
        return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
        // GCC >= 3.4 has __builtin_clz(), which gives good results on
        // multiple architectures. On x86, __builtin_clz() ^ 31U becomes
        // either plain BSR (so the XOR gets optimized away) or LZCNT and
        // XOR (if -march indicates that SSE4a instructions are supported).
        return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        uint32_t i;
        __asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
        return i;

#elif defined(_MSC_VER)
        unsigned long i;
        _BitScanReverse(&i, n);
        return i;

#else
        uint32_t i = 31;

        if ((n & 0xFFFF0000) == 0) {
                n <<= 16;
                i = 15;
        }

        if ((n & 0xFF000000) == 0) {
                n <<= 8;
                i -= 8;
        }

        if ((n & 0xF0000000) == 0) {
                n <<= 4;
                i -= 4;
        }

        if ((n & 0xC0000000) == 0) {
                n <<= 2;
                i -= 2;
        }

        if ((n & 0x80000000) == 0)
                --i;

        return i;
#endif
}

static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
        return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
        return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        uint32_t i;
        __asm__("bsrl %1, %0\n\t"
                "xorl $31, %0"
                : "=r" (i) : "rm" (n));
        return i;

#elif defined(_MSC_VER)
        unsigned long i;
        _BitScanReverse(&i, n);
        return i ^ 31U;

#else
        uint32_t i = 0;

        if ((n & 0xFFFF0000) == 0) {
                n <<= 16;
                i = 16;
        }

        if ((n & 0xFF000000) == 0) {
                n <<= 8;
                i += 8;
        }

        if ((n & 0xF0000000) == 0) {
                n <<= 4;
                i += 4;
        }

        if ((n & 0xC0000000) == 0) {
                n <<= 2;
                i += 2;
        }

        if ((n & 0x80000000) == 0)
                ++i;

        return i;
#endif
}

static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
        return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
        return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        uint32_t i;
        __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
        return i;

#elif defined(_MSC_VER)
        unsigned long i;
        _BitScanForward(&i, n);
        return i;

#else
        uint32_t i = 0;

        if ((n & 0x0000FFFF) == 0) {
                n >>= 16;
                i = 16;
        }

        if ((n & 0x000000FF) == 0) {
                n >>= 8;
                i += 8;
        }

        if ((n & 0x0000000F) == 0) {
                n >>= 4;
                i += 4;
        }

        if ((n & 0x00000003) == 0) {
                n >>= 2;
                i += 2;
        }

        if ((n & 0x00000001) == 0)
                ++i;

        return i;
#endif
}

#define bsf32 ctz32
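
// Standalone sanity checks for the bit scan operations (a sketch; it
// assumes this header has been included, and n must be non-zero as
// documented at the top of this file):
/*
#include <assert.h>
#include <stdint.h>

int main(void)
{
        assert(bsr32(1) == 0 && bsr32(UINT32_C(0x80000000)) == 31);
        assert(clz32(UINT32_C(0x00FF0000)) == 8);
        assert(ctz32(8) == 3 && bsf32(8) == 3);
        // For any non-zero n: clz32(n) == 31 - bsr32(n), and the number
        // of bits needed to represent n is bsr32(n) + 1.
        return 0;
}
*/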

#endif