// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: byteswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
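///
/// A minimal usage sketch (the local buffer `p` is hypothetical and not
/// part of this header):
///
///     uint8_t p[4];
///     write32be(p, 0x11223344);   // stores bytes 11 22 33 44
///     uint32_t v = read32le(p);   // reads them back as 0x44332211
///     uint32_t top = bsr32(v);    // 30, the index of the highest set bit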
//
// Authors:    Lasse Collin
//             Joachim Henke
//
///////////////////////////////////////////////////////////////////////////////
#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
# include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
# include <intrin.h>
#endif
///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
// GCC >= 4.8 and Clang
# define byteswap16(num) __builtin_bswap16(num)
# define byteswap32(num) __builtin_bswap32(num)
# define byteswap64(num) __builtin_bswap64(num)

#elif defined(HAVE_BYTESWAP_H)
// glibc, uClibc, dietlibc
# include <byteswap.h>
# ifdef HAVE_BSWAP_16
#  define byteswap16(num) bswap_16(num)
# endif
# ifdef HAVE_BSWAP_32
#  define byteswap32(num) bswap_32(num)
# endif
# ifdef HAVE_BSWAP_64
#  define byteswap64(num) bswap_64(num)
# endif

#elif defined(HAVE_SYS_ENDIAN_H)
// *BSDs and Darwin
# include <sys/endian.h>
# ifdef __OpenBSD__
#  define byteswap16(num) swap16(num)
#  define byteswap32(num) swap32(num)
#  define byteswap64(num) swap64(num)
# else
#  define byteswap16(num) bswap16(num)
#  define byteswap32(num) bswap32(num)
#  define byteswap64(num) bswap64(num)
# endif

#elif defined(HAVE_SYS_BYTEORDER_H)
// Solaris
# include <sys/byteorder.h>
# ifdef BSWAP_16
#  define byteswap16(num) BSWAP_16(num)
# endif
# ifdef BSWAP_32
#  define byteswap32(num) BSWAP_32(num)
# endif
# ifdef BSWAP_64
#  define byteswap64(num) BSWAP_64(num)
# endif
# ifdef BE_16
#  define conv16be(num) BE_16(num)
# endif
# ifdef BE_32
#  define conv32be(num) BE_32(num)
# endif
# ifdef BE_64
#  define conv64be(num) BE_64(num)
# endif
# ifdef LE_16
#  define conv16le(num) LE_16(num)
# endif
# ifdef LE_32
#  define conv32le(num) LE_32(num)
# endif
# ifdef LE_64
#  define conv64le(num) LE_64(num)
# endif
#endif
#ifndef byteswap16
# define byteswap16(n) (uint16_t)( \
          (((n) & 0x00FFU) << 8) \
        | (((n) & 0xFF00U) >> 8) \
        )
#endif

#ifndef byteswap32
# define byteswap32(n) (uint32_t)( \
          (((n) & UINT32_C(0x000000FF)) << 24) \
        | (((n) & UINT32_C(0x0000FF00)) << 8) \
        | (((n) & UINT32_C(0x00FF0000)) >> 8) \
        | (((n) & UINT32_C(0xFF000000)) >> 24) \
        )
#endif

#ifndef byteswap64
# define byteswap64(n) (uint64_t)( \
          (((n) & UINT64_C(0x00000000000000FF)) << 56) \
        | (((n) & UINT64_C(0x000000000000FF00)) << 40) \
        | (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
        | (((n) & UINT64_C(0x00000000FF000000)) << 8) \
        | (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
        | (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
        | (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
        | (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
        )
#endif
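// Whichever implementation was selected above, the result is the same:
// for example, byteswap32(UINT32_C(0x11223344)) evaluates to 0x44332211.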
// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
# ifndef conv16be
#  define conv16be(num) ((uint16_t)(num))
# endif
# ifndef conv32be
#  define conv32be(num) ((uint32_t)(num))
# endif
# ifndef conv64be
#  define conv64be(num) ((uint64_t)(num))
# endif
# ifndef conv16le
#  define conv16le(num) byteswap16(num)
# endif
# ifndef conv32le
#  define conv32le(num) byteswap32(num)
# endif
# ifndef conv64le
#  define conv64le(num) byteswap64(num)
# endif
#else
# ifndef conv16be
#  define conv16be(num) byteswap16(num)
# endif
# ifndef conv32be
#  define conv32be(num) byteswap32(num)
# endif
# ifndef conv64be
#  define conv64be(num) byteswap64(num)
# endif
# ifndef conv16le
#  define conv16le(num) ((uint16_t)(num))
# endif
# ifndef conv32le
#  define conv32le(num) ((uint32_t)(num))
# endif
# ifndef conv64le
#  define conv64le(num) ((uint64_t)(num))
# endif
#endif
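// For instance, on a little endian target conv32be(0x11223344) byteswaps
// the value to 0x44332211 while conv32le() reduces to a plain cast, and
// vice versa on a big endian target.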
////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load, producing code
// identical to the memcpy() method. In other cases Clang and GCC produce
// either the same or better code with memcpy(). For example, Clang 9 on
// x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on the stack using four 8-bit loads,
//         four 8-bit stores to the stack, and finally one 32-bit load
//         from the stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only in the case when
// GCC assumes that unaligned access is not supported or -mstrict-align
// or -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
        return (uint32_t)b[0]
                | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
        uint16_t v;
        memcpy(&v, b, sizeof(v));
        return v;
}

uint32_t bytes32(const uint8_t *b)
{
        return (uint32_t)b[0]
                | ((uint32_t)b[1] << 8)
                | ((uint32_t)b[2] << 16)
                | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
        uint32_t v;
        memcpy(&v, b, sizeof(v));
        return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
        b[0] = (uint8_t)v;
        b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
        memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
        b[0] = (uint8_t)v;
        b[1] = (uint8_t)(v >> 8);
        b[2] = (uint8_t)(v >> 16);
        b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
        memcpy(b, &v, sizeof(v));
}
*/
#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        return *(const uint16_t *)buf;
#else
        uint16_t num;
        memcpy(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        return *(const uint32_t *)buf;
#else
        uint32_t num;
        memcpy(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        return *(const uint64_t *)buf;
#else
        uint64_t num;
        memcpy(&num, buf, sizeof(num));
        return num;
#endif
}

static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint16_t *)buf = num;
#else
        memcpy(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint32_t *)buf = num;
#else
        memcpy(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint64_t *)buf = num;
#else
        memcpy(buf, &num, sizeof(num));
#endif
        return;
}
static inline uint16_t
read16be(const uint8_t *buf)
{
        uint16_t num = read16ne(buf);
        return conv16be(num);
}

static inline uint16_t
read16le(const uint8_t *buf)
{
        uint16_t num = read16ne(buf);
        return conv16le(num);
}

static inline uint32_t
read32be(const uint8_t *buf)
{
        uint32_t num = read32ne(buf);
        return conv32be(num);
}

static inline uint32_t
read32le(const uint8_t *buf)
{
        uint32_t num = read32ne(buf);
        return conv32le(num);
}

static inline uint64_t
read64be(const uint8_t *buf)
{
        uint64_t num = read64ne(buf);
        return conv64be(num);
}

static inline uint64_t
read64le(const uint8_t *buf)
{
        uint64_t num = read64ne(buf);
        return conv64le(num);
}
// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
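// For instance, on a little endian target write32be(buf, 0x12345678)
// expands so that the byte swap of the constant folds at compile time,
// leaving only a plain 32-bit store of 0x78563412.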
#else

#ifdef WORDS_BIGENDIAN
# define read16ne read16be
# define read32ne read32be
# define read64ne read64be
# define write16ne write16be
# define write32ne write32be
# define write64ne write64be
#else
# define read16ne read16le
# define read32ne read32le
# define read64ne read64le
# define write16ne write16le
# define write32ne write32le
# define write64ne write64le
#endif
static inline uint16_t
read16be(const uint8_t *buf)
{
        uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
        return num;
}

static inline uint16_t
read16le(const uint8_t *buf)
{
        uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
        return num;
}

static inline uint32_t
read32be(const uint8_t *buf)
{
        uint32_t num = (uint32_t)buf[0] << 24;
        num |= (uint32_t)buf[1] << 16;
        num |= (uint32_t)buf[2] << 8;
        num |= (uint32_t)buf[3];
        return num;
}

static inline uint32_t
read32le(const uint8_t *buf)
{
        uint32_t num = (uint32_t)buf[0];
        num |= (uint32_t)buf[1] << 8;
        num |= (uint32_t)buf[2] << 16;
        num |= (uint32_t)buf[3] << 24;
        return num;
}

static inline uint64_t
read64be(const uint8_t *buf)
{
        uint64_t num = (uint64_t)buf[0] << 56;
        num |= (uint64_t)buf[1] << 48;
        num |= (uint64_t)buf[2] << 40;
        num |= (uint64_t)buf[3] << 32;
        num |= (uint64_t)buf[4] << 24;
        num |= (uint64_t)buf[5] << 16;
        num |= (uint64_t)buf[6] << 8;
        num |= (uint64_t)buf[7];
        return num;
}

static inline uint64_t
read64le(const uint8_t *buf)
{
        uint64_t num = (uint64_t)buf[0];
        num |= (uint64_t)buf[1] << 8;
        num |= (uint64_t)buf[2] << 16;
        num |= (uint64_t)buf[3] << 24;
        num |= (uint64_t)buf[4] << 32;
        num |= (uint64_t)buf[5] << 40;
        num |= (uint64_t)buf[6] << 48;
        num |= (uint64_t)buf[7] << 56;
        return num;
}

static inline void
write16be(uint8_t *buf, uint16_t num)
{
        buf[0] = (uint8_t)(num >> 8);
        buf[1] = (uint8_t)num;
        return;
}

static inline void
write16le(uint8_t *buf, uint16_t num)
{
        buf[0] = (uint8_t)num;
        buf[1] = (uint8_t)(num >> 8);
        return;
}

static inline void
write32be(uint8_t *buf, uint32_t num)
{
        buf[0] = (uint8_t)(num >> 24);
        buf[1] = (uint8_t)(num >> 16);
        buf[2] = (uint8_t)(num >> 8);
        buf[3] = (uint8_t)num;
        return;
}

static inline void
write32le(uint8_t *buf, uint32_t num)
{
        buf[0] = (uint8_t)num;
        buf[1] = (uint8_t)(num >> 8);
        buf[2] = (uint8_t)(num >> 16);
        buf[3] = (uint8_t)(num >> 24);
        return;
}

static inline void
write64be(uint8_t *buf, uint64_t num)
{
        buf[0] = (uint8_t)(num >> 56);
        buf[1] = (uint8_t)(num >> 48);
        buf[2] = (uint8_t)(num >> 40);
        buf[3] = (uint8_t)(num >> 32);
        buf[4] = (uint8_t)(num >> 24);
        buf[5] = (uint8_t)(num >> 16);
        buf[6] = (uint8_t)(num >> 8);
        buf[7] = (uint8_t)num;
        return;
}

static inline void
write64le(uint8_t *buf, uint64_t num)
{
        buf[0] = (uint8_t)num;
        buf[1] = (uint8_t)(num >> 8);
        buf[2] = (uint8_t)(num >> 16);
        buf[3] = (uint8_t)(num >> 24);
        buf[4] = (uint8_t)(num >> 32);
        buf[5] = (uint8_t)(num >> 40);
        buf[6] = (uint8_t)(num >> 48);
        buf[7] = (uint8_t)(num >> 56);
        return;
}

#endif
//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned, but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
# define tuklib_memcpy_aligned(dest, src, size) \
        memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
# define tuklib_memcpy_aligned(dest, src, size) \
        memcpy(dest, src, size)
# ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#  define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
# endif
#endif
static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
                || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
        return *(const uint16_t *)buf;
#else
        uint16_t num;
        tuklib_memcpy_aligned(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
                || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
        return *(const uint32_t *)buf;
#else
        uint32_t num;
        tuklib_memcpy_aligned(&num, buf, sizeof(num));
        return num;
#endif
}

static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
                || defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
        return *(const uint64_t *)buf;
#else
        uint64_t num;
        tuklib_memcpy_aligned(&num, buf, sizeof(num));
        return num;
#endif
}

static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint16_t *)buf = num;
#else
        tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint32_t *)buf = num;
#else
        tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
        return;
}

static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
        *(uint64_t *)buf = num;
#else
        tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
        return;
}
static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
        uint16_t num = aligned_read16ne(buf);
        return conv16be(num);
}

static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
        uint16_t num = aligned_read16ne(buf);
        return conv16le(num);
}

static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
        uint32_t num = aligned_read32ne(buf);
        return conv32be(num);
}

static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
        uint32_t num = aligned_read32ne(buf);
        return conv32le(num);
}

static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
        uint64_t num = aligned_read64ne(buf);
        return conv64be(num);
}

static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
        uint64_t num = aligned_read64ne(buf);
        return conv64le(num);
}

// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))
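// A hedged usage sketch: if a pointer p is known to be 4-byte aligned
// (e.g. it came from an aligned allocation), aligned_read32be(p) may
// compile down to a single aligned 32-bit load plus a byte swap on
// little endian targets, instead of the slower unaligned code path.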
////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
        // Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
        return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
        // GCC >= 3.4 has __builtin_clz(), which gives good results on
        // multiple architectures. On x86, __builtin_clz() ^ 31U becomes
        // either plain BSR (so the XOR gets optimized away) or LZCNT and
        // XOR (if -march indicates that SSE4a instructions are supported).
        return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        uint32_t i;
        __asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
        return i;

#elif defined(_MSC_VER)
        unsigned long i;
        _BitScanReverse(&i, n);
        return i;

#else
        // Generic fallback: narrow down the position of the highest
        // set bit with a binary search.
        uint32_t i = 31;

        if ((n & 0xFFFF0000) == 0) {
                n <<= 16;
                i = 15;
        }

        if ((n & 0xFF000000) == 0) {
                n <<= 8;
                i -= 8;
        }

        if ((n & 0xF0000000) == 0) {
                n <<= 4;
                i -= 4;
        }

        if ((n & 0xC0000000) == 0) {
                n <<= 2;
                i -= 2;
        }

        if ((n & 0x80000000) == 0)
                --i;

        return i;
#endif
}
static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
        return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
        return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        uint32_t i;
        __asm__("bsrl %1, %0\n\t"
                "xorl $31, %0"
                : "=r" (i) : "rm" (n));
        return i;

#elif defined(_MSC_VER)
        unsigned long i;
        _BitScanReverse(&i, n);
        return i ^ 31U;

#else
        // Generic fallback: count the leading zeros with a binary search.
        uint32_t i = 0;

        if ((n & 0xFFFF0000) == 0) {
                n <<= 16;
                i = 16;
        }

        if ((n & 0xFF000000) == 0) {
                n <<= 8;
                i += 8;
        }

        if ((n & 0xF0000000) == 0) {
                n <<= 4;
                i += 4;
        }

        if ((n & 0xC0000000) == 0) {
                n <<= 2;
                i += 2;
        }

        if ((n & 0x80000000) == 0)
                ++i;

        return i;
#endif
}
static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
        return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
        return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        uint32_t i;
        __asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
        return i;

#elif defined(_MSC_VER)
        unsigned long i;
        _BitScanForward(&i, n);
        return i;

#else
        // Generic fallback: count the trailing zeros with a binary search.
        uint32_t i = 0;

        if ((n & 0x0000FFFF) == 0) {
                n >>= 16;
                i = 16;
        }

        if ((n & 0x000000FF) == 0) {
                n >>= 8;
                i += 8;
        }

        if ((n & 0x0000000F) == 0) {
                n >>= 4;
                i += 4;
        }

        if ((n & 0x00000003) == 0) {
                n >>= 2;
                i += 2;
        }

        if ((n & 0x00000001) == 0)
                ++i;

        return i;
#endif
}
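// Example values for illustration: for n = 0x00012000,
// bsr32(n) == 16, clz32(n) == 15, and ctz32(n) == bsf32(n) == 13.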
#define bsf32 ctz32

#endif