turbob64_.h

size_t _tb64xdec( const unsigned char *in, size_t inlen, unsigned char *out);
size_t tb64memcpy(const unsigned char *in, size_t inlen, unsigned char *out); // testing only

#define PREFETCH(_ip_,_i_,_rw_) __builtin_prefetch(_ip_+(_i_),_rw_)

#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define BSWAP32(a) a
#define BSWAP64(a) a
#else
#define BSWAP32(a) bswap32(a)
#define BSWAP64(a) bswap64(a)
#endif
#ifdef NB64CHECK
#define CHECK0(a)
#define CHECK1(a)
#else
#define CHECK0(a) a
  #ifdef B64CHECK
#define CHECK1(a) a
  #else
#define CHECK1(a)
  #endif
#endif
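// CHECK0(a): expands to its argument unless NB64CHECK is defined (all checking disabled).
// CHECK1(a): expands to its argument only when B64CHECK is defined (extra/strict checking).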
//--------------------- Encoding ----------------------------------------------------------
extern unsigned char tb64lutse[];

#define SU32(_u_) (tb64lutse[(_u_>> 8) & 0x3f] << 24 |\
                   tb64lutse[(_u_>>14) & 0x3f] << 16 |\
                   tb64lutse[(_u_>>20) & 0x3f] <<  8 |\
                   tb64lutse[(_u_>>26) & 0x3f])

#define ETAIL()\
  unsigned _l = (in+inlen) - ip;\
  if(_l == 3) { unsigned _u = ip[0]<<24 | ip[1]<<16 | ip[2]<<8; stou32(op, SU32(_u)); op += 4; ip += 3; }\
  else if(_l) { *op++ = tb64lutse[(ip[0]>>2)&0x3f];\
    if(_l == 2) *op++ = tb64lutse[(ip[0] & 0x3) << 4 | (ip[1] & 0xf0) >> 4],\
                *op++ = tb64lutse[(ip[1] & 0xf) << 2];\
    else        *op++ = tb64lutse[(ip[0] & 0x3) << 4], *op++ = '=';\
    *op++ = '=';\
  }
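// ETAIL(): encodes the final 1..3 input bytes left after the main loop; a 3-byte remainder
// produces 4 output characters, a 1- or 2-byte remainder is '='-padded up to 4 characters.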
extern const unsigned short tb64lutxe[];

#define XU32(_u_) (tb64lutxe[(_u_ >> 8) & 0xfff] << 16 |\
                   tb64lutxe[ _u_ >> 20])

#define EXTAIL() for(; op < (out+outlen)-4; op += 4, ip += 3) { unsigned _u = BSWAP32(ctou32(ip)); stou32(op, XU32(_u)); } ETAIL()
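// XU32(): each 12-bit lookup into tb64lutxe yields two base64 characters, so one 32-bit load
// covers 3 input bytes -> 4 output characters. EXTAIL() runs this scalar step for the bytes
// typically left over by a wider (SIMD) main loop, then finishes with ETAIL().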
//--------------------- Decoding ----------------------------------------------------------
extern const unsigned tb64lutxd0[];
extern const unsigned tb64lutxd1[];
extern const unsigned tb64lutxd2[];
extern const unsigned tb64lutxd3[];

#define DU32(_u_) (tb64lutxd0[(unsigned char)(_u_    )] |\
                   tb64lutxd1[(unsigned char)(_u_>> 8)] |\
                   tb64lutxd2[(unsigned char)(_u_>>16)] |\
                   tb64lutxd3[                _u_>>24  ] )
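// DU32(): decodes 4 base64 characters (one per input byte) into 3 output bytes by OR-ing four
// per-position lookup tables; invalid characters appear to map to all-ones entries, which is
// what the `cu == -1` test in _tb64xd below relies on to reject bad input.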
#if 0   // scalar decoder without error detection (disabled)
static ALWAYS_INLINE size_t _tb64xd(const unsigned char *in, size_t inlen, unsigned char *out) {
  const unsigned char *ip = in;
  unsigned char       *op = out;
  for(; ip < (in+inlen)-4; ip += 4, op += 3) { unsigned u = ctou32(ip); u = DU32(u); stou32(op, u); }
  unsigned u = 0, l = (in+inlen) - ip;
  if(l == 4)                                    // last 4 bytes: strip '=' padding
    if(ip[3] == '=') { l = 3;
      if(ip[2] == '=') { l = 2;
        if(ip[1] == '=') l = 1;
      }
    }
  unsigned char *up = (unsigned char *)&u;
  switch(l) {
    case 4: u = ctou32(ip); u = DU32(u);                                   *op++ = up[0]; *op++ = up[1]; *op++ = up[2]; break; // 4->3 bytes
    case 3: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]] | tb64lutxd2[ip[2]]; *op++ = up[0]; *op++ = up[1];                break; // 3->2 bytes
    case 2: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]];                     *op++ = up[0];                               break; // 2->1 byte
    case 1: u = tb64lutxd0[ip[0]];                                         *op++ = up[0];                               break; // 1->1 byte
  }
  return op - out;
}
#else   // scalar decoder with error detection (cu accumulates decoded words; checked on return)
static ALWAYS_INLINE size_t _tb64xd(const unsigned char *in, size_t inlen, unsigned char *out) {
  const unsigned char *ip = in;
  unsigned char       *op = out;
  unsigned             cu = 0;
  for(; ip < (in+inlen)-4; ip += 4, op += 3) { unsigned u = ctou32(ip); u = DU32(u); stou32(op, u); cu |= u; }
  unsigned u = 0, l = (in+inlen) - ip;
  if(l == 4)                                    // last 4 bytes: strip '=' padding
    if(ip[3] == '=') { l = 3;
      if(ip[2] == '=') { l = 2;
        if(ip[1] == '=') l = 1;
      }
    }
  unsigned char *up = (unsigned char *)&u;
  switch(l) {
    case 4: u = ctou32(ip); u = DU32(u);                                   *op++ = up[0]; *op++ = up[1]; *op++ = up[2]; cu |= u; break; // 4->3 bytes
    case 3: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]] | tb64lutxd2[ip[2]]; *op++ = up[0]; *op++ = up[1];                cu |= u; break; // 3->2 bytes
    case 2: u = tb64lutxd0[ip[0]] | tb64lutxd1[ip[1]];                     *op++ = up[0];                               cu |= u; break; // 2->1 byte
    case 1: u = tb64lutxd0[ip[0]];                                         *op++ = up[0];                               cu |= u; break; // 1->1 byte
  }
  return (cu == -1) ? 0 : (op - out);           // 0 signals invalid input
}
#endif
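/* Usage sketch (illustrative only; the example function and buffer sizing are assumptions, not
   part of this header): decode a padded base64 string with the scalar kernel. Note the main loop
   stores 32 bits per 3 decoded bytes, so the output buffer needs slack beyond 3*inlen/4. */
#if 0
static size_t example_decode(void) {
  static const unsigned char b64[] = "aGVsbG8=";     // "hello"
  unsigned char dst[16];                             // 8 chars -> 5 bytes, plus write slack
  size_t n = _tb64xd(b64, 8, dst);                   // n == 5 on success, 0 on invalid input
  return n;
}
#endif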
//--------------------------- sse -----------------------------------------------------------------
#if defined(__SSSE3__)
#include <tmmintrin.h>

#define MM_PACK8TO6(v, cpv) {\
  const __m128i merge_ab_and_bc = _mm_maddubs_epi16(v, _mm_set1_epi32(0x01400140)); /* dec_reshuffle: https://arxiv.org/abs/1704.00605 p.17 */\
  v = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));\
  v = _mm_shuffle_epi8(v, cpv);\
}
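// MM_PACK8TO6(): packs the four 6-bit values held one per byte in each 32-bit lane into 3 output
// bytes (maddubs/madd merge, then cpv shuffles the packed bytes into place), per the paper cited above.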
#define MM_MAP8TO6(iv, shifted, delta_asso, delta_values, ov) { /* map 8-bit ASCII to 6-bit values */\
  shifted = _mm_srli_epi32(iv, 3);\
  const __m128i delta_hash = _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, iv), shifted);\
  ov = _mm_add_epi8(_mm_shuffle_epi8(delta_values, delta_hash), iv);\
}

#define MM_B64CHK(iv, shifted, check_asso, check_values, vx) {\
  const __m128i check_hash = _mm_avg_epu8( _mm_shuffle_epi8(check_asso, iv), shifted);\
  const __m128i chk        = _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), iv);\
  vx = _mm_or_si128(vx, chk);\
}
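// MM_B64CHK(): validity-check companion to MM_MAP8TO6; it ORs a per-byte check value into vx so
// that vx can be tested after the loop to reject inputs containing non-base64 characters.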
static ALWAYS_INLINE __m128i mm_map6to8(const __m128i v) {    // map 6-bit values to base64 ASCII characters
  const __m128i offsets = _mm_set_epi8( 0, 0,-16,-19, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 71, 65);
  __m128i vidx = _mm_subs_epu8(v, _mm_set1_epi8(51));
  vidx = _mm_sub_epi8(vidx, _mm_cmpgt_epi8(v, _mm_set1_epi8(25)));
  return _mm_add_epi8(v, _mm_shuffle_epi8(offsets, vidx));
}

static ALWAYS_INLINE __m128i mm_unpack6to8(__m128i v) {       // split each (pre-shuffled) 3-byte group into four 6-bit values, one per byte
  __m128i va = _mm_mulhi_epu16(_mm_and_si128(v, _mm_set1_epi32(0x0fc0fc00)), _mm_set1_epi32(0x04000040));
  __m128i vb = _mm_mullo_epi16(_mm_and_si128(v, _mm_set1_epi32(0x003f03f0)), _mm_set1_epi32(0x01000010));
  return _mm_or_si128(va, vb);
}
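/* Illustrative SSSE3 encode step (a sketch; the function name, byte-shuffle mask and the 16-byte
   load with 4 bytes of input slack are assumptions, not definitions from this header): reorder
   12 input bytes so each lane holds one 3-byte group, expand to 6-bit values with mm_unpack6to8,
   translate to ASCII with mm_map6to8 and store 16 base64 characters. */
#if 0
static void example_enc16(const unsigned char *ip, unsigned char *op) {
  __m128i v = _mm_loadu_si128((const __m128i *)ip);                              // uses ip[0..11]
  v = _mm_shuffle_epi8(v, _mm_set_epi8(10,11, 9,10, 7,8, 6,7, 4,5, 3,4, 1,2, 0,1));
  v = mm_map6to8(mm_unpack6to8(v));                                              // 6-bit indices -> ASCII
  _mm_storeu_si128((__m128i *)op, v);                                            // 12 bytes in -> 16 chars out
}
#endif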
#endif