// codec_ssse3.c (4.3 KB)

  1. #include <stdint.h>
  2. #include <stddef.h>
  3. #include <stdlib.h>
  4. #include "libbase64.h"
  5. #include "codecs.h"
  6. #ifdef __SSSE3__
  7. #include <tmmintrin.h>
  // Byte-wise helper macros. Each one broadcasts a scalar constant to all 16
  // byte positions with _mm_set1_epi8() and combines it with a vector.

  // Signed per-byte compare: 0xFF in every byte where vec > k, 0x00 elsewhere.
  #define CMPGT(vec, k)      _mm_cmpgt_epi8((vec), _mm_set1_epi8(k))

  // Per-byte equality: 0xFF in every byte where vec == k, 0x00 elsewhere.
  #define CMPEQ(vec, k)      _mm_cmpeq_epi8((vec), _mm_set1_epi8(k))

  // Bitwise AND of vec with the broadcast constant k. When vec is a
  // 0xFF/0x00 byte mask this yields k in the selected bytes and 0 elsewhere.
  #define REPLACE(vec, k)    _mm_and_si128((vec), _mm_set1_epi8(k))

  // Signed per-byte range test: 0xFF where lo <= vec <= hi, 0x00 elsewhere.
  // Built as NOT(vec > hi) AND (vec > lo - 1).
  #define RANGE(vec, lo, hi) _mm_andnot_si128(CMPGT((vec), (hi)), CMPGT((vec), (lo) - 1))
  // Reverse the byte order inside each of the four 32-bit lanes of `in`.
  // Lanes stay in place; only the bytes within a lane are swapped end for end.
  static inline __m128i
  _mm_bswap_epi32 (const __m128i in)
  {
      // Shuffle control: output byte i is taken from the listed source index.
      const __m128i reverse_each_lane = _mm_setr_epi8(
           3,  2,  1,  0,
           7,  6,  5,  4,
          11, 10,  9,  8,
          15, 14, 13, 12);

      return _mm_shuffle_epi8(in, reverse_each_lane);
  }
  21. static inline __m128i
  22. enc_reshuffle (__m128i in)
  23. {
  24. // Slice into 32-bit chunks and operate on all chunks in parallel.
  25. // All processing is done within the 32-bit chunk. First, shuffle:
  26. // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
  27. // after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
  28. in = _mm_shuffle_epi8(in, _mm_set_epi8(
  29. -1, 9, 10, 11,
  30. -1, 6, 7, 8,
  31. -1, 3, 4, 5,
  32. -1, 0, 1, 2));
  33. // cd = [00000000|00000000|0000cccc|ccdddddd]
  34. const __m128i cd = _mm_and_si128(in, _mm_set1_epi32(0x00000FFF));
  35. // ab = [0000aaaa|aabbbbbb|00000000|00000000]
  36. const __m128i ab = _mm_and_si128(_mm_slli_epi32(in, 4), _mm_set1_epi32(0x0FFF0000));
  37. // merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
  38. const __m128i merged = _mm_or_si128(ab, cd);
  39. // bd = [00000000|00bbbbbb|00000000|00dddddd]
  40. const __m128i bd = _mm_and_si128(merged, _mm_set1_epi32(0x003F003F));
  41. // ac = [00aaaaaa|00000000|00cccccc|00000000]
  42. const __m128i ac = _mm_and_si128(_mm_slli_epi32(merged, 2), _mm_set1_epi32(0x3F003F00));
  43. // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
  44. const __m128i indices = _mm_or_si128(ac, bd);
  45. // return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
  46. return _mm_bswap_epi32(indices);
  47. }
  48. static inline __m128i
  49. enc_translate (const __m128i in)
  50. {
  51. // Translate values 0..63 to the Base64 alphabet. There are five sets:
  52. // # From To Abs Delta Characters
  53. // 0 [0..25] [65..90] +65 +65 ABCDEFGHIJKLMNOPQRSTUVWXYZ
  54. // 1 [26..51] [97..122] +71 +6 abcdefghijklmnopqrstuvwxyz
  55. // 2 [52..61] [48..57] -4 -75 0123456789
  56. // 3 [62] [43] -19 -15 +
  57. // 4 [63] [47] -16 +3 /
  58. // Create cumulative masks for characters in sets [1,2,3,4], [2,3,4],
  59. // [3,4], and [4]:
  60. const __m128i mask1 = CMPGT(in, 25);
  61. const __m128i mask2 = CMPGT(in, 51);
  62. const __m128i mask3 = CMPGT(in, 61);
  63. const __m128i mask4 = CMPEQ(in, 63);
  64. // All characters are at least in cumulative set 0, so add 'A':
  65. __m128i out = _mm_add_epi8(in, _mm_set1_epi8(65));
  66. // For inputs which are also in any of the other cumulative sets,
  67. // add delta values against the previous set(s) to correct the shift:
  68. out = _mm_add_epi8(out, REPLACE(mask1, 6));
  69. out = _mm_sub_epi8(out, REPLACE(mask2, 75));
  70. out = _mm_sub_epi8(out, REPLACE(mask3, 15));
  71. out = _mm_add_epi8(out, REPLACE(mask4, 3));
  72. return out;
  73. }
  74. static inline __m128i
  75. dec_reshuffle (__m128i in)
  76. {
  77. // Shuffle bytes to 32-bit bigendian:
  78. in = _mm_bswap_epi32(in);
  79. // Mask in a single byte per shift:
  80. __m128i mask = _mm_set1_epi32(0x3F000000);
  81. // Pack bytes together:
  82. __m128i out = _mm_slli_epi32(_mm_and_si128(in, mask), 2);
  83. mask = _mm_srli_epi32(mask, 8);
  84. out = _mm_or_si128(out, _mm_slli_epi32(_mm_and_si128(in, mask), 4));
  85. mask = _mm_srli_epi32(mask, 8);
  86. out = _mm_or_si128(out, _mm_slli_epi32(_mm_and_si128(in, mask), 6));
  87. mask = _mm_srli_epi32(mask, 8);
  88. out = _mm_or_si128(out, _mm_slli_epi32(_mm_and_si128(in, mask), 8));
  89. // Reshuffle and repack into 12-byte output format:
  90. return _mm_shuffle_epi8(out, _mm_setr_epi8(
  91. 3, 2, 1,
  92. 7, 6, 5,
  93. 11, 10, 9,
  94. 15, 14, 13,
  95. -1, -1, -1, -1));
  96. }
  97. #endif // __SSSE3__
  // Streaming Base64 encode entry point of the SSSE3 codec.
  //
  // With __SSSE3__ defined, the function body is assembled textually from
  // enc_head.c, enc_ssse3.c and enc_tail.c, in that order. Those fragments are
  // not visible here; presumably they refer to the parameters below by name,
  // so the parameter names should be kept as they are.
  //
  // Without __SSSE3__ the function is a stub: it ignores its arguments and
  // calls abort().
  void
  ssse3_base64_stream_encode(struct ssse3_base64_state *state,
                             const char *src,
                             size_t srclen,
                             char *out,
                             size_t *outlen)
  {
  #ifndef __SSSE3__
      // Built without SSSE3 support: mark the parameters as used to silence
      // unused-parameter warnings, then terminate.
      (void)state;
      (void)src;
      (void)srclen;
      (void)out;
      (void)outlen;
      abort();
  #else
      #include "enc_head.c"
      #include "enc_ssse3.c"
      #include "enc_tail.c"
  #endif
  }
  // Streaming Base64 decode entry point of the SSSE3 codec.
  //
  // With __SSSE3__ defined, the function body is assembled textually from
  // dec_head.c, dec_ssse3.c and dec_tail.c, in that order. Those fragments are
  // not visible here; the int return value is produced inside them, and they
  // presumably refer to the parameters below by name, so the parameter names
  // should be kept as they are.
  //
  // Without __SSSE3__ the function is a stub: it ignores its arguments and
  // calls abort(), so it never returns a value.
  int
  ssse3_base64_stream_decode(struct ssse3_base64_state *state,
                             const char *src,
                             size_t srclen,
                             char *out,
                             size_t *outlen)
  {
  #ifndef __SSSE3__
      // Built without SSSE3 support: mark the parameters as used to silence
      // unused-parameter warnings, then terminate.
      (void)state;
      (void)src;
      (void)srclen;
      (void)out;
      (void)outlen;
      abort();
  #else
      #include "dec_head.c"
      #include "dec_ssse3.c"
      #include "dec_tail.c"
  #endif
  }