// codec_neon32.c
#if (defined(__ARM_NEON) && !defined(__ARM_NEON__))
#define __ARM_NEON__
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif

#include "libbase64.h"
#include "codecs.h"

#if (defined(__arm__) && defined(__ARM_NEON__))

#define CMPGT(s,n)    vcgtq_u8((s), vdupq_n_u8(n))
#define CMPEQ(s,n)    vceqq_u8((s), vdupq_n_u8(n))
#define REPLACE(s,n)  vandq_u8((s), vdupq_n_u8(n))
#define RANGE(s,a,b)  vandq_u8(vcgeq_u8((s), vdupq_n_u8(a)), vcleq_u8((s), vdupq_n_u8(b)))
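
// Note: CMPGT, CMPEQ and REPLACE are used by enc_translate() below;
// RANGE has no caller in this file and appears to be provided for the
// included decoder body (dec_neon.c).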

static inline uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	uint8x16x4_t out;

	// Divide bits of three input bytes over four output bytes:
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vorrq_u8(vshrq_n_u8(in.val[1], 4), vshlq_n_u8(in.val[0], 4));
	out.val[2] = vorrq_u8(vshrq_n_u8(in.val[2], 6), vshlq_n_u8(in.val[1], 2));
	out.val[3] = in.val[2];

	// Clear top two bits:
	out.val[0] = vandq_u8(out.val[0], vdupq_n_u8(0x3F));
	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
	out.val[3] = vandq_u8(out.val[3], vdupq_n_u8(0x3F));

	return out;
}
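
// Worked example for the reshuffle above (an illustration added here, not
// part of the upstream source). For input bytes 0xAB, 0xCD, 0xEF in one
// byte lane:
//
//   out[0] =   0xAB >> 2                         = 0x2A
//   out[1] = ((0xCD >> 4) | (0xAB << 4)) & 0x3F  = 0x3C
//   out[2] = ((0xEF >> 6) | (0xCD << 2)) & 0x3F  = 0x37
//   out[3] =   0xEF & 0x3F                       = 0x2F
//
// The 24 input bits 10101011 11001101 11101111 are regrouped into the
// four sextets 101010 111100 110111 101111, ready for enc_translate().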

static inline uint8x16x4_t
enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t mask1, mask2, mask3, mask4, out;

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Delta  Characters
	// 0  [0..25]   [65..90]   +65    +65    ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71    +6     abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]   -4     -75    0123456789
	// 3  [62]      [43]       -19    -15    +
	// 4  [63]      [47]       -16    +3     /

	// Create cumulative masks for characters in sets [1,2,3,4], [2,3,4],
	// [3,4], and [4]:
	mask1.val[0] = CMPGT(in.val[0], 25);
	mask1.val[1] = CMPGT(in.val[1], 25);
	mask1.val[2] = CMPGT(in.val[2], 25);
	mask1.val[3] = CMPGT(in.val[3], 25);

	mask2.val[0] = CMPGT(in.val[0], 51);
	mask2.val[1] = CMPGT(in.val[1], 51);
	mask2.val[2] = CMPGT(in.val[2], 51);
	mask2.val[3] = CMPGT(in.val[3], 51);

	mask3.val[0] = CMPGT(in.val[0], 61);
	mask3.val[1] = CMPGT(in.val[1], 61);
	mask3.val[2] = CMPGT(in.val[2], 61);
	mask3.val[3] = CMPGT(in.val[3], 61);

	mask4.val[0] = CMPEQ(in.val[0], 63);
	mask4.val[1] = CMPEQ(in.val[1], 63);
	mask4.val[2] = CMPEQ(in.val[2], 63);
	mask4.val[3] = CMPEQ(in.val[3], 63);

	// All characters are at least in cumulative set 0, so add 'A':
	out.val[0] = vaddq_u8(in.val[0], vdupq_n_u8(65));
	out.val[1] = vaddq_u8(in.val[1], vdupq_n_u8(65));
	out.val[2] = vaddq_u8(in.val[2], vdupq_n_u8(65));
	out.val[3] = vaddq_u8(in.val[3], vdupq_n_u8(65));

	// For inputs which are also in any of the other cumulative sets,
	// add delta values against the previous set(s) to correct the shift:
	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask1.val[0], 6));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask1.val[1], 6));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask1.val[2], 6));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask1.val[3], 6));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask2.val[0], 75));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask2.val[1], 75));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask2.val[2], 75));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask2.val[3], 75));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask3.val[0], 15));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask3.val[1], 15));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask3.val[2], 15));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask3.val[3], 15));

	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask4.val[0], 3));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask4.val[1], 3));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask4.val[2], 3));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask4.val[3], 3));

	return out;
}
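
// Worked example for the translation above (an illustration added here,
// not part of the upstream source). The sextet 63 matches all four masks,
// so all four corrections apply:
//
//    63 + 65 (set 0) = 128
//   128 +  6 (mask1) = 134
//   134 - 75 (mask2) =  59
//    59 - 15 (mask3) =  44
//    44 +  3 (mask4) =  47 = '/'
//
// A sextet such as 0 matches no mask and maps straight to 0 + 65 = 'A'.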

#endif

// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 32 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.
void
neon32_base64_stream_encode
	( struct neon32_base64_state *state
	, const char *src
	, size_t srclen
	, char *out
	, size_t *outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "enc_head.c"
	#include "enc_neon.c"
	#include "enc_uint32.c"
	#include "enc_tail.c"
#else
	// Stub: silence unused-parameter warnings and abort, since this
	// build has no 32-bit NEON support.
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}
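
// The decoder mirrors the encoder. By the convention of the included
// dec_tail.c body, it returns 1 on success and 0 when invalid input is
// encountered in the stream.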
int
neon32_base64_stream_decode
	( struct neon32_base64_state *state
	, const char *src
	, size_t srclen
	, char *out
	, size_t *outlen
	)
{
#if (defined(__arm__) && defined(__ARM_NEON__))
	#include "dec_head.c"
	#include "dec_neon.c"
	#include "dec_uint32.c"
	#include "dec_tail.c"
#else
	// Stub: silence unused-parameter warnings and abort, since this
	// build has no 32-bit NEON support.
	(void)state;
	(void)src;
	(void)srclen;
	(void)out;
	(void)outlen;
	abort();
#endif
}
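
// Minimal usage sketch (an illustration added here, not part of the
// upstream source). It assumes a zero-initialized state is a valid fresh
// stream, which is effectively what the library's generic init helper
// sets up, and that `buf` is sized for the worst case of 4 output bytes
// per 3 input bytes:
//
//	struct neon32_base64_state state = { 0 };
//	char buf[64];
//	size_t buflen;
//
//	neon32_base64_stream_encode(&state, "Hello, world!", 13, buf, &buflen);
//	// buf now holds buflen bytes of Base64 output covering the complete
//	// 3-byte groups consumed so far; trailing bytes stay buffered in the
//	// state until a final/flush step emits them with '=' padding.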