// dec_neon.c
  1. // If we have NEON support, pick off 64 bytes at a time for as long as we can.
  2. // Unlike the SSE codecs, we don't write trailing zero bytes to output, so we
  3. // don't need to check if we have enough remaining input to cover them:
  4. while (srclen >= 64)
  5. {
  6. uint8x16x4_t set1, set2, set3, set4, set5, set6, set7, delta;
  7. uint8x16x3_t dec;
  8. // Load 64 bytes and deinterleave:
  9. uint8x16x4_t str = vld4q_u8((uint8_t *)c);
  10. // The input consists of six character sets in the Base64 alphabet,
  11. // which we need to map back to the 6-bit values they represent.
  12. // There are three ranges, two singles, and then there's the rest.
  13. //
  14. // # From To Add Characters
  15. // 1 [43] [62] +19 +
  16. // 2 [47] [63] +16 /
  17. // 3 [48..57] [52..61] +4 0..9
  18. // 4 [65..90] [0..25] -65 A..Z
  19. // 5 [97..122] [26..51] -71 a..z
  20. // (6) Everything else => invalid input
  21. // Benchmarking on the Raspberry Pi 2B and Clang shows that looping
  22. // generates slightly faster code than explicit unrolling:
  23. for (int i = 0; i < 4; i++) {
  24. set1.val[i] = CMPEQ(str.val[i], '+');
  25. set2.val[i] = CMPEQ(str.val[i], '/');
  26. set3.val[i] = RANGE(str.val[i], '0', '9');
  27. set4.val[i] = RANGE(str.val[i], 'A', 'Z');
  28. set5.val[i] = RANGE(str.val[i], 'a', 'z');
  29. set6.val[i] = CMPEQ(str.val[i], '-');
  30. set7.val[i] = CMPEQ(str.val[i], '_');
  31. delta.val[i] = REPLACE(set1.val[i], 19);
  32. delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set2.val[i], 16));
  33. delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set3.val[i], 4));
  34. delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set4.val[i], -65));
  35. delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set5.val[i], -71));
  36. delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set6.val[i], 17));
  37. delta.val[i] = vorrq_u8(delta.val[i], REPLACE(set7.val[i], -32));
  38. }
  39. // Check for invalid input: if any of the delta values are zero,
  40. // fall back on bytewise code to do error checking and reporting:
  41. uint8x16_t classified = CMPEQ(delta.val[0], 0);
  42. classified = vorrq_u8(classified, CMPEQ(delta.val[1], 0));
  43. classified = vorrq_u8(classified, CMPEQ(delta.val[2], 0));
  44. classified = vorrq_u8(classified, CMPEQ(delta.val[3], 0));
  45. // Extract both 32-bit halves; check that all bits are zero:
  46. if (vgetq_lane_u32((uint32x4_t)classified, 0) != 0
  47. || vgetq_lane_u32((uint32x4_t)classified, 1) != 0
  48. || vgetq_lane_u32((uint32x4_t)classified, 2) != 0
  49. || vgetq_lane_u32((uint32x4_t)classified, 3) != 0) {
  50. break;
  51. }
  52. // Now simply add the delta values to the input:
  53. str.val[0] = vaddq_u8(str.val[0], delta.val[0]);
  54. str.val[1] = vaddq_u8(str.val[1], delta.val[1]);
  55. str.val[2] = vaddq_u8(str.val[2], delta.val[2]);
  56. str.val[3] = vaddq_u8(str.val[3], delta.val[3]);
  57. // Compress four bytes into three:
  58. dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
  59. dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
  60. dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
  61. // Interleave and store decoded result:
  62. vst3q_u8((uint8_t *)o, dec);
  63. c += 64;
  64. o += 48;
  65. outl += 48;
  66. srclen -= 64;
  67. }