enc_neon.c 1.1 KB

12345678910111213141516171819202122232425262728293031323334353637
  // NEON fast path: while at least 48 input bytes remain, encode them into
  // 64 Base64 characters per iteration. A tail shorter than 48 bytes is left
  // unprocessed by this loop.
  //
  // Reads and updates `c` (input cursor), `o` (output cursor), `outl`
  // (running output length) and `srclen` (remaining input length); reads
  // `tbl_enc`. All of these are defined outside this block.
  // NOTE(review): vqtbl4q_u8 is an AArch64-only intrinsic, so this path
  // presumably sits behind a 64-bit NEON guard -- confirm at the call site.
  while (srclen >= 48)
  {
      uint8x16x3_t str;
      uint8x16x4_t res;

      // Keeps the low six bits of every lane. Built with intrinsics only:
      // `|` / `&` on NEON vector types is a GCC/Clang extension that other
      // ARM compilers reject.
      const uint8x16_t mask6 = vdupq_n_u8(0x3F);

      // Load 48 bytes and deinterleave: str.val[0], str.val[1] and
      // str.val[2] receive the 1st, 2nd and 3rd byte of each of the 16
      // three-byte input groups. The load only reads, so keep the pointer
      // const-qualified.
      str = vld3q_u8((const uint8_t *)c);

      // Divide the bits of three input bytes over four output bytes.
      // val[0] needs no mask: a logical right shift by 2 already leaves
      // the top two bits clear.
      res.val[0] = vshrq_n_u8(str.val[0], 2);
      res.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[1], 4),
                                     vshlq_n_u8(str.val[0], 4)),
                            mask6);
      res.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[2], 6),
                                     vshlq_n_u8(str.val[1], 2)),
                            mask6);
      res.val[3] = vandq_u8(str.val[2], mask6);

      // Every lane now holds a value 0..63; translate it through the
      // 64-byte lookup table.
      // NOTE(review): tbl_enc is presumably the Base64 alphabet held in a
      // uint8x16x4_t -- confirm at its definition.
      res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
      res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
      res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
      res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);

      // Interleave the four result vectors and store 64 output bytes:
      vst4q_u8((uint8_t *)o, res);

      c += 48;      // 3 * 16 bytes of input
      o += 64;      // 4 * 16 bytes of output
      outl += 64;
      srclen -= 48;
  }