// dec_avx2.c
  1. // If we have AVX2 support, pick off 32 bytes at a time for as long as we can,
  2. // but make sure that we quit before seeing any == markers at the end of the
  3. // string. Also, because we write 8 zeroes at the end of the output, ensure
  4. // that there are at least 11 valid bytes of input data remaining to close the
  5. // gap. 32 + 2 + 11 = 45 bytes:
  6. while (srclen >= 45)
  7. {
  8. // Load string:
  9. __m256i str = _mm256_loadu_si256((__m256i *)c);
  10. // The input consists of six character sets in the Base64 alphabet,
  11. // which we need to map back to the 6-bit values they represent.
  12. // There are three ranges, two singles, and then there's the rest.
  13. //
  14. // # From To Add Characters
  15. // 1 [43] [62] +19 +
  16. // 2 [47] [63] +16 /
  17. // 3 [48..57] [52..61] +4 0..9
  18. // 4 [65..90] [0..25] -65 A..Z
  19. // 5 [97..122] [26..51] -71 a..z
  20. // (6) Everything else => invalid input
  21. const __m256i set1 = CMPEQ(str, '+');
  22. const __m256i set2 = CMPEQ(str, '/');
  23. const __m256i set3 = RANGE(str, '0', '9');
  24. const __m256i set4 = RANGE(str, 'A', 'Z');
  25. const __m256i set5 = RANGE(str, 'a', 'z');
  26. const __m256i set6 = CMPEQ(str, '-');
  27. const __m256i set7 = CMPEQ(str, '_');
  28. __m256i delta = REPLACE(set1, 19);
  29. delta = _mm256_or_si256(delta, REPLACE(set2, 16));
  30. delta = _mm256_or_si256(delta, REPLACE(set3, 4));
  31. delta = _mm256_or_si256(delta, REPLACE(set4, -65));
  32. delta = _mm256_or_si256(delta, REPLACE(set5, -71));
  33. delta = _mm256_or_si256(delta, REPLACE(set6, 17));
  34. delta = _mm256_or_si256(delta, REPLACE(set7, -32));
  35. // Check for invalid input: if any of the delta values are zero,
  36. // fall back on bytewise code to do error checking and reporting:
  37. #ifdef _MSC_VER
  38. // Hack for MSVC miscompilation - it inserts vzeroupper for the break
  39. // (we need to clear YMM registers before exiting the function)
  40. // while delta and str are still in the registers.
  41. // Save delta/str in memory manually.
  42. _mm256_zeroupper();
  43. #endif
  44. if (_mm256_movemask_epi8(CMPEQ(delta, 0))) {
  45. break;
  46. }
  47. // Now simply add the delta values to the input:
  48. str = _mm256_add_epi8(str, delta);
  49. // Reshuffle the input to packed 12-byte output format:
  50. str = dec_reshuffle(str);
  51. // Store back:
  52. _mm256_storeu_si256((__m256i *)o, str);
  53. c += 32;
  54. o += 24;
  55. outl += 24;
  56. srclen -= 32;
  57. }