// hh_buffer.h
  // Copyright 2017 Google Inc. All Rights Reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  // http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  14. #ifndef HIGHWAYHASH_HH_BUFFER_H_
  15. #define HIGHWAYHASH_HH_BUFFER_H_
  // Helper functions used by hh_avx2 and hh_sse41.
  // WARNING: this is a "restricted" header because it is included from
  // translation units compiled with different flags. This header and its
  // dependencies must not define any function unless it is static inline and/or
  // within namespace HH_TARGET_NAME. See arch_specific.h for details.
  21. #include "highwayhash/vector128.h"
  // For auto-dependency generation, we need to include all headers but not their
  // contents (otherwise compilation fails because -msse4.1 is not specified).
  24. #ifndef HH_DISABLE_TARGET_SPECIFIC
  25. namespace highwayhash {
  // To prevent ODR violations when including this from multiple translation
  // units (TU) that are compiled with different flags, the contents must reside
  // in a namespace whose name is unique to the TU. NOTE: this behavior is
  // incompatible with precompiled modules and requires textual inclusion instead.
  30. namespace HH_TARGET_NAME {
  // Functor family that builds a per-lane load mask from a byte count; the
  // template argument is the number of bytes to deduct from "size".
  // The primary template is deliberately empty: it has no operator(), so
  // calling IntMask<N>() for an N without an explicit specialization fails to
  // compile. This file specializes it for 0 and 16.
  template <uint32_t kSizeOffset>
  struct IntMask {};
  33. template <>
  34. struct IntMask<0> {
  35. // Returns 32-bit lanes : ~0U if that lane can be loaded given "size" bytes.
  36. // Typical case: size = 0..16, nothing deducted.
  37. HH_INLINE V4x32U operator()(const V4x32U& size) const {
  38. // Lane n is valid if size >= (n + 1) * 4; subtract one because we only have
  39. // greater-than comparisons and don't want a negated mask.
  40. return V4x32U(_mm_cmpgt_epi32(size, V4x32U(15, 11, 7, 3)));
  41. }
  42. };
  43. template <>
  44. struct IntMask<16> {
  45. // "size" is 16..31; this is for loading the upper half of a packet, so
  46. // effectively deduct 16 from size by changing the comparands.
  47. HH_INLINE V4x32U operator()(const V4x32U& size) const {
  48. return V4x32U(_mm_cmpgt_epi32(size, V4x32U(31, 27, 23, 19)));
  49. }
  50. };
  51. // Inserts "bytes4" into "prev" at the lowest i such that mask[i] = 0.
  52. // Assumes prev[j] == 0 if mask[j] = 0.
  53. HH_INLINE V4x32U Insert4AboveMask(const uint32_t bytes4, const V4x32U& mask,
  54. const V4x32U& prev) {
  55. // There is no 128-bit shift by a variable count. Using shuffle_epi8 with a
  56. // control mask requires a table lookup. We know the shift count is a
  57. // multiple of 4 bytes, so we can broadcastd_epi32 and clear all lanes except
  58. // those where mask != 0. This works because any upper output lanes need not
  59. // be zero.
  60. return prev | AndNot(mask, V4x32U(bytes4));
  61. }
  62. // Shifts "suffix" left by "prefix_len" = 0..15 bytes, clears upper bytes of
  63. // "prefix", and returns the merged/concatenated bytes.
  64. HH_INLINE V4x32U Concatenate(const V4x32U& prefix, const size_t prefix_len,
  65. const V4x32U& suffix) {
  66. static const uint64_t table[V16x8U::N][V2x64U::N] = {
  67. {0x0706050403020100ull, 0x0F0E0D0C0B0A0908ull},
  68. {0x06050403020100FFull, 0x0E0D0C0B0A090807ull},
  69. {0x050403020100FFFFull, 0x0D0C0B0A09080706ull},
  70. {0x0403020100FFFFFFull, 0x0C0B0A0908070605ull},
  71. {0x03020100FFFFFFFFull, 0x0B0A090807060504ull},
  72. {0x020100FFFFFFFFFFull, 0x0A09080706050403ull},
  73. {0x0100FFFFFFFFFFFFull, 0x0908070605040302ull},
  74. {0x00FFFFFFFFFFFFFFull, 0x0807060504030201ull},
  75. {0xFFFFFFFFFFFFFFFFull, 0x0706050403020100ull},
  76. {0xFFFFFFFFFFFFFFFFull, 0x06050403020100FFull},
  77. {0xFFFFFFFFFFFFFFFFull, 0x050403020100FFFFull},
  78. {0xFFFFFFFFFFFFFFFFull, 0x0403020100FFFFFFull},
  79. {0xFFFFFFFFFFFFFFFFull, 0x03020100FFFFFFFFull},
  80. {0xFFFFFFFFFFFFFFFFull, 0x020100FFFFFFFFFFull},
  81. {0xFFFFFFFFFFFFFFFFull, 0x0100FFFFFFFFFFFFull},
  82. {0xFFFFFFFFFFFFFFFFull, 0x00FFFFFFFFFFFFFFull}};
  83. const V2x64U control = Load<V2x64U>(&table[prefix_len][0]);
  84. const V2x64U shifted_suffix(_mm_shuffle_epi8(suffix, control));
  85. return V4x32U(_mm_blendv_epi8(shifted_suffix, prefix, control));
  86. }
  87. } // namespace HH_TARGET_NAME
  88. } // namespace highwayhash
  89. #endif // HH_DISABLE_TARGET_SPECIFIC
  90. #endif // HIGHWAYHASH_HH_BUFFER_H_