l1_distance.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. #pragma once
#include <library/cpp/sse/sse.h>
#include <util/system/types.h>
#include <util/generic/ymath.h>
#include <util/system/align.h>
#include <util/system/platform.h>

#include <cstring>
  7. namespace NL1Distance {
  8. namespace NPrivate {
  9. template <typename T>
  10. inline T AbsDelta(T a, T b) {
  11. if (a < b)
  12. return b - a;
  13. return a - b;
  14. }
  15. template <typename Result, typename Number>
  16. inline Result L1DistanceImpl(const Number* lhs, const Number* rhs, int length) {
  17. Result sum = 0;
  18. for (int i = 0; i < length; i++)
  19. sum += AbsDelta(lhs[i], rhs[i]);
  20. return sum;
  21. }
  22. template <typename Result, typename Number>
  23. inline Result L1DistanceImpl2(const Number* lhs, const Number* rhs, int length) {
  24. Result s0 = 0;
  25. Result s1 = 0;
  26. while (length >= 2) {
  27. s0 += AbsDelta(lhs[0], rhs[0]);
  28. s1 += AbsDelta(lhs[1], rhs[1]);
  29. lhs += 2;
  30. rhs += 2;
  31. length -= 2;
  32. }
  33. while (length--)
  34. s0 += AbsDelta(*lhs++, *rhs++);
  35. return s0 + s1;
  36. }
  37. template <typename Result, typename Number>
  38. inline Result L1DistanceImpl4(const Number* lhs, const Number* rhs, int length) {
  39. Result s0 = 0;
  40. Result s1 = 0;
  41. Result s2 = 0;
  42. Result s3 = 0;
  43. while (length >= 4) {
  44. s0 += AbsDelta(lhs[0], rhs[0]);
  45. s1 += AbsDelta(lhs[1], rhs[1]);
  46. s2 += AbsDelta(lhs[2], rhs[2]);
  47. s3 += AbsDelta(lhs[3], rhs[3]);
  48. lhs += 4;
  49. rhs += 4;
  50. length -= 4;
  51. }
  52. while (length--)
  53. s0 += AbsDelta(*lhs++, *rhs++);
  54. return s0 + s1 + s2 + s3;
  55. }
  56. template <typename Result>
  57. inline Result L1DistanceImplUI4(const ui8* lhs, const ui8* rhs, int lengtInBytes) {
  58. Result sum = 0;
  59. for (int i = 0; i < lengtInBytes; ++i) {
  60. sum += AbsDelta(lhs[i] & 0x0f, rhs[i] & 0x0f);
  61. sum += AbsDelta(lhs[i] & 0xf0, rhs[i] & 0xf0) >> 4;
  62. }
  63. return sum;
  64. }
  65. #ifdef ARCADIA_SSE
  66. static const __m128i MASK_UI4_1 = _mm_set_epi8(0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f);
  67. static const __m128i MASK_UI4_2 = _mm_set_epi8(0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0);
  68. Y_FORCE_INLINE ui32 L1Distance96Ui8(const ui8* lhs, const ui8* rhs) {
  69. __m128i x1 = _mm_loadu_si128((const __m128i*)&lhs[0]);
  70. __m128i y1 = _mm_loadu_si128((const __m128i*)&rhs[0]);
  71. __m128i sum = _mm_sad_epu8(x1, y1);
  72. __m128i x2 = _mm_loadu_si128((const __m128i*)&lhs[16]);
  73. __m128i y2 = _mm_loadu_si128((const __m128i*)&rhs[16]);
  74. sum = _mm_add_epi64(sum, _mm_sad_epu8(x2, y2));
  75. __m128i x3 = _mm_loadu_si128((const __m128i*)&lhs[32]);
  76. __m128i y3 = _mm_loadu_si128((const __m128i*)&rhs[32]);
  77. sum = _mm_add_epi64(sum, _mm_sad_epu8(x3, y3));
  78. __m128i x4 = _mm_loadu_si128((const __m128i*)&lhs[48]);
  79. __m128i y4 = _mm_loadu_si128((const __m128i*)&rhs[48]);
  80. sum = _mm_add_epi64(sum, _mm_sad_epu8(x4, y4));
  81. __m128i x5 = _mm_loadu_si128((const __m128i*)&lhs[64]);
  82. __m128i y5 = _mm_loadu_si128((const __m128i*)&rhs[64]);
  83. sum = _mm_add_epi64(sum, _mm_sad_epu8(x5, y5));
  84. __m128i x6 = _mm_loadu_si128((const __m128i*)&lhs[80]);
  85. __m128i y6 = _mm_loadu_si128((const __m128i*)&rhs[80]);
  86. sum = _mm_add_epi64(sum, _mm_sad_epu8(x6, y6));
  87. return _mm_cvtsi128_si32(sum) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 2, 2, 2)));
  88. }
  89. Y_FORCE_INLINE ui32 L1Distance96Ui4(const ui8* lhs, const ui8* rhs) {
  90. __m128i x1 = _mm_loadu_si128((const __m128i*)&lhs[0]);
  91. __m128i y1 = _mm_loadu_si128((const __m128i*)&rhs[0]);
  92. __m128i sum1 = _mm_sad_epu8(_mm_and_si128(x1, MASK_UI4_1), _mm_and_si128(y1, MASK_UI4_1));
  93. __m128i sum2 = _mm_sad_epu8(_mm_and_si128(x1, MASK_UI4_2), _mm_and_si128(y1, MASK_UI4_2));
  94. __m128i x2 = _mm_loadu_si128((const __m128i*)&lhs[16]);
  95. __m128i y2 = _mm_loadu_si128((const __m128i*)&rhs[16]);
  96. sum1 = _mm_add_epi64(sum1, _mm_sad_epu8(_mm_and_si128(x2, MASK_UI4_1), _mm_and_si128(y2, MASK_UI4_1)));
  97. sum2 = _mm_add_epi64(sum2, _mm_sad_epu8(_mm_and_si128(x2, MASK_UI4_2), _mm_and_si128(y2, MASK_UI4_2)));
  98. __m128i x3 = _mm_loadu_si128((const __m128i*)&lhs[32]);
  99. __m128i y3 = _mm_loadu_si128((const __m128i*)&rhs[32]);
  100. sum1 = _mm_add_epi64(sum1, _mm_sad_epu8(_mm_and_si128(x3, MASK_UI4_1), _mm_and_si128(y3, MASK_UI4_1)));
  101. sum2 = _mm_add_epi64(sum2, _mm_sad_epu8(_mm_and_si128(x3, MASK_UI4_2), _mm_and_si128(y3, MASK_UI4_2)));
  102. __m128i x4 = _mm_loadu_si128((const __m128i*)&lhs[48]);
  103. __m128i y4 = _mm_loadu_si128((const __m128i*)&rhs[48]);
  104. sum1 = _mm_add_epi64(sum1, _mm_sad_epu8(_mm_and_si128(x4, MASK_UI4_1), _mm_and_si128(y4, MASK_UI4_1)));
  105. sum2 = _mm_add_epi64(sum2, _mm_sad_epu8(_mm_and_si128(x4, MASK_UI4_2), _mm_and_si128(y4, MASK_UI4_2)));
  106. __m128i x5 = _mm_loadu_si128((const __m128i*)&lhs[64]);
  107. __m128i y5 = _mm_loadu_si128((const __m128i*)&rhs[64]);
  108. sum1 = _mm_add_epi64(sum1, _mm_sad_epu8(_mm_and_si128(x5, MASK_UI4_1), _mm_and_si128(y5, MASK_UI4_1)));
  109. sum2 = _mm_add_epi64(sum2, _mm_sad_epu8(_mm_and_si128(x5, MASK_UI4_2), _mm_and_si128(y5, MASK_UI4_2)));
  110. __m128i x6 = _mm_loadu_si128((const __m128i*)&lhs[80]);
  111. __m128i y6 = _mm_loadu_si128((const __m128i*)&rhs[80]);
  112. sum1 = _mm_add_epi64(sum1, _mm_sad_epu8(_mm_and_si128(x6, MASK_UI4_1), _mm_and_si128(y6, MASK_UI4_1)));
  113. sum2 = _mm_add_epi64(sum2, _mm_sad_epu8(_mm_and_si128(x6, MASK_UI4_2), _mm_and_si128(y6, MASK_UI4_2)));
  114. return _mm_cvtsi128_si32(sum1) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum1, _MM_SHUFFLE(2, 2, 2, 2))) +
  115. ((_mm_cvtsi128_si32(sum2) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum2, _MM_SHUFFLE(2, 2, 2, 2)))) >> 4);
  116. }
  117. #endif // ARCADIA_SSE
  118. } // namespace NPrivate
  119. }
  120. /**
  121. * L1Distance (sum(abs(l[i] - r[i]))) implementation using SSE when possible.
  122. */
  123. #ifdef ARCADIA_SSE
  124. Y_FORCE_INLINE ui32 L1Distance(const i8* lhs, const i8* rhs, int length) {
  125. static const __m128i unsignedToSignedDiff = _mm_set_epi8(
  126. -128, -128, -128, -128, -128, -128, -128, -128,
  127. -128, -128, -128, -128, -128, -128, -128, -128);
  128. __m128i resVec = _mm_setzero_si128();
  129. while (length >= 16) {
  130. __m128i lVec = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)lhs), unsignedToSignedDiff);
  131. __m128i rVec = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)rhs), unsignedToSignedDiff);
  132. resVec = _mm_add_epi64(_mm_sad_epu8(lVec, rVec), resVec);
  133. lhs += 16;
  134. rhs += 16;
  135. length -= 16;
  136. }
  137. alignas(16) i64 res[2];
  138. _mm_store_si128((__m128i*)res, resVec);
  139. ui32 sum = res[0] + res[1];
  140. for (int i = 0; i < length; ++i) {
  141. const i32 diff = static_cast<i32>(lhs[i]) - static_cast<i32>(rhs[i]);
  142. sum += (diff >= 0) ? diff : -diff;
  143. }
  144. return sum;
  145. }
  146. Y_FORCE_INLINE ui32 L1Distance(const ui8* lhs, const ui8* rhs, int length) {
  147. if (length == 96)
  148. return NL1Distance::NPrivate::L1Distance96Ui8(lhs, rhs);
  149. int l16 = length & (~15);
  150. __m128i sum = _mm_setzero_si128();
  151. if ((reinterpret_cast<uintptr_t>(lhs) & 0x0f) || (reinterpret_cast<uintptr_t>(rhs) & 0x0f)) {
  152. for (int i = 0; i < l16; i += 16) {
  153. __m128i a = _mm_loadu_si128((const __m128i*)(&lhs[i]));
  154. __m128i b = _mm_loadu_si128((const __m128i*)(&rhs[i]));
  155. sum = _mm_add_epi64(sum, _mm_sad_epu8(a, b));
  156. }
  157. } else {
  158. for (int i = 0; i < l16; i += 16) {
  159. __m128i sum_ab = _mm_sad_epu8(*(const __m128i*)(&lhs[i]), *(const __m128i*)(&rhs[i]));
  160. sum = _mm_add_epi64(sum, sum_ab);
  161. }
  162. }
  163. if (l16 == length)
  164. return _mm_cvtsi128_si32(sum) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 2, 2, 2)));
  165. int l4 = length & (~3);
  166. for (int i = l16; i < l4; i += 4) {
  167. __m128i a = _mm_set_epi32(*((const ui32*)&lhs[i]), 0, 0, 0);
  168. __m128i b = _mm_set_epi32(*((const ui32*)&rhs[i]), 0, 0, 0);
  169. sum = _mm_add_epi64(sum, _mm_sad_epu8(a, b));
  170. }
  171. ui32 res = _mm_cvtsi128_si32(sum) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 2, 2, 2)));
  172. for (int i = l4; i < length; i++)
  173. res += lhs[i] < rhs[i] ? rhs[i] - lhs[i] : lhs[i] - rhs[i];
  174. return res;
  175. }
  176. Y_FORCE_INLINE ui32 L1DistanceUI4(const ui8* lhs, const ui8* rhs, int lengtInBytes) {
  177. if (lengtInBytes == 96)
  178. return NL1Distance::NPrivate::L1Distance96Ui4(lhs, rhs);
  179. int l16 = lengtInBytes & (~15);
  180. __m128i sum1 = _mm_setzero_si128();
  181. __m128i sum2 = _mm_setzero_si128();
  182. for (int i = 0; i < l16; i += 16) {
  183. __m128i a = _mm_loadu_si128((const __m128i*)(&lhs[i]));
  184. __m128i b = _mm_loadu_si128((const __m128i*)(&rhs[i]));
  185. sum1 = _mm_add_epi64(sum1, _mm_sad_epu8(_mm_and_si128(a, NL1Distance::NPrivate::MASK_UI4_1), _mm_and_si128(b, NL1Distance::NPrivate::MASK_UI4_1)));
  186. sum2 = _mm_add_epi64(sum2, _mm_sad_epu8(_mm_and_si128(a, NL1Distance::NPrivate::MASK_UI4_2), _mm_and_si128(b, NL1Distance::NPrivate::MASK_UI4_2)));
  187. }
  188. if (l16 == lengtInBytes)
  189. return _mm_cvtsi128_si32(sum1) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum1, _MM_SHUFFLE(2, 2, 2, 2))) +
  190. ((_mm_cvtsi128_si32(sum2) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum2, _MM_SHUFFLE(2, 2, 2, 2)))) >> 4);
  191. int l4 = lengtInBytes & (~3);
  192. for (int i = l16; i < l4; i += 4) {
  193. __m128i a = _mm_set_epi32(*((const ui32*)&lhs[i]), 0, 0, 0);
  194. __m128i b = _mm_set_epi32(*((const ui32*)&rhs[i]), 0, 0, 0);
  195. sum1 = _mm_add_epi64(sum1, _mm_sad_epu8(_mm_and_si128(a, NL1Distance::NPrivate::MASK_UI4_1), _mm_and_si128(b, NL1Distance::NPrivate::MASK_UI4_1)));
  196. sum2 = _mm_add_epi64(sum2, _mm_sad_epu8(_mm_and_si128(a, NL1Distance::NPrivate::MASK_UI4_2), _mm_and_si128(b, NL1Distance::NPrivate::MASK_UI4_2)));
  197. }
  198. ui32 res = _mm_cvtsi128_si32(sum1) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum1, _MM_SHUFFLE(2, 2, 2, 2))) +
  199. ((_mm_cvtsi128_si32(sum2) + _mm_cvtsi128_si32(_mm_shuffle_epi32(sum2, _MM_SHUFFLE(2, 2, 2, 2)))) >> 4);
  200. for (int i = l4; i < lengtInBytes; ++i) {
  201. ui8 a1 = lhs[i] & 0x0f;
  202. ui8 a2 = (lhs[i] & 0xf0) >> 4;
  203. ui8 b1 = rhs[i] & 0x0f;
  204. ui8 b2 = (rhs[i] & 0xf0) >> 4;
  205. res += a1 < b1 ? b1 - a1 : a1 - b1;
  206. res += a2 < b2 ? b2 - a2 : a2 - b2;
  207. }
  208. return res;
  209. }
  210. Y_FORCE_INLINE ui64 L1Distance(const i32* lhs, const i32* rhs, int length) {
  211. __m128i zero = _mm_setzero_si128();
  212. __m128i res = zero;
  213. while (length >= 4) {
  214. __m128i a = _mm_loadu_si128((const __m128i*)lhs);
  215. __m128i b = _mm_loadu_si128((const __m128i*)rhs);
  216. __m128i mask = _mm_cmpgt_epi32(a, b);
  217. __m128i a2 = _mm_and_si128(mask, _mm_sub_epi32(a, b));
  218. b = _mm_andnot_si128(mask, _mm_sub_epi32(b, a));
  219. a = _mm_or_si128(a2, b);
  220. res = _mm_add_epi64(_mm_unpackhi_epi32(a, zero), res);
  221. res = _mm_add_epi64(_mm_unpacklo_epi32(a, zero), res);
  222. rhs += 4;
  223. lhs += 4;
  224. length -= 4;
  225. }
  226. alignas(16) ui64 r[2];
  227. _mm_store_si128((__m128i*)r, res);
  228. ui64 sum = r[0] + r[1];
  229. while (length) {
  230. sum += lhs[0] < rhs[0] ? rhs[0] - lhs[0] : lhs[0] - rhs[0];
  231. ++lhs;
  232. ++rhs;
  233. --length;
  234. }
  235. return sum;
  236. }
  237. Y_FORCE_INLINE ui64 L1Distance(const ui32* lhs, const ui32* rhs, int length) {
  238. __m128i zero = _mm_setzero_si128();
  239. __m128i shift = _mm_set1_epi32(0x80000000);
  240. __m128i res = zero;
  241. while (length >= 4) {
  242. __m128i a = _mm_add_epi32(_mm_loadu_si128((const __m128i*)lhs), shift);
  243. __m128i b = _mm_add_epi32(_mm_loadu_si128((const __m128i*)rhs), shift);
  244. __m128i mask = _mm_cmpgt_epi32(a, b);
  245. __m128i a2 = _mm_and_si128(mask, _mm_sub_epi32(a, b));
  246. b = _mm_andnot_si128(mask, _mm_sub_epi32(b, a));
  247. a = _mm_or_si128(a2, b);
  248. res = _mm_add_epi64(_mm_unpackhi_epi32(a, zero), res);
  249. res = _mm_add_epi64(_mm_unpacklo_epi32(a, zero), res);
  250. rhs += 4;
  251. lhs += 4;
  252. length -= 4;
  253. }
  254. alignas(16) ui64 r[2];
  255. _mm_store_si128((__m128i*)r, res);
  256. ui64 sum = r[0] + r[1];
  257. while (length) {
  258. sum += lhs[0] < rhs[0] ? rhs[0] - lhs[0] : lhs[0] - rhs[0];
  259. ++lhs;
  260. ++rhs;
  261. --length;
  262. }
  263. return sum;
  264. }
  265. Y_FORCE_INLINE float L1Distance(const float* lhs, const float* rhs, int length) {
  266. __m128 res = _mm_setzero_ps();
  267. __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  268. while (length >= 4) {
  269. __m128 a = _mm_loadu_ps(lhs);
  270. __m128 b = _mm_loadu_ps(rhs);
  271. __m128 d = _mm_sub_ps(a, b);
  272. res = _mm_add_ps(_mm_and_ps(d, absMask), res);
  273. rhs += 4;
  274. lhs += 4;
  275. length -= 4;
  276. }
  277. alignas(16) float r[4];
  278. _mm_store_ps(r, res);
  279. float sum = r[0] + r[1] + r[2] + r[3];
  280. while (length) {
  281. sum += std::abs(*lhs - *rhs);
  282. ++lhs;
  283. ++rhs;
  284. --length;
  285. }
  286. return sum;
  287. }
  288. Y_FORCE_INLINE double L1Distance(const double* lhs, const double* rhs, int length) {
  289. __m128d res = _mm_setzero_pd();
  290. __m128d absMask = _mm_castsi128_pd(_mm_set_epi32(0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff));
  291. while (length >= 2) {
  292. __m128d a = _mm_loadu_pd(lhs);
  293. __m128d b = _mm_loadu_pd(rhs);
  294. __m128d d = _mm_sub_pd(a, b);
  295. res = _mm_add_pd(_mm_and_pd(d, absMask), res);
  296. rhs += 2;
  297. lhs += 2;
  298. length -= 2;
  299. }
  300. alignas(16) double r[2];
  301. _mm_store_pd(r, res);
  302. double sum = r[0] + r[1];
  303. while (length) {
  304. sum += std::abs(*lhs - *rhs);
  305. ++lhs;
  306. ++rhs;
  307. --length;
  308. }
  309. return sum;
  310. }
  311. #else // ARCADIA_SSE
  312. inline ui32 L1Distance(const i8* lhs, const i8* rhs, int length) {
  313. return NL1Distance::NPrivate::L1DistanceImpl<ui32, i8>(lhs, rhs, length);
  314. }
  315. inline ui32 L1Distance(const ui8* lhs, const ui8* rhs, int length) {
  316. return NL1Distance::NPrivate::L1DistanceImpl<ui32, ui8>(lhs, rhs, length);
  317. }
  318. inline ui32 L1DistanceUI4(const ui8* lhs, const ui8* rhs, int lengtInBytes) {
  319. return NL1Distance::NPrivate::L1DistanceImplUI4<ui32>(lhs, rhs, lengtInBytes);
  320. }
  321. inline ui64 L1Distance(const ui32* lhs, const ui32* rhs, int length) {
  322. return NL1Distance::NPrivate::L1DistanceImpl2<ui64, ui32>(lhs, rhs, length);
  323. }
  324. inline ui64 L1Distance(const i32* lhs, const i32* rhs, int length) {
  325. return NL1Distance::NPrivate::L1DistanceImpl2<ui64, i32>(lhs, rhs, length);
  326. }
  327. inline float L1Distance(const float* lhs, const float* rhs, int length) {
  328. return NL1Distance::NPrivate::L1DistanceImpl4<float, float>(lhs, rhs, length);
  329. }
  330. inline double L1Distance(const double* lhs, const double* rhs, int length) {
  331. return NL1Distance::NPrivate::L1DistanceImpl4<double, double>(lhs, rhs, length);
  332. }
#endif // ARCADIA_SSE
  334. /**
  335. * L1Distance (sum(abs(l[i] - r[i]))) implementation without SSE.
  336. */
  337. inline ui32 L1DistanceSlow(const i8* lhs, const i8* rhs, int length) {
  338. return NL1Distance::NPrivate::L1DistanceImpl<ui32, i8>(lhs, rhs, length);
  339. }
  340. inline ui32 L1DistanceSlow(const ui8* lhs, const ui8* rhs, int length) {
  341. return NL1Distance::NPrivate::L1DistanceImpl<ui32, ui8>(lhs, rhs, length);
  342. }
  343. inline ui32 L1DistanceUI4Slow(const ui8* lhs, const ui8* rhs, int lengtInBytes) {
  344. return NL1Distance::NPrivate::L1DistanceImplUI4<ui32>(lhs, rhs, lengtInBytes);
  345. }
  346. inline ui64 L1DistanceSlow(const ui32* lhs, const ui32* rhs, int length) {
  347. return NL1Distance::NPrivate::L1DistanceImpl2<ui64, ui32>(lhs, rhs, length);
  348. }
  349. inline ui64 L1DistanceSlow(const i32* lhs, const i32* rhs, int length) {
  350. return NL1Distance::NPrivate::L1DistanceImpl2<ui64, i32>(lhs, rhs, length);
  351. }
  352. inline float L1DistanceSlow(const float* lhs, const float* rhs, int length) {
  353. return NL1Distance::NPrivate::L1DistanceImpl4<float, float>(lhs, rhs, length);
  354. }
  355. inline double L1DistanceSlow(const double* lhs, const double* rhs, int length) {
  356. return NL1Distance::NPrivate::L1DistanceImpl4<double, double>(lhs, rhs, length);
  357. }
  358. namespace NL1Distance {
  359. // Simpler wrapper allowing to use this functions as template argument.
  360. template <typename T>
  361. struct TL1Distance {
  362. using TResult = decltype(L1Distance(static_cast<const T*>(nullptr), static_cast<const T*>(nullptr), 0));
  363. inline TResult operator()(const T* a, const T* b, int length) const {
  364. return L1Distance(a, b, length);
  365. }
  366. };
  367. struct TL1DistanceUI4 {
  368. using TResult = ui32;
  369. inline TResult operator()(const ui8* a, const ui8* b, int lengtInBytes) const {
  370. return L1DistanceUI4(a, b, lengtInBytes);
  371. }
  372. };
  373. }