test.cpp 69 KB


  /*
  Unit tests for all SSE intrinsics translated to NEON intrinsics or
  software implementation.
  Should be tested both on Intel and ARM64.
  */
  /* Author: Vitaliy Manushkin <agri@yandex-team.ru> */
  #include <library/cpp/testing/unittest/registar.h>
  #include <util/generic/typetraits.h>
  #include <util/string/hex.h>
  #include <util/random/fast.h>
  #include <util/stream/output.h>
  #include <algorithm>
  #include <array>
  #include <climits>
  #include <cstring>
  #include <functional>
  #include <limits>
  #include <memory>
  #include <type_traits>
  #include <utility>
  // Turns a free function into a constructible type: constructing the wrapper
  // calls `func` with the forwarded arguments and keeps the returned value,
  // which is then reachable through the implicit conversions to TResult.
  template <typename TResult, typename TFunc, TFunc* func>
  struct T_mm_CallWrapper {
      TResult Value;

      // Invokes `func(args...)` and stores its result.
      template <typename... TArgs>
      T_mm_CallWrapper(TArgs&&... args)
          : Value(func(std::forward<TArgs>(args)...))
      {
      }

      // Mutable access to the stored result.
      operator TResult&() {
          return Value;
      }

      // Read-only access to the stored result.
      operator const TResult&() const {
          return Value;
      }
  };
  32. #if defined(_arm64_)
  33. #include "library/cpp/sse/sse2neon.h"
  34. #elif defined(_i386_) || defined(_x86_64_)
  35. #include <xmmintrin.h>
  36. #include <emmintrin.h>
  37. #include <smmintrin.h>
  38. #elif defined(_ppc64_)
  39. #include "library/cpp/sse/powerpc.h"
  40. #else
  41. #error "Unsupported platform"
  42. #endif
  #if defined(_arm64_)
  // On arm64 the Wrap* macros expand to their argument unchanged.
  #define Wrap(T_mm_func) T_mm_func
  #define WrapF(T_mm_func) T_mm_func
  #define WrapD(T_mm_func) T_mm_func
  #elif defined(_ppc64_) || defined(_i386_) || defined(_x86_64_)
  // On the other supported platforms each intrinsic is adapted into a type via
  // T_mm_CallWrapper; the macro variant selects the result vector type.
  #define Wrap(intrinsic) \
      T_mm_CallWrapper<__m128i, decltype(intrinsic), intrinsic>
  #define WrapF(intrinsic) \
      T_mm_CallWrapper<__m128, decltype(intrinsic), intrinsic>
  #define WrapD(intrinsic) \
      T_mm_CallWrapper<__m128d, decltype(intrinsic), intrinsic>

  // Array stand-ins for the NEON-style vector type names used by the tests.
  using int8x16_t = std::array<i8, 16>;
  using int16x8_t = std::array<i16, 8>;
  using int32x4_t = std::array<i32, 4>;
  using int64x2_t = std::array<i64, 2>;
  using uint8x16_t = std::array<ui8, 16>;
  using uint16x8_t = std::array<ui16, 8>;
  using uint32x4_t = std::array<ui32, 4>;
  using uint64x2_t = std::array<ui64, 2>;
  using float32x4_t = std::array<float, 4>;
  using float64x2_t = std::array<double, 2>;

  // Copies the 128-bit register into a TVectorType so individual lanes can be
  // indexed; one overload per register flavour.
  template <typename TVectorType>
  struct TQType {
      static TVectorType As(__m128i param) {
          TVectorType lanes;
          _mm_storeu_si128(reinterpret_cast<__m128i*>(&lanes), param);
          return lanes;
      }

      static TVectorType As(__m128 param) {
          TVectorType lanes;
          _mm_storeu_ps(reinterpret_cast<float*>(&lanes), param);
          return lanes;
      }

      static TVectorType As(__m128d param) {
          TVectorType lanes;
          _mm_storeu_pd(reinterpret_cast<double*>(&lanes), param);
          return lanes;
      }
  };
  #endif
  83. template <typename TVectorType>
  84. struct TFuncLoad;
  85. template <typename TVectorType>
  86. struct TFuncStore;
  87. template <>
  88. struct TFuncLoad<__m128i> {
  89. __m128i Value;
  90. template <typename TPointer>
  91. TFuncLoad(TPointer* ptr) {
  92. Value = _mm_loadu_si128((__m128i*)ptr);
  93. }
  94. operator __m128i&() {
  95. return Value;
  96. }
  97. operator const __m128i&() const {
  98. return Value;
  99. }
  100. };
  101. template <>
  102. struct TFuncLoad<__m128> {
  103. __m128 Value;
  104. template <typename TPointer>
  105. TFuncLoad(TPointer* ptr) {
  106. Value = _mm_loadu_ps((float*)ptr);
  107. }
  108. operator __m128&() {
  109. return Value;
  110. }
  111. operator const __m128&() const {
  112. return Value;
  113. }
  114. };
  115. template <>
  116. struct TFuncLoad<__m128d> {
  117. __m128d Value;
  118. template <typename TPointer>
  119. TFuncLoad(TPointer* ptr) {
  120. Value = _mm_loadu_pd((double*)ptr);
  121. }
  122. operator __m128d&() {
  123. return Value;
  124. }
  125. operator const __m128d&() const {
  126. return Value;
  127. }
  128. };
  129. template <>
  130. struct TFuncStore<__m128i> {
  131. template <typename TPointer>
  132. TFuncStore(TPointer* ptr, __m128i Value) {
  133. _mm_storeu_si128((__m128i*)ptr, Value);
  134. }
  135. };
  136. template <>
  137. struct TFuncStore<__m128> {
  138. template <typename TPointer>
  139. TFuncStore(TPointer* ptr, __m128 Value) {
  140. _mm_storeu_ps((float*)ptr, Value);
  141. }
  142. };
  143. class TSSEEmulTest: public TTestBase {
  144. private:
  145. UNIT_TEST_SUITE(TSSEEmulTest);
  146. UNIT_TEST(Test_mm_load_si128);
  147. UNIT_TEST(Test_mm_loadu_si128);
  148. UNIT_TEST(Test_mm_storeu_si128);
  149. UNIT_TEST(Test_mm_loadu_si128_2);
  150. UNIT_TEST(Test_mm_loadu_ps);
  151. UNIT_TEST(Test_mm_storeu_ps);
  152. UNIT_TEST(Test_mm_slli_epi16);
  153. UNIT_TEST(Test_mm_slli_epi32);
  154. UNIT_TEST(Test_mm_slli_epi64);
  155. UNIT_TEST(Test_mm_slli_si128);
  156. UNIT_TEST(Test_mm_srli_epi16);
  157. UNIT_TEST(Test_mm_srli_epi32);
  158. UNIT_TEST(Test_mm_srli_epi64);
  159. UNIT_TEST(Test_mm_srli_si128);
  160. UNIT_TEST(Test_mm_srai_epi16);
  161. UNIT_TEST(Test_mm_srai_epi32);
  162. UNIT_TEST(Test_mm_sll_epi16);
  163. UNIT_TEST(Test_mm_sll_epi32);
  164. UNIT_TEST(Test_mm_sll_epi64);
  165. UNIT_TEST(Test_mm_srl_epi16);
  166. UNIT_TEST(Test_mm_srl_epi32);
  167. UNIT_TEST(Test_mm_srl_epi64);
  168. UNIT_TEST(Test_mm_add_epi16);
  169. UNIT_TEST(Test_mm_add_epi32);
  170. UNIT_TEST(Test_mm_add_epi64);
  171. UNIT_TEST(Test_mm_add_ps);
  172. UNIT_TEST(Test_mm_add_pd);
  173. UNIT_TEST(Test_mm_madd_epi16);
  174. UNIT_TEST(Test_mm_sub_epi16);
  175. UNIT_TEST(Test_mm_sub_epi32);
  176. UNIT_TEST(Test_mm_sub_epi64);
  177. UNIT_TEST(Test_mm_sub_ps);
  178. UNIT_TEST(Test_mm_sub_pd);
  179. UNIT_TEST(Test_mm_mul_ps);
  180. UNIT_TEST(Test_mm_mul_pd);
  181. UNIT_TEST(Test_mm_div_ps);
  182. UNIT_TEST(Test_mm_div_pd);
  183. UNIT_TEST(Test_mm_max_ps);
  184. UNIT_TEST(Test_mm_min_ps);
  185. UNIT_TEST(Test_mm_and_ps);
  186. UNIT_TEST(Test_mm_unpacklo_epi8);
  187. UNIT_TEST(Test_mm_unpackhi_epi8);
  188. UNIT_TEST(Test_mm_unpacklo_epi16);
  189. UNIT_TEST(Test_mm_unpackhi_epi16);
  190. UNIT_TEST(Test_mm_unpacklo_epi32);
  191. UNIT_TEST(Test_mm_unpackhi_epi32);
  192. UNIT_TEST(Test_mm_unpacklo_epi64);
  193. UNIT_TEST(Test_mm_unpackhi_epi64);
  194. UNIT_TEST(Test_mm_or_si128);
  195. UNIT_TEST(Test_mm_and_si128);
  196. UNIT_TEST(Test_mm_andnot_si128);
  197. UNIT_TEST(Test_mm_cmpeq_epi8);
  198. UNIT_TEST(Test_mm_cmpeq_epi16);
  199. UNIT_TEST(Test_mm_cmpeq_epi32);
  200. UNIT_TEST(Test_mm_cmpeq_ps);
  201. UNIT_TEST(Test_mm_cmpgt_epi8);
  202. UNIT_TEST(Test_mm_cmpgt_epi16);
  203. UNIT_TEST(Test_mm_cmpgt_epi32);
  204. UNIT_TEST(Test_mm_cmpgt_ps);
  205. UNIT_TEST(Test_mm_cmplt_epi8);
  206. UNIT_TEST(Test_mm_cmplt_epi16);
  207. UNIT_TEST(Test_mm_cmplt_epi32);
  208. UNIT_TEST(Test_mm_set1_epi8);
  209. UNIT_TEST(Test_mm_set1_epi16);
  210. UNIT_TEST(Test_mm_set1_epi32);
  211. UNIT_TEST(Test_mm_set1_ps);
  212. UNIT_TEST(Test_mm_set_ps1);
  213. UNIT_TEST(Test_mm_setzero_si128);
  214. UNIT_TEST(Test_mm_setzero_ps);
  215. UNIT_TEST(Test_mm_setzero_pd);
  216. UNIT_TEST(Test_mm_storel_epi64);
  217. UNIT_TEST(Test_mm_loadl_epi64);
  218. UNIT_TEST(Test_mm_loadl_pd);
  219. UNIT_TEST(Test_mm_loadh_pd);
  220. UNIT_TEST(Test_mm_cvtsd_f64);
  221. UNIT_TEST(Test_mm_shuffle_epi32);
  222. UNIT_TEST(Test_mm_movemask_epi8);
  223. UNIT_TEST(Test_mm_cvtsi128_si32);
  224. UNIT_TEST(Test_mm_cvtsi128_si64);
  225. UNIT_TEST(Test_mm_set_epi16);
  226. UNIT_TEST(Test_mm_set_epi32);
  227. UNIT_TEST(Test_mm_set_ps);
  228. UNIT_TEST(Test_mm_set_pd);
  229. UNIT_TEST(Test_mm_cvtsi32_si128);
  230. UNIT_TEST(Test_mm_cvtsi64_si128);
  231. UNIT_TEST(Test_mm_packs_epi16);
  232. UNIT_TEST(Test_mm_packs_epi32);
  233. UNIT_TEST(Test_mm_packus_epi16);
  234. UNIT_TEST(Test_mm_extract_epi16);
  235. UNIT_TEST(Test_mm_extract_epi8);
  236. UNIT_TEST(Test_mm_extract_epi32);
  237. UNIT_TEST(Test_mm_extract_epi64);
  238. UNIT_TEST(Test_MM_TRANSPOSE4_PS);
  239. UNIT_TEST(Test_mm_movemask_ps);
  240. UNIT_TEST(Test_mm_movemask_ps_2);
  241. UNIT_TEST(Test_mm_cvtepi32_ps);
  242. UNIT_TEST(Test_mm_cvtps_epi32);
  243. UNIT_TEST(Test_mm_cvttps_epi32);
  244. UNIT_TEST(Test_mm_castsi128_ps);
  245. UNIT_TEST(Test_mm_castps_si128);
  246. UNIT_TEST(Test_mm_mul_epu32);
  247. UNIT_TEST(Test_mm_cmpunord_ps);
  248. UNIT_TEST(Test_mm_andnot_ps);
  249. UNIT_TEST(Test_mm_shuffle_ps);
  250. UNIT_TEST(Test_mm_shuffle_pd);
  251. UNIT_TEST(Test_mm_or_ps);
  252. UNIT_TEST(Test_mm_store_ss);
  253. UNIT_TEST(Test_mm_store_ps);
  254. UNIT_TEST(Test_mm_storeu_pd);
  255. UNIT_TEST(Test_mm_loadu_pd);
  256. UNIT_TEST(Test_mm_rsqrt_ps);
  257. UNIT_TEST(Test_matrixnet_powerpc);
  258. UNIT_TEST_SUITE_END();
  259. public:
  260. void Test_mm_load_si128();
  261. void Test_mm_loadu_si128();
  262. void Test_mm_storeu_si128();
  263. void Test_mm_loadu_si128_2();
  264. void Test_mm_loadu_ps();
  265. void Test_mm_storeu_ps();
  266. template <typename TElem, int bits, int elemCount,
  267. typename TFunc, typename TShifter, typename TOp, typename TElemFunc>
  268. void Test_mm_shifter_epiXX();
  269. enum class EDirection {
  270. Left,
  271. Right
  272. };
  273. struct TShiftRes {
  274. __m128i Value[17];
  275. };
  276. void Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo);
  277. void Test_mm_slli_epi16();
  278. void Test_mm_slli_epi32();
  279. void Test_mm_slli_epi64();
  280. void Test_mm_slli_si128();
  281. void Test_mm_srli_epi16();
  282. void Test_mm_srli_epi32();
  283. void Test_mm_srli_epi64();
  284. void Test_mm_srli_si128();
  285. void Test_mm_srai_epi16();
  286. void Test_mm_srai_epi32();
  287. void Test_mm_sll_epi16();
  288. void Test_mm_sll_epi32();
  289. void Test_mm_sll_epi64();
  290. void Test_mm_srl_epi16();
  291. void Test_mm_srl_epi32();
  292. void Test_mm_srl_epi64();
  293. void Test_mm_add_epi8();
  294. void Test_mm_add_epi16();
  295. void Test_mm_add_epi32();
  296. void Test_mm_add_epi64();
  297. void Test_mm_add_ps();
  298. void Test_mm_add_pd();
  299. void Test_mm_madd_epi16();
  300. void Test_mm_sub_epi8();
  301. void Test_mm_sub_epi16();
  302. void Test_mm_sub_epi32();
  303. void Test_mm_sub_epi64();
  304. void Test_mm_sub_ps();
  305. void Test_mm_sub_pd();
  306. void Test_mm_mul_ps();
  307. void Test_mm_mul_pd();
  308. void Test_mm_div_ps();
  309. void Test_mm_div_pd();
  310. void Test_mm_max_ps();
  311. void Test_mm_min_ps();
  312. void Test_mm_and_ps();
  313. template <typename TElem, int bits, int elemCount, int shift,
  314. typename TFunc, typename TOp>
  315. void Test_mm_unpack_epiXX();
  316. void Test_mm_unpacklo_epi8();
  317. void Test_mm_unpackhi_epi8();
  318. void Test_mm_unpacklo_epi16();
  319. void Test_mm_unpackhi_epi16();
  320. void Test_mm_unpacklo_epi32();
  321. void Test_mm_unpackhi_epi32();
  322. void Test_mm_unpacklo_epi64();
  323. void Test_mm_unpackhi_epi64();
  324. template <typename TElem, unsigned elemCount,
  325. typename TFunc, typename TElemFunc,
  326. typename TOp, typename TVectorType = __m128i>
  327. void Test_mm_dualop();
  328. template <typename TElem, unsigned elemCount,
  329. typename TFunc, typename TElemFunc,
  330. typename TOp, typename TVectorType = __m128i>
  331. void Test_mm_dualcmp();
  332. void Test_mm_or_si128();
  333. void Test_mm_and_si128();
  334. void Test_mm_andnot_si128();
  335. void Test_mm_cmpeq_epi8();
  336. void Test_mm_cmpeq_epi16();
  337. void Test_mm_cmpeq_epi32();
  338. void Test_mm_cmpeq_ps();
  339. void Test_mm_cmpgt_epi8();
  340. void Test_mm_cmpgt_epi16();
  341. void Test_mm_cmpgt_epi32();
  342. void Test_mm_cmpgt_ps();
  343. void Test_mm_cmplt_epi8();
  344. void Test_mm_cmplt_epi16();
  345. void Test_mm_cmplt_epi32();
  346. template <typename TElem, int elemCount,
  347. typename TFunc, typename TOp, typename TVectorType>
  348. void Test_mm_setter_epiXX();
  349. void Test_mm_set1_epi8();
  350. void Test_mm_set1_epi16();
  351. void Test_mm_set1_epi32();
  352. void Test_mm_set1_ps();
  353. void Test_mm_set_ps1();
  354. void Test_mm_setzero_si128();
  355. void Test_mm_setzero_ps();
  356. void Test_mm_setzero_pd();
  357. void Test_mm_loadl_epi64();
  358. void Test_mm_storel_epi64();
  359. void Test_mm_loadl_pd();
  360. void Test_mm_loadh_pd();
  361. void Test_mm_cvtsd_f64();
  362. void Test_mm_shuffle_epi32();
  363. void Test_mm_movemask_epi8();
  364. void Test_mm_cvtsi128_si32();
  365. void Test_mm_cvtsi128_si64();
  366. void Test_mm_set_epi16();
  367. void Test_mm_set_epi32();
  368. void Test_mm_set_ps();
  369. void Test_mm_set_pd();
  370. void Test_mm_cvtsi32_si128();
  371. void Test_mm_cvtsi64_si128();
  372. template <typename TElem, typename TNarrow, unsigned elemCount,
  373. typename TFunc>
  374. void Test_mm_packs_epiXX();
  375. void Test_mm_packs_epi16();
  376. void Test_mm_packs_epi32();
  377. void Test_mm_packus_epi16();
  378. void Test_mm_extract_epi16();
  379. void Test_mm_extract_epi8();
  380. void Test_mm_extract_epi32();
  381. void Test_mm_extract_epi64();
  382. void Test_MM_TRANSPOSE4_PS();
  383. void Test_mm_movemask_ps();
  384. void Test_mm_movemask_ps_2();
  385. template <typename TFrom, typename TTo, unsigned elemCount,
  386. typename TLoadVector, typename TResultVector,
  387. typename TElemFunc, typename TFunc, typename TOp>
  388. void Test_mm_convertop();
  389. void Test_mm_cvtepi32_ps();
  390. void Test_mm_cvtps_epi32();
  391. void Test_mm_cvttps_epi32();
  392. template <typename TLoadVector, typename TCastVector,
  393. typename TFunc, TFunc* func>
  394. void Test_mm_castXX();
  395. void Test_mm_castsi128_ps();
  396. void Test_mm_castps_si128();
  397. void Test_mm_mul_epu32();
  398. void Test_mm_cmpunord_ps();
  399. void Test_mm_store_ss();
  400. void Test_mm_store_ps();
  401. void Test_mm_storeu_pd();
  402. void Test_mm_andnot_ps();
  403. void Test_mm_shuffle_ps();
  404. void Test_mm_shuffle_pd();
  405. void Test_mm_or_ps();
  406. void Test_mm_loadu_pd();
  407. void Test_mm_rsqrt_ps();
  408. void Test_mm_rsqrt_ss();
  409. void Test_matrixnet_powerpc();
  410. };
  411. UNIT_TEST_SUITE_REGISTRATION(TSSEEmulTest);
  412. void TSSEEmulTest::Test_mm_load_si128() {
  413. alignas(16) char data[16] = {
  414. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  415. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  416. __m128i value = _mm_load_si128((__m128i*)&data);
  417. UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[0], 0xAABB2211CCFF00AAUL);
  418. UNIT_ASSERT_EQUAL(TQType<uint64x2_t>::As(value)[1], 0x1C66775588449933UL);
  419. }
  420. void TSSEEmulTest::Test_mm_loadu_si128() {
  421. alignas(16) char data[17] = {
  422. '\x66',
  423. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  424. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  425. UNIT_ASSERT((ui64(&data[1]) & 0x1) == 0x1);
  426. __m128i value = _mm_loadu_si128((__m128i*)&data[1]);
  427. UNIT_ASSERT(TQType<uint64x2_t>::As(value)[0] == 0xAABB2211CCFF00AAUL);
  428. UNIT_ASSERT(TQType<uint64x2_t>::As(value)[1] == 0x1C66775588449933UL);
  429. }
  430. void TSSEEmulTest::Test_mm_storeu_si128() {
  431. alignas(16) unsigned char stub[32] = {
  432. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  433. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  434. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  435. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
  436. };
  437. alignas(16) unsigned char value[16] = {
  438. 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
  439. 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf
  440. };
  441. const __m128i val = _mm_loadu_si128((__m128i*)&value[0]);
  442. for (size_t shift = 0; shift != 17; ++shift) {
  443. alignas(16) unsigned char res[sizeof(stub)];
  444. memcpy(res, stub, sizeof(res));
  445. _mm_storeu_si128((__m128i*)&res[shift], val);
  446. alignas(16) unsigned char etalon[sizeof(stub)];
  447. memcpy(etalon, stub, sizeof(etalon));
  448. for (size_t i = 0; i != sizeof(value); ++i) {
  449. etalon[shift + i] = value[i];
  450. }
  451. for (size_t i = 0; i != sizeof(etalon) / sizeof(etalon[0]); ++i) {
  452. UNIT_ASSERT_EQUAL_C(res[i], etalon[i], "res: " << HexEncode(res, 32) << " vs etalon: " << HexEncode(etalon, 32));
  453. }
  454. }
  455. }
  456. void TSSEEmulTest::Test_mm_loadu_si128_2() {
  457. alignas(16) unsigned char stub[32] = {
  458. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  459. 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  460. 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  461. 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
  462. };
  463. for (size_t shift = 0; shift != 17; ++shift) {
  464. const __m128i val = _mm_loadu_si128((const __m128i*)&stub[shift]);
  465. alignas(16) unsigned char res[16];
  466. _mm_store_si128((__m128i*)res, val);
  467. for (size_t i = 0; i != 16; ++i) {
  468. UNIT_ASSERT_EQUAL_C(res[i], stub[i + shift], "res: " << HexEncode(res, 16) << " vs etalon: " << HexEncode(&stub[shift], 16));
  469. }
  470. }
  471. }
  472. void TSSEEmulTest::Test_mm_loadu_ps() {
  473. alignas(16) float stub[8] = {
  474. 0.f, 1.f, 2.f, 3.f,
  475. 4.f, 5.f, 6.f, 7.f
  476. };
  477. for (size_t shift = 0; shift != 5; ++shift) {
  478. const __m128 val = _mm_loadu_ps(&stub[shift]);
  479. alignas(16) float res[4];
  480. _mm_store_ps(res, val);
  481. for (size_t i = 0; i != 4; ++i) {
  482. UNIT_ASSERT_EQUAL_C(res[i], stub[shift + i], "res: " << HexEncode(res, 16) << " vs etalon: " << HexEncode(&stub[shift], 16));
  483. }
  484. }
  485. }
  486. void TSSEEmulTest::Test_mm_storeu_ps() {
  487. alignas(16) float stub[8] = {
  488. 0.f, 1.f, 2.f, 3.f,
  489. 4.f, 5.f, 6.f, 7.f
  490. };
  491. alignas(16) float value[4] = {
  492. 100.f, 101.f, 102.f, 103.f
  493. };
  494. const __m128 val = _mm_load_ps(value);
  495. for (size_t shift = 0; shift != 5; ++shift) {
  496. alignas(16) float res[sizeof(stub) / sizeof(stub[0])];
  497. memcpy(res, stub, sizeof(stub));
  498. _mm_storeu_ps(&res[shift], val);
  499. float etalon[sizeof(stub) / sizeof(stub[0])];
  500. memcpy(etalon, stub, sizeof(stub));
  501. for (size_t i = 0; i != 4; ++i) {
  502. etalon[i + shift] = value[i];
  503. }
  504. for (size_t i = 0; i != sizeof(stub) / sizeof(stub[0]); ++i) {
  505. UNIT_ASSERT_EQUAL_C(res[i], etalon[i], "res: " << HexEncode(res, sizeof(res)) << " vs etalon: " << HexEncode(etalon, sizeof(etalon)));
  506. }
  507. }
  508. }
  509. template<typename C>
  510. C MakeNumber(unsigned number);
  511. template<>
  512. __m128i MakeNumber<__m128i>(unsigned number) {
  513. char data[16] = {0};
  514. memcpy(data, &number, sizeof(number));
  515. return _mm_loadu_si128((__m128i*)data);
  516. }
  517. template<>
  518. unsigned MakeNumber<unsigned>(unsigned number) {
  519. return number;
  520. }
  521. template <typename TElem, int bits, int elemCount,
  522. typename TFunc, typename TShifter, typename TOp, typename TElemFunc>
  523. void TSSEEmulTest::Test_mm_shifter_epiXX() {
  524. char data[16] = {
  525. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  526. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  527. TElem* dataw = reinterpret_cast<TElem*>(&data);
  528. __m128i value = _mm_loadu_si128((__m128i*)&data);
  529. for (unsigned shifter = 0; shifter <= bits; ++shifter) {
  530. TElem shiftedData[elemCount];
  531. for (unsigned i = 0; i < elemCount; ++i) {
  532. shiftedData[i] = TElemFunc::Call(dataw[i], shifter);
  533. }
  534. const TShifter adhoc_shifter = MakeNumber<TShifter>(shifter);
  535. __m128i result = TFunc(value, adhoc_shifter);
  536. for (unsigned i = 0; i < elemCount; ++i) {
  537. UNIT_ASSERT_EQUAL(shiftedData[i], TQType<TOp>::As(result)[i]);
  538. }
  539. }
  540. }
  541. void TSSEEmulTest::Test_mm_byte_shifter(EDirection direction, std::function<TShiftRes (__m128i)> foo) {
  542. const char data[48] = {
  543. '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
  544. '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
  545. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  546. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C',
  547. '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
  548. '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'
  549. };
  550. const __m128i a = _mm_loadu_si128((__m128i*)(data + 16));
  551. const TShiftRes res = foo(a);
  552. for (int shift = 0; shift <= 16; ++shift) {
  553. const int etalon_offset = 16 + (direction == EDirection::Left ? -shift : shift); //!< specific to little endian byte order.
  554. const char* etalon = data + etalon_offset;
  555. const char* res_bytes = (const char*)&res.Value[shift];
  556. for (size_t byte = 0; byte != 16; ++byte) {
  557. UNIT_ASSERT_EQUAL(etalon[byte], res_bytes[byte]);
  558. }
  559. }
  560. }
  561. template <typename TElem>
  562. struct THelperASHR {
  563. static TElem Call(const TElem op, const int shift) {
  564. constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;
  565. if (op < 0) {
  566. // Arithmetic shift propagates sign bit to the right
  567. // while operator>> is implementation defined for negative values,
  568. // so we can't use it as a reference implementation
  569. // and we need to write some standard consistent code.
  570. typedef TFixedWidthUnsignedInt<TElem> TUnsignedElem;
  571. TUnsignedElem uOp(op);
  572. const TUnsignedElem signBit = TUnsignedElem(1) << (nBitsInOp - 1);
  573. Y_ENSURE(shift >= 0);
  574. for (int i = 0; i != shift; ++i) {
  575. uOp = signBit | (uOp >> 1);
  576. }
  577. // unsigned -> signed conversion is also implementation defined, so we need to use some other method.
  578. return reinterpret_cast<TElem&>(uOp);
  579. }
  580. return shift < nBitsInOp ? op >> shift : 0;
  581. }
  582. };
  // Scalar reference for a logical right shift: counts of at least the
  // element width yield zero.
  template <typename TElem>
  struct THelperSHR {
      static TElem Call(const TElem op, const int shift) {
          constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;
          if (shift >= nBitsInOp) {
              return 0;
          }
          return static_cast<TElem>(op >> shift);
      }
  };
  590. void TSSEEmulTest::Test_mm_srli_epi16() {
  591. Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_srli_epi16), unsigned, uint16x8_t,
  592. THelperSHR<ui16>>();
  593. }
  594. void TSSEEmulTest::Test_mm_srli_epi32() {
  595. Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_srli_epi32), unsigned, uint32x4_t,
  596. THelperSHR<ui32>>();
  597. }
  598. void TSSEEmulTest::Test_mm_srli_epi64() {
  599. Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_srli_epi64), unsigned, uint64x2_t,
  600. THelperSHR<ui64>>();
  601. }
  // Scalar reference for a left shift: counts of at least the element width
  // yield zero.
  template <typename TElem>
  struct THelperSHL {
      static TElem Call(const TElem op, const int shift) {
          constexpr int nBitsInOp = sizeof(op) * CHAR_BIT;
          if (shift >= nBitsInOp) {
              return 0;
          }
          return static_cast<TElem>(op << shift);
      }
  };
  609. void TSSEEmulTest::Test_mm_slli_epi16() {
  610. Test_mm_shifter_epiXX<ui16, 16, 8, Wrap(_mm_slli_epi16), unsigned, uint16x8_t,
  611. THelperSHL<ui16>>();
  612. }
  613. void TSSEEmulTest::Test_mm_slli_epi32() {
  614. Test_mm_shifter_epiXX<ui32, 32, 4, Wrap(_mm_slli_epi32), unsigned, uint32x4_t,
  615. THelperSHL<ui32>>();
  616. }
  617. void TSSEEmulTest::Test_mm_slli_epi64() {
  618. Test_mm_shifter_epiXX<ui64, 64, 2, Wrap(_mm_slli_epi64), unsigned, uint64x2_t,
  619. THelperSHL<ui64>>();
  620. }
  621. void TSSEEmulTest::Test_mm_slli_si128() {
  622. Test_mm_byte_shifter(EDirection::Left, [] (__m128i a) -> TShiftRes {
  623. TShiftRes res;
  624. res.Value[0] = _mm_slli_si128(a, 0);
  625. res.Value[1] = _mm_slli_si128(a, 1);
  626. res.Value[2] = _mm_slli_si128(a, 2);
  627. res.Value[3] = _mm_slli_si128(a, 3);
  628. res.Value[4] = _mm_slli_si128(a, 4);
  629. res.Value[5] = _mm_slli_si128(a, 5);
  630. res.Value[6] = _mm_slli_si128(a, 6);
  631. res.Value[7] = _mm_slli_si128(a, 7);
  632. res.Value[8] = _mm_slli_si128(a, 8);
  633. res.Value[9] = _mm_slli_si128(a, 9);
  634. res.Value[10] = _mm_slli_si128(a, 10);
  635. res.Value[11] = _mm_slli_si128(a, 11);
  636. res.Value[12] = _mm_slli_si128(a, 12);
  637. res.Value[13] = _mm_slli_si128(a, 13);
  638. res.Value[14] = _mm_slli_si128(a, 14);
  639. res.Value[15] = _mm_slli_si128(a, 15);
  640. res.Value[16] = _mm_slli_si128(a, 16);
  641. return res;
  642. });
  643. }
  644. void TSSEEmulTest::Test_mm_srl_epi16() {
  645. Test_mm_shifter_epiXX<ui16, 16, 8, T_mm_CallWrapper<__m128i, decltype(_mm_srl_epi16), _mm_srl_epi16>, __m128i, uint16x8_t,
  646. THelperSHR<ui16>>();
  647. }
  648. void TSSEEmulTest::Test_mm_srl_epi32() {
  649. Test_mm_shifter_epiXX<ui32, 32, 4, T_mm_CallWrapper<__m128i, decltype(_mm_srl_epi32), _mm_srl_epi32>, __m128i, uint32x4_t,
  650. THelperSHR<ui32>>();
  651. }
  652. void TSSEEmulTest::Test_mm_srl_epi64() {
  653. Test_mm_shifter_epiXX<ui64, 64, 2, T_mm_CallWrapper<__m128i, decltype(_mm_srl_epi64), _mm_srl_epi64>, __m128i, uint64x2_t,
  654. THelperSHR<ui64>>();
  655. }
  656. void TSSEEmulTest::Test_mm_srai_epi16() {
  657. Test_mm_shifter_epiXX<i16, 16, 8, T_mm_CallWrapper<__m128i, decltype(_mm_srai_epi16), _mm_srai_epi16>, unsigned, int16x8_t,
  658. THelperASHR<i16>>();
  659. }
  660. void TSSEEmulTest::Test_mm_srai_epi32() {
  661. Test_mm_shifter_epiXX<i32, 32, 4, T_mm_CallWrapper<__m128i, decltype(_mm_srai_epi32), _mm_srai_epi32>, unsigned, int32x4_t,
  662. THelperASHR<i32>>();
  663. }
  664. void TSSEEmulTest::Test_mm_srli_si128() {
  665. Test_mm_byte_shifter(EDirection::Right, [](__m128i a) -> TShiftRes {
  666. TShiftRes res;
  667. res.Value[0] = _mm_srli_si128(a, 0);
  668. res.Value[1] = _mm_srli_si128(a, 1);
  669. res.Value[2] = _mm_srli_si128(a, 2);
  670. res.Value[3] = _mm_srli_si128(a, 3);
  671. res.Value[4] = _mm_srli_si128(a, 4);
  672. res.Value[5] = _mm_srli_si128(a, 5);
  673. res.Value[6] = _mm_srli_si128(a, 6);
  674. res.Value[7] = _mm_srli_si128(a, 7);
  675. res.Value[8] = _mm_srli_si128(a, 8);
  676. res.Value[9] = _mm_srli_si128(a, 9);
  677. res.Value[10] = _mm_srli_si128(a, 10);
  678. res.Value[11] = _mm_srli_si128(a, 11);
  679. res.Value[12] = _mm_srli_si128(a, 12);
  680. res.Value[13] = _mm_srli_si128(a, 13);
  681. res.Value[14] = _mm_srli_si128(a, 14);
  682. res.Value[15] = _mm_srli_si128(a, 15);
  683. res.Value[16] = _mm_srli_si128(a, 16);
  684. return res;
  685. });
  686. }
  687. void TSSEEmulTest::Test_mm_sll_epi16() {
  688. Test_mm_shifter_epiXX<ui16, 16, 8, T_mm_CallWrapper<__m128i, decltype(_mm_sll_epi16), _mm_sll_epi16>, __m128i, uint16x8_t,
  689. THelperSHL<ui16>>();
  690. }
  691. void TSSEEmulTest::Test_mm_sll_epi32() {
  692. Test_mm_shifter_epiXX<ui32, 32, 4, T_mm_CallWrapper<__m128i, decltype(_mm_sll_epi32), _mm_sll_epi32>, __m128i, uint32x4_t,
  693. THelperSHL<ui32>>();
  694. }
  695. void TSSEEmulTest::Test_mm_sll_epi64() {
  696. Test_mm_shifter_epiXX<ui64, 64, 2, T_mm_CallWrapper<__m128i, decltype(_mm_sll_epi64), _mm_sll_epi64>, __m128i, uint64x2_t,
  697. THelperSHL<ui64>>();
  698. }
  // Scalar reference for element-wise addition; the sum is converted back to
  // TElem on return.
  template <typename TElem>
  struct THelperAdd {
      static TElem Call(const TElem op1, const TElem op2) {
          return static_cast<TElem>(op1 + op2);
      }
  };
  705. void TSSEEmulTest::Test_mm_add_epi16() {
  706. Test_mm_dualop<ui16, 8, Wrap(_mm_add_epi16), THelperAdd<ui16>, uint16x8_t>();
  707. }
  708. void TSSEEmulTest::Test_mm_add_epi32() {
  709. Test_mm_dualop<ui32, 4, Wrap(_mm_add_epi32), THelperAdd<ui32>, uint32x4_t>();
  710. }
  711. void TSSEEmulTest::Test_mm_add_epi64() {
  712. Test_mm_dualop<ui64, 2, Wrap(_mm_add_epi64), THelperAdd<ui64>, uint64x2_t>();
  713. }
  714. void TSSEEmulTest::Test_mm_add_ps() {
  715. Test_mm_dualop<float, 2, WrapF(_mm_add_ps),
  716. THelperAdd<float>, float32x4_t, __m128>();
  717. }
  718. void TSSEEmulTest::Test_mm_add_pd() {
  719. Test_mm_dualop<double, 2, WrapD(_mm_add_pd),
  720. THelperAdd<double>, float64x2_t, __m128d>();
  721. }
  722. void TSSEEmulTest::Test_mm_madd_epi16() {
  723. alignas(16) const char data1[16] = {
  724. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  725. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'
  726. };
  727. alignas(16) const char data2[16] = {
  728. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  729. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'
  730. };
  731. const __m128i value1 = TFuncLoad<__m128i>(&data1);
  732. const __m128i value2 = TFuncLoad<__m128i>(&data2);
  733. const __m128i res = _mm_madd_epi16(value1, value2);
  734. const i16* dataw1 = reinterpret_cast<const i16*>(&data1);
  735. const i16* dataw2 = reinterpret_cast<const i16*>(&data2);
  736. for (size_t i = 0; i != 4; ++i) {
  737. const size_t dataIdx = i * 2;
  738. const i32 etalonResult = (i32) dataw1[dataIdx] * (i32) dataw2[dataIdx] + (i32) dataw1[dataIdx + 1] * (i32) dataw2[dataIdx + 1];
  739. const i32 value = TQType<int32x4_t>::As(res)[i];
  740. UNIT_ASSERT_EQUAL(value, etalonResult);
  741. }
  742. }
  // Scalar reference for element-wise subtraction; the difference is
  // converted back to TElem on return.
  template <typename TElem>
  struct THelperSub {
      static TElem Call(const TElem op1, const TElem op2) {
          return static_cast<TElem>(op1 - op2);
      }
  };
  749. void TSSEEmulTest::Test_mm_sub_epi16() {
  750. Test_mm_dualop<ui16, 8, Wrap(_mm_sub_epi16), THelperSub<ui16>, uint16x8_t>();
  751. }
  752. void TSSEEmulTest::Test_mm_sub_epi32() {
  753. Test_mm_dualop<ui32, 4, Wrap(_mm_sub_epi32), THelperSub<ui32>, uint32x4_t>();
  754. }
  755. void TSSEEmulTest::Test_mm_sub_epi64() {
  756. Test_mm_dualop<ui64, 2, Wrap(_mm_sub_epi64), THelperSub<ui64>, uint64x2_t>();
  757. }
  758. void TSSEEmulTest::Test_mm_sub_ps() {
  759. Test_mm_dualop<float, 4, WrapF(_mm_sub_ps), THelperSub<float>,
  760. float32x4_t, __m128>();
  761. }
  762. void TSSEEmulTest::Test_mm_sub_pd() {
  763. Test_mm_dualop<double, 2, WrapD(_mm_sub_pd), THelperSub<double>,
  764. float64x2_t, __m128d>();
  765. }
  766. void TSSEEmulTest::Test_mm_mul_ps() {
  767. struct THelper {
  768. static float Call(const float op1, const float op2) {
  769. return op1 * op2;
  770. }
  771. };
  772. Test_mm_dualop<float, 4, WrapF(_mm_mul_ps), THelper, float32x4_t, __m128>();
  773. }
  774. void TSSEEmulTest::Test_mm_mul_pd() {
  775. struct THelper {
  776. static double Call(const double op1, const double op2) {
  777. return op1 * op2;
  778. }
  779. };
  780. Test_mm_dualop<double, 2, WrapD(_mm_mul_pd), THelper, float64x2_t, __m128d>();
  781. }
  782. void TSSEEmulTest::Test_mm_div_ps() {
  783. struct THelper {
  784. static float Call(const float op1, const float op2) {
  785. return op1 / op2;
  786. }
  787. };
  788. Test_mm_dualop<float, 4, WrapF(_mm_div_ps), THelper, float32x4_t, __m128>();
  789. }
  790. void TSSEEmulTest::Test_mm_div_pd() {
  791. struct THelper {
  792. static double Call(const double op1, const double op2) {
  793. return op1 / op2;
  794. }
  795. };
  796. Test_mm_dualop<double, 2, WrapD(_mm_div_pd), THelper, float64x2_t, __m128d>();
  797. }
  798. void TSSEEmulTest::Test_mm_max_ps() {
  799. struct THelper {
  800. static float Call(const float op1, const float op2) {
  801. return std::max(op1, op2);
  802. }
  803. };
  804. Test_mm_dualop<float, 4, WrapF(_mm_max_ps), THelper, float32x4_t, __m128>();
  805. }
  806. void TSSEEmulTest::Test_mm_min_ps() {
  807. struct THelper {
  808. static float Call(const float op1, const float op2) {
  809. return std::min(op1, op2);
  810. }
  811. };
  812. Test_mm_dualop<float, 4, WrapF(_mm_min_ps), THelper, float32x4_t, __m128>();
  813. }
  814. void TSSEEmulTest::Test_mm_and_ps() {
  815. struct THelper {
  816. static float Call(const float op1, const float op2) {
  817. union Cast {
  818. unsigned int AsUInt;
  819. float AsFloat;
  820. };
  821. Cast v1, v2, result;
  822. v1.AsFloat = op1;
  823. v2.AsFloat = op2;
  824. result.AsUInt = v1.AsUInt & v2.AsUInt;
  825. return result.AsFloat;
  826. }
  827. };
  828. Test_mm_dualcmp<float, 4, WrapF(_mm_and_ps),
  829. THelper, float32x4_t, __m128>();
  830. }
  831. template <typename TElem, int bits, int elemCount, int shift,
  832. typename TFunc, typename TOp>
  833. void TSSEEmulTest::Test_mm_unpack_epiXX() {
  834. char data1[16] = {
  835. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  836. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  837. char data2[16] = {
  838. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  839. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
  840. TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
  841. TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
  842. __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
  843. __m128i value2 = _mm_loadu_si128((__m128i*)&data2);
  844. TElem zippedData[elemCount];
  845. for (unsigned i = 0; i < elemCount / 2; ++i) {
  846. zippedData[i * 2] = dataw1[i + shift];
  847. zippedData[i * 2 + 1] = dataw2[i + shift];
  848. }
  849. __m128i result = TFunc(value1, value2);
  850. for (unsigned i = 0; i < elemCount / 2; ++i) {
  851. UNIT_ASSERT_EQUAL(zippedData[i * 2], TQType<TOp>::As(result)[i * 2]);
  852. UNIT_ASSERT_EQUAL(zippedData[i * 2 + 1],
  853. TQType<TOp>::As(result)[i * 2 + 1]);
  854. }
  855. }
  856. void TSSEEmulTest::Test_mm_unpacklo_epi8() {
  857. Test_mm_unpack_epiXX<ui8, 8, 16, 0, Wrap(_mm_unpacklo_epi8), uint8x16_t>();
  858. }
  859. void TSSEEmulTest::Test_mm_unpackhi_epi8() {
  860. Test_mm_unpack_epiXX<ui8, 8, 16, 8, Wrap(_mm_unpackhi_epi8), uint8x16_t>();
  861. }
  862. void TSSEEmulTest::Test_mm_unpacklo_epi16() {
  863. Test_mm_unpack_epiXX<ui16, 16, 8, 0, Wrap(_mm_unpacklo_epi16), uint16x8_t>();
  864. }
  865. void TSSEEmulTest::Test_mm_unpackhi_epi16() {
  866. Test_mm_unpack_epiXX<ui16, 16, 8, 4, Wrap(_mm_unpackhi_epi16), uint16x8_t>();
  867. }
  868. void TSSEEmulTest::Test_mm_unpacklo_epi32() {
  869. Test_mm_unpack_epiXX<ui32, 32, 4, 0, Wrap(_mm_unpacklo_epi32), uint32x4_t>();
  870. }
  871. void TSSEEmulTest::Test_mm_unpackhi_epi32() {
  872. Test_mm_unpack_epiXX<ui32, 32, 4, 2, Wrap(_mm_unpackhi_epi32), uint32x4_t>();
  873. }
  874. void TSSEEmulTest::Test_mm_unpacklo_epi64() {
  875. Test_mm_unpack_epiXX<ui64, 64, 2, 0, Wrap(_mm_unpacklo_epi64), uint64x2_t>();
  876. }
  877. void TSSEEmulTest::Test_mm_unpackhi_epi64() {
  878. Test_mm_unpack_epiXX<ui64, 64, 2, 1, Wrap(_mm_unpackhi_epi64), uint64x2_t>();
  879. }
  880. template <typename TElem, unsigned elemCount,
  881. typename TFunc, typename TElemFunc,
  882. typename TOp, typename TVectorType>
  883. void TSSEEmulTest::Test_mm_dualop() {
  884. char data1[16] = {
  885. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  886. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  887. char data2[16] = {
  888. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  889. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
  890. TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
  891. TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
  892. TVectorType value1 = TFuncLoad<TVectorType>(&data1);
  893. TVectorType value2 = TFuncLoad<TVectorType>(&data2);
  894. TElem procData[elemCount];
  895. for (unsigned i = 0; i < elemCount; ++i) {
  896. procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
  897. }
  898. TVectorType result = TFunc(value1, value2);
  899. for (unsigned i = 0; i < elemCount; ++i) {
  900. UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
  901. }
  902. }
  903. /* This is almost the same as Test_mm_dualop,
  904. but different data1 and data2 */
  905. template <typename TElem, unsigned elemCount,
  906. typename TFunc, typename TElemFunc,
  907. typename TOp, typename TVectorType>
  908. void TSSEEmulTest::Test_mm_dualcmp() {
  909. char data1[16] = {
  910. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x66', '\x77', '\xAA',
  911. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C'};
  912. char data2[16] = {
  913. '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
  914. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
  915. TElem* dataw1 = reinterpret_cast<TElem*>(&data1);
  916. TElem* dataw2 = reinterpret_cast<TElem*>(&data2);
  917. TVectorType value1 = TFuncLoad<TVectorType>(&data1);
  918. TVectorType value2 = TFuncLoad<TVectorType>(&data2);
  919. TElem procData[elemCount];
  920. for (unsigned i = 0; i < elemCount; ++i) {
  921. procData[i] = TElemFunc::Call(dataw1[i], dataw2[i]);
  922. }
  923. TVectorType result = TFunc(value1, value2);
  924. for (unsigned i = 0; i < elemCount; ++i) {
  925. /* memcmp is for compare to invalid floats in results */
  926. const TElem value = TQType<TOp>::As(result)[i];
  927. UNIT_ASSERT(memcmp(&(procData[i]), &value, sizeof(TElem)) == 0);
  928. }
  929. }
  930. void TSSEEmulTest::Test_mm_or_si128() {
  931. struct THelper {
  932. static ui64 Call(const ui64 op1, const ui64 op2) {
  933. return op1 | op2;
  934. }
  935. };
  936. Test_mm_dualop<ui64, 2, Wrap(_mm_or_si128), THelper, uint64x2_t>();
  937. }
  938. void TSSEEmulTest::Test_mm_and_si128() {
  939. struct THelper {
  940. static ui64 Call(const ui64 op1, const ui64 op2) {
  941. return op1 & op2;
  942. }
  943. };
  944. Test_mm_dualop<ui64, 2, Wrap(_mm_and_si128), THelper, uint64x2_t>();
  945. }
  946. void TSSEEmulTest::Test_mm_andnot_si128() {
  947. struct THelper {
  948. static ui64 Call(const ui64 op1, const ui64 op2) {
  949. return (~op1) & op2;
  950. }
  951. };
  952. Test_mm_dualop<ui64, 2, Wrap(_mm_andnot_si128), THelper, uint64x2_t>();
  953. }
  /* Scalar model of an "equal" compare: every bit of the result is set
     when the operands are equal, every bit is clear otherwise. */
  template <typename TElem>
  struct THelperCMPEQ {
      static TElem Call(const TElem op1, const TElem op2) {
          if (op1 == op2) {
              return ~TElem(0);
          }
          return TElem(0);
      }
  };
  960. void TSSEEmulTest::Test_mm_cmpeq_epi8() {
  961. Test_mm_dualcmp<ui8, 16, Wrap(_mm_cmpeq_epi8),
  962. THelperCMPEQ<ui8>, uint8x16_t>();
  963. }
  964. void TSSEEmulTest::Test_mm_cmpeq_epi16() {
  965. Test_mm_dualcmp<ui16, 8, Wrap(_mm_cmpeq_epi16),
  966. THelperCMPEQ<ui16>, uint16x8_t>();
  967. }
  968. void TSSEEmulTest::Test_mm_cmpeq_epi32() {
  969. Test_mm_dualcmp<ui32, 4, Wrap(_mm_cmpeq_epi32),
  970. THelperCMPEQ<ui32>, uint32x4_t>();
  971. }
  972. void TSSEEmulTest::Test_mm_cmpeq_ps() {
  973. struct THelperFloat {
  974. static float Call(const float op1, const float op2) {
  975. union Cast {
  976. unsigned int AsUInt;
  977. float AsFloat;
  978. };
  979. Cast value;
  980. value.AsUInt = op1 == op2 ? 0xFFFFFFFF : 0;
  981. return value.AsFloat;
  982. }
  983. };
  984. Test_mm_dualcmp<float, 4, WrapF(_mm_cmpeq_ps),
  985. THelperFloat, float32x4_t, __m128>();
  986. }
  /* Scalar model of a "greater than" compare: every bit of the result is
     set when op1 > op2, every bit is clear otherwise. */
  template <typename TElem>
  struct THelperCMPGT {
      static TElem Call(const TElem op1, const TElem op2) {
          if (op1 > op2) {
              return ~TElem(0);
          }
          return TElem(0);
      }
  };
  993. void TSSEEmulTest::Test_mm_cmpgt_epi8() {
  994. Test_mm_dualcmp<i8, 16, Wrap(_mm_cmpgt_epi8),
  995. THelperCMPGT<i8>, int8x16_t>();
  996. }
  997. void TSSEEmulTest::Test_mm_cmpgt_epi16() {
  998. Test_mm_dualcmp<i16, 8, Wrap(_mm_cmpgt_epi16),
  999. THelperCMPGT<i16>, int16x8_t>();
  1000. }
  1001. void TSSEEmulTest::Test_mm_cmpgt_epi32() {
  1002. Test_mm_dualcmp<i32, 4, Wrap(_mm_cmpgt_epi32),
  1003. THelperCMPGT<i32>, int32x4_t>();
  1004. }
  1005. void TSSEEmulTest::Test_mm_cmpgt_ps() {
  1006. struct THelperFloat {
  1007. static float Call(const float op1, const float op2) {
  1008. union Cast {
  1009. unsigned int AsUInt;
  1010. float AsFloat;
  1011. };
  1012. Cast value;
  1013. value.AsUInt = op1 > op2 ? 0xFFFFFFFF : 0;
  1014. return value.AsFloat;
  1015. }
  1016. };
  1017. Test_mm_dualcmp<float, 4, WrapF(_mm_cmpgt_ps),
  1018. THelperFloat, float32x4_t, __m128>();
  1019. }
  /* Scalar model of a "less than" compare: every bit of the result is
     set when op1 < op2, every bit is clear otherwise. */
  template <typename TElem>
  struct THelperCMPLT {
      static TElem Call(const TElem op1, const TElem op2) {
          if (op1 < op2) {
              return ~TElem(0);
          }
          return TElem(0);
      }
  };
  1026. void TSSEEmulTest::Test_mm_cmplt_epi8() {
  1027. Test_mm_dualcmp<i8, 16, Wrap(_mm_cmplt_epi8),
  1028. THelperCMPLT<i8>, int8x16_t>();
  1029. }
  1030. void TSSEEmulTest::Test_mm_cmplt_epi16() {
  1031. Test_mm_dualcmp<i16, 8, Wrap(_mm_cmplt_epi16),
  1032. THelperCMPLT<i16>, int16x8_t>();
  1033. }
  1034. void TSSEEmulTest::Test_mm_cmplt_epi32() {
  1035. Test_mm_dualcmp<i32, 4, Wrap(_mm_cmplt_epi32),
  1036. THelperCMPLT<i32>, int32x4_t>();
  1037. }
  1038. template <typename TElem, int elemCount,
  1039. typename TFunc, typename TOp, typename TVectorType>
  1040. void TSSEEmulTest::Test_mm_setter_epiXX() {
  1041. char data[64] = {
  1042. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1043. '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
  1044. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  1045. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
  1046. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
  1047. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
  1048. '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
  1049. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
  1050. TElem* dataw = reinterpret_cast<TElem*>(&data);
  1051. for (unsigned dataItem = 0; dataItem < elemCount * 4; ++dataItem) {
  1052. TVectorType value = TFunc(dataw[dataItem]);
  1053. for (unsigned i = 0; i < elemCount; ++i)
  1054. UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<TOp>::As(value)[i]);
  1055. }
  1056. }
  1057. void TSSEEmulTest::Test_mm_set1_epi8() {
  1058. Test_mm_setter_epiXX<i8, 16, Wrap(_mm_set1_epi8), int8x16_t, __m128i>();
  1059. }
  1060. void TSSEEmulTest::Test_mm_set1_epi16() {
  1061. Test_mm_setter_epiXX<i16, 8, Wrap(_mm_set1_epi16), int16x8_t, __m128i>();
  1062. }
  1063. void TSSEEmulTest::Test_mm_set1_epi32() {
  1064. Test_mm_setter_epiXX<i32, 4, Wrap(_mm_set1_epi32), int32x4_t, __m128i>();
  1065. }
  1066. void TSSEEmulTest::Test_mm_set1_ps() {
  1067. Test_mm_setter_epiXX<float, 4, WrapF(_mm_set1_ps), float32x4_t, __m128>();
  1068. }
  1069. void TSSEEmulTest::Test_mm_set_ps1() {
  1070. Test_mm_setter_epiXX<float, 4, WrapF(_mm_set_ps1), float32x4_t, __m128>();
  1071. }
  1072. void TSSEEmulTest::Test_mm_setzero_si128() {
  1073. __m128i value = _mm_setzero_si128();
  1074. for (unsigned i = 0; i < 4; ++i)
  1075. UNIT_ASSERT_EQUAL(0, TQType<uint32x4_t>::As(value)[i]);
  1076. }
  1077. void TSSEEmulTest::Test_mm_setzero_ps() {
  1078. __m128 value = _mm_setzero_ps();
  1079. for (unsigned i = 0; i < 4; ++i)
  1080. UNIT_ASSERT_EQUAL(0.0, TQType<float32x4_t>::As(value)[i]);
  1081. }
  1082. void TSSEEmulTest::Test_mm_setzero_pd() {
  1083. __m128d value = _mm_setzero_pd();
  1084. for (unsigned i = 0; i < 2; ++i)
  1085. UNIT_ASSERT_EQUAL(0.0, TQType<float64x2_t>::As(value)[i]);
  1086. }
  1087. void TSSEEmulTest::Test_mm_loadl_epi64() {
  1088. char data[64] = {
  1089. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1090. '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
  1091. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  1092. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
  1093. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
  1094. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
  1095. '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
  1096. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
  1097. ui64* dataw = reinterpret_cast<ui64*>(&data);
  1098. for (unsigned dataItem = 0; dataItem < 8; ++dataItem) {
  1099. __m128i value = _mm_loadl_epi64((__m128i const*)&dataw[dataItem]);
  1100. UNIT_ASSERT_EQUAL(dataw[dataItem], TQType<uint64x2_t>::As(value)[0]);
  1101. UNIT_ASSERT_EQUAL(0, TQType<uint64x2_t>::As(value)[1]);
  1102. }
  1103. }
  1104. void TSSEEmulTest::Test_mm_storel_epi64() {
  1105. char data[64] = {
  1106. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1107. '\x33', '\x99', '\x44', '\x00', '\x55', '\x77', '\x66', '\x1C',
  1108. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  1109. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF',
  1110. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x00', '\x00', '\x00',
  1111. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x66', '\x1C',
  1112. '\x99', '\x33', '\xFF', '\xCC', '\x88', '\x66', '\x77', '\x44',
  1113. '\x33', '\x99', '\x44', '\x88', '\xCC', '\xBB', '\x22', '\xFF'};
  1114. ui64* dataw = reinterpret_cast<ui64*>(&data);
  1115. for (unsigned dataItem = 0; dataItem < 4; ++dataItem) {
  1116. __m128i value = _mm_loadu_si128((__m128i*)&dataw[dataItem * 2]);
  1117. ui64 buf[2] = {55, 81};
  1118. _mm_storel_epi64((__m128i*)&buf, value);
  1119. UNIT_ASSERT_EQUAL(dataw[dataItem * 2], buf[0]);
  1120. UNIT_ASSERT_EQUAL(81, buf[1]);
  1121. }
  1122. }
  1123. void TSSEEmulTest::Test_mm_shuffle_epi32() {
  1124. char data[16] = {
  1125. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1126. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1127. ui32* dataw = reinterpret_cast<ui32*>(&data);
  1128. __m128i value = _mm_loadu_si128((__m128i*)&data);
  1129. int coding[4] = {1, 3, 0, 2};
  1130. __m128i result = _mm_shuffle_epi32(value, _MM_SHUFFLE(2, 0, 3, 1));
  1131. for (unsigned i = 0; i < 4; ++i)
  1132. UNIT_ASSERT_EQUAL(dataw[coding[i]],
  1133. TQType<uint32x4_t>::As(result)[i]);
  1134. }
  /* Returns the most significant bit (0x80) of `data`, moved to bit
     position `at` of the result; all other result bits are zero. */
  static int GetHighBitAt(char data, int at) {
      const int highBit = ((data & 0x80) != 0) ? 1 : 0;
      return highBit << at;
  }
  1139. void TSSEEmulTest::Test_mm_movemask_epi8() {
  1140. char data[16] = {
  1141. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1142. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1143. __m128i value = _mm_loadu_si128((__m128i*)&data);
  1144. int result = _mm_movemask_epi8(value);
  1145. int verify = 0;
  1146. for (unsigned i = 0; i < 16; ++i) {
  1147. verify |= GetHighBitAt(data[i], i);
  1148. }
  1149. UNIT_ASSERT_EQUAL(result, verify);
  1150. }
  1151. void TSSEEmulTest::Test_mm_movemask_ps() {
  1152. char data[16] = {
  1153. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1154. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1155. __m128 value = _mm_loadu_ps((float*)&data);
  1156. int result = _mm_movemask_ps(value);
  1157. int verify = 0;
  1158. for (unsigned i = 0; i < 4; ++i) {
  1159. verify |= GetHighBitAt(data[i * 4 + 3], i);
  1160. }
  1161. UNIT_ASSERT_EQUAL(result, verify);
  1162. }
  1163. void TSSEEmulTest::Test_mm_movemask_ps_2() {
  1164. char data[16] = {
  1165. '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF',
  1166. '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF', '\xFF'};
  1167. __m128 value = _mm_loadu_ps((float*)&data);
  1168. int result = _mm_movemask_ps(value);
  1169. UNIT_ASSERT_EQUAL(result, 0xf);
  1170. }
  1171. void TSSEEmulTest::Test_mm_cvtsi128_si32() {
  1172. char data[16] = {
  1173. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1174. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1175. __m128i value = _mm_loadu_si128((__m128i*)&data);
  1176. int result = _mm_cvtsi128_si32(value);
  1177. i32* datap = reinterpret_cast<i32*>(&data);
  1178. int verify = datap[0];
  1179. UNIT_ASSERT_EQUAL(result, verify);
  1180. }
  1181. void TSSEEmulTest::Test_mm_cvtsi128_si64() {
  1182. char data[16] = {
  1183. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1184. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1185. __m128i value = _mm_loadu_si128((__m128i*)&data);
  1186. i64 result = _mm_cvtsi128_si64(value);
  1187. i64* datap = reinterpret_cast<i64*>(&data);
  1188. i64 verify = datap[0];
  1189. UNIT_ASSERT_EQUAL(result, verify);
  1190. }
  1191. void TSSEEmulTest::Test_mm_set_epi16() {
  1192. char data[16] = {
  1193. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1194. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1195. i16* dataw = reinterpret_cast<i16*>(&data);
  1196. ui64* dataq = reinterpret_cast<ui64*>(&data);
  1197. __m128i result = _mm_set_epi16(dataw[7], dataw[6], dataw[5], dataw[4],
  1198. dataw[3], dataw[2], dataw[1], dataw[0]);
  1199. ui64 buf[2] = {53, 81};
  1200. _mm_storeu_si128((__m128i*)&buf, result);
  1201. UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
  1202. UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
  1203. }
  1204. void TSSEEmulTest::Test_mm_set_epi32() {
  1205. char data[16] = {
  1206. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1207. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1208. i32* dataw = reinterpret_cast<i32*>(&data);
  1209. ui64* dataq = reinterpret_cast<ui64*>(&data);
  1210. __m128i result = _mm_set_epi32(dataw[3], dataw[2], dataw[1], dataw[0]);
  1211. ui64 buf[2] = {53, 81};
  1212. _mm_storeu_si128((__m128i*)&buf, result);
  1213. UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
  1214. UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
  1215. }
  1216. void TSSEEmulTest::Test_mm_set_ps() {
  1217. char data[16] = {
  1218. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1219. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1220. float* dataw = reinterpret_cast<float*>(&data);
  1221. ui64* dataq = reinterpret_cast<ui64*>(&data);
  1222. __m128 result = _mm_set_ps(dataw[3], dataw[2], dataw[1], dataw[0]);
  1223. ui64 buf[2] = {53, 81};
  1224. _mm_storeu_ps((float*)&buf, result);
  1225. UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
  1226. UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
  1227. }
  1228. void TSSEEmulTest::Test_mm_set_pd() {
  1229. char data[16] = {
  1230. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1231. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1232. double* dataw = reinterpret_cast<double*>(&data);
  1233. ui64* dataq = reinterpret_cast<ui64*>(&data);
  1234. __m128d result = _mm_set_pd(dataw[1], dataw[0]);
  1235. ui64 buf[2] = {53, 81};
  1236. _mm_storeu_pd((double*)&buf, result);
  1237. UNIT_ASSERT_EQUAL(buf[0], dataq[0]);
  1238. UNIT_ASSERT_EQUAL(buf[1], dataq[1]);
  1239. }
  1240. void TSSEEmulTest::Test_mm_cvtsi32_si128() {
  1241. char data[16] = {
  1242. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1243. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1244. i32* dataw = reinterpret_cast<i32*>(&data);
  1245. __m128i result = _mm_cvtsi32_si128(dataw[0]);
  1246. i32 buf[4] = {53, 81, -43, 2132};
  1247. _mm_storeu_si128((__m128i*)&buf, result);
  1248. UNIT_ASSERT_EQUAL(buf[0], dataw[0]);
  1249. UNIT_ASSERT_EQUAL(buf[1], 0);
  1250. UNIT_ASSERT_EQUAL(buf[2], 0);
  1251. UNIT_ASSERT_EQUAL(buf[3], 0);
  1252. }
  1253. void TSSEEmulTest::Test_mm_cvtsi64_si128() {
  1254. char data[16] = {
  1255. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1256. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1257. i64* dataw = reinterpret_cast<i64*>(&data);
  1258. __m128i result = _mm_cvtsi64_si128(dataw[0]);
  1259. i64 buf[2] = {7, 8};
  1260. _mm_storeu_si128((__m128i*)&buf, result);
  1261. UNIT_ASSERT_EQUAL(buf[0], dataw[0]);
  1262. UNIT_ASSERT_EQUAL(buf[1], 0);
  1263. }
  1264. template <typename TElem, typename TNarrow, unsigned elemCount, typename TFunc>
  1265. void TSSEEmulTest::Test_mm_packs_epiXX() {
  1266. char data[32] = {
  1267. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1268. '\x33', '\x99', '\x44', '\x88', '\x55', '\x00', '\x66', '\x1C',
  1269. '\x99', '\x33', '\x1C', '\x55', '\x00', '\x00', '\x00', '\x00',
  1270. '\x00', '\xAA', '\x00', '\x00', '\xCC', '\xBB', '\x22', '\xFF'};
  1271. __m128i value0 = _mm_loadu_si128((__m128i*)&data);
  1272. __m128i value1 = _mm_loadu_si128(((__m128i*)&data) + 1);
  1273. TElem* dataw = reinterpret_cast<TElem*>(&data);
  1274. __m128i result = TFunc(value0, value1);
  1275. TNarrow verify[elemCount];
  1276. for (unsigned i = 0; i < elemCount; ++i) {
  1277. TElem sum = dataw[i];
  1278. if (sum > std::numeric_limits<TNarrow>::max())
  1279. sum = std::numeric_limits<TNarrow>::max();
  1280. if (sum < std::numeric_limits<TNarrow>::min())
  1281. sum = std::numeric_limits<TNarrow>::min();
  1282. verify[i] = TNarrow(sum);
  1283. }
  1284. ui64* verifyp = (ui64*)&verify;
  1285. UNIT_ASSERT_EQUAL(verifyp[0], TQType<uint64x2_t>::As(result)[0]);
  1286. UNIT_ASSERT_EQUAL(verifyp[1], TQType<uint64x2_t>::As(result)[1]);
  1287. }
  1288. void TSSEEmulTest::Test_mm_packs_epi16() {
  1289. Test_mm_packs_epiXX<i16, i8, 16, Wrap(_mm_packs_epi16)>();
  1290. }
  1291. void TSSEEmulTest::Test_mm_packs_epi32() {
  1292. Test_mm_packs_epiXX<i32, i16, 8, Wrap(_mm_packs_epi32)>();
  1293. }
  1294. void TSSEEmulTest::Test_mm_packus_epi16() {
  1295. Test_mm_packs_epiXX<i16, ui8, 16, Wrap(_mm_packus_epi16)>();
  1296. }
  1297. void TSSEEmulTest::Test_mm_extract_epi8() {
  1298. alignas(16) char data[16] = {
  1299. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1300. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1301. const ui8* dataw = reinterpret_cast<const ui8*>(&data);
  1302. const __m128i value = _mm_loadu_si128((__m128i*)&data);
  1303. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0]));
  1304. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 1)), int(dataw[1]));
  1305. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 2)), int(dataw[2]));
  1306. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 3)), int(dataw[3]));
  1307. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 4)), int(dataw[4]));
  1308. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 5)), int(dataw[5]));
  1309. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 6)), int(dataw[6]));
  1310. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 7)), int(dataw[7]));
  1311. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 8)), int(dataw[8]));
  1312. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 9)), int(dataw[9]));
  1313. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 10)), int(dataw[10]));
  1314. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 11)), int(dataw[11]));
  1315. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 12)), int(dataw[12]));
  1316. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 13)), int(dataw[13]));
  1317. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 14)), int(dataw[14]));
  1318. UNIT_ASSERT_EQUAL((_mm_extract_epi8(value, 15)), int(dataw[15]));
  1319. }
  1320. void TSSEEmulTest::Test_mm_extract_epi16() {
  1321. alignas(16) char data[16] = {
  1322. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1323. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1324. const ui16* dataw = reinterpret_cast<const ui16*>(&data);
  1325. const __m128i value = _mm_loadu_si128((__m128i*)&data);
  1326. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 0)), int(dataw[0]));
  1327. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 1)), int(dataw[1]));
  1328. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 2)), int(dataw[2]));
  1329. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 3)), int(dataw[3]));
  1330. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 4)), int(dataw[4]));
  1331. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 5)), int(dataw[5]));
  1332. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 6)), int(dataw[6]));
  1333. UNIT_ASSERT_EQUAL((_mm_extract_epi16(value, 7)), int(dataw[7]));
  1334. }
  1335. void TSSEEmulTest::Test_mm_extract_epi64() {
  1336. alignas(16) char data[16] = {
  1337. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1338. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1339. const ui64* dataw = reinterpret_cast<const ui64*>(&data);
  1340. const __m128i value = _mm_loadu_si128((__m128i*)&data);
  1341. UNIT_ASSERT_EQUAL((_mm_extract_epi64(value, 0)), (long long)(dataw[0]));
  1342. UNIT_ASSERT_EQUAL((_mm_extract_epi64(value, 1)), (long long)(dataw[1]));
  1343. }
  1344. void TSSEEmulTest::Test_mm_extract_epi32() {
  1345. alignas(16) char data[16] = {
  1346. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1347. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1348. const ui32* dataw = reinterpret_cast<const ui32*>(&data);
  1349. const __m128i value = _mm_loadu_si128((__m128i*)&data);
  1350. UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 0)), int(dataw[0]));
  1351. UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 1)), int(dataw[1]));
  1352. UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 2)), int(dataw[2]));
  1353. UNIT_ASSERT_EQUAL((_mm_extract_epi32(value, 3)), int(dataw[3]));
  1354. }
  1355. void TSSEEmulTest::Test_MM_TRANSPOSE4_PS() {
  1356. char data0[16] = {
  1357. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1358. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1359. char data1[16] = {
  1360. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  1361. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
  1362. char data2[16] = {
  1363. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1364. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1365. char data3[16] = {
  1366. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  1367. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
  1368. __m128 value0 = _mm_loadu_ps((float*)&data0);
  1369. __m128 value1 = _mm_loadu_ps((float*)&data1);
  1370. __m128 value2 = _mm_loadu_ps((float*)&data2);
  1371. __m128 value3 = _mm_loadu_ps((float*)&data3);
  1372. _MM_TRANSPOSE4_PS(value0, value1, value2, value3);
  1373. ui64 tbuf0[2] = {0, 0};
  1374. ui64 tbuf1[2] = {0, 0};
  1375. ui64 tbuf2[2] = {0, 0};
  1376. ui64 tbuf3[2] = {0, 0};
  1377. _mm_storeu_ps((float*)&tbuf0, value0);
  1378. _mm_storeu_ps((float*)&tbuf1, value1);
  1379. _mm_storeu_ps((float*)&tbuf2, value2);
  1380. _mm_storeu_ps((float*)&tbuf3, value3);
  1381. char tdata0[16] = {
  1382. '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55',
  1383. '\xAA', '\x00', '\xFF', '\xCC', '\x99', '\x33', '\x1C', '\x55'};
  1384. char tdata1[16] = {
  1385. '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44',
  1386. '\x11', '\x22', '\xBB', '\xAA', '\x88', '\x66', '\x77', '\x44'};
  1387. char tdata2[16] = {
  1388. '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11',
  1389. '\x33', '\x99', '\x44', '\x88', '\x00', '\xAA', '\xAA', '\x11'};
  1390. char tdata3[16] = {
  1391. '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF',
  1392. '\x55', '\x77', '\x66', '\x1C', '\xCC', '\xBB', '\x22', '\xFF'};
  1393. UNIT_ASSERT(memcmp(tbuf0, tdata0, 16) == 0);
  1394. UNIT_ASSERT(memcmp(tbuf1, tdata1, 16) == 0);
  1395. UNIT_ASSERT(memcmp(tbuf2, tdata2, 16) == 0);
  1396. UNIT_ASSERT(memcmp(tbuf3, tdata3, 16) == 0);
  1397. }
  1398. template <typename TFrom, typename TTo, unsigned elemCount,
  1399. typename TLoadVector, typename TResultVector,
  1400. typename TElemFunc, typename TFunc, typename TOp>
  1401. void TSSEEmulTest::Test_mm_convertop() {
  1402. char data[16] = {
  1403. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1404. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1405. TFrom* datap = reinterpret_cast<TFrom*>(&data);
  1406. TLoadVector value = TFuncLoad<TLoadVector>(&data);
  1407. TTo procData[elemCount];
  1408. for (unsigned i = 0; i < elemCount; ++i) {
  1409. procData[i] = TElemFunc::Call(datap[i]);
  1410. }
  1411. TResultVector result = TFunc(value);
  1412. for (unsigned i = 0; i < elemCount; ++i) {
  1413. UNIT_ASSERT_EQUAL(procData[i], TQType<TOp>::As(result)[i]);
  1414. }
  1415. }
  1416. void TSSEEmulTest::Test_mm_cvtepi32_ps() {
  1417. struct THelper {
  1418. static float Call(const i32 op) {
  1419. return float(op);
  1420. }
  1421. };
  1422. Test_mm_convertop<i32, float, 4, __m128i, __m128,
  1423. THelper, WrapF(_mm_cvtepi32_ps), float32x4_t>();
  1424. }
  1425. void TSSEEmulTest::Test_mm_cvtps_epi32() {
  1426. struct THelper {
  1427. static i32 Call(const float op) {
  1428. return i32(op);
  1429. }
  1430. };
  1431. Test_mm_convertop<float, i32, 4, __m128, __m128i,
  1432. THelper, T_mm_CallWrapper<__m128i, decltype(_mm_cvtps_epi32), _mm_cvtps_epi32>, int32x4_t>();
  1433. }
  1434. void TSSEEmulTest::Test_mm_cvttps_epi32() {
  1435. struct THelper {
  1436. static i32 Call(const float op) {
  1437. return i32(op);
  1438. }
  1439. };
  1440. Test_mm_convertop<float, i32, 4, __m128, __m128i,
  1441. THelper, Wrap(_mm_cvttps_epi32), int32x4_t>();
  1442. }
  1443. template <typename TLoadVector, typename TCastVector,
  1444. typename TFunc, TFunc* func>
  1445. void TSSEEmulTest::Test_mm_castXX() {
  1446. char data[16] = {
  1447. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1448. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1449. TLoadVector value = TFuncLoad<TLoadVector>(&data);
  1450. const TLoadVector constvalue = TFuncLoad<TLoadVector>(&data);
  1451. TCastVector casted = func(value);
  1452. const TCastVector constcasted = func(constvalue);
  1453. char verify[16];
  1454. char constverify[16];
  1455. TFuncStore<TCastVector>(&verify, casted);
  1456. TFuncStore<TCastVector>(&constverify, constcasted);
  1457. UNIT_ASSERT(memcmp(&data, &verify, 16) == 0);
  1458. UNIT_ASSERT(memcmp(&data, &constverify, 16) == 0);
  1459. }
  1460. void TSSEEmulTest::Test_mm_castsi128_ps() {
  1461. Test_mm_castXX<__m128i, __m128,
  1462. decltype(_mm_castsi128_ps), _mm_castsi128_ps>();
  1463. }
  1464. void TSSEEmulTest::Test_mm_castps_si128() {
  1465. Test_mm_castXX<__m128, __m128i,
  1466. decltype(_mm_castps_si128), _mm_castps_si128>();
  1467. }
  1468. void TSSEEmulTest::Test_mm_mul_epu32() {
  1469. char data0[16] = {
  1470. '\xAA', '\x00', '\xFF', '\xCC', '\x11', '\x22', '\xBB', '\xAA',
  1471. '\x33', '\x99', '\x44', '\x88', '\x55', '\x77', '\x66', '\x1C'};
  1472. char data1[16] = {
  1473. '\x99', '\x33', '\x1C', '\x55', '\x88', '\x66', '\x77', '\x44',
  1474. '\x00', '\xAA', '\xAA', '\x11', '\xCC', '\xBB', '\x22', '\xFF'};
  1475. ui32* dataw0 = reinterpret_cast<ui32*>(&data0);
  1476. ui32* dataw1 = reinterpret_cast<ui32*>(&data1);
  1477. __m128i value0 = _mm_loadu_si128((__m128i*)&data0);
  1478. __m128i value1 = _mm_loadu_si128((__m128i*)&data1);
  1479. ui64 mul0 = (ui64) dataw0[0] * (ui64) dataw1[0];
  1480. ui64 mul1 = (ui64) dataw0[2] * (ui64) dataw1[2];
  1481. __m128i result = _mm_mul_epu32(value0, value1);
  1482. UNIT_ASSERT_EQUAL(mul0, TQType<uint64x2_t>::As(result)[0]);
  1483. UNIT_ASSERT_EQUAL(mul1, TQType<uint64x2_t>::As(result)[1]);
  1484. }
  1485. void TSSEEmulTest::Test_mm_cmpunord_ps() {
  1486. alignas(16) float valuesBits[4] = {1.f, 2.f, 3.f, 4.f};
  1487. alignas(16) float values2Bits[4] = {5.f, 6.f, 7.f, 8.f};
  1488. alignas(16) char allfs[16] = {
  1489. '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff',
  1490. '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff'
  1491. };
  1492. alignas(16) char allzeroes[16] = {
  1493. '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
  1494. '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'
  1495. };
  1496. const __m128 qnan = _mm_set_ps1(std::numeric_limits<float>::quiet_NaN());
  1497. const __m128 snan = _mm_set_ps1(std::numeric_limits<float>::signaling_NaN());
  1498. const __m128 values = _mm_loadu_ps((const float*) valuesBits);
  1499. const __m128 values2 = _mm_loadu_ps((const float*) values2Bits);
  1500. const __m128 mask1 = _mm_cmpunord_ps(qnan, qnan);
  1501. UNIT_ASSERT_EQUAL(::memcmp(&mask1, &allfs, sizeof(allfs)), 0);
  1502. const __m128 mask2 = _mm_cmpunord_ps(values, values);
  1503. UNIT_ASSERT_EQUAL(::memcmp(&mask2, &allzeroes, sizeof(allzeroes)), 0);
  1504. const __m128 mask3 = _mm_cmpunord_ps(snan, snan);
  1505. UNIT_ASSERT_EQUAL(::memcmp(&mask3, &allfs, sizeof(allfs)), 0);
  1506. const __m128 mask4 = _mm_cmpunord_ps(qnan, values);
  1507. UNIT_ASSERT_EQUAL(::memcmp(&mask4, &allfs, sizeof(allfs)), 0);
  1508. const __m128 mask5 = _mm_cmpunord_ps(snan, values);
  1509. UNIT_ASSERT_EQUAL(::memcmp(&mask5, &allfs, sizeof(allfs)), 0);
  1510. const __m128 mask6 = _mm_cmpunord_ps(qnan, snan);
  1511. UNIT_ASSERT_EQUAL(::memcmp(&mask6, &allfs, sizeof(allfs)), 0);
  1512. const __m128 mask7 = _mm_cmpunord_ps(values, values2);
  1513. UNIT_ASSERT_EQUAL(::memcmp(&mask7, &allzeroes, sizeof(allzeroes)), 0);
  1514. }
  1515. void TSSEEmulTest::Test_mm_store_ss() {
  1516. alignas(16) const float valueBits[4] = {1.f, 2.f, 3.f, 4.f};
  1517. const __m128 value = _mm_loadu_ps(valueBits);
  1518. float res = std::numeric_limits<float>::signaling_NaN();
  1519. _mm_store_ss(&res, value);
  1520. UNIT_ASSERT_EQUAL(res, 1.f);
  1521. }
  1522. void TSSEEmulTest::Test_mm_store_ps() {
  1523. alignas(16) const float valueBits[4] = {1.f, 2.f, 3.f, 4.f};
  1524. const __m128 value = _mm_loadu_ps(valueBits);
  1525. float res[4] = {0.f};
  1526. _mm_storeu_ps(res, value);
  1527. UNIT_ASSERT_EQUAL(res[0], 1.f);
  1528. UNIT_ASSERT_EQUAL(res[1], 2.f);
  1529. UNIT_ASSERT_EQUAL(res[2], 3.f);
  1530. UNIT_ASSERT_EQUAL(res[3], 4.f);
  1531. }
  1532. void TSSEEmulTest::Test_mm_storeu_pd() {
  1533. alignas(16) const double valueBits[4] = {1., 2., 3., 4.};
  1534. for (size_t i = 0; i != 3; ++i) {
  1535. const __m128d value = _mm_loadu_pd(&valueBits[i]);
  1536. alignas(16) double res[4];
  1537. for (size_t shift = 0; shift != 3; ++shift) {
  1538. _mm_storeu_pd(&res[shift], value);
  1539. for (size_t j = 0; j != 2; ++j) {
  1540. UNIT_ASSERT_EQUAL_C(res[j + shift], valueBits[i + j], "res: " << HexEncode(&res[shift], 16) << " vs etalon: " << HexEncode(&valueBits[i], 16));
  1541. }
  1542. }
  1543. }
  1544. }
  1545. void TSSEEmulTest::Test_mm_andnot_ps() {
  1546. alignas(16) const char firstBits[16] = {
  1547. '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff',
  1548. '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff'
  1549. };
  1550. alignas(16) const char secondBits[16] = {
  1551. '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff',
  1552. '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff'
  1553. };
  1554. alignas(16) const char resBits[16] = {
  1555. '\x00', '\xff', '\x00', '\x00', '\x00', '\xff', '\x00', '\x00',
  1556. '\x00', '\xff', '\x00', '\x00', '\x00', '\xff', '\x00', '\x00'
  1557. };
  1558. const __m128 value1 = _mm_loadu_ps((const float*) firstBits);
  1559. const __m128 value2 = _mm_loadu_ps((const float*) secondBits);
  1560. const __m128 res = _mm_andnot_ps(value1, value2);
  1561. UNIT_ASSERT_EQUAL(::memcmp(&res, resBits, sizeof(resBits)), 0);
  1562. }
  1563. void TSSEEmulTest::Test_mm_shuffle_ps() {
  1564. alignas(16) const float first[4] = {1.f, 2.f, 3.f, 4.f};
  1565. alignas(16) const float second[4] = {5.f, 6.f, 7.f, 8.f};
  1566. alignas(16) const float etalon[4] = {3.f, 4.f, 5.f, 6.f};
  1567. const __m128 value1 = _mm_loadu_ps(first);
  1568. const __m128 value2 = _mm_loadu_ps(second);
  1569. const __m128 res = _mm_shuffle_ps(value1, value2, _MM_SHUFFLE(1, 0, 3, 2));
  1570. UNIT_ASSERT_EQUAL(::memcmp(&res, etalon, sizeof(etalon)), 0);
  1571. }
  1572. void TSSEEmulTest::Test_mm_shuffle_pd() {
  1573. const double first[2] = {1.3, 2.3};
  1574. const double second[2] = {5.3, 6.3};
  1575. const double etalon0[2] = {1.3, 5.3};
  1576. const double etalon1[2] = {2.3, 5.3};
  1577. const double etalon2[2] = {1.3, 6.3};
  1578. const double etalon3[2] = {2.3, 6.3};
  1579. const __m128d value1 = _mm_loadu_pd(first);
  1580. const __m128d value2 = _mm_loadu_pd(second);
  1581. __m128d res = _mm_shuffle_pd(value1, value2, 0);
  1582. UNIT_ASSERT_EQUAL(::memcmp(&res, etalon0, sizeof(etalon0)), 0);
  1583. res = _mm_shuffle_pd(value1, value2, 1);
  1584. UNIT_ASSERT_EQUAL(::memcmp(&res, etalon1, sizeof(etalon1)), 0);
  1585. res = _mm_shuffle_pd(value1, value2, 2);
  1586. UNIT_ASSERT_EQUAL(::memcmp(&res, etalon2, sizeof(etalon2)), 0);
  1587. res = _mm_shuffle_pd(value1, value2, 3);
  1588. UNIT_ASSERT_EQUAL(::memcmp(&res, etalon3, sizeof(etalon3)), 0);
  1589. }
  1590. void TSSEEmulTest::Test_mm_cvtsd_f64() {
  1591. const double first[2] = {1.3, 2.3};
  1592. const double second[2] = {5.3, 6.3};
  1593. const __m128d value1 = _mm_loadu_pd(first);
  1594. const __m128d value2 = _mm_loadu_pd(second);
  1595. UNIT_ASSERT_EQUAL(_mm_cvtsd_f64(value1), 1.3);
  1596. UNIT_ASSERT_EQUAL(_mm_cvtsd_f64(value2), 5.3);
  1597. }
  1598. void TSSEEmulTest::Test_mm_loadl_pd() {
  1599. const double first[2] = {1.3, 2.3};
  1600. const double second[2] = {5.3, 6.3};
  1601. const double firstEtalon[2] = {10.13, 2.3};
  1602. const double secondEtalon[2] = {11.13, 6.3};
  1603. double newFirst = 10.13;
  1604. double newSecond = 11.13;
  1605. __m128d value1 = _mm_loadu_pd(first);
  1606. __m128d value2 = _mm_loadu_pd(second);
  1607. value1 = _mm_loadl_pd(value1, &newFirst);
  1608. value2 = _mm_loadl_pd(value2, &newSecond);
  1609. UNIT_ASSERT_EQUAL(::memcmp(&value1, firstEtalon, sizeof(firstEtalon)), 0);
  1610. UNIT_ASSERT_EQUAL(::memcmp(&value2, secondEtalon, sizeof(secondEtalon)), 0);
  1611. }
  1612. void TSSEEmulTest::Test_mm_loadh_pd() {
  1613. const double first[2] = {1.3, 2.3};
  1614. const double second[2] = {5.3, 6.3};
  1615. const double firstEtalon[2] = {1.3, 10.13};
  1616. const double secondEtalon[2] = {5.3, 11.13};
  1617. double newFirst = 10.13;
  1618. double newSecond = 11.13;
  1619. __m128d value1 = _mm_loadu_pd(first);
  1620. __m128d value2 = _mm_loadu_pd(second);
  1621. value1 = _mm_loadh_pd(value1, &newFirst);
  1622. value2 = _mm_loadh_pd(value2, &newSecond);
  1623. UNIT_ASSERT_EQUAL(::memcmp(&value1, firstEtalon, sizeof(firstEtalon)), 0);
  1624. UNIT_ASSERT_EQUAL(::memcmp(&value2, secondEtalon, sizeof(secondEtalon)), 0);
  1625. }
  1626. void TSSEEmulTest::Test_mm_or_ps() {
  1627. alignas(16) const char bytes1[16] = {
  1628. '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff',
  1629. '\x00', '\x00', '\xff', '\xff', '\x00', '\x00', '\xff', '\xff'
  1630. };
  1631. alignas(16) const char bytes2[16] = {
  1632. '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff',
  1633. '\x00', '\xff', '\x00', '\xff', '\x00', '\xff', '\x00', '\xff'
  1634. };
  1635. alignas(16) const char etalon[16] = {
  1636. '\x00', '\xff', '\xff', '\xff', '\x00', '\xff', '\xff', '\xff',
  1637. '\x00', '\xff', '\xff', '\xff', '\x00', '\xff', '\xff', '\xff'
  1638. };
  1639. const __m128 value1 = _mm_loadu_ps((const float*) bytes1);
  1640. const __m128 value2 = _mm_loadu_ps((const float*) bytes2);
  1641. const __m128 res = _mm_or_ps(value1, value2);
  1642. UNIT_ASSERT_EQUAL(::memcmp(&res, etalon, sizeof(etalon)), 0);
  1643. }
  1644. void TSSEEmulTest::Test_mm_loadu_pd() {
  1645. alignas(16) double stub[4] = {
  1646. 0.f, 1.f,
  1647. 2.f, 3.f
  1648. };
  1649. for (size_t shift = 0; shift != 3; ++shift) {
  1650. const __m128d val = _mm_loadu_pd(&stub[shift]);
  1651. alignas(16) double res[2];
  1652. _mm_store_pd(res, val);
  1653. for (size_t i = 0; i != 2; ++i) {
  1654. UNIT_ASSERT_EQUAL_C(res[i], stub[shift + i], "res: " << HexEncode(res, 16) << " vs etalon: " << HexEncode(&stub[shift], 16));
  1655. }
  1656. }
  1657. }
  1658. void TSSEEmulTest::Test_mm_rsqrt_ps() {
  1659. alignas(16) const char bytes[16] = {
  1660. '\x00', '\x00', '\x28', '\x42', // 42.f
  1661. '\x00', '\x98', '\x84', '\x45', // 4243.f
  1662. '\x60', '\x26', '\xcf', '\x48', // 424243.f
  1663. '\xed', '\xd5', '\x21', '\x4c' // 42424243.f
  1664. };
  1665. const __m128 value = _mm_loadu_ps((const float*)bytes);
  1666. const __m128 result = _mm_rsqrt_ps(value);
  1667. alignas(16) float res[4];
  1668. _mm_store_ps(res, result);
  1669. float fResult = 0.f;
  1670. for (size_t i = 0; i < 4; ++i) {
  1671. memcpy(&fResult, &bytes[i * 4], 4);
  1672. fResult = 1.f / std::sqrt(fResult);
  1673. UNIT_ASSERT_DOUBLES_EQUAL_C(res[i], fResult, 1e-3, "res: " << fResult << " vs etalon " << res[i]);
  1674. }
  1675. }
  1676. namespace NHelpers {
  1677. static __m128i Y_FORCE_INLINE GetCmp16(const __m128 &c0, const __m128 &c1, const __m128 &c2, const __m128 &c3, const __m128 test) {
  1678. const __m128i r0 = _mm_castps_si128(_mm_cmpgt_ps(c0, test));
  1679. const __m128i r1 = _mm_castps_si128(_mm_cmpgt_ps(c1, test));
  1680. const __m128i r2 = _mm_castps_si128(_mm_cmpgt_ps(c2, test));
  1681. const __m128i r3 = _mm_castps_si128(_mm_cmpgt_ps(c3, test));
  1682. const __m128i packed = _mm_packs_epi16(_mm_packs_epi32(r0, r1), _mm_packs_epi32(r2, r3));
  1683. return _mm_and_si128(_mm_set1_epi8(0x01), packed);
  1684. }
  1685. static __m128i Y_FORCE_INLINE GetCmp16(const float *factors, const __m128 test) {
  1686. const __m128 *ptr = (__m128 *)factors;
  1687. return GetCmp16(ptr[0], ptr[1], ptr[2], ptr[3], test);
  1688. }
  1689. template<size_t Num>
  1690. void DoLane(size_t length, const float *factors, ui32 *& dst, const float *&values) {
  1691. for (size_t i = 0; i < length; ++i) {
  1692. __m128 value = _mm_set1_ps(values[i]);
  1693. __m128i agg = GetCmp16(factors, value);
  1694. if (Num > 1) {
  1695. agg = _mm_add_epi16(agg, _mm_slli_epi16(GetCmp16(&factors[64], value), 1));
  1696. }
  1697. _mm_store_si128((__m128i *)&dst[4 * i], agg);
  1698. }
  1699. }
  1700. }
  1701. void TSSEEmulTest::Test_matrixnet_powerpc() {
  1702. static constexpr size_t length = 10;
  1703. alignas(16) float factors[1024];
  1704. alignas(16) ui32 valP[4 * length] = { 0 };
  1705. float values[length];
  1706. TReallyFastRng32 rng(42);
  1707. for (size_t i = 0; i < 1024; ++i) {
  1708. factors[i] = rng.GenRandReal2();
  1709. }
  1710. for (size_t i = 0; i < length; ++i) {
  1711. values[i] = rng.GenRandReal2();
  1712. }
  1713. ui32* val = reinterpret_cast<ui32*>(valP);
  1714. const float* vals = reinterpret_cast<const float*>(values);
  1715. NHelpers::DoLane<2>(length, factors, val, vals);
  1716. static const ui32 etalon[4 * length] = {
  1717. 2, 33554432, 258, 33554433, 50529027,
  1718. 50529027, 50529027, 50529027, 50528770,
  1719. 33685763, 33555203, 50462723, 50528770,
  1720. 33685763, 33555203, 50462723, 50529026,
  1721. 33751299, 50529027, 50463491, 2, 33554432,
  1722. 258, 33554433, 50397698, 33685761, 259,
  1723. 50462721, 50332162, 33554689, 259, 50462721,
  1724. 50528770, 33685761, 33555203, 50462723,
  1725. 50529026, 33685763, 50463491, 50463235
  1726. };
  1727. for (size_t i = 0; i < 4 * length; ++i) {
  1728. UNIT_ASSERT_EQUAL(valP[i], etalon[i]);
  1729. }
  1730. }