multibinary.asm 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2. ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
  3. ;
  4. ; Redistribution and use in source and binary forms, with or without
  5. ; modification, are permitted provided that the following conditions
  6. ; are met:
  7. ; * Redistributions of source code must retain the above copyright
  8. ; notice, this list of conditions and the following disclaimer.
  9. ; * Redistributions in binary form must reproduce the above copyright
  10. ; notice, this list of conditions and the following disclaimer in
  11. ; the documentation and/or other materials provided with the
  12. ; distribution.
  13. ; * Neither the name of Intel Corporation nor the names of its
  14. ; contributors may be used to endorse or promote products derived
  15. ; from this software without specific prior written permission.
  16. ;
  17. ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18. ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19. ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20. ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21. ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22. ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23. ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24. ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25. ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26. ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27. ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29. %ifndef _MULTIBINARY_ASM_
  30. %define _MULTIBINARY_ASM_
  ;;;;
  ; Pointer-size dependent aliases used by every macro in this file.
  ;   mbin_def_ptr  data directive that reserves one pointer (dd / dq)
  ;   mbin_ptr_sz   operand-size keyword for a pointer in memory
  ;   mbin_r??      pointer-width name of the matching general register
  ; Only the elf32 output format selects the 32-bit set; every other
  ; output format gets the 64-bit set.
  ;;;;
  %ifnidn __OUTPUT_FORMAT__, elf32
    ; 64-bit aliases
    %define mbin_def_ptr dq
    %define mbin_ptr_sz  qword
    %define mbin_rax     rax
    %define mbin_rbx     rbx
    %define mbin_rcx     rcx
    %define mbin_rdx     rdx
    %define mbin_rsi     rsi
    %define mbin_rdi     rdi
  %else
    ; 32-bit aliases (elf32)
    %define mbin_def_ptr dd
    %define mbin_ptr_sz  dword
    %define mbin_rax     eax
    %define mbin_rbx     ebx
    %define mbin_rcx     ecx
    %define mbin_rdx     edx
    %define mbin_rsi     esi
    %define mbin_rdi     edi
  %endif

  ; Assembler feature level gates which dispatch macros get their full
  ; bodies further down; a value supplied by the build is left untouched.
  %ifndef AS_FEATURE_LEVEL
    %define AS_FEATURE_LEVEL 4
  %endif
  ;;;;
  ; mbin_interface <name>
  ;
  ; Creates the visible entry point of a multibinary function together with
  ; the pointer slot it jumps through:
  ;   <name>_dispatched  one pointer in .data, initialised to <name>_mbinit
  ;   <name>_mbinit      first-call stub: calls <name>_dispatch_init and then
  ;                      falls through into <name>
  ;   <name>             exported symbol; indirect jump via <name>_dispatched
  ;
  ; <name>_dispatch_init is not emitted here. The mbin_dispatch_init* macros
  ; in this file generate it; each one stores the selected implementation
  ; into <name>_dispatched, so the stub only runs until that store happens.
  ; `endbranch` comes from outside this file (presumably a CET/IBT landing
  ; pad macro -- confirm in the including sources).
  ;;;;
  %macro mbin_interface 1
  section .data
  %1_dispatched:
          mbin_def_ptr %1_mbinit                  ; slot starts out at the init stub

  section .text
  ; NOTE(review): NASM documents the ELF symbol-type extension as
  ; `global sym:function`; confirm the `, function` spelling below is what
  ; the targeted assembler expects.
  global %1, function
  %1_mbinit:
          endbranch
          call    %1_dispatch_init                ; first call only: pick the implementation
          ;; no ret -- execution continues into the public entry point
  %1:
          endbranch
          jmp     mbin_ptr_sz [%1_dispatched]     ; tail-jump to whatever the slot holds
  %endmacro
  ;;;;;
  ; mbin_dispatch_init <name>, <sse>, <avx>, <avx2>
  ;
  ; Emits <name>_dispatch_init for functions whose minimum requirement is
  ; SSE/00/01.
  ;   %1 -> function name
  ;   %2 -> SSE/00/01 optimized function, used as the base
  ;   %3 -> AVX or AVX/02 opt func
  ;   %4 -> AVX2 or AVX/04 opt func
  ;
  ; Selection:
  ;   %2  unless CPUID.1:ECX reports both AVX and OSXSAVE
  ;   %3  when it does, upgraded to %4 if CPUID.7:EBX reports AVX2
  ;   %2  again if XGETBV does not show the XMM and YMM state bits
  ; The result is written to <name>_dispatched. Every general register
  ; touched is pushed on entry and popped before return; flags are not
  ; preserved.
  ;;;;;
  %macro mbin_dispatch_init 4
  section .text
  %1_dispatch_init:
          push    mbin_rsi
          push    mbin_rax
          push    mbin_rbx
          push    mbin_rcx
          push    mbin_rdx

          lea     mbin_rsi, [%2 WRT_OPT]          ; begin with the SSE 00/01 version

          ;; CPUID leaf 1: need AVX and OSXSAVE together
          mov     eax, 1
          cpuid
          and     ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
          cmp     ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
          lea     mbin_rbx, [%3 WRT_OPT]          ; AVX (gen2) candidate; lea keeps the cmp flags
          jne     _%1_init_done                   ; no AVX -> stay on the base
          mov     mbin_rsi, mbin_rbx

          ;; CPUID leaf 7, subleaf 0: AVX2?
          mov     eax, 7
          xor     ecx, ecx
          cpuid
          test    ebx, FLAG_CPUID7_EBX_AVX2
          lea     mbin_rbx, [%4 WRT_OPT]          ; AVX2 (gen4) candidate
          cmovnz  mbin_rsi, mbin_rbx

          ;; XGETBV(0): XMM and YMM state must both be flagged, otherwise
          ;; drop back to the base function
          xor     ecx, ecx
          xgetbv
          and     eax, FLAG_XGETBV_EAX_XMM_YMM
          cmp     eax, FLAG_XGETBV_EAX_XMM_YMM
          je      _%1_init_done
          lea     mbin_rsi, [%2 WRT_OPT]

  _%1_init_done:
          pop     mbin_rdx
          pop     mbin_rcx
          pop     mbin_rbx
          pop     mbin_rax
          mov     [%1_dispatched], mbin_rsi       ; publish the choice
          pop     mbin_rsi
          ret
  %endmacro
  ;;;;;
  ; mbin_dispatch_init2 <name>, <base>
  ;
  ; For cases where only a base function is available.
  ;   %1 -> function name
  ;   %2 -> base function
  ;
  ; Emits <name>_dispatch_init, which performs no CPU detection: it simply
  ; stores the address of %2 into <name>_dispatched. The scratch register
  ; is saved and restored around the store.
  ;;;;;
  %macro mbin_dispatch_init2 2
  section .text
  %1_dispatch_init:
          push    mbin_rsi
          lea     mbin_rsi, [%2 WRT_OPT]          ; the one and only candidate
          mov     [%1_dispatched], mbin_rsi
          pop     mbin_rsi
          ret
  %endmacro
  ;;;;;
  ; mbin_dispatch_init_clmul <name>, <base>, <sse_clmul>, <avx>, <avx512>
  ;
  ; Use this case for CRC, which needs both SSE4_1 and CLMUL.
  ;   %1 -> function name
  ;   %2 -> base function
  ;   %3 -> SSE4_1 and CLMUL optimized function
  ;   %4 -> AVX/02 opt func
  ;   %5 -> AVX512/10 opt func (referenced only when AS_FEATURE_LEVEL >= 10)
  ;
  ; Selection, each step requiring all earlier ones:
  ;   %2  default
  ;   %3  CPUID.1:ECX has SSE4_1 and CLMUL
  ;   %4  plus OSXSAVE, XGETBV XMM+YMM bits, and AVX
  ;   %5  plus AVX2, XGETBV ZMM/opmask bits, and the complete AVX512 G1
  ;       (CPUID.7:EBX) and G2 (CPUID.7:ECX) flag sets
  ; The result is written to <name>_dispatched. All general registers used
  ; are saved and restored; flags are not preserved.
  ;;;;;
  %macro mbin_dispatch_init_clmul 5
  section .text
  %1_dispatch_init:
          push    mbin_rsi
          push    mbin_rax
          push    mbin_rbx
          push    mbin_rcx
          push    mbin_rdx
          push    mbin_rdi

          lea     mbin_rsi, [%2 WRT_OPT]          ; default: base function

          mov     eax, 1
          cpuid
          mov     ebx, ecx                        ; keep cpuid1.ecx for the AVX test below
          test    ecx, FLAG_CPUID1_ECX_SSE4_1
          jz      _%1_init_done
          test    ecx, FLAG_CPUID1_ECX_CLMUL
          jz      _%1_init_done
          lea     mbin_rsi, [%3 WRT_OPT]          ; SSE4_1 + CLMUL present: 00/01 opt

          ;; XMM/YMM state support and AVX
          test    ecx, FLAG_CPUID1_ECX_OSXSAVE
          jz      _%1_init_done
          xor     ecx, ecx
          xgetbv                                  ; xcr -> edx:eax
          mov     edi, eax                        ; keep xgetbv.eax for the ZMM test
          and     eax, FLAG_XGETBV_EAX_XMM_YMM
          cmp     eax, FLAG_XGETBV_EAX_XMM_YMM
          jne     _%1_init_done
          test    ebx, FLAG_CPUID1_ECX_AVX
          jz      _%1_init_done
          lea     mbin_rsi, [%4 WRT_OPT]          ; AVX/02 opt

  %if AS_FEATURE_LEVEL >= 10
          ;; AVX2 is a prerequisite for the AVX512 version
          mov     eax, 7
          xor     ecx, ecx
          cpuid
          test    ebx, FLAG_CPUID7_EBX_AVX2
          jz      _%1_init_done                   ; no AVX2 possible

          ;; AVX512: ZMM/opmask state, then both feature groups
          and     edi, FLAG_XGETBV_EAX_ZMM_OPM
          cmp     edi, FLAG_XGETBV_EAX_ZMM_OPM
          jne     _%1_init_done                   ; no AVX512 possible
          and     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          cmp     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          jne     _%1_init_done
          and     ecx, FLAGS_CPUID7_ECX_AVX512_G2
          cmp     ecx, FLAGS_CPUID7_ECX_AVX512_G2
          lea     mbin_rbx, [%5 WRT_OPT]          ; AVX512/10 opt
          cmove   mbin_rsi, mbin_rbx
  %endif

  _%1_init_done:
          pop     mbin_rdi
          pop     mbin_rdx
          pop     mbin_rcx
          pop     mbin_rbx
          pop     mbin_rax
          mov     [%1_dispatched], mbin_rsi       ; publish the choice
          pop     mbin_rsi
          ret
  %endmacro
  ;;;;;
  ; mbin_dispatch_init5 <name>, <base>, <sse>, <avx>, <avx2>
  ;   %1 -> function name
  ;   %2 -> base function
  ;   %3 -> SSE4_2 or 00/01 optimized function
  ;   %4 -> AVX/02 opt func
  ;   %5 -> AVX2/04 opt func
  ;
  ; Selection:
  ;   %2  default
  ;   %3  CPUID.1:ECX reports SSE4_2
  ;   %4  CPUID.1:ECX reports both AVX and OSXSAVE
  ;   %5  additionally CPUID.7:EBX reports AVX2
  ;   %3  if the AVX path was taken but XGETBV lacks the XMM+YMM bits
  ; The result is written to <name>_dispatched. All general registers used
  ; are saved and restored; flags are not preserved.
  ;;;;;
  %macro mbin_dispatch_init5 5
  section .text
  %1_dispatch_init:
          push    mbin_rsi
          push    mbin_rax
          push    mbin_rbx
          push    mbin_rcx
          push    mbin_rdx

          lea     mbin_rsi, [%2 WRT_OPT]          ; default: base function

          mov     eax, 1
          cpuid
          ;; SSE4.2
          test    ecx, FLAG_CPUID1_ECX_SSE4_2
          lea     mbin_rbx, [%3 WRT_OPT]          ; SSE opt func
          cmovnz  mbin_rsi, mbin_rbx

          ;; AVX together with OSXSAVE
          and     ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
          cmp     ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
          lea     mbin_rbx, [%4 WRT_OPT]          ; AVX (gen2) candidate; lea keeps the cmp flags
          jne     _%1_init_done                   ; AVX not available, finished
          mov     mbin_rsi, mbin_rbx

          ;; AVX2
          mov     eax, 7
          xor     ecx, ecx
          cpuid
          test    ebx, FLAG_CPUID7_EBX_AVX2
          lea     mbin_rbx, [%5 WRT_OPT]          ; AVX2 (gen4) candidate
          cmovnz  mbin_rsi, mbin_rbx

          ;; XMM and YMM state support; without it go back to %3
          xor     ecx, ecx
          xgetbv
          and     eax, FLAG_XGETBV_EAX_XMM_YMM
          cmp     eax, FLAG_XGETBV_EAX_XMM_YMM
          je      _%1_init_done
          lea     mbin_rsi, [%3 WRT_OPT]

  _%1_init_done:
          pop     mbin_rdx
          pop     mbin_rcx
          pop     mbin_rbx
          pop     mbin_rax
          mov     [%1_dispatched], mbin_rsi       ; publish the choice
          pop     mbin_rsi
          ret
  %endmacro
  258. %if AS_FEATURE_LEVEL >= 6
  ;;;;;
  ; mbin_dispatch_init6 <name>, <base>, <sse>, <avx>, <avx2>, <avx512>
  ;   %1 -> function name
  ;   %2 -> base function
  ;   %3 -> SSE4_2 or 00/01 optimized function
  ;   %4 -> AVX/02 opt func
  ;   %5 -> AVX2/04 opt func
  ;   %6 -> AVX512/06 opt func
  ;
  ; Selection, each step requiring all earlier ones:
  ;   %2  default
  ;   %3  CPUID.1:ECX reports SSE4_2
  ;   %4  plus OSXSAVE, XGETBV XMM+YMM bits, and AVX
  ;   %5  plus CPUID.7:EBX AVX2
  ;   %6  plus XGETBV ZMM/opmask bits and the complete AVX512 G1 flag set
  ; The result is written to <name>_dispatched. All general registers used
  ; are saved and restored; flags are not preserved.
  ;;;;;
  %macro mbin_dispatch_init6 6
  section .text
  %1_dispatch_init:
          push    mbin_rsi
          push    mbin_rax
          push    mbin_rbx
          push    mbin_rcx
          push    mbin_rdx
          push    mbin_rdi

          lea     mbin_rsi, [%2 WRT_OPT]          ; default: base function

          mov     eax, 1
          cpuid
          mov     ebx, ecx                        ; keep cpuid1.ecx for the AVX test below
          test    ecx, FLAG_CPUID1_ECX_SSE4_2
          jz      _%1_init_done                   ; no SSE4_2: base function
          lea     mbin_rsi, [%3 WRT_OPT]          ; SSE possible: 00/01 opt

          ;; XMM/YMM state support and AVX
          test    ecx, FLAG_CPUID1_ECX_OSXSAVE
          jz      _%1_init_done
          xor     ecx, ecx
          xgetbv                                  ; xcr -> edx:eax
          mov     edi, eax                        ; keep xgetbv.eax for the ZMM test
          and     eax, FLAG_XGETBV_EAX_XMM_YMM
          cmp     eax, FLAG_XGETBV_EAX_XMM_YMM
          jne     _%1_init_done
          test    ebx, FLAG_CPUID1_ECX_AVX
          jz      _%1_init_done
          lea     mbin_rsi, [%4 WRT_OPT]          ; AVX/02 opt

          ;; AVX2
          mov     eax, 7
          xor     ecx, ecx
          cpuid
          test    ebx, FLAG_CPUID7_EBX_AVX2
          jz      _%1_init_done                   ; no AVX2 possible
          lea     mbin_rsi, [%5 WRT_OPT]          ; AVX2/04 opt func

          ;; AVX512
          and     edi, FLAG_XGETBV_EAX_ZMM_OPM
          cmp     edi, FLAG_XGETBV_EAX_ZMM_OPM
          jne     _%1_init_done                   ; no AVX512 possible
          and     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          cmp     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          lea     mbin_rbx, [%6 WRT_OPT]          ; AVX512/06 opt
          cmove   mbin_rsi, mbin_rbx

  _%1_init_done:
          pop     mbin_rdi
          pop     mbin_rdx
          pop     mbin_rcx
          pop     mbin_rbx
          pop     mbin_rax
          mov     [%1_dispatched], mbin_rsi       ; publish the choice
          pop     mbin_rsi
          ret
  %endmacro
  321. %else
  ; Variant for AS_FEATURE_LEVEL below 6: same six-argument interface, but
  ; the AVX512/06 argument (%6) is ignored and dispatch is handled by
  ; mbin_dispatch_init5.
  %macro mbin_dispatch_init6 6
          mbin_dispatch_init5 %1, %2, %3, %4, %5
  %endmacro
  325. %endif
  326. %if AS_FEATURE_LEVEL >= 10
  ;;;;;
  ; mbin_dispatch_init7 <name>, <base>, <sse>, <avx>, <avx2>, <avx512>,
  ;                     <avx512_update>
  ;   %1 -> function name
  ;   %2 -> base function
  ;   %3 -> SSE4_2 or 00/01 optimized function
  ;   %4 -> AVX/02 opt func
  ;   %5 -> AVX2/04 opt func
  ;   %6 -> AVX512/06 opt func
  ;   %7 -> AVX512 Update/10 opt func
  ;
  ; Selection, each step requiring all earlier ones:
  ;   %2  default
  ;   %3  CPUID.1:ECX reports SSE4_2
  ;   %4  plus OSXSAVE, XGETBV XMM+YMM bits, and AVX
  ;   %5  plus CPUID.7:EBX AVX2
  ;   %6  plus XGETBV ZMM/opmask bits and the complete AVX512 G1 flag set
  ;   %7  plus the complete AVX512 G2 flag set (CPUID.7:ECX)
  ; %7 is only ever chosen after the G1 check has passed, so the G2
  ; implementation cannot be selected on a configuration that lacks the
  ; G1 features.
  ; The result is written to <name>_dispatched. All general registers used
  ; are saved and restored; flags are not preserved.
  ;;;;;
  %macro mbin_dispatch_init7 7
  section .text
  %1_dispatch_init:
          push    mbin_rsi
          push    mbin_rax
          push    mbin_rbx
          push    mbin_rcx
          push    mbin_rdx
          push    mbin_rdi

          lea     mbin_rsi, [%2 WRT_OPT]          ; default: base function

          mov     eax, 1
          cpuid
          mov     ebx, ecx                        ; keep cpuid1.ecx for the AVX test below
          test    ecx, FLAG_CPUID1_ECX_SSE4_2
          jz      _%1_init_done                   ; no SSE4_2: base function
          lea     mbin_rsi, [%3 WRT_OPT]          ; SSE possible: 00/01 opt

          ;; XMM/YMM state support and AVX
          test    ecx, FLAG_CPUID1_ECX_OSXSAVE
          jz      _%1_init_done
          xor     ecx, ecx
          xgetbv                                  ; xcr -> edx:eax
          mov     edi, eax                        ; keep xgetbv.eax for the ZMM test
          and     eax, FLAG_XGETBV_EAX_XMM_YMM
          cmp     eax, FLAG_XGETBV_EAX_XMM_YMM
          jne     _%1_init_done
          test    ebx, FLAG_CPUID1_ECX_AVX
          jz      _%1_init_done
          lea     mbin_rsi, [%4 WRT_OPT]          ; AVX/02 opt

          ;; AVX2
          mov     eax, 7
          xor     ecx, ecx
          cpuid
          test    ebx, FLAG_CPUID7_EBX_AVX2
          jz      _%1_init_done                   ; no AVX2 possible
          lea     mbin_rsi, [%5 WRT_OPT]          ; AVX2/04 opt func

          ;; AVX512: ZMM/opmask state first
          and     edi, FLAG_XGETBV_EAX_ZMM_OPM
          cmp     edi, FLAG_XGETBV_EAX_ZMM_OPM
          jne     _%1_init_done                   ; no AVX512 possible

          ;; AVX512 G1 is mandatory for both AVX512 variants
          and     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          cmp     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          jne     _%1_init_done                   ; G1 incomplete: stay on AVX2/04
          lea     mbin_rsi, [%6 WRT_OPT]          ; AVX512/06 opt

          ;; AVX512 G2 on top of G1 (ecx still holds cpuid7.ecx)
          and     ecx, FLAGS_CPUID7_ECX_AVX512_G2
          cmp     ecx, FLAGS_CPUID7_ECX_AVX512_G2
          lea     mbin_rbx, [%7 WRT_OPT]          ; AVX512 Update/10 opt
          cmove   mbin_rsi, mbin_rbx

  _%1_init_done:
          pop     mbin_rdi
          pop     mbin_rdx
          pop     mbin_rcx
          pop     mbin_rbx
          pop     mbin_rax
          mov     [%1_dispatched], mbin_rsi       ; publish the choice
          pop     mbin_rsi
          ret
  %endmacro
  ;;;;;
  ; mbin_dispatch_init8 <name>, <base>, <sse>, <avx>, <avx2>, <avx512>,
  ;                     <avx2_update>, <avx512_update>
  ;   %1 -> function name
  ;   %2 -> base function
  ;   %3 -> SSE4_2 or 00/01 optimized function
  ;   %4 -> AVX/02 opt func
  ;   %5 -> AVX2/04 opt func
  ;   %6 -> AVX512/06 opt func
  ;   %7 -> AVX2 Update/07 opt func
  ;   %8 -> AVX512 Update/10 opt func
  ;
  ; Selection:
  ;   %2  default
  ;   %3  CPUID.1:ECX reports SSE4_2
  ;   %4  plus OSXSAVE, XGETBV XMM+YMM bits, and AVX
  ;   %5  plus CPUID.7:EBX AVX2
  ;   AVX512 usable (XGETBV ZMM/opmask bits and complete AVX512 G1 set):
  ;     %6, upgraded to %8 when the AVX512 G2 set (CPUID.7:ECX) is complete
  ;   AVX512 not usable:
  ;     %7 when the AVX2 G2 set (CPUID.7:ECX) is complete, otherwise %5
  ; %8 is only ever chosen after the G1 check has passed; a failed G1
  ; check is treated like missing ZMM state and continues with the AVX2
  ; Gen 2 test.
  ; The result is written to <name>_dispatched. All general registers used
  ; are saved and restored; flags are not preserved.
  ;;;;;
  %macro mbin_dispatch_init8 8
  section .text
  %1_dispatch_init:
          push    mbin_rsi
          push    mbin_rax
          push    mbin_rbx
          push    mbin_rcx
          push    mbin_rdx
          push    mbin_rdi

          lea     mbin_rsi, [%2 WRT_OPT]          ; default: base function

          mov     eax, 1
          cpuid
          mov     ebx, ecx                        ; keep cpuid1.ecx for the AVX test below
          test    ecx, FLAG_CPUID1_ECX_SSE4_2
          jz      _%1_init_done                   ; no SSE4_2: base function
          lea     mbin_rsi, [%3 WRT_OPT]          ; SSE possible: 00/01 opt

          ;; XMM/YMM state support and AVX
          test    ecx, FLAG_CPUID1_ECX_OSXSAVE
          jz      _%1_init_done
          xor     ecx, ecx
          xgetbv                                  ; xcr -> edx:eax
          mov     edi, eax                        ; keep xgetbv.eax for the ZMM test
          and     eax, FLAG_XGETBV_EAX_XMM_YMM
          cmp     eax, FLAG_XGETBV_EAX_XMM_YMM
          jne     _%1_init_done
          test    ebx, FLAG_CPUID1_ECX_AVX
          jz      _%1_init_done
          lea     mbin_rsi, [%4 WRT_OPT]          ; AVX/02 opt

          ;; AVX2
          mov     eax, 7
          xor     ecx, ecx
          cpuid
          test    ebx, FLAG_CPUID7_EBX_AVX2
          jz      _%1_init_done                   ; no AVX2 possible
          lea     mbin_rsi, [%5 WRT_OPT]          ; AVX2/04 opt func

          ;; AVX512: ZMM/opmask state first
          and     edi, FLAG_XGETBV_EAX_ZMM_OPM
          cmp     edi, FLAG_XGETBV_EAX_ZMM_OPM
          jne     _%1_check_avx2_g2               ; no AVX512 possible

          ;; AVX512 G1 is mandatory for both AVX512 variants; ecx is still
          ;; the untouched cpuid7.ecx if we branch away here
          and     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          cmp     ebx, FLAGS_CPUID7_EBX_AVX512_G1
          jne     _%1_check_avx2_g2               ; G1 incomplete: try AVX2 Gen 2
          lea     mbin_rsi, [%6 WRT_OPT]          ; AVX512/06 opt

          ;; AVX512 G2 on top of G1
          and     ecx, FLAGS_CPUID7_ECX_AVX512_G2
          cmp     ecx, FLAGS_CPUID7_ECX_AVX512_G2
          lea     mbin_rbx, [%8 WRT_OPT]          ; AVX512/10 opt
          cmove   mbin_rsi, mbin_rbx
          jmp     _%1_init_done

  _%1_check_avx2_g2:
          ;; AVX2 Gen 2
          and     ecx, FLAGS_CPUID7_ECX_AVX2_G2
          cmp     ecx, FLAGS_CPUID7_ECX_AVX2_G2
          lea     mbin_rbx, [%7 WRT_OPT]          ; AVX2/07 opt
          cmove   mbin_rsi, mbin_rbx

  _%1_init_done:
          pop     mbin_rdi
          pop     mbin_rdx
          pop     mbin_rcx
          pop     mbin_rbx
          pop     mbin_rax
          mov     [%1_dispatched], mbin_rsi       ; publish the choice
          pop     mbin_rsi
          ret
  %endmacro
  469. %else
  ; Variant for AS_FEATURE_LEVEL below 10: same seven-argument interface,
  ; but the AVX512 Update/10 argument (%7) is ignored and dispatch is
  ; handled by mbin_dispatch_init6.
  %macro mbin_dispatch_init7 7
          mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
  %endmacro
  ; Variant for AS_FEATURE_LEVEL below 10: same eight-argument interface,
  ; but the AVX2 Update/07 (%7) and AVX512 Update/10 (%8) arguments are
  ; ignored and dispatch is handled by mbin_dispatch_init6.
  %macro mbin_dispatch_init8 8
          mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
  %endmacro
  476. %endif
  477. %endif ; ifndef _MULTIBINARY_ASM_