jsimdext.inc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  ;
  ; jsimdext.inc - common declarations
  ;
  ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  ; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
  ; Copyright (C) 2018, Matthieu Darbois.
  ; Copyright (C) 2018, Matthias Räncker.
  ;
  ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
  ;
  ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  ;
  ; This software is provided 'as-is', without any express or implied
  ; warranty.  In no event will the authors be held liable for any damages
  ; arising from the use of this software.
  ;
  ; Permission is granted to anyone to use this software for any purpose,
  ; including commercial applications, and to alter it and redistribute it
  ; freely, subject to the following restrictions:
  ;
  ; 1. The origin of this software must not be misrepresented; you must not
  ;    claim that you wrote the original software. If you use this software
  ;    in a product, an acknowledgment in the product documentation would be
  ;    appreciated but is not required.
  ; 2. Altered source versions must be plainly marked as such, and must not be
  ;    misrepresented as being the original software.
  ; 3. This notice may not be removed or altered from any source distribution.
  ; ==========================================================================
  ;  System-dependent configurations
  ;
  ;  The first of WIN32 / WIN64 / OBJ32 / UNIX / AOUT / MACHO that is defined
  ;  (normally with -D on the assembler command line) selects a branch below;
  ;  when none of them is defined, the generic fallback at the end is used.
  ;
  ;  Every branch defines SEG_TEXT and SEG_CONST: the section name plus its
  ;  attributes, for code and for constant data respectively.
  ;  Depending on the object format, a branch may also define
  ;    EXTN(name)  - spelling of an external symbol without a leading '_'
  ;    GOT_SYMBOL  - base symbol consumed by the PIC support macros
  ;    PIC         - forced on for Mach-O only
  ;

  %ifdef WIN32                    ; ---- nasm -fwin32 -DWIN32 ...
    ; * Microsoft Visual C++
    ; * MinGW (Minimalist GNU for Windows)
    ; * CygWin
    ; * LCC-Win32

    ; -- segment definition --
    %ifdef __YASM_VER__
      %define SEG_TEXT   .text  align=32
      %define SEG_CONST  .rdata align=32
    %else
      %define SEG_TEXT   .text  align=32 public use32 class=CODE
      %define SEG_CONST  .rdata align=32 public use32 class=CONST
    %endif

  %elifdef WIN64                  ; ---- nasm -fwin64 -DWIN64 ...
    ; * Microsoft Visual C++

    ; -- segment definition --
    %ifdef __YASM_VER__
      %define SEG_TEXT   .text  align=32
      %define SEG_CONST  .rdata align=32
    %else
      %define SEG_TEXT   .text  align=32 public use64 class=CODE
      %define SEG_CONST  .rdata align=32 public use64 class=CONST
    %endif
    %define EXTN(name)  name                    ; foo() -> foo

  %elifdef OBJ32                  ; ---- nasm -fobj -DOBJ32 ...
    ; * Borland C++ (Win32)

    ; -- segment definition --
    %define SEG_TEXT   _text align=32 public use32 class=CODE
    %define SEG_CONST  _data align=32 public use32 class=DATA

  %elifdef UNIX                   ; ---- nasm -felf[64] -DUNIX ...
    ; * Linux
    ; * *BSD family Unix using elf format
    ; * Unix System V, including Solaris x86, UnixWare and SCO Unix

    ; Mark the stack as non-executable.
    section .note.GNU-stack noalloc noexec nowrite progbits

    ; -- segment definition --
    %ifdef _x86_64_
      %define SEG_TEXT   .text   progbits align=32
      %define SEG_CONST  .rodata progbits align=32
    %else
      %define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
      %define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
    %endif

    ; To make the code position-independent, append -DPIC to the commandline.
    %define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_   ; ELF supports PIC
    %define EXTN(name)  name                    ; foo() -> foo

  %elifdef AOUT                   ; ---- nasm -faoutb/aout -DAOUT ...
    ; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
    ; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

    ; -- segment definition --
    %define SEG_TEXT   .text
    %define SEG_CONST  .data

    ; To make the code position-independent, append -DPIC to the commandline.
    %define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

  %elifdef MACHO                  ; ---- nasm -fmacho -DMACHO ...
    ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

    ; -- segment definition --
    %define SEG_TEXT   .text  ;align=32 ; nasm doesn't accept align=32. why?
    %define SEG_CONST  .rodata align=32

    ; The generation of position-independent code (PIC) is the default on
    ; Darwin, so PIC is switched on unconditionally here.
    %define PIC
    %define GOT_SYMBOL  _MACHO_PIC_     ; Mach-O style code-relative addressing

  %else                           ; ---- Other case
    ; -- segment definition --
    %define SEG_TEXT   .text
    %define SEG_CONST  .data

  %endif                          ; ------------------------------------------

  ; ==========================================================================
  ; --------------------------------------------------------------------------
  ;  Common types
  ;
  ;  Everything here is a single-line %define, which NASM expands where the
  ;  macro is used rather than where it is defined, so one macro may refer to
  ;  another regardless of which of the two appears first.
  ;

  ; -- sizes of the basic operand types, in bytes --
  %define SIZEOF_BYTE     1               ; sizeof(byte)
  %define SIZEOF_WORD     2               ; sizeof(word)
  %define SIZEOF_DWORD    4               ; sizeof(dword)
  %define SIZEOF_QWORD    8               ; sizeof(qword)
  %define SIZEOF_OWORD    16              ; sizeof(oword)
  %define SIZEOF_YWORD    32              ; sizeof(yword)

  ; -- ... and in bits --
  %define BYTE_BIT        8               ; CHAR_BIT in C
  %define WORD_BIT        16              ; sizeof(word)*BYTE_BIT
  %define DWORD_BIT       32              ; sizeof(dword)*BYTE_BIT
  %define QWORD_BIT       64              ; sizeof(qword)*BYTE_BIT
  %define OWORD_BIT       128             ; sizeof(oword)*BYTE_BIT
  %define YWORD_BIT       256             ; sizeof(yword)*BYTE_BIT

  ; -- pointer type and pointer-width register names --
  ;
  ; POINTER / SIZEOF_POINTER / POINTER_BIT describe a pointer operand,
  ; resp / dp reserve / emit one pointer-sized item, and the r??p names map
  ; to the register of pointer width (rax or eax, ...).
  ;
  ; The 64-bit set is chosen when _x86_64_ is defined and the output format
  ; is not elfx32.
  ; NOTE(review): this relies on the build defining _x86_64_ with single
  ; underscores - confirm against the build scripts.
  %ifdef _x86_64_
    %ifnidn __OUTPUT_FORMAT__, elfx32
      %define POINTER         qword           ; general pointer type
      %define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
      %define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
      %define resp            resq
      %define dp              dq
      %define raxp            rax
      %define rbxp            rbx
      %define rcxp            rcx
      %define rdxp            rdx
      %define rsip            rsi
      %define rdip            rdi
      %define rbpp            rbp
      %define rspp            rsp
      %define r8p             r8
      %define r9p             r9
      %define r10p            r10
      %define r11p            r11
      %define r12p            r12
      %define r13p            r13
      %define r14p            r14
      %define r15p            r15
    %endif
  %endif

  ; 32-bit pointers: used whenever the block above did not define raxp,
  ; i.e. when _x86_64_ is undefined or the output format is elfx32.
  %ifndef raxp
    %define POINTER         dword           ; general pointer type
    %define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
    %define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
    %define resp            resd
    %define dp              dd
    ; x86_64 ILP32 ABI (x32)
    %define raxp            eax
    %define rbxp            ebx
    %define rcxp            ecx
    %define rdxp            edx
    %define rsip            esi
    %define rdip            edi
    %define rbpp            ebp
    %define rspp            esp
    %define r8p             r8d
    %define r9p             r9d
    %define r10p            r10d
    %define r11p            r11d
    %define r12p            r12d
    %define r13p            r13d
    %define r14p            r14d
    %define r15p            r15d
  %endif

  ; -- fixed-width data types --
  %define INT             dword           ; signed integer type
  %define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
  %define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

  %define FP32            dword           ; IEEE754 single
  %define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
  %define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

  %define MMWORD          qword           ; int64  (MMX register)
  %define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
  %define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

  ; NASM is buggy and doesn't properly handle operand sizes for SSE
  ; instructions, so for now we have to define XMMWORD as blank.
  ; (YMMWORD is defined blank in the same way.)
  %define XMMWORD                         ; int128 (SSE register)
  %define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
  %define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

  %define YMMWORD                         ; int256 (AVX register)
  %define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
  %define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT

  ; Similar hacks for when we load a dword or MMWORD into an xmm# register
  %define XMM_DWORD
  %define XMM_MMWORD
  ; --------------------------------------------------------------------------
  ;  External Symbol Name
  ;
  ;  EXTN(name) gives the spelling of an external symbol.  Object formats
  ;  that use the plain name define EXTN themselves in the system-dependent
  ;  configuration; every other target gets the default below, which pastes
  ;  a leading underscore onto the name with %+.
  ;
  %ifndef EXTN
    %define EXTN(name)  _ %+ name         ; foo() -> _foo
  %endif
  ; --------------------------------------------------------------------------
  ;  Hidden symbols
  ;
  ;  GLOBAL_FUNCTION(name) and GLOBAL_DATA(name) export EXTN(name).  Where
  ;  one of the branches below applies, the export carries a visibility
  ;  qualifier ("hidden" for ELF, "private_extern" for Mach-O); in every
  ;  other case the fallbacks at the end emit a plain "global".
  ;
  %ifdef UNIX                     ; ---- nasm -felf[64] -DUNIX ...
    %define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
    %define GLOBAL_DATA(name)      global EXTN(name):data hidden
  %elifdef MACHO                  ; ---- nasm -fmacho -DMACHO ...
    %ifdef __YASM_VER__
      %define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
      %define GLOBAL_DATA(name)      global EXTN(name):private_extern
    %else
      ; With NASM the qualifier is only used from version ID 0x020E0000 on;
      ; older assemblers drop through to the plain fallbacks.
      %if __NASM_VERSION_ID__ >= 0x020E0000
        %define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
        %define GLOBAL_DATA(name)      global EXTN(name):private_extern
      %endif
    %endif
  %endif

  ; Fallbacks: plain export without a visibility qualifier.
  %ifndef GLOBAL_FUNCTION
    %define GLOBAL_FUNCTION(name)  global EXTN(name)
  %endif
  %ifndef GLOBAL_DATA
    %define GLOBAL_DATA(name)      global EXTN(name)
  %endif
  ; --------------------------------------------------------------------------
  ;  Macros for position-independent code (PIC) support
  ;
  ;  Interface (always defined; without PIC the four macros expand to
  ;  nothing and GOTOFF() is just the symbol):
  ;    GOTOFF(got, sym)   address expression for sym relative to base reg got
  ;    get_GOT  reg       load the PIC base address into reg
  ;    pushpic  reg       push reg
  ;    poppic   reg       pop  reg
  ;    movpic   dst, src  mov  dst, src
  ;
  ;  PIC is dropped unless the selected object format defined GOT_SYMBOL.
  ;  The code below works on esp/ecx/ebp, i.e. it is written for 32-bit code.
  ;
  %ifndef GOT_SYMBOL
  %undef PIC
  %endif

  %ifdef PIC ; -------------------------------------------

  %ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------

  ; At present, nasm doesn't seem to support PIC generation for Mach-O.
  ; The PIC support code below is a little tricky.
  ;
  ; Addresses are formed relative to const_base, a label that is planted in
  ; SEG_CONST right here.  Note that this leaves SEG_CONST as the current
  ; section for whatever follows the inclusion of this file.

      SECTION     SEG_CONST
  const_base:

  %define GOTOFF(got, sym)  (got) + (sym) - const_base

  ; get_GOT reg  (Mach-O flavour)
  ;   Leaves the run-time address of const_base in reg.
  ;   Clobbers ecx (also when reg is another register) and the flags;
  ;   ebp is saved and restored.
  ;
  ;   How it works:
  ;     1. call/ret thunk: ecx = return address = address of the "add".
  ;     2. the "add" advances ecx to the address of the local label "ref".
  ;     3. the two db bytes are the opcode and ModRM of an lea whose SIB
  ;        byte and disp32 are supplied by the encoding of the following
  ;        "jmp near const_base" (E9 + rel32, relative to "ref").  Taken
  ;        together they decode as
  ;            lea reg, [ecx + ebp*8 + (const_base - ref)]
  ;        and since ebp was zeroed, reg = const_base.  The jmp is never
  ;        executed as a jump.
  %imacro get_GOT 1
      ; NOTE: this macro destroys ecx register.
      call        %%geteip
      add         ecx, byte (%%ref - $)
      jmp         short %%adjust
  %%geteip:
      mov         ecx, POINTER [esp]
      ret
  %%adjust:
      push        ebp
      xor         ebp, ebp                ; ebp = 0
  %ifidni %1, ebx  ; (%1 == ebx)
      ; db 0x8D,0x9C + jmp near const_base =
      ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
      db          0x8D, 0x9C              ; 8D,9C
      jmp         near const_base         ; E9,(const_base-%%ref)
  %%ref:
  %else  ; (%1 != ebx)
      ; db 0x8D,0x8C + jmp near const_base =
      ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
      db          0x8D, 0x8C              ; 8D,8C
      jmp         near const_base         ; E9,(const_base-%%ref)
  %%ref:
      mov         %1, ecx
  %endif ; (%1 == ebx)
      pop         ebp
  %endmacro

  %else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------

  %define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff

  ; get_GOT reg  (GOT flavour)
  ;   The call/ret thunk loads reg with the address of the "add", which then
  ;   adds the "wrt ..gotpc" displacement of GOT_SYMBOL to it.
  ;   Clobbers the flags.
  %imacro get_GOT 1
      extern      GOT_SYMBOL
      call        %%geteip
      add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
      jmp         short %%done
  %%geteip:
      mov         %1, POINTER [esp]
      ret
  %%done:
  %endmacro

  %endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------

  ; Under PIC these three are plain push / pop / mov; they exist so that
  ; the same source line disappears completely in a non-PIC build.
  %imacro pushpic 1.nolist
      push        %1
  %endmacro
  %imacro poppic 1.nolist
      pop         %1
  %endmacro
  %imacro movpic 2.nolist
      mov         %1, %2
  %endmacro

  %else     ; !PIC -----------------------------------------

  %define GOTOFF(got, sym)  (sym)

  ; Non-PIC build: all four macros expand to nothing.
  %imacro get_GOT 1.nolist
  %endmacro
  %imacro pushpic 1.nolist
  %endmacro
  %imacro poppic 1.nolist
  %endmacro
  %imacro movpic 2.nolist
  %endmacro

  %endif    ; PIC -----------------------------------------
  ; --------------------------------------------------------------------------
  ;  Align the next instruction on {2,4,8,16,..}-byte boundary.
  ;  ".balign n,,m" in GNU as
  ;
  ;  alignx n [, m]
  ;    n  alignment boundary; FILLB masks with n-1, so n must be a power
  ;       of two
  ;    m  largest pad, in bytes, that may be emitted (default 0xFFFF);
  ;       if more than m bytes would be needed, nothing is emitted
  ;
  ;  Helpers:
  ;    MSKLE(x, y)  a mask with all low bits set when x <= y, 0 otherwise
  ;                 (both operands are reduced to 16 bits first)
  ;    FILLB(b, n)  number of bytes from position b up to the next n-byte
  ;                 boundary, counted from the section start ($$)
  ;
  ;  Padding strategy - each "times" count is evaluated at the then-current
  ;  position ($), so every line only sees what is still left to fill:
  ;    - 16 or more bytes left: fill all of them with one-byte nops
  ;    - 9..15 bytes left:      one 7-byte lea
  ;    - then 7-, 6-, 4-, 3-, 2- and 1-byte fillers, each emitted only
  ;      if it still fits
  ;
  ;  NOTE(review): the lea/mov fillers are 32-bit-operand encodings.  In
  ;  64-bit mode they would write a 32-bit register and so clear the upper
  ;  half of rbx/rbp if the padding is ever executed - confirm that alignx is
  ;  only used in 32-bit code or where the padding is not executed.
  ;
  %define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
  %define FILLB(b, n)  (($$-(b)) & ((n)-1))

  %imacro alignx 1-2.nolist 0xFFFF
  %%bs: \
    times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
          db 0x90                                      ; nop
    times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
          db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
    times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
          db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
    times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
          db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
    times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
          db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
    times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
          db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
    times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
          db 0x8B, 0xED                                ; mov ebp,ebp
    times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
          db 0x90                                      ; nop
  %endmacro
  ; Align the next data on {2,4,8,16,..}-byte boundary.
  ;
  ; alignz n
  ;   Wrapper around the ALIGN directive that pads with zero bytes.
  ;
  %imacro alignz 1.nolist
      align       %1, db 0                ; filling zeros
  %endmacro
  %ifdef _x86_64_

  %ifdef WIN64

  ; collect_args n  (Win64)
  ;   n = number of arguments to collect; thresholds up to 6 are handled.
  ;
  ;   Copies the incoming arguments to r10..r15:
  ;     arg1 rcx -> r10      arg4 r9       -> r13
  ;     arg2 rdx -> r11      arg5 [rax+48] -> r14
  ;     arg3 r8  -> r12      arg6 [rax+56] -> r15
  ;   Before doing so it saves xmm6 and xmm7 (one 16-byte stack slot each)
  ;   and pushes those of r12..r15 that it is about to overwrite; rsi and
  ;   rdi are pushed last.  uncollect_args with the same n undoes all of it.
  ;
  ;   NOTE(review): rax is not set here.  [rax+48] / [rax+56] are the 5th /
  ;   6th stack arguments only if rax holds the stack pointer taken right
  ;   after a single 8-byte push at function entry (8 for that push + 8 for
  ;   the return address + 32 bytes of shadow space = 48) - confirm at the
  ;   call sites.
  ;   NOTE(review): movaps needs rsp to be 16-byte aligned when the macro is
  ;   expanded - presumably guaranteed by the caller; verify.
  %imacro collect_args 1
      sub         rsp, SIZEOF_XMMWORD
      movaps      XMMWORD [rsp], xmm6
      sub         rsp, SIZEOF_XMMWORD
      movaps      XMMWORD [rsp], xmm7
      mov         r10, rcx
  %if %1 > 1
      mov         r11, rdx
  %endif
  %if %1 > 2
      push        r12
      mov         r12, r8
  %endif
  %if %1 > 3
      push        r13
      mov         r13, r9
  %endif
  %if %1 > 4
      push        r14
      mov         r14, [rax+48]
  %endif
  %if %1 > 5
      push        r15
      mov         r15, [rax+56]
  %endif
      push        rsi
      push        rdi
  %endmacro
  ; uncollect_args n  (Win64)
  ;   Counterpart of the Win64 collect_args; n must be the same value.
  ;   Pops rdi and rsi, then those of r15..r12 that were pushed for n, and
  ;   finally reloads xmm7 and xmm6 from their stack slots - the exact
  ;   reverse of the order in which collect_args stored them.
  %imacro uncollect_args 1
      pop         rdi
      pop         rsi
  %if %1 > 5
      pop         r15
  %endif
  %if %1 > 4
      pop         r14
  %endif
  %if %1 > 3
      pop         r13
  %endif
  %if %1 > 2
      pop         r12
  %endif
      movaps      xmm7, XMMWORD [rsp]
      add         rsp, SIZEOF_XMMWORD
      movaps      xmm6, XMMWORD [rsp]
      add         rsp, SIZEOF_XMMWORD
  %endmacro
  ; push_xmm n / pop_xmm n  (Win64)
  ;   Save / restore the first n of xmm8..xmm11 in a stack area of
  ;   n * SIZEOF_XMMWORD bytes that push_xmm reserves and pop_xmm releases.
  ;   Use the same n for both.  Only four registers are handled: a larger n
  ;   reserves more space but still saves just xmm8..xmm11.
  ;
  ;   The argument is parenthesized wherever it is used, so an expression
  ;   such as "push_xmm 1+1" reserves the intended amount, and n = 0 expands
  ;   to nothing instead of touching [rsp].
  ;
  ;   NOTE(review): movaps needs rsp to be 16-byte aligned when the macro is
  ;   expanded - presumably guaranteed by the caller; verify.
  %imacro push_xmm 1
  %if (%1) > 0
      sub         rsp, (%1) * SIZEOF_XMMWORD
      movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
  %endif
  %if (%1) > 1
      movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
  %endif
  %if (%1) > 2
      movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
  %endif
  %if (%1) > 3
      movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
  %endif
  %endmacro

  %imacro pop_xmm 1
  %if (%1) > 0
      movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
  %endif
  %if (%1) > 1
      movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
  %endif
  %if (%1) > 2
      movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
  %endif
  %if (%1) > 3
      movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
  %endif
  %if (%1) > 0
      add         rsp, (%1) * SIZEOF_XMMWORD
  %endif
  %endmacro
  %else

  ; collect_args n  (64-bit, not Win64)
  ;   n = number of arguments to collect; thresholds up to 6 are handled.
  ;
  ;   Copies the register arguments to r10..r15, pushing the previous
  ;   content of each destination first:
  ;     arg1 rdi -> r10      arg4 rcx -> r13
  ;     arg2 rsi -> r11      arg5 r8  -> r14
  ;     arg3 rdx -> r12      arg6 r9  -> r15
  ;   uncollect_args with the same n pops them again in reverse order.
  ;   The number of pushes depends on n, and so does the resulting change
  ;   of rsp.
  %imacro collect_args 1
      push        r10
      mov         r10, rdi
  %if %1 > 1
      push        r11
      mov         r11, rsi
  %endif
  %if %1 > 2
      push        r12
      mov         r12, rdx
  %endif
  %if %1 > 3
      push        r13
      mov         r13, rcx
  %endif
  %if %1 > 4
      push        r14
      mov         r14, r8
  %endif
  %if %1 > 5
      push        r15
      mov         r15, r9
  %endif
  %endmacro
  ; uncollect_args n  (64-bit, not Win64)
  ;   Counterpart of collect_args; n must be the same value.  Pops the
  ;   registers that collect_args pushed, last one first (r15 ... r10).
  %imacro uncollect_args 1
  %if %1 > 5
      pop         r15
  %endif
  %if %1 > 4
      pop         r14
  %endif
  %if %1 > 3
      pop         r13
  %endif
  %if %1 > 2
      pop         r12
  %endif
  %if %1 > 1
      pop         r11
  %endif
      pop         r10
  %endmacro
  ; push_xmm n / pop_xmm n  (64-bit, not Win64)
  ;   Both expand to nothing here; they exist so that code written with the
  ;   Win64 variants assembles unchanged.
  %imacro push_xmm 1
  %endmacro

  %imacro pop_xmm 1
  %endmacro

  %endif  ; WIN64

  %endif  ; _x86_64_
  ; --------------------------------------------------------------------------
  ;  Defines picked up from the C headers
  ;
  ;  Pulled in last, after all of the declarations above.
  ;
  %include "jsimdcfg.inc"

  ; --------------------------------------------------------------------------