x86inc.asm 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. ;*****************************************************************************
  2. ;* x86inc.asm
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;*****************************************************************************
  22. ; FIXME: All of the 64bit asm functions that take a stride as an argument
  23. ; via register, assume that the high dword of that register is filled with 0.
  24. ; This is true in practice (since we never do any 64bit arithmetic on strides,
  25. ; and x264's strides are all positive), but is not guaranteed by the ABI.
  26. ; Name of the .rodata section.
  27. ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
  28. ; so use a different read-only section.
  ; SECTION_RODATA (no arguments): switch to the section used for constants.
  ;   macho64       -> SECTION .text, align=16
  ;   macho         -> SECTION .text, align=16, and emits the label `fakegot`
  ;                    (the macho variant of picgetgot defines GLOBAL relative
  ;                    to that label)
  ;   anything else -> SECTION .rodata, align=16
  29. %macro SECTION_RODATA 0
  30. %ifidn __OUTPUT_FORMAT__,macho64
  31. SECTION .text align=16
  32. %elifidn __OUTPUT_FORMAT__,macho
  33. SECTION .text align=16
  34. fakegot:
  35. %else
  36. SECTION .rodata align=16
  37. %endif
  38. %endmacro
  39. ; PIC support macros. All these macros are totally harmless when PIC is
  40. ; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
  41. ; objects cannot directly access global variables by address, they need to
  42. ; go through the GOT (global offset table). Most OSes do not care about it
  43. ; and let you load non-shared .so objects (Linux, Win32...). However, OS X
  44. ; requires PIC code in its .dylib objects.
  45. ;
  46. ; - GLOBAL should be used as a suffix for global addressing, eg.
  47. ; picgetgot ebx
  48. ; mov eax, [foo GLOBAL]
  49. ; instead of
  50. ; mov eax, [foo]
  51. ;
  52. ; - picgetgot computes the GOT address into the given register in PIC
  53. ; mode, otherwise does nothing. You need to do this before using GLOBAL.
  54. ; Before in both execution order and compiled code order (so GLOBAL knows
  55. ; which register the GOT is in).
  ; Selection of GLOBAL and picgetgot. Three configurations:
  ;   PIC undefined             : GLOBAL expands to nothing; picgetgot emits nothing.
  ;   PIC and ARCH_X86_64       : defines PIC64; GLOBAL is `wrt rip`;
  ;                               picgetgot emits nothing.
  ;   PIC without ARCH_X86_64   : defines PIC32; picgetgot emits code and
  ;                               (re)defines GLOBAL in terms of its argument.
  ; In every variant picgetgot takes one argument: %1 = the register to use.
  56. %ifndef PIC
  57. %define GLOBAL
  58. %macro picgetgot 1
  59. %endmacro
  60. %elifdef ARCH_X86_64
  61. %define PIC64
  62. %define GLOBAL wrt rip
  63. %macro picgetgot 1
  64. %endmacro
  65. %else
  66. %define PIC32
  67. %ifidn __OUTPUT_FORMAT__,macho
  68. ; There is no real global offset table on OS X, but we still
  69. ; need to reference our variables by offset.
  ; macho variant: call/pop leaves the address of %%getgot in %1, then
  ; ($$ - %%getgot) is added, where $$ is NASM's start-of-current-section
  ; symbol. GLOBAL is then redefined as `+ %1 - fakegot`; `fakegot` is the
  ; label emitted by SECTION_RODATA for the macho format.
  70. %macro picgetgot 1
  71. call %%getgot
  72. %%getgot:
  73. pop %1
  74. add %1, $$ - %%getgot
  75. %undef GLOBAL
  76. %define GLOBAL + %1 - fakegot
  77. %endmacro
  78. %else ; elf
  79. extern _GLOBAL_OFFSET_TABLE_
  ; ELF variant: call/pop leaves the address of %%getgot in %1, then the
  ; `_GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc` expression is added
  ; to it. GLOBAL is then redefined as `+ %1 wrt ..gotoff`.
  80. %macro picgetgot 1
  81. call %%getgot
  82. %%getgot:
  83. pop %1
  84. add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
  85. %undef GLOBAL
  86. %define GLOBAL + %1 wrt ..gotoff
  87. %endmacro
  88. %endif
  89. %endif
  90. ; Macros to eliminate most code duplication between x86_32 and x86_64:
  91. ; Currently this works only for leaf functions which load all their arguments
  92. ; into registers at the start, and make no other use of the stack. Luckily that
  93. ; covers most of x264's asm.
  94. ; PROLOGUE:
  95. ; %1 = number of arguments. loads them from stack if needed.
  96. ; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
  97. ; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
  98. ; %4 = list of names to define to registers
  99. ; PROLOGUE can also be invoked by adding the same options to cglobal
  100. ; e.g.
  101. ; cglobal foo, 2,3,0, dst, src, tmp
  102. ; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
  103. ; TODO Some functions can use some args directly from the stack. If they're the
  104. ; last args then you can just not declare them, but if they're in the middle
  105. ; we need a more flexible macro.
  106. ; RET:
  107. ; Pops anything that was pushed by PROLOGUE
  108. ; REP_RET:
  109. ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
  110. ; which are slow when a normal ret follows a branch.
  ; DECLARE_REG: define the size-suffixed aliases for abstract register r<N>.
  ;   %1 = N, the abstract register index
  ;   %2 = expansion of r<N>q and of the bare r<N>
  ;   %3 = expansion of r<N>d
  ;   %4 = expansion of r<N>w
  ;   %5 = expansion of r<N>b
  ;   %6 = expansion of r<N>m (the invocations in this file pass either a
  ;        register name or a [sp + stack_offset + K] memory operand)
  111. %macro DECLARE_REG 6
  112. %define r%1q %2
  113. %define r%1d %3
  114. %define r%1w %4
  115. %define r%1b %5
  116. %define r%1m %6
  117. %define r%1 %2
  118. %endmacro
  ; DECLARE_REG_SIZE: define q/d/w/b-suffixed aliases for a named register.
  ;   %1 = 16-bit register name (e.g. ax)
  ;   %2 = 8-bit register name  (e.g. al)
  ; For %1 = ax this yields: raxq/eaxq -> rax, raxd/eaxd -> eax,
  ; raxw/eaxw -> ax, raxb/eaxb -> al. When ARCH_X86_64 is not defined,
  ; the bare r<%1> name is additionally defined as e<%1>.
  119. %macro DECLARE_REG_SIZE 2
  120. %define r%1q r%1
  121. %define e%1q r%1
  122. %define r%1d e%1
  123. %define e%1d e%1
  124. %define r%1w %1
  125. %define e%1w %1
  126. %define r%1b %2
  127. %define e%1b %2
  128. %ifndef ARCH_X86_64
  129. %define r%1 e%1
  130. %endif
  131. %endmacro
  ; Size-suffixed aliases for the seven named registers, followed by
  ; gprsize: 8 when ARCH_X86_64 is defined, otherwise 4. gprsize is the
  ; amount the PUSH/POP macros add to / subtract from stack_offset.
  132. DECLARE_REG_SIZE ax, al
  133. DECLARE_REG_SIZE bx, bl
  134. DECLARE_REG_SIZE cx, cl
  135. DECLARE_REG_SIZE dx, dl
  136. DECLARE_REG_SIZE si, sil
  137. DECLARE_REG_SIZE di, dil
  138. DECLARE_REG_SIZE bp, bpl
  139. %ifdef ARCH_X86_64
  140. %define gprsize 8
  141. %else
  142. %define gprsize 4
  143. %endif
  ; PUSH: emit `push %1` and add gprsize to stack_offset, so that the
  ; r<N>m operands (defined relative to stack_offset) stay correct.
  144. %macro PUSH 1
  145. push %1
  146. %assign stack_offset stack_offset+gprsize
  147. %endmacro
  ; POP: emit `pop %1` and subtract gprsize from stack_offset
  ; (the inverse bookkeeping of PUSH).
  148. %macro POP 1
  149. pop %1
  150. %assign stack_offset stack_offset-gprsize
  151. %endmacro
  ; SUB: emit `sub %1, %2`. When the destination compares identical to
  ; `rsp`, %2 is also added to stack_offset; (%2) must therefore be an
  ; assemble-time constant in that case, since it is used in %assign.
  152. %macro SUB 2
  153. sub %1, %2
  154. %ifidn %1, rsp
  155. %assign stack_offset stack_offset+(%2)
  156. %endif
  157. %endmacro
  ; ADD: emit `add %1, %2`. When the destination compares identical to
  ; `rsp`, %2 is also subtracted from stack_offset; (%2) must therefore be
  ; an assemble-time constant in that case, since it is used in %assign.
  158. %macro ADD 2
  159. add %1, %2
  160. %ifidn %1, rsp
  161. %assign stack_offset stack_offset-(%2)
  162. %endif
  163. %endmacro
  ; movifnidn: emit `mov %1, %2` unless the two operands compare identical
  ; (%ifnidn), in which case nothing is emitted.
  164. %macro movifnidn 2
  165. %ifnidn %1, %2
  166. mov %1, %2
  167. %endif
  168. %endmacro
  ; movsxdifnidn: emit `movsxd %1, %2` unless the two operands compare
  ; identical (%ifnidn), in which case nothing is emitted.
  169. %macro movsxdifnidn 2
  170. %ifnidn %1, %2
  171. movsxd %1, %2
  172. %endif
  173. %endmacro
  ; ASSERT: assemble-time check. %1 is an expression; when it evaluates
  ; to 0 the assembler is told to report `%error assert failed`.
  174. %macro ASSERT 1
  175. %if (%1) == 0
  176. %error assert failed
  177. %endif
  178. %endmacro
  ; DEFINE_ARGS name0, name1, ...: give symbolic names to r0, r1, ...
  ; For the i-th name it defines <name>q/<name>d/<name>w/<name>b as
  ; r<i>q/r<i>d/r<i>w/r<i>b. The names are remembered in arg_name<i> and
  ; their count in n_arg_names, so that the next invocation can first
  ; undefine everything the previous one created.
  179. %macro DEFINE_ARGS 0-*
  ; Phase 1: remove the aliases left over from the previous invocation.
  180. %ifdef n_arg_names
  181. %assign %%i 0
  182. %rep n_arg_names
  183. CAT_UNDEF arg_name %+ %%i, q
  184. CAT_UNDEF arg_name %+ %%i, d
  185. CAT_UNDEF arg_name %+ %%i, w
  186. CAT_UNDEF arg_name %+ %%i, b
  187. CAT_UNDEF arg_name, %%i
  188. %assign %%i %%i+1
  189. %endrep
  190. %endif
  ; Phase 2: walk the argument list; %rotate brings the next name into %1.
  191. %assign %%i 0
  192. %rep %0
  193. %xdefine %1q r %+ %%i %+ q
  194. %xdefine %1d r %+ %%i %+ d
  195. %xdefine %1w r %+ %%i %+ w
  196. %xdefine %1b r %+ %%i %+ b
  197. CAT_XDEFINE arg_name, %%i, %1
  198. %assign %%i %%i+1
  199. %rotate 1
  200. %endrep
  ; Record how many names were defined, for the cleanup in the next call.
  201. %assign n_arg_names %%i
  202. %endmacro
  203. %ifdef ARCH_X86_64 ;==========================================================
  204. %ifidn __OUTPUT_FORMAT__,win32
  ; Register map used when ARCH_X86_64 is defined and the output format
  ; test above matches `win32`: r0-r3 = rcx, rdx, r8, r9 (their "m" form is
  ; the dword register); r4-r6 = rdi, rsi, rax with "m" forms at
  ; [rsp + stack_offset + 40/48/56]; r7m and r8m exist only as memory
  ; operands.
  ; NOTE(review): this sits inside the ARCH_X86_64 branch but tests for the
  ; `win32` output format; confirm whether `win64` was intended.
  205. DECLARE_REG 0, rcx, ecx, cx, cl, ecx
  206. DECLARE_REG 1, rdx, edx, dx, dl, edx
  207. DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
  208. DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
  209. DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
  210. DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
  211. DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
  212. %define r7m [rsp + stack_offset + 64]
  213. %define r8m [rsp + stack_offset + 72]
  ; LOAD_IF_USED (variant paired with the rcx/rdx/r8/r9 register map):
  ; if reg_id < number_of_args, load r<reg_id> from [rsp + 8 + reg_id*8].
  ; The address has no stack_offset term; PROLOGUE resets stack_offset to 0
  ; before invoking this macro.
  214. %macro LOAD_IF_USED 2 ; reg_id, number_of_args
  215. %if %1 < %2
  216. mov r%1, [rsp + 8 + %1*8]
  217. %endif
  218. %endmacro
  219. %else ;=======================================================================
  ; Register map for the remaining ARCH_X86_64 output formats:
  ; r0-r5 = rdi, rsi, rdx, rcx, r8, r9 (their "m" form is the dword
  ; register); r6 = rax with "m" form [rsp + stack_offset + 8]; r7m and r8m
  ; exist only as memory operands at +16 and +24.
  220. DECLARE_REG 0, rdi, edi, di, dil, edi
  221. DECLARE_REG 1, rsi, esi, si, sil, esi
  222. DECLARE_REG 2, rdx, edx, dx, dl, edx
  223. DECLARE_REG 3, rcx, ecx, cx, cl, ecx
  224. DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
  225. DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
  226. DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
  227. %define r7m [rsp + stack_offset + 16]
  228. %define r8m [rsp + stack_offset + 24]
  ; LOAD_IF_USED (variant paired with the rdi/rsi/rdx/rcx/r8/r9 register
  ; map): if reg_id < number_of_args, load r<reg_id> from
  ; [rsp - 40 + reg_id*8]. For reg_id = 6 this is [rsp + 8], the same slot
  ; as r6m when stack_offset is 0.
  229. %macro LOAD_IF_USED 2 ; reg_id, number_of_args
  230. %if %1 < %2
  231. mov r%1, [rsp - 40 + %1*8]
  232. %endif
  233. %endmacro
  234. %endif ; !WIN64
  ; PROLOGUE (ARCH_X86_64 variant).
  ;   %1 = number of arguments
  ;   %2 = number of registers used; must be >= %1 and <= 7
  ;   %3 = pic flag (defaults to 0; not referenced in this variant)
  ;   %4 = remaining arguments, forwarded to DEFINE_ARGS as register names
  ; Resets stack_offset to 0, loads r6 (and, for the win32 format test,
  ; r4 and r5) from the stack when %1 is large enough to need them, then
  ; names the registers. No registers are pushed.
  235. %macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
  236. ASSERT %2 >= %1
  237. ASSERT %2 <= 7
  238. %assign stack_offset 0
  239. %ifidn __OUTPUT_FORMAT__,win32
  240. LOAD_IF_USED 4, %1
  241. LOAD_IF_USED 5, %1
  242. %endif
  243. LOAD_IF_USED 6, %1
  244. DEFINE_ARGS %4
  245. %endmacro
  ; RET (ARCH_X86_64 variant): a plain `ret`; the matching PROLOGUE pushes
  ; nothing, so there is nothing to pop.
  246. %macro RET 0
  247. ret
  248. %endmacro
  ; REP_RET (ARCH_X86_64 variant): always emits the two-byte `rep ret`.
  249. %macro REP_RET 0
  250. rep ret
  251. %endmacro
  252. %else ; X86_32 ;==============================================================
  ; x86_32 register map: r0-r6 = eax, ecx, edx, ebx, esi, edi, ebp.
  ; The q and d forms are both the 32-bit register. The byte forms of
  ; r4-r6 are declared as `null`. Every r<N>m is the stack slot
  ; [esp + stack_offset + 4*(N+1)]; r7m and r8m exist only as memory
  ; operands. `rsp` is defined as esp so code written against rsp assembles
  ; here too.
  253. DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
  254. DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
  255. DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
  256. DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
  257. DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
  258. DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
  259. DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
  260. %define r7m [esp + stack_offset + 32]
  261. %define r8m [esp + stack_offset + 36]
  262. %define rsp esp
  ; PUSH_IF_USED: if reg_id < regs_used, push r<reg_id> and add 4 to
  ; stack_offset. regs_used is set by PROLOGUE.
  263. %macro PUSH_IF_USED 1 ; reg_id
  264. %if %1 < regs_used
  265. push r%1
  266. %assign stack_offset stack_offset+4
  267. %endif
  268. %endmacro
  ; POP_IF_USED: if reg_id < regs_used, pop r<reg_id>. Unlike
  ; PUSH_IF_USED this does not modify stack_offset.
  269. %macro POP_IF_USED 1 ; reg_id
  270. %if %1 < regs_used
  271. pop r%1
  272. %endif
  273. %endmacro
  ; LOAD_IF_USED (x86_32 variant): if reg_id < number_of_args, load
  ; r<reg_id> from [esp + stack_offset + 4 + reg_id*4], i.e. the same
  ; address as r<reg_id>m at the current stack_offset.
  274. %macro LOAD_IF_USED 2 ; reg_id, number_of_args
  275. %if %1 < %2
  276. mov r%1, [esp + stack_offset + 4 + %1*4]
  277. %endif
  278. %endmacro
  ; PROLOGUE (x86_32 variant).
  ;   %1 = number of arguments
  ;   %2 = number of registers used, not counting the PIC register; >= %1
  ;   %3 = nonzero if the function uses global constants (defaults to 0)
  ;   %4 = remaining arguments, forwarded to DEFINE_ARGS as register names
  ; Sets stack_offset and regs_used, pushes r3-r6 as needed, loads the
  ; arguments from the stack, optionally initialises the GOT register, and
  ; names the registers.
  279. %macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
  280. ASSERT %2 >= %1
  281. %assign stack_offset 0
  282. %assign regs_used %2
  ; Under PIC, a function that uses globals needs one extra register.
  283. %ifdef PIC
  284. %if %3
  285. %assign regs_used regs_used+1
  286. %endif
  287. %endif
  288. ASSERT regs_used <= 7
  ; Only r3-r6 (ebx, esi, edi, ebp) are saved; r0-r2 are never pushed.
  289. PUSH_IF_USED 3
  290. PUSH_IF_USED 4
  291. PUSH_IF_USED 5
  292. PUSH_IF_USED 6
  ; stack_offset now accounts for the pushes, so the loads below still
  ; address the caller's argument slots.
  293. LOAD_IF_USED 0, %1
  294. LOAD_IF_USED 1, %1
  295. LOAD_IF_USED 2, %1
  296. LOAD_IF_USED 3, %1
  297. LOAD_IF_USED 4, %1
  298. LOAD_IF_USED 5, %1
  299. LOAD_IF_USED 6, %1
  ; r<%2> is the first register past the ones the function asked for;
  ; it is handed to picgetgot (which emits nothing when PIC is undefined).
  300. %if %3
  301. picgetgot r%2
  302. %endif
  303. DEFINE_ARGS %4
  304. %endmacro
  ; RET (x86_32 variant): pop r6..r3 (the reverse of the order in which
  ; PROLOGUE pushes them), each only if it is below regs_used, then `ret`.
  305. %macro RET 0
  306. POP_IF_USED 6
  307. POP_IF_USED 5
  308. POP_IF_USED 4
  309. POP_IF_USED 3
  310. ret
  311. %endmacro
  ; REP_RET (x86_32 variant): when regs_used > 3 at least one register was
  ; pushed, so the full RET sequence is emitted; otherwise the two-byte
  ; `rep ret` is used.
  312. %macro REP_RET 0
  313. %if regs_used > 3
  314. RET
  315. %else
  316. rep ret
  317. %endif
  318. %endmacro
  319. %endif ;======================================================================
  320. ;=============================================================================
  321. ; arch-independent part
  322. ;=============================================================================
  ; function_align: alignment applied by cglobal before each function label.
  323. %assign function_align 16
  324. ; Symbol prefix for C linkage
  ; cglobal name [, PROLOGUE args...]: declare and open an exported function.
  ;   %1 = function name as written in the source
  ;   %2 = optional remaining arguments, passed unchanged to PROLOGUE
  ; The name is redefined to ff_<name>, with an additional leading `_` when
  ; PREFIX is defined. The symbol is declared global (as a hidden function
  ; for the elf output format), aligned to function_align, and its label is
  ; emitted.
  325. %macro cglobal 1-2+
  326. %xdefine %1 ff_%1
  327. %ifdef PREFIX
  328. %xdefine %1 _ %+ %1
  329. %endif
  330. %ifidn __OUTPUT_FORMAT__,elf
  331. global %1:function hidden
  332. %else
  333. global %1
  334. %endif
  335. align function_align
  336. %1:
  337. RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
  338. %if %0 > 1
  339. PROLOGUE %2
  340. %endif
  341. %endmacro
  ; cextern name: declare an external symbol. When PREFIX is defined the
  ; symbol is declared as _<name> and <name> is defined to expand to
  ; _<name>; otherwise <name> is declared extern as-is.
  342. %macro cextern 1
  343. %ifdef PREFIX
  344. extern _%1
  345. %define %1 _%1
  346. %else
  347. extern %1
  348. %endif
  349. %endmacro
  350. ; This is needed for ELF, otherwise the GNU linker assumes the stack is
  351. ; executable by default.
  ; For the elf output format, emit a .note.GNU-stack section with the
  ; noalloc/noexec/nowrite/progbits attributes.
  352. %ifidn __OUTPUT_FORMAT__,elf
  353. SECTION .note.GNU-stack noalloc noexec nowrite progbits
  354. %endif
  ; Numeric constants FENC_STRIDE = 16 and FDEC_STRIDE = 32; they are not
  ; referenced anywhere else in this part of the file.
  355. %assign FENC_STRIDE 16
  356. %assign FDEC_STRIDE 32
  357. ; merge mmx and sse*
  ; CAT_XDEFINE a, b, value: %xdefine the identifier formed by pasting
  ; %1 and %2 together, giving it the value %3.
  358. %macro CAT_XDEFINE 3
  359. %xdefine %1%2 %3
  360. %endmacro
  ; CAT_UNDEF a, b: %undef the identifier formed by pasting %1 and %2.
  361. %macro CAT_UNDEF 2
  362. %undef %1%2
  363. %endmacro
  ; INIT_MMX: select the MMX register set for the m<N> aliases.
  ; Sets RESET_MM_PERMUTATION to INIT_MMX, mmsize = 8, num_mmregs = 8, and
  ; maps mova/movu -> movq, movh -> movd, movnt -> movntq.
  364. %macro INIT_MMX 0
  365. %define RESET_MM_PERMUTATION INIT_MMX
  366. %define mmsize 8
  367. %define num_mmregs 8
  368. %define mova movq
  369. %define movu movq
  370. %define movh movd
  371. %define movnt movntq
  ; m0..m7 = mm0..mm7, and the reverse lookup nmm<i> = i.
  372. %assign %%i 0
  373. %rep 8
  374. CAT_XDEFINE m, %%i, mm %+ %%i
  375. CAT_XDEFINE nmm, %%i, %%i
  376. %assign %%i %%i+1
  377. %endrep
  ; %%i is not reset here, so this loop covers indices 8..15: it removes
  ; any m8..m15 / nmm8..nmm15 definitions.
  378. %rep 8
  379. CAT_UNDEF m, %%i
  380. CAT_UNDEF nmm, %%i
  381. %assign %%i %%i+1
  382. %endrep
  383. %endmacro
  ; INIT_XMM: select the XMM register set for the m<N> aliases.
  ; Sets RESET_MM_PERMUTATION to INIT_XMM, mmsize = 16, num_mmregs = 8
  ; (16 when ARCH_X86_64 is defined), and maps mova -> movdqa,
  ; movu -> movdqu, movh -> movq, movnt -> movntdq.
  384. %macro INIT_XMM 0
  385. %define RESET_MM_PERMUTATION INIT_XMM
  386. %define mmsize 16
  387. %define num_mmregs 8
  388. %ifdef ARCH_X86_64
  389. %define num_mmregs 16
  390. %endif
  391. %define mova movdqa
  392. %define movu movdqu
  393. %define movh movq
  394. %define movnt movntdq
  ; m<i> = xmm<i> and the reverse lookup nxmm<i> = i, for i < num_mmregs.
  395. %assign %%i 0
  396. %rep num_mmregs
  397. CAT_XDEFINE m, %%i, xmm %+ %%i
  398. CAT_XDEFINE nxmm, %%i, %%i
  399. %assign %%i %%i+1
  400. %endrep
  401. %endmacro
  ; Start in MMX mode, so that RESET_MM_PERMUTATION (used by cglobal) and
  ; the m<N> aliases are defined before any function is declared.
  402. INIT_MMX
  403. ; I often want to use macros that permute their arguments. e.g. there's no
  404. ; efficient way to implement butterfly or transpose or dct without swapping some
  405. ; arguments.
  406. ;
  407. ; I would like to not have to manually keep track of the permutations:
  408. ; If I insert a permutation in the middle of a function, it should automatically
  409. ; change everything that follows. For more complex macros I may also have multiple
  410. ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
  411. ;
  412. ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
  413. ; permutes its arguments. It's equivalent to exchanging the contents of the
  414. ; registers, except that this way you exchange the register names instead, so it
  415. ; doesn't cost any cycles.
  ; PERMUTE dst0, src0, dst1, src1, ...: apply several register-name
  ; assignments at once. Arguments are consumed in pairs (%1 = destination
  ; index, %2 = source index) and the list is walked twice so that every
  ; source is captured before any destination is overwritten.
  416. %macro PERMUTE 2-* ; takes a list of pairs to swap
  ; Pass 1: snapshot m<src> and nm<src> into tmp<src> / ntmp<src>.
  417. %rep %0/2
  418. %xdefine tmp%2 m%2
  419. %xdefine ntmp%2 nm%2
  420. %rotate 2
  421. %endrep
  ; Pass 2: assign the snapshots to m<dst> / nm<dst> and drop the temps.
  422. %rep %0/2
  423. %xdefine m%1 tmp%2
  424. %xdefine nm%1 ntmp%2
  425. %undef tmp%2
  426. %undef ntmp%2
  427. %rotate 2
  428. %endrep
  429. %endmacro
  ; SWAP a, b, c, ...: exchange register names along a chain. For N
  ; arguments it performs N-1 pairwise exchanges (a<->b, then b<->c, ...),
  ; rotating the argument list by one after each exchange. The arguments
  ; may be indices ("SWAP 0,1") or register aliases ("SWAP m0,m1"); the
  ; two forms are told apart by whether m<%1> is a defined macro.
  430. %macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
  431. %rep %0-1
  432. %ifdef m%1
  ; Index form: exchange m<%1> and m<%2>, then refresh the reverse lookup
  ; entries n<register> for both.
  433. %xdefine tmp m%1
  434. %xdefine m%1 m%2
  435. %xdefine m%2 tmp
  436. CAT_XDEFINE n, m%1, %1
  437. CAT_XDEFINE n, m%2, %2
  438. %else
  439. ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
  440. ; Be careful using this mode in nested macros though, as in some cases there may be
  441. ; other copies of m# that have already been dereferenced and don't get updated correctly.
  ; %%n1 / %%n2 are the indices recovered through the n<register> lookup.
  442. %xdefine %%n1 n %+ %1
  443. %xdefine %%n2 n %+ %2
  444. %xdefine tmp m %+ %%n1
  445. CAT_XDEFINE m, %%n1, m %+ %%n2
  446. CAT_XDEFINE m, %%n2, tmp
  447. CAT_XDEFINE n, m %+ %%n1, %%n1
  448. CAT_XDEFINE n, m %+ %%n2, %%n2
  449. %endif
  450. %undef tmp
  451. %rotate 1
  452. %endrep
  453. %endmacro
  ; SAVE_MM_PERMUTATION name: record the current register naming by
  ; defining <name>_m<i> as the current value of m<i>, for every
  ; i < num_mmregs.
  454. %macro SAVE_MM_PERMUTATION 1
  455. %assign %%i 0
  456. %rep num_mmregs
  457. CAT_XDEFINE %1_m, %%i, m %+ %%i
  458. %assign %%i %%i+1
  459. %endrep
  460. %endmacro
  ; LOAD_MM_PERMUTATION name: restore a naming recorded by
  ; SAVE_MM_PERMUTATION by defining m<i> as <name>_m<i>, for every
  ; i < num_mmregs.
  461. %macro LOAD_MM_PERMUTATION 1
  462. %assign %%i 0
  463. %rep num_mmregs
  464. CAT_XDEFINE m, %%i, %1_m %+ %%i
  465. %assign %%i %%i+1
  466. %endrep
  467. %endmacro
  ; call target: wrapper around the `call` instruction. After emitting the
  ; call, if <target>_m0 is defined (i.e. a permutation was saved under the
  ; target's name with SAVE_MM_PERMUTATION), that permutation is loaded so
  ; the m<N> names after the call match the ones recorded for the callee.
  468. %macro call 1
  469. call %1
  470. %ifdef %1_m0
  471. LOAD_MM_PERMUTATION %1
  472. %endif
  473. %endmacro
  474. ; substitutions which are functionally identical but reduce code size
  ; From here on the movdqa / movdqu mnemonics expand to movaps / movups.
  ; mova / movu under INIT_XMM are defined as movdqa / movdqu, so they
  ; resolve through these definitions as well.
  475. %define movdqa movaps
  476. %define movdqu movups