; resample.asm — x86-optimized float audio resampling (libswresample, FFmpeg)
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

; 'pointer' reserves one pointer-sized slot inside a struc:
; 8 bytes (resq) on x86-64, 4 bytes (resd) on x86-32.
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif
; Assembly-side mirror of the leading fields of the C ResampleContext.
; NOTE(review): field order/widths are assumed to match the C struct exactly —
; keep in sync with the C declaration; only these leading members are used here.
struc ResampleContext
    .av_class:              pointer 1   ; AVClass* (unused here, keeps offsets)
    .filter_bank:           pointer 1   ; base of the polyphase filter tables
    .filter_length:         resd 1      ; taps per filter phase
    .filter_alloc:          resd 1      ; allocated taps per phase (row stride)
    .ideal_dst_incr:        resd 1
    .dst_incr:              resd 1
    .dst_incr_div:          resd 1      ; integer part of the output step
    .dst_incr_mod:          resd 1      ; fractional part of the output step
    .index:                 resd 1      ; current source position (phase+integer)
    .frac:                  resd 1      ; fractional position, in [0, src_incr)
    .src_incr:              resd 1      ; denominator for frac
    .compensation_distance: resd 1
    .phase_shift:           resd 1      ; index >> phase_shift = whole samples
    .phase_mask:            resd 1      ; index & phase_mask = filter phase
    ; there's a few more here but we only care about the first few
endstruc
SECTION_RODATA

pf_1: dd 1.0        ; single-precision 1.0; used to form 1.0/src_incr (xm4)

SECTION .text
%macro RESAMPLE_FLOAT_FNS 0
; int resample_common_float(ResampleContext *ctx, float *dst,
;                           const float *src, int size, int update_ctx)
;
; Polyphase FIR resampler, common (non-interpolating) variant: each output
; sample is the dot product of a window of 'src' with the filter phase
; selected by the low bits of 'index'.  When update_ctx is non-zero, the
; updated index/frac are stored back into ctx and eax returns the number
; of consumed source samples.
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
                                         dst_incr_mod, size, min_filter_count_x4, \
                                         min_filter_len_x4, dst_incr_div, src_incr, \
                                         phase_mask, dst_end, filter_bank

    ; use red-zone for variable storage
%define ctx_stackq            [rsp-0x8]
%define src_stackq            [rsp-0x10]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x14]
%endif

    ; load as many variables in registers as possible; for the rest, store
    ; on stack so that we have 'ctx' available as one extra register
    mov     sized, r3d
    mov     phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
    mov     update_context_stackd, r4d
%endif
    mov     indexd, [ctxq+ResampleContext.index]
    mov     fracd, [ctxq+ResampleContext.frac]
    mov     dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
    mov     filter_bankq, [ctxq+ResampleContext.filter_bank]
    mov     src_incrd, [ctxq+ResampleContext.src_incr]
    mov     ctx_stackq, ctxq
    mov     min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
    mov     dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl     min_filter_len_x4d, 2             ; filter length in bytes (x4)
    lea     dst_endq, [dstq+sizeq*4]

%if UNIX64
    mov     ecx, [ctxq+ResampleContext.phase_shift]
    mov     edi, [ctxq+ResampleContext.filter_alloc]

    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
    mov     R9d, [ctxq+ResampleContext.filter_alloc]
    mov     ecx, [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%endif

    ; bias src and filter_bank by +filter_length bytes so the inner loop can
    ; run a negative byte offset up towards zero (sign flag ends the loop)
    neg     min_filter_len_x4q
    sub     filter_bankq, min_filter_len_x4q
    sub     srcq, min_filter_len_x4q
    mov     src_stackq, srcq
%else ; x86-32
cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
                                        index, min_filter_length_x4, filter_bank

    ; push temp variables to stack
%define ctx_stackq            r0mp
%define src_stackq            r2mp
%define update_context_stackd r4m

    mov     dstq, r1mp
    mov     r3, r3mp
    lea     r3, [dstq+r3*4]                   ; r3 = dst_end
    PUSH    dword [ctxq+ResampleContext.dst_incr_div]
    PUSH    dword [ctxq+ResampleContext.dst_incr_mod]
    PUSH    dword [ctxq+ResampleContext.filter_alloc]
    PUSH    r3
    PUSH    dword [ctxq+ResampleContext.phase_mask]
    PUSH    dword [ctxq+ResampleContext.src_incr]
    mov     min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov     indexd, [ctxq+ResampleContext.index]
    shl     min_filter_length_x4d, 2
    mov     fracd, [ctxq+ResampleContext.frac]
    neg     min_filter_length_x4q
    mov     filter_bankq, [ctxq+ResampleContext.filter_bank]
    sub     r2mp, min_filter_length_x4q       ; bias src (same trick as 64-bit)
    sub     filter_bankq, min_filter_length_x4q
    PUSH    min_filter_length_x4q
    PUSH    filter_bankq
    mov     phase_shiftd, [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter

    ; names for the stack slots pushed above (top of stack = last push)
%define filter_bankq          dword [rsp+0x0]
%define min_filter_length_x4q dword [rsp+0x4]
%define src_incrd             dword [rsp+0x8]
%define phase_maskd           dword [rsp+0xc]
%define dst_endq              dword [rsp+0x10]
%define filter_allocd         dword [rsp+0x14]
%define dst_incr_modd         dword [rsp+0x18]
%define dst_incr_divd         dword [rsp+0x1c]

    mov     srcq, r2mp
%endif

.loop:
    ; filter = &filter_bank[index * filter_alloc] (x4 for float entries)
    mov     filterd, filter_allocd
    imul    filterd, indexd
%if ARCH_X86_64
    mov     min_filter_count_x4q, min_filter_len_x4q
    lea     filterq, [filter_bankq+filterq*4]
%else ; x86-32
    mov     min_filter_count_x4q, filter_bankq
    lea     filterq, [min_filter_count_x4q+filterq*4]
    mov     min_filter_count_x4q, min_filter_length_x4q
%endif
    xorps   m0, m0, m0                        ; accumulator

    align 16
.inner_loop:
    ; multiply-accumulate one vector of samples against the filter taps;
    ; min_filter_count_x4 is a negative byte offset counting up to zero
    movups  m1, [srcq+min_filter_count_x4q*1]
    mulps   m1, m1, [filterq+min_filter_count_x4q*1]
    addps   m0, m0, m1
    add     min_filter_count_x4q, mmsize
    js .inner_loop

%if cpuflag(avx)
    ; fold the upper ymm lane into the lower one before the xmm reduction
    vextractf128 xm1, m0, 0x1
    addps   xm0, xm1
%endif

    ; horizontal sum & store
    movhlps xm1, xm0
    addps   xm0, xm1
    shufps  xm1, xm0, xm0, q0001
    add     fracd, dst_incr_modd              ; advance fractional position
    addps   xm0, xm1
    add     indexd, dst_incr_divd             ; advance integer position
    movss   [dstq], xm0

    ; carry: when frac overflows src_incr, bump index by one
    cmp     fracd, src_incrd
    jl .skip
    sub     fracd, src_incrd
    inc     indexd

%if UNIX64
    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, phase_mask, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
%endif

.skip:
    ; src += (index >> phase_shift) samples; index keeps only the phase bits
    mov     index_incrd, indexd
    add     dstq, 4
    and     indexd, phase_maskd
    sar     index_incrd, phase_shiftb
    lea     srcq, [srcq+index_incrq*4]
    cmp     dstq, dst_endq
    jne .loop

%if ARCH_X86_64
    DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
%else ; x86-32
    DEFINE_ARGS src, ctx, update_context, frac, index
%endif

    cmp     dword update_context_stackd, 0
    jz .skip_store
    ; strictly speaking, the function should always return the consumed
    ; number of bytes; however, we only use the value if update_context
    ; is true, so let's just leave it uninitialized otherwise
    mov     ctxq, ctx_stackq
    movifnidn rax, srcq
    mov     [ctxq+ResampleContext.frac ], fracd
    sub     rax, src_stackq
    mov     [ctxq+ResampleContext.index], indexd
    shr     rax, 2                            ; byte delta -> sample count

.skip_store:
%if ARCH_X86_32
    ADD     rsp, 0x20                         ; pop the 8 dwords pushed above
%endif
    RET

; int resample_linear_float(ResampleContext *ctx, float *dst,
;                           const float *src, int size, int update_ctx)
;
; Linear variant: evaluates two adjacent filter phases (filter1/filter2) and
; interpolates between their results by frac/src_incr; xm4 holds the
; precomputed 1.0/src_incr.
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac, \
                                         dst_incr_mod, size, min_filter_count_x4, \
                                         min_filter_len_x4, dst_incr_div, src_incr, \
                                         phase_mask, dst_end, filter_bank

    ; use red-zone for variable storage
%define ctx_stackq            [rsp-0x8]
%define src_stackq            [rsp-0x10]
%define phase_mask_stackd     [rsp-0x14]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x18]
%endif

    ; load as many variables in registers as possible; for the rest, store
    ; on stack so that we have 'ctx' available as one extra register
    mov     sized, r3d
    mov     phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
    mov     update_context_stackd, r4d
%endif
    mov     indexd, [ctxq+ResampleContext.index]
    mov     fracd, [ctxq+ResampleContext.frac]
    mov     dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
    mov     filter_bankq, [ctxq+ResampleContext.filter_bank]
    mov     src_incrd, [ctxq+ResampleContext.src_incr]
    mov     ctx_stackq, ctxq
    mov     phase_mask_stackd, phase_maskd
    mov     min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
    cvtsi2ss xm0, src_incrd
    movss   xm4, [pf_1]
    divss   xm4, xm0                          ; xm4 = 1.0 / src_incr
    mov     dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl     min_filter_len_x4d, 2
    lea     dst_endq, [dstq+sizeq*4]

%if UNIX64
    mov     ecx, [ctxq+ResampleContext.phase_shift]
    mov     edi, [ctxq+ResampleContext.filter_alloc]

    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
                filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, filter2, dst_end, filter_bank
%elif WIN64
    mov     R9d, [ctxq+ResampleContext.filter_alloc]
    mov     ecx, [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, filter2, dst_end, filter_bank
%endif

    ; same negative-offset biasing trick as in the common variant
    neg     min_filter_len_x4q
    sub     filter_bankq, min_filter_len_x4q
    sub     srcq, min_filter_len_x4q
    mov     src_stackq, srcq
%else ; x86-32
cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
                                        index, min_filter_length_x4, filter_bank

    ; push temp variables to stack
%define ctx_stackq            r0mp
%define src_stackq            r2mp
%define update_context_stackd r4m

    mov     dstq, r1mp
    mov     r3, r3mp
    lea     r3, [dstq+r3*4]                   ; r3 = dst_end
    PUSH    dword [ctxq+ResampleContext.dst_incr_div]
    PUSH    r3
    mov     r3, dword [ctxq+ResampleContext.filter_alloc]
    PUSH    dword [ctxq+ResampleContext.dst_incr_mod]
    PUSH    r3
    shl     r3, 2                             ; filter_alloc in bytes
    PUSH    r3
    mov     r3, dword [ctxq+ResampleContext.src_incr]
    PUSH    dword [ctxq+ResampleContext.phase_mask]
    PUSH    r3d
    cvtsi2ss xm0, r3d
    movss   xm4, [pf_1]
    divss   xm4, xm0                          ; xm4 = 1.0 / src_incr
    mov     min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov     indexd, [ctxq+ResampleContext.index]
    shl     min_filter_length_x4d, 2
    mov     fracd, [ctxq+ResampleContext.frac]
    neg     min_filter_length_x4q
    mov     filter_bankq, [ctxq+ResampleContext.filter_bank]
    sub     r2mp, min_filter_length_x4q
    sub     filter_bankq, min_filter_length_x4q
    PUSH    min_filter_length_x4q
    PUSH    filter_bankq
    PUSH    dword [ctxq+ResampleContext.phase_shift]

    DEFINE_ARGS src, filter1, dst, frac, index, min_filter_count_x4, filter2

    ; names for the stack slots pushed above (top of stack = last push)
%define phase_shift_stackd    dword [rsp+0x0]
%define filter_bankq          dword [rsp+0x4]
%define min_filter_length_x4q dword [rsp+0x8]
%define src_incrd             dword [rsp+0xc]
%define phase_mask_stackd     dword [rsp+0x10]
%define filter_alloc_x4q      dword [rsp+0x14]
%define filter_allocd         dword [rsp+0x18]
%define dst_incr_modd         dword [rsp+0x1c]
%define dst_endq              dword [rsp+0x20]
%define dst_incr_divd         dword [rsp+0x24]

    mov     srcq, r2mp
%endif

.loop:
    ; filter1 = phase 'index'; filter2 = the next phase (filter_alloc floats on)
    mov     filter1d, filter_allocd
    imul    filter1d, indexd
%if ARCH_X86_64
    mov     min_filter_count_x4q, min_filter_len_x4q
    lea     filter1q, [filter_bankq+filter1q*4]
    lea     filter2q, [filter1q+filter_allocq*4]
%else ; x86-32
    mov     min_filter_count_x4q, filter_bankq
    lea     filter1q, [min_filter_count_x4q+filter1q*4]
    mov     min_filter_count_x4q, min_filter_length_x4q
    mov     filter2q, filter1q
    add     filter2q, filter_alloc_x4q
%endif
    xorps   m0, m0, m0                        ; accumulator for filter1
    xorps   m2, m2, m2                        ; accumulator for filter2

    align 16
.inner_loop:
    ; accumulate both phases in one pass over the source window
    movups  m1, [srcq+min_filter_count_x4q*1]
    mulps   m3, m1, [filter2q+min_filter_count_x4q*1]
    mulps   m1, m1, [filter1q+min_filter_count_x4q*1]
    addps   m2, m2, m3
    addps   m0, m0, m1
    add     min_filter_count_x4q, mmsize
    js .inner_loop

%if cpuflag(avx)
    ; fold the upper ymm lanes before the xmm reductions
    vextractf128 xm1, m0, 0x1
    vextractf128 xm3, m2, 0x1
    addps   xm0, xm1
    addps   xm2, xm3
%endif

    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
    cvtsi2ss xm1, fracd
    subps   xm2, xm0
    mulps   xm1, xm4                          ; xm1 = frac / src_incr
    shufps  xm1, xm1, q0000                   ; broadcast to all lanes
    mulps   xm2, xm1
    addps   xm0, xm2

    ; horizontal sum & store
    movhlps xm1, xm0
    addps   xm0, xm1
    shufps  xm1, xm0, xm0, q0001
    add     fracd, dst_incr_modd
    addps   xm0, xm1
    add     indexd, dst_incr_divd
    movss   [dstq], xm0

    ; carry: when frac overflows src_incr, bump index by one
    cmp     fracd, src_incrd
    jl .skip
    sub     fracd, src_incrd
    inc     indexd

%if UNIX64
    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, filter2, dst_end, filter_bank
%elif WIN64
    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
                src_incr, filter2, dst_end, filter_bank
%else ; x86-32
    DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
%endif

.skip:
%if ARCH_X86_32
    ; phase_shift lives on the stack here; reload it into a register
    mov     phase_shiftd, phase_shift_stackd
%endif
    ; src += (index >> phase_shift) samples; index keeps only the phase bits
    mov     index_incrd, indexd
    add     dstq, 4
    and     indexd, phase_mask_stackd
    sar     index_incrd, phase_shiftb
    lea     srcq, [srcq+index_incrq*4]
    cmp     dstq, dst_endq
    jne .loop

%if ARCH_X86_64
    DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
%else ; x86-32
    DEFINE_ARGS src, ctx, update_context, frac, index
%endif

    cmp     dword update_context_stackd, 0
    jz .skip_store
    ; strictly speaking, the function should always return the consumed
    ; number of bytes; however, we only use the value if update_context
    ; is true, so let's just leave it uninitialized otherwise
    mov     ctxq, ctx_stackq
    movifnidn rax, srcq
    mov     [ctxq+ResampleContext.frac ], fracd
    sub     rax, src_stackq
    mov     [ctxq+ResampleContext.index], indexd
    shr     rax, 2                            ; byte delta -> sample count

.skip_store:
%if ARCH_X86_32
    ADD     rsp, 0x28                         ; pop the 10 dwords pushed above
%endif
    RET
%endmacro
; Instantiate both functions for SSE (128-bit mmsize) and, when the
; toolchain supports it, AVX (256-bit mmsize).
INIT_XMM sse
RESAMPLE_FLOAT_FNS
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FLOAT_FNS
%endif