;******************************************************************************
;* Copyright Nick Kurshev
;* Copyright Michael (michaelni@gmx.at)
;* Copyright 2018 Jokyo Images
;* Copyright Ivo van Poorten
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
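; pb_mask_shuffle2103_mmx selects the low byte of each word (0x00ff) for the
; MMX path below; each pb_shuffleNNNN row is a pshufb pattern that reorders
; the four bytes of every dword according to the digits in its name.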
pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION .text
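
; RSHIFT_COPY copies %2 into %1 shifted right by %3 bytes. On AVX the
; three-operand psrldq does the copy and shift in one instruction; pre-AVX
; the destructive form needs an explicit mova first.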
%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
    psrldq %1, %2, %3
%else
    mova   %1, %2
    RSHIFT %1, %3
%endif
%endmacro
;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
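; MMX lacks pshufb, so the 2103 swap is built from pshufw 177 (0xb1), which
; swaps the two words of every dword (bytes 0123 -> 2301), plus byte masks:
; the odd bytes (1, 3) are kept from the original and the even slots are
; filled from the word-swapped copy, giving 2, 1, 0, 3.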
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    mova m6, [pb_mask_shuffle2103_mmx] ; m6 = 0x00ff per word (even bytes)
    mova m7, m6
    psllq m7, 8                        ; m7 = 0xff00 per word (odd bytes)
    movsxdifnidn wq, wd
    mov xq, wq

    add srcq, wq
    add dstq, wq
    neg wq
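; src and dst point one past the end and wq counts up from -w to 0, so the
; loops need no separate end-of-buffer comparison.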
;calc scalar loop count (the code assumes src_size is a multiple of 4,
;making this w mod mmsize*2)
    and xq, mmsize*2 - 4
    je .loop_simd
.loop_scalar:
    mov tmpb, [srcq + wq + 2]
    mov [dstq+wq + 0], tmpb

    mov tmpb, [srcq + wq + 1]
    mov [dstq+wq + 1], tmpb

    mov tmpb, [srcq + wq + 0]
    mov [dstq+wq + 2], tmpb

    mov tmpb, [srcq + wq + 3]
    mov [dstq+wq + 3], tmpb

    add wq, 4
    sub xq, 4
    jg .loop_scalar
; skip the SIMD loop if the scalar loop already consumed everything
; (src_size < mmsize * 2)
    cmp wq, 0
    jge .end
.loop_simd:
    movu m0, [srcq+wq]
    movu m1, [srcq+wq+8]

    pshufw m3, m0, 177
    pshufw m5, m1, 177

    pand m0, m7
    pand m3, m6

    pand m1, m7
    pand m5, m6

    por m0, m3
    por m1, m5

    movu [dstq+wq], m0
    movu [dstq+wq + 8], m1

    add wq, mmsize*2
    jl .loop_simd

.end:
    RET
;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-4 index shuffle
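; Each instantiation emits one function that reorders every 4-byte pixel with
; a single pshufb against the matching pb_shuffle pattern (VBROADCASTI128
; replicates it into both lanes of a ymm register for the AVX2 build); the
; leading scalar loop takes care of the w mod mmsize remainder.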
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
    VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
    movsxdifnidn wq, wd
    mov xq, wq

    add srcq, wq
    add dstq, wq
    neg wq

;calc scalar loop count
    and xq, mmsize - 4
    je .loop_simd

.loop_scalar:
    mov tmpb, [srcq + wq + %1]
    mov [dstq+wq + 0], tmpb

    mov tmpb, [srcq + wq + %2]
    mov [dstq+wq + 1], tmpb

    mov tmpb, [srcq + wq + %3]
    mov [dstq+wq + 2], tmpb

    mov tmpb, [srcq + wq + %4]
    mov [dstq+wq + 3], tmpb

    add wq, 4
    sub xq, 4
    jg .loop_scalar

; skip the SIMD loop if the scalar loop already consumed everything
; (src_size < mmsize)
    cmp wq, 0
    jge .end

.loop_simd:
    movu m1, [srcq+wq]
    pshufb m1, m0
    movu [dstq+wq], m1
    add wq, mmsize
    jl .loop_simd

.end:
    RET
%endmacro
INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
%endif
%endif
;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;              const uint8_t *src, int width, int height,
;              int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
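; Deinterleaves packed UYVY into planar Y, U and V. m1 is a 0x00ff word mask:
; ANDing a source block keeps the chroma bytes (even positions), and shifting
; right by one byte first keeps the luma bytes (odd positions) instead. Each
; SIMD iteration consumes mmsize * 4 source bytes and produces mmsize * 2 Y,
; mmsize U and mmsize V samples.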
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
    pxor m0, m0
    pcmpeqw m1, m1
    psrlw m1, 8

    movsxdifnidn            wq, wd
    movsxdifnidn   lum_strideq, lum_strided
    movsxdifnidn chrom_strideq, chrom_strided
    movsxdifnidn   src_strideq, src_strided

    mov back_wq, wq
    mov whalfq, wq
    shr whalfq, 1 ; whalf = width / 2

    lea srcq, [srcq + wq * 2]
    add ydstq, wq
    add udstq, whalfq
    add vdstq, whalfq

.loop_line:
    mov xq, wq
    mov wtwoq, wq
    add wtwoq, wtwoq ; wtwo = width * 2

    neg wq
    neg wtwoq
    neg whalfq

;calc scalar loop count
    and xq, mmsize * 2 - 1
    je .loop_simd
.loop_scalar:
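    ; one UYVY quad = two pixels: U -> udst, Y0 -> ydst, V -> vdst, Y1 -> ydst + 1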
    mov tmpb, [srcq + wtwoq + 0]
    mov [udstq + whalfq], tmpb

    mov tmpb, [srcq + wtwoq + 1]
    mov [ydstq + wq], tmpb

    mov tmpb, [srcq + wtwoq + 2]
    mov [vdstq + whalfq], tmpb

    mov tmpb, [srcq + wtwoq + 3]
    mov [ydstq + wq + 1], tmpb

    add wq, 2
    add wtwoq, 4
    add whalfq, 1
    sub xq, 2
    jg .loop_scalar
; skip the SIMD loop if the scalar loop already consumed the whole line
    cmp wq, 0
    jge .end_line
.loop_simd:
    movu m2, [srcq + wtwoq             ]
    movu m3, [srcq + wtwoq + mmsize    ]
    movu m4, [srcq + wtwoq + mmsize * 2]
    movu m5, [srcq + wtwoq + mmsize * 3]

    ; extract y part 1
    RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m6, m1    ; YxYx YxYx...

    RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m7, m1    ; YxYx YxYx...

    packuswb    m6, m7    ; YYYY YYYY...
    movu [ydstq + wq], m6

    ; extract y part 2
    RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m6, m1    ; YxYx YxYx...

    RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
    pand        m7, m1    ; YxYx YxYx...

    packuswb    m6, m7    ; YYYY YYYY...
    movu [ydstq + wq + mmsize], m6

    ; extract uv
    pand m2, m1 ; UxVx...
    pand m3, m1 ; UxVx...
    pand m4, m1 ; UxVx...
    pand m5, m1 ; UxVx...

    packuswb m2, m3 ; UVUV...
    packuswb m4, m5 ; UVUV...

    ; U
    pand m6, m2, m1 ; UxUx...
    pand m7, m4, m1 ; UxUx...
    packuswb m6, m7 ; UUUU
    movu [udstq + whalfq], m6

    ; V
    psrlw m2, 8 ; VxVx...
    psrlw m4, 8 ; VxVx...
    packuswb m2, m4 ; VVVV
    movu [vdstq + whalfq], m2

    add whalfq, mmsize
    add wtwoq, mmsize * 4
    add wq, mmsize * 2
    jl .loop_simd
.end_line:
    add srcq, src_strideq
    add ydstq, lum_strideq
    add udstq, chrom_strideq
    add vdstq, chrom_strideq

; restore the per-line loop variables for the next row
    mov wq, back_wq
    mov xq, wq
    mov whalfq, wq
    shr whalfq, 1 ; whalf = width / 2

    sub hd, 1
    jg .loop_line
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422
INIT_XMM avx
UYVY_TO_YUV422
%endif
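
; Note: cglobal applies FFmpeg's symbol prefix, so from C these are called as
; e.g. ff_shuffle_bytes_2103_ssse3(src, dst, src_size) or
; ff_uyvytoyuv422_sse2(ydst, udst, vdst, src, w, h, lumStride, chromStride,
; srcStride); runtime dispatch into swscale's function pointers is handled by
; the accompanying C-side init code.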