; rgb_2_rgb.asm
  1. ;******************************************************************************
  2. ;* Copyright Nick Kurshev
  3. ;* Copyright Michael (michaelni@gmx.at)
  4. ;* Copyright 2018 Jokyo Images
  5. ;* Copyright Ivo van Poorten
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Word mask for the MMX 2103 shuffle: eight words of 0x00FF.  Loaded into m6
; (keeps the low byte of each word); m7 is derived from it by psllq 8
; (keeps the high byte of each word).
pb_mask_shuffle2103_mmx times 8 dw 255

; pshufb control vectors: each db value is the source byte index for the
; corresponding destination byte, the 4-byte pattern repeated across the
; 16-byte lane (VBROADCASTI128 replicates it to both lanes for ymm use).
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION .text
; Copy %2 into %1 byte-shifted right by %3.
; AVX provides the non-destructive 3-operand psrldq; pre-AVX targets need an
; explicit register copy followed by an in-place shift (RSHIFT from x86util).
%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
psrldq %1, %2, %3
%else
mova %1, %2
RSHIFT %1, %3
%endif
%endmacro
;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
;
; Swap bytes 0 and 2 of every 4-byte group (e.g. RGBA <-> BGRA), MMXEXT
; version.  A scalar head loop handles the size remainder modulo 2*mmsize,
; then a pshufw+mask loop processes 16 bytes per iteration.
; Assumes src_size is a multiple of 4 -- TODO confirm with callers.
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
mova m6, [pb_mask_shuffle2103_mmx] ; m6 = 0x00FF per word (keeps even-offset bytes)
mova m7, m6
psllq m7, 8 ; m7 = 0xFF00 per word (keeps odd-offset bytes)
movsxdifnidn wq, wd ; sign-extend src_size where the ABI needs it
mov xq, wq
add srcq, wq ; point src/dst at their ends and index with a
add dstq, wq ; negative offset that counts up towards zero
neg wq
;calc scalar loop
and xq, mmsize*2 -4 ; head bytes not covered by a full 16-byte iteration
je .loop_simd
.loop_scalar:
mov tmpb, [srcq + wq + 2] ; dst[0] = src[2]
mov [dstq+wq + 0], tmpb
mov tmpb, [srcq + wq + 1] ; dst[1] = src[1]
mov [dstq+wq + 1], tmpb
mov tmpb, [srcq + wq + 0] ; dst[2] = src[0]
mov [dstq+wq + 2], tmpb
mov tmpb, [srcq + wq + 3] ; dst[3] = src[3]
mov [dstq+wq + 3], tmpb
add wq, 4
sub xq, 4
jg .loop_scalar
;check if src_size < mmsize * 2
cmp wq, 0
jge .end
.loop_simd:
movu m0, [srcq+wq]
movu m1, [srcq+wq+8]
pshufw m3, m0, 177 ; 177 = 0b10110001: swap adjacent words -> bytes b2 b3 b0 b1
pshufw m5, m1, 177
pand m0, m7 ; odd bytes (b1, b3) from the original ...
pand m3, m6 ; ... even bytes (b2, b0) from the word-swapped copy
pand m1, m7
pand m5, m6
por m0, m3 ; merge -> b2 b1 b0 b3 = 2103 order
por m1, m5
movu [dstq+wq], m0
movu [dstq+wq + 8], m1
add wq, mmsize*2
jl .loop_simd
.end:
emms ; clear MMX state so x87/FPU code can run afterwards
RET
;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-4 index shuffle
; Emits shuffle_bytes_%1%2%3%4: for every 4-byte group,
; dst[0..3] = src[%1], src[%2], src[%3], src[%4].
; Requires pshufb (SSSE3+).  Scalar head loop covers the remainder modulo
; mmsize; assumes src_size is a multiple of 4 -- TODO confirm with callers.
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
VBROADCASTI128 m0, [pb_shuffle%1%2%3%4] ; shuffle control, replicated per 128-bit lane
movsxdifnidn wq, wd
mov xq, wq
add srcq, wq ; point at buffer ends, iterate with negative wq
add dstq, wq
neg wq
;calc scalar loop
and xq, mmsize-4 ; head bytes not filling a whole vector
je .loop_simd
.loop_scalar:
mov tmpb, [srcq + wq + %1] ; dst[0] = src[%1]
mov [dstq+wq + 0], tmpb
mov tmpb, [srcq + wq + %2] ; dst[1] = src[%2]
mov [dstq+wq + 1], tmpb
mov tmpb, [srcq + wq + %3] ; dst[2] = src[%3]
mov [dstq+wq + 2], tmpb
mov tmpb, [srcq + wq + %4] ; dst[3] = src[%4]
mov [dstq+wq + 3], tmpb
add wq, 4
sub xq, 4
jg .loop_scalar
;check if src_size < mmsize
cmp wq, 0
jge .end
.loop_simd:
movu m1, [srcq+wq]
pshufb m1, m0 ; permute all mmsize bytes in one instruction
movu [dstq+wq], m1
add wq, mmsize
jl .loop_simd
.end:
RET
%endmacro
; Instantiate the five byte-shuffle kernels.  pshufb requires SSSE3, so that
; is the baseline; 256-bit AVX2 variants are added on x86-64 when the
; assembler supports AVX2 (HAVE_AVX2_EXTERNAL).
INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
%endif
%endif
;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
; const uint8_t *src, int width, int height,
; int lumStride, int chromStride, int srcStride)
;-----------------------------------------------------------------------------------------------
; De-interleave packed UYVY (U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...) into three planes:
; ydst receives width luma bytes per line, udst/vdst receive width/2 chroma
; bytes per line.  All pointers are advanced to line ends and indexed with
; negative offsets counting up to zero.
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
pxor m0, m0 ; NOTE(review): m0 appears unused in this macro -- confirm
pcmpeqw m1, m1
psrlw m1, 8 ; m1 = 0x00FF per word: keeps even-offset bytes of a pair
movsxdifnidn wq, wd
movsxdifnidn lum_strideq, lum_strided
movsxdifnidn chrom_strideq, chrom_strided
movsxdifnidn src_strideq, src_strided
mov back_wq, wq ; save width; restored after each line
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2 (chroma samples per line)
lea srcq, [srcq + wq * 2] ; src line is width*2 bytes (4 bytes / 2 pixels)
add ydstq, wq
add udstq, whalfq
add vdstq, whalfq
.loop_line:
mov xq, wq
mov wtwoq, wq
add wtwoq, wtwoq ; wtwo = width * 2
neg wq
neg wtwoq
neg whalfq
;calc scalar loop count
and xq, mmsize * 2 - 1 ; luma samples not covered by a full SIMD iteration
je .loop_simd
.loop_scalar:
mov tmpb, [srcq + wtwoq + 0] ; U
mov [udstq + whalfq], tmpb
mov tmpb, [srcq + wtwoq + 1] ; Y0
mov [ydstq + wq], tmpb
mov tmpb, [srcq + wtwoq + 2] ; V
mov [vdstq + whalfq], tmpb
mov tmpb, [srcq + wtwoq + 3] ; Y1
mov [ydstq + wq + 1], tmpb
add wq, 2
add wtwoq, 4
add whalfq, 1
sub xq, 2
jg .loop_scalar
; check if simd loop is need
cmp wq, 0
jge .end_line
.loop_simd:
; each iteration consumes mmsize*4 source bytes and produces mmsize*2 luma
; bytes plus mmsize bytes each of U and V
movu m2, [srcq + wtwoq ]
movu m3, [srcq + wtwoq + mmsize ]
movu m4, [srcq + wtwoq + mmsize * 2]
movu m5, [srcq + wtwoq + mmsize * 3]
; extract y part 1
RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY... (values already <= 255, no saturation)
movu [ydstq + wq], m6
; extract y part 2
RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY...
movu [ydstq + wq + mmsize], m6
; extract uv
pand m2, m1 ; UxVx...
pand m3, m1 ; UxVx...
pand m4, m1 ; UxVx...
pand m5, m1 ; UxVx...
packuswb m2, m3 ; UVUV...
packuswb m4, m5 ; UVUV...
; U
pand m6, m2, m1 ; UxUx... (3-operand form; x86inc emulates it pre-AVX)
pand m7, m4, m1 ; UxUx...
packuswb m6, m7 ; UUUU
movu [udstq + whalfq], m6
; V
psrlw m2, 8 ; VxVx...
psrlw m4, 8 ; VxVx...
packuswb m2, m4 ; VVVV
movu [vdstq + whalfq], m2
add whalfq, mmsize
add wtwoq, mmsize * 4
add wq, mmsize * 2
jl .loop_simd
.end_line:
add srcq, src_strideq
add ydstq, lum_strideq
add udstq, chrom_strideq
add vdstq, chrom_strideq
;restore initial state of line variable
mov wq, back_wq
mov xq, wq ; NOTE(review): redundant, .loop_line reloads xq from wq
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2
sub hd, 1
jg .loop_line
RET
%endmacro
; uyvytoyuv422 uses 14 GPRs in its cglobal declaration, so it is only
; available on x86-64; SSE2 baseline plus an AVX build (gains the true
; 3-operand forms via the v-prefixed instructions).
%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422
INIT_XMM avx
UYVY_TO_YUV422
%endif