; rgb_2_rgb.asm
  1. ;******************************************************************************
  2. ;* Copyright Nick Kurshev
  3. ;* Copyright Michael (michaelni@gmx.at)
  4. ;* Copyright 2018 Jokyo Images
  5. ;* Copyright Ivo van Poorten
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Word mask 0x00FF repeated (16 bytes emitted; the MMX routine loads only the
; first 8). Selects the even byte of each 16-bit lane.
pb_mask_shuffle2103_mmx times 8 dw 255
; pshufb control vectors: each byte is the SOURCE index for that destination
; byte, with the 4-byte permutation repeated across all four dwords of an
; XMM register (VBROADCASTI128 replicates it for YMM if ever needed).
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION .text
;------------------------------------------------------------------------------
; RSHIFT_COPY dst, src, shift
; dst = src byte-shifted right by 'shift' bytes, leaving src intact.
; On AVX, psrldq is 3-operand (non-destructive), so the copy is implicit;
; pre-AVX we must copy first and then shift in place (RSHIFT is the
; x86util wrapper around psrldq's 2-operand form).
;------------------------------------------------------------------------------
%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
    psrldq  %1, %2, %3
%else
    mova    %1, %2
    RSHIFT  %1, %3
%endif
%endmacro
;------------------------------------------------------------------------------
; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
; Swap bytes 0 and 2 of every 4-byte group (e.g. BGRA <-> RGBA).
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
    mova    m6, [pb_mask_shuffle2103_mmx]   ; m6 = 0x00FF per word (keep even byte)
    mova    m7, m6
    psllq   m7, 8                           ; m7 = 0xFF00 per word (keep odd byte)
    movsxdifnidn wq, wd
    mov     xq, wq
    ; Point both buffers at their ends and walk a negative offset up to zero.
    add     srcq, wq
    add     dstq, wq
    neg     wq
    ;calc scalar loop: leading bytes that don't fill two MMX registers
    and     xq, mmsize*2 -4
    je .loop_simd
.loop_scalar:
    ; One 4-byte group: dst[0..3] = src[2], src[1], src[0], src[3]
    mov     tmpb, [srcq + wq + 2]
    mov     [dstq+wq + 0], tmpb
    mov     tmpb, [srcq + wq + 1]
    mov     [dstq+wq + 1], tmpb
    mov     tmpb, [srcq + wq + 0]
    mov     [dstq+wq + 2], tmpb
    mov     tmpb, [srcq + wq + 3]
    mov     [dstq+wq + 3], tmpb
    add     wq, 4
    sub     xq, 4
    jg .loop_scalar

    ;check if src_size < mmsize * 2 (scalar loop may already have consumed all)
    cmp     wq, 0
    jge .end
.loop_simd:
    movu    m0, [srcq+wq]
    movu    m1, [srcq+wq+8]
    ; 177 = 0b10110001: word order 1,0,3,2 — swaps adjacent words, i.e. the
    ; byte order per dword becomes 2,3,0,1.
    pshufw  m3, m0, 177
    pshufw  m5, m1, 177
    pand    m0, m7              ; original: keep bytes 1 and 3 in place
    pand    m3, m6              ; swapped:  keep src bytes 2 and 0 at pos 0 and 2
    pand    m1, m7
    pand    m5, m6
    por     m0, m3              ; merge -> 2,1,0,3 per 4-byte group
    por     m1, m5
    movu    [dstq+wq], m0
    movu    [dstq+wq + 8], m1
    add     wq, mmsize*2
    jl .loop_simd
.end:
    RET
;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
; Permute every 4-byte group of src by the fixed pattern %1%2%3%4 via pshufb.
;------------------------------------------------------------------------------
; %1-4 index shuffle
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
    VBROADCASTI128 m0, [pb_shuffle%1%2%3%4] ; pshufb control, replicated per lane
    movsxdifnidn wq, wd
    mov     xq, wq
    ; Index both buffers from their ends with a negative counter.
    add     srcq, wq
    add     dstq, wq
    neg     wq
    ;calc scalar loop: leading bytes that don't fill one vector register
    and     xq, mmsize-4
    je .loop_simd
.loop_scalar:
    ; One 4-byte group: dst[i] = src[%i] for the macro's four indices.
    mov     tmpb, [srcq + wq + %1]
    mov     [dstq+wq + 0], tmpb
    mov     tmpb, [srcq + wq + %2]
    mov     [dstq+wq + 1], tmpb
    mov     tmpb, [srcq + wq + %3]
    mov     [dstq+wq + 2], tmpb
    mov     tmpb, [srcq + wq + %4]
    mov     [dstq+wq + 3], tmpb
    add     wq, 4
    sub     xq, 4
    jg .loop_scalar

    ;check if src_size < mmsize (scalar loop may already have consumed all)
    cmp     wq, 0
    jge .end
.loop_simd:
    movu    m1, [srcq+wq]
    pshufb  m1, m0              ; permute all 4-byte groups at once
    movu    [dstq+wq], m1
    add     wq, mmsize
    jl .loop_simd
.end:
    RET
%endmacro

INIT_XMM ssse3
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
;              const uint8_t *src, int width, int height,
;              int lumStride, int chromStride, int srcStride)
;
; Deinterleave packed UYVY (U0 Y0 V0 Y1 ...) into three planes: full-width Y
; and half-width U and V (4:2:2 chroma). Needs 14 GPRs, hence x86-64 only
; (see the ARCH_X86_64 guard around the instantiations below).
;-----------------------------------------------------------------------------------------------
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
    pxor    m0, m0              ; NOTE(review): m0 is not referenced below — confirm if removable
    pcmpeqw m1, m1
    psrlw   m1, 8               ; m1 = 0x00FF per word: keeps the low byte of each pair
    movsxdifnidn wq, wd
    movsxdifnidn lum_strideq, lum_strided
    movsxdifnidn chrom_strideq, chrom_strided
    movsxdifnidn src_strideq, src_strided
    mov     back_wq, wq         ; saved width, restored at the end of each line
    mov     whalfq, wq
    shr     whalfq, 1           ; whalf = width / 2 (chroma samples per line)
    ; Point every buffer at its end; loops run a negative offset up to zero.
    lea     srcq, [srcq + wq * 2]
    add     ydstq, wq
    add     udstq, whalfq
    add     vdstq, whalfq

.loop_line:
    mov     xq, wq
    mov     wtwoq, wq
    add     wtwoq, wtwoq        ; wtwo = width * 2 (source bytes per line)
    neg     wq
    neg     wtwoq
    neg     whalfq
    ;calc scalar loop count: leading pixels not a multiple of 2*mmsize
    and     xq, mmsize * 2 - 1
    je .loop_simd

.loop_scalar:
    ; One UYVY quad -> U, Y0, V, Y1
    mov     tmpb, [srcq + wtwoq + 0]
    mov     [udstq + whalfq], tmpb
    mov     tmpb, [srcq + wtwoq + 1]
    mov     [ydstq + wq], tmpb
    mov     tmpb, [srcq + wtwoq + 2]
    mov     [vdstq + whalfq], tmpb
    mov     tmpb, [srcq + wtwoq + 3]
    mov     [ydstq + wq + 1], tmpb
    add     wq, 2
    add     wtwoq, 4
    add     whalfq, 1
    sub     xq, 2
    jg .loop_scalar

    ; check if simd loop is need (scalar loop may already have consumed the line)
    cmp     wq, 0
    jge .end_line

.loop_simd:
    movu    m2, [srcq + wtwoq             ]
    movu    m3, [srcq + wtwoq + mmsize    ]
    movu    m4, [srcq + wtwoq + mmsize * 2]
    movu    m5, [srcq + wtwoq + mmsize * 3]
    ; extract y part 1: shift the Y bytes into the even positions, mask, pack
    RSHIFT_COPY m6, m2, 1       ; UYVY UYVY -> YVYU YVY...
    pand    m6, m1              ; YxYx YxYx...
    RSHIFT_COPY m7, m3, 1       ; UYVY UYVY -> YVYU YVY...
    pand    m7, m1              ; YxYx YxYx...
    packuswb m6, m7             ; YYYY YYYY...
    movu    [ydstq + wq], m6
    ; extract y part 2
    RSHIFT_COPY m6, m4, 1       ; UYVY UYVY -> YVYU YVY...
    pand    m6, m1              ; YxYx YxYx...
    RSHIFT_COPY m7, m5, 1       ; UYVY UYVY -> YVYU YVY...
    pand    m7, m1              ; YxYx YxYx...
    packuswb m6, m7             ; YYYY YYYY...
    movu    [ydstq + wq + mmsize], m6
    ; extract uv: even bytes of the source are the chroma samples
    pand    m2, m1              ; UxVx...
    pand    m3, m1              ; UxVx...
    pand    m4, m1              ; UxVx...
    pand    m5, m1              ; UxVx...
    packuswb m2, m3             ; UVUV...
    packuswb m4, m5             ; UVUV...
    ; U (3-operand pand: native on AVX, emulated as mova+pand by x86inc on SSE2)
    pand    m6, m2, m1          ; UxUx...
    pand    m7, m4, m1          ; UxUx...
    packuswb m6, m7             ; UUUU
    movu    [udstq + whalfq], m6
    ; V
    psrlw   m2, 8               ; VxVx...
    psrlw   m4, 8               ; VxVx...
    packuswb m2, m4             ; VVVV
    movu    [vdstq + whalfq], m2
    add     whalfq, mmsize
    add     wtwoq, mmsize * 4
    add     wq, mmsize * 2
    jl .loop_simd

.end_line:
    ; advance all pointers to the next line
    add     srcq, src_strideq
    add     ydstq, lum_strideq
    add     udstq, chrom_strideq
    add     vdstq, chrom_strideq
    ;restore initial state of line variable
    mov     wq, back_wq
    mov     xq, wq
    mov     whalfq, wq
    shr     whalfq, 1           ; whalf = width / 2
    sub     hd, 1
    jg .loop_line
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
UYVY_TO_YUV422
INIT_XMM avx
UYVY_TO_YUV422
%endif