/*
 * Copyright (C) 2016 Dan Parrot <dan.parrot@mail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/pixdesc.h"
#include "libavutil/avassert.h"
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"

#if HAVE_VSX
  36. static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
  37. int width, uint32_t *unused)
  38. {
  39. int16_t *dst = (int16_t *)_dst;
  40. int i, width_adj, frag_len;
  41. uintptr_t src_addr = (uintptr_t)src;
  42. uintptr_t dst_addr = (uintptr_t)dst;
  43. // compute integral number of vector-length items and length of final fragment
  44. width_adj = width >> 3;
  45. width_adj = width_adj << 3;
  46. frag_len = width - width_adj;
  47. for ( i = 0; i < width_adj; i += 8) {
  48. vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr);
  49. vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
  50. v_rd0 = vec_and(v_rd0, vec_splats(0x0ff));
  51. v_rd1 = vec_and(v_rd1, vec_splats(0x0ff));
  52. v_rd0 = vec_sl(v_rd0, vec_splats((unsigned)6));
  53. v_rd1 = vec_sl(v_rd1, vec_splats((unsigned)6));
  54. vector int v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
  55. {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
  56. vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
  57. src_addr += 32;
  58. dst_addr += 16;
  59. }
  60. for (i=width_adj; i< width_adj + frag_len; i++) {
  61. dst[i]= src[4*i]<<6;
  62. }
  63. }
  64. static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
  65. int width, uint32_t *unused)
  66. {
  67. int16_t *dst = (int16_t *)_dst;
  68. int i, width_adj, frag_len;
  69. uintptr_t src_addr = (uintptr_t)src;
  70. uintptr_t dst_addr = (uintptr_t)dst;
  71. // compute integral number of vector-length items and length of final fragment
  72. width_adj = width >> 3;
  73. width_adj = width_adj << 3;
  74. frag_len = width - width_adj;
  75. for ( i = 0; i < width_adj; i += 8) {
  76. vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr);
  77. vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
  78. v_rd0 = vec_sld(v_rd0, v_rd0, 13);
  79. v_rd1 = vec_sld(v_rd1, v_rd1, 13);
  80. v_rd0 = vec_and(v_rd0, vec_splats(0x0ff));
  81. v_rd1 = vec_and(v_rd1, vec_splats(0x0ff));
  82. v_rd0 = vec_sl(v_rd0, vec_splats((unsigned)6));
  83. v_rd1 = vec_sl(v_rd1, vec_splats((unsigned)6));
  84. vector int v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
  85. {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
  86. vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
  87. src_addr += 32;
  88. dst_addr += 16;
  89. }
  90. for (i=width_adj; i< width_adj + frag_len; i++) {
  91. dst[i]= src[4*i+3]<<6;
  92. }
  93. }
  94. static void yuy2ToY_c_vsx(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
  95. uint32_t *unused)
  96. {
  97. int i, width_adj, frag_len;
  98. uintptr_t src_addr = (uintptr_t)src;
  99. uintptr_t dst_addr = (uintptr_t)dst;
  100. // compute integral number of vector-length items and length of final fragment
  101. width_adj = width >> 4;
  102. width_adj = width_adj << 4;
  103. frag_len = width - width_adj;
  104. for ( i = 0; i < width_adj; i += 16) {
  105. vector unsigned char v_rd0 = vec_vsx_ld(0, (unsigned char *)src_addr);
  106. vector unsigned char v_rd1 = vec_vsx_ld(0, (unsigned char *)(src_addr + 16));
  107. vector unsigned char v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
  108. {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}));
  109. vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
  110. src_addr += 32;
  111. dst_addr += 16;
  112. }
  113. for (i=width_adj; i< width_adj + frag_len; i++) {
  114. dst[i] = src[2 * i];
  115. }
  116. }
static void yuy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                           const uint8_t *src2, int width, uint32_t *unused)
{
    /* De-interleave the chroma of packed YUYV (Y U Y V) into planar U and V:
     * each vector iteration consumes 64 input bytes (16 U and 16 V samples).
     * U lives at byte offset 1 and V at offset 3 of each 4-byte group. */
    int i, width_adj, frag_len;
    uintptr_t src1_addr = (uintptr_t)src1;
    uintptr_t dstu_addr = (uintptr_t)dstU;
    uintptr_t dstv_addr = (uintptr_t)dstV;

    // compute integral number of vector-length items and length of final fragment
    width_adj = width >> 4;
    width_adj = width_adj << 4;
    frag_len = width - width_adj;

    for ( i = 0; i < width_adj; i += 16) {
        /* Four 16-byte loads cover the 64 input bytes of this iteration. */
        vector unsigned char v_src1_0 = vec_vsx_ld(0, (unsigned char *)src1_addr);
        vector unsigned char v_src1_1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16));
        vector unsigned char v_src1_2 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 32));
        vector unsigned char v_src1_3 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 48));

        /* Step 1: gather the 8 U bytes (offsets 1,5,...,29) of the first two
         * loads into the low half; the high half holds placeholder bytes that
         * the following perms overwrite. */
        vector unsigned char v_dstu = vec_perm(v_src1_0, v_src1_1,
                                      ((vector unsigned char)
                                      {1, 5, 9, 13, 17, 21, 25, 29, 1, 5, 9, 13, 17, 21, 25, 29}));
        /* Same for V (offsets 3,7,...,31). */
        vector unsigned char v_dstv = vec_perm(v_src1_0, v_src1_1,
                                      ((vector unsigned char)
                                      {3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29}));

        /* Step 2: keep the 8 gathered bytes (indices 0..7) and append the 4
         * U bytes of the third load (indices 16+n address v_src1_2). */
        v_dstu = vec_perm(v_dstu, v_src1_2,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 17, 21, 25, 29, 17, 21, 25, 29}));
        v_dstv = vec_perm(v_dstv, v_src1_2,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 19, 23, 27, 31, 17, 21, 25, 29}));

        /* Step 3: keep 12 bytes and append the 4 chroma bytes of the fourth
         * load, completing 16 planar U and 16 planar V samples. */
        v_dstu = vec_perm(v_dstu, v_src1_3,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 21, 25, 29}));
        v_dstv = vec_perm(v_dstv, v_src1_3,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 19, 23, 27, 31}));

        vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr);
        vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr);

        src1_addr += 64;
        dstu_addr += 16;
        dstv_addr += 16;
    }
    /* Scalar tail: U at offset 1, V at offset 3 of each 4-byte group. */
    for (i=width_adj; i< width_adj + frag_len; i++) {
        dstU[i] = src1[4 * i + 1];
        dstV[i] = src1[4 * i + 3];
    }
    /* Interleaved YUYV carries both chroma components in the same plane. */
    av_assert1(src1 == src2);
}
static void yvy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                           const uint8_t *src2, int width, uint32_t *unused)
{
    /* De-interleave the chroma of packed YVYU (Y V Y U) into planar U and V.
     * Identical structure to yuy2ToUV_c_vsx with U and V swapped: V lives at
     * byte offset 1 and U at offset 3 of each 4-byte group. */
    int i, width_adj, frag_len;
    uintptr_t src1_addr = (uintptr_t)src1;
    uintptr_t dstu_addr = (uintptr_t)dstU;
    uintptr_t dstv_addr = (uintptr_t)dstV;

    // compute integral number of vector-length items and length of final fragment
    width_adj = width >> 4;
    width_adj = width_adj << 4;
    frag_len = width - width_adj;

    for ( i = 0; i < width_adj; i += 16) {
        /* Four 16-byte loads cover the 64 input bytes of this iteration. */
        vector unsigned char v_src1_0 = vec_vsx_ld(0, (unsigned char *)src1_addr);
        vector unsigned char v_src1_1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16));
        vector unsigned char v_src1_2 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 32));
        vector unsigned char v_src1_3 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 48));

        /* Step 1: gather the 8 V bytes (offsets 1,5,...,29) of the first two
         * loads into the low half; high-half bytes are placeholders that the
         * following perms overwrite. */
        vector unsigned char v_dstv = vec_perm(v_src1_0, v_src1_1,
                                      ((vector unsigned char)
                                      {1, 5, 9, 13, 17, 21, 25, 29, 1, 5, 9, 13, 17, 21, 25, 29}));
        /* Same for U (offsets 3,7,...,31). */
        vector unsigned char v_dstu = vec_perm(v_src1_0, v_src1_1,
                                      ((vector unsigned char)
                                      {3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29}));

        /* Step 2: keep the 8 gathered bytes and append the 4 chroma bytes of
         * the third load (indices 16+n address v_src1_2). */
        v_dstv = vec_perm(v_dstv, v_src1_2,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 17, 21, 25, 29, 17, 21, 25, 29}));
        v_dstu = vec_perm(v_dstu, v_src1_2,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 19, 23, 27, 31, 17, 21, 25, 29}));

        /* Step 3: keep 12 bytes and append the 4 chroma bytes of the fourth
         * load, completing 16 planar U and 16 planar V samples. */
        v_dstv = vec_perm(v_dstv, v_src1_3,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 21, 25, 29}));
        v_dstu = vec_perm(v_dstu, v_src1_3,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 19, 23, 27, 31}));

        vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr);
        vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr);

        src1_addr += 64;
        dstu_addr += 16;
        dstv_addr += 16;
    }
    /* Scalar tail: V at offset 1, U at offset 3 of each 4-byte group. */
    for (i=width_adj; i< width_adj + frag_len; i++) {
        dstV[i] = src1[4 * i + 1];
        dstU[i] = src1[4 * i + 3];
    }
    /* Interleaved YVYU carries both chroma components in the same plane. */
    av_assert1(src1 == src2);
}
  201. static void uyvyToY_c_vsx(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
  202. uint32_t *unused)
  203. {
  204. int i, width_adj, frag_len;
  205. uintptr_t src_addr = (uintptr_t)src;
  206. uintptr_t dst_addr = (uintptr_t)dst;
  207. // compute integral number of vector-length items and length of final fragment
  208. width_adj = width >> 4;
  209. width_adj = width_adj << 4;
  210. frag_len = width - width_adj;
  211. for ( i = 0; i < width_adj; i += 16) {
  212. vector unsigned char v_rd0 = vec_vsx_ld(0, (unsigned char *)src_addr);
  213. vector unsigned char v_rd1 = vec_vsx_ld(0, (unsigned char *)(src_addr + 16));
  214. vector unsigned char v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
  215. {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}));
  216. vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
  217. src_addr += 32;
  218. dst_addr += 16;
  219. }
  220. for (i=width_adj; i< width_adj + frag_len; i++) {
  221. dst[i] = src[2 * i + 1];
  222. }
  223. }
static void uyvyToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                           const uint8_t *src2, int width, uint32_t *unused)
{
    /* De-interleave the chroma of packed UYVY (U Y V Y) into planar U and V:
     * each vector iteration consumes 64 input bytes (16 U and 16 V samples).
     * U lives at byte offset 0 and V at offset 2 of each 4-byte group. */
    int i, width_adj, frag_len;
    uintptr_t src1_addr = (uintptr_t)src1;
    uintptr_t dstu_addr = (uintptr_t)dstU;
    uintptr_t dstv_addr = (uintptr_t)dstV;

    // compute integral number of vector-length items and length of final fragment
    width_adj = width >> 4;
    width_adj = width_adj << 4;
    frag_len = width - width_adj;

    for ( i = 0; i < width_adj; i += 16) {
        /* Four 16-byte loads cover the 64 input bytes of this iteration. */
        vector unsigned char v_src1_0 = vec_vsx_ld(0, (unsigned char *)src1_addr);
        vector unsigned char v_src1_1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16));
        vector unsigned char v_src1_2 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 32));
        vector unsigned char v_src1_3 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 48));

        /* Step 1: gather the 8 U bytes (offsets 0,4,...,28) of the first two
         * loads into the low half; high-half bytes are placeholders that the
         * following perms overwrite. */
        vector unsigned char v_dstu = vec_perm(v_src1_0, v_src1_1,
                                      ((vector unsigned char)
                                      {0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29}));
        /* Same for V (offsets 2,6,...,30). */
        vector unsigned char v_dstv = vec_perm(v_src1_0, v_src1_1,
                                      ((vector unsigned char)
                                      {2, 6, 10, 14, 18, 22, 26, 30, 1, 5, 9, 13, 17, 21, 25, 29}));

        /* Step 2: keep the 8 gathered bytes and append the 4 chroma bytes of
         * the third load (indices 16+n address v_src1_2). */
        v_dstu = vec_perm(v_dstu, v_src1_2,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 28, 17, 21, 25, 29}));
        v_dstv = vec_perm(v_dstv, v_src1_2,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 18, 22, 26, 30, 17, 21, 25, 29}));

        /* Step 3: keep 12 bytes and append the 4 chroma bytes of the fourth
         * load, completing 16 planar U and 16 planar V samples. */
        v_dstu = vec_perm(v_dstu, v_src1_3,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 20, 24, 28}));
        v_dstv = vec_perm(v_dstv, v_src1_3,((vector unsigned char)
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 22, 26, 30}));

        vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr);
        vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr);

        src1_addr += 64;
        dstu_addr += 16;
        dstv_addr += 16;
    }
    /* Scalar tail: U at offset 0, V at offset 2 of each 4-byte group. */
    for (i=width_adj; i< width_adj + frag_len; i++) {
        dstU[i] = src1[4 * i + 0];
        dstV[i] = src1[4 * i + 2];
    }
    /* Interleaved UYVY carries both chroma components in the same plane. */
    av_assert1(src1 == src2);
}
  266. static av_always_inline void nvXXtoUV_c_vsx(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int width)
  267. {
  268. int i, width_adj, frag_len;
  269. uintptr_t src_addr = (uintptr_t)src;
  270. uintptr_t dst1_addr = (uintptr_t)dst1;
  271. uintptr_t dst2_addr = (uintptr_t)dst2;
  272. // compute integral number of vector-length items and length of final fragment
  273. width_adj = width >> 4;
  274. width_adj = width_adj << 4;
  275. frag_len = width - width_adj;
  276. for ( i = 0; i < width_adj; i += 16) {
  277. vector unsigned char v_rd0 = vec_vsx_ld(0, (unsigned char *)src_addr);
  278. vector unsigned char v_rd1 = vec_vsx_ld(0, (unsigned char *)(src_addr + 16));
  279. vector unsigned char v_dst1 = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
  280. {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}));
  281. vector unsigned char v_dst2 = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
  282. {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}));
  283. vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst1_addr);
  284. vec_vsx_st((vector unsigned char)v_dst2, 0, (unsigned char *)dst2_addr);
  285. src_addr += 32;
  286. dst1_addr += 16;
  287. dst2_addr += 16;
  288. }
  289. for (i=width_adj; i< width_adj + frag_len; i++) {
  290. dst1[i] = src[2 * i + 0];
  291. dst2[i] = src[2 * i + 1];
  292. }
  293. }
static void nv12ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *unused)
{
    /* NV12 interleaves chroma as U,V: even bytes are U, odd bytes are V. */
    nvXXtoUV_c_vsx(dstU, dstV, src1, width);
}
static void nv21ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *unused)
{
    /* NV21 interleaves chroma as V,U — swap the destinations accordingly. */
    nvXXtoUV_c_vsx(dstV, dstU, src1, width);
}
#endif /* HAVE_VSX */
  307. av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c)
  308. {
  309. #if HAVE_VSX
  310. enum AVPixelFormat srcFormat = c->srcFormat;
  311. switch (srcFormat) {
  312. case AV_PIX_FMT_YUYV422:
  313. c->chrToYV12 = yuy2ToUV_c_vsx;
  314. break;
  315. case AV_PIX_FMT_YVYU422:
  316. c->chrToYV12 = yvy2ToUV_c_vsx;
  317. break;
  318. case AV_PIX_FMT_UYVY422:
  319. c->chrToYV12 = uyvyToUV_c_vsx;
  320. break;
  321. case AV_PIX_FMT_NV12:
  322. c->chrToYV12 = nv12ToUV_c_vsx;
  323. break;
  324. case AV_PIX_FMT_NV21:
  325. c->chrToYV12 = nv21ToUV_c_vsx;
  326. break;
  327. }
  328. switch (srcFormat) {
  329. case AV_PIX_FMT_YUYV422:
  330. case AV_PIX_FMT_YVYU422:
  331. case AV_PIX_FMT_YA8:
  332. c->lumToYV12 = yuy2ToY_c_vsx;
  333. break;
  334. case AV_PIX_FMT_UYVY422:
  335. c->lumToYV12 = uyvyToY_c_vsx;
  336. break;
  337. }
  338. if (c->needAlpha) {
  339. switch (srcFormat) {
  340. case AV_PIX_FMT_BGRA:
  341. case AV_PIX_FMT_RGBA:
  342. c->alpToYV12 = rgbaToA_c_vsx;
  343. break;
  344. case AV_PIX_FMT_ABGR:
  345. case AV_PIX_FMT_ARGB:
  346. c->alpToYV12 = abgrToA_c_vsx;
  347. break;
  348. case AV_PIX_FMT_YA8:
  349. c->alpToYV12 = uyvyToY_c_vsx;
  350. break;
  351. }
  352. }
  353. #endif /* HAVE_VSX */
  354. }