internal_bfin.S 20 KB


  1. /*
  2. * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
  3. * April 20, 2007
  4. *
  5. * Blackfin video color space converter operations
  6. * convert I420 YV12 to RGB in various formats
  7. *
  8. * This file is part of FFmpeg.
  9. *
  10. * FFmpeg is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU Lesser General Public
  12. * License as published by the Free Software Foundation; either
  13. * version 2.1 of the License, or (at your option) any later version.
  14. *
  15. * FFmpeg is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * Lesser General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Lesser General Public
  21. * License along with FFmpeg; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. */
  24. /*
  25. YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
  26. and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
  27. The following calculation is used for the conversion:
  28. r = clipz((y-oy)*cy + crv*(v-128))
  29. g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
  30. b = clipz((y-oy)*cy + cbu*(u-128))
  31. y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
  32. New factorization to eliminate the truncation error which was
  33. occurring due to the byteop3p.
  34. 1) Use byteop16m to subtract quad bytes. It operates on U8 values,
  35. so the offsets need to be renormalized to 8 bits.
  36. 2) Scale operands up by a factor of 4 not 8 because Blackfin
  37. multiplies include a shift.
  38. 3) Compute into the accumulators cy*yx0, cy*yx1.
  39. 4) Compute each of the linear equations:
  40. r = clipz((y - oy) * cy + crv * (v - 128))
  41. g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
  42. b = clipz((y - oy) * cy + cbu * (u - 128))
  43. Reuse of the accumulators requires that we actually multiply
  44. twice once with addition and the second time with a subtraction.
  45. Because of this we need to compute the equations in the order R B
  46. then G saving the writes for B in the case of 24/32 bit color
  47. formats.
  48. API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
  49. int dW, uint32_t *coeffs);
  50. A B
  51. --- ---
  52. i2 = cb i3 = cr
  53. i1 = coeff i0 = y
  54. Where coeffs have the following layout in memory.
  55. uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
  56. coeffs is a pointer to oy.
  57. The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note
  58. that data replication is used to simplify the internal algorithms for the
  59. dual MAC architecture of Blackfin.
  60. All routines are exported with _ff_bfin_ as a symbol prefix.
  61. Rough performance gain compared against -O3:
  62. 2779809/1484290 187.28%
  63. which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
  64. c/pel for the optimized implementations. Not sure why there is such a
  65. huge variation in the reference code on Blackfin; I guess it must have
  66. to do with the memory system.
  67. */
  /* Section selection: mL3 is the plain .text section; mL1 is .l1.text when
     __FDPIC__ is defined and otherwise the same as mL3. MEM is the section
     passed to DEFUN by the yuv2rgb*_line routines in this file. */
  68. #define mL3 .text
  69. #ifdef __FDPIC__
  70. #define mL1 .l1.text
  71. #else
  72. #define mL1 mL3
  73. #endif
  74. #define MEM mL1
  /* DEFUN(fname, where, interface): switch to section `where`, declare the
     global function symbol _ff_bfin_<fname>, align, and emit the symbol name.
     The use site appends the ':' that turns it into a label. `interface` is
     not expanded anywhere; it only documents the C prototype. */
  75. #define DEFUN(fname,where,interface) \
  76. .section where; \
  77. .global _ff_bfin_ ## fname; \
  78. .type _ff_bfin_ ## fname, STT_FUNC; \
  79. .align 8; \
  80. _ff_bfin_ ## fname
  /* DEFUN_END(fname): record the symbol size as the distance from the
     function label to the current location. */
  81. #define DEFUN_END(fname) \
  82. .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
  83. .text
  /* COEFF_LEN: byte length of the coefficient table (11 32-bit words); it is
     loaded into l1 so that i1 wraps around the table.
     COEFF_REL_CY_OFF: byte modifier loaded into m0 and applied by the
     [i1++m0] load of gmask, after which the next load from i1 is cy. */
  84. #define COEFF_LEN 11*4
  85. #define COEFF_REL_CY_OFF 4*4
  /* fp-relative offsets (after `link 0`) from which the yuv2rgb*_line
     routines read their out, dW and coeffs arguments. */
  86. #define ARG_OUT 20
  87. #define ARG_W 24
  88. #define ARG_COEFF 28
  /*
   * yuv2rgb565_line -- convert one line of planar YUV to packed 16-bit pixels
   * (5/6/5 bit fields selected by the masks in the coefficient table).
   *
   * In:   r0 = Y, r1 = U, r2 = V (copied to i0, i2, i3);
   *       out, dW and coeffs are read from fp+ARG_OUT, fp+ARG_W, fp+ARG_COEFF.
   * Loop: dW>>2 iterations. Each iteration converts 4 Y, 2 U and 2 V bytes
   *       into two 32-bit stores through p1 (4 pixels). The inputs for the
   *       next iteration are pre-loaded at the end of the loop body, so one
   *       group past the processed data is read from Y, U and V.
   * Out:  r7:4 are saved and restored; l1 is cleared before returning.
   *       Other registers written here are not restored.
   */
  89. DEFUN(yuv2rgb565_line,MEM,
  90. (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
  91. link 0; // frame pointer is used to reach the stack arguments
  92. [--sp] = (r7:4); // save r4-r7
  93. p1 = [fp+ARG_OUT]; // p1 = out
  94. r3 = [fp+ARG_W]; // r3 = dW
  95. i0 = r0; // i0 = Y
  96. i2 = r1; // i2 = U
  97. i3 = r2; // i3 = V
  98. r0 = [fp+ARG_COEFF]; // r0 = coeffs
  99. i1 = r0; // i1 walks the coefficient table
  100. b1 = i1; // circular-buffer base = start of the table
  101. l1 = COEFF_LEN; // circular-buffer length: i1 wraps from gmask back to oy
  102. m0 = COEFF_REL_CY_OFF; // modifier for the [i1++m0] load below
  103. p0 = r3;
  104. r0 = [i0++]; // 4Y (32-bit load)
  105. r1.l = w[i2++]; // 2u
  106. r1.h = w[i3++]; // 2v
  107. p0 = p0>>2; // loop count = dW/4
  108. lsetup (.L0565, .L1565) lc0 = p0; // hardware loop from .L0565 through .L1565
  109. /*
  110. uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
  111. r0 -- used to load 4ys
  112. r1 -- used to load 2us,2vs
  113. r4 -- y3,y2
  114. r5 -- y1,y0
  115. r6 -- u1,u0
  116. r7 -- v1,v0
  117. */
  118. r2=[i1++]; // oy
  119. .L0565:
  120. /*
  121. rrrrrrrr gggggggg bbbbbbbb
  122. 5432109876543210
  123. bbbbb >>3
  124. gggggggg <<3
  125. rrrrrrrr <<8
  126. rrrrrggggggbbbbb
  NOTE(review): in the code below the value computed with crv is shifted >>3
  and masked with rmask, while the value computed with cbu is shifted <<8 and
  masked with bmask, i.e. the reverse of this diagram. Presumably the
  coefficient/mask table supplied by the caller accounts for this -- confirm.
  127. */
  /* Subtract the offsets (byteop16m) and scale by 4, as described in the
     file header. */
  128. (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
  129. (r7,r6) = byteop16m (r1:0, r3:2) (r);
  130. r5 = r5 << 2 (v); // y1,y0
  131. r4 = r4 << 2 (v); // y3,y2
  132. r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
  133. r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
  /* ---- pixels 0 and 1: y1,y0 with u0,v0 ---- */
  134. /* Y' = y*cy */
  135. a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
  136. /* R = Y+ crv*(Cr-128) */
  137. r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
  138. a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask; accumulators back to Y'
  139. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
  140. r2 = r2 >> 3 (v); // move both halves into field position
  141. r3 = r2 & r5; // keep the rmask bits; r3 starts the packed pixel pair
  142. /* B = Y+ cbu*(Cb-128) */
  143. r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
  144. a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask; accumulators back to Y'
  145. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
  146. r2 = r2 << 8 (v);
  147. r2 = r2 & r5; // keep the bmask bits
  148. r3 = r3 | r2;
  149. /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
  150. a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
  151. r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
  152. r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; i1 += m0 so the next load is cy
  153. r2 = r2 << 3 (v);
  154. r2 = r2 & r5; // keep the gmask bits
  155. r3 = r3 | r2;
  156. [p1++]=r3 || r1=[i1++]; // store pixels 0,1; r1 = cy
  /* ---- pixels 2 and 3: y3,y2 with u1,v1 ---- */
  157. /* Y' = y*cy */
  158. a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
  159. /* R = Y+ crv*(Cr-128) */
  160. r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
  161. a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
  162. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
  163. r2 = r2 >> 3 (v);
  164. r3 = r2 & r5;
  165. /* B = Y+ cbu*(Cb-128) */
  166. r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
  167. a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
  168. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
  169. r2 = r2 << 8 (v);
  170. r2 = r2 & r5;
  171. r3 = r3 | r2;
  172. /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
  173. a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
  174. r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask; i1 wraps to oy
  175. r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // next 4Y
  176. r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
  177. r2 = r2 & r5;
  178. r3 = r3 | r2;
  179. [p1++]=r3 || r1.h = w[i3++]; // store pixels 2,3; 2v
  180. .L1565: r2=[i1++]; // oy
  181. l1 = 0; // turn circular addressing on i1 off again
  182. (r7:4) = [sp++]; // restore r4-r7
  183. unlink;
  184. rts;
  185. DEFUN_END(yuv2rgb565_line)
  /*
   * yuv2rgb555_line -- convert one line of planar YUV to packed 16-bit pixels
   * (5/5/5 bit fields selected by the masks in the coefficient table).
   *
   * In:   r0 = Y, r1 = U, r2 = V (copied to i0, i2, i3);
   *       out, dW and coeffs are read from fp+ARG_OUT, fp+ARG_W, fp+ARG_COEFF.
   * Loop: dW>>2 iterations. Each iteration converts 4 Y, 2 U and 2 V bytes
   *       into two 32-bit stores through p1 (4 pixels). The inputs for the
   *       next iteration are pre-loaded at the end of the loop body, so one
   *       group past the processed data is read from Y, U and V.
   * Out:  r7:4 are saved and restored; l1 is cleared before returning.
   *       Other registers written here are not restored.
   * The code differs from yuv2rgb565_line only in the shift counts (<<7 and
   * <<2 instead of <<8 and <<3).
   */
  186. DEFUN(yuv2rgb555_line,MEM,
  187. (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
  188. link 0; // frame pointer is used to reach the stack arguments
  189. [--sp] = (r7:4); // save r4-r7
  190. p1 = [fp+ARG_OUT]; // p1 = out
  191. r3 = [fp+ARG_W]; // r3 = dW
  192. i0 = r0; // i0 = Y
  193. i2 = r1; // i2 = U
  194. i3 = r2; // i3 = V
  195. r0 = [fp+ARG_COEFF]; // r0 = coeffs
  196. i1 = r0; // i1 walks the coefficient table
  197. b1 = i1; // circular-buffer base = start of the table
  198. l1 = COEFF_LEN; // circular-buffer length: i1 wraps from gmask back to oy
  199. m0 = COEFF_REL_CY_OFF; // modifier for the [i1++m0] load below
  200. p0 = r3;
  201. r0 = [i0++]; // 4Y (32-bit load)
  202. r1.l = w[i2++]; // 2u
  203. r1.h = w[i3++]; // 2v
  204. p0 = p0>>2; // loop count = dW/4
  205. lsetup (.L0555, .L1555) lc0 = p0; // hardware loop from .L0555 through .L1555
  206. /*
  207. uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
  208. r0 -- used to load 4ys
  209. r1 -- used to load 2us,2vs
  210. r4 -- y3,y2
  211. r5 -- y1,y0
  212. r6 -- u1,u0
  213. r7 -- v1,v0
  214. */
  215. r2=[i1++]; // oy
  216. .L0555:
  217. /*
  218. rrrrrrrr gggggggg bbbbbbbb
  219. 5432109876543210
  220. bbbbb >>3
  221. gggggggg <<2
  222. rrrrrrrr <<7
  223. xrrrrrgggggbbbbb
  NOTE(review): in the code below the value computed with crv is shifted >>3
  and masked with rmask, while the value computed with cbu is shifted <<7 and
  masked with bmask, i.e. the reverse of this diagram. Presumably the
  coefficient/mask table supplied by the caller accounts for this -- confirm.
  224. */
  /* Subtract the offsets (byteop16m) and scale by 4, as described in the
     file header. */
  225. (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
  226. (r7,r6) = byteop16m (r1:0, r3:2) (r);
  227. r5 = r5 << 2 (v); // y1,y0
  228. r4 = r4 << 2 (v); // y3,y2
  229. r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
  230. r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
  /* ---- pixels 0 and 1: y1,y0 with u0,v0 ---- */
  231. /* Y' = y*cy */
  232. a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
  233. /* R = Y+ crv*(Cr-128) */
  234. r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
  235. a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask; accumulators back to Y'
  236. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
  237. r2 = r2 >> 3 (v); // move both halves into field position
  238. r3 = r2 & r5; // keep the rmask bits; r3 starts the packed pixel pair
  239. /* B = Y+ cbu*(Cb-128) */
  240. r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
  241. a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask; accumulators back to Y'
  242. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
  243. r2 = r2 << 7 (v);
  244. r2 = r2 & r5; // keep the bmask bits
  245. r3 = r3 | r2;
  246. /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
  247. a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
  248. r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
  249. r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask; i1 += m0 so the next load is cy
  250. r2 = r2 << 2 (v);
  251. r2 = r2 & r5; // keep the gmask bits
  252. r3 = r3 | r2;
  253. [p1++]=r3 || r1=[i1++]; // store pixels 0,1; r1 = cy
  /* ---- pixels 2 and 3: y3,y2 with u1,v1 ---- */
  254. /* Y' = y*cy */
  255. a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
  256. /* R = Y+ crv*(Cr-128) */
  257. r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
  258. a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
  259. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
  260. r2 = r2 >> 3 (v);
  261. r3 = r2 & r5;
  262. /* B = Y+ cbu*(Cb-128) */
  263. r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
  264. a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
  265. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
  266. r2 = r2 << 7 (v);
  267. r2 = r2 & r5;
  268. r3 = r3 | r2;
  269. /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
  270. a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
  271. r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask; i1 wraps to oy
  272. r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // next 4Y
  273. r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
  274. r2 = r2 & r5;
  275. r3 = r3 | r2;
  276. [p1++]=r3 || r1.h=w[i3++]; // store pixels 2,3; 2v
  277. .L1555: r2=[i1++]; // oy
  278. l1 = 0; // turn circular addressing on i1 off again
  279. (r7:4) = [sp++]; // restore r4-r7
  280. unlink;
  281. rts;
  282. DEFUN_END(yuv2rgb555_line)
  /*
   * yuv2rgb24_line -- convert one line of planar YUV to packed 3-byte pixels.
   *
   * In:   r0 = Y, r1 = U, r2 = V (copied to i0, i2, i3);
   *       out, dW and coeffs are read from fp+ARG_OUT, fp+ARG_W, fp+ARG_COEFF.
   * Loop: dW>>2 iterations; each iteration writes 12 bytes (4 pixels).
   *       p1 writes the even pixel of each pair and p2 = p1+3 the odd one;
   *       after three byte stores each, both pointers skip 3 bytes to step
   *       over the pixel written through the other pointer.
   *       The inputs for the next iteration are pre-loaded at the end of the
   *       loop body, so one group past the processed data is read.
   * The rmask/bmask/gmask words are loaded into r5 but never used here; the
   * loads only keep i1 in step with the coefficient table layout.
   * Out:  r7:4 are saved and restored; l1 is cleared before returning.
   *       Other registers written here are not restored.
   */
  283. DEFUN(yuv2rgb24_line,MEM,
  284. (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
  285. link 0; // frame pointer is used to reach the stack arguments
  286. [--sp] = (r7:4); // save r4-r7
  287. p1 = [fp+ARG_OUT]; // p1 = out (even pixel of each pair)
  288. r3 = [fp+ARG_W]; // r3 = dW
  289. p2 = p1;
  290. p2 += 3; // p2 = out + 3 (odd pixel of each pair)
  291. i0 = r0; // i0 = Y
  292. i2 = r1; // i2 = U
  293. i3 = r2; // i3 = V
  294. r0 = [fp+ARG_COEFF]; // coeff buffer
  295. i1 = r0; // i1 walks the coefficient table
  296. b1 = i1; // circular-buffer base = start of the table
  297. l1 = COEFF_LEN; // circular-buffer length: i1 wraps from gmask back to oy
  298. m0 = COEFF_REL_CY_OFF; // modifier for the [i1++m0] load below
  299. p0 = r3;
  300. r0 = [i0++]; // 4Y (32-bit load)
  301. r1.l = w[i2++]; // 2u
  302. r1.h = w[i3++]; // 2v
  303. p0 = p0>>2; // loop count = dW/4
  304. lsetup (.L0888, .L1888) lc0 = p0; // hardware loop from .L0888 through .L1888
  305. /*
  306. uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
  307. r0 -- used to load 4ys
  308. r1 -- used to load 2us,2vs
  309. r4 -- y3,y2
  310. r5 -- y1,y0
  311. r6 -- u1,u0
  312. r7 -- v1,v0
  313. */
  314. r2=[i1++]; // oy
  315. .L0888:
  /* Subtract the offsets (byteop16m) and scale by 4, as described in the
     file header. */
  316. (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
  317. (r7,r6) = byteop16m (r1:0, r3:2) (r);
  318. r5 = r5 << 2 (v); // y1,y0
  319. r4 = r4 << 2 (v); // y3,y2
  320. r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
  321. r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
  /* ---- pixels 0 and 1: y1,y0 with u0,v0 ---- */
  322. /* Y' = y*cy */
  323. a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
  324. /* R = Y+ crv*(Cr-128) */
  325. r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
  326. a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask (unused); accumulators back to Y'
  327. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
  328. r2=r2>>16 || B[p1++]=r2; // R of pixel 0 from the low half
  329. B[p2++]=r2; // R of pixel 1 from the former high half
  330. /* B = Y+ cbu*(Cb-128) */
  331. r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
  332. a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask (unused); accumulators back to Y'
  333. r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu; B is kept in r3 until G has been stored
  334. /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
  335. a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
  336. r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
  337. r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask (unused); i1 += m0 so the next load is cy
  338. r2=r2>>16 || B[p1++]=r2; // G of pixel 0
  339. B[p2++]=r2; // G of pixel 1
  340. r3=r3>>16 || B[p1++]=r3; // B of pixel 0
  341. B[p2++]=r3 || r1=[i1++]; // B of pixel 1; r1 = cy
  342. p1+=3; // skip the pixel written through p2
  343. p2+=3; // skip the pixel written through p1
  /* ---- pixels 2 and 3: y3,y2 with u1,v1 ---- */
  344. /* Y' = y*cy */
  345. a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
  346. /* R = Y+ crv*(Cr-128) */
  347. r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
  348. a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask (unused)
  349. r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
  350. r2=r2>>16 || B[p1++]=r2; // R of pixel 2
  351. B[p2++]=r2; // R of pixel 3
  352. /* B = Y+ cbu*(Cb-128) */
  353. r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
  354. a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask (unused)
  355. r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
  356. /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
  357. a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
  358. r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
  359. r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask (unused); i1 wraps to oy
  360. r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // G of pixel 2; next 4y
  361. B[p2++]=r2 || r1.l = w[i2++]; // G of pixel 3; 2u
  362. r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // B of pixel 2; 2v
  363. B[p2++]=r3 || r2=[i1++]; // B of pixel 3; oy
  364. p1+=3; // skip the pixel written through p2
  365. .L1888: p2+=3; // skip the pixel written through p1
  366. l1 = 0; // turn circular addressing on i1 off again
  367. (r7:4) = [sp++]; // restore r4-r7
  368. unlink;
  369. rts;
  370. DEFUN_END(yuv2rgb24_line)
  /* fp-relative offsets (after `link 0`) from which uyvytoyv12 and
     yuyvtoyv12 read their stack-passed arguments; src, ydst and udst
     arrive in r0, r1 and r2. */
  371. #define ARG_vdst 20
  372. #define ARG_width 24
  373. #define ARG_height 28
  374. #define ARG_lumStride 32
  375. #define ARG_chromStride 36
  376. #define ARG_srcStride 40
  /*
   * uyvytoyv12 -- split packed UYVY into separate Y, U and V planes, handling
   * two source lines per outer iteration.
   *
   * In:   r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
   *       chromStride and srcStride are read from the stack (fp+ARG_*).
   * Loop: outer loop runs height>>1 times, inner loop width>>2 times. Each
   *       inner pass reads 8 bytes from each of the two source lines, stores
   *       4 Y bytes per line, and stores 2 U and 2 V bytes computed with
   *       byteop1p from both lines (byte-wise average per the Blackfin ISA --
   *       confirm against the reference).
   *       The 8 source bytes per line are always loaded one pass ahead, so 8
   *       bytes past the processed data are read on each line.
   * Out:  r7:4 and p5:4 are saved and restored.
   *
   * NOTE(review): at the end of a row pair the inner loop has already moved
   * p0/p1 by width and i0/i1 by 2*width+8 bytes; the code then adds lumStride
   * to p0/p1 and srcStride-8 to i0/i1. That lands on the next row pair only
   * when lumStride == width and srcStride == 2*width -- confirm that callers
   * guarantee this. The chroma pointers use chromStride - width/2 and do not
   * have this restriction.
   */
  377. DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  378. long width, long height,
  379. long lumStride, long chromStride, long srcStride)):
  380. link 0; // frame pointer is used to reach the stack arguments
  381. [--sp] = (r7:4,p5:4); // save r4-r7 and p4-p5
  382. p0 = r1; // Y top even
  383. i2 = r2; // *u
  384. r2 = [fp + ARG_vdst];
  385. i3 = r2; // *v
  386. r1 = [fp + ARG_srcStride];
  387. r2 = r0 + r1; // address of the second source line
  388. r1 += -8; // i0,i1 is pre read need to correct
  389. m0 = r1; // m0 = srcStride - 8
  390. i0 = r0; // uyvy_T even
  391. i1 = r2; // uyvy_B odd
  392. p2 = [fp + ARG_lumStride];
  393. p1 = p0 + p2; // Y bot odd
  394. p5 = [fp + ARG_width];
  395. p4 = [fp + ARG_height];
  396. r0 = p5; // r0 = width
  397. p4 = p4 >> 1; // outer count = height/2
  398. p5 = p5 >> 2; // inner count = width/4
  399. r2 = [fp + ARG_chromStride];
  400. r0 = r0 >> 1; // width/2 = chroma bytes written per row pair
  401. r2 = r2 - r0;
  402. m1 = r2; // m1 = chromStride - width/2
  403. /* I0,I1 - src input line pointers
  404. * p0,p1 - luma output line pointers
  405. * I2 - dstU
  406. * I3 - dstV
  407. */
  408. lsetup (0f, 1f) lc1 = p4; // H/2
  409. 0: r0 = [i0++] || r2 = [i1++]; // pre-read 8 bytes from each line: r1:r0 top, r3:r2 bottom
  410. r1 = [i0++] || r3 = [i1++];
  411. r4 = byteop1p(r1:0, r3:2); // combine top and bottom line bytes (first word)
  412. r5 = byteop1p(r1:0, r3:2) (r); // same for the second word
  413. lsetup (2f, 3f) lc0 = p5; // W/4
  414. 2: r0 = r0 >> 8(v); // move the Y byte of every 16-bit half to the low byte
  415. r1 = r1 >> 8(v);
  416. r2 = r2 >> 8(v);
  417. r3 = r3 >> 8(v);
  418. r0 = bytepack(r0, r1); // 4 Y of the top line
  419. r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
  420. r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
  421. r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // also pre-read the next 8 bytes
  422. r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6.l = two U, r6.h = two V
  423. r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
  424. 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
  425. i0 += m0; // advance the source pointers by srcStride-8 (see NOTE above)
  426. i1 += m0;
  427. i2 += m1; // advance chroma pointers to the next chroma row
  428. i3 += m1;
  429. p0 = p0 + p2; // advance luma pointers by lumStride (see NOTE above)
  430. 1: p1 = p1 + p2;
  431. (r7:4,p5:4) = [sp++]; // restore r4-r7 and p4-p5
  432. unlink;
  433. rts;
  434. DEFUN_END(uyvytoyv12)
  /*
   * yuyvtoyv12 -- split packed YUYV into separate Y, U and V planes, handling
   * two source lines per outer iteration.
   *
   * In:   r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
   *       chromStride and srcStride are read from the stack (fp+ARG_*).
   * Loop: outer loop runs height>>1 times, inner loop width>>2 times. Each
   *       inner pass reads 8 bytes from each of the two source lines, stores
   *       4 Y bytes per line, and stores 2 U and 2 V bytes computed with
   *       byteop1p from both lines (byte-wise average per the Blackfin ISA --
   *       confirm against the reference).
   *       The 8 source bytes per line are always loaded one pass ahead, so 8
   *       bytes past the processed data are read on each line.
   * Out:  r7:4 and p5:4 are saved and restored.
   *
   * NOTE(review): at the end of a row pair the inner loop has already moved
   * p0/p1 by width and i0/i1 by 2*width+8 bytes; the code then adds lumStride
   * to p0/p1 and srcStride-8 to i0/i1. That lands on the next row pair only
   * when lumStride == width and srcStride == 2*width -- confirm that callers
   * guarantee this. The chroma pointers use chromStride - width/2 and do not
   * have this restriction.
   */
  435. DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
  436. long width, long height,
  437. long lumStride, long chromStride, long srcStride)):
  438. link 0; // frame pointer is used to reach the stack arguments
  439. [--sp] = (r7:4,p5:4); // save r4-r7 and p4-p5
  440. p0 = r1; // Y top even
  441. i2 = r2; // *u
  442. r2 = [fp + ARG_vdst];
  443. i3 = r2; // *v
  444. r1 = [fp + ARG_srcStride];
  445. r2 = r0 + r1; // address of the second source line
  446. r1 += -8; // i0,i1 is pre read need to correct
  447. m0 = r1; // m0 = srcStride - 8
  448. i0 = r0; // yuyv_T even
  449. i1 = r2; // yuyv_B odd
  450. p2 = [fp + ARG_lumStride];
  451. p1 = p0 + p2; // Y bot odd
  452. p5 = [fp + ARG_width];
  453. p4 = [fp + ARG_height];
  454. r0 = p5; // r0 = width
  455. p4 = p4 >> 1; // outer count = height/2
  456. p5 = p5 >> 2; // inner count = width/4
  457. r2 = [fp + ARG_chromStride];
  458. r0 = r0 >> 1; // width/2 = chroma bytes written per row pair
  459. r2 = r2 - r0;
  460. m1 = r2; // m1 = chromStride - width/2
  461. /* I0,I1 - src input line pointers
  462. * p0,p1 - luma output line pointers
  463. * I2 - dstU
  464. * I3 - dstV
  465. */
  466. lsetup (0f, 1f) lc1 = p4; // H/2
  467. 0: r0 = [i0++] || r2 = [i1++]; // pre-read 8 bytes from each line: r1:r0 top, r3:r2 bottom
  468. r1 = [i0++] || r3 = [i1++];
  469. r4 = bytepack(r0, r1); // 4 Y of the top line (low byte of every 16-bit half)
  470. r5 = bytepack(r2, r3); // 4 Y of the bottom line
  471. lsetup (2f, 3f) lc0 = p5; // W/4
  472. 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
  473. r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
  474. r2 = r2 >> 8(v); // the shifts move the chroma byte of every half to the low byte
  475. r3 = r3 >> 8(v);
  476. r4 = byteop1p(r1:0, r3:2); // combine top and bottom line chroma (first word)
  477. r5 = byteop1p(r1:0, r3:2) (r); // same for the second word
  478. r6 = pack(r5.l, r4.l);
  479. r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; // also pre-read the next 8 bytes
  480. r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; // r6.l = two U, r6.h = two V
  481. r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
  482. 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
  483. i0 += m0; // advance the source pointers by srcStride-8 (see NOTE above)
  484. i1 += m0;
  485. i2 += m1; // advance chroma pointers to the next chroma row
  486. i3 += m1;
  487. p0 = p0 + p2; // advance luma pointers by lumStride (see NOTE above)
  488. 1: p1 = p1 + p2;
  489. (r7:4,p5:4) = [sp++]; // restore r4-r7 and p4-p5
  490. unlink;
  491. rts;
  492. DEFUN_END(yuyvtoyv12)