simple_idct_vis.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529
  1. /*
  2. * SPARC VIS optimized inverse DCT
  3. * Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu >
  4. *
  5. * I did consult the following fine web page about dct
  6. * http://www.geocities.com/ssavekar/dct.htm
  7. *
  8. * This file is part of Libav.
  9. *
  10. * Libav is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU Lesser General Public
  12. * License as published by the Free Software Foundation; either
  13. * version 2.1 of the License, or (at your option) any later version.
  14. *
  15. * Libav is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * Lesser General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Lesser General Public
  21. * License along with Libav; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. */
  24. #include "libavcodec/dsputil.h"
  25. #include "dsputil_vis.h"
  26. static const DECLARE_ALIGNED(8, int16_t, coeffs)[28] = {
  27. - 1259,- 1259,- 1259,- 1259,
  28. - 4989,- 4989,- 4989,- 4989,
  29. -11045,-11045,-11045,-11045,
  30. -19195,-19195,-19195,-19195,
  31. -29126,-29126,-29126,-29126,
  32. 25080, 25080, 25080, 25080,
  33. 12785, 12785, 12785, 12785
  34. };
  35. static const DECLARE_ALIGNED(8, uint16_t, scale)[4] = {
  36. 65536>>6, 65536>>6, 65536>>6, 65536>>6
  37. };
  38. static const DECLARE_ALIGNED(8, uint16_t, rounder)[4] = {
  39. 1<<5, 1<<5, 1<<5, 1<<5
  40. };
  41. static const DECLARE_ALIGNED(8, uint16_t, expand)[4] = {
  42. 1<<14, 1<<14, 1<<14, 1<<14
  43. };
  44. #define INIT_IDCT \
  45. "ldd [%1], %%f32 \n\t"\
  46. "ldd [%1+8], %%f34 \n\t"\
  47. "ldd [%1+16], %%f36 \n\t"\
  48. "ldd [%1+24], %%f38 \n\t"\
  49. "ldd [%1+32], %%f40 \n\t"\
  50. "ldd [%1+40], %%f42 \n\t"\
  51. "ldd [%1+48], %%f44 \n\t"\
  52. "ldd [%0], %%f46 \n\t"\
  53. "fzero %%f62 \n\t"\
  54. #define LOADSCALE(in) \
  55. "ldd [" in "], %%f0 \n\t"\
  56. "ldd [" in "+16], %%f2 \n\t"\
  57. "ldd [" in "+32], %%f4 \n\t"\
  58. "ldd [" in "+48], %%f6 \n\t"\
  59. "ldd [" in "+64], %%f8 \n\t"\
  60. "ldd [" in "+80], %%f10 \n\t"\
  61. "ldd [" in "+96], %%f12 \n\t"\
  62. "ldd [" in "+112], %%f14 \n\t"\
  63. "fpadd16 %%f0, %%f0, %%f0 \n\t"\
  64. "fpadd16 %%f2, %%f2, %%f2 \n\t"\
  65. "fpadd16 %%f4, %%f4, %%f4 \n\t"\
  66. "fpadd16 %%f6, %%f6, %%f6 \n\t"\
  67. "fpadd16 %%f8, %%f8, %%f8 \n\t"\
  68. "fpadd16 %%f10, %%f10, %%f10 \n\t"\
  69. "fpadd16 %%f12, %%f12, %%f12 \n\t"\
  70. "fpadd16 %%f14, %%f14, %%f14 \n\t"\
  71. \
  72. "fpadd16 %%f0, %%f0, %%f0 \n\t"\
  73. "fpadd16 %%f2, %%f2, %%f2 \n\t"\
  74. "fpadd16 %%f4, %%f4, %%f4 \n\t"\
  75. "fpadd16 %%f6, %%f6, %%f6 \n\t"\
  76. "fpadd16 %%f8, %%f8, %%f8 \n\t"\
  77. "fpadd16 %%f10, %%f10, %%f10 \n\t"\
  78. "fpadd16 %%f12, %%f12, %%f12 \n\t"\
  79. "fpadd16 %%f14, %%f14, %%f14 \n\t"\
  80. \
  81. "fpadd16 %%f0, %%f0, %%f0 \n\t"\
  82. "fpadd16 %%f2, %%f2, %%f2 \n\t"\
  83. "fpadd16 %%f4, %%f4, %%f4 \n\t"\
  84. "fpadd16 %%f6, %%f6, %%f6 \n\t"\
  85. "fpadd16 %%f8, %%f8, %%f8 \n\t"\
  86. "fpadd16 %%f10, %%f10, %%f10 \n\t"\
  87. "fpadd16 %%f12, %%f12, %%f12 \n\t"\
  88. "fpadd16 %%f14, %%f14, %%f14 \n\t"\
  89. \
  90. "fpadd16 %%f0, %%f0, %%f0 \n\t"\
  91. "fpadd16 %%f2, %%f2, %%f2 \n\t"\
  92. "fpadd16 %%f4, %%f4, %%f4 \n\t"\
  93. "fpadd16 %%f6, %%f6, %%f6 \n\t"\
  94. "fpadd16 %%f8, %%f8, %%f8 \n\t"\
  95. "fpadd16 %%f10, %%f10, %%f10 \n\t"\
  96. "fpadd16 %%f12, %%f12, %%f12 \n\t"\
  97. "fpadd16 %%f14, %%f14, %%f14 \n\t"\
  98. #define LOAD(in) \
  99. "ldd [" in "], %%f16 \n\t"\
  100. "ldd [" in "+8], %%f18 \n\t"\
  101. "ldd [" in "+16], %%f20 \n\t"\
  102. "ldd [" in "+24], %%f22 \n\t"\
  103. "ldd [" in "+32], %%f24 \n\t"\
  104. "ldd [" in "+40], %%f26 \n\t"\
  105. "ldd [" in "+48], %%f28 \n\t"\
  106. "ldd [" in "+56], %%f30 \n\t"\
  107. #define TRANSPOSE \
  108. "fpmerge %%f16, %%f24, %%f0 \n\t"\
  109. "fpmerge %%f20, %%f28, %%f2 \n\t"\
  110. "fpmerge %%f17, %%f25, %%f4 \n\t"\
  111. "fpmerge %%f21, %%f29, %%f6 \n\t"\
  112. "fpmerge %%f18, %%f26, %%f8 \n\t"\
  113. "fpmerge %%f22, %%f30, %%f10 \n\t"\
  114. "fpmerge %%f19, %%f27, %%f12 \n\t"\
  115. "fpmerge %%f23, %%f31, %%f14 \n\t"\
  116. \
  117. "fpmerge %%f0, %%f2, %%f16 \n\t"\
  118. "fpmerge %%f1, %%f3, %%f18 \n\t"\
  119. "fpmerge %%f4, %%f6, %%f20 \n\t"\
  120. "fpmerge %%f5, %%f7, %%f22 \n\t"\
  121. "fpmerge %%f8, %%f10, %%f24 \n\t"\
  122. "fpmerge %%f9, %%f11, %%f26 \n\t"\
  123. "fpmerge %%f12, %%f14, %%f28 \n\t"\
  124. "fpmerge %%f13, %%f15, %%f30 \n\t"\
  125. \
  126. "fpmerge %%f16, %%f17, %%f0 \n\t"\
  127. "fpmerge %%f18, %%f19, %%f2 \n\t"\
  128. "fpmerge %%f20, %%f21, %%f4 \n\t"\
  129. "fpmerge %%f22, %%f23, %%f6 \n\t"\
  130. "fpmerge %%f24, %%f25, %%f8 \n\t"\
  131. "fpmerge %%f26, %%f27, %%f10 \n\t"\
  132. "fpmerge %%f28, %%f29, %%f12 \n\t"\
  133. "fpmerge %%f30, %%f31, %%f14 \n\t"\
  134. #define IDCT4ROWS \
  135. /* 1. column */\
  136. "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\
  137. "for %%f4, %%f6, %%f60 \n\t"\
  138. "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\
  139. "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\
  140. "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\
  141. "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\
  142. \
  143. ADDROUNDER\
  144. \
  145. "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\
  146. "fcmpd %%fcc0, %%f62, %%f60 \n\t"\
  147. "for %%f8, %%f10, %%f60 \n\t"\
  148. "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\
  149. "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\
  150. "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\
  151. "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\
  152. \
  153. "fpadd16 %%f48, %%f28, %%f28 \n\t"\
  154. "fcmpd %%fcc1, %%f62, %%f60 \n\t"\
  155. "for %%f12, %%f14, %%f60 \n\t"\
  156. "fpadd16 %%f50, %%f18, %%f18 \n\t"\
  157. "fpadd16 %%f52, %%f22, %%f22 \n\t"\
  158. "fpadd16 %%f54, %%f26, %%f26 \n\t"\
  159. "fpadd16 %%f56, %%f30, %%f30 \n\t"\
  160. \
  161. "fpadd16 %%f28, %%f0, %%f16 \n\t"\
  162. "fcmpd %%fcc2, %%f62, %%f60 \n\t"\
  163. "fpadd16 %%f28, %%f0, %%f20 \n\t"\
  164. "fpadd16 %%f28, %%f0, %%f24 \n\t"\
  165. "fpadd16 %%f28, %%f0, %%f28 \n\t"\
  166. "fpadd16 %%f18, %%f2, %%f18 \n\t"\
  167. "fpadd16 %%f22, %%f2, %%f22 \n\t"\
  168. /* 2. column */\
  169. "fbe %%fcc0, 3f \n\t"\
  170. "fpadd16 %%f26, %%f2, %%f26 \n\t"\
  171. "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\
  172. "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\
  173. "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\
  174. "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\
  175. "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\
  176. "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\
  177. \
  178. "fpadd16 %%f16, %%f48, %%f16 \n\t"\
  179. "fpadd16 %%f20, %%f50, %%f20 \n\t"\
  180. "fpsub16 %%f24, %%f50, %%f24 \n\t"\
  181. "fpsub16 %%f28, %%f48, %%f28 \n\t"\
  182. "fpadd16 %%f18, %%f52, %%f18 \n\t"\
  183. "fpsub16 %%f22, %%f54, %%f22 \n\t"\
  184. "fpsub16 %%f26, %%f56, %%f26 \n\t"\
  185. "fpsub16 %%f30, %%f58, %%f30 \n\t"\
  186. \
  187. "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\
  188. "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\
  189. "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\
  190. "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\
  191. "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\
  192. "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\
  193. \
  194. "fpadd16 %%f16, %%f48, %%f16 \n\t"\
  195. "fpadd16 %%f20, %%f50, %%f20 \n\t"\
  196. "fpsub16 %%f24, %%f50, %%f24 \n\t"\
  197. "fpsub16 %%f28, %%f48, %%f28 \n\t"\
  198. "fpadd16 %%f18, %%f52, %%f18 \n\t"\
  199. "fpsub16 %%f22, %%f54, %%f22 \n\t"\
  200. "fpsub16 %%f26, %%f56, %%f26 \n\t"\
  201. "fpsub16 %%f30, %%f58, %%f30 \n\t"\
  202. \
  203. "fpadd16 %%f16, %%f4, %%f16 \n\t"\
  204. "fpsub16 %%f28, %%f4, %%f28 \n\t"\
  205. "fpadd16 %%f18, %%f6, %%f18 \n\t"\
  206. "fpsub16 %%f26, %%f6, %%f26 \n\t"\
  207. /* 3. column */\
  208. "3: \n\t"\
  209. "fbe %%fcc1, 4f \n\t"\
  210. "fpsub16 %%f30, %%f6, %%f30 \n\t"\
  211. "fmul8ulx16 %%f8, %%f38, %%f48 \n\t"\
  212. "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\
  213. "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\
  214. "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\
  215. "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\
  216. \
  217. "fpadd16 %%f16, %%f48, %%f16 \n\t"\
  218. "fpsub16 %%f20, %%f48, %%f20 \n\t"\
  219. "fpsub16 %%f24, %%f48, %%f24 \n\t"\
  220. "fpadd16 %%f28, %%f48, %%f28 \n\t"\
  221. "fpadd16 %%f18, %%f50, %%f18 \n\t"\
  222. "fpsub16 %%f22, %%f52, %%f22 \n\t"\
  223. "fpadd16 %%f26, %%f54, %%f26 \n\t"\
  224. "fpadd16 %%f30, %%f56, %%f30 \n\t"\
  225. \
  226. "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\
  227. "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\
  228. "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\
  229. "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\
  230. "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\
  231. \
  232. "fpadd16 %%f16, %%f48, %%f16 \n\t"\
  233. "fpsub16 %%f20, %%f48, %%f20 \n\t"\
  234. "fpsub16 %%f24, %%f48, %%f24 \n\t"\
  235. "fpadd16 %%f28, %%f48, %%f28 \n\t"\
  236. "fpadd16 %%f18, %%f50, %%f18 \n\t"\
  237. "fpsub16 %%f22, %%f52, %%f22 \n\t"\
  238. "fpadd16 %%f26, %%f54, %%f26 \n\t"\
  239. "fpadd16 %%f30, %%f56, %%f30 \n\t"\
  240. \
  241. "fpadd16 %%f16, %%f8, %%f16 \n\t"\
  242. "fpsub16 %%f20, %%f8, %%f20 \n\t"\
  243. "fpsub16 %%f24, %%f8, %%f24 \n\t"\
  244. "fpadd16 %%f28, %%f8, %%f28 \n\t"\
  245. "fpadd16 %%f18, %%f10, %%f18 \n\t"\
  246. "fpsub16 %%f22, %%f10, %%f22 \n\t"\
  247. /* 4. column */\
  248. "4: \n\t"\
  249. "fbe %%fcc2, 5f \n\t"\
  250. "fpadd16 %%f30, %%f10, %%f30 \n\t"\
  251. "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\
  252. "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\
  253. "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\
  254. "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\
  255. "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\
  256. "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\
  257. \
  258. "fpadd16 %%f16, %%f48, %%f16 \n\t"\
  259. "fpsub16 %%f20, %%f50, %%f20 \n\t"\
  260. "fpadd16 %%f24, %%f50, %%f24 \n\t"\
  261. "fpsub16 %%f28, %%f48, %%f28 \n\t"\
  262. "fpadd16 %%f18, %%f52, %%f18 \n\t"\
  263. "fpsub16 %%f22, %%f54, %%f22 \n\t"\
  264. "fpadd16 %%f26, %%f56, %%f26 \n\t"\
  265. "fpsub16 %%f30, %%f58, %%f30 \n\t"\
  266. \
  267. "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\
  268. "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\
  269. "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\
  270. "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\
  271. "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\
  272. "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\
  273. \
  274. "fpadd16 %%f16, %%f48, %%f16 \n\t"\
  275. "fpsub16 %%f20, %%f50, %%f20 \n\t"\
  276. "fpadd16 %%f24, %%f50, %%f24 \n\t"\
  277. "fpsub16 %%f28, %%f48, %%f28 \n\t"\
  278. "fpadd16 %%f18, %%f52, %%f18 \n\t"\
  279. "fpsub16 %%f22, %%f54, %%f22 \n\t"\
  280. "fpadd16 %%f26, %%f56, %%f26 \n\t"\
  281. "fpsub16 %%f30, %%f58, %%f30 \n\t"\
  282. \
  283. "fpsub16 %%f20, %%f12, %%f20 \n\t"\
  284. "fpadd16 %%f24, %%f12, %%f24 \n\t"\
  285. "fpsub16 %%f22, %%f14, %%f22 \n\t"\
  286. "fpadd16 %%f26, %%f14, %%f26 \n\t"\
  287. "fpsub16 %%f30, %%f14, %%f30 \n\t"\
  288. /* final butterfly */\
  289. "5: \n\t"\
  290. "fpsub16 %%f16, %%f18, %%f48 \n\t"\
  291. "fpsub16 %%f20, %%f22, %%f50 \n\t"\
  292. "fpsub16 %%f24, %%f26, %%f52 \n\t"\
  293. "fpsub16 %%f28, %%f30, %%f54 \n\t"\
  294. "fpadd16 %%f16, %%f18, %%f16 \n\t"\
  295. "fpadd16 %%f20, %%f22, %%f20 \n\t"\
  296. "fpadd16 %%f24, %%f26, %%f24 \n\t"\
  297. "fpadd16 %%f28, %%f30, %%f28 \n\t"\
  298. #define STOREROWS(out) \
  299. "std %%f48, [" out "+112] \n\t"\
  300. "std %%f50, [" out "+96] \n\t"\
  301. "std %%f52, [" out "+80] \n\t"\
  302. "std %%f54, [" out "+64] \n\t"\
  303. "std %%f16, [" out "] \n\t"\
  304. "std %%f20, [" out "+16] \n\t"\
  305. "std %%f24, [" out "+32] \n\t"\
  306. "std %%f28, [" out "+48] \n\t"\
  307. #define SCALEROWS \
  308. "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\
  309. "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\
  310. "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\
  311. "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\
  312. "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\
  313. "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\
  314. "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\
  315. "fmul8sux16 %%f46, %%f28, %%f28 \n\t"\
  316. #define PUTPIXELSCLAMPED(dest) \
  317. "fpack16 %%f48, %%f14 \n\t"\
  318. "fpack16 %%f50, %%f12 \n\t"\
  319. "fpack16 %%f16, %%f0 \n\t"\
  320. "fpack16 %%f20, %%f2 \n\t"\
  321. "fpack16 %%f24, %%f4 \n\t"\
  322. "fpack16 %%f28, %%f6 \n\t"\
  323. "fpack16 %%f54, %%f8 \n\t"\
  324. "fpack16 %%f52, %%f10 \n\t"\
  325. "st %%f0, [%3+" dest "] \n\t"\
  326. "st %%f2, [%5+" dest "] \n\t"\
  327. "st %%f4, [%6+" dest "] \n\t"\
  328. "st %%f6, [%7+" dest "] \n\t"\
  329. "st %%f8, [%8+" dest "] \n\t"\
  330. "st %%f10, [%9+" dest "] \n\t"\
  331. "st %%f12, [%10+" dest "] \n\t"\
  332. "st %%f14, [%11+" dest "] \n\t"\
  333. #define ADDPIXELSCLAMPED(dest) \
  334. "ldd [%5], %%f18 \n\t"\
  335. "ld [%3+" dest"], %%f0 \n\t"\
  336. "ld [%6+" dest"], %%f2 \n\t"\
  337. "ld [%7+" dest"], %%f4 \n\t"\
  338. "ld [%8+" dest"], %%f6 \n\t"\
  339. "ld [%9+" dest"], %%f8 \n\t"\
  340. "ld [%10+" dest"], %%f10 \n\t"\
  341. "ld [%11+" dest"], %%f12 \n\t"\
  342. "ld [%12+" dest"], %%f14 \n\t"\
  343. "fmul8x16 %%f0, %%f18, %%f0 \n\t"\
  344. "fmul8x16 %%f2, %%f18, %%f2 \n\t"\
  345. "fmul8x16 %%f4, %%f18, %%f4 \n\t"\
  346. "fmul8x16 %%f6, %%f18, %%f6 \n\t"\
  347. "fmul8x16 %%f8, %%f18, %%f8 \n\t"\
  348. "fmul8x16 %%f10, %%f18, %%f10 \n\t"\
  349. "fmul8x16 %%f12, %%f18, %%f12 \n\t"\
  350. "fmul8x16 %%f14, %%f18, %%f14 \n\t"\
  351. "fpadd16 %%f0, %%f16, %%f0 \n\t"\
  352. "fpadd16 %%f2, %%f20, %%f2 \n\t"\
  353. "fpadd16 %%f4, %%f24, %%f4 \n\t"\
  354. "fpadd16 %%f6, %%f28, %%f6 \n\t"\
  355. "fpadd16 %%f8, %%f54, %%f8 \n\t"\
  356. "fpadd16 %%f10, %%f52, %%f10 \n\t"\
  357. "fpadd16 %%f12, %%f50, %%f12 \n\t"\
  358. "fpadd16 %%f14, %%f48, %%f14 \n\t"\
  359. "fpack16 %%f0, %%f0 \n\t"\
  360. "fpack16 %%f2, %%f2 \n\t"\
  361. "fpack16 %%f4, %%f4 \n\t"\
  362. "fpack16 %%f6, %%f6 \n\t"\
  363. "fpack16 %%f8, %%f8 \n\t"\
  364. "fpack16 %%f10, %%f10 \n\t"\
  365. "fpack16 %%f12, %%f12 \n\t"\
  366. "fpack16 %%f14, %%f14 \n\t"\
  367. "st %%f0, [%3+" dest "] \n\t"\
  368. "st %%f2, [%6+" dest "] \n\t"\
  369. "st %%f4, [%7+" dest "] \n\t"\
  370. "st %%f6, [%8+" dest "] \n\t"\
  371. "st %%f8, [%9+" dest "] \n\t"\
  372. "st %%f10, [%10+" dest "] \n\t"\
  373. "st %%f12, [%11+" dest "] \n\t"\
  374. "st %%f14, [%12+" dest "] \n\t"\
  375. void ff_simple_idct_vis(DCTELEM *data) {
  376. int out1, out2, out3, out4;
  377. DECLARE_ALIGNED(8, int16_t, temp)[8*8];
  378. __asm__ volatile(
  379. INIT_IDCT
  380. #define ADDROUNDER
  381. // shift right 16-4=12
  382. LOADSCALE("%2+8")
  383. IDCT4ROWS
  384. STOREROWS("%3+8")
  385. LOADSCALE("%2+0")
  386. IDCT4ROWS
  387. "std %%f48, [%3+112] \n\t"
  388. "std %%f50, [%3+96] \n\t"
  389. "std %%f52, [%3+80] \n\t"
  390. "std %%f54, [%3+64] \n\t"
  391. // shift right 16+4
  392. "ldd [%3+8], %%f18 \n\t"
  393. "ldd [%3+24], %%f22 \n\t"
  394. "ldd [%3+40], %%f26 \n\t"
  395. "ldd [%3+56], %%f30 \n\t"
  396. TRANSPOSE
  397. IDCT4ROWS
  398. SCALEROWS
  399. STOREROWS("%2+0")
  400. LOAD("%3+64")
  401. TRANSPOSE
  402. IDCT4ROWS
  403. SCALEROWS
  404. STOREROWS("%2+8")
  405. : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4)
  406. : "0" (scale), "1" (coeffs), "2" (data), "3" (temp)
  407. );
  408. }
  409. void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data) {
  410. int out1, out2, out3, out4, out5;
  411. int r1, r2, r3, r4, r5, r6, r7;
  412. __asm__ volatile(
  413. "wr %%g0, 0x8, %%gsr \n\t"
  414. INIT_IDCT
  415. "add %3, %4, %5 \n\t"
  416. "add %5, %4, %6 \n\t"
  417. "add %6, %4, %7 \n\t"
  418. "add %7, %4, %8 \n\t"
  419. "add %8, %4, %9 \n\t"
  420. "add %9, %4, %10 \n\t"
  421. "add %10, %4, %11 \n\t"
  422. // shift right 16-4=12
  423. LOADSCALE("%2+8")
  424. IDCT4ROWS
  425. STOREROWS("%2+8")
  426. LOADSCALE("%2+0")
  427. IDCT4ROWS
  428. "std %%f48, [%2+112] \n\t"
  429. "std %%f50, [%2+96] \n\t"
  430. "std %%f52, [%2+80] \n\t"
  431. "std %%f54, [%2+64] \n\t"
  432. #undef ADDROUNDER
  433. #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"
  434. // shift right 16+4
  435. "ldd [%2+8], %%f18 \n\t"
  436. "ldd [%2+24], %%f22 \n\t"
  437. "ldd [%2+40], %%f26 \n\t"
  438. "ldd [%2+56], %%f30 \n\t"
  439. TRANSPOSE
  440. IDCT4ROWS
  441. PUTPIXELSCLAMPED("0")
  442. LOAD("%2+64")
  443. TRANSPOSE
  444. IDCT4ROWS
  445. PUTPIXELSCLAMPED("4")
  446. : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5),
  447. "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
  448. : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size)
  449. );
  450. }
  451. void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data) {
  452. int out1, out2, out3, out4, out5, out6;
  453. int r1, r2, r3, r4, r5, r6, r7;
  454. __asm__ volatile(
  455. "wr %%g0, 0x8, %%gsr \n\t"
  456. INIT_IDCT
  457. "add %3, %4, %6 \n\t"
  458. "add %6, %4, %7 \n\t"
  459. "add %7, %4, %8 \n\t"
  460. "add %8, %4, %9 \n\t"
  461. "add %9, %4, %10 \n\t"
  462. "add %10, %4, %11 \n\t"
  463. "add %11, %4, %12 \n\t"
  464. #undef ADDROUNDER
  465. #define ADDROUNDER
  466. // shift right 16-4=12
  467. LOADSCALE("%2+8")
  468. IDCT4ROWS
  469. STOREROWS("%2+8")
  470. LOADSCALE("%2+0")
  471. IDCT4ROWS
  472. "std %%f48, [%2+112] \n\t"
  473. "std %%f50, [%2+96] \n\t"
  474. "std %%f52, [%2+80] \n\t"
  475. "std %%f54, [%2+64] \n\t"
  476. #undef ADDROUNDER
  477. #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"
  478. // shift right 16+4
  479. "ldd [%2+8], %%f18 \n\t"
  480. "ldd [%2+24], %%f22 \n\t"
  481. "ldd [%2+40], %%f26 \n\t"
  482. "ldd [%2+56], %%f30 \n\t"
  483. TRANSPOSE
  484. IDCT4ROWS
  485. ADDPIXELSCLAMPED("0")
  486. LOAD("%2+64")
  487. TRANSPOSE
  488. IDCT4ROWS
  489. ADDPIXELSCLAMPED("4")
  490. : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6),
  491. "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
  492. : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand)
  493. );
  494. }