simple_idct_armv6.S 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. /*
  2. * Simple IDCT
  3. *
  4. * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
  5. * Copyright (c) 2007 Mans Rullgard <mru@inprovide.com>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  24. #define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  25. #define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  26. #define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  27. #define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  28. #define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  29. #define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
  30. #define ROW_SHIFT 11
  31. #define COL_SHIFT 20
  32. #define W13 (W1 | (W3 << 16))
  33. #define W26 (W2 | (W6 << 16))
  34. #define W42 (W4 | (W2 << 16))
  35. #define W42n (-W4&0xffff | (-W2 << 16))
  36. #define W46 (W4 | (W6 << 16))
  37. #define W57 (W5 | (W7 << 16))
  38. .text
  39. .align
  40. w13: .long W13
  41. w26: .long W26
  42. w42: .long W42
  43. w42n: .long W42n
  44. w46: .long W46
  45. w57: .long W57
  46. /*
  47. Compute partial IDCT of single row.
  48. shift = left-shift amount
  49. a1 = source address
  50. a3 = row[2,0] <= 2 cycles
  51. a4 = row[3,1]
  52. ip = w42 <= 2 cycles
  53. Output in registers v1--v8
  54. */
  55. .macro idct_row shift
  56. ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */
  57. mov a2, #(1<<(\shift-1))
  58. smlad v1, a3, ip, a2
  59. smlsd v4, a3, ip, a2
  60. ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */
  61. ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */
  62. smlad v2, a3, lr, a2
  63. smlsd v3, a3, lr, a2
  64. smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */
  65. smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */
  66. ldr lr, [a1, #12] /* lr = row[7,5] */
  67. pkhtb a3, ip, v7, asr #16 /* a4 = W7 | (W3 << 16) */
  68. pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */
  69. smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */
  70. smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */
  71. smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */
  72. ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */
  73. smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */
  74. ldr a3, [a1, #4] /* a3 = row[6,4] */
  75. smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */
  76. ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */
  77. smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */
  78. smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */
  79. smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */
  80. smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */
  81. smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */
  82. .endm
  83. /*
  84. Compute partial IDCT of half row.
  85. shift = left-shift amount
  86. a3 = row[2,0]
  87. a4 = row[3,1]
  88. ip = w42
  89. Output in registers v1--v8
  90. */
  91. .macro idct_row4 shift
  92. ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */
  93. ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */
  94. mov a2, #(1<<(\shift-1))
  95. smlad v1, a3, ip, a2
  96. smlsd v4, a3, ip, a2
  97. ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */
  98. smlad v2, a3, lr, a2
  99. smlsd v3, a3, lr, a2
  100. smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */
  101. smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */
  102. pkhtb a3, ip, v7, asr #16 /* a4 = W7 | (W3 << 16) */
  103. pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */
  104. smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */
  105. smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */
  106. .endm
  107. /*
  108. Compute final part of IDCT single row without shift.
  109. Input in registers v1--v8
  110. Output in registers ip, v1--v3, lr, v5--v7
  111. */
  112. .macro idct_finish
  113. add ip, v1, v5 /* a2 = A0 + B0 */
  114. sub lr, v1, v5 /* a3 = A0 - B0 */
  115. sub v1, v2, v6 /* a3 = A1 + B1 */
  116. add v5, v2, v6 /* a3 = A1 - B1 */
  117. add v2, v3, v7 /* a2 = A2 + B2 */
  118. sub v6, v3, v7 /* a2 = A2 - B2 */
  119. add v3, v4, fp /* a3 = A3 + B3 */
  120. sub v7, v4, fp /* a3 = A3 - B3 */
  121. .endm
  122. /*
  123. Compute final part of IDCT single row.
  124. shift = right-shift amount
  125. Input/output in registers v1--v8
  126. */
  127. .macro idct_finish_shift shift
  128. add a4, v1, v5 /* a4 = A0 + B0 */
  129. sub a3, v1, v5 /* a3 = A0 - B0 */
  130. mov v1, a4, asr #\shift
  131. mov v5, a3, asr #\shift
  132. sub a4, v2, v6 /* a4 = A1 + B1 */
  133. add a3, v2, v6 /* a3 = A1 - B1 */
  134. mov v2, a4, asr #\shift
  135. mov v6, a3, asr #\shift
  136. add a4, v3, v7 /* a4 = A2 + B2 */
  137. sub a3, v3, v7 /* a3 = A2 - B2 */
  138. mov v3, a4, asr #\shift
  139. mov v7, a3, asr #\shift
  140. add a4, v4, fp /* a4 = A3 + B3 */
  141. sub a3, v4, fp /* a3 = A3 - B3 */
  142. mov v4, a4, asr #\shift
  143. mov fp, a3, asr #\shift
  144. .endm
  145. /*
  146. Compute final part of IDCT single row, saturating results at 8 bits.
  147. shift = right-shift amount
  148. Input/output in registers v1--v8
  149. */
  150. .macro idct_finish_shift_sat shift
  151. add a4, v1, v5 /* a4 = A0 + B0 */
  152. sub ip, v1, v5 /* ip = A0 - B0 */
  153. usat v1, #8, a4, asr #\shift
  154. usat v5, #8, ip, asr #\shift
  155. sub a4, v2, v6 /* a4 = A1 + B1 */
  156. add ip, v2, v6 /* ip = A1 - B1 */
  157. usat v2, #8, a4, asr #\shift
  158. usat v6, #8, ip, asr #\shift
  159. add a4, v3, v7 /* a4 = A2 + B2 */
  160. sub ip, v3, v7 /* ip = A2 - B2 */
  161. usat v3, #8, a4, asr #\shift
  162. usat v7, #8, ip, asr #\shift
  163. add a4, v4, fp /* a4 = A3 + B3 */
  164. sub ip, v4, fp /* ip = A3 - B3 */
  165. usat v4, #8, a4, asr #\shift
  166. usat fp, #8, ip, asr #\shift
  167. .endm
  168. /*
  169. Compute IDCT of single row, storing as column.
  170. a1 = source
  171. a2 = dest
  172. */
  173. .align
  174. .func idct_row_armv6
  175. idct_row_armv6:
  176. str lr, [sp, #-4]!
  177. ldr lr, [a1, #12] /* lr = row[7,5] */
  178. ldr ip, [a1, #4] /* ip = row[6,4] */
  179. ldr a4, [a1, #8] /* a4 = row[3,1] */
  180. ldr a3, [a1] /* a3 = row[2,0] */
  181. orrs lr, lr, ip
  182. cmpeq lr, a4
  183. cmpeq lr, a3, lsr #16
  184. beq 1f
  185. str a2, [sp, #-4]!
  186. ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
  187. cmp lr, #0
  188. beq 2f
  189. idct_row ROW_SHIFT
  190. b 3f
  191. 2: idct_row4 ROW_SHIFT
  192. 3: ldr a2, [sp], #4
  193. idct_finish_shift ROW_SHIFT
  194. strh v1, [a2]
  195. strh v2, [a2, #(16*2)]
  196. strh v3, [a2, #(16*4)]
  197. strh v4, [a2, #(16*6)]
  198. strh fp, [a2, #(16*1)]
  199. strh v7, [a2, #(16*3)]
  200. strh v6, [a2, #(16*5)]
  201. strh v5, [a2, #(16*7)]
  202. ldr pc, [sp], #4
  203. 1: mov a3, a3, lsl #3
  204. strh a3, [a2]
  205. strh a3, [a2, #(16*2)]
  206. strh a3, [a2, #(16*4)]
  207. strh a3, [a2, #(16*6)]
  208. strh a3, [a2, #(16*1)]
  209. strh a3, [a2, #(16*3)]
  210. strh a3, [a2, #(16*5)]
  211. strh a3, [a2, #(16*7)]
  212. ldr pc, [sp], #4
  213. .endfunc
  214. /*
  215. Compute IDCT of single column, read as row.
  216. a1 = source
  217. a2 = dest
  218. */
  219. .align
  220. .func idct_col_armv6
  221. idct_col_armv6:
  222. stmfd sp!, {a2, lr}
  223. ldr a3, [a1] /* a3 = row[2,0] */
  224. ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
  225. ldr a4, [a1, #8] /* a4 = row[3,1] */
  226. idct_row COL_SHIFT
  227. ldr a2, [sp], #4
  228. idct_finish_shift COL_SHIFT
  229. strh v1, [a2]
  230. strh v2, [a2, #(16*1)]
  231. strh v3, [a2, #(16*2)]
  232. strh v4, [a2, #(16*3)]
  233. strh fp, [a2, #(16*4)]
  234. strh v7, [a2, #(16*5)]
  235. strh v6, [a2, #(16*6)]
  236. strh v5, [a2, #(16*7)]
  237. ldr pc, [sp], #4
  238. .endfunc
  239. /*
  240. Compute IDCT of single column, read as row, store saturated 8-bit.
  241. a1 = source
  242. a2 = dest
  243. a3 = line size
  244. */
  245. .align
  246. .func idct_col_put_armv6
  247. idct_col_put_armv6:
  248. stmfd sp!, {a2, a3, lr}
  249. ldr a3, [a1] /* a3 = row[2,0] */
  250. ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
  251. ldr a4, [a1, #8] /* a4 = row[3,1] */
  252. idct_row COL_SHIFT
  253. ldmfd sp!, {a2, a3}
  254. idct_finish_shift_sat COL_SHIFT
  255. strb v1, [a2], a3
  256. strb v2, [a2], a3
  257. strb v3, [a2], a3
  258. strb v4, [a2], a3
  259. strb fp, [a2], a3
  260. strb v7, [a2], a3
  261. strb v6, [a2], a3
  262. strb v5, [a2], a3
  263. sub a2, a2, a3, lsl #3
  264. ldr pc, [sp], #4
  265. .endfunc
  266. /*
  267. Compute IDCT of single column, read as row, add/store saturated 8-bit.
  268. a1 = source
  269. a2 = dest
  270. a3 = line size
  271. */
  272. .align
  273. .func idct_col_add_armv6
  274. idct_col_add_armv6:
  275. stmfd sp!, {a2, a3, lr}
  276. ldr a3, [a1] /* a3 = row[2,0] */
  277. ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
  278. ldr a4, [a1, #8] /* a4 = row[3,1] */
  279. idct_row COL_SHIFT
  280. ldmfd sp!, {a2, a3}
  281. idct_finish
  282. ldrb a4, [a2]
  283. ldrb v4, [a2, a3]
  284. ldrb fp, [a2, a3, lsl #2]
  285. add ip, a4, ip, asr #COL_SHIFT
  286. usat ip, #8, ip
  287. add v1, v4, v1, asr #COL_SHIFT
  288. strb ip, [a2], a3
  289. ldrb ip, [a2, a3]
  290. usat v1, #8, v1
  291. ldrb fp, [a2, a3, lsl #2]
  292. add v2, ip, v2, asr #COL_SHIFT
  293. usat v2, #8, v2
  294. strb v1, [a2], a3
  295. ldrb a4, [a2, a3]
  296. ldrb ip, [a2, a3, lsl #2]
  297. strb v2, [a2], a3
  298. ldrb v4, [a2, a3]
  299. ldrb v1, [a2, a3, lsl #2]
  300. add v3, a4, v3, asr #COL_SHIFT
  301. usat v3, #8, v3
  302. add v7, v4, v7, asr #COL_SHIFT
  303. usat v7, #8, v7
  304. add v6, fp, v6, asr #COL_SHIFT
  305. usat v6, #8, v6
  306. add v5, ip, v5, asr #COL_SHIFT
  307. usat v5, #8, v5
  308. add lr, v1, lr, asr #COL_SHIFT
  309. usat lr, #8, lr
  310. strb v3, [a2], a3
  311. strb v7, [a2], a3
  312. strb v6, [a2], a3
  313. strb v5, [a2], a3
  314. strb lr, [a2], a3
  315. sub a2, a2, a3, lsl #3
  316. ldr pc, [sp], #4
  317. .endfunc
  318. /*
  319. Compute 8 IDCT row transforms.
  320. func = IDCT row->col function
  321. width = width of columns in bytes
  322. */
  323. .macro idct_rows func width
  324. bl \func
  325. add a1, a1, #(16*2)
  326. add a2, a2, #\width
  327. bl \func
  328. add a1, a1, #(16*2)
  329. add a2, a2, #\width
  330. bl \func
  331. add a1, a1, #(16*2)
  332. add a2, a2, #\width
  333. bl \func
  334. sub a1, a1, #(16*5)
  335. add a2, a2, #\width
  336. bl \func
  337. add a1, a1, #(16*2)
  338. add a2, a2, #\width
  339. bl \func
  340. add a1, a1, #(16*2)
  341. add a2, a2, #\width
  342. bl \func
  343. add a1, a1, #(16*2)
  344. add a2, a2, #\width
  345. bl \func
  346. sub a1, a1, #(16*7)
  347. .endm
  348. .align
  349. .global ff_simple_idct_armv6
  350. .func ff_simple_idct_armv6
  351. /* void ff_simple_idct_armv6(DCTELEM *data); */
  352. ff_simple_idct_armv6:
  353. stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
  354. sub sp, sp, #128
  355. mov a2, sp
  356. idct_rows idct_row_armv6, 2
  357. mov a2, a1
  358. mov a1, sp
  359. idct_rows idct_col_armv6, 2
  360. add sp, sp, #128
  361. ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
  362. .endfunc
  363. .align
  364. .global ff_simple_idct_add_armv6
  365. .func ff_simple_idct_add_armv6
  366. /* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
  367. ff_simple_idct_add_armv6:
  368. stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
  369. sub sp, sp, #128
  370. mov a1, a3
  371. mov a2, sp
  372. idct_rows idct_row_armv6, 2
  373. mov a1, sp
  374. ldr a2, [sp, #128]
  375. ldr a3, [sp, #(128+4)]
  376. idct_rows idct_col_add_armv6, 1
  377. add sp, sp, #(128+8)
  378. ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
  379. .endfunc
  380. .align
  381. .global ff_simple_idct_put_armv6
  382. .func ff_simple_idct_put_armv6
  383. /* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
  384. ff_simple_idct_put_armv6:
  385. stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
  386. sub sp, sp, #128
  387. mov a1, a3
  388. mov a2, sp
  389. idct_rows idct_row_armv6, 2
  390. mov a1, sp
  391. ldr a2, [sp, #128]
  392. ldr a3, [sp, #(128+4)]
  393. idct_rows idct_col_put_armv6, 1
  394. add sp, sp, #(128+8)
  395. ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
  396. .endfunc