simple_idct_arm.S 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. /*
  2. * simple_idct_arm.S
  3. * Copyright (C) 2002 Frederic 'dilb' Boulay.
  4. * All Rights Reserved.
  5. *
  6. * Author: Frederic Boulay <dilb@handhelds.org>
  7. *
  8. * The function defined in this file is derived from the simple_idct function
  9. * from the libavcodec library part of the FFmpeg project.
  10. *
  11. * This file is part of FFmpeg.
  12. *
  13. * FFmpeg is free software; you can redistribute it and/or
  14. * modify it under the terms of the GNU Lesser General Public
  15. * License as published by the Free Software Foundation; either
  16. * version 2.1 of the License, or (at your option) any later version.
  17. *
  18. * FFmpeg is distributed in the hope that it will be useful,
  19. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  21. * Lesser General Public License for more details.
  22. *
  23. * You should have received a copy of the GNU Lesser General Public
  24. * License along with FFmpeg; if not, write to the Free Software
  25. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  26. */
  27. /* Fixed-point weights of the algorithm: Wk = cos(k*pi/16) * sqrt(2) * 2^14, rounded. W4 is 16383, one below 2^14 -- NOTE(review): looks deliberate, confirm against the C simple_idct. */
  28. /* The same values are saved as 32-bit words in the __constant_ptr__ table at the end of the source code. */
  29. #define W1 22725
  30. #define W2 21407
  31. #define W3 19266
  32. #define W4 16383
  33. #define W5 12873
  34. #define W6 8867
  35. #define W7 4520
  36. #define MASK_MSHW 0xFFFF0000 /* keeps the most significant half-word of a 32-bit word */
  37. /* byte offsets of the constants in the __constant_ptr__ table (one 32-bit word each, same order as the .word list) */
  38. #define offW1 0
  39. #define offW2 4
  40. #define offW3 8
  41. #define offW4 12
  42. #define offW5 16
  43. #define offW6 20
  44. #define offW7 24
  45. #define offMASK_MSHW 28
  46. #define ROW_SHIFT 11 /* right shift applied to the row-pass results */
  47. #define ROW_SHIFT2MSHW (16 - ROW_SHIFT) /* left shift that puts (x >> ROW_SHIFT) directly into the upper half-word */
  48. #define COL_SHIFT 20 /* right shift applied to the column-pass results */
  49. #define ROW_SHIFTED_1 (1 << (ROW_SHIFT - 1)) /* rounding term added before the row shift (1024) */
  50. #define COL_SHIFTED_1 (1 << (COL_SHIFT - 1)) /* rounding term added before the column shift (524288) */
  51. .text
  52. .align
  53. .global simple_idct_ARM
  54. simple_idct_ARM:
  55. @@ void simple_idct_ARM(int16_t *block): in-place inverse DCT of an 8x8 block of 16-bit coefficients.
  56. @@ Pass 1 (__row_loop) transforms the 8 rows, pass 2 (__col_loop) the 8 columns, both from the last one down to the first.
  57. @@ In: R0=block. R0-R3 and R12 are scratch registers, so they are used freely without being saved,
  58. @@ but the block pointer arriving in R0 is needed later on, so it is spilled to the stack below.
  59. @@ The row pass reads and writes two 16-bit coefficients per 32-bit word and is only little-endian compliant.
  60. @@ save all the other registers used by this function
  61. stmfd sp!, {r4-r11, r14} @ R14 is also called LR
  62. @@ at this point, R0=block, other registers are free.
  63. add r14, r0, #112 @ R14=&block[8*7] (last row), better start from the last row, and decrease the value until row=0, i.e. R14=block.
  64. add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants (in ARM state PC reads as the address of this instruction + 8, hence the -8). NOTE(review): the offset has to fit an ARM immediate, so it depends on the code size between here and the table.
  65. @@ reserve 2 words of local variables on the stack; only sp[0] is used, it holds R0 (block)
  66. sub sp, sp, #8 @ allow 2 local variables
  67. str r0, [sp, #0] @ save block in sp[0]
  68. @@ stack status
  69. @@ sp+4 free
  70. @@ sp+0 R0 (block)
  71. @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
  72. __row_loop:
  73. @@ read the row as four 32-bit words and check if it is null, almost null, or not. According to the original author the strongarm specs make it unnecessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), and it leaves more usable registers :)
  74. ldr r1, [r14, #0] @ R1=((int32*)R14)[0]=ROWr32[0] (current row cast to a 32b pointer): ROWr16[0] in the low half, ROWr16[1] in the high half
  75. ldr r2, [r14, #4] @ R2=((int32*)R14)[1]=ROWr32[1] (ROWr16[2] and ROWr16[3])
  76. ldr r3, [r14, #8] @ R3=ROWr32[2] (ROWr16[4] and ROWr16[5])
  77. ldr r4, [r14, #12] @ R4=ROWr32[3] (ROWr16[6] and ROWr16[7])
  78. @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
  79. @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
  80. @@ else follow the complete algorithm.
  81. @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
  82. @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  83. orr r5, r4, r3 @ R5=R4 | R3
  84. orr r5, r5, r2 @ R5=R4 | R3 | R2
  85. orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null); R6 is only a throw-away destination here
  86. beq __end_row_loop
  87. mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
  88. ldrsh r6, [r14, #0] @ R6=ROWr16[0] (sign-extended)
  89. orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7, null when ROWr16[0] is the only non-null coefficient of the row
  90. beq __almost_empty_row
  91. __b_evaluation:
  92. @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
  93. @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
  94. @@ R12=__const_ptr_, R14=&block[n]
  95. @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
  96. @@ MUL16(b0, W1, row[1]);
  97. @@ MUL16(b1, W3, row[1]);
  98. @@ MUL16(b2, W5, row[1]);
  99. @@ MUL16(b3, W7, row[1]);
  100. @@ MAC16(b0, W3, row[3]);
  101. @@ MAC16(b1, -W7, row[3]);
  102. @@ MAC16(b2, -W1, row[3]);
  103. @@ MAC16(b3, -W5, row[3]);
  104. ldr r8, [r12, #offW1] @ R8=W1
  105. mov r2, r2, asr #16 @ R2=ROWr16[3] (high half of ROWr32[1])
  106. mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  107. ldr r9, [r12, #offW3] @ R9=W3
  108. ldr r10, [r12, #offW5] @ R10=W5
  109. mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  110. ldr r11, [r12, #offW7] @ R11=W7
  111. mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  112. mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  113. teq r2, #0 @ if null avoid muls (every NE-conditional instruction below is then skipped)
  114. mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  115. rsbne r2, r2, #0 @ R2=-ROWr16[3] (no S suffix, so the flags set by teq stay valid)
  116. mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  117. mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  118. mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  119. @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
  120. @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
  121. @@ R12=__const_ptr_, R14=&block[n]
  122. @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
  123. @@ if (temp != 0) {}
  124. orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3], kept in R2 because __a_evaluation tests it again
  125. beq __end_b_evaluation
  126. @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3=ROWr32[2], R4=ROWr32[3],
  127. @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
  128. @@ R12=__const_ptr_, R14=&block[n]
  129. @@ MAC16(b0, W5, row[5]);
  130. @@ MAC16(b2, W7, row[5]);
  131. @@ MAC16(b3, W3, row[5]);
  132. @@ MAC16(b1, -W1, row[5]);
  133. @@ MAC16(b0, W7, row[7]);
  134. @@ MAC16(b2, W3, row[7]);
  135. @@ MAC16(b3, -W1, row[7]);
  136. @@ MAC16(b1, -W5, row[7]);
  137. mov r3, r3, asr #16 @ R3=ROWr16[5]
  138. teq r3, #0 @ if null avoid muls
  139. mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
  140. mov r4, r4, asr #16 @ R4=ROWr16[7] (plain mov, the flags of the teq on R3 are untouched)
  141. mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
  142. mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
  143. rsbne r3, r3, #0 @ R3=-ROWr16[5]
  144. mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
  145. @@ R3 is free now
  146. teq r4, #0 @ if null avoid muls
  147. mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
  148. mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
  149. rsbne r4, r4, #0 @ R4=-ROWr16[7]
  150. mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
  151. mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
  152. @@ R4 is free now
  153. __end_b_evaluation:
  154. @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
  155. @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  156. @@ R12=__const_ptr_, R14=&block[n]
  157. __a_evaluation:
  158. @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
  159. @@ a1 = a0 + W6 * row[2];
  160. @@ a2 = a0 - W6 * row[2];
  161. @@ a3 = a0 - W2 * row[2];
  162. @@ a0 = a0 + W2 * row[2];
  163. ldr r9, [r12, #offW4] @ R9=W4
  164. mul r6, r9, r6 @ R6=W4*ROWr16[0]
  165. ldr r10, [r12, #offW6] @ R10=W6
  166. ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
  167. add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
  168. mul r11, r10, r4 @ R11=W6*ROWr16[2]
  169. ldr r8, [r12, #offW2] @ R8=W2
  170. sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
  171. @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
  172. @@ if (temp != 0) {}
  173. teq r2, #0 @ R2 still holds ROWr32[2] | ROWr32[3] from __b_evaluation
  174. beq __end_bef_a_evaluation @ row[4..7] all null: finish a0-a3 in the out-of-line stub placed after the epilogue, skipping the row[4] and row[6] terms
  175. add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
  176. mul r11, r8, r4 @ R11=W2*ROWr16[2]
  177. sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
  178. add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
  179. @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
  180. @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
  181. @@ R12=__const_ptr_, R14=&block[n]
  182. @@ a0 += W4*row[4]
  183. @@ a1 -= W4*row[4]
  184. @@ a2 -= W4*row[4]
  185. @@ a3 += W4*row[4]
  186. ldrsh r11, [r14, #8] @ R11=ROWr16[4]
  187. teq r11, #0 @ if null avoid muls
  188. mulne r11, r9, r11 @ R11=W4*ROWr16[4]
  189. @@ R9 is free now
  190. ldrsh r9, [r14, #12] @ R9=ROWr16[6] (a load leaves the flags alone, the NE conditions below still refer to ROWr16[4])
  191. addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
  192. subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
  193. subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
  194. addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
  195. @@ W6 alone (R10) is no more useful after the first mul below, R10 is reused to hold W2*ROWr16[6]
  196. teq r9, #0 @ if null avoid muls
  197. mulne r11, r10, r9 @ R11=W6*ROWr16[6]
  198. addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
  199. mulne r10, r8, r9 @ R10=W2*ROWr16[6]
  200. @@ a0 += W6*row[6];
  201. @@ a3 -= W6*row[6];
  202. @@ a1 -= W2*row[6];
  203. @@ a2 += W2*row[6];
  204. subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
  205. subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
  206. addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
  207. __end_a_evaluation:
  208. @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
  209. @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  210. @@ R12=__const_ptr_, R14=&block[n]
  211. @@ row[0] = (a0 + b0) >> ROW_SHIFT;
  212. @@ row[1] = (a1 + b1) >> ROW_SHIFT;
  213. @@ row[2] = (a2 + b2) >> ROW_SHIFT;
  214. @@ row[3] = (a3 + b3) >> ROW_SHIFT;
  215. @@ row[4] = (a3 - b3) >> ROW_SHIFT;
  216. @@ row[5] = (a2 - b2) >> ROW_SHIFT;
  217. @@ row[6] = (a1 - b1) >> ROW_SHIFT;
  218. @@ row[7] = (a0 - b0) >> ROW_SHIFT;
  219. add r8, r6, r0 @ R8=a0+b0
  220. add r9, r2, r1 @ R9=a1+b1
  221. @@ put 2 16 bits half-words in a 32bits word
  222. @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
  223. ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
  224. and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5), i.e. the 16 low bits of (a1+b1)>>11 placed in the upper half-word
  225. mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
  226. and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
  227. orr r8, r8, r9 @ R8=row[0] | (row[1]<<16)
  228. str r8, [r14, #0] @ ROWr32[0]=R8
  229. add r8, r3, r5 @ R8=a2+b2
  230. add r9, r4, r7 @ R9=a3+b3
  231. and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
  232. and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
  233. orr r8, r8, r9 @ R8=row[2] | (row[3]<<16)
  234. str r8, [r14, #4] @ ROWr32[1]=R8
  235. sub r8, r4, r7 @ R8=a3-b3
  236. sub r9, r3, r5 @ R9=a2-b2
  237. and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
  238. and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
  239. orr r8, r8, r9 @ R8=row[4] | (row[5]<<16)
  240. str r8, [r14, #8] @ ROWr32[2]=R8
  241. sub r8, r2, r1 @ R8=a1-b1
  242. sub r9, r6, r0 @ R9=a0-b0
  243. and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
  244. and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
  245. orr r8, r8, r9 @ R8=row[6] | (row[7]<<16)
  246. str r8, [r14, #12] @ ROWr32[3]=R8
  247. bal __end_row_loop @ jump over the almost-empty-row special case
  248. __almost_empty_row:
  249. @@ the row was empty, except ROWr16[0], now, management of this special case:
  250. @@ every coefficient of the row becomes (ROWr16[0]<<3) & 0xFFFF
  251. @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
  252. @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free
  253. mov r8, #0x10000 @ R8=0x10000, first of the 2 steps needed to get 0xFFFF; per the original author it saves a ldr call
  254. sub r8, r8, #1 @ R8=0xFFFF, R8 is now ready.
  255. and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
  256. orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16), the same value in both half-words
  257. str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
  258. str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
  259. str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
  260. str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
  261. __end_row_loop:
  262. @@ at this point, R0-R11 (free)
  263. @@ R12=__const_ptr_, R14=&block[n]
  264. ldr r0, [sp, #0] @ R0=block
  265. teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
  266. sub r14, r14, #16 @ step back one row (8 coefficients of 2 bytes); no S suffix, so bne still tests the teq result
  267. bne __row_loop
  268. @@ at this point, R0=block, R1-R11 (free)
  269. @@ R12=__const_ptr_, R14=block-16 (reloaded just below)
  270. add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
  271. __col_loop:
  272. __b_evaluation2:
  273. @@ at this point, R0=block (temp), R1-R11 (free)
  274. @@ R12=__const_ptr_, R14=&block[n]
  275. @@ proceed with b0-b3 first, followed by a0-a3
  276. @@ MUL16(b0, W1, col[8x1]);
  277. @@ MUL16(b1, W3, col[8x1]);
  278. @@ MUL16(b2, W5, col[8x1]);
  279. @@ MUL16(b3, W7, col[8x1]);
  280. @@ MAC16(b0, W3, col[8x3]);
  281. @@ MAC16(b1, -W7, col[8x3]);
  282. @@ MAC16(b2, -W1, col[8x3]);
  283. @@ MAC16(b3, -W5, col[8x3]);
  284. ldr r8, [r12, #offW1] @ R8=W1
  285. ldrsh r7, [r14, #16] @ R7=COLr16[8x1] (consecutive rows are 16 bytes apart)
  286. mul r0, r8, r7 @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
  287. ldr r9, [r12, #offW3] @ R9=W3
  288. ldr r10, [r12, #offW5] @ R10=W5
  289. mul r1, r9, r7 @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
  290. ldr r11, [r12, #offW7] @ R11=W7
  291. mul r5, r10, r7 @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
  292. ldrsh r2, [r14, #48] @ R2=COLr16[8x3]
  293. mul r7, r11, r7 @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
  294. teq r2, #0 @ if 0, then avoid muls
  295. mlane r0, r9, r2, r0 @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
  296. rsbne r2, r2, #0 @ R2=-COLr16[8x3]
  297. mlane r1, r11, r2, r1 @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
  298. mlane r5, r8, r2, r5 @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
  299. mlane r7, r10, r2, r7 @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
  300. @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
  301. @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
  302. @@ R12=__const_ptr_, R14=&block[n]
  303. @@ MAC16(b0, W5, col[5x8]);
  304. @@ MAC16(b2, W7, col[5x8]);
  305. @@ MAC16(b3, W3, col[5x8]);
  306. @@ MAC16(b1, -W1, col[5x8]);
  307. @@ MAC16(b0, W7, col[7x8]);
  308. @@ MAC16(b2, W3, col[7x8]);
  309. @@ MAC16(b3, -W1, col[7x8]);
  310. @@ MAC16(b1, -W5, col[7x8]);
  311. ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
  312. teq r3, #0 @ if 0 then avoid muls
  313. mlane r0, r10, r3, r0 @ R0+=W5*COLr16[5x8]=b0
  314. mlane r5, r11, r3, r5 @ R5+=W7*COLr16[5x8]=b2
  315. mlane r7, r9, r3, r7 @ R7+=W3*COLr16[5x8]=b3
  316. rsbne r3, r3, #0 @ R3=-COLr16[5x8]
  317. ldrsh r4, [r14, #112] @ R4=COLr16[7x8] (a load leaves the flags alone, the next mlane still depends on COLr16[5x8])
  318. mlane r1, r8, r3, r1 @ R1-=W1*COLr16[5x8]=b1
  319. @@ R3 is free now
  320. teq r4, #0 @ if 0 then avoid muls
  321. mlane r0, r11, r4, r0 @ R0+=W7*COLr16[7x8]=b0
  322. mlane r5, r9, r4, r5 @ R5+=W3*COLr16[7x8]=b2
  323. rsbne r4, r4, #0 @ R4=-COLr16[7x8]
  324. mlane r7, r8, r4, r7 @ R7-=W1*COLr16[7x8]=b3
  325. mlane r1, r10, r4, r1 @ R1-=W5*COLr16[7x8]=b1
  326. @@ R4 is free now
  327. __end_b_evaluation2:
  328. @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
  329. @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  330. @@ R12=__const_ptr_, R14=&block[n]
  331. __a_evaluation2:
  332. @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
  333. @@ a1 = a0 + W6 * col[8x2];
  334. @@ a2 = a0 - W6 * col[8x2];
  335. @@ a3 = a0 - W2 * col[8x2];
  336. @@ a0 = a0 + W2 * col[8x2];
  337. ldrsh r6, [r14, #0] @ R6=COLr16[8x0]
  338. ldr r9, [r12, #offW4] @ R9=W4
  339. mul r6, r9, r6 @ R6=W4*COLr16[8x0]
  340. ldr r10, [r12, #offW6] @ R10=W6
  341. ldrsh r4, [r14, #32] @ R4=COLr16[8x2] (a3 not defined yet)
  342. add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
  343. mul r11, r10, r4 @ R11=W6*COLr16[8x2]
  344. ldr r8, [r12, #offW2] @ R8=W2
  345. add r2, r6, r11 @ R2=a0+W6*COLr16[8x2] (a1)
  346. sub r3, r6, r11 @ R3=a0-W6*COLr16[8x2] (a2)
  347. mul r11, r8, r4 @ R11=W2*COLr16[8x2]
  348. sub r4, r6, r11 @ R4=a0-W2*COLr16[8x2] (a3)
  349. add r6, r6, r11 @ R6=a0+W2*COLr16[8x2] (a0)
  350. @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
  351. @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
  352. @@ R12=__const_ptr_, R14=&block[n]
  353. @@ a0 += W4*col[8x4]
  354. @@ a1 -= W4*col[8x4]
  355. @@ a2 -= W4*col[8x4]
  356. @@ a3 += W4*col[8x4]
  357. ldrsh r11, [r14, #64] @ R11=COLr16[8x4]
  358. teq r11, #0 @ if null avoid muls
  359. mulne r11, r9, r11 @ R11=W4*COLr16[8x4]
  360. @@ R9 is free now
  361. addne r6, r6, r11 @ R6+=W4*COLr16[8x4] (a0)
  362. subne r2, r2, r11 @ R2-=W4*COLr16[8x4] (a1)
  363. subne r3, r3, r11 @ R3-=W4*COLr16[8x4] (a2)
  364. ldrsh r9, [r14, #96] @ R9=COLr16[8x6] (a load leaves the flags alone, the next addne still depends on COLr16[8x4])
  365. addne r4, r4, r11 @ R4+=W4*COLr16[8x4] (a3)
  366. @@ W6 alone (R10) is no more useful after the first mul below, R10 is reused to hold W2*COLr16[8x6]
  367. teq r9, #0 @ if null avoid muls
  368. mulne r11, r10, r9 @ R11=W6*COLr16[8x6]
  369. addne r6, r6, r11 @ R6+=W6*COLr16[8x6] (a0)
  370. mulne r10, r8, r9 @ R10=W2*COLr16[8x6]
  371. @@ a0 += W6*col[8x6];
  372. @@ a3 -= W6*col[8x6];
  373. @@ a1 -= W2*col[8x6];
  374. @@ a2 += W2*col[8x6];
  375. subne r4, r4, r11 @ R4-=W6*COLr16[8x6] (a3)
  376. subne r2, r2, r10 @ R2-=W2*COLr16[8x6] (a1)
  377. addne r3, r3, r10 @ R3+=W2*COLr16[8x6] (a2)
  378. __end_a_evaluation2:
  379. @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
  380. @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  381. @@ R12=__const_ptr_, R14=&block[n]
  382. @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
  383. @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
  384. @@ col[16] = ((a2 + b2) >> COL_SHIFT);
  385. @@ col[24] = ((a3 + b3) >> COL_SHIFT);
  386. @@ col[32] = ((a3 - b3) >> COL_SHIFT);
  387. @@ col[40] = ((a2 - b2) >> COL_SHIFT);
  388. @@ col[48] = ((a1 - b1) >> COL_SHIFT);
  389. @@ col[56] = ((a0 - b0) >> COL_SHIFT);
  390. @@@@@ no optimisation here: each result is shifted and stored as a separate half-word @@@@@
  391. add r8, r6, r0 @ R8=a0+b0
  392. add r9, r2, r1 @ R9=a1+b1
  393. mov r8, r8, asr #COL_SHIFT
  394. mov r9, r9, asr #COL_SHIFT
  395. strh r8, [r14, #0] @ col[0 ]
  396. strh r9, [r14, #16] @ col[8 ]
  397. add r8, r3, r5 @ R8=a2+b2
  398. add r9, r4, r7 @ R9=a3+b3
  399. mov r8, r8, asr #COL_SHIFT
  400. mov r9, r9, asr #COL_SHIFT
  401. strh r8, [r14, #32] @ col[16]
  402. strh r9, [r14, #48] @ col[24]
  403. sub r8, r4, r7 @ R8=a3-b3
  404. sub r9, r3, r5 @ R9=a2-b2
  405. mov r8, r8, asr #COL_SHIFT
  406. mov r9, r9, asr #COL_SHIFT
  407. strh r8, [r14, #64] @ col[32]
  408. strh r9, [r14, #80] @ col[40]
  409. sub r8, r2, r1 @ R8=a1-b1
  410. sub r9, r6, r0 @ R9=a0-b0
  411. mov r8, r8, asr #COL_SHIFT
  412. mov r9, r9, asr #COL_SHIFT
  413. strh r8, [r14, #96] @ col[48]
  414. strh r9, [r14, #112] @ col[56]
  415. __end_col_loop:
  416. @@ at this point, R0-R11 (free)
  417. @@ R12=__const_ptr_, R14=&block[n]
  418. ldr r0, [sp, #0] @ R0=block
  419. teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
  420. sub r14, r14, #2 @ step back one column (one 16-bit coefficient); no S suffix, so bne still tests the teq result
  421. bne __col_loop
  422. __end_simple_idct_ARM:
  423. @@ restore registers to previous status!
  424. add sp, sp, #8 @@ drop the 2 local variables!
  425. ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and return: the LR value saved by the stmfd is loaded straight into PC.
  426. @@ kind of sub-function, kept out of line here not to overload the common case: reached from __a_evaluation when row[4..7] are all null,
  427. __end_bef_a_evaluation:
  428. add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
  429. mul r11, r8, r4 @ R11=W2*ROWr16[2]
  430. sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
  431. add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
  432. bal __end_a_evaluation @ back to the row output code with R2=a1, R3=a2, R4=a3, R6=a0
  433. .align @ align first, so that the label below always marks the first word of the table and never any padding
  434. __constant_ptr__: @@ constants read through R12, see #defines at the beginning of the source code for values; the word order must match offW1..offMASK_MSHW.
  435. .word W1 @ at offW1
  436. .word W2 @ at offW2
  437. .word W3 @ at offW3
  438. .word W4 @ at offW4
  439. .word W5 @ at offW5
  440. .word W6 @ at offW6
  441. .word W7 @ at offW7
  442. .word MASK_MSHW @ at offMASK_MSHW