/* dsputil_iwmmxt_rnd.h */
  /*
   * iWMMXt optimized DSP utils
   * copyright (c) 2004 AGAWA Koji
   *
   * This file is part of FFmpeg.
   *
   * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
   * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  /* This header intentionally has no multiple inclusion guards. It is meant to
   * be included multiple times and generates different code depending on the
   * value of certain #defines. */
  24. void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  25. {
  26. int stride = line_size;
  27. __asm__ volatile (
  28. "and r12, %[pixels], #7 \n\t"
  29. "bic %[pixels], %[pixels], #7 \n\t"
  30. "tmcr wcgr1, r12 \n\t"
  31. "add r4, %[pixels], %[line_size] \n\t"
  32. "add r5, %[block], %[line_size] \n\t"
  33. "mov %[line_size], %[line_size], lsl #1 \n\t"
  34. "1: \n\t"
  35. "wldrd wr0, [%[pixels]] \n\t"
  36. "subs %[h], %[h], #2 \n\t"
  37. "wldrd wr1, [%[pixels], #8] \n\t"
  38. "add %[pixels], %[pixels], %[line_size] \n\t"
  39. "wldrd wr3, [r4] \n\t"
  40. "pld [%[pixels]] \n\t"
  41. "pld [%[pixels], #32] \n\t"
  42. "wldrd wr4, [r4, #8] \n\t"
  43. "add r4, r4, %[line_size] \n\t"
  44. "walignr1 wr8, wr0, wr1 \n\t"
  45. "pld [r4] \n\t"
  46. "pld [r4, #32] \n\t"
  47. "walignr1 wr10, wr3, wr4 \n\t"
  48. "wstrd wr8, [%[block]] \n\t"
  49. "add %[block], %[block], %[line_size] \n\t"
  50. "wstrd wr10, [r5] \n\t"
  51. "add r5, r5, %[line_size] \n\t"
  52. "bne 1b \n\t"
  53. : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  54. :
  55. : "memory", "r4", "r5", "r12");
  56. }
  57. void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  58. {
  59. int stride = line_size;
  60. __asm__ volatile (
  61. "and r12, %[pixels], #7 \n\t"
  62. "bic %[pixels], %[pixels], #7 \n\t"
  63. "tmcr wcgr1, r12 \n\t"
  64. "add r4, %[pixels], %[line_size] \n\t"
  65. "add r5, %[block], %[line_size] \n\t"
  66. "mov %[line_size], %[line_size], lsl #1 \n\t"
  67. "1: \n\t"
  68. "wldrd wr0, [%[pixels]] \n\t"
  69. "subs %[h], %[h], #2 \n\t"
  70. "wldrd wr1, [%[pixels], #8] \n\t"
  71. "add %[pixels], %[pixels], %[line_size] \n\t"
  72. "wldrd wr3, [r4] \n\t"
  73. "pld [%[pixels]] \n\t"
  74. "pld [%[pixels], #32] \n\t"
  75. "wldrd wr4, [r4, #8] \n\t"
  76. "add r4, r4, %[line_size] \n\t"
  77. "walignr1 wr8, wr0, wr1 \n\t"
  78. "wldrd wr0, [%[block]] \n\t"
  79. "wldrd wr2, [r5] \n\t"
  80. "pld [r4] \n\t"
  81. "pld [r4, #32] \n\t"
  82. "walignr1 wr10, wr3, wr4 \n\t"
  83. WAVG2B" wr8, wr8, wr0 \n\t"
  84. WAVG2B" wr10, wr10, wr2 \n\t"
  85. "wstrd wr8, [%[block]] \n\t"
  86. "add %[block], %[block], %[line_size] \n\t"
  87. "wstrd wr10, [r5] \n\t"
  88. "pld [%[block]] \n\t"
  89. "pld [%[block], #32] \n\t"
  90. "add r5, r5, %[line_size] \n\t"
  91. "pld [r5] \n\t"
  92. "pld [r5, #32] \n\t"
  93. "bne 1b \n\t"
  94. : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  95. :
  96. : "memory", "r4", "r5", "r12");
  97. }
  98. void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  99. {
  100. int stride = line_size;
  101. __asm__ volatile (
  102. "and r12, %[pixels], #7 \n\t"
  103. "bic %[pixels], %[pixels], #7 \n\t"
  104. "tmcr wcgr1, r12 \n\t"
  105. "add r4, %[pixels], %[line_size] \n\t"
  106. "add r5, %[block], %[line_size] \n\t"
  107. "mov %[line_size], %[line_size], lsl #1 \n\t"
  108. "1: \n\t"
  109. "wldrd wr0, [%[pixels]] \n\t"
  110. "wldrd wr1, [%[pixels], #8] \n\t"
  111. "subs %[h], %[h], #2 \n\t"
  112. "wldrd wr2, [%[pixels], #16] \n\t"
  113. "add %[pixels], %[pixels], %[line_size] \n\t"
  114. "wldrd wr3, [r4] \n\t"
  115. "pld [%[pixels]] \n\t"
  116. "pld [%[pixels], #32] \n\t"
  117. "walignr1 wr8, wr0, wr1 \n\t"
  118. "wldrd wr4, [r4, #8] \n\t"
  119. "walignr1 wr9, wr1, wr2 \n\t"
  120. "wldrd wr5, [r4, #16] \n\t"
  121. "add r4, r4, %[line_size] \n\t"
  122. "pld [r4] \n\t"
  123. "pld [r4, #32] \n\t"
  124. "walignr1 wr10, wr3, wr4 \n\t"
  125. "wstrd wr8, [%[block]] \n\t"
  126. "walignr1 wr11, wr4, wr5 \n\t"
  127. "wstrd wr9, [%[block], #8] \n\t"
  128. "add %[block], %[block], %[line_size] \n\t"
  129. "wstrd wr10, [r5] \n\t"
  130. "wstrd wr11, [r5, #8] \n\t"
  131. "add r5, r5, %[line_size] \n\t"
  132. "bne 1b \n\t"
  133. : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  134. :
  135. : "memory", "r4", "r5", "r12");
  136. }
  137. void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  138. {
  139. int stride = line_size;
  140. __asm__ volatile (
  141. "pld [%[pixels]] \n\t"
  142. "pld [%[pixels], #32] \n\t"
  143. "pld [%[block]] \n\t"
  144. "pld [%[block], #32] \n\t"
  145. "and r12, %[pixels], #7 \n\t"
  146. "bic %[pixels], %[pixels], #7 \n\t"
  147. "tmcr wcgr1, r12 \n\t"
  148. "add r4, %[pixels], %[line_size]\n\t"
  149. "add r5, %[block], %[line_size] \n\t"
  150. "mov %[line_size], %[line_size], lsl #1 \n\t"
  151. "1: \n\t"
  152. "wldrd wr0, [%[pixels]] \n\t"
  153. "wldrd wr1, [%[pixels], #8] \n\t"
  154. "subs %[h], %[h], #2 \n\t"
  155. "wldrd wr2, [%[pixels], #16] \n\t"
  156. "add %[pixels], %[pixels], %[line_size] \n\t"
  157. "wldrd wr3, [r4] \n\t"
  158. "pld [%[pixels]] \n\t"
  159. "pld [%[pixels], #32] \n\t"
  160. "walignr1 wr8, wr0, wr1 \n\t"
  161. "wldrd wr4, [r4, #8] \n\t"
  162. "walignr1 wr9, wr1, wr2 \n\t"
  163. "wldrd wr5, [r4, #16] \n\t"
  164. "add r4, r4, %[line_size] \n\t"
  165. "wldrd wr0, [%[block]] \n\t"
  166. "pld [r4] \n\t"
  167. "wldrd wr1, [%[block], #8] \n\t"
  168. "pld [r4, #32] \n\t"
  169. "wldrd wr2, [r5] \n\t"
  170. "walignr1 wr10, wr3, wr4 \n\t"
  171. "wldrd wr3, [r5, #8] \n\t"
  172. WAVG2B" wr8, wr8, wr0 \n\t"
  173. WAVG2B" wr9, wr9, wr1 \n\t"
  174. WAVG2B" wr10, wr10, wr2 \n\t"
  175. "wstrd wr8, [%[block]] \n\t"
  176. "walignr1 wr11, wr4, wr5 \n\t"
  177. WAVG2B" wr11, wr11, wr3 \n\t"
  178. "wstrd wr9, [%[block], #8] \n\t"
  179. "add %[block], %[block], %[line_size] \n\t"
  180. "wstrd wr10, [r5] \n\t"
  181. "pld [%[block]] \n\t"
  182. "pld [%[block], #32] \n\t"
  183. "wstrd wr11, [r5, #8] \n\t"
  184. "add r5, r5, %[line_size] \n\t"
  185. "pld [r5] \n\t"
  186. "pld [r5, #32] \n\t"
  187. "bne 1b \n\t"
  188. : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  189. :
  190. : "memory", "r4", "r5", "r12");
  191. }
  192. void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  193. {
  194. int stride = line_size;
  195. // [wr0 wr1 wr2 wr3] for previous line
  196. // [wr4 wr5 wr6 wr7] for current line
  197. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  198. __asm__ volatile(
  199. "pld [%[pixels]] \n\t"
  200. "pld [%[pixels], #32] \n\t"
  201. "and r12, %[pixels], #7 \n\t"
  202. "bic %[pixels], %[pixels], #7 \n\t"
  203. "tmcr wcgr1, r12 \n\t"
  204. "add r12, r12, #1 \n\t"
  205. "add r4, %[pixels], %[line_size]\n\t"
  206. "tmcr wcgr2, r12 \n\t"
  207. "add r5, %[block], %[line_size] \n\t"
  208. "mov %[line_size], %[line_size], lsl #1 \n\t"
  209. "1: \n\t"
  210. "wldrd wr10, [%[pixels]] \n\t"
  211. "cmp r12, #8 \n\t"
  212. "wldrd wr11, [%[pixels], #8] \n\t"
  213. "add %[pixels], %[pixels], %[line_size] \n\t"
  214. "wldrd wr13, [r4] \n\t"
  215. "pld [%[pixels]] \n\t"
  216. "wldrd wr14, [r4, #8] \n\t"
  217. "pld [%[pixels], #32] \n\t"
  218. "add r4, r4, %[line_size] \n\t"
  219. "walignr1 wr0, wr10, wr11 \n\t"
  220. "pld [r4] \n\t"
  221. "pld [r4, #32] \n\t"
  222. "walignr1 wr2, wr13, wr14 \n\t"
  223. "wmoveq wr4, wr11 \n\t"
  224. "wmoveq wr6, wr14 \n\t"
  225. "walignr2ne wr4, wr10, wr11 \n\t"
  226. "walignr2ne wr6, wr13, wr14 \n\t"
  227. WAVG2B" wr0, wr0, wr4 \n\t"
  228. WAVG2B" wr2, wr2, wr6 \n\t"
  229. "wstrd wr0, [%[block]] \n\t"
  230. "subs %[h], %[h], #2 \n\t"
  231. "wstrd wr2, [r5] \n\t"
  232. "add %[block], %[block], %[line_size] \n\t"
  233. "add r5, r5, %[line_size] \n\t"
  234. "bne 1b \n\t"
  235. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  236. :
  237. : "r4", "r5", "r12", "memory");
  238. }
  239. void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  240. {
  241. int stride = line_size;
  242. // [wr0 wr1 wr2 wr3] for previous line
  243. // [wr4 wr5 wr6 wr7] for current line
  244. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  245. __asm__ volatile(
  246. "pld [%[pixels]] \n\t"
  247. "pld [%[pixels], #32] \n\t"
  248. "and r12, %[pixels], #7 \n\t"
  249. "bic %[pixels], %[pixels], #7 \n\t"
  250. "tmcr wcgr1, r12 \n\t"
  251. "add r12, r12, #1 \n\t"
  252. "add r4, %[pixels], %[line_size]\n\t"
  253. "tmcr wcgr2, r12 \n\t"
  254. "add r5, %[block], %[line_size] \n\t"
  255. "mov %[line_size], %[line_size], lsl #1 \n\t"
  256. "1: \n\t"
  257. "wldrd wr10, [%[pixels]] \n\t"
  258. "cmp r12, #8 \n\t"
  259. "wldrd wr11, [%[pixels], #8] \n\t"
  260. "wldrd wr12, [%[pixels], #16] \n\t"
  261. "add %[pixels], %[pixels], %[line_size] \n\t"
  262. "wldrd wr13, [r4] \n\t"
  263. "pld [%[pixels]] \n\t"
  264. "wldrd wr14, [r4, #8] \n\t"
  265. "pld [%[pixels], #32] \n\t"
  266. "wldrd wr15, [r4, #16] \n\t"
  267. "add r4, r4, %[line_size] \n\t"
  268. "walignr1 wr0, wr10, wr11 \n\t"
  269. "pld [r4] \n\t"
  270. "pld [r4, #32] \n\t"
  271. "walignr1 wr1, wr11, wr12 \n\t"
  272. "walignr1 wr2, wr13, wr14 \n\t"
  273. "walignr1 wr3, wr14, wr15 \n\t"
  274. "wmoveq wr4, wr11 \n\t"
  275. "wmoveq wr5, wr12 \n\t"
  276. "wmoveq wr6, wr14 \n\t"
  277. "wmoveq wr7, wr15 \n\t"
  278. "walignr2ne wr4, wr10, wr11 \n\t"
  279. "walignr2ne wr5, wr11, wr12 \n\t"
  280. "walignr2ne wr6, wr13, wr14 \n\t"
  281. "walignr2ne wr7, wr14, wr15 \n\t"
  282. WAVG2B" wr0, wr0, wr4 \n\t"
  283. WAVG2B" wr1, wr1, wr5 \n\t"
  284. "wstrd wr0, [%[block]] \n\t"
  285. WAVG2B" wr2, wr2, wr6 \n\t"
  286. "wstrd wr1, [%[block], #8] \n\t"
  287. WAVG2B" wr3, wr3, wr7 \n\t"
  288. "add %[block], %[block], %[line_size] \n\t"
  289. "wstrd wr2, [r5] \n\t"
  290. "subs %[h], %[h], #2 \n\t"
  291. "wstrd wr3, [r5, #8] \n\t"
  292. "add r5, r5, %[line_size] \n\t"
  293. "bne 1b \n\t"
  294. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  295. :
  296. : "r4", "r5", "r12", "memory");
  297. }
  298. void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  299. {
  300. int stride = line_size;
  301. // [wr0 wr1 wr2 wr3] for previous line
  302. // [wr4 wr5 wr6 wr7] for current line
  303. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  304. __asm__ volatile(
  305. "pld [%[pixels]] \n\t"
  306. "pld [%[pixels], #32] \n\t"
  307. "pld [%[block]] \n\t"
  308. "pld [%[block], #32] \n\t"
  309. "and r12, %[pixels], #7 \n\t"
  310. "bic %[pixels], %[pixels], #7 \n\t"
  311. "tmcr wcgr1, r12 \n\t"
  312. "add r12, r12, #1 \n\t"
  313. "add r4, %[pixels], %[line_size]\n\t"
  314. "tmcr wcgr2, r12 \n\t"
  315. "add r5, %[block], %[line_size] \n\t"
  316. "mov %[line_size], %[line_size], lsl #1 \n\t"
  317. "pld [r5] \n\t"
  318. "pld [r5, #32] \n\t"
  319. "1: \n\t"
  320. "wldrd wr10, [%[pixels]] \n\t"
  321. "cmp r12, #8 \n\t"
  322. "wldrd wr11, [%[pixels], #8] \n\t"
  323. "add %[pixels], %[pixels], %[line_size] \n\t"
  324. "wldrd wr13, [r4] \n\t"
  325. "pld [%[pixels]] \n\t"
  326. "wldrd wr14, [r4, #8] \n\t"
  327. "pld [%[pixels], #32] \n\t"
  328. "add r4, r4, %[line_size] \n\t"
  329. "walignr1 wr0, wr10, wr11 \n\t"
  330. "pld [r4] \n\t"
  331. "pld [r4, #32] \n\t"
  332. "walignr1 wr2, wr13, wr14 \n\t"
  333. "wmoveq wr4, wr11 \n\t"
  334. "wmoveq wr6, wr14 \n\t"
  335. "walignr2ne wr4, wr10, wr11 \n\t"
  336. "wldrd wr10, [%[block]] \n\t"
  337. "walignr2ne wr6, wr13, wr14 \n\t"
  338. "wldrd wr12, [r5] \n\t"
  339. WAVG2B" wr0, wr0, wr4 \n\t"
  340. WAVG2B" wr2, wr2, wr6 \n\t"
  341. WAVG2B" wr0, wr0, wr10 \n\t"
  342. WAVG2B" wr2, wr2, wr12 \n\t"
  343. "wstrd wr0, [%[block]] \n\t"
  344. "subs %[h], %[h], #2 \n\t"
  345. "wstrd wr2, [r5] \n\t"
  346. "add %[block], %[block], %[line_size] \n\t"
  347. "add r5, r5, %[line_size] \n\t"
  348. "pld [%[block]] \n\t"
  349. "pld [%[block], #32] \n\t"
  350. "pld [r5] \n\t"
  351. "pld [r5, #32] \n\t"
  352. "bne 1b \n\t"
  353. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  354. :
  355. : "r4", "r5", "r12", "memory");
  356. }
  357. void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  358. {
  359. int stride = line_size;
  360. // [wr0 wr1 wr2 wr3] for previous line
  361. // [wr4 wr5 wr6 wr7] for current line
  362. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  363. __asm__ volatile(
  364. "pld [%[pixels]] \n\t"
  365. "pld [%[pixels], #32] \n\t"
  366. "pld [%[block]] \n\t"
  367. "pld [%[block], #32] \n\t"
  368. "and r12, %[pixels], #7 \n\t"
  369. "bic %[pixels], %[pixels], #7 \n\t"
  370. "tmcr wcgr1, r12 \n\t"
  371. "add r12, r12, #1 \n\t"
  372. "add r4, %[pixels], %[line_size]\n\t"
  373. "tmcr wcgr2, r12 \n\t"
  374. "add r5, %[block], %[line_size] \n\t"
  375. "mov %[line_size], %[line_size], lsl #1 \n\t"
  376. "pld [r5] \n\t"
  377. "pld [r5, #32] \n\t"
  378. "1: \n\t"
  379. "wldrd wr10, [%[pixels]] \n\t"
  380. "cmp r12, #8 \n\t"
  381. "wldrd wr11, [%[pixels], #8] \n\t"
  382. "wldrd wr12, [%[pixels], #16] \n\t"
  383. "add %[pixels], %[pixels], %[line_size] \n\t"
  384. "wldrd wr13, [r4] \n\t"
  385. "pld [%[pixels]] \n\t"
  386. "wldrd wr14, [r4, #8] \n\t"
  387. "pld [%[pixels], #32] \n\t"
  388. "wldrd wr15, [r4, #16] \n\t"
  389. "add r4, r4, %[line_size] \n\t"
  390. "walignr1 wr0, wr10, wr11 \n\t"
  391. "pld [r4] \n\t"
  392. "pld [r4, #32] \n\t"
  393. "walignr1 wr1, wr11, wr12 \n\t"
  394. "walignr1 wr2, wr13, wr14 \n\t"
  395. "walignr1 wr3, wr14, wr15 \n\t"
  396. "wmoveq wr4, wr11 \n\t"
  397. "wmoveq wr5, wr12 \n\t"
  398. "wmoveq wr6, wr14 \n\t"
  399. "wmoveq wr7, wr15 \n\t"
  400. "walignr2ne wr4, wr10, wr11 \n\t"
  401. "walignr2ne wr5, wr11, wr12 \n\t"
  402. "walignr2ne wr6, wr13, wr14 \n\t"
  403. "walignr2ne wr7, wr14, wr15 \n\t"
  404. "wldrd wr10, [%[block]] \n\t"
  405. WAVG2B" wr0, wr0, wr4 \n\t"
  406. "wldrd wr11, [%[block], #8] \n\t"
  407. WAVG2B" wr1, wr1, wr5 \n\t"
  408. "wldrd wr12, [r5] \n\t"
  409. WAVG2B" wr2, wr2, wr6 \n\t"
  410. "wldrd wr13, [r5, #8] \n\t"
  411. WAVG2B" wr3, wr3, wr7 \n\t"
  412. WAVG2B" wr0, wr0, wr10 \n\t"
  413. WAVG2B" wr1, wr1, wr11 \n\t"
  414. WAVG2B" wr2, wr2, wr12 \n\t"
  415. WAVG2B" wr3, wr3, wr13 \n\t"
  416. "wstrd wr0, [%[block]] \n\t"
  417. "subs %[h], %[h], #2 \n\t"
  418. "wstrd wr1, [%[block], #8] \n\t"
  419. "add %[block], %[block], %[line_size] \n\t"
  420. "wstrd wr2, [r5] \n\t"
  421. "pld [%[block]] \n\t"
  422. "wstrd wr3, [r5, #8] \n\t"
  423. "add r5, r5, %[line_size] \n\t"
  424. "pld [%[block], #32] \n\t"
  425. "pld [r5] \n\t"
  426. "pld [r5, #32] \n\t"
  427. "bne 1b \n\t"
  428. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  429. :
  430. :"r4", "r5", "r12", "memory");
  431. }
  432. void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  433. {
  434. int stride = line_size;
  435. // [wr0 wr1 wr2 wr3] for previous line
  436. // [wr4 wr5 wr6 wr7] for current line
  437. __asm__ volatile(
  438. "pld [%[pixels]] \n\t"
  439. "pld [%[pixels], #32] \n\t"
  440. "and r12, %[pixels], #7 \n\t"
  441. "tmcr wcgr1, r12 \n\t"
  442. "bic %[pixels], %[pixels], #7 \n\t"
  443. "wldrd wr10, [%[pixels]] \n\t"
  444. "wldrd wr11, [%[pixels], #8] \n\t"
  445. "pld [%[block]] \n\t"
  446. "add %[pixels], %[pixels], %[line_size] \n\t"
  447. "walignr1 wr0, wr10, wr11 \n\t"
  448. "pld [%[pixels]] \n\t"
  449. "pld [%[pixels], #32] \n\t"
  450. "1: \n\t"
  451. "wldrd wr10, [%[pixels]] \n\t"
  452. "wldrd wr11, [%[pixels], #8] \n\t"
  453. "add %[pixels], %[pixels], %[line_size] \n\t"
  454. "pld [%[pixels]] \n\t"
  455. "pld [%[pixels], #32] \n\t"
  456. "walignr1 wr4, wr10, wr11 \n\t"
  457. "wldrd wr10, [%[block]] \n\t"
  458. WAVG2B" wr8, wr0, wr4 \n\t"
  459. WAVG2B" wr8, wr8, wr10 \n\t"
  460. "wstrd wr8, [%[block]] \n\t"
  461. "add %[block], %[block], %[line_size] \n\t"
  462. "wldrd wr10, [%[pixels]] \n\t"
  463. "wldrd wr11, [%[pixels], #8] \n\t"
  464. "pld [%[block]] \n\t"
  465. "add %[pixels], %[pixels], %[line_size] \n\t"
  466. "pld [%[pixels]] \n\t"
  467. "pld [%[pixels], #32] \n\t"
  468. "walignr1 wr0, wr10, wr11 \n\t"
  469. "wldrd wr10, [%[block]] \n\t"
  470. WAVG2B" wr8, wr0, wr4 \n\t"
  471. WAVG2B" wr8, wr8, wr10 \n\t"
  472. "wstrd wr8, [%[block]] \n\t"
  473. "add %[block], %[block], %[line_size] \n\t"
  474. "subs %[h], %[h], #2 \n\t"
  475. "pld [%[block]] \n\t"
  476. "bne 1b \n\t"
  477. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  478. :
  479. : "cc", "memory", "r12");
  480. }
  481. void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  482. {
  483. int stride = line_size;
  484. // [wr0 wr1 wr2 wr3] for previous line
  485. // [wr4 wr5 wr6 wr7] for current line
  486. __asm__ volatile(
  487. "pld [%[pixels]] \n\t"
  488. "pld [%[pixels], #32] \n\t"
  489. "and r12, %[pixels], #7 \n\t"
  490. "tmcr wcgr1, r12 \n\t"
  491. "bic %[pixels], %[pixels], #7 \n\t"
  492. "wldrd wr10, [%[pixels]] \n\t"
  493. "wldrd wr11, [%[pixels], #8] \n\t"
  494. "wldrd wr12, [%[pixels], #16] \n\t"
  495. "add %[pixels], %[pixels], %[line_size] \n\t"
  496. "pld [%[pixels]] \n\t"
  497. "pld [%[pixels], #32] \n\t"
  498. "walignr1 wr0, wr10, wr11 \n\t"
  499. "walignr1 wr1, wr11, wr12 \n\t"
  500. "1: \n\t"
  501. "wldrd wr10, [%[pixels]] \n\t"
  502. "wldrd wr11, [%[pixels], #8] \n\t"
  503. "wldrd wr12, [%[pixels], #16] \n\t"
  504. "add %[pixels], %[pixels], %[line_size] \n\t"
  505. "pld [%[pixels]] \n\t"
  506. "pld [%[pixels], #32] \n\t"
  507. "walignr1 wr4, wr10, wr11 \n\t"
  508. "walignr1 wr5, wr11, wr12 \n\t"
  509. WAVG2B" wr8, wr0, wr4 \n\t"
  510. WAVG2B" wr9, wr1, wr5 \n\t"
  511. "wstrd wr8, [%[block]] \n\t"
  512. "wstrd wr9, [%[block], #8] \n\t"
  513. "add %[block], %[block], %[line_size] \n\t"
  514. "wldrd wr10, [%[pixels]] \n\t"
  515. "wldrd wr11, [%[pixels], #8] \n\t"
  516. "wldrd wr12, [%[pixels], #16] \n\t"
  517. "add %[pixels], %[pixels], %[line_size] \n\t"
  518. "pld [%[pixels]] \n\t"
  519. "pld [%[pixels], #32] \n\t"
  520. "walignr1 wr0, wr10, wr11 \n\t"
  521. "walignr1 wr1, wr11, wr12 \n\t"
  522. WAVG2B" wr8, wr0, wr4 \n\t"
  523. WAVG2B" wr9, wr1, wr5 \n\t"
  524. "wstrd wr8, [%[block]] \n\t"
  525. "wstrd wr9, [%[block], #8] \n\t"
  526. "add %[block], %[block], %[line_size] \n\t"
  527. "subs %[h], %[h], #2 \n\t"
  528. "bne 1b \n\t"
  529. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  530. :
  531. : "r4", "r5", "r12", "memory");
  532. }
  533. void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  534. {
  535. int stride = line_size;
  536. // [wr0 wr1 wr2 wr3] for previous line
  537. // [wr4 wr5 wr6 wr7] for current line
  538. __asm__ volatile(
  539. "pld [%[pixels]] \n\t"
  540. "pld [%[pixels], #32] \n\t"
  541. "and r12, %[pixels], #7 \n\t"
  542. "tmcr wcgr1, r12 \n\t"
  543. "bic %[pixels], %[pixels], #7 \n\t"
  544. "wldrd wr10, [%[pixels]] \n\t"
  545. "wldrd wr11, [%[pixels], #8] \n\t"
  546. "pld [%[block]] \n\t"
  547. "wldrd wr12, [%[pixels], #16] \n\t"
  548. "add %[pixels], %[pixels], %[line_size] \n\t"
  549. "pld [%[pixels]] \n\t"
  550. "pld [%[pixels], #32] \n\t"
  551. "walignr1 wr0, wr10, wr11 \n\t"
  552. "walignr1 wr1, wr11, wr12 \n\t"
  553. "1: \n\t"
  554. "wldrd wr10, [%[pixels]] \n\t"
  555. "wldrd wr11, [%[pixels], #8] \n\t"
  556. "wldrd wr12, [%[pixels], #16] \n\t"
  557. "add %[pixels], %[pixels], %[line_size] \n\t"
  558. "pld [%[pixels]] \n\t"
  559. "pld [%[pixels], #32] \n\t"
  560. "walignr1 wr4, wr10, wr11 \n\t"
  561. "walignr1 wr5, wr11, wr12 \n\t"
  562. "wldrd wr10, [%[block]] \n\t"
  563. "wldrd wr11, [%[block], #8] \n\t"
  564. WAVG2B" wr8, wr0, wr4 \n\t"
  565. WAVG2B" wr9, wr1, wr5 \n\t"
  566. WAVG2B" wr8, wr8, wr10 \n\t"
  567. WAVG2B" wr9, wr9, wr11 \n\t"
  568. "wstrd wr8, [%[block]] \n\t"
  569. "wstrd wr9, [%[block], #8] \n\t"
  570. "add %[block], %[block], %[line_size] \n\t"
  571. "wldrd wr10, [%[pixels]] \n\t"
  572. "wldrd wr11, [%[pixels], #8] \n\t"
  573. "pld [%[block]] \n\t"
  574. "wldrd wr12, [%[pixels], #16] \n\t"
  575. "add %[pixels], %[pixels], %[line_size] \n\t"
  576. "pld [%[pixels]] \n\t"
  577. "pld [%[pixels], #32] \n\t"
  578. "walignr1 wr0, wr10, wr11 \n\t"
  579. "walignr1 wr1, wr11, wr12 \n\t"
  580. "wldrd wr10, [%[block]] \n\t"
  581. "wldrd wr11, [%[block], #8] \n\t"
  582. WAVG2B" wr8, wr0, wr4 \n\t"
  583. WAVG2B" wr9, wr1, wr5 \n\t"
  584. WAVG2B" wr8, wr8, wr10 \n\t"
  585. WAVG2B" wr9, wr9, wr11 \n\t"
  586. "wstrd wr8, [%[block]] \n\t"
  587. "wstrd wr9, [%[block], #8] \n\t"
  588. "add %[block], %[block], %[line_size] \n\t"
  589. "subs %[h], %[h], #2 \n\t"
  590. "pld [%[block]] \n\t"
  591. "bne 1b \n\t"
  592. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  593. :
  594. : "r4", "r5", "r12", "memory");
  595. }
  596. void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  597. {
  598. // [wr0 wr1 wr2 wr3] for previous line
  599. // [wr4 wr5 wr6 wr7] for current line
  600. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  601. __asm__ volatile(
  602. "pld [%[pixels]] \n\t"
  603. "mov r12, #2 \n\t"
  604. "pld [%[pixels], #32] \n\t"
  605. "tmcr wcgr0, r12 \n\t" /* for shift value */
  606. "and r12, %[pixels], #7 \n\t"
  607. "bic %[pixels], %[pixels], #7 \n\t"
  608. "tmcr wcgr1, r12 \n\t"
  609. // [wr0 wr1 wr2 wr3] <= *
  610. // [wr4 wr5 wr6 wr7]
  611. "wldrd wr12, [%[pixels]] \n\t"
  612. "add r12, r12, #1 \n\t"
  613. "wldrd wr13, [%[pixels], #8] \n\t"
  614. "tmcr wcgr2, r12 \n\t"
  615. "add %[pixels], %[pixels], %[line_size] \n\t"
  616. "cmp r12, #8 \n\t"
  617. "pld [%[pixels]] \n\t"
  618. "pld [%[pixels], #32] \n\t"
  619. "walignr1 wr2, wr12, wr13 \n\t"
  620. "wmoveq wr10, wr13 \n\t"
  621. "walignr2ne wr10, wr12, wr13 \n\t"
  622. "wunpckelub wr0, wr2 \n\t"
  623. "wunpckehub wr1, wr2 \n\t"
  624. "wunpckelub wr8, wr10 \n\t"
  625. "wunpckehub wr9, wr10 \n\t"
  626. "waddhus wr0, wr0, wr8 \n\t"
  627. "waddhus wr1, wr1, wr9 \n\t"
  628. "1: \n\t"
  629. // [wr0 wr1 wr2 wr3]
  630. // [wr4 wr5 wr6 wr7] <= *
  631. "wldrd wr12, [%[pixels]] \n\t"
  632. "cmp r12, #8 \n\t"
  633. "wldrd wr13, [%[pixels], #8] \n\t"
  634. "add %[pixels], %[pixels], %[line_size] \n\t"
  635. "walignr1 wr6, wr12, wr13 \n\t"
  636. "pld [%[pixels]] \n\t"
  637. "pld [%[pixels], #32] \n\t"
  638. "wmoveq wr10, wr13 \n\t"
  639. "walignr2ne wr10, wr12, wr13 \n\t"
  640. "wunpckelub wr4, wr6 \n\t"
  641. "wunpckehub wr5, wr6 \n\t"
  642. "wunpckelub wr8, wr10 \n\t"
  643. "wunpckehub wr9, wr10 \n\t"
  644. "waddhus wr4, wr4, wr8 \n\t"
  645. "waddhus wr5, wr5, wr9 \n\t"
  646. "waddhus wr8, wr0, wr4 \n\t"
  647. "waddhus wr9, wr1, wr5 \n\t"
  648. "waddhus wr8, wr8, wr15 \n\t"
  649. "waddhus wr9, wr9, wr15 \n\t"
  650. "wsrlhg wr8, wr8, wcgr0 \n\t"
  651. "wsrlhg wr9, wr9, wcgr0 \n\t"
  652. "wpackhus wr8, wr8, wr9 \n\t"
  653. "wstrd wr8, [%[block]] \n\t"
  654. "add %[block], %[block], %[line_size] \n\t"
  655. // [wr0 wr1 wr2 wr3] <= *
  656. // [wr4 wr5 wr6 wr7]
  657. "wldrd wr12, [%[pixels]] \n\t"
  658. "wldrd wr13, [%[pixels], #8] \n\t"
  659. "add %[pixels], %[pixels], %[line_size] \n\t"
  660. "walignr1 wr2, wr12, wr13 \n\t"
  661. "pld [%[pixels]] \n\t"
  662. "pld [%[pixels], #32] \n\t"
  663. "wmoveq wr10, wr13 \n\t"
  664. "walignr2ne wr10, wr12, wr13 \n\t"
  665. "wunpckelub wr0, wr2 \n\t"
  666. "wunpckehub wr1, wr2 \n\t"
  667. "wunpckelub wr8, wr10 \n\t"
  668. "wunpckehub wr9, wr10 \n\t"
  669. "waddhus wr0, wr0, wr8 \n\t"
  670. "waddhus wr1, wr1, wr9 \n\t"
  671. "waddhus wr8, wr0, wr4 \n\t"
  672. "waddhus wr9, wr1, wr5 \n\t"
  673. "waddhus wr8, wr8, wr15 \n\t"
  674. "waddhus wr9, wr9, wr15 \n\t"
  675. "wsrlhg wr8, wr8, wcgr0 \n\t"
  676. "wsrlhg wr9, wr9, wcgr0 \n\t"
  677. "wpackhus wr8, wr8, wr9 \n\t"
  678. "subs %[h], %[h], #2 \n\t"
  679. "wstrd wr8, [%[block]] \n\t"
  680. "add %[block], %[block], %[line_size] \n\t"
  681. "bne 1b \n\t"
  682. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
  683. : [line_size]"r"(line_size)
  684. : "r12", "memory");
  685. }
  686. void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  687. {
  688. // [wr0 wr1 wr2 wr3] for previous line
  689. // [wr4 wr5 wr6 wr7] for current line
  690. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  691. __asm__ volatile(
  692. "pld [%[pixels]] \n\t"
  693. "mov r12, #2 \n\t"
  694. "pld [%[pixels], #32] \n\t"
  695. "tmcr wcgr0, r12 \n\t" /* for shift value */
  696. /* alignment */
  697. "and r12, %[pixels], #7 \n\t"
  698. "bic %[pixels], %[pixels], #7 \n\t"
  699. "tmcr wcgr1, r12 \n\t"
  700. "add r12, r12, #1 \n\t"
  701. "tmcr wcgr2, r12 \n\t"
  702. // [wr0 wr1 wr2 wr3] <= *
  703. // [wr4 wr5 wr6 wr7]
  704. "wldrd wr12, [%[pixels]] \n\t"
  705. "cmp r12, #8 \n\t"
  706. "wldrd wr13, [%[pixels], #8] \n\t"
  707. "wldrd wr14, [%[pixels], #16] \n\t"
  708. "add %[pixels], %[pixels], %[line_size] \n\t"
  709. "pld [%[pixels]] \n\t"
  710. "walignr1 wr2, wr12, wr13 \n\t"
  711. "pld [%[pixels], #32] \n\t"
  712. "walignr1 wr3, wr13, wr14 \n\t"
  713. "wmoveq wr10, wr13 \n\t"
  714. "wmoveq wr11, wr14 \n\t"
  715. "walignr2ne wr10, wr12, wr13 \n\t"
  716. "walignr2ne wr11, wr13, wr14 \n\t"
  717. "wunpckelub wr0, wr2 \n\t"
  718. "wunpckehub wr1, wr2 \n\t"
  719. "wunpckelub wr2, wr3 \n\t"
  720. "wunpckehub wr3, wr3 \n\t"
  721. "wunpckelub wr8, wr10 \n\t"
  722. "wunpckehub wr9, wr10 \n\t"
  723. "wunpckelub wr10, wr11 \n\t"
  724. "wunpckehub wr11, wr11 \n\t"
  725. "waddhus wr0, wr0, wr8 \n\t"
  726. "waddhus wr1, wr1, wr9 \n\t"
  727. "waddhus wr2, wr2, wr10 \n\t"
  728. "waddhus wr3, wr3, wr11 \n\t"
  729. "1: \n\t"
  730. // [wr0 wr1 wr2 wr3]
  731. // [wr4 wr5 wr6 wr7] <= *
  732. "wldrd wr12, [%[pixels]] \n\t"
  733. "cmp r12, #8 \n\t"
  734. "wldrd wr13, [%[pixels], #8] \n\t"
  735. "wldrd wr14, [%[pixels], #16] \n\t"
  736. "add %[pixels], %[pixels], %[line_size] \n\t"
  737. "walignr1 wr6, wr12, wr13 \n\t"
  738. "pld [%[pixels]] \n\t"
  739. "pld [%[pixels], #32] \n\t"
  740. "walignr1 wr7, wr13, wr14 \n\t"
  741. "wmoveq wr10, wr13 \n\t"
  742. "wmoveq wr11, wr14 \n\t"
  743. "walignr2ne wr10, wr12, wr13 \n\t"
  744. "walignr2ne wr11, wr13, wr14 \n\t"
  745. "wunpckelub wr4, wr6 \n\t"
  746. "wunpckehub wr5, wr6 \n\t"
  747. "wunpckelub wr6, wr7 \n\t"
  748. "wunpckehub wr7, wr7 \n\t"
  749. "wunpckelub wr8, wr10 \n\t"
  750. "wunpckehub wr9, wr10 \n\t"
  751. "wunpckelub wr10, wr11 \n\t"
  752. "wunpckehub wr11, wr11 \n\t"
  753. "waddhus wr4, wr4, wr8 \n\t"
  754. "waddhus wr5, wr5, wr9 \n\t"
  755. "waddhus wr6, wr6, wr10 \n\t"
  756. "waddhus wr7, wr7, wr11 \n\t"
  757. "waddhus wr8, wr0, wr4 \n\t"
  758. "waddhus wr9, wr1, wr5 \n\t"
  759. "waddhus wr10, wr2, wr6 \n\t"
  760. "waddhus wr11, wr3, wr7 \n\t"
  761. "waddhus wr8, wr8, wr15 \n\t"
  762. "waddhus wr9, wr9, wr15 \n\t"
  763. "waddhus wr10, wr10, wr15 \n\t"
  764. "waddhus wr11, wr11, wr15 \n\t"
  765. "wsrlhg wr8, wr8, wcgr0 \n\t"
  766. "wsrlhg wr9, wr9, wcgr0 \n\t"
  767. "wsrlhg wr10, wr10, wcgr0 \n\t"
  768. "wsrlhg wr11, wr11, wcgr0 \n\t"
  769. "wpackhus wr8, wr8, wr9 \n\t"
  770. "wpackhus wr9, wr10, wr11 \n\t"
  771. "wstrd wr8, [%[block]] \n\t"
  772. "wstrd wr9, [%[block], #8] \n\t"
  773. "add %[block], %[block], %[line_size] \n\t"
  774. // [wr0 wr1 wr2 wr3] <= *
  775. // [wr4 wr5 wr6 wr7]
  776. "wldrd wr12, [%[pixels]] \n\t"
  777. "wldrd wr13, [%[pixels], #8] \n\t"
  778. "wldrd wr14, [%[pixels], #16] \n\t"
  779. "add %[pixels], %[pixels], %[line_size] \n\t"
  780. "walignr1 wr2, wr12, wr13 \n\t"
  781. "pld [%[pixels]] \n\t"
  782. "pld [%[pixels], #32] \n\t"
  783. "walignr1 wr3, wr13, wr14 \n\t"
  784. "wmoveq wr10, wr13 \n\t"
  785. "wmoveq wr11, wr14 \n\t"
  786. "walignr2ne wr10, wr12, wr13 \n\t"
  787. "walignr2ne wr11, wr13, wr14 \n\t"
  788. "wunpckelub wr0, wr2 \n\t"
  789. "wunpckehub wr1, wr2 \n\t"
  790. "wunpckelub wr2, wr3 \n\t"
  791. "wunpckehub wr3, wr3 \n\t"
  792. "wunpckelub wr8, wr10 \n\t"
  793. "wunpckehub wr9, wr10 \n\t"
  794. "wunpckelub wr10, wr11 \n\t"
  795. "wunpckehub wr11, wr11 \n\t"
  796. "waddhus wr0, wr0, wr8 \n\t"
  797. "waddhus wr1, wr1, wr9 \n\t"
  798. "waddhus wr2, wr2, wr10 \n\t"
  799. "waddhus wr3, wr3, wr11 \n\t"
  800. "waddhus wr8, wr0, wr4 \n\t"
  801. "waddhus wr9, wr1, wr5 \n\t"
  802. "waddhus wr10, wr2, wr6 \n\t"
  803. "waddhus wr11, wr3, wr7 \n\t"
  804. "waddhus wr8, wr8, wr15 \n\t"
  805. "waddhus wr9, wr9, wr15 \n\t"
  806. "waddhus wr10, wr10, wr15 \n\t"
  807. "waddhus wr11, wr11, wr15 \n\t"
  808. "wsrlhg wr8, wr8, wcgr0 \n\t"
  809. "wsrlhg wr9, wr9, wcgr0 \n\t"
  810. "wsrlhg wr10, wr10, wcgr0 \n\t"
  811. "wsrlhg wr11, wr11, wcgr0 \n\t"
  812. "wpackhus wr8, wr8, wr9 \n\t"
  813. "wpackhus wr9, wr10, wr11 \n\t"
  814. "wstrd wr8, [%[block]] \n\t"
  815. "wstrd wr9, [%[block], #8] \n\t"
  816. "add %[block], %[block], %[line_size] \n\t"
  817. "subs %[h], %[h], #2 \n\t"
  818. "bne 1b \n\t"
  819. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
  820. : [line_size]"r"(line_size)
  821. : "r12", "memory");
  822. }
  823. void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  824. {
  825. // [wr0 wr1 wr2 wr3] for previous line
  826. // [wr4 wr5 wr6 wr7] for current line
  827. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  828. __asm__ volatile(
  829. "pld [%[block]] \n\t"
  830. "pld [%[block], #32] \n\t"
  831. "pld [%[pixels]] \n\t"
  832. "mov r12, #2 \n\t"
  833. "pld [%[pixels], #32] \n\t"
  834. "tmcr wcgr0, r12 \n\t" /* for shift value */
  835. "and r12, %[pixels], #7 \n\t"
  836. "bic %[pixels], %[pixels], #7 \n\t"
  837. "tmcr wcgr1, r12 \n\t"
  838. // [wr0 wr1 wr2 wr3] <= *
  839. // [wr4 wr5 wr6 wr7]
  840. "wldrd wr12, [%[pixels]] \n\t"
  841. "add r12, r12, #1 \n\t"
  842. "wldrd wr13, [%[pixels], #8] \n\t"
  843. "tmcr wcgr2, r12 \n\t"
  844. "add %[pixels], %[pixels], %[line_size] \n\t"
  845. "cmp r12, #8 \n\t"
  846. "pld [%[pixels]] \n\t"
  847. "pld [%[pixels], #32] \n\t"
  848. "walignr1 wr2, wr12, wr13 \n\t"
  849. "wmoveq wr10, wr13 \n\t"
  850. "walignr2ne wr10, wr12, wr13 \n\t"
  851. "wunpckelub wr0, wr2 \n\t"
  852. "wunpckehub wr1, wr2 \n\t"
  853. "wunpckelub wr8, wr10 \n\t"
  854. "wunpckehub wr9, wr10 \n\t"
  855. "waddhus wr0, wr0, wr8 \n\t"
  856. "waddhus wr1, wr1, wr9 \n\t"
  857. "1: \n\t"
  858. // [wr0 wr1 wr2 wr3]
  859. // [wr4 wr5 wr6 wr7] <= *
  860. "wldrd wr12, [%[pixels]] \n\t"
  861. "cmp r12, #8 \n\t"
  862. "wldrd wr13, [%[pixels], #8] \n\t"
  863. "add %[pixels], %[pixels], %[line_size] \n\t"
  864. "walignr1 wr6, wr12, wr13 \n\t"
  865. "pld [%[pixels]] \n\t"
  866. "pld [%[pixels], #32] \n\t"
  867. "wmoveq wr10, wr13 \n\t"
  868. "walignr2ne wr10, wr12, wr13 \n\t"
  869. "wunpckelub wr4, wr6 \n\t"
  870. "wunpckehub wr5, wr6 \n\t"
  871. "wunpckelub wr8, wr10 \n\t"
  872. "wunpckehub wr9, wr10 \n\t"
  873. "waddhus wr4, wr4, wr8 \n\t"
  874. "waddhus wr5, wr5, wr9 \n\t"
  875. "waddhus wr8, wr0, wr4 \n\t"
  876. "waddhus wr9, wr1, wr5 \n\t"
  877. "waddhus wr8, wr8, wr15 \n\t"
  878. "waddhus wr9, wr9, wr15 \n\t"
  879. "wldrd wr12, [%[block]] \n\t"
  880. "wsrlhg wr8, wr8, wcgr0 \n\t"
  881. "wsrlhg wr9, wr9, wcgr0 \n\t"
  882. "wpackhus wr8, wr8, wr9 \n\t"
  883. WAVG2B" wr8, wr8, wr12 \n\t"
  884. "wstrd wr8, [%[block]] \n\t"
  885. "add %[block], %[block], %[line_size] \n\t"
  886. "wldrd wr12, [%[pixels]] \n\t"
  887. "pld [%[block]] \n\t"
  888. "pld [%[block], #32] \n\t"
  889. // [wr0 wr1 wr2 wr3] <= *
  890. // [wr4 wr5 wr6 wr7]
  891. "wldrd wr13, [%[pixels], #8] \n\t"
  892. "add %[pixels], %[pixels], %[line_size] \n\t"
  893. "walignr1 wr2, wr12, wr13 \n\t"
  894. "pld [%[pixels]] \n\t"
  895. "pld [%[pixels], #32] \n\t"
  896. "wmoveq wr10, wr13 \n\t"
  897. "walignr2ne wr10, wr12, wr13 \n\t"
  898. "wunpckelub wr0, wr2 \n\t"
  899. "wunpckehub wr1, wr2 \n\t"
  900. "wunpckelub wr8, wr10 \n\t"
  901. "wunpckehub wr9, wr10 \n\t"
  902. "waddhus wr0, wr0, wr8 \n\t"
  903. "waddhus wr1, wr1, wr9 \n\t"
  904. "waddhus wr8, wr0, wr4 \n\t"
  905. "waddhus wr9, wr1, wr5 \n\t"
  906. "waddhus wr8, wr8, wr15 \n\t"
  907. "waddhus wr9, wr9, wr15 \n\t"
  908. "wldrd wr12, [%[block]] \n\t"
  909. "wsrlhg wr8, wr8, wcgr0 \n\t"
  910. "wsrlhg wr9, wr9, wcgr0 \n\t"
  911. "wpackhus wr8, wr8, wr9 \n\t"
  912. "subs %[h], %[h], #2 \n\t"
  913. WAVG2B" wr8, wr8, wr12 \n\t"
  914. "wstrd wr8, [%[block]] \n\t"
  915. "add %[block], %[block], %[line_size] \n\t"
  916. "pld [%[block]] \n\t"
  917. "pld [%[block], #32] \n\t"
  918. "bne 1b \n\t"
  919. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
  920. : [line_size]"r"(line_size)
  921. : "r12", "memory");
  922. }
  923. void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  924. {
  925. // [wr0 wr1 wr2 wr3] for previous line
  926. // [wr4 wr5 wr6 wr7] for current line
  927. SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
  928. __asm__ volatile(
  929. "pld [%[block]] \n\t"
  930. "pld [%[block], #32] \n\t"
  931. "pld [%[pixels]] \n\t"
  932. "mov r12, #2 \n\t"
  933. "pld [%[pixels], #32] \n\t"
  934. "tmcr wcgr0, r12 \n\t" /* for shift value */
  935. /* alignment */
  936. "and r12, %[pixels], #7 \n\t"
  937. "bic %[pixels], %[pixels], #7 \n\t"
  938. "tmcr wcgr1, r12 \n\t"
  939. "add r12, r12, #1 \n\t"
  940. "tmcr wcgr2, r12 \n\t"
  941. // [wr0 wr1 wr2 wr3] <= *
  942. // [wr4 wr5 wr6 wr7]
  943. "wldrd wr12, [%[pixels]] \n\t"
  944. "cmp r12, #8 \n\t"
  945. "wldrd wr13, [%[pixels], #8] \n\t"
  946. "wldrd wr14, [%[pixels], #16] \n\t"
  947. "add %[pixels], %[pixels], %[line_size] \n\t"
  948. "pld [%[pixels]] \n\t"
  949. "walignr1 wr2, wr12, wr13 \n\t"
  950. "pld [%[pixels], #32] \n\t"
  951. "walignr1 wr3, wr13, wr14 \n\t"
  952. "wmoveq wr10, wr13 \n\t"
  953. "wmoveq wr11, wr14 \n\t"
  954. "walignr2ne wr10, wr12, wr13 \n\t"
  955. "walignr2ne wr11, wr13, wr14 \n\t"
  956. "wunpckelub wr0, wr2 \n\t"
  957. "wunpckehub wr1, wr2 \n\t"
  958. "wunpckelub wr2, wr3 \n\t"
  959. "wunpckehub wr3, wr3 \n\t"
  960. "wunpckelub wr8, wr10 \n\t"
  961. "wunpckehub wr9, wr10 \n\t"
  962. "wunpckelub wr10, wr11 \n\t"
  963. "wunpckehub wr11, wr11 \n\t"
  964. "waddhus wr0, wr0, wr8 \n\t"
  965. "waddhus wr1, wr1, wr9 \n\t"
  966. "waddhus wr2, wr2, wr10 \n\t"
  967. "waddhus wr3, wr3, wr11 \n\t"
  968. "1: \n\t"
  969. // [wr0 wr1 wr2 wr3]
  970. // [wr4 wr5 wr6 wr7] <= *
  971. "wldrd wr12, [%[pixels]] \n\t"
  972. "cmp r12, #8 \n\t"
  973. "wldrd wr13, [%[pixels], #8] \n\t"
  974. "wldrd wr14, [%[pixels], #16] \n\t"
  975. "add %[pixels], %[pixels], %[line_size] \n\t"
  976. "walignr1 wr6, wr12, wr13 \n\t"
  977. "pld [%[pixels]] \n\t"
  978. "pld [%[pixels], #32] \n\t"
  979. "walignr1 wr7, wr13, wr14 \n\t"
  980. "wmoveq wr10, wr13 \n\t"
  981. "wmoveq wr11, wr14 \n\t"
  982. "walignr2ne wr10, wr12, wr13 \n\t"
  983. "walignr2ne wr11, wr13, wr14 \n\t"
  984. "wunpckelub wr4, wr6 \n\t"
  985. "wunpckehub wr5, wr6 \n\t"
  986. "wunpckelub wr6, wr7 \n\t"
  987. "wunpckehub wr7, wr7 \n\t"
  988. "wunpckelub wr8, wr10 \n\t"
  989. "wunpckehub wr9, wr10 \n\t"
  990. "wunpckelub wr10, wr11 \n\t"
  991. "wunpckehub wr11, wr11 \n\t"
  992. "waddhus wr4, wr4, wr8 \n\t"
  993. "waddhus wr5, wr5, wr9 \n\t"
  994. "waddhus wr6, wr6, wr10 \n\t"
  995. "waddhus wr7, wr7, wr11 \n\t"
  996. "waddhus wr8, wr0, wr4 \n\t"
  997. "waddhus wr9, wr1, wr5 \n\t"
  998. "waddhus wr10, wr2, wr6 \n\t"
  999. "waddhus wr11, wr3, wr7 \n\t"
  1000. "waddhus wr8, wr8, wr15 \n\t"
  1001. "waddhus wr9, wr9, wr15 \n\t"
  1002. "waddhus wr10, wr10, wr15 \n\t"
  1003. "waddhus wr11, wr11, wr15 \n\t"
  1004. "wsrlhg wr8, wr8, wcgr0 \n\t"
  1005. "wsrlhg wr9, wr9, wcgr0 \n\t"
  1006. "wldrd wr12, [%[block]] \n\t"
  1007. "wldrd wr13, [%[block], #8] \n\t"
  1008. "wsrlhg wr10, wr10, wcgr0 \n\t"
  1009. "wsrlhg wr11, wr11, wcgr0 \n\t"
  1010. "wpackhus wr8, wr8, wr9 \n\t"
  1011. "wpackhus wr9, wr10, wr11 \n\t"
  1012. WAVG2B" wr8, wr8, wr12 \n\t"
  1013. WAVG2B" wr9, wr9, wr13 \n\t"
  1014. "wstrd wr8, [%[block]] \n\t"
  1015. "wstrd wr9, [%[block], #8] \n\t"
  1016. "add %[block], %[block], %[line_size] \n\t"
  1017. // [wr0 wr1 wr2 wr3] <= *
  1018. // [wr4 wr5 wr6 wr7]
  1019. "wldrd wr12, [%[pixels]] \n\t"
  1020. "pld [%[block]] \n\t"
  1021. "wldrd wr13, [%[pixels], #8] \n\t"
  1022. "pld [%[block], #32] \n\t"
  1023. "wldrd wr14, [%[pixels], #16] \n\t"
  1024. "add %[pixels], %[pixels], %[line_size] \n\t"
  1025. "walignr1 wr2, wr12, wr13 \n\t"
  1026. "pld [%[pixels]] \n\t"
  1027. "pld [%[pixels], #32] \n\t"
  1028. "walignr1 wr3, wr13, wr14 \n\t"
  1029. "wmoveq wr10, wr13 \n\t"
  1030. "wmoveq wr11, wr14 \n\t"
  1031. "walignr2ne wr10, wr12, wr13 \n\t"
  1032. "walignr2ne wr11, wr13, wr14 \n\t"
  1033. "wunpckelub wr0, wr2 \n\t"
  1034. "wunpckehub wr1, wr2 \n\t"
  1035. "wunpckelub wr2, wr3 \n\t"
  1036. "wunpckehub wr3, wr3 \n\t"
  1037. "wunpckelub wr8, wr10 \n\t"
  1038. "wunpckehub wr9, wr10 \n\t"
  1039. "wunpckelub wr10, wr11 \n\t"
  1040. "wunpckehub wr11, wr11 \n\t"
  1041. "waddhus wr0, wr0, wr8 \n\t"
  1042. "waddhus wr1, wr1, wr9 \n\t"
  1043. "waddhus wr2, wr2, wr10 \n\t"
  1044. "waddhus wr3, wr3, wr11 \n\t"
  1045. "waddhus wr8, wr0, wr4 \n\t"
  1046. "waddhus wr9, wr1, wr5 \n\t"
  1047. "waddhus wr10, wr2, wr6 \n\t"
  1048. "waddhus wr11, wr3, wr7 \n\t"
  1049. "waddhus wr8, wr8, wr15 \n\t"
  1050. "waddhus wr9, wr9, wr15 \n\t"
  1051. "waddhus wr10, wr10, wr15 \n\t"
  1052. "waddhus wr11, wr11, wr15 \n\t"
  1053. "wsrlhg wr8, wr8, wcgr0 \n\t"
  1054. "wsrlhg wr9, wr9, wcgr0 \n\t"
  1055. "wldrd wr12, [%[block]] \n\t"
  1056. "wldrd wr13, [%[block], #8] \n\t"
  1057. "wsrlhg wr10, wr10, wcgr0 \n\t"
  1058. "wsrlhg wr11, wr11, wcgr0 \n\t"
  1059. "wpackhus wr8, wr8, wr9 \n\t"
  1060. "wpackhus wr9, wr10, wr11 \n\t"
  1061. WAVG2B" wr8, wr8, wr12 \n\t"
  1062. WAVG2B" wr9, wr9, wr13 \n\t"
  1063. "wstrd wr8, [%[block]] \n\t"
  1064. "wstrd wr9, [%[block], #8] \n\t"
  1065. "add %[block], %[block], %[line_size] \n\t"
  1066. "subs %[h], %[h], #2 \n\t"
  1067. "pld [%[block]] \n\t"
  1068. "pld [%[block], #32] \n\t"
  1069. "bne 1b \n\t"
  1070. : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
  1071. : [line_size]"r"(line_size)
  1072. : "r12", "memory");
  1073. }