msa_macro.h 63 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397
  1. // Copyright 2016 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // MSA common macros
  11. //
  12. // Author(s): Prashant Patil (prashant.patil@imgtec.com)
  13. #ifndef WEBP_DSP_MSA_MACRO_H_
  14. #define WEBP_DSP_MSA_MACRO_H_
  15. #include "./dsp.h"
  16. #if defined(WEBP_USE_MSA)
  17. #include <stdint.h>
  18. #include <msa.h>
  19. #if defined(__clang__)
  20. #define CLANG_BUILD
  21. #endif
  22. #ifdef CLANG_BUILD
  23. #define ALPHAVAL (-1)
  24. #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)
  25. #define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b)
  26. #define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)
  27. #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b)
  28. #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b)
  29. #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b)
  30. #define SLLI_B(a, b) __msa_slli_b((v4i32)a, b)
  31. #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b)
  32. #define ORI_B(a, b) __msa_ori_b((v16u8)a, b)
  33. #else
  34. #define ALPHAVAL (0xff)
  35. #define ADDVI_H(a, b) (a + b)
  36. #define ADDVI_W(a, b) (a + b)
  37. #define SRAI_B(a, b) (a >> b)
  38. #define SRAI_H(a, b) (a >> b)
  39. #define SRAI_W(a, b) (a >> b)
  40. #define SRLI_H(a, b) (a << b)
  41. #define SLLI_B(a, b) (a << b)
  42. #define ANDI_B(a, b) (a & b)
  43. #define ORI_B(a, b) (a | b)
  44. #endif
  45. #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))
  46. #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
  47. #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
  48. #define LD_H(RTYPE, psrc) *((RTYPE*)(psrc))
  49. #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
  50. #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
  51. #define LD_W(RTYPE, psrc) *((RTYPE*)(psrc))
  52. #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
  53. #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
  54. #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
  55. #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
  56. #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
  57. #define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
  58. #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
  59. #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
  60. #define ST_W(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in
  61. #define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
  62. #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
  63. #define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \
  64. static inline TYPE FUNC_NAME(const void* const psrc) { \
  65. const uint8_t* const psrc_m = (const uint8_t*)psrc; \
  66. TYPE val_m; \
  67. asm volatile ( \
  68. "" #INSTR " %[val_m], %[psrc_m] \n\t" \
  69. : [val_m] "=r" (val_m) \
  70. : [psrc_m] "m" (*psrc_m)); \
  71. return val_m; \
  72. }
  73. #define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc)
  74. #define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \
  75. static inline void FUNC_NAME(TYPE val, void* const pdst) { \
  76. uint8_t* const pdst_m = (uint8_t*)pdst; \
  77. TYPE val_m = val; \
  78. asm volatile ( \
  79. " " #INSTR " %[val_m], %[pdst_m] \n\t" \
  80. : [pdst_m] "=m" (*pdst_m) \
  81. : [val_m] "r" (val_m)); \
  82. }
  83. #define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst)
  84. #if (__mips_isa_rev >= 6)
  85. MSA_LOAD_FUNC(uint16_t, lh, msa_lh);
  86. #define LH(psrc) MSA_LOAD(psrc, msa_lh)
  87. MSA_LOAD_FUNC(uint32_t, lw, msa_lw);
  88. #define LW(psrc) MSA_LOAD(psrc, msa_lw)
  89. #if (__mips == 64)
  90. MSA_LOAD_FUNC(uint64_t, ld, msa_ld);
  91. #define LD(psrc) MSA_LOAD(psrc, msa_ld)
  92. #else // !(__mips == 64)
  93. #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \
  94. MSA_LOAD(psrc, msa_lw))
  95. #endif // (__mips == 64)
  96. MSA_STORE_FUNC(uint16_t, sh, msa_sh);
  97. #define SH(val, pdst) MSA_STORE(val, pdst, msa_sh)
  98. MSA_STORE_FUNC(uint32_t, sw, msa_sw);
  99. #define SW(val, pdst) MSA_STORE(val, pdst, msa_sw)
  100. MSA_STORE_FUNC(uint64_t, sd, msa_sd);
  101. #define SD(val, pdst) MSA_STORE(val, pdst, msa_sd)
  102. #else // !(__mips_isa_rev >= 6)
  103. MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh);
  104. #define LH(psrc) MSA_LOAD(psrc, msa_ulh)
  105. MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw);
  106. #define LW(psrc) MSA_LOAD(psrc, msa_ulw)
  107. #if (__mips == 64)
  108. MSA_LOAD_FUNC(uint64_t, uld, msa_uld);
  109. #define LD(psrc) MSA_LOAD(psrc, msa_uld)
  110. #else // !(__mips == 64)
  111. #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \
  112. MSA_LOAD(psrc, msa_ulw))
  113. #endif // (__mips == 64)
  114. MSA_STORE_FUNC(uint16_t, ush, msa_ush);
  115. #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush)
  116. MSA_STORE_FUNC(uint32_t, usw, msa_usw);
  117. #define SW(val, pdst) MSA_STORE(val, pdst, msa_usw)
  118. #define SD(val, pdst) do { \
  119. uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \
  120. const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \
  121. const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \
  122. SW(val0_m, pdst_sd_m); \
  123. SW(val1_m, pdst_sd_m + 4); \
  124. } while (0)
  125. #endif // (__mips_isa_rev >= 6)
  126. /* Description : Load 4 words with stride
  127. * Arguments : Inputs - psrc, stride
  128. * Outputs - out0, out1, out2, out3
  129. * Details : Load word in 'out0' from (psrc)
  130. * Load word in 'out1' from (psrc + stride)
  131. * Load word in 'out2' from (psrc + 2 * stride)
  132. * Load word in 'out3' from (psrc + 3 * stride)
  133. */
  134. #define LW4(psrc, stride, out0, out1, out2, out3) do { \
  135. const uint8_t* ptmp = (const uint8_t*)psrc; \
  136. out0 = LW(ptmp); \
  137. ptmp += stride; \
  138. out1 = LW(ptmp); \
  139. ptmp += stride; \
  140. out2 = LW(ptmp); \
  141. ptmp += stride; \
  142. out3 = LW(ptmp); \
  143. } while (0)
  144. /* Description : Store words with stride
  145. * Arguments : Inputs - in0, in1, in2, in3, pdst, stride
  146. * Details : Store word from 'in0' to (pdst)
  147. * Store word from 'in1' to (pdst + stride)
  148. * Store word from 'in2' to (pdst + 2 * stride)
  149. * Store word from 'in3' to (pdst + 3 * stride)
  150. */
  151. #define SW4(in0, in1, in2, in3, pdst, stride) do { \
  152. uint8_t* ptmp = (uint8_t*)pdst; \
  153. SW(in0, ptmp); \
  154. ptmp += stride; \
  155. SW(in1, ptmp); \
  156. ptmp += stride; \
  157. SW(in2, ptmp); \
  158. ptmp += stride; \
  159. SW(in3, ptmp); \
  160. } while (0)
  161. #define SW3(in0, in1, in2, pdst, stride) do { \
  162. uint8_t* ptmp = (uint8_t*)pdst; \
  163. SW(in0, ptmp); \
  164. ptmp += stride; \
  165. SW(in1, ptmp); \
  166. ptmp += stride; \
  167. SW(in2, ptmp); \
  168. } while (0)
  169. #define SW2(in0, in1, pdst, stride) do { \
  170. uint8_t* ptmp = (uint8_t*)pdst; \
  171. SW(in0, ptmp); \
  172. ptmp += stride; \
  173. SW(in1, ptmp); \
  174. } while (0)
  175. /* Description : Store 4 double words with stride
  176. * Arguments : Inputs - in0, in1, in2, in3, pdst, stride
  177. * Details : Store double word from 'in0' to (pdst)
  178. * Store double word from 'in1' to (pdst + stride)
  179. * Store double word from 'in2' to (pdst + 2 * stride)
  180. * Store double word from 'in3' to (pdst + 3 * stride)
  181. */
  182. #define SD4(in0, in1, in2, in3, pdst, stride) do { \
  183. uint8_t* ptmp = (uint8_t*)pdst; \
  184. SD(in0, ptmp); \
  185. ptmp += stride; \
  186. SD(in1, ptmp); \
  187. ptmp += stride; \
  188. SD(in2, ptmp); \
  189. ptmp += stride; \
  190. SD(in3, ptmp); \
  191. } while (0)
  192. /* Description : Load vectors with 16 byte elements with stride
  193. * Arguments : Inputs - psrc, stride
  194. * Outputs - out0, out1
  195. * Return Type - as per RTYPE
  196. * Details : Load 16 byte elements in 'out0' from (psrc)
  197. * Load 16 byte elements in 'out1' from (psrc + stride)
  198. */
  199. #define LD_B2(RTYPE, psrc, stride, out0, out1) do { \
  200. out0 = LD_B(RTYPE, psrc); \
  201. out1 = LD_B(RTYPE, psrc + stride); \
  202. } while (0)
  203. #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
  204. #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
  205. #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do { \
  206. LD_B2(RTYPE, psrc, stride, out0, out1); \
  207. out2 = LD_B(RTYPE, psrc + 2 * stride); \
  208. } while (0)
  209. #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
  210. #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
  211. #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \
  212. LD_B2(RTYPE, psrc, stride, out0, out1); \
  213. LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \
  214. } while (0)
  215. #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
  216. #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
  217. #define LD_B8(RTYPE, psrc, stride, \
  218. out0, out1, out2, out3, out4, out5, out6, out7) do { \
  219. LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3); \
  220. LD_B4(RTYPE, psrc + 4 * stride, stride, out4, out5, out6, out7); \
  221. } while (0)
  222. #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
  223. #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
  224. /* Description : Load vectors with 8 halfword elements with stride
  225. * Arguments : Inputs - psrc, stride
  226. * Outputs - out0, out1
  227. * Details : Load 8 halfword elements in 'out0' from (psrc)
  228. * Load 8 halfword elements in 'out1' from (psrc + stride)
  229. */
  230. #define LD_H2(RTYPE, psrc, stride, out0, out1) do { \
  231. out0 = LD_H(RTYPE, psrc); \
  232. out1 = LD_H(RTYPE, psrc + stride); \
  233. } while (0)
  234. #define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
  235. #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
  236. /* Description : Load vectors with 4 word elements with stride
  237. * Arguments : Inputs - psrc, stride
  238. * Outputs - out0, out1, out2, out3
  239. * Details : Load 4 word elements in 'out0' from (psrc + 0 * stride)
  240. * Load 4 word elements in 'out1' from (psrc + 1 * stride)
  241. * Load 4 word elements in 'out2' from (psrc + 2 * stride)
  242. * Load 4 word elements in 'out3' from (psrc + 3 * stride)
  243. */
  244. #define LD_W2(RTYPE, psrc, stride, out0, out1) do { \
  245. out0 = LD_W(RTYPE, psrc); \
  246. out1 = LD_W(RTYPE, psrc + stride); \
  247. } while (0)
  248. #define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
  249. #define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)
  250. #define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do { \
  251. LD_W2(RTYPE, psrc, stride, out0, out1); \
  252. out2 = LD_W(RTYPE, psrc + 2 * stride); \
  253. } while (0)
  254. #define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
  255. #define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)
  256. #define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \
  257. LD_W2(RTYPE, psrc, stride, out0, out1); \
  258. LD_W2(RTYPE, psrc + 2 * stride, stride, out2, out3); \
  259. } while (0)
  260. #define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)
  261. #define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)
  262. /* Description : Store vectors of 16 byte elements with stride
  263. * Arguments : Inputs - in0, in1, pdst, stride
  264. * Details : Store 16 byte elements from 'in0' to (pdst)
  265. * Store 16 byte elements from 'in1' to (pdst + stride)
  266. */
  267. #define ST_B2(RTYPE, in0, in1, pdst, stride) do { \
  268. ST_B(RTYPE, in0, pdst); \
  269. ST_B(RTYPE, in1, pdst + stride); \
  270. } while (0)
  271. #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
  272. #define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
  273. #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \
  274. ST_B2(RTYPE, in0, in1, pdst, stride); \
  275. ST_B2(RTYPE, in2, in3, pdst + 2 * stride, stride); \
  276. } while (0)
  277. #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
  278. #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
  279. #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
  280. pdst, stride) do { \
  281. ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
  282. ST_B4(RTYPE, in4, in5, in6, in7, pdst + 4 * stride, stride); \
  283. } while (0)
  284. #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
  285. /* Description : Store vectors of 4 word elements with stride
  286. * Arguments : Inputs - in0, in1, in2, in3, pdst, stride
  287. * Details : Store 4 word elements from 'in0' to (pdst + 0 * stride)
  288. * Store 4 word elements from 'in1' to (pdst + 1 * stride)
  289. * Store 4 word elements from 'in2' to (pdst + 2 * stride)
  290. * Store 4 word elements from 'in3' to (pdst + 3 * stride)
  291. */
  292. #define ST_W2(RTYPE, in0, in1, pdst, stride) do { \
  293. ST_W(RTYPE, in0, pdst); \
  294. ST_W(RTYPE, in1, pdst + stride); \
  295. } while (0)
  296. #define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
  297. #define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)
  298. #define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do { \
  299. ST_W2(RTYPE, in0, in1, pdst, stride); \
  300. ST_W(RTYPE, in2, pdst + 2 * stride); \
  301. } while (0)
  302. #define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
  303. #define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)
  304. #define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \
  305. ST_W2(RTYPE, in0, in1, pdst, stride); \
  306. ST_W2(RTYPE, in2, in3, pdst + 2 * stride, stride); \
  307. } while (0)
  308. #define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
  309. #define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)
  310. /* Description : Store vectors of 8 halfword elements with stride
  311. * Arguments : Inputs - in0, in1, pdst, stride
  312. * Details : Store 8 halfword elements from 'in0' to (pdst)
  313. * Store 8 halfword elements from 'in1' to (pdst + stride)
  314. */
  315. #define ST_H2(RTYPE, in0, in1, pdst, stride) do { \
  316. ST_H(RTYPE, in0, pdst); \
  317. ST_H(RTYPE, in1, pdst + stride); \
  318. } while (0)
  319. #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
  320. #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
  321. /* Description : Store 2x4 byte block to destination memory from input vector
  322. * Arguments : Inputs - in, stidx, pdst, stride
  323. * Details : Index 'stidx' halfword element from 'in' vector is copied to
  324. * the GP register and stored to (pdst)
  325. * Index 'stidx+1' halfword element from 'in' vector is copied to
  326. * the GP register and stored to (pdst + stride)
  327. * Index 'stidx+2' halfword element from 'in' vector is copied to
  328. * the GP register and stored to (pdst + 2 * stride)
  329. * Index 'stidx+3' halfword element from 'in' vector is copied to
  330. * the GP register and stored to (pdst + 3 * stride)
  331. */
  332. #define ST2x4_UB(in, stidx, pdst, stride) do { \
  333. uint8_t* pblk_2x4_m = (uint8_t*)pdst; \
  334. const uint16_t out0_m = __msa_copy_s_h((v8i16)in, stidx); \
  335. const uint16_t out1_m = __msa_copy_s_h((v8i16)in, stidx + 1); \
  336. const uint16_t out2_m = __msa_copy_s_h((v8i16)in, stidx + 2); \
  337. const uint16_t out3_m = __msa_copy_s_h((v8i16)in, stidx + 3); \
  338. SH(out0_m, pblk_2x4_m); \
  339. pblk_2x4_m += stride; \
  340. SH(out1_m, pblk_2x4_m); \
  341. pblk_2x4_m += stride; \
  342. SH(out2_m, pblk_2x4_m); \
  343. pblk_2x4_m += stride; \
  344. SH(out3_m, pblk_2x4_m); \
  345. } while (0)
  346. /* Description : Store 4x4 byte block to destination memory from input vector
  347. * Arguments : Inputs - in0, in1, pdst, stride
  348. * Details : 'Idx0' word element from input vector 'in0' is copied to the
  349. * GP register and stored to (pdst)
  350. * 'Idx1' word element from input vector 'in0' is copied to the
  351. * GP register and stored to (pdst + stride)
  352. * 'Idx2' word element from input vector 'in0' is copied to the
  353. * GP register and stored to (pdst + 2 * stride)
  354. * 'Idx3' word element from input vector 'in0' is copied to the
  355. * GP register and stored to (pdst + 3 * stride)
  356. */
  357. #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do { \
  358. uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \
  359. const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \
  360. const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \
  361. const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \
  362. const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \
  363. SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
  364. } while (0)
  365. #define ST4x8_UB(in0, in1, pdst, stride) do { \
  366. uint8_t* const pblk_4x8 = (uint8_t*)pdst; \
  367. ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
  368. ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
  369. } while (0)
  370. /* Description : Immediate number of elements to slide
  371. * Arguments : Inputs - in0, in1, slide_val
  372. * Outputs - out
  373. * Return Type - as per RTYPE
  374. * Details : Byte elements from 'in1' vector are slid into 'in0' by
  375. * value specified in the 'slide_val'
  376. */
  377. #define SLDI_B(RTYPE, in0, in1, slide_val) \
  378. (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) \
  379. #define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)
  380. #define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
  381. #define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)
  382. /* Description : Shuffle byte vector elements as per mask vector
  383. * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
  384. * Outputs - out0, out1
  385. * Return Type - as per RTYPE
  386. * Details : Byte elements from 'in0' & 'in1' are copied selectively to
  387. * 'out0' as per control vector 'mask0'
  388. */
  389. #define VSHF_B(RTYPE, in0, in1, mask) \
  390. (RTYPE)__msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0)
  391. #define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)
  392. #define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)
  393. #define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)
  394. #define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)
  395. #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \
  396. out0 = VSHF_B(RTYPE, in0, in1, mask0); \
  397. out1 = VSHF_B(RTYPE, in2, in3, mask1); \
  398. } while (0)
  399. #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
  400. #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
  401. #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
  402. #define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
  403. /* Description : Shuffle halfword vector elements as per mask vector
  404. * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
  405. * Outputs - out0, out1
  406. * Return Type - as per RTYPE
  407. * Details : halfword elements from 'in0' & 'in1' are copied selectively to
  408. * 'out0' as per control vector 'mask0'
  409. */
  410. #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \
  411. out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \
  412. out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \
  413. } while (0)
  414. #define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
  415. #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
  416. /* Description : Dot product of byte vector elements
  417. * Arguments : Inputs - mult0, mult1, cnst0, cnst1
  418. * Outputs - out0, out1
  419. * Return Type - as per RTYPE
  420. * Details : Signed byte elements from 'mult0' are multiplied with
  421. * signed byte elements from 'cnst0' producing a result
  422. * twice the size of input i.e. signed halfword.
  423. * The multiplication result of adjacent odd-even elements
  424. * are added together and written to the 'out0' vector
  425. */
  426. #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
  427. out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
  428. out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
  429. } while (0)
  430. #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
  431. /* Description : Dot product of halfword vector elements
  432. * Arguments : Inputs - mult0, mult1, cnst0, cnst1
  433. * Outputs - out0, out1
  434. * Return Type - as per RTYPE
  435. * Details : Signed halfword elements from 'mult0' are multiplied with
  436. * signed halfword elements from 'cnst0' producing a result
  437. * twice the size of input i.e. signed word.
  438. * The multiplication result of adjacent odd-even elements
  439. * are added together and written to the 'out0' vector
  440. */
  441. #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
  442. out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
  443. out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
  444. } while (0)
  445. #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
  446. /* Description : Dot product of unsigned word vector elements
  447. * Arguments : Inputs - mult0, mult1, cnst0, cnst1
  448. * Outputs - out0, out1
  449. * Return Type - as per RTYPE
  450. * Details : Unsigned word elements from 'mult0' are multiplied with
  451. * unsigned word elements from 'cnst0' producing a result
  452. * twice the size of input i.e. unsigned double word.
  453. * The multiplication result of adjacent odd-even elements
  454. * are added together and written to the 'out0' vector
  455. */
  456. #define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
  457. out0 = (RTYPE)__msa_dotp_u_d((v4u32)mult0, (v4u32)cnst0); \
  458. out1 = (RTYPE)__msa_dotp_u_d((v4u32)mult1, (v4u32)cnst1); \
  459. } while (0)
  460. #define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)
  461. /* Description : Dot product & addition of halfword vector elements
  462. * Arguments : Inputs - mult0, mult1, cnst0, cnst1
  463. * Outputs - out0, out1
  464. * Return Type - as per RTYPE
  465. * Details : Signed halfword elements from 'mult0' are multiplied with
  466. * signed halfword elements from 'cnst0' producing a result
  467. * twice the size of input i.e. signed word.
  468. * The multiplication result of adjacent odd-even elements
  469. * are added to the 'out0' vector
  470. */
  471. #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
  472. out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
  473. out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
  474. } while (0)
  475. #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
  476. /* Description : Clips all signed halfword elements of input vector
  477. * between 0 & 255
  478. * Arguments : Input/output - val
  479. * Return Type - signed halfword
  480. */
  481. #define CLIP_SH_0_255(val) do { \
  482. const v8i16 max_m = __msa_ldi_h(255); \
  483. val = __msa_maxi_s_h((v8i16)val, 0); \
  484. val = __msa_min_s_h(max_m, (v8i16)val); \
  485. } while (0)
  486. #define CLIP_SH2_0_255(in0, in1) do { \
  487. CLIP_SH_0_255(in0); \
  488. CLIP_SH_0_255(in1); \
  489. } while (0)
  490. #define CLIP_SH4_0_255(in0, in1, in2, in3) do { \
  491. CLIP_SH2_0_255(in0, in1); \
  492. CLIP_SH2_0_255(in2, in3); \
  493. } while (0)
  494. /* Description : Clips all unsigned halfword elements of input vector
  495. * between 0 & 255
  496. * Arguments : Input - in
  497. * Output - out_m
  498. * Return Type - unsigned halfword
  499. */
  500. #define CLIP_UH_0_255(in) do { \
  501. const v8u16 max_m = (v8u16)__msa_ldi_h(255); \
  502. in = __msa_maxi_u_h((v8u16) in, 0); \
  503. in = __msa_min_u_h((v8u16) max_m, (v8u16) in); \
  504. } while (0)
  505. #define CLIP_UH2_0_255(in0, in1) do { \
  506. CLIP_UH_0_255(in0); \
  507. CLIP_UH_0_255(in1); \
  508. } while (0)
  509. /* Description : Clips all signed word elements of input vector
  510. * between 0 & 255
  511. * Arguments : Input/output - val
  512. * Return Type - signed word
  513. */
  514. #define CLIP_SW_0_255(val) do { \
  515. const v4i32 max_m = __msa_ldi_w(255); \
  516. val = __msa_maxi_s_w((v4i32)val, 0); \
  517. val = __msa_min_s_w(max_m, (v4i32)val); \
  518. } while (0)
  519. #define CLIP_SW4_0_255(in0, in1, in2, in3) do { \
  520. CLIP_SW_0_255(in0); \
  521. CLIP_SW_0_255(in1); \
  522. CLIP_SW_0_255(in2); \
  523. CLIP_SW_0_255(in3); \
  524. } while (0)
  525. /* Description : Horizontal addition of 4 signed word elements of input vector
  526. * Arguments : Input - in (signed word vector)
  527. * Output - sum_m (i32 sum)
  528. * Return Type - signed word (GP)
  529. * Details : 4 signed word elements of 'in' vector are added together and
  530. * the resulting integer sum is returned
  531. */
  532. static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
  533. const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
  534. const v2i64 res1_m = __msa_splati_d(res0_m, 1);
  535. const v2i64 out = res0_m + res1_m;
  536. int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
  537. return sum_m;
  538. }
  539. #define HADD_SW_S32(in) func_hadd_sw_s32(in)
  540. /* Description : Horizontal addition of 8 signed halfword elements
  541. * Arguments : Input - in (signed halfword vector)
  542. * Output - sum_m (s32 sum)
  543. * Return Type - signed word
  544. * Details : 8 signed halfword elements of input vector are added
  545. * together and the resulting integer sum is returned
  546. */
  547. static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {
  548. const v4i32 res = __msa_hadd_s_w(in, in);
  549. const v2i64 res0 = __msa_hadd_s_d(res, res);
  550. const v2i64 res1 = __msa_splati_d(res0, 1);
  551. const v2i64 res2 = res0 + res1;
  552. const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);
  553. return sum_m;
  554. }
  555. #define HADD_SH_S32(in) func_hadd_sh_s32(in)
  556. /* Description : Horizontal addition of 8 unsigned halfword elements
  557. * Arguments : Input - in (unsigned halfword vector)
  558. * Output - sum_m (u32 sum)
  559. * Return Type - unsigned word
  560. * Details : 8 unsigned halfword elements of input vector are added
  561. * together and the resulting integer sum is returned
  562. */
  563. static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
  564. uint32_t sum_m;
  565. const v4u32 res_m = __msa_hadd_u_w(in, in);
  566. v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);
  567. v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
  568. res0_m = res0_m + res1_m;
  569. sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
  570. return sum_m;
  571. }
  572. #define HADD_UH_U32(in) func_hadd_uh_u32(in)
  573. /* Description : Horizontal addition of signed half word vector elements
  574. Arguments : Inputs - in0, in1
  575. Outputs - out0, out1
  576. Return Type - as per RTYPE
  577. Details : Each signed odd half word element from 'in0' is added to
  578. even signed half word element from 'in0' (pairwise) and the
  579. halfword result is written in 'out0'
  580. */
  581. #define HADD_SH2(RTYPE, in0, in1, out0, out1) do { \
  582. out0 = (RTYPE)__msa_hadd_s_w((v8i16)in0, (v8i16)in0); \
  583. out1 = (RTYPE)__msa_hadd_s_w((v8i16)in1, (v8i16)in1); \
  584. } while (0)
  585. #define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)
  586. #define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do { \
  587. HADD_SH2(RTYPE, in0, in1, out0, out1); \
  588. HADD_SH2(RTYPE, in2, in3, out2, out3); \
  589. } while (0)
  590. #define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)
/* Description : Horizontal subtraction of unsigned byte vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each unsigned odd byte element of 'in0' is subtracted from the
 *               adjacent even unsigned byte element of 'in0' (pairwise) and
 *               the widened halfword result is written to 'out0'.
 *               'in1' is processed the same way into 'out1'.
 */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do { \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
} while (0)
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
/* Description : Set word elements of input vector to GPR values
 * Arguments   : Inputs - in0, in1 (plus in2, in3 for the 4-element variant)
 *               Output - out
 *               Return Type - as per RTYPE
 * Details     : Word element n of vector 'out' is set to the value of GPR
 *               'inN'; the other elements of 'out' are preserved (each
 *               insert_w replaces a single lane of the previous value).
 */
#define INSERT_W2(RTYPE, in0, in1, out) do { \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
} while (0)
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) do { \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
} while (0)
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
/* Description : Set doubleword elements of input vector to GPR values
 * Arguments   : Inputs - in0, in1
 *               Output - out
 *               Return Type - as per RTYPE
 * Details     : Doubleword element 0 of vector 'out' is set to GPR 'in0' and
 *               element 1 to GPR 'in1'.
 */
#define INSERT_D2(RTYPE, in0, in1, out) do { \
  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
} while (0)
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' and 'in1' are interleaved and
 *               written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
} while (0)
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
/* Description : Interleave odd byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd byte elements of 'in0' and 'in1' are interleaved and
 *               written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvod_b((v16i8)in1, (v16i8)in0); \
  out1 = (RTYPE)__msa_ilvod_b((v16i8)in3, (v16i8)in2); \
} while (0)
#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved and
 *               written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
} while (0)
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
/* Description : Interleave odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved and
 *               written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvod_h((v8i16)in1, (v8i16)in0); \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \
} while (0)
#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
/* Description : Interleave even word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved and
 *               written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
} while (0)
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
/* Description : Interleave even-odd word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved and
 *               written to 'out0'.
 *               Odd word elements of 'in2' and 'in3' are interleaved and
 *               written to 'out1'.
 */
#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
  out1 = (RTYPE)__msa_ilvod_w((v4i32)in3, (v4i32)in2); \
} while (0)
#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
/* Description : Interleave even-odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'.
 *               Odd halfword elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'.
 */
#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \
} while (0)
#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
/* Description : Interleave even doubleword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even doubleword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
} while (0)
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
/* Description : Interleave left half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Left-half byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
} while (0)
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
#define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)
/* Description : Interleave right half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right-half byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
} while (0)
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) do { \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
} while (0)
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right-half halfword elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'; 'in2'/'in3' likewise
 *               produce 'out1'.
 */
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
} while (0)
#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) do { \
  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
} while (0)
#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
/* Description : Interleave right half of doubleword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right-half doubleword elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'; 'in2'/'in3' likewise
 *               produce 'out1'.
 */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \
  out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \
} while (0)
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3) do { \
  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
} while (0)
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right halves of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right-half elements of 'in0' and 'in1' are interleaved and
 *               written to 'out0'; left-half elements are interleaved and
 *               written to 'out1'. The _B/_H/_W variants operate on byte,
 *               halfword and word elements respectively.
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
} while (0)
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
} while (0)
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
#define ILVRL_W2(RTYPE, in0, in1, out0, out1) do { \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
} while (0)
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' are copied to the left half of
 *               'out0' and even byte elements of 'in1' to the right half of
 *               'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
} while (0)
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) do { \
  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
} while (0)
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
/* Description : Pack even halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' are copied to the left half
 *               of 'out0' and even halfword elements of 'in1' to the right
 *               half of 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
  out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
} while (0)
#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
/* Description : Pack even word elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' are copied to the left half of
 *               'out0' and even word elements of 'in1' to the right half of
 *               'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_pckev_w((v4i32)in0, (v4i32)in1); \
  out1 = (RTYPE)__msa_pckev_w((v4i32)in2, (v4i32)in3); \
} while (0)
#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)
/* Description : Pack odd halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' are copied to the left half
 *               of 'out0' and odd halfword elements of 'in1' to the right
 *               half of 'out0'; 'in2'/'in3' likewise produce 'out1'.
 */
#define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)__msa_pckod_h((v8i16)in0, (v8i16)in1); \
  out1 = (RTYPE)__msa_pckod_h((v8i16)in2, (v8i16)in3); \
} while (0)
#define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)
#define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)
#define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)
#define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)
/* Description : Arithmetic immediate shift right all elements of word vector
 * Arguments   : Inputs - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is arithmetically right shifted
 *               by 'shift_val' and the result is written in-place.
 *               NOTE(review): SRAI means shift-right-arithmetic-*immediate*,
 *               so 'shift_val' is presumably a compile-time constant, not a
 *               GP variable as previously documented - confirm against the
 *               SRAI_W/SRAI_H wrapper definitions (not in this chunk).
 */
#define SRAI_W2(RTYPE, in0, in1, shift_val) do { \
  in0 = (RTYPE)SRAI_W(in0, shift_val); \
  in1 = (RTYPE)SRAI_W(in1, shift_val); \
} while (0)
#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)
#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do { \
  SRAI_W2(RTYPE, in0, in1, shift_val); \
  SRAI_W2(RTYPE, in2, in3, shift_val); \
} while (0)
#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)
/* Description : Arithmetic immediate shift right all elements of halfword
 *               vector
 * Arguments   : Inputs - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Same as SRAI_W2 but on halfword elements; see the NOTE above
 *               regarding the immediate shift amount.
 */
#define SRAI_H2(RTYPE, in0, in1, shift_val) do { \
  in0 = (RTYPE)SRAI_H(in0, shift_val); \
  in1 = (RTYPE)SRAI_H(in1, shift_val); \
} while (0)
#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)
/* Description : Arithmetic rounded shift right all elements of word vector
 * Arguments   : Inputs - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is arithmetically right shifted
 *               by 'shift'; the last discarded bit is added back (rounding)
 *               and the result is written in-place.
 *               NOTE(review): __msa_srari_w takes an immediate shift amount,
 *               so 'shift' must be a compile-time constant (the earlier "GP
 *               variable" wording appears to have been copied from the SRAR
 *               variants below, which do take a vector shift).
 */
#define SRARI_W2(RTYPE, in0, in1, shift) do { \
  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
} while (0)
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) do { \
  SRARI_W2(RTYPE, in0, in1, shift); \
  SRARI_W2(RTYPE, in2, in3, shift); \
} while (0)
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Shift right arithmetic rounded doublewords
 * Arguments   : Inputs - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per RTYPE
 * Details     : Each element of vector 'in0' is shifted right arithmetically
 *               by the number of bits in the corresponding element of the
 *               vector 'shift'. The last discarded bit is added to the
 *               shifted value for rounding and the result is written
 *               in-place. Unlike SRARI above, 'shift' is a vector.
 */
#define SRAR_D2(RTYPE, in0, in1, shift) do { \
  in0 = (RTYPE)__msa_srar_d((v2i64)in0, (v2i64)shift); \
  in1 = (RTYPE)__msa_srar_d((v2i64)in1, (v2i64)shift); \
} while (0)
#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)
#define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do { \
  SRAR_D2(RTYPE, in0, in1, shift); \
  SRAR_D2(RTYPE, in2, in3, shift); \
} while (0)
#define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
#define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)
/* Description : Addition of 2 pairs of halfword vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in0' is added to 'in1' and the result is
 *               written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 *               NOTE(review): ADDVI suggests an add-immediate, i.e. 'in1' and
 *               'in3' are likely immediate operands - confirm against the
 *               ADDVI_H definition (not in this chunk).
 */
#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)ADDVI_H(in0, in1); \
  out1 = (RTYPE)ADDVI_H(in2, in3); \
} while (0)
#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
/* Description : Addition of 2 pairs of word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Each element in 'in0' is added to 'in1' and the result is
 *               written to 'out0'; 'in2'/'in3' likewise produce 'out1'.
 *               See the immediate-operand NOTE on ADDVI_H2 above.
 */
#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
  out0 = (RTYPE)ADDVI_W(in0, in1); \
  out1 = (RTYPE)ADDVI_W(in2, in3); \
} while (0)
#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
/* Description : Fill 2 pairs of word vectors from GP registers
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : GP register 'in0' is replicated into every word element of
 *               'out0'; 'in1' likewise fills 'out1'.
 */
#define FILL_W2(RTYPE, in0, in1, out0, out1) do { \
  out0 = (RTYPE)__msa_fill_w(in0); \
  out1 = (RTYPE)__msa_fill_w(in1); \
} while (0)
#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
/* Description : Addition of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Element-wise 'in0 + in1' is written to 'out0' and
 *               'in2 + in3' to 'out1'. Plain '+' is used, so these work for
 *               any GCC vector type and wrap per the element type.
 */
#define ADD2(in0, in1, in2, in3, out0, out1) do { \
  out0 = in0 + in1; \
  out1 = in2 + in3; \
} while (0)
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
             out0, out1, out2, out3) do { \
  ADD2(in0, in1, in2, in3, out0, out1); \
  ADD2(in4, in5, in6, in7, out2, out3); \
} while (0)
/* Description : Subtraction of pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3 (...)
 *               Outputs - out0, out1 (...)
 * Details     : Element-wise 'in1' is subtracted from 'in0' and the result
 *               written to 'out0'; remaining input pairs fill the remaining
 *               outputs the same way.
 */
#define SUB2(in0, in1, in2, in3, out0, out1) do { \
  out0 = in0 - in1; \
  out1 = in2 - in3; \
} while (0)
#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do { \
  out0 = in0 - in1; \
  out1 = in2 - in3; \
  out2 = in4 - in5; \
} while (0)
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
             out0, out1, out2, out3) do { \
  out0 = in0 - in1; \
  out1 = in2 - in3; \
  out2 = in4 - in5; \
  out3 = in6 - in7; \
} while (0)
/* Description : Addition - Subtraction of input vectors (butterfly step)
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : 'in0 + in1' is written to 'out0' and 'in0 - in1' to 'out1'.
 *               Outputs must not alias the inputs, since 'in0'/'in1' are
 *               read twice.
 */
#define ADDSUB2(in0, in1, out0, out1) do { \
  out0 = in0 + in1; \
  out1 = in0 - in1; \
} while (0)
/* Description : Multiplication of pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : Element-wise 'in0 * in1' is written to 'out0' and
 *               'in2 * in3' to 'out1' (no widening: products are truncated
 *               to the element type).
 */
#define MUL2(in0, in1, in2, in3, out0, out1) do { \
  out0 = in0 * in1; \
  out1 = in2 * in3; \
} while (0)
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
             out0, out1, out2, out3) do { \
  MUL2(in0, in1, in2, in3, out0, out1); \
  MUL2(in4, in5, in6, in7, out2, out3); \
} while (0)
/* Description : Sign extend halfword elements from right half of the vector
 * Arguments   : Input  - in (halfword vector)
 *               Output - out (sign extended word vector)
 *               Return Type - signed word
 * Details     : clti_s_h produces an all-ones mask for negative halfwords
 *               (zero otherwise); interleaving that mask with 'in' places the
 *               sign bits in the high halfword of each word, yielding 4
 *               sign-extended word elements from the right half of 'in'.
 */
#define UNPCK_R_SH_SW(in, out) do { \
  const v8i16 sign_m = __msa_clti_s_h((v8i16)in, 0); \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
} while (0)
/* Description : Sign extend halfword elements from input vector and return
 *               the result in a pair of vectors
 * Arguments   : Input   - in (halfword vector)
 *               Outputs - out0, out1 (sign extended word vectors)
 *               Return Type - signed word
 * Details     : Same sign-mask trick as UNPCK_R_SH_SW, but both halves are
 *               produced: the right-interleave gives 4 sign-extended words in
 *               'out0', the left-interleave 4 more in 'out1'.
 */
#define UNPCK_SH_SW(in, out0, out1) do { \
  const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \
  ILVRL_H2_SW(tmp_m, in, out0, out1); \
} while (0)
/* Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     : Classic 4-point butterfly: sums of the outer/inner pairs go
 *               to out0/out1, differences to out2/out3. Outputs must not
 *               alias inputs - each input is read twice.
 */
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do { \
  out0 = in0 + in3; \
  out1 = in1 + in2; \
  out2 = in1 - in2; \
  out3 = in0 - in3; \
} while (0)
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3
 *               Return Type - unsigned byte
 * Details     : Built from word/doubleword gathers followed by byte and
 *               halfword even/odd interleaves. Note that 'out1' and 'out3'
 *               hold intermediate values partway through and only receive
 *               their final rows in the last ILVEVOD_H2_UB - the outputs
 *               must therefore not alias any of the inputs.
 */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3) do { \
  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m; \
  ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m); \
  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
  ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3); \
  ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m); \
  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
  ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2); \
  ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3); \
} while (0)
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - unsigned byte
 * Details     : Performs the transpose with doubleword gathers followed by
 *               byte, halfword and word even/odd interleaves. The output
 *               registers are used as scratch throughout (e.g. out5/out7 are
 *               overwritten several times before their final assignment), so
 *               outputs must not alias any of the inputs.
 */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, \
                            out6, out7) do { \
  v8i16 tmp0_m, tmp1_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
  v4i32 tmp2_m, tmp3_m; \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
  ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m); \
  ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m); \
  ILVEV_B2_UB(out3, out2, out1, out0, out5, out7); \
  ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m); \
  ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4); \
  ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6); \
  ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5); \
  ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
  ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7); \
} while (0)
/* Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Return Type - as per RTYPE
 * Details     : Two word interleave passes produce column pairs in s0..s3;
 *               doubleword interleaves then assemble the transposed rows.
 *               Inputs are fully consumed before outputs are written, so
 *               in-place use (outN == inN) is safe.
 */
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \
                       out0, out1, out2, out3) do { \
  v4i32 s0_m, s1_m, s2_m, s3_m; \
  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
  out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
  out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
  out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
} while (0)
#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
/* Description : Add block 4x4
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : The least significant 4 bytes of each input vector are added
 *               to the 4 destination rows (loaded via LW4), clipped to
 *               0..255 and stored back as 4-byte words with ST4x4_UB.
 *               'pdst' and 'stride' are expanded more than once - do not
 *               pass side-effecting expressions.
 */
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do { \
  uint32_t src0_m, src1_m, src2_m, src3_m; \
  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
  v16i8 dst0_m = { 0 }; \
  v16i8 dst1_m = { 0 }; \
  const v16i8 zero_m = { 0 }; \
  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
  CLIP_SH2_0_255(res0_m, res1_m); \
  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
} while (0)
  1254. /* Description : Pack even byte elements, extract 0 & 2 index words from pair
  1255. * of results and store 4 words in destination memory as per
  1256. * stride
  1257. * Arguments : Inputs - in0, in1, in2, in3, pdst, stride
  1258. */
  1259. #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do { \
  1260. v16i8 tmp0_m, tmp1_m; \
  1261. PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
  1262. ST4x4_UB(tmp0_m, tmp1_m, 0, 2, 0, 2, pdst, stride); \
  1263. } while (0)
/* Description : average with rounding (in0 + in1 + 1) / 2.
 * Arguments   : Inputs  - in0, in1, in2, in3,
 *               Outputs - out0, out1
 * Return Type - as per RTYPE
 * Details     : Each unsigned byte element from 'in0' vector is added with
 *               each unsigned byte element from 'in1' vector. Then the average
 *               with rounding is calculated and written to 'out0'.
 *               'in2'/'in3' are averaged the same way into 'out1'.
 */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);       \
  out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3);       \
} while (0)
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
  1277. #endif // WEBP_USE_MSA
  1278. #endif // WEBP_DSP_MSA_MACRO_H_