mct.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. /*
  2. * The copyright in this software is being made available under the 2-clauses
  3. * BSD License, included below. This software may be subject to other third
  4. * party and contributor rights, including patent rights, and no such rights
  5. * are granted under this license.
  6. *
  7. * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
  8. * Copyright (c) 2002-2014, Professor Benoit Macq
  9. * Copyright (c) 2001-2003, David Janssens
  10. * Copyright (c) 2002-2003, Yannick Verschueren
  11. * Copyright (c) 2003-2007, Francois-Olivier Devaux
  12. * Copyright (c) 2003-2014, Antonin Descampe
  13. * Copyright (c) 2005, Herve Drolon, FreeImage Team
  14. * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR
  15. * Copyright (c) 2012, CS Systemes d'Information, France
  16. * All rights reserved.
  17. *
  18. * Redistribution and use in source and binary forms, with or without
  19. * modification, are permitted provided that the following conditions
  20. * are met:
  21. * 1. Redistributions of source code must retain the above copyright
  22. * notice, this list of conditions and the following disclaimer.
  23. * 2. Redistributions in binary form must reproduce the above copyright
  24. * notice, this list of conditions and the following disclaimer in the
  25. * documentation and/or other materials provided with the distribution.
  26. *
  27. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
  28. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37. * POSSIBILITY OF SUCH DAMAGE.
  38. */
  39. #ifdef __SSE__
  40. #include <xmmintrin.h>
  41. #endif
  42. #ifdef __SSE2__
  43. #include <emmintrin.h>
  44. #endif
  45. #ifdef __SSE4_1__
  46. #include <smmintrin.h>
  47. #endif
  48. #include "opj_includes.h"
  49. /* <summary> */
  50. /* This table contains the norms of the basis function of the reversible MCT. */
  51. /* </summary> */
  52. static const OPJ_FLOAT64 opj_mct_norms[3] = { 1.732, .8292, .8292 };
  53. /* <summary> */
  54. /* This table contains the norms of the basis function of the irreversible MCT. */
  55. /* </summary> */
  56. static const OPJ_FLOAT64 opj_mct_norms_real[3] = { 1.732, 1.805, 1.573 };
  57. const OPJ_FLOAT64 * opj_mct_get_mct_norms()
  58. {
  59. return opj_mct_norms;
  60. }
  61. const OPJ_FLOAT64 * opj_mct_get_mct_norms_real()
  62. {
  63. return opj_mct_norms_real;
  64. }
  65. /* <summary> */
  66. /* Forward reversible MCT. */
  67. /* </summary> */
  68. #ifdef __SSE2__
  69. void opj_mct_encode(
  70. OPJ_INT32* OPJ_RESTRICT c0,
  71. OPJ_INT32* OPJ_RESTRICT c1,
  72. OPJ_INT32* OPJ_RESTRICT c2,
  73. OPJ_SIZE_T n)
  74. {
  75. OPJ_SIZE_T i;
  76. const OPJ_SIZE_T len = n;
  77. /* buffer are aligned on 16 bytes */
  78. assert(((size_t)c0 & 0xf) == 0);
  79. assert(((size_t)c1 & 0xf) == 0);
  80. assert(((size_t)c2 & 0xf) == 0);
  81. for (i = 0; i < (len & ~3U); i += 4) {
  82. __m128i y, u, v;
  83. __m128i r = _mm_load_si128((const __m128i *) & (c0[i]));
  84. __m128i g = _mm_load_si128((const __m128i *) & (c1[i]));
  85. __m128i b = _mm_load_si128((const __m128i *) & (c2[i]));
  86. y = _mm_add_epi32(g, g);
  87. y = _mm_add_epi32(y, b);
  88. y = _mm_add_epi32(y, r);
  89. y = _mm_srai_epi32(y, 2);
  90. u = _mm_sub_epi32(b, g);
  91. v = _mm_sub_epi32(r, g);
  92. _mm_store_si128((__m128i *) & (c0[i]), y);
  93. _mm_store_si128((__m128i *) & (c1[i]), u);
  94. _mm_store_si128((__m128i *) & (c2[i]), v);
  95. }
  96. for (; i < len; ++i) {
  97. OPJ_INT32 r = c0[i];
  98. OPJ_INT32 g = c1[i];
  99. OPJ_INT32 b = c2[i];
  100. OPJ_INT32 y = (r + (g * 2) + b) >> 2;
  101. OPJ_INT32 u = b - g;
  102. OPJ_INT32 v = r - g;
  103. c0[i] = y;
  104. c1[i] = u;
  105. c2[i] = v;
  106. }
  107. }
  108. #else
  109. void opj_mct_encode(
  110. OPJ_INT32* OPJ_RESTRICT c0,
  111. OPJ_INT32* OPJ_RESTRICT c1,
  112. OPJ_INT32* OPJ_RESTRICT c2,
  113. OPJ_SIZE_T n)
  114. {
  115. OPJ_SIZE_T i;
  116. const OPJ_SIZE_T len = n;
  117. for (i = 0; i < len; ++i) {
  118. OPJ_INT32 r = c0[i];
  119. OPJ_INT32 g = c1[i];
  120. OPJ_INT32 b = c2[i];
  121. OPJ_INT32 y = (r + (g * 2) + b) >> 2;
  122. OPJ_INT32 u = b - g;
  123. OPJ_INT32 v = r - g;
  124. c0[i] = y;
  125. c1[i] = u;
  126. c2[i] = v;
  127. }
  128. }
  129. #endif
  130. /* <summary> */
  131. /* Inverse reversible MCT. */
  132. /* </summary> */
  133. #ifdef __SSE2__
  134. void opj_mct_decode(
  135. OPJ_INT32* OPJ_RESTRICT c0,
  136. OPJ_INT32* OPJ_RESTRICT c1,
  137. OPJ_INT32* OPJ_RESTRICT c2,
  138. OPJ_SIZE_T n)
  139. {
  140. OPJ_SIZE_T i;
  141. const OPJ_SIZE_T len = n;
  142. for (i = 0; i < (len & ~3U); i += 4) {
  143. __m128i r, g, b;
  144. __m128i y = _mm_load_si128((const __m128i *) & (c0[i]));
  145. __m128i u = _mm_load_si128((const __m128i *) & (c1[i]));
  146. __m128i v = _mm_load_si128((const __m128i *) & (c2[i]));
  147. g = y;
  148. g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2));
  149. r = _mm_add_epi32(v, g);
  150. b = _mm_add_epi32(u, g);
  151. _mm_store_si128((__m128i *) & (c0[i]), r);
  152. _mm_store_si128((__m128i *) & (c1[i]), g);
  153. _mm_store_si128((__m128i *) & (c2[i]), b);
  154. }
  155. for (; i < len; ++i) {
  156. OPJ_INT32 y = c0[i];
  157. OPJ_INT32 u = c1[i];
  158. OPJ_INT32 v = c2[i];
  159. OPJ_INT32 g = y - ((u + v) >> 2);
  160. OPJ_INT32 r = v + g;
  161. OPJ_INT32 b = u + g;
  162. c0[i] = r;
  163. c1[i] = g;
  164. c2[i] = b;
  165. }
  166. }
  167. #else
  168. void opj_mct_decode(
  169. OPJ_INT32* OPJ_RESTRICT c0,
  170. OPJ_INT32* OPJ_RESTRICT c1,
  171. OPJ_INT32* OPJ_RESTRICT c2,
  172. OPJ_SIZE_T n)
  173. {
  174. OPJ_SIZE_T i;
  175. for (i = 0; i < n; ++i) {
  176. OPJ_INT32 y = c0[i];
  177. OPJ_INT32 u = c1[i];
  178. OPJ_INT32 v = c2[i];
  179. OPJ_INT32 g = y - ((u + v) >> 2);
  180. OPJ_INT32 r = v + g;
  181. OPJ_INT32 b = u + g;
  182. c0[i] = r;
  183. c1[i] = g;
  184. c2[i] = b;
  185. }
  186. }
  187. #endif
  188. /* <summary> */
  189. /* Get norm of basis function of reversible MCT. */
  190. /* </summary> */
  191. OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno)
  192. {
  193. return opj_mct_norms[compno];
  194. }
  195. /* <summary> */
  196. /* Forward irreversible MCT. */
  197. /* </summary> */
  198. void opj_mct_encode_real(
  199. OPJ_FLOAT32* OPJ_RESTRICT c0,
  200. OPJ_FLOAT32* OPJ_RESTRICT c1,
  201. OPJ_FLOAT32* OPJ_RESTRICT c2,
  202. OPJ_SIZE_T n)
  203. {
  204. OPJ_SIZE_T i;
  205. #ifdef __SSE__
  206. const __m128 YR = _mm_set1_ps(0.299f);
  207. const __m128 YG = _mm_set1_ps(0.587f);
  208. const __m128 YB = _mm_set1_ps(0.114f);
  209. const __m128 UR = _mm_set1_ps(-0.16875f);
  210. const __m128 UG = _mm_set1_ps(-0.331260f);
  211. const __m128 UB = _mm_set1_ps(0.5f);
  212. const __m128 VR = _mm_set1_ps(0.5f);
  213. const __m128 VG = _mm_set1_ps(-0.41869f);
  214. const __m128 VB = _mm_set1_ps(-0.08131f);
  215. for (i = 0; i < (n >> 3); i ++) {
  216. __m128 r, g, b, y, u, v;
  217. r = _mm_load_ps(c0);
  218. g = _mm_load_ps(c1);
  219. b = _mm_load_ps(c2);
  220. y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)),
  221. _mm_mul_ps(b, YB));
  222. u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)),
  223. _mm_mul_ps(b, UB));
  224. v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)),
  225. _mm_mul_ps(b, VB));
  226. _mm_store_ps(c0, y);
  227. _mm_store_ps(c1, u);
  228. _mm_store_ps(c2, v);
  229. c0 += 4;
  230. c1 += 4;
  231. c2 += 4;
  232. r = _mm_load_ps(c0);
  233. g = _mm_load_ps(c1);
  234. b = _mm_load_ps(c2);
  235. y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)),
  236. _mm_mul_ps(b, YB));
  237. u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)),
  238. _mm_mul_ps(b, UB));
  239. v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)),
  240. _mm_mul_ps(b, VB));
  241. _mm_store_ps(c0, y);
  242. _mm_store_ps(c1, u);
  243. _mm_store_ps(c2, v);
  244. c0 += 4;
  245. c1 += 4;
  246. c2 += 4;
  247. }
  248. n &= 7;
  249. #endif
  250. for (i = 0; i < n; ++i) {
  251. OPJ_FLOAT32 r = c0[i];
  252. OPJ_FLOAT32 g = c1[i];
  253. OPJ_FLOAT32 b = c2[i];
  254. OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b;
  255. OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b;
  256. OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 0.08131f * b;
  257. c0[i] = y;
  258. c1[i] = u;
  259. c2[i] = v;
  260. }
  261. }
  262. /* <summary> */
  263. /* Inverse irreversible MCT. */
  264. /* </summary> */
  265. void opj_mct_decode_real(
  266. OPJ_FLOAT32* OPJ_RESTRICT c0,
  267. OPJ_FLOAT32* OPJ_RESTRICT c1,
  268. OPJ_FLOAT32* OPJ_RESTRICT c2,
  269. OPJ_SIZE_T n)
  270. {
  271. OPJ_SIZE_T i;
  272. #ifdef __SSE__
  273. __m128 vrv, vgu, vgv, vbu;
  274. vrv = _mm_set1_ps(1.402f);
  275. vgu = _mm_set1_ps(0.34413f);
  276. vgv = _mm_set1_ps(0.71414f);
  277. vbu = _mm_set1_ps(1.772f);
  278. for (i = 0; i < (n >> 3); ++i) {
  279. __m128 vy, vu, vv;
  280. __m128 vr, vg, vb;
  281. vy = _mm_load_ps(c0);
  282. vu = _mm_load_ps(c1);
  283. vv = _mm_load_ps(c2);
  284. vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
  285. vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
  286. vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
  287. _mm_store_ps(c0, vr);
  288. _mm_store_ps(c1, vg);
  289. _mm_store_ps(c2, vb);
  290. c0 += 4;
  291. c1 += 4;
  292. c2 += 4;
  293. vy = _mm_load_ps(c0);
  294. vu = _mm_load_ps(c1);
  295. vv = _mm_load_ps(c2);
  296. vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
  297. vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
  298. vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
  299. _mm_store_ps(c0, vr);
  300. _mm_store_ps(c1, vg);
  301. _mm_store_ps(c2, vb);
  302. c0 += 4;
  303. c1 += 4;
  304. c2 += 4;
  305. }
  306. n &= 7;
  307. #endif
  308. for (i = 0; i < n; ++i) {
  309. OPJ_FLOAT32 y = c0[i];
  310. OPJ_FLOAT32 u = c1[i];
  311. OPJ_FLOAT32 v = c2[i];
  312. OPJ_FLOAT32 r = y + (v * 1.402f);
  313. OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f));
  314. OPJ_FLOAT32 b = y + (u * 1.772f);
  315. c0[i] = r;
  316. c1[i] = g;
  317. c2[i] = b;
  318. }
  319. }
  320. /* <summary> */
  321. /* Get norm of basis function of irreversible MCT. */
  322. /* </summary> */
  323. OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno)
  324. {
  325. return opj_mct_norms_real[compno];
  326. }
  327. OPJ_BOOL opj_mct_encode_custom(
  328. OPJ_BYTE * pCodingdata,
  329. OPJ_SIZE_T n,
  330. OPJ_BYTE ** pData,
  331. OPJ_UINT32 pNbComp,
  332. OPJ_UINT32 isSigned)
  333. {
  334. OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata;
  335. OPJ_SIZE_T i;
  336. OPJ_UINT32 j;
  337. OPJ_UINT32 k;
  338. OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp;
  339. OPJ_INT32 * lCurrentData = 00;
  340. OPJ_INT32 * lCurrentMatrix = 00;
  341. OPJ_INT32 ** lData = (OPJ_INT32 **) pData;
  342. OPJ_UINT32 lMultiplicator = 1 << 13;
  343. OPJ_INT32 * lMctPtr;
  344. OPJ_ARG_NOT_USED(isSigned);
  345. lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof(
  346. OPJ_INT32));
  347. if (! lCurrentData) {
  348. return OPJ_FALSE;
  349. }
  350. lCurrentMatrix = lCurrentData + pNbComp;
  351. for (i = 0; i < lNbMatCoeff; ++i) {
  352. lCurrentMatrix[i] = (OPJ_INT32)(*(lMct++) * (OPJ_FLOAT32)lMultiplicator);
  353. }
  354. for (i = 0; i < n; ++i) {
  355. lMctPtr = lCurrentMatrix;
  356. for (j = 0; j < pNbComp; ++j) {
  357. lCurrentData[j] = (*(lData[j]));
  358. }
  359. for (j = 0; j < pNbComp; ++j) {
  360. *(lData[j]) = 0;
  361. for (k = 0; k < pNbComp; ++k) {
  362. *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]);
  363. ++lMctPtr;
  364. }
  365. ++lData[j];
  366. }
  367. }
  368. opj_free(lCurrentData);
  369. return OPJ_TRUE;
  370. }
  371. OPJ_BOOL opj_mct_decode_custom(
  372. OPJ_BYTE * pDecodingData,
  373. OPJ_SIZE_T n,
  374. OPJ_BYTE ** pData,
  375. OPJ_UINT32 pNbComp,
  376. OPJ_UINT32 isSigned)
  377. {
  378. OPJ_FLOAT32 * lMct;
  379. OPJ_SIZE_T i;
  380. OPJ_UINT32 j;
  381. OPJ_UINT32 k;
  382. OPJ_FLOAT32 * lCurrentData = 00;
  383. OPJ_FLOAT32 * lCurrentResult = 00;
  384. OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData;
  385. OPJ_ARG_NOT_USED(isSigned);
  386. lCurrentData = (OPJ_FLOAT32 *) opj_malloc(2 * pNbComp * sizeof(OPJ_FLOAT32));
  387. if (! lCurrentData) {
  388. return OPJ_FALSE;
  389. }
  390. lCurrentResult = lCurrentData + pNbComp;
  391. for (i = 0; i < n; ++i) {
  392. lMct = (OPJ_FLOAT32 *) pDecodingData;
  393. for (j = 0; j < pNbComp; ++j) {
  394. lCurrentData[j] = (OPJ_FLOAT32)(*(lData[j]));
  395. }
  396. for (j = 0; j < pNbComp; ++j) {
  397. lCurrentResult[j] = 0;
  398. for (k = 0; k < pNbComp; ++k) {
  399. lCurrentResult[j] += *(lMct++) * lCurrentData[k];
  400. }
  401. *(lData[j]++) = (OPJ_FLOAT32)(lCurrentResult[j]);
  402. }
  403. }
  404. opj_free(lCurrentData);
  405. return OPJ_TRUE;
  406. }
  407. void opj_calculate_norms(OPJ_FLOAT64 * pNorms,
  408. OPJ_UINT32 pNbComps,
  409. OPJ_FLOAT32 * pMatrix)
  410. {
  411. OPJ_UINT32 i, j, lIndex;
  412. OPJ_FLOAT32 lCurrentValue;
  413. OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms;
  414. OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix;
  415. for (i = 0; i < pNbComps; ++i) {
  416. lNorms[i] = 0;
  417. lIndex = i;
  418. for (j = 0; j < pNbComps; ++j) {
  419. lCurrentValue = lMatrix[lIndex];
  420. lIndex += pNbComps;
  421. lNorms[i] += (OPJ_FLOAT64) lCurrentValue * lCurrentValue;
  422. }
  423. lNorms[i] = sqrt(lNorms[i]);
  424. }
  425. }