jcdctmgr.c 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720
  1. /*
  2. * jcdctmgr.c
  3. *
  4. * This file was part of the Independent JPEG Group's software:
  5. * Copyright (C) 1994-1996, Thomas G. Lane.
  6. * libjpeg-turbo Modifications:
  7. * Copyright (C) 1999-2006, MIYASAKA Masaru.
  8. * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  9. * Copyright (C) 2011, 2014-2015, D. R. Commander.
  10. * For conditions of distribution and use, see the accompanying README.ijg
  11. * file.
  12. *
  13. * This file contains the forward-DCT management logic.
  14. * This code selects a particular DCT implementation to be used,
  15. * and it performs related housekeeping chores including coefficient
  16. * quantization.
  17. */
  18. #define JPEG_INTERNALS
  19. #include "jinclude.h"
  20. #include "jpeglib.h"
  21. #include "jdct.h" /* Private declarations for DCT subsystem */
  22. #include "jsimddct.h"
  23. /* Private subobject for this module */
  24. typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
  25. typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
  26. typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
  27. JDIMENSION start_col,
  28. DCTELEM *workspace);
  29. typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
  30. JDIMENSION start_col,
  31. FAST_FLOAT *workspace);
  32. typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
  33. DCTELEM *workspace);
  34. typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
  35. FAST_FLOAT *divisors,
  36. FAST_FLOAT *workspace);
  37. METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
  38. typedef struct {
  39. struct jpeg_forward_dct pub; /* public fields */
  40. /* Pointer to the DCT routine actually in use */
  41. forward_DCT_method_ptr dct;
  42. convsamp_method_ptr convsamp;
  43. quantize_method_ptr quantize;
  44. /* The actual post-DCT divisors --- not identical to the quant table
  45. * entries, because of scaling (especially for an unnormalized DCT).
  46. * Each table is given in normal array order.
  47. */
  48. DCTELEM *divisors[NUM_QUANT_TBLS];
  49. /* work area for FDCT subroutine */
  50. DCTELEM *workspace;
  51. #ifdef DCT_FLOAT_SUPPORTED
  52. /* Same as above for the floating-point case. */
  53. float_DCT_method_ptr float_dct;
  54. float_convsamp_method_ptr float_convsamp;
  55. float_quantize_method_ptr float_quantize;
  56. FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
  57. FAST_FLOAT *float_workspace;
  58. #endif
  59. } my_fdct_controller;
  60. typedef my_fdct_controller *my_fdct_ptr;
  61. #if BITS_IN_JSAMPLE == 8
  62. /*
  63. * Find the highest bit in an integer through binary search.
  64. */
  65. LOCAL(int)
  66. flss(UINT16 val)
  67. {
  68. int bit;
  69. bit = 16;
  70. if (!val)
  71. return 0;
  72. if (!(val & 0xff00)) {
  73. bit -= 8;
  74. val <<= 8;
  75. }
  76. if (!(val & 0xf000)) {
  77. bit -= 4;
  78. val <<= 4;
  79. }
  80. if (!(val & 0xc000)) {
  81. bit -= 2;
  82. val <<= 2;
  83. }
  84. if (!(val & 0x8000)) {
  85. bit -= 1;
  86. val <<= 1;
  87. }
  88. return bit;
  89. }
  90. /*
  91. * Compute values to do a division using reciprocal.
  92. *
  93. * This implementation is based on an algorithm described in
  94. * "How to optimize for the Pentium family of microprocessors"
  95. * (http://www.agner.org/assem/).
  96. * More information about the basic algorithm can be found in
  97. * the paper "Integer Division Using Reciprocals" by Robert Alverson.
  98. *
  99. * The basic idea is to replace x/d by x * d^-1. In order to store
  100. * d^-1 with enough precision we shift it left a few places. It turns
  101. * out that this algorithm gives just enough precision, and also fits
  102. * into DCTELEM:
  103. *
  104. * b = (the number of significant bits in divisor) - 1
  105. * r = (word size) + b
  106. * f = 2^r / divisor
  107. *
  108. * f will not be an integer for most cases, so we need to compensate
  109. * for the rounding error introduced:
  110. *
  111. * no fractional part:
  112. *
  113. * result = input >> r
  114. *
  115. * fractional part of f < 0.5:
  116. *
  117. * round f down to nearest integer
  118. * result = ((input + 1) * f) >> r
  119. *
  120. * fractional part of f > 0.5:
  121. *
  122. * round f up to nearest integer
  123. * result = (input * f) >> r
  124. *
  125. * This is the original algorithm that gives truncated results. But we
  126. * want properly rounded results, so we replace "input" with
  127. * "input + divisor/2".
  128. *
  129. * In order to allow SIMD implementations we also tweak the values to
  130. * allow the same calculation to be made at all times:
  131. *
  132. * dctbl[0] = f rounded to nearest integer
  133. * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
  134. * dctbl[2] = 1 << ((word size) * 2 - r)
  135. * dctbl[3] = r - (word size)
  136. *
  137. * dctbl[2] is for stupid instruction sets where the shift operation
  138. * isn't member wise (e.g. MMX).
  139. *
  140. * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
  141. * is that most SIMD implementations have a "multiply and store top
  142. * half" operation.
  143. *
  144. * Lastly, we store each of the values in their own table instead
  145. * of in a consecutive manner, yet again in order to allow SIMD
  146. * routines.
  147. */
  148. LOCAL(int)
  149. compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
  150. {
  151. UDCTELEM2 fq, fr;
  152. UDCTELEM c;
  153. int b, r;
  154. if (divisor == 1) {
  155. /* divisor == 1 means unquantized, so these reciprocal/correction/shift
  156. * values will cause the C quantization algorithm to act like the
  157. * identity function. Since only the C quantization algorithm is used in
  158. * these cases, the scale value is irrelevant.
  159. */
  160. dtbl[DCTSIZE2 * 0] = (DCTELEM)1; /* reciprocal */
  161. dtbl[DCTSIZE2 * 1] = (DCTELEM)0; /* correction */
  162. dtbl[DCTSIZE2 * 2] = (DCTELEM)1; /* scale */
  163. dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8); /* shift */
  164. return 0;
  165. }
  166. b = flss(divisor) - 1;
  167. r = sizeof(DCTELEM) * 8 + b;
  168. fq = ((UDCTELEM2)1 << r) / divisor;
  169. fr = ((UDCTELEM2)1 << r) % divisor;
  170. c = divisor / 2; /* for rounding */
  171. if (fr == 0) { /* divisor is power of two */
  172. /* fq will be one bit too large to fit in DCTELEM, so adjust */
  173. fq >>= 1;
  174. r--;
  175. } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
  176. c++;
  177. } else { /* fractional part is > 0.5 */
  178. fq++;
  179. }
  180. dtbl[DCTSIZE2 * 0] = (DCTELEM)fq; /* reciprocal */
  181. dtbl[DCTSIZE2 * 1] = (DCTELEM)c; /* correction + roundfactor */
  182. #ifdef WITH_SIMD
  183. dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
  184. #else
  185. dtbl[DCTSIZE2 * 2] = 1;
  186. #endif
  187. dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
  188. if (r <= 16) return 0;
  189. else return 1;
  190. }
  191. #endif
  192. /*
  193. * Initialize for a processing pass.
  194. * Verify that all referenced Q-tables are present, and set up
  195. * the divisor table for each one.
  196. * In the current implementation, DCT of all components is done during
  197. * the first pass, even if only some components will be output in the
  198. * first scan. Hence all components should be examined here.
  199. */
  200. METHODDEF(void)
  201. start_pass_fdctmgr(j_compress_ptr cinfo)
  202. {
  203. my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
  204. int ci, qtblno, i;
  205. jpeg_component_info *compptr;
  206. JQUANT_TBL *qtbl;
  207. DCTELEM *dtbl;
  208. for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
  209. ci++, compptr++) {
  210. qtblno = compptr->quant_tbl_no;
  211. /* Make sure specified quantization table is present */
  212. if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
  213. cinfo->quant_tbl_ptrs[qtblno] == NULL)
  214. ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
  215. qtbl = cinfo->quant_tbl_ptrs[qtblno];
  216. /* Compute divisors for this quant table */
  217. /* We may do this more than once for same table, but it's not a big deal */
  218. switch (cinfo->dct_method) {
  219. #ifdef DCT_ISLOW_SUPPORTED
  220. case JDCT_ISLOW:
  221. /* For LL&M IDCT method, divisors are equal to raw quantization
  222. * coefficients multiplied by 8 (to counteract scaling).
  223. */
  224. if (fdct->divisors[qtblno] == NULL) {
  225. fdct->divisors[qtblno] = (DCTELEM *)
  226. (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
  227. (DCTSIZE2 * 4) * sizeof(DCTELEM));
  228. }
  229. dtbl = fdct->divisors[qtblno];
  230. for (i = 0; i < DCTSIZE2; i++) {
  231. #if BITS_IN_JSAMPLE == 8
  232. if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
  233. fdct->quantize == jsimd_quantize)
  234. fdct->quantize = quantize;
  235. #else
  236. dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
  237. #endif
  238. }
  239. break;
  240. #endif
  241. #ifdef DCT_IFAST_SUPPORTED
  242. case JDCT_IFAST:
  243. {
  244. /* For AA&N IDCT method, divisors are equal to quantization
  245. * coefficients scaled by scalefactor[row]*scalefactor[col], where
  246. * scalefactor[0] = 1
  247. * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
  248. * We apply a further scale factor of 8.
  249. */
  250. #define CONST_BITS 14
  251. static const INT16 aanscales[DCTSIZE2] = {
  252. /* precomputed values scaled up by 14 bits */
  253. 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
  254. 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
  255. 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
  256. 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
  257. 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
  258. 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
  259. 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
  260. 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
  261. };
  262. SHIFT_TEMPS
  263. if (fdct->divisors[qtblno] == NULL) {
  264. fdct->divisors[qtblno] = (DCTELEM *)
  265. (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
  266. (DCTSIZE2 * 4) * sizeof(DCTELEM));
  267. }
  268. dtbl = fdct->divisors[qtblno];
  269. for (i = 0; i < DCTSIZE2; i++) {
  270. #if BITS_IN_JSAMPLE == 8
  271. if (!compute_reciprocal(
  272. DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
  273. (JLONG)aanscales[i]),
  274. CONST_BITS - 3), &dtbl[i]) &&
  275. fdct->quantize == jsimd_quantize)
  276. fdct->quantize = quantize;
  277. #else
  278. dtbl[i] = (DCTELEM)
  279. DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
  280. (JLONG)aanscales[i]),
  281. CONST_BITS - 3);
  282. #endif
  283. }
  284. }
  285. break;
  286. #endif
  287. #ifdef DCT_FLOAT_SUPPORTED
  288. case JDCT_FLOAT:
  289. {
  290. /* For float AA&N IDCT method, divisors are equal to quantization
  291. * coefficients scaled by scalefactor[row]*scalefactor[col], where
  292. * scalefactor[0] = 1
  293. * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
  294. * We apply a further scale factor of 8.
  295. * What's actually stored is 1/divisor so that the inner loop can
  296. * use a multiplication rather than a division.
  297. */
  298. FAST_FLOAT *fdtbl;
  299. int row, col;
  300. static const double aanscalefactor[DCTSIZE] = {
  301. 1.0, 1.387039845, 1.306562965, 1.175875602,
  302. 1.0, 0.785694958, 0.541196100, 0.275899379
  303. };
  304. if (fdct->float_divisors[qtblno] == NULL) {
  305. fdct->float_divisors[qtblno] = (FAST_FLOAT *)
  306. (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
  307. DCTSIZE2 * sizeof(FAST_FLOAT));
  308. }
  309. fdtbl = fdct->float_divisors[qtblno];
  310. i = 0;
  311. for (row = 0; row < DCTSIZE; row++) {
  312. for (col = 0; col < DCTSIZE; col++) {
  313. fdtbl[i] = (FAST_FLOAT)
  314. (1.0 / (((double)qtbl->quantval[i] *
  315. aanscalefactor[row] * aanscalefactor[col] * 8.0)));
  316. i++;
  317. }
  318. }
  319. }
  320. break;
  321. #endif
  322. default:
  323. ERREXIT(cinfo, JERR_NOT_COMPILED);
  324. break;
  325. }
  326. }
  327. }
  328. /*
  329. * Load data into workspace, applying unsigned->signed conversion.
  330. */
  331. METHODDEF(void)
  332. convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
  333. {
  334. register DCTELEM *workspaceptr;
  335. register JSAMPROW elemptr;
  336. register int elemr;
  337. workspaceptr = workspace;
  338. for (elemr = 0; elemr < DCTSIZE; elemr++) {
  339. elemptr = sample_data[elemr] + start_col;
  340. #if DCTSIZE == 8 /* unroll the inner loop */
  341. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  342. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  343. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  344. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  345. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  346. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  347. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  348. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  349. #else
  350. {
  351. register int elemc;
  352. for (elemc = DCTSIZE; elemc > 0; elemc--)
  353. *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
  354. }
  355. #endif
  356. }
  357. }
  358. /*
  359. * Quantize/descale the coefficients, and store into coef_blocks[].
  360. */
  361. METHODDEF(void)
  362. quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
  363. {
  364. int i;
  365. DCTELEM temp;
  366. JCOEFPTR output_ptr = coef_block;
  367. #if BITS_IN_JSAMPLE == 8
  368. UDCTELEM recip, corr;
  369. int shift;
  370. UDCTELEM2 product;
  371. for (i = 0; i < DCTSIZE2; i++) {
  372. temp = workspace[i];
  373. recip = divisors[i + DCTSIZE2 * 0];
  374. corr = divisors[i + DCTSIZE2 * 1];
  375. shift = divisors[i + DCTSIZE2 * 3];
  376. if (temp < 0) {
  377. temp = -temp;
  378. product = (UDCTELEM2)(temp + corr) * recip;
  379. product >>= shift + sizeof(DCTELEM) * 8;
  380. temp = (DCTELEM)product;
  381. temp = -temp;
  382. } else {
  383. product = (UDCTELEM2)(temp + corr) * recip;
  384. product >>= shift + sizeof(DCTELEM) * 8;
  385. temp = (DCTELEM)product;
  386. }
  387. output_ptr[i] = (JCOEF)temp;
  388. }
  389. #else
  390. register DCTELEM qval;
  391. for (i = 0; i < DCTSIZE2; i++) {
  392. qval = divisors[i];
  393. temp = workspace[i];
  394. /* Divide the coefficient value by qval, ensuring proper rounding.
  395. * Since C does not specify the direction of rounding for negative
  396. * quotients, we have to force the dividend positive for portability.
  397. *
  398. * In most files, at least half of the output values will be zero
  399. * (at default quantization settings, more like three-quarters...)
  400. * so we should ensure that this case is fast. On many machines,
  401. * a comparison is enough cheaper than a divide to make a special test
  402. * a win. Since both inputs will be nonnegative, we need only test
  403. * for a < b to discover whether a/b is 0.
  404. * If your machine's division is fast enough, define FAST_DIVIDE.
  405. */
  406. #ifdef FAST_DIVIDE
  407. #define DIVIDE_BY(a, b) a /= b
  408. #else
  409. #define DIVIDE_BY(a, b) if (a >= b) a /= b; else a = 0
  410. #endif
  411. if (temp < 0) {
  412. temp = -temp;
  413. temp += qval >> 1; /* for rounding */
  414. DIVIDE_BY(temp, qval);
  415. temp = -temp;
  416. } else {
  417. temp += qval >> 1; /* for rounding */
  418. DIVIDE_BY(temp, qval);
  419. }
  420. output_ptr[i] = (JCOEF)temp;
  421. }
  422. #endif
  423. }
  424. /*
  425. * Perform forward DCT on one or more blocks of a component.
  426. *
  427. * The input samples are taken from the sample_data[] array starting at
  428. * position start_row/start_col, and moving to the right for any additional
  429. * blocks. The quantized coefficients are returned in coef_blocks[].
  430. */
  431. METHODDEF(void)
  432. forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
  433. JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
  434. JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
  435. /* This version is used for integer DCT implementations. */
  436. {
  437. /* This routine is heavily used, so it's worth coding it tightly. */
  438. my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
  439. DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
  440. DCTELEM *workspace;
  441. JDIMENSION bi;
  442. /* Make sure the compiler doesn't look up these every pass */
  443. forward_DCT_method_ptr do_dct = fdct->dct;
  444. convsamp_method_ptr do_convsamp = fdct->convsamp;
  445. quantize_method_ptr do_quantize = fdct->quantize;
  446. workspace = fdct->workspace;
  447. sample_data += start_row; /* fold in the vertical offset once */
  448. for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
  449. /* Load data into workspace, applying unsigned->signed conversion */
  450. (*do_convsamp) (sample_data, start_col, workspace);
  451. /* Perform the DCT */
  452. (*do_dct) (workspace);
  453. /* Quantize/descale the coefficients, and store into coef_blocks[] */
  454. (*do_quantize) (coef_blocks[bi], divisors, workspace);
  455. }
  456. }
  457. #ifdef DCT_FLOAT_SUPPORTED
  458. METHODDEF(void)
  459. convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
  460. FAST_FLOAT *workspace)
  461. {
  462. register FAST_FLOAT *workspaceptr;
  463. register JSAMPROW elemptr;
  464. register int elemr;
  465. workspaceptr = workspace;
  466. for (elemr = 0; elemr < DCTSIZE; elemr++) {
  467. elemptr = sample_data[elemr] + start_col;
  468. #if DCTSIZE == 8 /* unroll the inner loop */
  469. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  470. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  471. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  472. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  473. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  474. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  475. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  476. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  477. #else
  478. {
  479. register int elemc;
  480. for (elemc = DCTSIZE; elemc > 0; elemc--)
  481. *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
  482. }
  483. #endif
  484. }
  485. }
  486. METHODDEF(void)
  487. quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
  488. FAST_FLOAT *workspace)
  489. {
  490. register FAST_FLOAT temp;
  491. register int i;
  492. register JCOEFPTR output_ptr = coef_block;
  493. for (i = 0; i < DCTSIZE2; i++) {
  494. /* Apply the quantization and scaling factor */
  495. temp = workspace[i] * divisors[i];
  496. /* Round to nearest integer.
  497. * Since C does not specify the direction of rounding for negative
  498. * quotients, we have to force the dividend positive for portability.
  499. * The maximum coefficient size is +-16K (for 12-bit data), so this
  500. * code should work for either 16-bit or 32-bit ints.
  501. */
  502. output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384);
  503. }
  504. }
  505. METHODDEF(void)
  506. forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
  507. JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
  508. JDIMENSION start_row, JDIMENSION start_col,
  509. JDIMENSION num_blocks)
  510. /* This version is used for floating-point DCT implementations. */
  511. {
  512. /* This routine is heavily used, so it's worth coding it tightly. */
  513. my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
  514. FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
  515. FAST_FLOAT *workspace;
  516. JDIMENSION bi;
  517. /* Make sure the compiler doesn't look up these every pass */
  518. float_DCT_method_ptr do_dct = fdct->float_dct;
  519. float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
  520. float_quantize_method_ptr do_quantize = fdct->float_quantize;
  521. workspace = fdct->float_workspace;
  522. sample_data += start_row; /* fold in the vertical offset once */
  523. for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
  524. /* Load data into workspace, applying unsigned->signed conversion */
  525. (*do_convsamp) (sample_data, start_col, workspace);
  526. /* Perform the DCT */
  527. (*do_dct) (workspace);
  528. /* Quantize/descale the coefficients, and store into coef_blocks[] */
  529. (*do_quantize) (coef_blocks[bi], divisors, workspace);
  530. }
  531. }
  532. #endif /* DCT_FLOAT_SUPPORTED */
  533. /*
  534. * Initialize FDCT manager.
  535. */
  536. GLOBAL(void)
  537. jinit_forward_dct(j_compress_ptr cinfo)
  538. {
  539. my_fdct_ptr fdct;
  540. int i;
  541. fdct = (my_fdct_ptr)
  542. (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
  543. sizeof(my_fdct_controller));
  544. cinfo->fdct = (struct jpeg_forward_dct *)fdct;
  545. fdct->pub.start_pass = start_pass_fdctmgr;
  546. /* First determine the DCT... */
  547. switch (cinfo->dct_method) {
  548. #ifdef DCT_ISLOW_SUPPORTED
  549. case JDCT_ISLOW:
  550. fdct->pub.forward_DCT = forward_DCT;
  551. if (jsimd_can_fdct_islow())
  552. fdct->dct = jsimd_fdct_islow;
  553. else
  554. fdct->dct = jpeg_fdct_islow;
  555. break;
  556. #endif
  557. #ifdef DCT_IFAST_SUPPORTED
  558. case JDCT_IFAST:
  559. fdct->pub.forward_DCT = forward_DCT;
  560. if (jsimd_can_fdct_ifast())
  561. fdct->dct = jsimd_fdct_ifast;
  562. else
  563. fdct->dct = jpeg_fdct_ifast;
  564. break;
  565. #endif
  566. #ifdef DCT_FLOAT_SUPPORTED
  567. case JDCT_FLOAT:
  568. fdct->pub.forward_DCT = forward_DCT_float;
  569. if (jsimd_can_fdct_float())
  570. fdct->float_dct = jsimd_fdct_float;
  571. else
  572. fdct->float_dct = jpeg_fdct_float;
  573. break;
  574. #endif
  575. default:
  576. ERREXIT(cinfo, JERR_NOT_COMPILED);
  577. break;
  578. }
  579. /* ...then the supporting stages. */
  580. switch (cinfo->dct_method) {
  581. #ifdef DCT_ISLOW_SUPPORTED
  582. case JDCT_ISLOW:
  583. #endif
  584. #ifdef DCT_IFAST_SUPPORTED
  585. case JDCT_IFAST:
  586. #endif
  587. #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
  588. if (jsimd_can_convsamp())
  589. fdct->convsamp = jsimd_convsamp;
  590. else
  591. fdct->convsamp = convsamp;
  592. if (jsimd_can_quantize())
  593. fdct->quantize = jsimd_quantize;
  594. else
  595. fdct->quantize = quantize;
  596. break;
  597. #endif
  598. #ifdef DCT_FLOAT_SUPPORTED
  599. case JDCT_FLOAT:
  600. if (jsimd_can_convsamp_float())
  601. fdct->float_convsamp = jsimd_convsamp_float;
  602. else
  603. fdct->float_convsamp = convsamp_float;
  604. if (jsimd_can_quantize_float())
  605. fdct->float_quantize = jsimd_quantize_float;
  606. else
  607. fdct->float_quantize = quantize_float;
  608. break;
  609. #endif
  610. default:
  611. ERREXIT(cinfo, JERR_NOT_COMPILED);
  612. break;
  613. }
  614. /* Allocate workspace memory */
  615. #ifdef DCT_FLOAT_SUPPORTED
  616. if (cinfo->dct_method == JDCT_FLOAT)
  617. fdct->float_workspace = (FAST_FLOAT *)
  618. (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
  619. sizeof(FAST_FLOAT) * DCTSIZE2);
  620. else
  621. #endif
  622. fdct->workspace = (DCTELEM *)
  623. (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
  624. sizeof(DCTELEM) * DCTSIZE2);
  625. /* Mark divisor tables unallocated */
  626. for (i = 0; i < NUM_QUANT_TBLS; i++) {
  627. fdct->divisors[i] = NULL;
  628. #ifdef DCT_FLOAT_SUPPORTED
  629. fdct->float_divisors[i] = NULL;
  630. #endif
  631. }
  632. }