- /*
- * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
- *
- * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
- #define JPEG_INTERNALS
- #include "jconfigint.h"
- #include "../../jinclude.h"
- #include "../../jpeglib.h"
- #include "../../jsimd.h"
- #include "../../jdct.h"
- #include "../../jsimddct.h"
- #include "../jsimd.h"
- #include "neon-compat.h"
- #include <arm_neon.h>
- /* Data preparation for encode_mcu_AC_first().
- *
- * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
- * found in jcphuff.c.
- */
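- /* On return, values[0..DCTSIZE2-1] hold the point-transformed absolute
- * values, values[DCTSIZE2..2*DCTSIZE2-1] hold the corresponding diff values,
- * and *zerobits holds a bitmap in which a set bit marks a nonzero transformed
- * coefficient.
- */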
- void jsimd_encode_mcu_AC_first_prepare_neon
- (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *values, size_t *zerobits)
- {
- JCOEF *values_ptr = values;
- JCOEF *diff_values_ptr = values + DCTSIZE2;
- /* Rows of coefficients to zero (since they haven't been processed) */
- int i, rows_to_zero = 8;
- for (i = 0; i < Sl / 16; i++) {
- int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
- int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
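- /* The natural-order (zigzag) indices are not contiguous within the block,
- * so the coefficients are gathered one lane at a time.
- */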
- /* Isolate sign of coefficients. */
- int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
- int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
- /* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
- /* Compute diff values. */
- int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
- int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
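- /* For non-negative coefficients, sign_coefs is 0 and diff equals the
- * transformed value.  For negative coefficients, sign_coefs is all ones and
- * diff is the one's complement of the transformed magnitude, whose low-order
- * bits are the additional bits emitted for a negative coefficient.
- */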
- /* Store transformed coefficients and diff values. */
- vst1q_s16(values_ptr, coefs1);
- vst1q_s16(values_ptr + DCTSIZE, coefs2);
- vst1q_s16(diff_values_ptr, diff1);
- vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
- values_ptr += 16;
- diff_values_ptr += 16;
- jpeg_natural_order_start += 16;
- rows_to_zero -= 2;
- }
- /* Same operation but for remaining partial vector */
- int remaining_coefs = Sl % 16;
- if (remaining_coefs > 8) {
- int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
- int16x8_t coefs2 = vdupq_n_s16(0);
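- /* Load only the lanes covered by Sl; the remaining lanes keep the zero
- * value set above, so they store zeros and contribute no set bits to the
- * zerobits bitmap.
- */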
- switch (remaining_coefs) {
- case 15:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
- FALLTHROUGH /*FALLTHROUGH*/
- case 14:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
- FALLTHROUGH /*FALLTHROUGH*/
- case 13:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
- FALLTHROUGH /*FALLTHROUGH*/
- case 12:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
- FALLTHROUGH /*FALLTHROUGH*/
- case 11:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
- FALLTHROUGH /*FALLTHROUGH*/
- case 10:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
- FALLTHROUGH /*FALLTHROUGH*/
- case 9:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
- FALLTHROUGH /*FALLTHROUGH*/
- default:
- break;
- }
- /* Isolate sign of coefficients. */
- int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
- int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
- /* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
- /* Compute diff values. */
- int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
- int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
- /* Store transformed coefficients and diff values. */
- vst1q_s16(values_ptr, coefs1);
- vst1q_s16(values_ptr + DCTSIZE, coefs2);
- vst1q_s16(diff_values_ptr, diff1);
- vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
- values_ptr += 16;
- diff_values_ptr += 16;
- rows_to_zero -= 2;
- } else if (remaining_coefs > 0) {
- int16x8_t coefs = vdupq_n_s16(0);
- switch (remaining_coefs) {
- case 8:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
- FALLTHROUGH /*FALLTHROUGH*/
- case 7:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
- FALLTHROUGH /*FALLTHROUGH*/
- case 6:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
- FALLTHROUGH /*FALLTHROUGH*/
- case 5:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
- FALLTHROUGH /*FALLTHROUGH*/
- case 4:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
- FALLTHROUGH /*FALLTHROUGH*/
- case 3:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
- FALLTHROUGH /*FALLTHROUGH*/
- case 2:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
- FALLTHROUGH /*FALLTHROUGH*/
- case 1:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
- FALLTHROUGH /*FALLTHROUGH*/
- default:
- break;
- }
- /* Isolate sign of coefficients. */
- int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
- /* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs = vabsq_s16(coefs);
- coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
- /* Compute diff values. */
- int16x8_t diff = veorq_s16(coefs, sign_coefs);
- /* Store transformed coefficients and diff values. */
- vst1q_s16(values_ptr, coefs);
- vst1q_s16(diff_values_ptr, diff);
- values_ptr += 8;
- diff_values_ptr += 8;
- rows_to_zero--;
- }
- /* Zero remaining memory in the values and diff_values blocks.  (The bitmap
- * construction below reads all eight rows of values, so rows not covered by
- * Sl must not contain stale data.)
- */
- for (i = 0; i < rows_to_zero; i++) {
- vst1q_s16(values_ptr, vdupq_n_s16(0));
- vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
- values_ptr += 8;
- diff_values_ptr += 8;
- }
- /* Construct zerobits bitmap. A set bit means that the corresponding
- * coefficient != 0.
- */
- int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
- int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
- int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
- int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
- int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
- int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
- int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
- int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
- uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
- uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
- uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
- uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
- uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
- uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
- uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
- uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
- /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
- const uint8x8_t bitmap_mask =
- vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
- row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
- row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
- row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
- row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
- row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
- row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
- row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
- row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
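- /* Each masked byte now carries at most one distinct bit, so pairwise
- * additions behave like bitwise ORs.  Three rounds of vpadd_u8 leave byte k
- * of bitmap_all holding the zero flags for values[8*k..8*k+7]; reinterpreted
- * as a 64-bit integer, bit i is set when values[i] == 0, and the complement
- * stored below marks the nonzero coefficients.
- */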
- uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
- uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
- uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
- uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
- uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
- uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
- uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
- #if defined(__aarch64__) || defined(_M_ARM64)
- /* Move bitmap to a 64-bit scalar register. */
- uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
- /* Store zerobits bitmap. */
- *zerobits = ~bitmap;
- #else
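- /* size_t is 32 bits wide here, so the 64-bit bitmap is split across two
- * elements: zerobits[0] covers coefficients 0-31 and zerobits[1] covers
- * coefficients 32-63.
- */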
- /* Move bitmap to two 32-bit scalar registers. */
- uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
- uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
- /* Store zerobits bitmap. */
- zerobits[0] = ~bitmap0;
- zerobits[1] = ~bitmap1;
- #endif
- }
- /* Data preparation for encode_mcu_AC_refine().
- *
- * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
- * found in jcphuff.c.
- */
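- /* On return, bits[0] holds the zerobits bitmap and bits[1] the signbits
- * bitmap (on 32-bit builds each bitmap occupies two consecutive elements:
- * bits[0..1] and bits[2..3]); the return value is the EOB position.
- */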
- int jsimd_encode_mcu_AC_refine_prepare_neon
- (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *absvalues, size_t *bits)
- {
- /* Temporary storage buffers for data used to compute the signbits bitmap and
- * the end-of-block (EOB) position
- */
- uint8_t coef_sign_bits[64];
- uint8_t coef_eq1_bits[64];
- JCOEF *absvalues_ptr = absvalues;
- uint8_t *coef_sign_bits_ptr = coef_sign_bits;
- uint8_t *eq1_bits_ptr = coef_eq1_bits;
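- /* Sign and "== 1" comparison results are staged per coefficient in the byte
- * arrays above while the coefficients are gathered; once all 64 slots have
- * been filled (or zeroed), they are reduced to the signbits bitmap and the
- * EOB bitmap below.
- */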
- /* Rows of coefficients to zero (since they haven't been processed) */
- int i, rows_to_zero = 8;
- for (i = 0; i < Sl / 16; i++) {
- int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
- int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
- /* Compute and store data for signbits bitmap. */
- uint8x8_t sign_coefs1 =
- vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
- uint8x8_t sign_coefs2 =
- vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
- vst1_u8(coef_sign_bits_ptr, sign_coefs1);
- vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
- /* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
- vst1q_s16(absvalues_ptr, coefs1);
- vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
- /* Test whether transformed coefficient values == 1 (used to find EOB
- * position.)
- */
- uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
- uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
- vst1_u8(eq1_bits_ptr, coefs_eq11);
- vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
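- /* The "== 1" flags mark coefficients whose point-transformed absolute value
- * is exactly 1; the position of the last such coefficient (the EOB position)
- * is recovered from these flags after the loop.
- */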
- absvalues_ptr += 16;
- coef_sign_bits_ptr += 16;
- eq1_bits_ptr += 16;
- jpeg_natural_order_start += 16;
- rows_to_zero -= 2;
- }
- /* Same operation but for remaining partial vector */
- int remaining_coefs = Sl % 16;
- if (remaining_coefs > 8) {
- int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
- coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
- int16x8_t coefs2 = vdupq_n_s16(0);
- switch (remaining_coefs) {
- case 15:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
- FALLTHROUGH /*FALLTHROUGH*/
- case 14:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
- FALLTHROUGH /*FALLTHROUGH*/
- case 13:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
- FALLTHROUGH /*FALLTHROUGH*/
- case 12:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
- FALLTHROUGH /*FALLTHROUGH*/
- case 11:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
- FALLTHROUGH /*FALLTHROUGH*/
- case 10:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
- FALLTHROUGH /*FALLTHROUGH*/
- case 9:
- coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
- FALLTHROUGH /*FALLTHROUGH*/
- default:
- break;
- }
- /* Compute and store data for signbits bitmap. */
- uint8x8_t sign_coefs1 =
- vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
- uint8x8_t sign_coefs2 =
- vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
- vst1_u8(coef_sign_bits_ptr, sign_coefs1);
- vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
- /* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
- vst1q_s16(absvalues_ptr, coefs1);
- vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
- /* Test whether transformed coefficient values == 1 (used to find EOB
- * position.)
- */
- uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
- uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
- vst1_u8(eq1_bits_ptr, coefs_eq11);
- vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
- absvalues_ptr += 16;
- coef_sign_bits_ptr += 16;
- eq1_bits_ptr += 16;
- jpeg_natural_order_start += 16;
- rows_to_zero -= 2;
- } else if (remaining_coefs > 0) {
- int16x8_t coefs = vdupq_n_s16(0);
- switch (remaining_coefs) {
- case 8:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
- FALLTHROUGH /*FALLTHROUGH*/
- case 7:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
- FALLTHROUGH /*FALLTHROUGH*/
- case 6:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
- FALLTHROUGH /*FALLTHROUGH*/
- case 5:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
- FALLTHROUGH /*FALLTHROUGH*/
- case 4:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
- FALLTHROUGH /*FALLTHROUGH*/
- case 3:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
- FALLTHROUGH /*FALLTHROUGH*/
- case 2:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
- FALLTHROUGH /*FALLTHROUGH*/
- case 1:
- coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
- FALLTHROUGH /*FALLTHROUGH*/
- default:
- break;
- }
- /* Compute and store data for signbits bitmap. */
- uint8x8_t sign_coefs =
- vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
- vst1_u8(coef_sign_bits_ptr, sign_coefs);
- /* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs = vabsq_s16(coefs);
- coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
- vst1q_s16(absvalues_ptr, coefs);
- /* Test whether transformed coefficient values == 1 (used to find EOB
- * position.)
- */
- uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
- vst1_u8(eq1_bits_ptr, coefs_eq1);
- absvalues_ptr += 8;
- coef_sign_bits_ptr += 8;
- eq1_bits_ptr += 8;
- rows_to_zero--;
- }
- /* Zero remaining memory in blocks.  (The bitmap reductions below read all
- * 64 entries of absvalues, coef_sign_bits, and coef_eq1_bits.)
- */
- for (i = 0; i < rows_to_zero; i++) {
- vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
- vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
- vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
- absvalues_ptr += 8;
- coef_sign_bits_ptr += 8;
- eq1_bits_ptr += 8;
- }
- /* Construct zerobits bitmap. */
- int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
- int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
- int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
- int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
- int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
- int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
- int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
- int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
- uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
- uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
- uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
- uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
- uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
- uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
- uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
- uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
- /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
- const uint8x8_t bitmap_mask =
- vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
- abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
- abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
- abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
- abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
- abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
- abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
- abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
- abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
- uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
- uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
- uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
- uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
- uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
- uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
- uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
- #if defined(__aarch64__) || defined(_M_ARM64)
- /* Move bitmap to a 64-bit scalar register. */
- uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
- /* Store zerobits bitmap. */
- bits[0] = ~bitmap;
- #else
- /* Move bitmap to two 32-bit scalar registers. */
- uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
- uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
- /* Store zerobits bitmap. */
- bits[0] = ~bitmap0;
- bits[1] = ~bitmap1;
- #endif
- /* Construct signbits bitmap. */
- uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
- uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
- uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
- uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
- uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
- uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
- uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
- uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
- signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
- signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
- signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
- signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
- signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
- signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
- signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
- signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
- bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
- bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
- bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
- bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
- bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
- bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
- bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
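- /* The sign flags are reduced with the same mask-and-pairwise-add scheme as
- * zerobits; the word stored below is the complement of the per-coefficient
- * sign flags.
- */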
- #if defined(__aarch64__) || defined(_M_ARM64)
- /* Move bitmap to a 64-bit scalar register. */
- bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
- /* Store signbits bitmap. */
- bits[1] = ~bitmap;
- #else
- /* Move bitmap to two 32-bit scalar registers. */
- bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
- bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
- /* Store signbits bitmap. */
- bits[2] = ~bitmap0;
- bits[3] = ~bitmap1;
- #endif
- /* Construct bitmap to find EOB position (the index of the last coefficient
- * equal to 1.)
- */
- uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
- uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
- uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
- uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
- uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
- uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
- uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
- uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
- row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
- row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
- row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
- row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
- row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
- row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
- row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
- row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
- bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
- bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
- bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
- bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
- bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
- bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
- bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
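- /* 63 - CLZ(bitmap) is the index of the most significant set bit, i.e. the
- * index of the last coefficient whose transformed absolute value is 1.
- */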
- #if defined(__aarch64__) || defined(_M_ARM64)
- /* Move bitmap to a 64-bit scalar register. */
- bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
- /* Return EOB position. */
- if (bitmap == 0) {
- /* EOB position is defined to be 0 if all coefficients != 1. */
- return 0;
- } else {
- return 63 - BUILTIN_CLZLL(bitmap);
- }
- #else
- /* Move bitmap to two 32-bit scalar registers. */
- bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
- bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
- /* Return EOB position. */
- if (bitmap0 == 0 && bitmap1 == 0) {
- return 0;
- } else if (bitmap1 != 0) {
- return 63 - BUILTIN_CLZ(bitmap1);
- } else {
- return 31 - BUILTIN_CLZ(bitmap0);
- }
- #endif
- }