palette_neon_intrinsics.c 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  /* palette_neon_intrinsics.c - NEON optimised palette expansion functions
   *
   * Copyright (c) 2018-2019 Cosmin Truta
   * Copyright (c) 2017-2018 Arm Holdings. All rights reserved.
   * Written by Richard Townsend <Richard.Townsend@arm.com>, February 2017.
   *
   * This code is released under the libpng license.
   * For conditions of distribution and use, see the disclaimer
   * and license in png.h
   */
  11. #include "../pngpriv.h"
  12. #if PNG_ARM_NEON_IMPLEMENTATION == 1
  13. #if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
  14. # include <arm64_neon.h>
  15. #else
  16. # include <arm_neon.h>
  17. #endif
  18. /* Build an RGBA8 palette from the separate RGB and alpha palettes. */
  19. void
  20. png_riffle_palette_neon(png_structrp png_ptr)
  21. {
  22. png_const_colorp palette = png_ptr->palette;
  23. png_bytep riffled_palette = png_ptr->riffled_palette;
  24. png_const_bytep trans_alpha = png_ptr->trans_alpha;
  25. int num_trans = png_ptr->num_trans;
  26. int i;
  27. /* Initially black, opaque. */
  28. uint8x16x4_t w = {{
  29. vdupq_n_u8(0x00),
  30. vdupq_n_u8(0x00),
  31. vdupq_n_u8(0x00),
  32. vdupq_n_u8(0xff),
  33. }};
  34. png_debug(1, "in png_riffle_palette_neon");
  35. /* First, riffle the RGB colours into an RGBA8 palette.
  36. * The alpha component is set to opaque for now.
  37. */
  38. for (i = 0; i < 256; i += 16)
  39. {
  40. uint8x16x3_t v = vld3q_u8((png_const_bytep)(palette + i));
  41. w.val[0] = v.val[0];
  42. w.val[1] = v.val[1];
  43. w.val[2] = v.val[2];
  44. vst4q_u8(riffled_palette + (i << 2), w);
  45. }
  46. /* Fix up the missing transparency values. */
  47. for (i = 0; i < num_trans; i++)
  48. riffled_palette[(i << 2) + 3] = trans_alpha[i];
  49. }
  50. /* Expands a palettized row into RGBA8. */
  51. int
  52. png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
  53. png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
  54. {
  55. png_uint_32 row_width = row_info->width;
  56. const png_uint_32 *riffled_palette =
  57. (const png_uint_32 *)png_ptr->riffled_palette;
  58. const png_uint_32 pixels_per_chunk = 4;
  59. png_uint_32 i;
  60. png_debug(1, "in png_do_expand_palette_rgba8_neon");
  61. PNG_UNUSED(row)
  62. if (row_width < pixels_per_chunk)
  63. return 0;
  64. /* This function originally gets the last byte of the output row.
  65. * The NEON part writes forward from a given position, so we have
  66. * to seek this back by 4 pixels x 4 bytes.
  67. */
  68. *ddp = *ddp - ((pixels_per_chunk * sizeof(png_uint_32)) - 1);
  69. for (i = 0; i < row_width; i += pixels_per_chunk)
  70. {
  71. uint32x4_t cur;
  72. png_bytep sp = *ssp - i, dp = *ddp - (i << 2);
  73. cur = vld1q_dup_u32 (riffled_palette + *(sp - 3));
  74. cur = vld1q_lane_u32(riffled_palette + *(sp - 2), cur, 1);
  75. cur = vld1q_lane_u32(riffled_palette + *(sp - 1), cur, 2);
  76. cur = vld1q_lane_u32(riffled_palette + *(sp - 0), cur, 3);
  77. vst1q_u32((void *)dp, cur);
  78. }
  79. if (i != row_width)
  80. {
  81. /* Remove the amount that wasn't processed. */
  82. i -= pixels_per_chunk;
  83. }
  84. /* Decrement output pointers. */
  85. *ssp = *ssp - i;
  86. *ddp = *ddp - (i << 2);
  87. return i;
  88. }
  89. /* Expands a palettized row into RGB8. */
  90. int
  91. png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info,
  92. png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
  93. {
  94. png_uint_32 row_width = row_info->width;
  95. png_const_bytep palette = (png_const_bytep)png_ptr->palette;
  96. const png_uint_32 pixels_per_chunk = 8;
  97. png_uint_32 i;
  98. png_debug(1, "in png_do_expand_palette_rgb8_neon");
  99. PNG_UNUSED(row)
  100. if (row_width <= pixels_per_chunk)
  101. return 0;
  102. /* Seeking this back by 8 pixels x 3 bytes. */
  103. *ddp = *ddp - ((pixels_per_chunk * sizeof(png_color)) - 1);
  104. for (i = 0; i < row_width; i += pixels_per_chunk)
  105. {
  106. uint8x8x3_t cur;
  107. png_bytep sp = *ssp - i, dp = *ddp - ((i << 1) + i);
  108. cur = vld3_dup_u8(palette + sizeof(png_color) * (*(sp - 7)));
  109. cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 6)), cur, 1);
  110. cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 5)), cur, 2);
  111. cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 4)), cur, 3);
  112. cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 3)), cur, 4);
  113. cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 2)), cur, 5);
  114. cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 1)), cur, 6);
  115. cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 0)), cur, 7);
  116. vst3_u8((void *)dp, cur);
  117. }
  118. if (i != row_width)
  119. {
  120. /* Remove the amount that wasn't processed. */
  121. i -= pixels_per_chunk;
  122. }
  123. /* Decrement output pointers. */
  124. *ssp = *ssp - i;
  125. *ddp = *ddp - ((i << 1) + i);
  126. return i;
  127. }
  128. #endif /* PNG_ARM_NEON_IMPLEMENTATION */