arm64.c 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. ///////////////////////////////////////////////////////////////////////////////
  2. //
  3. /// \file arm64.c
  4. /// \brief Filter for ARM64 binaries
  5. ///
  6. /// This converts ARM64 relative addresses in the BL and ADRP immediates
  7. /// to absolute values to increase redundancy of ARM64 code.
  8. ///
  9. /// Converting B or ADR instructions was also tested but it's not useful.
  10. /// A majority of the jumps for the B instruction are very small (+/- 0xFF).
  11. /// These are typical for loops and if-statements. Encoding them to their
  12. /// absolute address reduces redundancy since many of the small relative
  13. /// jump values are repeated, but very few of the absolute addresses are.
  14. //
  15. // Authors: Lasse Collin
  16. // Jia Tan
  17. // Igor Pavlov
  18. //
  19. // This file has been put into the public domain.
  20. // You can do whatever you want with this file.
  21. //
  22. ///////////////////////////////////////////////////////////////////////////////
  23. #include "simple_private.h"
  24. static size_t
  25. arm64_code(void *simple lzma_attribute((__unused__)),
  26. uint32_t now_pos, bool is_encoder,
  27. uint8_t *buffer, size_t size)
  28. {
  29. size_t i;
  30. // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
  31. // with auto-vectorization that is enabled by default with -O2.
  32. // Such vectorization bloat happens with -O2 when targeting ARM64 too
  33. // but performance hasn't been tested.
  34. #ifdef __clang__
  35. # pragma clang loop vectorize(disable)
  36. #endif
  37. for (i = 0; i + 4 <= size; i += 4) {
  38. uint32_t pc = (uint32_t)(now_pos + i);
  39. uint32_t instr = read32le(buffer + i);
  40. if ((instr >> 26) == 0x25) {
  41. // BL instruction:
  42. // The full 26-bit immediate is converted.
  43. // The range is +/-128 MiB.
  44. //
  45. // Using the full range is helps quite a lot with
  46. // big executables. Smaller range would reduce false
  47. // positives in non-code sections of the input though
  48. // so this is a compromise that slightly favors big
  49. // files. With the full range only six bits of the 32
  50. // need to match to trigger a conversion.
  51. const uint32_t src = instr;
  52. instr = 0x94000000;
  53. pc >>= 2;
  54. if (!is_encoder)
  55. pc = 0U - pc;
  56. instr |= (src + pc) & 0x03FFFFFF;
  57. write32le(buffer + i, instr);
  58. } else if ((instr & 0x9F000000) == 0x90000000) {
  59. // ADRP instruction:
  60. // Only values in the range +/-512 MiB are converted.
  61. //
  62. // Using less than the full +/-4 GiB range reduces
  63. // false positives on non-code sections of the input
  64. // while being excellent for executables up to 512 MiB.
  65. // The positive effect of ADRP conversion is smaller
  66. // than that of BL but it also doesn't hurt so much in
  67. // non-code sections of input because, with +/-512 MiB
  68. // range, nine bits of 32 need to match to trigger a
  69. // conversion (two 10-bit match choices = 9 bits).
  70. const uint32_t src = ((instr >> 29) & 3)
  71. | ((instr >> 3) & 0x001FFFFC);
  72. // With the addition only one branch is needed to
  73. // check the +/- range. This is usually false when
  74. // processing ARM64 code so branch prediction will
  75. // handle it well in terms of performance.
  76. //
  77. //if ((src & 0x001E0000) != 0
  78. // && (src & 0x001E0000) != 0x001E0000)
  79. if ((src + 0x00020000) & 0x001C0000)
  80. continue;
  81. instr &= 0x9000001F;
  82. pc >>= 12;
  83. if (!is_encoder)
  84. pc = 0U - pc;
  85. const uint32_t dest = src + pc;
  86. instr |= (dest & 3) << 29;
  87. instr |= (dest & 0x0003FFFC) << 3;
  88. instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
  89. write32le(buffer + i, instr);
  90. }
  91. }
  92. return i;
  93. }
  94. static lzma_ret
  95. arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
  96. const lzma_filter_info *filters, bool is_encoder)
  97. {
  98. return lzma_simple_coder_init(next, allocator, filters,
  99. &arm64_code, 0, 4, 4, is_encoder);
  100. }
  101. #ifdef HAVE_ENCODER_ARM64
  102. extern lzma_ret
  103. lzma_simple_arm64_encoder_init(lzma_next_coder *next,
  104. const lzma_allocator *allocator,
  105. const lzma_filter_info *filters)
  106. {
  107. return arm64_coder_init(next, allocator, filters, true);
  108. }
  109. #endif
  110. #ifdef HAVE_DECODER_ARM64
  111. extern lzma_ret
  112. lzma_simple_arm64_decoder_init(lzma_next_coder *next,
  113. const lzma_allocator *allocator,
  114. const lzma_filter_info *filters)
  115. {
  116. return arm64_coder_init(next, allocator, filters, false);
  117. }
  118. #endif