riscv.c 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755
  1. // SPDX-License-Identifier: 0BSD
  2. ///////////////////////////////////////////////////////////////////////////////
  3. //
  4. /// \file riscv.c
  5. /// \brief Filter for 32-bit/64-bit little/big endian RISC-V binaries
  6. ///
  7. /// This converts program counter relative addresses in function calls
  8. /// (JAL, AUIPC+JALR), address calculation of functions and global
  9. /// variables (AUIPC+ADDI), loads (AUIPC+load), and stores (AUIPC+store).
  10. ///
  11. /// For AUIPC+inst2 pairs, the paired instruction checking is fairly relaxed.
  12. /// The paired instruction opcode must only have its lowest two bits set,
  13. /// meaning it will convert any paired instruction that is not a 16-bit
  14. /// compressed instruction. This was shown to be enough to keep the number
  15. /// of false matches low while improving code size and speed.
  16. //
  17. // Authors: Lasse Collin
  18. // Jia Tan
  19. //
  20. // Special thanks:
  21. //
  22. // - Chien Wong <m@xv97.com> provided a few early versions of RISC-V
  23. // filter variants along with test files and benchmark results.
  24. //
  25. // - Igor Pavlov helped a lot in the filter design, getting it both
  26. // faster and smaller. The implementation here is still independently
  27. // written, not based on LZMA SDK.
  28. //
  29. ///////////////////////////////////////////////////////////////////////////////
  30. /*
  31. RISC-V filtering
  32. ================
  33. RV32I and RV64I, possibly combined with extensions C, Zfh, F, D,
  34. and Q, are identical enough that the same filter works for both.
  35. The instruction encoding is always little endian, even on systems
  36. with big endian data access. Thus the same filter works for both
  37. endiannesses.
  38. The following instructions have program counter relative
  39. (pc-relative) behavior:
  40. JAL
  41. ---
  42. JAL is used for function calls (including tail calls) and
  43. unconditional jumps within functions. Jumps within functions
  44. aren't useful to filter because the absolute addresses often
  45. appear only once or at most a few times. Tail calls and jumps
  46. within functions look the same to a simple filter so neither
  47. are filtered, that is, JAL x0 is ignored (the ABI name of the
  48. register x0 is "zero").
  49. Almost all calls store the return address to register x1 (ra)
  50. or x5 (t0). To reduce false matches when the filter is applied
  51. to non-code data, only the JAL instructions that use x1 or x5
  52. are converted. JAL has pc-relative range of +/-1 MiB so longer
  53. calls and jumps need another method (AUIPC+JALR).
  54. C.J and C.JAL
  55. -------------
  56. C.J and C.JAL have pc-relative range of +/-2 KiB.
  57. C.J is for tail calls and jumps within functions and isn't
  58. filtered for the reasons mentioned for JAL x0.
  59. C.JAL is an RV32C-only instruction. Its encoding overlaps with
  60. RV64C-only C.ADDIW which is a common instruction. So if filtering
  61. C.JAL was useful (it wasn't tested) then a separate filter would
  62. be needed for RV32 and RV64. Also, false positives would be a
  63. significant problem when the filter is applied to non-code data
  64. because C.JAL needs only five bits to match. Thus, this filter
  65. doesn't modify C.JAL instructions.
  66. BEQ, BNE, BLT, BGE, BLTU, BGEU, C.BEQZ, and C.BNEZ
  67. --------------------------------------------------
  68. These are conditional branches with pc-relative range
  69. of +/-4 KiB (+/-256 B for C.*). The absolute addresses often
  70. appear only once and very short distances are the most common,
  71. so filtering these instructions would make compression worse.
  72. AUIPC with rd != x0
  73. -------------------
  74. AUIPC is paired with a second instruction (inst2) to do
  75. pc-relative jumps, calls, loads, stores, and for taking
  76. an address of a symbol. AUIPC has a 20-bit immediate and
  77. the possible inst2 choices have a 12-bit immediate.
  78. AUIPC stores pc + 20-bit signed immediate to a register.
  79. The immediate encodes a multiple of 4 KiB so AUIPC itself
  80. has a pc-relative range of +/-2 GiB. AUIPC does *NOT* set
  81. the lowest 12 bits of the result to zero! This means that
  82. the 12-bit immediate in inst2 cannot just include the lowest
  83. 12 bits of the absolute address as is; the immediate has to
  84. compensate for the lowest 12 bits that AUIPC copies from the
  85. program counter. This means that a good filter has to convert
  86. not only AUIPC but also the paired inst2.
  87. A strict filter would focus on filtering the following
  88. AUIPC+inst2 pairs:
  89. - AUIPC+JALR: Function calls, including tail calls.
  90. - AUIPC+ADDI: Calculating the address of a function
  91. or a global variable.
  92. - AUIPC+load/store from the base instruction sets
  93. (RV32I, RV64I) or from the floating point extensions
  94. Zfh, F, D, and Q:
  95. * RV32I: LB, LH, LW, LBU, LHU, SB, SH, SW
  96. * RV64I has also: LD, LWU, SD
  97. * Zfh: FLH, FSH
  98. * F: FLW, FSW
  99. * D: FLD, FSD
  100. * Q: FLQ, FSQ
  101. NOTE: AUIPC+inst2 can only be a pair if AUIPC's rd specifies
  102. the same register as inst2's rs1.
  103. Instead of strictly accepting only the above instructions as inst2,
  104. this filter uses a much simpler condition: the lowest two bits of
  105. inst2 must be set, that is, inst2 must not be a 16-bit compressed
  106. instruction. So this will accept all 32-bit and possible future
  107. extended instructions as a pair to AUIPC if the bits in AUIPC's
  108. rd [11:7] match the bits [19:15] in inst2 (the bits that I-type and
  109. S-type instructions use for rs1). Testing showed that this relaxed
  110. condition for inst2 did not consistently or significantly affect
  111. compression ratio but it reduced code size and improved speed.
  112. Additionally, the paired instruction is always treated as an I-type
  113. instruction. The S-type instructions used by stores (SB, SH, SW,
  114. etc.) place the lowest 5 bits of the immediate in a different
  115. location than I-type instructions. AUIPC+store pairs are less
  116. common than other pairs, and testing showed that the extra
  117. code required to handle S-type instructions was not worth the
  118. compression ratio gained.
  119. AUIPC+inst2 don't necessarily appear sequentially next to each
  120. other although very often they do. Especially AUIPC+JALR are
  121. sequential as that may allow instruction fusion in processors
  122. (and perhaps help branch prediction as a fused AUIPC+JALR is
  123. a direct branch while JALR alone is an indirect branch).
  124. Clang 16 can generate code where AUIPC+inst2 is split:
  125. - AUIPC is outside a loop and inst2 (load/store) is inside
  126. the loop. This way the AUIPC instruction needs to be
  127. executed only once.
  128. - Load-modify-store may have AUIPC for the load and the same
  129. AUIPC-result is used for the store too. This may get combined
  130. with AUIPC being outside the loop.
  131. - AUIPC is before a conditional branch and inst2 is hundreds
  132. of bytes away at the branch target.
  133. - Inner and outer pair:
  134. auipc a1,0x2f
  135. auipc a2,0x3d
  136. ld a2,-500(a2)
  137. addi a1,a1,-233
  138. - Many split pairs with an untaken conditional branch between:
  139. auipc s9,0x1613 # Pair 1
  140. auipc s4,0x1613 # Pair 2
  141. auipc s6,0x1613 # Pair 3
  142. auipc s10,0x1613 # Pair 4
  143. beqz a5,a3baae
  144. ld a0,0(a6)
  145. ld a6,246(s9) # Pair 1
  146. ld a1,250(s4) # Pair 2
  147. ld a3,254(s6) # Pair 3
  148. ld a4,258(s10) # Pair 4
  149. It's not possible to find all split pairs in a filter like this.
  150. At least in 2024, simple sequential pairs are 99 % of AUIPC uses
  151. so filtering only such pairs gives good results and makes the
  152. filter simpler. However, it's possible that future compilers will
  153. produce different code where sequential pairs aren't as common.
  154. This filter doesn't convert AUIPC instructions alone because:
  155. (1) The conversion would be off-by-one (or off-by-4096) half the
  156. time because the lowest 12 bits from inst2 (inst2_imm12)
  157. aren't known. We only know that the absolute address is
  158. pc + AUIPC_imm20 + [-2048, +2047] but there is no way to
  159. know the exact 4096-byte multiple (or 4096 * n + 2048):
  160. there are always two possibilities because AUIPC copies
  161. the 12 lowest bits from pc instead of zeroing them.
  162. NOTE: The sign-extension of inst2_imm12 adds a tiny bit
  163. of extra complexity to AUIPC math in general but it's not
  164. the reason for this problem. The sign-extension only changes
  165. the relative position of the pc-relative 4096-byte window.
  166. (2) Matching AUIPC instruction alone requires only seven bits.
  167. When the filter is applied to non-code data, that leads
  168. to many false positives which make compression worse.
  169. As long as most AUIPC+inst2 pairs appear as two consecutive
  170. instructions, converting only such pairs gives better results.
  171. In assembly, AUIPC+inst2 tend to look like this:
  172. # Call:
  173. auipc ra, 0x12345
  174. jalr ra, -42(ra)
  175. # Tail call:
  176. auipc t1, 0x12345
  177. jalr zero, -42(t1)
  178. # Getting the absolute address:
  179. auipc a0, 0x12345
  180. addi a0, a0, -42
  181. # rd of inst2 isn't necessarily the same as rs1 even
  182. # in cases where there is no reason to preserve rs1.
  183. auipc a0, 0x12345
  184. addi a1, a0, -42
  185. As of 2024, 16-bit instructions from the C extension don't
  186. appear as inst2. The RISC-V psABI doesn't list AUIPC+C.* as
  187. a linker relaxation type explicitly but it's not disallowed
  188. either. Usefulness is limited as most of the time the lowest
  189. 12 bits won't fit in a C instruction. This filter doesn't
  190. support AUIPC+C.* combinations because this makes the filter
  191. simpler, there are no test files, and it hopefully will never
  192. be needed anyway.
  193. (Compare AUIPC to ARM64 where ADRP does set the lowest 12 bits
  194. to zero. The paired instruction has the lowest 12 bits of the
  195. absolute address as is in a zero-extended immediate. Thus the
  196. ARM64 filter doesn't need to care about the instructions that
  197. are paired with ADRP. An off-by-4096 issue can still occur if
  198. the code section isn't aligned with the filter's start offset.
  199. It's not a problem with standalone ELF files but Windows PE
  200. files need start_offset=3072 for best results. Also, a .tar
  201. stores files with 512-byte alignment so most of the time it
  202. won't be the best for ARM64.)
  203. AUIPC with rd == x0
  204. -------------------
  205. AUIPC instructions with rd=x0 are reserved for HINTs in the base
  206. instruction set. Such AUIPC instructions are never filtered.
  207. As of January 2024, it seems likely that AUIPC with rd=x0 will
  208. be used for landing pads (pseudoinstruction LPAD). LPAD is used
  209. to mark valid targets for indirect jumps (for JALR), for example,
  210. beginnings of functions. The 20-bit immediate in LPAD instruction
  211. is a label, not a pc-relative address. Thus it would be
  212. counterproductive to convert AUIPC instructions with rd=x0.
  213. Often the next instruction after LPAD won't have rs1=x0 and thus
  214. the filtering would be skipped for that reason alone. However,
  215. it's not good to rely on this. For example, consider a function
  216. that begins like this:
  217. int foo(int i)
  218. {
  219. if (i <= 234) {
  220. ...
  221. }
  222. A compiler may generate something like this:
  223. lpad 0x54321
  224. li a5, 234
  225. bgt a0, a5, .L2
  226. Converting the pseudoinstructions to raw instructions:
  227. auipc x0, 0x54321
  228. addi x15, x0, 234
  229. blt x15, x10, .L2
  230. In this case the filter would undesirably convert the AUIPC+ADDI
  231. pair if the filter didn't explicitly skip AUIPC instructions
  232. that have rd=x0.
  233. */
  234. #include "simple_private.h"
  235. // This checks two conditions at once:
  236. // - AUIPC rd == inst2 rs1.
  237. // - inst2 opcode has the lowest two bits set.
  238. //
  239. // The 8 bit left shift aligns the rd of AUIPC with the rs1 of inst2.
  240. // By XORing the registers, any non-zero value in those bits indicates the
  241. // registers are not equal and thus not an AUIPC pair. Subtracting 3 from
  242. // inst2 will zero out the first two opcode bits only when they are set.
  243. // The mask tests if any of the register or opcode bits are set (and thus
  244. // not an AUIPC pair).
  245. //
  246. // Alternative expression: (((((auipc) << 8) ^ (inst2)) & 0xF8003) != 3)
  247. #define NOT_AUIPC_PAIR(auipc, inst2) \
  248. ((((auipc) << 8) ^ ((inst2) - 3)) & 0xF8003)
  249. // This macro checks multiple conditions:
  250. // (1) AUIPC rd [11:7] == x2 (special rd value).
  251. // (2) AUIPC bits 12 and 13 set (the lowest two opcode bits of packed inst2).
  252. // (3) inst2_rs1 doesn't equal x0 or x2 because the opposite
  253. // conversion is only done when
  254. // auipc_rd != x0 &&
  255. // auipc_rd != x2 &&
  256. // auipc_rd == inst2_rs1.
  257. //
  258. // The left-hand side takes care of (1) and (2).
  259. // (a) The lowest 7 bits are already known to be AUIPC so subtracting 0x17
  260. // makes those bits zeros.
  261. // (b) If AUIPC rd equals x2, subtracting 0x100 makes bits [11:7] zeros.
  262. // If rd doesn't equal x2, then there will be at least one non-zero bit
  263. // and the next step (c) is irrelevant.
  264. // (c) If the lowest two opcode bits of the packed inst2 are set in [13:12],
  265. // then subtracting 0x3000 will make those bits zeros. Otherwise there
  266. // will be at least one non-zero bit.
  267. //
  268. // The shift by 18 removes the high bits from the final '>=' comparison and
  269. // ensures that any non-zero result will be larger than any possible result
  270. // from the right-hand side of the comparison. The cast ensures that the
  271. // left-hand side didn't get promoted to a larger type than uint32_t.
  272. //
  273. // On the right-hand side, inst2_rs1 & 0x1D will be non-zero as long as
  274. // inst2_rs1 is not x0 or x2.
  275. //
  276. // The final '>=' comparison will make the expression true if:
  277. // - The subtraction caused any bits to be set (special AUIPC rd value not
  278. // used or inst2 opcode bits not set). (non-zero >= non-zero or 0)
  279. // - The subtraction did not cause any bits to be set but inst2_rs1 was
  280. // x0 or x2. (0 >= 0)
  281. #define NOT_SPECIAL_AUIPC(auipc, inst2_rs1) \
  282. ((uint32_t)(((auipc) - 0x3117) << 18) >= ((inst2_rs1) & 0x1D))
  283. // The encode and decode functions are split for this filter because of the
  284. // AUIPC+inst2 filtering. This filter design allows a decoder-only
  285. // implementation to be smaller than alternative designs.
  286. #ifdef HAVE_ENCODER_RISCV
  287. static size_t
  288. riscv_encode(void *simple lzma_attribute((__unused__)),
  289. uint32_t now_pos,
  290. bool is_encoder lzma_attribute((__unused__)),
  291. uint8_t *buffer, size_t size)
  292. {
  293. // Avoid using i + 8 <= size in the loop condition.
  294. //
  295. // NOTE: If there is a JAL in the last six bytes of the stream, it
  296. // won't be converted. This is intentional to keep the code simpler.
  297. if (size < 8)
  298. return 0;
  299. size -= 8;
  300. size_t i;
  301. // The loop is advanced by 2 bytes every iteration since the
  302. // instruction stream may include 16-bit instructions (C extension).
  303. for (i = 0; i <= size; i += 2) {
  304. uint32_t inst = buffer[i];
  305. if (inst == 0xEF) {
  306. // JAL
  307. const uint32_t b1 = buffer[i + 1];
  308. // Only filter rd=x1(ra) and rd=x5(t0).
  309. if ((b1 & 0x0D) != 0)
  310. continue;
  311. // The 20-bit immediate is in four pieces.
  312. // The encoder stores it in big endian form
  313. // since it improves compression slightly.
  314. const uint32_t b2 = buffer[i + 2];
  315. const uint32_t b3 = buffer[i + 3];
  316. const uint32_t pc = now_pos + (uint32_t)i;
  317. // The following chart shows the highest three bytes of JAL, focusing on
  318. // the 20-bit immediate field [31:12]. The first row of numbers is the
  319. // bit position in a 32-bit little endian instruction. The second row of
  320. // numbers shows the order of the immediate field in a J-type instruction.
  321. // The last row is the bit number in each byte.
  322. //
  323. // To determine the amount to shift each bit, subtract the value in
  324. // the last row from the value in the second last row. If the number
  325. // is positive, shift left. If negative, shift right.
  326. //
  327. // For example, at the rightmost side of the chart, the bit 4 in b1 is
  328. // the bit 12 of the address. Thus that bit needs to be shifted left
  329. // by 12 - 4 = 8 bits to put it in the right place in the addr variable.
  330. //
  331. // NOTE: The immediate of a J-type instruction holds bits [20:1] of
  332. // the address. The bit [0] is always 0 and not part of the immediate.
  333. //
  334. // | b3 | b2 | b1 |
  335. // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
  336. // | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x |
  337. // | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x |
  338. uint32_t addr = ((b1 & 0xF0) << 8)
  339. | ((b2 & 0x0F) << 16)
  340. | ((b2 & 0x10) << 7)
  341. | ((b2 & 0xE0) >> 4)
  342. | ((b3 & 0x7F) << 4)
  343. | ((b3 & 0x80) << 13);
  344. addr += pc;
  345. buffer[i + 1] = (uint8_t)((b1 & 0x0F)
  346. | ((addr >> 13) & 0xF0));
  347. buffer[i + 2] = (uint8_t)(addr >> 9);
  348. buffer[i + 3] = (uint8_t)(addr >> 1);
  349. // The "-2" is included because the for-loop will
  350. // always increment by 2. In this case, we want to
  351. // skip an extra 2 bytes since we used 4 bytes
  352. // of input.
  353. i += 4 - 2;
  354. } else if ((inst & 0x7F) == 0x17) {
  355. // AUIPC
  356. inst |= (uint32_t)buffer[i + 1] << 8;
  357. inst |= (uint32_t)buffer[i + 2] << 16;
  358. inst |= (uint32_t)buffer[i + 3] << 24;
  359. // Branch based on AUIPC's rd. The bitmask test does
  360. // the same thing as this:
  361. //
  362. // const uint32_t auipc_rd = (inst >> 7) & 0x1F;
  363. // if (auipc_rd != 0 && auipc_rd != 2) {
  364. if (inst & 0xE80) {
  365. // AUIPC's rd doesn't equal x0 or x2.
  366. // Check if AUIPC+inst2 are a pair.
  367. uint32_t inst2 = read32le(buffer + i + 4);
  368. if (NOT_AUIPC_PAIR(inst, inst2)) {
  369. // The NOT_AUIPC_PAIR macro allows
  370. // a false AUIPC+AUIPC pair if the
  371. // bits [19:15] (where rs1 would be)
  372. // in the second AUIPC match the rd
  373. // of the first AUIPC.
  374. //
  375. // We must skip enough forward so
  376. // that the first two bytes of the
  377. // second AUIPC cannot get converted.
  378. // Such a conversion could make the
  379. // current pair become a valid pair
  380. // which would desync the decoder.
  381. //
  382. // Skipping six bytes is enough even
  383. // though the above condition looks
  384. // at the lowest four bits of the
  385. // buffer[i + 6] too. This is safe
  386. // because this filter never changes
  387. // those bits if a conversion at
  388. // that position is done.
  389. i += 6 - 2;
  390. continue;
  391. }
  392. // Convert AUIPC+inst2 to a special format:
  393. //
  394. // - The lowest 7 bits [6:0] retain the
  395. // AUIPC opcode.
  396. //
  397. // - The rd [11:7] is set to x2(sp). x2 is
  398. // used as the stack pointer so AUIPC with
  399. // rd=x2 should be very rare in real-world
  400. // executables.
  401. //
  402. // - The remaining 20 bits [31:12] (that
  403. // normally hold the pc-relative immediate)
  404. // are used to store the lowest 20 bits of
  405. // inst2. That is, the 12-bit immediate of
  406. // inst2 is not included.
  407. //
  408. // - The location of the original inst2 is
  409. // used to store the 32-bit absolute
  410. // address in big endian format. Compared
  411. // to the 20+12-bit split encoding, this
  412. // results in a longer uninterrupted
  413. // sequence of identical common bytes
  414. // when the same address is referred
  415. // with different instruction pairs
  416. // (like AUIPC+LD vs. AUIPC+ADDI) or
  417. // when the occurrences of the same
  418. // pair use different registers. When
  419. // referring to adjacent memory locations
  420. // (like function calls that go via the
  421. // ELF PLT), in big endian order only the
  422. // last 1-2 bytes differ; in little endian
  423. // the differing 1-2 bytes would be in the
  424. // middle of the 8-byte sequence.
  425. //
  426. // When reversing the transformation, the
  427. // original rd of AUIPC can be restored
  428. // from inst2's rs1 as they are required to
  429. // be the same.
  430. // Arithmetic right shift makes sign extension
  431. // trivial but (1) it's implementation-defined
  432. // behavior (C99/C11/C23 6.5.7-p5) and so is
  433. // (2) casting unsigned to signed (6.3.1.3-p3).
  434. //
  435. // One can check for (1) with
  436. //
  437. // if ((-1 >> 1) == -1) ...
  438. //
  439. // but (2) has to be checked from the
  440. // compiler docs. GCC promises that (1)
  441. // and (2) behave in the common expected
  442. // way and thus
  443. //
  444. // addr += (uint32_t)(
  445. // (int32_t)inst2 >> 20);
  446. //
  447. // does the same as the code below. But since
  448. // the 100 % portable way is only a few bytes
  449. // bigger code and there is no real speed
  450. // difference, let's just use that, especially
  451. // since the decoder doesn't need this at all.
  452. uint32_t addr = inst & 0xFFFFF000;
  453. addr += (inst2 >> 20)
  454. - ((inst2 >> 19) & 0x1000);
  455. addr += now_pos + (uint32_t)i;
  456. // Construct the first 32 bits:
  457. // [6:0] AUIPC opcode
  458. // [11:7] Special AUIPC rd = x2
  459. // [31:12] The lowest 20 bits of inst2
  460. inst = 0x17 | (2 << 7) | (inst2 << 12);
  461. write32le(buffer + i, inst);
  462. // The second 32 bits store the absolute
  463. // address in big endian order.
  464. write32be(buffer + i + 4, addr);
  465. } else {
  466. // AUIPC's rd equals x0 or x2.
  467. //
  468. // x0 indicates a landing pad (LPAD).
  469. // It's always skipped.
  470. //
  471. // AUIPC with rd == x2 is used for the special
  472. // format as explained above. When the input
  473. // contains a byte sequence that matches the
  474. // special format, "fake" decoding must be
  475. // done to keep the filter bijective (that
  476. // is, safe to apply on arbitrary data).
  477. //
  478. // See the "x0 or x2" section in riscv_decode()
  479. // for how the "real" decoding is done. The
  480. // "fake" decoding is a simplified version
  481. // of "real" decoding with the following
  482. // differences (these reduce code size of
  483. // the decoder):
  484. // (1) The lowest 12 bits aren't sign-extended.
  485. // (2) No address conversion is done.
  486. // (3) Big endian format isn't used (the fake
  487. // address is in little endian order).
  488. // Check if inst matches the special format.
  489. const uint32_t fake_rs1 = inst >> 27;
  490. if (NOT_SPECIAL_AUIPC(inst, fake_rs1)) {
  491. i += 4 - 2;
  492. continue;
  493. }
  494. const uint32_t fake_addr =
  495. read32le(buffer + i + 4);
  496. // Construct the second 32 bits:
  497. // [19:0] Upper 20 bits from AUIPC
  498. // [31:20] The lowest 12 bits of fake_addr
  499. const uint32_t fake_inst2 = (inst >> 12)
  500. | (fake_addr << 20);
  501. // Construct new first 32 bits from:
  502. // [6:0] AUIPC opcode
  503. // [11:7] Fake AUIPC rd = fake_rs1
  504. // [31:12] The highest 20 bits of fake_addr
  505. inst = 0x17 | (fake_rs1 << 7)
  506. | (fake_addr & 0xFFFFF000);
  507. write32le(buffer + i, inst);
  508. write32le(buffer + i + 4, fake_inst2);
  509. }
  510. i += 8 - 2;
  511. }
  512. }
  513. return i;
  514. }
  515. extern lzma_ret
  516. lzma_simple_riscv_encoder_init(lzma_next_coder *next,
  517. const lzma_allocator *allocator,
  518. const lzma_filter_info *filters)
  519. {
  520. return lzma_simple_coder_init(next, allocator, filters,
  521. &riscv_encode, 0, 8, 2, true);
  522. }
  523. #endif
  524. #ifdef HAVE_DECODER_RISCV
  525. static size_t
  526. riscv_decode(void *simple lzma_attribute((__unused__)),
  527. uint32_t now_pos,
  528. bool is_encoder lzma_attribute((__unused__)),
  529. uint8_t *buffer, size_t size)
  530. {
  531. if (size < 8)
  532. return 0;
  533. size -= 8;
  534. size_t i;
  535. for (i = 0; i <= size; i += 2) {
  536. uint32_t inst = buffer[i];
  537. if (inst == 0xEF) {
  538. // JAL
  539. const uint32_t b1 = buffer[i + 1];
  540. // Only filter rd=x1(ra) and rd=x5(t0).
  541. if ((b1 & 0x0D) != 0)
  542. continue;
  543. const uint32_t b2 = buffer[i + 2];
  544. const uint32_t b3 = buffer[i + 3];
  545. const uint32_t pc = now_pos + (uint32_t)i;
  546. // | b3 | b2 | b1 |
  547. // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
  548. // | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x |
  549. // | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x |
  550. uint32_t addr = ((b1 & 0xF0) << 13)
  551. | (b2 << 9) | (b3 << 1);
  552. addr -= pc;
  553. buffer[i + 1] = (uint8_t)((b1 & 0x0F)
  554. | ((addr >> 8) & 0xF0));
  555. buffer[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
  556. | ((addr >> 7) & 0x10)
  557. | ((addr << 4) & 0xE0));
  558. buffer[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
  559. | ((addr >> 13) & 0x80));
  560. i += 4 - 2;
  561. } else if ((inst & 0x7F) == 0x17) {
  562. // AUIPC
  563. uint32_t inst2;
  564. inst |= (uint32_t)buffer[i + 1] << 8;
  565. inst |= (uint32_t)buffer[i + 2] << 16;
  566. inst |= (uint32_t)buffer[i + 3] << 24;
  567. if (inst & 0xE80) {
  568. // AUIPC's rd doesn't equal x0 or x2.
  569. // Check if it is a "fake" AUIPC+inst2 pair.
  570. inst2 = read32le(buffer + i + 4);
  571. if (NOT_AUIPC_PAIR(inst, inst2)) {
  572. i += 6 - 2;
  573. continue;
  574. }
  575. // Decode (or more like re-encode) the "fake"
  576. // pair. The "fake" format doesn't do
  577. // sign-extension, address conversion, or
  578. // use big endian. (The use of little endian
  579. // allows sharing the write32le() calls in
  580. // the decoder to reduce code size when
  581. // unaligned access isn't supported.)
  582. uint32_t addr = inst & 0xFFFFF000;
  583. addr += inst2 >> 20;
  584. inst = 0x17 | (2 << 7) | (inst2 << 12);
  585. inst2 = addr;
  586. } else {
  587. // AUIPC's rd equals x0 or x2.
  588. // Check if inst matches the special format
  589. // used by the encoder.
  590. const uint32_t inst2_rs1 = inst >> 27;
  591. if (NOT_SPECIAL_AUIPC(inst, inst2_rs1)) {
  592. i += 4 - 2;
  593. continue;
  594. }
  595. // Decode the "real" pair.
  596. uint32_t addr = read32be(buffer + i + 4);
  597. addr -= now_pos + (uint32_t)i;
  598. // The second instruction:
  599. // - Get the lowest 20 bits from inst.
  600. // - Add the lowest 12 bits of the address
  601. // as the immediate field.
  602. inst2 = (inst >> 12) | (addr << 20);
  603. // AUIPC:
  604. // - rd is the same as inst2_rs1.
  605. // - The sign extension of the lowest 12 bits
  606. // must be taken into account.
  607. inst = 0x17 | (inst2_rs1 << 7)
  608. | ((addr + 0x800) & 0xFFFFF000);
  609. }
  610. // Both decoder branches write in little endian order.
  611. write32le(buffer + i, inst);
  612. write32le(buffer + i + 4, inst2);
  613. i += 8 - 2;
  614. }
  615. }
  616. return i;
  617. }
  618. extern lzma_ret
  619. lzma_simple_riscv_decoder_init(lzma_next_coder *next,
  620. const lzma_allocator *allocator,
  621. const lzma_filter_info *filters)
  622. {
  623. return lzma_simple_coder_init(next, allocator, filters,
  624. &riscv_decode, 0, 8, 2, false);
  625. }
  626. #endif