
//===---------------------------------------------------------------------===//
// Random ideas for the ARM backend (Thumb specific).
//===---------------------------------------------------------------------===//

* Add support for compiling functions in both ARM and Thumb mode, then taking
  the smallest.

* Add support for compiling individual basic blocks in thumb mode, when in a
  larger ARM function. This can be used for presumed cold code, like paths
  to abort (failure path of asserts), EH handling code, etc.

* Thumb doesn't have normal pre/post increment addressing modes, but you can
  load/store 32-bit integers with pre/postinc by using load/store multiple
  instrs with a single register (see the first sketch after this list).

* Make better use of the high registers r8, r10, r11, r12 (ip). Some variants
  of the add and cmp instructions can use high registers. Also, we can use them
  as temporaries to spill values into (see the second sketch after this list).

* In thumb mode, short, byte, and bool preferred alignments are currently set
  to 4 to accommodate an ISA restriction (i.e. for add sp, #imm, the immediate
  must be a multiple of 4).
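
A sketch of the single-register load/store-multiple trick mentioned above
(register choices are illustrative):

        ldmia r0!, {r1}         @ r1 = *r0; r0 += 4  (post-increment load)
        stmia r2!, {r1}         @ *r2 = r1; r2 += 4  (post-increment store)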
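
And a sketch of using an otherwise free high register as a spill temporary,
avoiding a stack slot (hypothetical register assignment):

        mov r8, r0              @ spill: hi-reg mov leaves CPSR untouched
        ...
        cmp r1, r8              @ some cmp / add variants read high regs directly
        mov r0, r8              @ reload without any memory traffic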

//===---------------------------------------------------------------------===//

Potential jumptable improvements:

* If we know the function size is less than (1 << 16) * 2 bytes, we can use
  16-bit jumptable entries (e.g. (L1 - L2) >> 1), or even smaller entries if
  the function is smaller still. This also applies to ARM (see the sketch at
  the end of this section).

* Thumb jumptable codegen can improve given some help from the assembler. This
  is what we generate right now:

        .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
LPCRELL0:
        mov r1, #PCRELV0
        add r1, pc
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3
        ...

Note there is another pc-relative add that we can take advantage of:

        add r1, pc, #imm_8 * 4

We should be able to generate:

LPCRELL0:
        add r1, LJTI1_0_0
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3

if the assembler can translate the add to:

        add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)

Note the assembler also does something similar for constpool loads:

LPCRELL0:
        ldr r0, LCPI1_0
=>
        ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
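
For the 16-bit entry idea in the first bullet, the table itself might look
like this (a sketch; it assumes the assembler can fold the label-difference
expressions, and LBB1_4 stands in for some second target):

LJTI1_0_0:
        .short (LBB1_3-LJTI1_0_0) >> 1  @ halfword entry, halved offset
        .short (LBB1_4-LJTI1_0_0) >> 1
        ...

The dispatch code would then ldrh the entry, double it, and add the table base
to recover the target address.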

//===---------------------------------------------------------------------===//

We compile the following:

define i16 @func_entry_2E_ce(i32 %i) {
        switch i32 %i, label %bb12.exitStub [
                 i32 0, label %bb4.exitStub
                 i32 1, label %bb9.exitStub
                 i32 2, label %bb4.exitStub
                 i32 3, label %bb4.exitStub
                 i32 7, label %bb9.exitStub
                 i32 8, label %bb.exitStub
                 i32 9, label %bb9.exitStub
        ]

bb12.exitStub:
        ret i16 0

bb4.exitStub:
        ret i16 1

bb9.exitStub:
        ret i16 2

bb.exitStub:
        ret i16 3
}

into:

_func_entry_2E_ce:
        mov r2, #1
        lsl r2, r0
        cmp r0, #9
        bhi LBB1_4      @ bb12.exitStub
LBB1_1: @ newFuncRoot
        mov r1, #13
        tst r2, r1
        bne LBB1_5      @ bb4.exitStub
LBB1_2: @ newFuncRoot
        ldr r1, LCPI1_0
        tst r2, r1
        bne LBB1_6      @ bb9.exitStub
LBB1_3: @ newFuncRoot
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1
        bne LBB1_7      @ bb.exitStub
LBB1_4: @ bb12.exitStub
        mov r0, #0
        bx lr
LBB1_5: @ bb4.exitStub
        mov r0, #1
        bx lr
LBB1_6: @ bb9.exitStub
        mov r0, #2
        bx lr
LBB1_7: @ bb.exitStub
        mov r0, #3
        bx lr
LBB1_8:
        .align 2
LCPI1_0:
        .long 642
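
(For reference: 642 = (1 << 1) | (1 << 7) | (1 << 9), the mask of the cases
that branch to bb9.exitStub; likewise 13 = (1 << 0) | (1 << 2) | (1 << 3)
covers the bb4 cases, and 256 = 1 << 8 covers bb.exitStub.)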

gcc compiles to:

        cmp     r0, #9
        @ lr needed for prologue
        bhi     L2
        ldr     r3, L11
        mov     r2, #1
        mov     r1, r2, asl r0
        ands    r0, r3, r2, asl r0
        movne   r0, #2
        bxne    lr
        tst     r1, #13
        beq     L9
L3:
        mov     r0, r2
        bx      lr
L9:
        tst     r1, #256
        movne   r0, #3
        bxne    lr
L2:
        mov     r0, #0
        bx      lr
L12:
        .align 2
L11:
        .long   642

GCC is doing a couple of clever things here:
  1. It is predicating one of the returns. This isn't a clear win though: in
     cases where that return isn't taken, it is replacing one condbranch with
     two 'ne' predicated instructions.
  2. It is sinking the shift of "1 << i" into the tst, and using ands instead
     of tst. This will probably require whole-function isel.
  3. GCC emits:
        tst     r1, #256
     we emit:
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1
     (Thumb1 mov immediates are only 8 bits, so 256 has to be materialized
     with a shift, while ARM mode encodes #256 directly.)

//===---------------------------------------------------------------------===//

When spilling in thumb mode and the sp offset is too large to fit in the ldr /
str offset field, we load the offset from a constpool entry and add it to sp:

        ldr r2, LCPI
        add r2, sp
        ldr r2, [r2]

These instructions preserve the condition code, which is important if the
spill is between a cmp and a bcc instruction. However, we can use the
(potentially) cheaper sequence if we know it's ok to clobber the condition
register:

        add r2, sp, #255 * 4
        add r2, #132
        ldr r2, [r2, #7 * 4]
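
(For reference: this sequence addresses sp + 1020 + 132 + 28 = sp + 1180. The
middle add is a flag-setting Thumb1 add with an 8-bit immediate, which is why
the condition register must be known dead.)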

This is especially bad when dynamic alloca is used: all fixed-size stack
objects are then referenced off the frame pointer with negative offsets. See
oggenc for an example.

//===---------------------------------------------------------------------===//

Poor codegen in test/CodeGen/ARM/select.ll f7:

        ldr r5, LCPI1_0
LPC0:
        add r5, pc
        ldr r6, LCPI1_1
        ldr r2, LCPI1_2
        mov r3, r6
        mov lr, pc
        bx r5

//===---------------------------------------------------------------------===//

Make the register allocator / spiller smarter so we can re-materialize
"mov r, imm", etc. Almost all Thumb instructions clobber the condition codes,
so rematerialization would have to verify that the flags are dead at each
reload point.

//===---------------------------------------------------------------------===//

Thumb load / store address mode offsets are scaled. The values kept in the
instruction operands are pre-scale values. This probably ought to be changed
to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.

//===---------------------------------------------------------------------===//

We need to make (some of the) Thumb1 instructions predicable. That will allow
shrinking of predicated Thumb2 instructions. To allow this, we need to be able
to toggle the 's' bit since they do not set CPSR when they are inside IT blocks.
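
A sketch of the encoding behavior in question (UAL syntax):

        movs r0, #1             @ outside an IT block the 16-bit MOV sets flags
        it   eq
        moveq r0, #1            @ the same 16-bit encoding inside an IT block
                                @ leaves CPSR alone, hence no 's' in UAL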

//===---------------------------------------------------------------------===//

Make use of the hi-register variants of cmp: tCMPhir / tCMPZhir.

//===---------------------------------------------------------------------===//

Thumb1 immediate fields sometimes keep pre-scaled values. See
ThumbRegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
Thumb2.

//===---------------------------------------------------------------------===//

Rather than having tBR_JTr print a ".align 2" and having the constant island
pass pad it, add a target-specific ALIGN instruction instead. That way,
getInstSizeInBytes won't have to over-estimate. It can also be used by the
loop alignment pass.

//===---------------------------------------------------------------------===//

We generate conditional code for icmp when we don't need to. This code:

  int foo(int s) {
    return s == 1;
  }

produces:

foo:
        cmp     r0, #1
        mov.w   r0, #0
        it      eq
        moveq   r0, #1
        bx      lr

when it could use subs + adcs. This is GCC PR46975.
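
One flag-based possibility (a sketch only; the exact sequence the note has in
mind may differ):

foo:
        subs    r0, r0, #1      @ r0 = s - 1; Z set iff s == 1
        rsbs    r1, r0, #0      @ r1 = -(s-1); carry set iff (s-1) == 0
        adcs    r0, r0, r1      @ (s-1) + -(s-1) + carry  =>  r0 = (s == 1)
        bx      lr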