//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
SSE variable shifts can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

  ...
  __m128i shift_right(__m128i value, unsigned long offset) {
    return _mm_shuffle_epi8(value,
               _mm_loadu_si128((const __m128i *) (__m128i_shift_right + offset)));
  }
//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them.  For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
        addss   %xmm0, %xmm1
        pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps  %xmm0, %xmm3
        addss   %xmm1, %xmm3
        movdqa  %xmm2, %xmm0
        addss   %xmm3, %xmm0
        ret
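
A plausible lowering, sketched under the assumption that SSE3's haddps is
available and acceptable on the target, would be just:

_f32:                                   ## sketch, not current output
        haddps  %xmm0, %xmm0            ## xmm0 = {a0+a1, a2+a3, a0+a1, a2+a3}
        haddps  %xmm0, %xmm0            ## xmm0[0] = a0+a1+a2+a3
        ret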

Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa   %xmm0, %xmm2
        addss    %xmm1, %xmm2
        pshufd   $1, %xmm1, %xmm1       ## xmm1 = xmm1[1,0,0,0]
        pshufd   $1, %xmm0, %xmm3       ## xmm3 = xmm0[1,0,0,0]
        addss    %xmm1, %xmm3
        movaps   %xmm2, %xmm0
        unpcklps %xmm3, %xmm0           ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.
//===---------------------------------------------------------------------===//

Expand libm rounding functions inline:  Significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
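
A minimal sketch of what such a startup hook could do, assuming the standard
_MM_SET_FLUSH_ZERO_MODE / _MM_SET_DENORMALS_ZERO_MODE macros from
xmmintrin.h / pmmintrin.h (the function name is made up for illustration):

  #include <xmmintrin.h>
  #include <pmmintrin.h>

  /* Hypothetical prologue emitted for "main" under unsafe math. */
  static void enable_fast_sse_modes(void) {
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);           /* FTZ */
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);   /* DAZ */
  }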
//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.
//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.
The pattern isel got this one right.
//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
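
For example (a sketch, assuming a 64-byte fixed-size copy with %esi/%edi as
source/destination and both pointers known 16-byte aligned), the expansion
would be a handful of 128-bit moves instead of a call:

        movaps    (%esi), %xmm0
        movaps  16(%esi), %xmm1
        movaps  32(%esi), %xmm2
        movaps  48(%esi), %xmm3
        movaps  %xmm0,   (%edi)
        movaps  %xmm1, 16(%edi)
        movaps  %xmm2, 32(%edi)
        movaps  %xmm3, 48(%edi)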
  70. //===---------------------------------------------------------------------===//
  71. Codegen:
  72. if (copysign(1.0, x) == copysign(1.0, y))
  73. into:
  74. if (x^y & mask)
  75. when using SSE.
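
In other words, only the sign bits need to be compared.  A C-level sketch of
the idea (the function name is illustrative; assumes doubles with the usual
IEEE-754 layout):

  #include <stdint.h>
  #include <string.h>

  int same_sign(double x, double y) {
    uint64_t xi, yi;
    memcpy(&xi, &x, sizeof xi);       /* bitcast, no FP ops */
    memcpy(&yi, &y, sizeof yi);
    return ((xi ^ yi) & 0x8000000000000000ull) == 0;
  }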
//===---------------------------------------------------------------------===//

Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
of a v4sf value.
//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
Perhaps use pxor / xorp* to clear an XMM register first?
//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps  (%edx), %xmm2                   #59.21
        movaps  (%edx), %xmm5                   #60.21
        movaps  (%edx), %xmm4                   #61.21
        movaps  (%edx), %xmm3                   #62.21
        movl    40(%ecx), %ebp                  #69.49
        shufps  $0, %xmm2, %xmm5                #60.21
        movl    100(%esp), %ebx                 #69.20
        movl    (%ebx), %edi                    #69.20
        imull   %ebp, %edi                      #69.49
        addl    (%eax), %edi                    #70.33
        shufps  $85, %xmm2, %xmm4               #61.21
        shufps  $170, %xmm2, %xmm3              #62.21
        shufps  $255, %xmm2, %xmm2              #63.21
        lea     (%ebp,%ebp,2), %ebx             #69.49
        negl    %ebx                            #69.49
        lea     -3(%edi,%ebx), %ebx             #70.33
        shll    $4, %ebx                        #68.37
        addl    32(%ecx), %ebx                  #68.37
        testb   $15, %bl                        #91.13
        jne     L_B1.24         # Prob 5%       #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
        %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %eax = MOV32ri -3
        %edx = MOV32rm %stack.3, 1, %noreg, 0
        ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
        %edx = MOV32rm %stack.7, 1, %noreg, 0
        %edx = MOV32rm %edx, 1, %noreg, 40
        IMUL32rr %eax<def&use>, %edx
        %esi = MOV32rm %stack.5, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 0
        MOV32mr %stack.4, 1, %noreg, 0, %esi
        %eax = LEA32r %esi, 1, %eax, -3
        %esi = MOV32rm %stack.7, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 32
        %edi = MOV32rr %eax
        SHL32ri %edi<def&use>, 4
        ADD32rr %edi<def&use>, %esi
        %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
        %xmm1 = MOVAPSrr %xmm0
        SHUFPSrr %xmm1<def&use>, %xmm1, 170
        %xmm2 = MOVAPSrr %xmm0
        SHUFPSrr %xmm2<def&use>, %xmm2, 0
        %xmm3 = MOVAPSrr %xmm0
        SHUFPSrr %xmm3<def&use>, %xmm3, 255
        SHUFPSrr %xmm0<def&use>, %xmm0, 85
        %ebx = MOV32rr %edi
        AND32ri8 %ebx<def&use>, 15
        CMP32ri8 %ebx, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive two-address
opcode: because the same value appears as the source of more than one shufps,
a number of copies result. Note that icc suffers from the same problem. Either
the instruction selector should select pshufd, or the register allocator could
make the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.
//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

        movss   4(%esp), %xmm1
        mulss   %xmm1, %xmm1
        xorps   %xmm0, %xmm0
        movss   %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zeroed. We could compile this to:

        movss   4(%esp), %xmm0
        mulss   %xmm0, %xmm0
        ret
//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps  c(%esp), %xmm1
        xorps   %xmm0, %xmm0
        movss   %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

        movaps  c(%esp), %xmm1
        movaps  %xmm1, c2(%esp)
        ...

        xorps   %xmm0, %xmm0
        movaps  c2(%esp), %xmm1
        movss   %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps  c(%esp), %xmm1
        movaps  %xmm1, c2(%esp)
        ...

        movss   c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.
//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl    8(%esp), %eax
        movaps  (%eax), %xmm0
        pxor    %xmm1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $50, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl    8(%esp), %ecx
        movaps  (%ecx), %xmm0
        xor     %eax, %eax
        pinsrw  $6, %eax, %xmm0
        pinsrw  $7, %eax, %xmm0
        movaps  %xmm0, (%ecx)
        ret

?
//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.
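
The "select" pattern mentioned there is the usual bitwise blend; a sketch with
intrinsics (assuming mask is an all-ones/all-zeros per-lane mask, e.g. from
_mm_cmplt_ps; the function name is illustrative):

  #include <xmmintrin.h>

  static __m128 sse_select(__m128 mask, __m128 a, __m128 b) {
    /* (mask & a) | (~mask & b) */
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
  }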
//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.
//===---------------------------------------------------------------------===//

Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.
//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones

and:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones
        pslld   xmm1, 31        ; xmm1 = all 100000000000...

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.
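
The same trick expressed with SSE2 intrinsics (a sketch; compilers usually
materialize _mm_set1_epi32(-1) as pcmpeqd, so no memory access is needed):

  #include <emmintrin.h>

  static __m128 signbit_mask(void) {
    __m128i ones = _mm_set1_epi32(-1);                   /* all-ones */
    return _mm_castsi128_ps(_mm_slli_epi32(ones, 31));   /* 0x80000000 per lane */
  }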
//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to ( -O3 -static -fomit-frame-pointer):

_x:
        movzwl  4(%esp), %eax
        movd    %eax, %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret
_y:
        movd    4(%esp), %xmm0
        movaps  _a, %xmm1
        pslld   %xmm0, %xmm1
        movaps  %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
like movd would be sufficient in both cases as the value is already zero
extended in the 32-bit stack slot IIRC. For signed short, it should also be
safe, as a truly negative value would be undefined for pslld.
//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:

        subl    $12, %esp
        movsd   16(%esp), %xmm0
        movsd   %xmm0, (%esp)
        movl    4(%esp), %eax
        shrl    $31, %eax
        addl    $12, %esp
        ret

We should use movmskp{s|d} instead.
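
Desired codegen would look roughly like this (a sketch for the x86-32 calling
convention used above):

        movsd    4(%esp), %xmm0
        movmskpd %xmm0, %eax    # sign bits of both lanes into eax[1:0]
        andl     $1, %eax       # keep the sign of the low element
        ret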
//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load.  This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner.  This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack.  It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load.  If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.
//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load.  For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1          ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                                        #  <4 x float>
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
        .long   2147483648      # float -0
_ccosf:
        subl    $12, %esp
        movss   16(%esp), %xmm0
        movss   %xmm0, 4(%esp)
        movss   20(%esp), %xmm0
        xorps   LCPI1_0, %xmm0
        movss   %xmm0, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor.  This means that we need to handle this case in the x86 backend
instead of in target independent code.
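
The kind of code the transform is after might look like this (a sketch: all
values stay in integer registers and the sign flip is a plain xor):

_ccosf:
        subl    $12, %esp
        movl    16(%esp), %eax
        movl    %eax, 4(%esp)
        movl    20(%esp), %eax
        xorl    $0x80000000, %eax       # flip the sign bit, no constant pool
        movl    %eax, (%esp)
        call    L_ccoshf$stub
        addl    $12, %esp
        ret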
//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously.  Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal, it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything.  This is probably related
to <2 x i64> ops being so bad.
//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed.  The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be vector spills or not.
Stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:

#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
                          - 22725, - 12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):

madd:
        pushl   %ebp
        movl    %esp, %ebp
        andl    $-16, %esp
        movaps  .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl    %ebp, %esp
        popl    %ebp
        ret
//===---------------------------------------------------------------------===//

Consider:

#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss   4(%esp), %xmm0
        pshufd  $81, %xmm0, %xmm0
        ret

in x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps   %xmm1, %xmm1
        movss   %xmm0, %xmm1
        pshufd  $81, %xmm1, %xmm0
        ret

In SSE4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps $0x10, x2(%rip), %xmm0
        insertps $0x10, x3(%rip), %xmm1
        movaps   %xmm1, %xmm2
        movlhps  %xmm0, %xmm2
        movaps   %xmm2, %xmm0
        ret
//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
        %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
        ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0:                                        ##  <4 x i32>
        .long   10
        .long   10
        .long   10
        .long   10
        .text
        .align  4,0x90
        .globl  _f
_f:
        pshufd  $3, %xmm0, %xmm1
        movd    %xmm1, %eax
        imull   LCPI1_0+12, %eax
        movd    %eax, %xmm1
        pshufd  $1, %xmm0, %xmm2
        movd    %xmm2, %eax
        imull   LCPI1_0+4, %eax
        movd    %eax, %xmm2
        punpckldq       %xmm1, %xmm2
        movd    %xmm0, %eax
        imull   LCPI1_0, %eax
        movd    %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd    %xmm0, %eax
        imull   LCPI1_0+8, %eax
        movd    %eax, %xmm0
        punpckldq       %xmm0, %xmm1
        movaps  %xmm1, %xmm0
        punpckldq       %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.
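
For the multiply-by-10 example above, a shift-and-add expansion would be
roughly (a sketch; 10*x = 8*x + 2*x):

_f:
        movdqa  %xmm0, %xmm1
        pslld   $3, %xmm1       # x << 3  (x*8)
        pslld   $1, %xmm0       # x << 1  (x*2)
        paddd   %xmm1, %xmm0    # x*8 + x*2 = x*10
        ret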
//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:

        movl    $1, %eax
        xorps   %xmm0, %xmm0
        pinsrw  $2, %eax, %xmm0
        movzbl  4(%esp), %eax
        pinsrw  $3, %eax, %xmm0
        movl    $256, %eax
        pinsrw  $7, %eax, %xmm0
        ret

gcc-4.2:

        subl    $12, %esp
        movzbl  16(%esp), %eax
        movdqa  LC0, %xmm0
        pinsrw  $3, %eax, %xmm0
        addl    $12, %esp
        ret
        .const
        .align  4
LC0:
        .word   0
        .word   0
        .word   1
        .word   0
        .word   0
        .word   0
        .word   0
        .word   256

With SSE4, it should be
        movdqa  .LC0(%rip), %xmm0
        pinsrb  $6, %edi, %xmm0
//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Also, insertelement of a constant into a vector of constants
should result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align  4
LCPI1_1:                                        ##  float
        .long   1065353216      ## float 1

        .const
        .align  4
LCPI1_0:                                        ##  <4 x float>
        .space  4
        .long   1065353216      ## float 1
        .space  4
        .long   1065353216      ## float 1
        .text
        .align  4,0x90
        .globl  _t
_t:
        xorps   %xmm0, %xmm0
        movhps  LCPI1_0, %xmm0
        movss   LCPI1_1, %xmm1
        movaps  %xmm0, %xmm2
        shufps  $2, %xmm1, %xmm2
        shufps  $132, %xmm2, %xmm0
        movaps  %xmm0, 0
//===---------------------------------------------------------------------===//

rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext %x) nounwind {
  %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
  ret float %tmp12
}

compiles to:

_foo:
        subl    $4, %esp
        movzbl  8(%esp), %eax
        cvtsi2ss        %eax, %xmm0
        movss   %xmm0, (%esp)
        flds    (%esp)
        addl    $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.
//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.
//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE.  SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long   1082130432      ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl    $4, %esp
        movl    8(%esp), %eax
        movl    %eax, (%esp)
        fildl   (%esp)
        fmuls   LCPI1_0
        addl    $4, %esp
        ret

in SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl    $12, %esp
        cvtsi2sd        16(%esp), %xmm0
        mulsd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)
        fldl    (%esp)
        addl    $12, %esp
        ret

There are also other cases in scimark where using fpstack is better: it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.
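
For instance, the "load, add 1.0, store" idiom on the fp stack is just (a
sketch, with %eax holding the address):

        fldl    (%eax)
        fld1
        faddp
        fstpl   (%eax)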
//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): Perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)

define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}
//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd    %xmm0, %rax
        shrq    $32, %rax
        movl    %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.
//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs. This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles we could even replace the sqrtss
and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
cost of reduced accuracy.
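
A source-level sketch of the transform (same function, one divide and three
multiplies; under -ffast-math the reciprocal could be formed directly):

void foo(double, double, double);
void norm(double x, double y, double z) {
  double inv_scale = 1.0 / __builtin_sqrt(x*x + y*y + z*z);
  foo(x*inv_scale, y*inv_scale, z*inv_scale);
}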
//===---------------------------------------------------------------------===//