floatundisf.S 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  2. // See https://llvm.org/LICENSE.txt for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "../assembly.h"
  5. // float __floatundisf(du_int a);
  6. // Note that there is a hardware instruction, fildll, that does most of what
  7. // this function needs to do. However, because of our ia32 ABI, it will take
  8. // a write-small read-large stall, so the software implementation here is
  9. // actually several cycles faster.
  10. // This is a branch-free implementation. A branchy implementation might be
  11. // faster for the common case if you know something a priori about the input
  12. // distribution.
  13. /* branch-free x87 implementation - one cycle slower than without x87.
  14. #ifdef __i386__
  15. CONST_SECTION
  16. .balign 8
  17. .quad 0x43f0000000000000
  18. twop64: .quad 0x0000000000000000
  19. #define TWOp64 twop64-0b(%ecx,%eax,8)
  20. .text
  21. .balign 4
  22. DEFINE_COMPILERRT_FUNCTION(__floatundisf)
  23. movl 8(%esp), %eax
  24. movd 8(%esp), %xmm1
  25. movd 4(%esp), %xmm0
  26. punpckldq %xmm1, %xmm0
  27. calll 0f
  28. 0: popl %ecx
  29. sarl $31, %eax
  30. movq %xmm0, 4(%esp)
  31. fildll 4(%esp)
  32. faddl TWOp64
  33. fstps 4(%esp)
  34. flds 4(%esp)
  35. ret
  36. END_COMPILERRT_FUNCTION(__floatundisf)
  37. #endif // __i386__
  38. */
  39. // branch-free, x87-free implementation - faster at the expense of code size
  40. #ifdef __i386__
  // Constant data for __floatundisf below. Both macros at the end of this
  // block address the data relative to the function's local label `0:`, whose
  // run-time address the function places in %ecx (calll 0f / popl %ecx).
  //
  // The layout is load-bearing: STICKY is indexed by %eax, which the function
  // sets to -1 or 0, so it reads either the 8 bytes immediately before
  // `sticky` or `sticky` itself. twop52 is 16-byte aligned and exactly 16
  // bytes long, so the `.balign 16` in front of `sticky` adds no padding and
  // sticky-8 is the 0xfff quad stored as the second element of twop52.
  // Do not insert or reorder data between twop52 and sticky.
  41. CONST_SECTION
  42. .balign 16
  43. twop52:
  // Bit pattern of the double 0x1.0p52; loaded through TWOp52.
  44. .quad 0x4330000000000000
  // Low-12-bit mask; this is what STICKY loads when %eax == -1.
  45. .quad 0x0000000000000fff
  46. .balign 16
  47. sticky:
  // All-zero mask; this is what STICKY loads when %eax == 0.
  48. .quad 0x0000000000000000
  // NOTE(review): the .long below and the `twelve` word are not referenced by
  // any instruction visible in this file (the function materialises 12 with
  // `movl $12, %edx`); they look like leftovers -- confirm before removing.
  49. .long 0x00000012
  50. .balign 16
  51. twelve:
  52. .long 0x00000000
  // TWOp52: memory operand for the 2^52 constant, relative to label 0 in %ecx.
  // STICKY: memory operand selecting the mask quad at sticky + %eax*8
  //         (%eax must be -1 or 0, see the layout note at the top).
  53. #define TWOp52 twop52-0b(%ecx)
  54. #define STICKY sticky-0b(%ecx,%eax,8)
  // float __floatundisf(du_int a)
  //
  // Branch-free conversion of an unsigned 64-bit integer to float.
  //
  // In:    4(%esp) = low 32 bits of a, 8(%esp) = high 32 bits of a
  // Out:   result is left on the x87 stack by the final flds
  // Clobb: %eax, %ecx, %edx, %xmm0-%xmm3, flags; the argument slot at
  //        4(%esp) is overwritten with the float result
  //
  // Method: "big input" below means a >= 2^52, i.e. a value that no longer
  // fits in the 52 explicit mantissa bits of a double. Small inputs are
  // OR-ed into the mantissa of 0x1.0p52 and 0x1.0p52 is subtracted again,
  // giving the exact double. Big inputs are first shifted right by 12 with
  // the discarded low 12 bits OR-ed back in as sticky bits, so the single
  // rounding step in cvtsd2ss still sees that something non-zero was dropped;
  // the factor 2^12 is restored afterwards by adding 12 to the float's
  // exponent field.
  55. .text
  56. .balign 4
  57. DEFINE_COMPILERRT_FUNCTION(__floatundisf)
  58. movl 8(%esp), %eax // eax = high 32 bits of input
  // Assemble the 64-bit input in xmm0 from two 32-bit loads (xmm0 = high:low).
  // NOTE(review): presumably two narrow loads are used instead of one 64-bit
  // load to avoid the write-small/read-large stall mentioned in the file
  // header -- confirm before "simplifying" this to a single movq.
  59. movd 8(%esp), %xmm1
  60. movd 4(%esp), %xmm0
  61. punpckldq %xmm1, %xmm0
  // Position-independent addressing: ecx = run-time address of label 0, which
  // is the base the TWOp52 / STICKY operands are computed from.
  62. calll 0f
  63. 0: popl %ecx
  // (high >> 1) + 0x7ff80000 has its sign bit set exactly when
  // high >= 0x00100000, i.e. when input >= 2^52.
  64. shrl %eax // high 31 bits of input as sint32
  65. addl $0x7ff80000, %eax
  66. sarl $31, %eax // (big input) ? -1 : 0
  67. movsd STICKY, %xmm1 // (big input) ? 0xfff : 0
  68. movl $12, %edx
  69. andl %eax, %edx // (big input) ? 12 : 0
  70. movd %edx, %xmm3 // xmm3 = shift count, reused below as exponent adjustment
  71. andpd %xmm0, %xmm1 // (big input) ? input & 0xfff : 0
  72. movsd TWOp52, %xmm2 // 0x1.0p52
  73. psrlq %xmm3, %xmm0 // (big input) ? input >> 12 : input
  74. orpd %xmm2, %xmm1 // 0x1.0p52 + ((big input) ? input & 0xfff : 0)
  75. orpd %xmm1, %xmm0 // 0x1.0p52 + ((big input) ? (input >> 12 | input & 0xfff) : input)
  76. subsd %xmm2, %xmm0 // (double)((big input) ? (input >> 12 | input & 0xfff) : input)
  77. cvtsd2ss %xmm0, %xmm0 // (float)((big input) ? (input >> 12 | input & 0xfff) : input)
  // Move 12 (or 0) into the position of the float exponent field (bit 23) and
  // add it to the result's bit pattern: multiplies by 2^12 for big inputs,
  // leaves small inputs unchanged.
  78. pslld $23, %xmm3
  79. paddd %xmm3, %xmm0 // (float)input
  // Spill the float to the argument slot and reload it onto the x87 stack,
  // where the result is handed back to the caller.
  80. movd %xmm0, 4(%esp)
  81. flds 4(%esp)
  82. ret
  83. END_COMPILERRT_FUNCTION(__floatundisf)
  84. #endif // __i386__
  // NOTE(review): NO_EXEC_STACK_DIRECTIVE is defined outside this file
  // (presumably in ../assembly.h). Judging by its name it marks the object as
  // not requiring an executable stack -- confirm against that header. It is
  // placed outside the __i386__ guard, so it is emitted for every target.
  85. NO_EXEC_STACK_DIRECTIVE