divfixedi64.asm 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. %include "defs.asm"
  2. ;************************* divfixedi64.asm *********************************
  3. ; Author: Agner Fog
  4. ; Date created: 2011-07-22
  5. ; Last modified: 2011-07-22
  6. ;
  7. ; Function prototypes:
  8. ; void setdivisori32(int buffer[2], int d);
  9. ; int dividefixedi32(const int buffer[2], int x);
  10. ; void setdivisoru32(uint32_t buffer[2], uint32_t d);
  11. ; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
  12. ;
  13. ; Description:
  14. ; Functions for fast repeated integer division by the same divisor, signed
  15. ; and unsigned 32-bit integer versions. The divisor must be positive.
  16. ;
  17. ; The setdivisor functions calculate the reciprocal divisor and shift counts,
  18. ; the dividefixed functions do the division by multiplication and shift.
  19. ;
  20. ; The methods used are described by:
  21. ; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
  22. ; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
  23. ; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
  24. ;
  25. ; Mathematical formula, unsigned division:
  26. ; x = dividend
  27. ; d = divisor
  28. ; n = integer size, bits
  29. ; L = ceil(log2(d))
  30. ; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
  31. ; sh1 = min(L,1)
  32. ; sh2 = max(L-1,0)
  33. ; t = m*x >> n [high part of unsigned multiplication]
  34. ; x/d = (((x-t) >> sh1) + t) >> sh2
  35. ;
  36. ; Mathematical formula, signed division:
  37. ; x = dividend
  38. ; d = abs(divisor)
  39. ; n = integer size, bits
  40. ; L = ceil(log2(d))
  41. ; L = max(L,1)
  42. ; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
  43. ; sh1 = L-1
  44. ; q = x + (m*x >> n) [high part of signed multiplication]
  45. ; q = (q >> sh1) - (x<0 ? -1 : 0)
  46. ; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
  47. ; x/d = q
  48. ;
  49. ; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
  50. ;******************************************************************************
  51. %IFDEF WINDOWS
  52. %define par1 rcx ; function parameter 1
  53. %define par2 edx ; function parameter 2
  54. %define buf r9 ; copy of function parameter 1: buffer
  55. %define rx r8
  56. %define rxd r8d ; d or x
  57. %ELSE ; UNIX
  58. %define par1 rdi ; function parameter 1
  59. %define par2 esi ; function parameter 2
  60. %define buf rdi ; function parameter 1: buffer
  61. %define rx rsi
  62. %define rxd esi ; d or x
  63. %ENDIF
  64. section .text
  65. ; extern "C" void setdivisori32(int buffer[2], int d);
  66. ; 32 bit signed
  67. global setdivisori32: function
  68. setdivisori32:
  69. %IFDEF WINDOWS
  70. mov rxd, edx ; x
  71. mov buf, rcx ; buffer
  72. %ENDIF
  73. dec rxd ; rxd = r8d or esi
  74. mov ecx, -1 ; value for bsr if rxd = 0 (assuming bsr leaves dest unchanged if src = 0, this works on both Intel, AMD and VIA processors)
  75. bsr ecx, rxd ; floor(log2(d-1))
  76. inc rxd
  77. js H120 ; d < 0. Generate error
  78. inc ecx ; L = ceil(log2(d))
  79. sub ecx, 1 ; shift count = L - 1
  80. adc ecx, 0 ; avoid negative shift count
  81. xor eax, eax
  82. mov edx, 1
  83. cmp rxd, edx
  84. je H110 ; avoid overflow when d = 1
  85. shl edx, cl
  86. div rxd
  87. H110: inc eax
  88. mov [buf], eax ; multiplier
  89. mov [buf+4], ecx ; shift count
  90. ret
  91. H120: ; d <= 0 not supported. Generate error
  92. mov edx, 1
  93. div edx ; will overflow
  94. ud2
  95. ; extern "C" int dividefixedi32(int buffer[2], int x);
  96. global dividefixedi32: function
  97. dividefixedi32:
  98. %IFDEF WINDOWS
  99. mov eax, edx
  100. mov rxd, edx ; x
  101. mov buf, rcx ; buffer
  102. %ELSE
  103. mov eax, esi
  104. %ENDIF
  105. imul dword [buf] ; m
  106. lea eax, [rdx+rx] ; rx = r8 or rsi
  107. mov ecx, [buf+4] ; shift count
  108. sar eax, cl
  109. sar rxd, 31 ; sign(x)
  110. sub eax, rxd
  111. ret
  112. ;extern "C" void setdivisoru32(int buffer[2], int d);
  113. ; 32 bit unsigned
  114. global setdivisoru32: function
  115. setdivisoru32:
  116. %IFDEF WINDOWS
  117. mov rxd, edx ; x
  118. mov buf, rcx ; buffer
  119. %ENDIF
  120. dec rxd ; rxd = r8d or esi
  121. mov ecx, -1 ; value for bsr if r8d = 0
  122. bsr ecx, rxd ; floor(log2(d-1))
  123. inc rxd
  124. inc ecx ; L = ceil(log2(d))
  125. mov edx, 1
  126. shl rdx, cl ; 2^L (64 bit shift because cl may be 32)
  127. sub edx, rxd
  128. xor eax, eax
  129. div rxd
  130. inc eax
  131. mov [buf], eax ; multiplier
  132. sub ecx, 1
  133. setae dl
  134. movzx edx, dl ; shift1
  135. seta al
  136. neg al
  137. and al,cl
  138. movzx eax, al ; shift 2
  139. shl eax, 8
  140. or eax, edx
  141. mov [buf+4], eax ; shift 1 and shift 2
  142. ret
  143. ;extern "C" int dividefixedu32(int buffer[2], int x);
  144. global dividefixedu32: function ; unsigned
  145. dividefixedu32:
  146. %IFDEF WINDOWS
  147. mov eax, edx
  148. mov rxd, edx ; x
  149. mov buf, rcx ; buffer
  150. %ELSE
  151. mov eax, esi
  152. %ENDIF
  153. mul dword [buf] ; m
  154. sub rxd, edx ; x-t
  155. mov ecx, [buf+4] ; shift 1 and shift 2
  156. shr rxd, cl
  157. lea eax, [rx+rdx]
  158. shr ecx, 8
  159. shr eax, cl
  160. ret