SMusatov
/
ydb
mirror of https://github.com/ydb-platform/ydb.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
							%include "defs.asm"

;*************************  divfixedi64.asm  *********************************
; Author:           Agner Fog
; Date created:     2011-07-22
; Last modified:    2011-07-22
;
; Function prototypes:
; void setdivisori32(int buffer[2], int d);
; int dividefixedi32(const int buffer[2], int x);
; void setdivisoru32(uint32_t buffer[2], uint32_t d);
; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
;
; Description:
; Functions for fast repeated integer division by the same divisor, signed 
; and unsigned 32-bit integer versions. The divisor must be positive.
;
; The setdivisor functions calculate the reciprocal divisor and shift counts,
; the dividefixed functions do the division by multiplication and shift.
;
; The methods used are described by:
; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
;
; Mathematical formula, unsigned division:
; x = dividend
; d = divisor
; n = integer size, bits
; L = ceil(log2(d))
; m = 1 + 2^n * (2^L-d) / d       [2^L should overflow to 0 if L = n]
; sh1 = min(L,1)
; sh2 = max(L-1,0)
; t = m*x >> n                    [high part of unsigned multiplication]
; x/d = (((x-t) >> sh1) + t) >> sh2
;
; Mathematical formula, signed division:
; x = dividend
; d = abs(divisor)
; n = integer size, bits
; L = ceil(log2(d))
; L = max(L,1)
; m = 1 + 2^(n+L-1)/d - 2^n       [division should overflow to 0 if d = 1]
; sh1 = L-1
; q = x + (m*x >> n)              [high part of signed multiplication]
; q = (q >> sh1) - (x<0 ? -1 : 0)
; if (divisor < 0) q = -q         [negative divisor not supported in present implementation]
; x/d = q
;
; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
;******************************************************************************

%IFDEF  WINDOWS
%define par1   rcx                     ; function parameter 1
%define par2   edx                     ; function parameter 2
%define buf    r9                      ; copy of function parameter 1: buffer
%define rx     r8
%define rxd    r8d                     ; d or x
%ELSE   ; UNIX
%define par1   rdi                     ; function parameter 1
%define par2   esi                     ; function parameter 2
%define buf    rdi                     ; function parameter 1: buffer
%define rx     rsi
%define rxd    esi                     ; d or x
%ENDIF


section .text

; extern "C" void setdivisori32(int buffer[2], int d);
; 32 bit signed 

global setdivisori32: function
setdivisori32:
%IFDEF  WINDOWS
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ENDIF        
        dec     rxd                    ; rxd = r8d or esi
        mov     ecx, -1                ; value for bsr if rxd = 0 (assuming bsr leaves dest unchanged if src = 0, this works on both Intel, AMD and VIA processors)
        bsr     ecx, rxd               ; floor(log2(d-1))
        inc     rxd
        js      H120                   ; d < 0. Generate error
        inc     ecx                    ; L = ceil(log2(d))        
        sub     ecx, 1                 ; shift count = L - 1
        adc     ecx, 0                 ; avoid negative shift count
        xor     eax, eax
        mov     edx, 1
        cmp     rxd, edx
        je      H110                   ; avoid overflow when d = 1
        shl     edx, cl
        div     rxd
H110:   inc     eax
        mov     [buf], eax             ; multiplier
        mov     [buf+4], ecx           ; shift count
        ret
        
H120:   ; d <= 0 not supported. Generate error
        mov     edx, 1
        div     edx                    ; will overflow
        ud2

        
; extern "C" int dividefixedi32(int buffer[2], int x);
global dividefixedi32: function
dividefixedi32:
%IFDEF  WINDOWS
        mov     eax, edx
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ELSE
        mov     eax, esi
%ENDIF        
        imul    dword [buf]            ; m
        lea     eax, [rdx+rx]          ; rx = r8 or rsi
        mov     ecx, [buf+4]           ; shift count
        sar     eax, cl
        sar     rxd, 31                ; sign(x)
        sub     eax, rxd
        ret


;extern "C" void setdivisoru32(int buffer[2], int d);
; 32 bit unsigned 

global setdivisoru32: function
setdivisoru32:
%IFDEF  WINDOWS
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ENDIF        
        dec     rxd                    ; rxd = r8d or esi
        mov     ecx, -1                ; value for bsr if r8d = 0
        bsr     ecx, rxd               ; floor(log2(d-1))
        inc     rxd
        inc     ecx                    ; L = ceil(log2(d))
        mov     edx, 1
        shl     rdx, cl                ; 2^L (64 bit shift because cl may be 32)
        sub     edx, rxd
        xor     eax, eax
        div     rxd
        inc     eax
        mov     [buf], eax             ; multiplier
        sub     ecx, 1
        setae   dl
        movzx   edx, dl                ; shift1
        seta    al
        neg     al
        and     al,cl
        movzx   eax, al                ; shift 2
        shl     eax, 8
        or      eax, edx
        mov     [buf+4], eax           ; shift 1 and shift 2
        ret
        
;extern "C" int dividefixedu32(int buffer[2], int x);
global dividefixedu32: function       ; unsigned
dividefixedu32:
%IFDEF  WINDOWS
        mov     eax, edx
        mov     rxd, edx               ; x
        mov     buf, rcx               ; buffer
%ELSE
        mov     eax, esi
%ENDIF        
        mul     dword [buf]            ; m
        sub     rxd, edx               ; x-t
        mov     ecx, [buf+4]           ; shift 1 and shift 2
        shr     rxd, cl
        lea     eax, [rx+rdx]
        shr     ecx, 8
        shr     eax, cl
        ret