123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188 |
- %include "defs.asm"
- ;************************* unalignedisfaster64.asm ******************************
- ; Author: Agner Fog
- ; Date created: 2011-07-09
- ; Last modified: 2013-08-30
- ; Source URL: www.agner.org/optimize
- ; Project: asmlib.zip
- ; Language: assembly, NASM/YASM syntax, 64 bit
- ;
- ; C++ prototype:
- ; extern "C" int UnalignedIsFaster(void);
- ;
- ; Description:
- ; This function finds out whether an unaligned 16-byte memory read is
- ; faster than an aligned read followed by an alignment shift (PALIGNR) on the
- ; current CPU.
- ;
- ; Return value:
- ; 0: Unaligned read is probably slower than alignment shift
- ; 1: Unknown
- ; 2: Unaligned read is probably faster than alignment shift
- ;
- ;
- ; C++ prototype:
- ; extern "C" int Store256BitIsFaster(void);
- ;
- ; Description:
- ; This function finds out whether a 32-byte memory write is
- ; faster than two 16-byte writes on the current CPU.
- ;
- ; Return value:
- ; 0: 32-byte memory write is slower, or AVX is not supported
- ; 1: Unknown
- ; 2: 32-byte memory write is faster
- ;
- ; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
- ;******************************************************************************
- ;
- ; C++ prototype:
- ; extern "C" int UnalignedIsFaster(void);
- global UnalignedIsFaster: function ; exported entry point, declared with function symbol type
- global Store256BitIsFaster: function ; exported entry point, declared with function symbol type
- extern CpuType ; defined elsewhere; called by both functions to obtain vendor, family, model
- extern InstructionSet ; defined elsewhere; called by Store256BitIsFaster before testing for AVX
- SECTION .text
- ;-----------------------------------------------------------------------
- ; extern "C" int UnalignedIsFaster(void);
- ;
- ; Calls CpuType(&vendor, &family, &model) and classifies the CPU.
- ; Out:   eax = 0  unaligned read is probably slower than alignment shift
- ;              1  unknown
- ;              2  unaligned read is probably faster than alignment shift
- ; ABI:   System V AMD64 when UNIX is defined, Microsoft x64 otherwise
- ; Stack: one 56-byte frame, same layout for both ABIs:
- ;          [rsp+48] vendor   [rsp+40] family   [rsp+32] model
- ;          [rsp .. rsp+31]   shadow space that the Microsoft x64
- ;                            convention requires the caller to reserve
- ;                            (simply unused under System V)
- ;        With the standard entry alignment (rsp = 16n + 8) the frame
- ;        leaves rsp 16-byte aligned at the call.
- ; Clobb: rcx, rdx, r8 (also rdi, rsi when UNIX is defined), flags, plus
- ;        whatever CpuType clobbers
- ;-----------------------------------------------------------------------
- UnalignedIsFaster:
-         sub     rsp, 56                 ; shadow space + three out-slots
-         xor     eax, eax
-         mov     [rsp+48], rax           ; vendor = 0
-         mov     [rsp+40], rax           ; family = 0
-         mov     [rsp+32], rax           ; model  = 0
- %ifdef UNIX
-         lea     rdi, [rsp+48]           ; arg 1 = &vendor
-         lea     rsi, [rsp+40]           ; arg 2 = &family
-         lea     rdx, [rsp+32]           ; arg 3 = &model
- %else ; WINDOWS
-         lea     rcx, [rsp+48]           ; arg 1 = &vendor
-         lea     rdx, [rsp+40]           ; arg 2 = &family
-         lea     r8,  [rsp+32]           ; arg 3 = &model
- %endif
-         call    CpuType                 ; get vendor, family, model
-         mov     r8d, [rsp+48]           ; r8d = vendor
-         mov     ecx, [rsp+40]           ; ecx = family
-         mov     edx, [rsp+32]           ; edx = model
-         add     rsp, 56                 ; frame no longer needed
-         xor     eax, eax                ; return value, 0 unless changed below
-         cmp     r8d, 1
-         je      .intel
-         cmp     r8d, 2
-         je      .amd
-         cmp     r8d, 3
-         je      .via
-         inc     eax                     ; unknown vendor: return 1
-         ret
- 
- .intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
-         ;   Nehalem  = family 6, model 1AH
-         ;   Atom     = family 6, model 1CH
-         ;   Netburst = family 0FH
-         ; Future models are likely to be family 6, maybe > 6, model > 1CH.
-         ; The model tests below apply to every family >= 6 other than 0FH.
-         ; NOTE(review): model 1CH is the only model excluded as Atom here;
-         ; confirm whether other Atom models need the same treatment.
-         cmp     ecx, 6
-         jb      .done                   ; old Pentium 1, etc.
-         cmp     ecx, 0FH
-         je      .done                   ; old Netburst architecture
-         cmp     edx, 1AH
-         jb      .done                   ; earlier than Nehalem
-         cmp     edx, 1CH
-         je      .done                   ; Intel Atom
-         mov     eax, 2                  ; Intel Nehalem and later, except Atom
-         ret
- 
- .amd:   ; AMD processors:
-         ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
-         ;   K10/Opteron = family 10H ; Use unaligned
-         ;   Bobcat      = family 14H ; PALIGNR is very slow. Use unaligned
-         ;   Piledriver  = family 15H ; Use unaligned
-         ;   Jaguar      = family 16H ; PALIGNR is fast. Use aligned
-         ;                 (aligned is faster in most cases, but not all)
-         cmp     ecx, 10H
-         jb      .done                   ; AMD K8 or earlier: use aligned
-         cmp     ecx, 16H
-         je      .done                   ; Jaguar: use aligned
-         mov     eax, 2                  ; AMD K10 or later: use unaligned
-         ret
- 
- .via:   ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
-         cmp     ecx, 0FH
-         jbe     .done                   ; VIA Nano: return 0
-         inc     eax                     ; future versions: unknown, return 1
- .done:
-         ret
- ;UnalignedIsFaster ENDP
- ;-----------------------------------------------------------------------
- ; extern "C" int Store256BitIsFaster(void);
- ;
- ; Calls InstructionSet(); if the result is below 11 (the level the
- ; original source labels "AVX supported") it returns 0. Otherwise it
- ; calls CpuType(&vendor, &family, &model) and classifies the CPU.
- ; Out:   eax = 0  32-byte memory write is slower, or AVX not supported
- ;              1  unknown
- ;              2  32-byte memory write is faster
- ; ABI:   System V AMD64 when UNIX is defined, Microsoft x64 otherwise
- ; Stack: one 56-byte frame, same layout for both ABIs:
- ;          [rsp+48] vendor   [rsp+40] family   [rsp+32] model
- ;          [rsp .. rsp+31]   shadow space that the Microsoft x64
- ;                            convention requires the caller to reserve
- ;                            (simply unused under System V)
- ;        With the standard entry alignment (rsp = 16n + 8) the frame
- ;        leaves rsp 16-byte aligned at both calls.
- ; Clobb: rcx, rdx (also rdi, rsi when UNIX is defined, r8 otherwise),
- ;        flags, plus whatever InstructionSet and CpuType clobber
- ;-----------------------------------------------------------------------
- Store256BitIsFaster:
-         sub     rsp, 56                 ; allocate before the first call so
-                                         ; that call is aligned as well
-         call    InstructionSet
-         cmp     eax, 11                 ; AVX supported
-         jb      .no_avx
-         xor     eax, eax
-         mov     [rsp+48], rax           ; vendor = 0
-         mov     [rsp+40], rax           ; family = 0
-         mov     [rsp+32], rax           ; model  = 0
- %ifdef UNIX
-         lea     rdi, [rsp+48]           ; arg 1 = &vendor
-         lea     rsi, [rsp+40]           ; arg 2 = &family
-         lea     rdx, [rsp+32]           ; arg 3 = &model
- %else ; WINDOWS
-         lea     rcx, [rsp+48]           ; arg 1 = &vendor
-         lea     rdx, [rsp+40]           ; arg 2 = &family
-         lea     r8,  [rsp+32]           ; arg 3 = &model
- %endif
-         call    CpuType                 ; get vendor, family, model
-         mov     eax, [rsp+48]           ; eax = vendor
-         mov     ecx, [rsp+40]           ; ecx = family
-         mov     edx, [rsp+32]           ; edx = model
-         add     rsp, 56                 ; frame no longer needed
-         cmp     eax, 1                  ; Intel
-         je      .intel
-         cmp     eax, 2                  ; AMD
-         jne     .unknown                ; VIA (3) or other vendor: don't know
- 
-         ; AMD
-         ; family 15H = Bulldozer, Piledriver
-         ; family 16H = Jaguar. 256 bit write is slightly faster
-         cmp     ecx, 15H
-         ja      .faster                 ; assume future AMD families are faster
-         ; model 1 = Bulldozer is a little slower on 256 bit write
-         ; model 2 = Piledriver is terribly slow on 256 bit write
-         ; assume future models 3-4 are like Bulldozer
-         cmp     edx, 4
-         jbe     .slower
-                                         ; later models: don't know
- .unknown:
-         mov     eax, 1                  ; return 1
-         ret
- 
- .intel: cmp     ecx, 6
-         jne     .faster                 ; unknown family. possibly future model
-         ; model 2AH Sandy Bridge
-         ; model 3AH Ivy Bridge
-         ; model 3CH Haswell
-         ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with
-         ; 256 bit moves on large data blocks.
-         ; Haswell is much faster with 256 bit moves.
-         ; NOTE(review): this is a plain numeric threshold on the model
-         ; number; every family-6 model above 3AH is classified as faster.
-         ; Confirm that holds for all variants.
-         cmp     edx, 3AH
-         ja      .faster
- .slower:
-         xor     eax, eax                ; return 0
-         ret
- 
- .faster:
-         mov     eax, 2                  ; return 2
-         ret
- 
- .no_avx:
-         add     rsp, 56                 ; release the frame on the early exit
-         xor     eax, eax                ; return 0
-         ret
- 
- ; Store256BitIsFaster ENDP
|