unalignedisfaster64.asm 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. %include "defs.asm"
  2. ;************************* unalignedisfaster64.asm ******************************
  3. ; Author: Agner Fog
  4. ; Date created: 2011-07-09
  5. ; Last modified: 2013-08-30
  6. ; Source URL: www.agner.org/optimize
  7. ; Project: asmlib.zip
  8. ; Language: assembly, NASM/YASM syntax, 64 bit
  9. ;
  10. ; C++ prototype:
  11. ; extern "C" int UnalignedIsFaster(void);
  12. ;
  13. ; Description:
  14. ; This function finds out whether an unaligned 16-byte memory read is
  15. ; faster than an aligned read followed by an alignment shift (PALIGNR) on the
  16. ; current CPU.
  17. ;
  18. ; Return value:
  19. ; 0: Unaligned read is probably slower than alignment shift
  20. ; 1: Unknown
  21. ; 2: Unaligned read is probably faster than alignment shift
  22. ;
  23. ;
  24. ; C++ prototype:
  25. ; extern "C" int Store256BitIsFaster(void);
  26. ;
  27. ; Description:
  28. ; This function finds out whether a 32-byte memory write is
  29. ; faster than two 16-byte writes on the current CPU.
  30. ;
  31. ; Return value:
  32. ; 0: 32-byte memory write is slower, or AVX is not supported
  33. ; 1: Unknown
  34. ; 2: 32-byte memory write is faster
  35. ;
  36. ; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
  37. ;******************************************************************************
  38. ;
  39. ; C++ prototype:
  40. ; extern "C" int UnalignedIsFaster(void);
  41. global UnalignedIsFaster: function
  42. global Store256BitIsFaster: function
  43. extern CpuType
  44. extern InstructionSet
  45. SECTION .text
  ;-----------------------------------------------------------------------------
  ; extern "C" int UnalignedIsFaster(void);
  ;
  ; Calls CpuType(&vendor, &family, &model) and turns the answer into a hint:
  ;   eax = 0   unaligned read is probably slower than alignment shift
  ;   eax = 1   unknown
  ;   eax = 2   unaligned read is probably faster than alignment shift
  ;
  ; ABI:    System V AMD64 when UNIX is defined, Microsoft x64 otherwise.
  ; Stack:  three 8-byte output slots, plus 32 bytes of shadow space on
  ;         Windows. rsp is 8 mod 16 on entry, so after the three pushes
  ;         (and the optional 32-byte adjustment) it is 16-byte aligned at
  ;         the call.
  ; Uses:   rax, rcx, rdx, r8, flags (and rdi, rsi under UNIX), in addition
  ;         to whatever CpuType itself modifies.
  ;-----------------------------------------------------------------------------
  UnalignedIsFaster:
          ; Each push creates a zero-initialised slot; the register loaded
          ; right after it holds the address of that slot.
          push    0                       ; vendor slot
  %ifdef UNIX
          mov     rdi, rsp                ; arg 1 = &vendor
          push    0                       ; family slot
          mov     rsi, rsp                ; arg 2 = &family
          push    0                       ; model slot
          mov     rdx, rsp                ; arg 3 = &model
          call    CpuType                 ; get vendor, family, model
  %else   ; WINDOWS
          mov     rcx, rsp                ; arg 1 = &vendor
          push    0                       ; family slot
          mov     rdx, rsp                ; arg 2 = &family
          push    0                       ; model slot
          mov     r8, rsp                 ; arg 3 = &model
          ; The Microsoft x64 convention lets the callee use 32 bytes above
          ; its return address. Without this reservation that area would
          ; overlap the three slots and our own return address.
          sub     rsp, 32
          call    CpuType                 ; get vendor, family, model
          add     rsp, 32
  %endif
          pop     rdx                     ; edx = model
          pop     rcx                     ; ecx = family
          pop     r8                      ; r8d = vendor

          xor     eax, eax                ; default return value: 0
          cmp     r8d, 1
          je      .intel
          cmp     r8d, 2
          je      .amd
          cmp     r8d, 3
          je      .via
          mov     eax, 1                  ; unknown vendor: return 1
          ret

  .intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
          ;   Nehalem  = family 6, model 1AH
          ;   Atom     = family 6, model 1CH
          ;   Netburst = family 0FH
          ; Future models are likely to be family 6, maybe > 6, model > 1C
          cmp     ecx, 6
          jb      .done                   ; old Pentium 1, etc.
          cmp     ecx, 0FH
          je      .done                   ; old Netburst architecture
          cmp     edx, 1AH
          jb      .done                   ; earlier than Nehalem
          cmp     edx, 1CH
          je      .done                   ; Intel Atom
          mov     eax, 2                  ; Nehalem and later, except Atom
          ret

  .amd:   ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
          ;   K10/Opteron = family 10H  use unaligned
          ;   Bobcat      = family 14H  PALIGNR is very slow, use unaligned
          ;   Piledriver  = family 15H  use unaligned
          ;   Jaguar      = family 16H  PALIGNR is fast, use aligned
          ;                             (aligned is faster in most cases,
          ;                             but not all)
          cmp     ecx, 10H
          jb      .done                   ; AMD K8 or earlier: use aligned
          cmp     ecx, 16H
          je      .done                   ; Jaguar: use aligned
          mov     eax, 2                  ; AMD K10 or later: use unaligned
          ret

  .via:   ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
          cmp     ecx, 0FH
          jbe     .done                   ; VIA Nano (family <= 0FH): return 0
          mov     eax, 1                  ; future versions: unknown

  .done:  ret
  ; end of UnalignedIsFaster
  ;-----------------------------------------------------------------------------
  ; extern "C" int Store256BitIsFaster(void);
  ;
  ; Asks InstructionSet() whether AVX is available and, if so, uses
  ; CpuType(&vendor, &family, &model) to produce a hint:
  ;   eax = 0   32-byte memory write is slower, or AVX is not supported
  ;   eax = 1   unknown
  ;   eax = 2   32-byte memory write is faster
  ;
  ; ABI:    System V AMD64 when UNIX is defined, Microsoft x64 otherwise.
  ; Stack:  rsp is 8 mod 16 on entry. Every call below is made with rsp
  ;         16-byte aligned, and on Windows with 32 bytes of shadow space
  ;         reserved for the callee.
  ; Uses:   rax, rcx, rdx, r8, flags (and rdi, rsi under UNIX), in addition
  ;         to whatever InstructionSet and CpuType themselves modify.
  ;-----------------------------------------------------------------------------
  Store256BitIsFaster:
  %ifdef UNIX
          sub     rsp, 8                  ; realign rsp to 16 for the call
          call    InstructionSet
          add     rsp, 8
  %else   ; WINDOWS
          sub     rsp, 40                 ; 32 bytes shadow space + 8 to realign
          call    InstructionSet
          add     rsp, 40
  %endif
          ; The compare comes after the stack adjustment because ADD changes
          ; the flags.
          cmp     eax, 11                 ; 11 or above: AVX supported
          jb      .slower                 ; no AVX: return 0

          ; Each push creates a zero-initialised slot; the register loaded
          ; right after it holds the address of that slot. The three pushes
          ; also bring rsp to a 16-byte boundary.
          push    0                       ; vendor slot
  %ifdef UNIX
          mov     rdi, rsp                ; arg 1 = &vendor
          push    0                       ; family slot
          mov     rsi, rsp                ; arg 2 = &family
          push    0                       ; model slot
          mov     rdx, rsp                ; arg 3 = &model
          call    CpuType                 ; get vendor, family, model
  %else   ; WINDOWS
          mov     rcx, rsp                ; arg 1 = &vendor
          push    0                       ; family slot
          mov     rdx, rsp                ; arg 2 = &family
          push    0                       ; model slot
          mov     r8, rsp                 ; arg 3 = &model
          sub     rsp, 32                 ; shadow space required by the callee
          call    CpuType                 ; get vendor, family, model
          add     rsp, 32
  %endif
          pop     rdx                     ; edx = model
          pop     rcx                     ; ecx = family
          pop     rax                     ; eax = vendor

          cmp     eax, 1                  ; Intel
          je      .intel
          cmp     eax, 2                  ; AMD
          je      .amd
          jmp     .unknown                ; VIA (3) or other vendor: not known

  .intel: cmp     ecx, 6
          jne     .faster                 ; unknown family, possibly future model
          ;   model 2AH = Sandy Bridge
          ;   model 3AH = Ivy Bridge
          ;   model 3CH = Haswell
          ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than
          ; with 256 bit moves on large data blocks.
          ; Haswell is much faster with 256 bit moves.
          cmp     edx, 3AH
          ja      .faster
          ; model <= 3AH: fall through and return 0

  .slower:
          xor     eax, eax                ; return 0
          ret

  .amd:   cmp     ecx, 15H                ; family 15H = Bulldozer, Piledriver
          ja      .faster                 ; assume future AMD families are faster
                                          ; (family 16H = Jaguar: 256 bit write
                                          ; is slightly faster)
          ;   model 1 = Bulldozer is a little slower on 256 bit write
          ;   model 2 = Piledriver is terribly slow on 256 bit write
          ; assume future models 3-4 are like Bulldozer
          cmp     edx, 4
          jbe     .slower
          ; later models: don't know, fall through and return 1

  .unknown:
          mov     eax, 1                  ; return 1
          ret

  .faster:
          mov     eax, 2                  ; return 2
          ret
  ; end of Store256BitIsFaster