dispatchpatch64.asm 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. %include "defs.asm"
  2. ;*********************** dispatchpatch64.asm ********************************
  3. ; Author: Agner Fog
  4. ; Date created: 2007-07-20
  5. ; Last modified: 2013-08-21
  6. ; Source URL: www.agner.org/optimize
  7. ; Project: asmlib.zip
  8. ; Language: assembly, NASM/YASM syntax, 64 bit
  9. ;
  10. ; C++ prototype:
  11. ; extern "C" int __intel_cpu_indicator = 0;
  12. ; extern "C" void __intel_cpu_indicator_init()
  13. ;
  14. ; Description:
  15. ; Example of how to replace Intel CPU dispatcher in order to improve
  16. ; compatibility of Intel function libraries with non-Intel processors.
  17. ; Only works with static link libraries (*.lib, *.a), not dynamic libraries
  18. ; (*.dll, *.so). Linking in this as an object file will override the functions
  19. ; with the same name in the library.
  20. ;
  21. ; Copyright (c) 2007-2013 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
  22. ;******************************************************************************
  23. ; extern InstructionSet: function
  24. %include "instrset64.asm" ; include code for InstructionSet function
  25. ; InstructionSet function return value:
  26. ; 4 or above = SSE2 supported
  27. ; 5 or above = SSE3 supported
  28. ; 6 or above = Supplementary SSE3
  29. ; 8 or above = SSE4.1 supported
  30. ; 9 or above = POPCNT supported
  31. ; 10 or above = SSE4.2 supported
  32. ; 11 or above = AVX supported by processor and operating system
  33. ; 12 or above = PCLMUL and AES supported
  34. ; 13 or above = AVX2 supported
  35. ; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
  36. ; 15 or above = HLE + RTM supported
  ; Symbols exported by the replacement for the classic Intel dispatcher.
  global  __intel_cpu_indicator
  global  __intel_cpu_indicator_init

  SECTION .data

  ; Dispatch value; __intel_cpu_indicator_init fills it in from itable.
  ; Both labels name the same dword: the '@' form is the file-local alias
  ; used by the code in this file.
  intel_cpu_indicator@:
  __intel_cpu_indicator:
          dd      0

  ; itable[i] is the indicator value stored when InstructionSet returns i.
  itable:
          dd      1                       ;  0     generic version, 80386 instruction set
          times 2 dd 8                    ;  1-2   MMX
          dd      0x80                    ;  3     SSE
          dd      0x200                   ;  4     SSE2
          dd      0x800                   ;  5     SSE3
          times 2 dd 0x1000               ;  6-7   SSSE3
          times 2 dd 0x2000               ;  8-9   SSE4.1
          times 2 dd 0x8000               ; 10-11  SSE4.2 and popcnt
          times 2 dd 0x20000              ; 12-13  AVX, pclmul, aes
          dd      0x400000                ; 14     AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
          dd      0x800000                ; 15     HLE, RTM

  ; Number of dword entries in itable (used to clamp the index).
  itablelen equ ($ - itable) / 4
  SECTION .text

  ;-----------------------------------------------------------------------
  ; extern "C" void __intel_cpu_indicator_init()
  ; Calls InstructionSet, translates the returned level through itable and
  ; stores the result in __intel_cpu_indicator.
  ; Out:   nothing in registers; every register pushed below (rax included)
  ;        is restored before returning.
  ; Stack: nine 8-byte pushes. With rsp = 16n+8 on entry this leaves rsp a
  ;        multiple of 16 at the call.
  ;-----------------------------------------------------------------------
  __intel_cpu_indicator_init:
          push    rax                     ; registers must be pushed
          push    rcx
          push    rdx
          push    r8
          push    r9
          push    r10
          push    r11
          push    rsi
          push    rdi

          call    InstructionSet          ; eax = supported instruction set level
          lea     rdx, [rel itable]       ; table base (lea leaves flags untouched)
          cmp     eax, itablelen
          jae     .clamp                  ; level beyond the table: use the last entry
  .lookup:
          mov     eax, [rdx + 4*rax]      ; eax = itable[level]
          mov     [rel intel_cpu_indicator@], eax ; store in __intel_cpu_indicator

          pop     rdi
          pop     rsi
          pop     r11
          pop     r10
          pop     r9
          pop     r8
          pop     rdx
          pop     rcx
          pop     rax
          ret

  .clamp:
          mov     eax, itablelen - 1      ; limit to table length
          jmp     .lookup
  ; end __intel_cpu_indicator_init
  84. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  85. ; Dispatcher for Math Kernel Library (MKL),
  86. ; version 10.2 and higher
  87. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; WEAK_SYM is not defined in this file (presumably it comes from defs.asm
  ; and declares the symbol as a weak export - confirm there).
  WEAK_SYM(mkl_serv_cpu_detect)

  SECTION .data

  ; mkltab[i] is the value mkl_serv_cpu_detect returns when InstructionSet
  ; returns i.
  ; Note: the table is different in 32 bit and 64 bit mode.
  mkltab:
          times 4 dd 0                    ;  0-3   generic version, 80386 instruction set
          dd      0                       ;  4     SSE2
          dd      1                       ;  5     SSE3
          times 4 dd 2                    ;  6-9   SSSE3
          dd      3                       ; 10     SSE4.2
          times 3 dd 4                    ; 11-13  AVX
          dd      5                       ; 14     AVX2, FMA3, BMI1, BMI2, LZCNT, PCLMUL

  ; Number of dword entries in mkltab (used to clamp the index).
  mkltablen equ ($ - mkltab) / 4
  SECTION .text

  ;-----------------------------------------------------------------------
  ; mkl_serv_cpu_detect
  ; Replacement CPU detector for the Math Kernel Library dispatcher.
  ; Out:   eax = mkltab[min(InstructionSet(), mkltablen - 1)]
  ; Saves: rcx, rdx, r8, r9 (and rsi, rdi when WINDOWS is defined)
  ; Stack: the pushes below are an even count (4, or 6 with WINDOWS), which
  ;        on their own would leave rsp = 16n+8 at the call. An extra
  ;        adjustment restores 16-byte alignment; the WINDOWS build also
  ;        reserves the 32-byte shadow space a Win64 callee may use.
  ;-----------------------------------------------------------------------
  mkl_serv_cpu_detect:
          push    rcx                     ; Perhaps not needed
          push    rdx
          push    r8
          push    r9
  %ifdef WINDOWS
          push    rsi
          push    rdi
          sub     rsp, 40                 ; 32 bytes shadow space + 8 bytes alignment pad
  %else
          sub     rsp, 8                  ; alignment pad: rsp is now a multiple of 16
  %endif

          call    InstructionSet          ; eax = supported instruction set level
          cmp     eax, mkltablen
          jb      .lookup
          mov     eax, mkltablen - 1      ; limit to table length
  .lookup:
          lea     rdx, [rel mkltab]
          mov     eax, [rdx + 4*rax]      ; return value = mkltab[level]

  %ifdef WINDOWS
          add     rsp, 40                 ; release shadow space + pad
          pop     rdi
          pop     rsi
  %else
          add     rsp, 8                  ; release alignment pad
  %endif
          pop     r9
          pop     r8
          pop     rdx
          pop     rcx
          ret
  ; end mkl_serv_cpu_detect
  127. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  128. ; Dispatcher for Vector Math Library (VML)
  129. ; version 10.0 and higher
  130. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; WEAK_SYM is not defined in this file (presumably it comes from defs.asm
  ; and declares the symbol as a weak export - confirm there).
  WEAK_SYM(mkl_vml_serv_cpu_detect)

  SECTION .data

  ; vmltab[i] is the value mkl_vml_serv_cpu_detect returns when
  ; InstructionSet returns i.
  ; Note: the table is different in 32 bit and 64 bit mode.
  vmltab:
          times 4 dd 0                    ;  0-3   generic version, 80386 instruction set
          times 2 dd 1                    ;  4-5   SSE2
          times 2 dd 2                    ;  6-7   SSSE3
          times 2 dd 3                    ;  8-9   SSE4.1
          dd      4                       ; 10     SSE4.2
          times 3 dd 5                    ; 11-13  AVX
          ; dd 6 ??                       ; open question from the author: not emitted

  ; Number of dword entries in vmltab (used to clamp the index).
  vmltablen equ ($ - vmltab) / 4
  SECTION .text

  ;-----------------------------------------------------------------------
  ; mkl_vml_serv_cpu_detect
  ; Replacement CPU detector for the Vector Math Library dispatcher.
  ; Out:   eax = vmltab[min(InstructionSet(), vmltablen - 1)]
  ; Saves: rcx, rdx, r8, r9 (and rsi, rdi when WINDOWS is defined)
  ; Stack: the pushes below are an even count (4, or 6 with WINDOWS), which
  ;        on their own would leave rsp = 16n+8 at the call. An extra
  ;        adjustment restores 16-byte alignment; the WINDOWS build also
  ;        reserves the 32-byte shadow space a Win64 callee may use.
  ;-----------------------------------------------------------------------
  mkl_vml_serv_cpu_detect:
          push    rcx                     ; Perhaps not needed
          push    rdx
          push    r8
          push    r9
  %ifdef WINDOWS
          push    rsi
          push    rdi
          sub     rsp, 40                 ; 32 bytes shadow space + 8 bytes alignment pad
  %else
          sub     rsp, 8                  ; alignment pad: rsp is now a multiple of 16
  %endif

          call    InstructionSet          ; eax = supported instruction set level
          cmp     eax, vmltablen
          jb      .lookup
          mov     eax, vmltablen - 1      ; limit to table length
  .lookup:
          lea     rdx, [rel vmltab]
          mov     eax, [rdx + 4*rax]      ; return value = vmltab[level]

  %ifdef WINDOWS
          add     rsp, 40                 ; release shadow space + pad
          pop     rdi
          pop     rsi
  %else
          add     rsp, 8                  ; release alignment pad
  %endif
          pop     r9
          pop     r8
          pop     rdx
          pop     rcx
          ret
  ; end mkl_vml_serv_cpu_detect
  170. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  171. ; Dispatcher for __intel_cpu_feature_indicator
  172. ; version 13 and higher
  173. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; Symbols exported by the replacement for the feature-indicator dispatcher.
  ; Each name is exported both plain and with an _x suffix; the two forms
  ; label the same storage / entry point.
  global  __intel_cpu_features_init
  global  __intel_cpu_feature_indicator
  global  __intel_cpu_fms_indicator
  global  __intel_cpu_features_init_x
  global  __intel_cpu_feature_indicator_x
  global  __intel_cpu_fms_indicator_x

  SECTION .data

  ; Feature bit mask, 8 bytes; written as one qword by __intel_cpu_features_init.
  intel_cpu_feature_indicator@:           ; file-local alias
  __intel_cpu_feature_indicator:
  __intel_cpu_feature_indicator_x:
          dq      0

  ; Family/model/stepping word, 8 bytes reserved; the init routine writes
  ; the first dword.
  intel_cpu_fms_indicator@:               ; file-local alias
  __intel_cpu_fms_indicator:
  __intel_cpu_fms_indicator_x:
          dd      0, 0

  ; feattab[i] is the feature mask used when InstructionSet returns i.
  feattab:
          dd      0x1                     ;  0     default
          dd      0x0B                    ;  1     MMX
          dd      0x0F                    ;  2     conditional move and FCOMI supported
          dd      0x3F                    ;  3     SSE
          dd      0x7F                    ;  4     SSE2
          dd      0xFF                    ;  5     SSE3
          times 2 dd 0x1FF                ;  6-7   Supplementary SSE3
          dd      0x3FF                   ;  8     SSE4.1
          dd      0xBFF                   ;  9     POPCNT
          dd      0xFFF                   ; 10     SSE4.2
          dd      0x10FFF                 ; 11     AVX
          dd      0x16FFF                 ; 12     PCLMUL and AES
          dd      0x816FFF                ; 13     AVX2
          dd      0x9DEFFF                ; 14     FMA3, F16C, BMI1, BMI2, LZCNT
          dd      0xFDEFFF                ; 15     HLE, RTM

  ; Number of dword entries in feattab (used to clamp the index).
  feattablen equ ($ - feattab) / 4
  SECTION .text

  ;-----------------------------------------------------------------------
  ; __intel_cpu_features_init / __intel_cpu_features_init_x
  ; Fills in the two indicator variables used by the feature dispatcher:
  ;   __intel_cpu_feature_indicator = feattab[min(InstructionSet(), feattablen - 1)],
  ;                                   with bit 12 (1000H) added when CPUID
  ;                                   leaf 1 reports ECX bit 22 (MOVBE);
  ;                                   stored as a zero-extended qword
  ;   __intel_cpu_fms_indicator     = stepping (bits 0-3) | model (bits 8-15)
  ;                                   | family (bits 16-23)
  ; Saves: rbx, rcx, rdx, r8, r9 (and rsi, rdi when WINDOWS is defined)
  ; Clobb: rax (left holding the family/model/stepping word), flags
  ; Stack: 5 pushes (7 with WINDOWS); with rsp = 16n+8 on entry, rsp is a
  ;        multiple of 16 at the call.
  ;-----------------------------------------------------------------------
  __intel_cpu_features_init:
  __intel_cpu_features_init_x:
          push    rbx
          push    rcx                     ; Perhaps not needed
          push    rdx
          push    r8
          push    r9
  %ifdef WINDOWS
          push    rsi
          push    rdi
  %endif

          call    InstructionSet          ; eax = supported instruction set level
          cmp     eax, feattablen
          jb      .lookup
          mov     eax, feattablen - 1     ; limit to the length of feattab itself
  .lookup:
          lea     rdx, [rel feattab]
          mov     ebx, [rdx + 4*rax]      ; look up in table (upper half of rbx is zeroed)

          push    rbx                     ; cpuid overwrites ebx: keep the mask on the stack
          mov     eax, 1
          cpuid                           ; eax = version info, ecx = feature flags
          pop     rbx
          bt      ecx, 22                 ; MOVBE
          jnc     .store_features
          or      ebx, 1000H
  .store_features:
          mov     [rel intel_cpu_feature_indicator@], rbx

          ; get family and model from the CPUID version word still in eax
          mov     edx, eax                ; edx = unmodified version word
          and     eax, 0FH                ; stepping bit 0-3

          mov     ecx, edx
          shr     ecx, 4
          and     ecx, 0FH                ; model
          mov     ebx, edx
          shr     ebx, 12
          and     ebx, 0F0H               ; x model, already in bits 4-7
          or      ecx, ebx                ; full model (at most 0FFH)
          shl     ecx, 8
          or      eax, ecx                ; model bit 8 - 15

          mov     ecx, edx
          shr     ecx, 8
          and     ecx, 0FH                ; family
          mov     ebx, edx
          shr     ebx, 20
          and     ebx, 0FFH               ; x family
          ; NOTE(review): the extended family is added unconditionally here;
          ; confirm whether it should only apply when the base family is 0FH.
          add     ecx, ebx                ; full family
          shl     ecx, 16
          or      eax, ecx                ; full family bit 16 - 23
          mov     [rel intel_cpu_fms_indicator@], eax

  %ifdef WINDOWS
          pop     rdi
          pop     rsi
  %endif
          pop     r9
          pop     r8
          pop     rdx
          pop     rcx
          pop     rbx
          ret
  ; end __intel_cpu_features_init