123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335 |
- %include "defs.asm"
;************************* cachesize64.asm *************************************
; Author:        Agner Fog
; Date created:  2011-07-11
; Last modified: 2013-08-14
; Description:
; Determines the size of the data caches
;
; extern "C" size_t DataCacheSize(int level);
; Input:
; level: n = 1 - 4: level n data cache
;        0 = largest level data cache
; Return value: size in bytes of data cache
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
- default rel
- global DataCacheSize: function
- ; Imported from cputype64.asm
- extern CpuType ; near. Determine CPU vendor
; Field offsets of the data block that starts at the label dataref.
; `ok` takes 8 bytes, so the size slot of cache level n sits at offset n*8;
; the code relies on this when it indexes with [base + level*8].
struc data_layout
ok:                     resd    2       ; first dword is 1 once the sizes have been looked up
level1:                 resq    1       ; level 1 data cache size, bytes
level2:                 resq    1       ; level 2 data cache size, bytes
level3:                 resq    1       ; level 3 data cache size, bytes
level4:                 resq    1       ; level 4 data cache size, bytes
descriptortable:        resd    60      ; start of the descriptor records (4 bytes each);
                                        ; the visible code uses only this field's offset
endstruc
; One entry of the cache descriptor table: four consecutive bytes.
; The cache size in bytes is d_sizem << d_2pow.
struc descriptor_record
d_key:          resb    1               ; descriptor byte as returned by cpuid
d_level:        resb    1               ; cache level
d_sizem:        resb    1               ; size multiplier
d_2pow:         resb    1               ; shift count (power of 2) applied to d_sizem
endstruc
SECTION .data

; Result block. Its layout must match struc data_layout.
dataref:                                ; base address used with the data_layout offsets
ok_:            dd      0, 0            ; first dword becomes 1 once the values are determined
level1_:        dq      0               ; level 1 data cache size
level2_:        dq      0               ; level 2 data cache size
level3_:        dq      0               ; level 3 data cache size
level4_:        dq      0               ; level 4 data cache size

numlevels       equ     4               ; highest cache level that has a slot

; Intel cache descriptors, from "Intel Processor Identification and the
; CPUID Instruction", Application Note 485.
; Each record is: key, level, size multiplier, power of 2
; (see struc descriptor_record); size = multiplier << power.
descriptortable_:
        db      0x0A, 1, 1, 13          ;   8 KB L1 data cache
        db      0x0C, 1, 1, 14          ;  16 KB L1 data cache
        db      0x0D, 1, 1, 14          ;  16 KB L1 data cache
        db      0x21, 2, 1, 18          ; 256 KB L2 data cache
        db      0x22, 3, 1, 19          ; 512 KB L3 data cache
        db      0x23, 3, 1, 20          ;   1 MB L3 data cache
        db      0x25, 3, 1, 21          ;   2 MB L3 data cache
        db      0x29, 3, 1, 22          ;   4 MB L3 data cache
        db      0x2C, 1, 1, 15          ;  32 KB L1 data cache
        db      0x39, 2, 1, 17          ; 128 KB L2 data cache
        db      0x3A, 2, 3, 16          ; 192 KB L2 data cache
        db      0x3B, 2, 1, 17          ; 128 KB L2 data cache
        db      0x3C, 2, 1, 18          ; 256 KB L2 data cache
        db      0x3D, 2, 3, 17          ; 384 KB L2 data cache
        db      0x3E, 2, 1, 19          ; 512 KB L2 data cache
        db      0x41, 2, 1, 17          ; 128 KB L2 data cache
        db      0x42, 2, 1, 18          ; 256 KB L2 data cache
        db      0x43, 2, 1, 19          ; 512 KB L2 data cache
        db      0x44, 2, 1, 20          ;   1 MB L2 data cache
        db      0x45, 2, 1, 21          ;   2 MB L2 data cache
        db      0x46, 3, 1, 22          ;   4 MB L3 data cache
        db      0x47, 3, 1, 23          ;   8 MB L3 data cache
        db      0x48, 2, 3, 20          ;   3 MB L2 data cache
        db      0x49, 2, 1, 22          ;   4 MB L2 or L3 data cache (recorded as L2)
        db      0x4A, 3, 3, 21          ;   6 MB L3 data cache
        db      0x4B, 3, 1, 23          ;   8 MB L3 data cache
        db      0x4C, 3, 3, 22          ;  12 MB L3 data cache
        db      0x4D, 3, 1, 24          ;  16 MB L3 data cache
        db      0x4E, 2, 3, 21          ;   6 MB L2 data cache
        db      0x60, 1, 1, 14          ;  16 KB L1 data cache
        db      0x66, 1, 1, 13          ;   8 KB L1 data cache
        db      0x67, 1, 1, 14          ;  16 KB L1 data cache
        db      0x68, 1, 1, 15          ;  32 KB L1 data cache
        db      0x78, 2, 1, 20          ;   1 MB L2 data cache
        db      0x79, 2, 1, 17          ; 128 KB L2 data cache
        db      0x7A, 2, 1, 18          ; 256 KB L2 data cache
        db      0x7B, 2, 1, 19          ; 512 KB L2 data cache
        db      0x7C, 2, 1, 20          ;   1 MB L2 data cache
        db      0x7D, 2, 1, 21          ;   2 MB L2 data cache
        db      0x7F, 2, 1, 19          ; 512 KB L2 data cache
        db      0x82, 2, 1, 18          ; 256 KB L2 data cache
        db      0x83, 2, 1, 19          ; 512 KB L2 data cache
        db      0x84, 2, 1, 20          ;   1 MB L2 data cache
        db      0x85, 2, 1, 21          ;   2 MB L2 data cache
        db      0x86, 2, 1, 19          ; 512 KB L2 data cache
        db      0x87, 2, 1, 20          ;   1 MB L2 data cache
        db      0xD0, 3, 1, 19          ; 512 KB L3 data cache
        db      0xD1, 3, 1, 20          ;   1 MB L3 data cache
        db      0xD2, 3, 1, 21          ;   2 MB L3 data cache
        db      0xD6, 3, 1, 20          ;   1 MB L3 data cache
        db      0xD7, 3, 1, 21          ;   2 MB L3 data cache
        db      0xD8, 3, 1, 22          ;   4 MB L3 data cache
        db      0xDC, 3, 3, 19          ; 1.5 MB L3 data cache
        db      0xDD, 3, 3, 20          ;   3 MB L3 data cache
        db      0xDE, 3, 3, 21          ;   6 MB L3 data cache
        db      0xE2, 3, 1, 21          ;   2 MB L3 data cache
        db      0xE3, 3, 1, 22          ;   4 MB L3 data cache
        db      0xE4, 3, 1, 23          ;   8 MB L3 data cache
        db      0xEA, 3, 3, 22          ;  12 MB L3 data cache
        db      0xEB, 3, 9, 21          ;  18 MB L3 data cache
        db      0xEC, 3, 3, 23          ;  24 MB L3 data cache

; number of records in the table above
descriptortablelength   equ     ($ - descriptortable_) / descriptor_record_size
SECTION .text

;-----------------------------------------------------------------------
; extern "C" size_t DataCacheSize(int level);
; ABI:  Microsoft x64 when WINDOWS is defined, otherwise System V AMD64
; In:   level (ecx on Windows, edi otherwise)
;         1 .. numlevels : that data cache level
;         0              : the highest-numbered level with a nonzero size
; Out:  rax = cache size in bytes; 0 if unknown or if level > numlevels
;       (the compare is unsigned, so a negative level also returns 0)
; Regs: r9  = address of dataref (reloaded after the call to CpuType)
;       r14 = level, zero-extended; callee-saved, so it survives the calls
;       rbx is saved because the detection helpers execute cpuid
;       rsi, rdi are saved on Windows, where they are callee-saved
; Stack: rsp is 8 (mod 16) on entry; the 2 (or 4) register pushes keep
;       that, and the 8-byte vendor slot makes rsp 16-byte aligned at
;       the call to CpuType.
; Detection runs only while the first dword of `ok` is not 1.
;-----------------------------------------------------------------------
DataCacheSize:
        push    rbx
        push    r14
%ifdef WINDOWS
        push    rsi
        push    rdi
        mov     r14d, ecx               ; level
%else   ; UNIX
        mov     r14d, edi               ; level
%endif
        lea     r9, [dataref]
        cmp     dword [r9 + ok], 1      ; sizes already looked up?
        je      .select

        ; Ask CpuType for the vendor. The result is written to a stack slot.
        push    0                       ; vendor slot, initially 0
%ifdef WINDOWS
        mov     rcx, rsp                ; arg 1 = address of vendor slot
        xor     edx, edx                ; arg 2 = 0 (presumably an optional output; see cputype64.asm)
        xor     r8d, r8d                ; arg 3 = 0 (same)
        sub     rsp, 32                 ; shadow space required by the Windows x64 ABI
        call    CpuType
        add     rsp, 32
%else   ; UNIX
        mov     rdi, rsp                ; arg 1 = address of vendor slot
        xor     esi, esi                ; arg 2 = 0 (presumably an optional output; see cputype64.asm)
        xor     edx, edx                ; arg 3 = 0 (same)
        call    CpuType
%endif
        lea     r9, [dataref]           ; r9 is volatile across the call
        pop     rax                     ; eax = vendor

        cmp     eax, 1
        je      .intel
        cmp     eax, 2
        je      .amd_via                ; AMD
        cmp     eax, 3
        je      .amd_via                ; VIA uses the same method as AMD

        ; unknown vendor: try every method until one reports success (CF = 0)
        call    IntelNewMethod
        jnc     .select
        call    AMDMethod
        jnc     .select
        call    IntelOldMethod
        jmp     .select                 ; continue whether it succeeded or not

.intel:
        call    IntelNewMethod
        jnc     .select
        call    IntelOldMethod
        jmp     .select                 ; continue whether it succeeded or not

.amd_via:
        call    AMDMethod

.select:                                ; cache data looked up; pick the return value
        xor     eax, eax                ; return 0 for an out-of-range level
        cmp     r14d, numlevels
        ja      .done                   ; note: `ok` is not set on this path
        test    r14d, r14d
        jz      .largest
        mov     rax, [r9 + r14*8]       ; slot of level n is at offset n*8
        jmp     .mark

.largest:                               ; level = 0: scan from numlevels down to 1
        mov     ecx, numlevels
.scan:
        mov     rax, [r9 + rcx*8]
        test    rax, rax
        jnz     .mark                   ; first nonzero size from the top wins
        dec     ecx
        jnz     .scan                   ; falls through with rax = 0 if nothing is known

.mark:
        mov     dword [r9 + ok], 1      ; remember that we were called, success or not
.done:
%ifdef WINDOWS
        pop     rdi
        pop     rsi
%endif
        pop     r14
        pop     rbx
        ret
;-----------------------------------------------------------------------
; IntelNewMethod: read cache sizes with CPUID function 4, one sub-leaf
; per cache.
; In:    r9 = address of dataref
; Out:   [r9 + level*8] = size in bytes of every reported cache whose
;        type is not 2 (code cache); levels above numlevels are stored
;        in the numlevels slot
;        CF = 0 on success; CF = 1 if function 4 is not available or
;        the recorded level-1 size is below 1024
; Clobb: rax, rbx, rcx, rdx, rsi, flags
;-----------------------------------------------------------------------
IntelNewMethod:
        xor     eax, eax
        cpuid                           ; eax = number of CPUID functions
        cmp     eax, 4
        jb      .ret                    ; function 4 missing; CF = 1 from the cmp

        xor     esi, esi                ; esi = sub-leaf index
.subleaf:
        mov     eax, 4
        mov     ecx, esi
        cpuid                           ; cache parameters of sub-leaf esi
        mov     edx, eax
        and     edx, 1Fh                ; eax[4:0] = cache type
        jz      .verify                 ; type 0: no more caches
        cmp     edx, 2
        je      .advance                ; code cache, not recorded

        ; size = sets * ways * partitions * line size
        ; (each field is incremented by one before use)
        inc     ecx                     ; ecx = sets
        mov     edx, ebx
        shr     edx, 22
        inc     edx                     ; edx = ways (ebx[31:22] + 1)
        imul    ecx, edx
        mov     edx, ebx
        shr     edx, 12
        and     edx, 3FFh
        inc     edx                     ; edx = partitions (ebx[21:12] + 1)
        imul    ecx, edx
        and     ebx, 0FFFh
        inc     ebx                     ; ebx = line size (ebx[11:0] + 1)
        imul    rcx, rbx                ; final product in 64 bits

        shr     eax, 5
        and     eax, 7                  ; eax = cache level (eax[7:5])
        mov     edx, numlevels
        cmp     eax, edx
        cmova   eax, edx                ; clamp higher levels to numlevels
        mov     [r9 + rax*8], rcx       ; store size of cache level eax

.advance:
        inc     esi
        cmp     esi, 100h               ; hard limit so the loop always ends
        jb      .subleaf

.verify:
        mov     eax, [r9 + level1]
        cmp     eax, 1024               ; CF = 1 if level 1 size < 1024
.ret:
        ret
;-----------------------------------------------------------------------
; IntelOldMethod: read cache sizes with CPUID function 2, which returns
; up to 16 one-byte descriptors in eax, ebx, ecx, edx. Each descriptor is
; looked up in descriptortable.
; In:    r9 = address of dataref
; Out:   [r9 + level*8] = size in bytes for every descriptor found in
;        the table
;        CF = 0 on success; CF = 1 if function 2 is not available or
;        the recorded level-1 size is below 1024
; Clobb: rax, rbx, rcx, rdx, r8, flags; uses 16 bytes of stack temporarily
;-----------------------------------------------------------------------
IntelOldMethod:
        xor     eax, eax
        cpuid                           ; eax = number of CPUID functions
        cmp     eax, 2
        jb      .ret                    ; function 2 missing; CF = 1 from the cmp

        mov     eax, 2
        xor     ecx, ecx
        cpuid                           ; 16 descriptor bytes in eax, ebx, ecx, edx

        ; A register with bit 31 set carries no valid descriptors (Intel SDM,
        ; CPUID leaf 2). Replace such registers by zero; zero bytes never
        ; match a table key.
        xor     r8d, r8d
        test    eax, eax
        cmovs   eax, r8d
        test    ebx, ebx
        cmovs   ebx, r8d
        test    ecx, ecx
        cmovs   ecx, r8d
        test    edx, edx
        cmovs   edx, r8d
        mov     al, 0                   ; al does not contain a descriptor

        sub     rsp, 16                 ; spill the 16 bytes so they can be indexed
        mov     [rsp], eax
        mov     [rsp + 4], ebx
        mov     [rsp + 8], ecx
        mov     [rsp + 12], edx

        mov     edx, 15                 ; edx = byte index, 15 down to 0
.byte:
        mov     al, [rsp + rdx]
        test    al, al
        jz      .next_byte              ; zero byte: nothing to look up

        mov     ebx, descriptortablelength - 1  ; ebx = record index, last to first
.probe:                                 ; records are 4 bytes, hence the *4 scale
        cmp     al, [r9 + descriptortable + rbx*4 + d_key]
        je      .found
        dec     ebx
        jns     .probe
        jmp     .next_byte              ; descriptor not in the table

.found:
        movzx   eax, byte [r9 + descriptortable + rbx*4 + d_sizem]
        mov     cl, [r9 + descriptortable + rbx*4 + d_2pow]
        shl     eax, cl                 ; eax = d_sizem << d_2pow = size in bytes
        movzx   ecx, byte [r9 + descriptortable + rbx*4 + d_level]
        cmp     ecx, 3
        ja      .next_byte              ; ignore levels above 3
        mov     [r9 + rcx*8], rax       ; store size of cache level ecx

.next_byte:
        dec     edx
        jns     .byte

        add     rsp, 16                 ; release the spill area
        mov     eax, [r9 + level1]
        cmp     eax, 1024               ; CF = 1 if level 1 size < 1024
.ret:
        ret
;-----------------------------------------------------------------------
; AMDMethod: read cache sizes with CPUID functions 80000005H (L1) and
; 80000006H (L2 and L3).
; In:    r9 = address of dataref
; Out:   [r9 + level1], [r9 + level2], [r9 + level3] = sizes in bytes
;        CF = 0 on success; CF = 1 if function 80000006H is not
;        available or the recorded level-1 size is below 1024
; Clobb: rax, rbx, rcx, rdx, flags
;-----------------------------------------------------------------------
AMDMethod:
        mov     eax, 80000000H
        cpuid                           ; eax = highest extended CPUID function
        cmp     eax, 80000006H          ; must be compared with the full function
        jb      .ret                    ; number, not with 6; CF = 1 on failure

        mov     eax, 80000005H
        cpuid                           ; L1 cache information
        shr     ecx, 24                 ; L1 data cache size in kbytes
        shl     ecx, 10                 ; ... in bytes (rcx is zero-extended)
        mov     [r9 + level1], rcx      ; store the whole qword slot

        mov     eax, 80000006H
        cpuid                           ; L2 and L3 cache information
        shr     ecx, 16                 ; L2 cache size in kbytes
        shl     ecx, 10                 ; ... in bytes (rcx is zero-extended)
        mov     [r9 + level2], rcx      ; store the whole qword slot

        mov     ecx, edx
        shr     ecx, 18                 ; L3 cache size in units of 512 kbytes
        shl     rcx, 19                 ; ... in bytes, 64-bit shift
%if 0   ; AMD manual is unclear:
        ; do we have to increase the value if the number of ways is not a power of 2?
        shr     edx, 12
        and     edx, 1111b              ; L3 associativity
        cmp     edx, 3
        jb      .store_l3
        test    edx, 1
        jz      .store_l3
        ; number of ways is not a power of 2, multiply by 1.5 ?
        mov     rax, rcx
        shr     rax, 1
        add     rcx, rax
%endif
.store_l3:
        mov     [r9 + level3], rcx      ; store L3 cache size

        mov     eax, [r9 + level1]
        cmp     eax, 1024               ; CF = 1 if level 1 size < 1024
.ret:
        ret
|