strlen64.asm 3.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. %include "defs.asm"
  2. ;************************** strlen64.asm **********************************
  3. ; Author: Agner Fog
  4. ; Date created: 2008-07-19
  5. ; Last modified: 2008-10-16
  6. ; Description:
  7. ; Faster version of the standard strlen function:
  8. ; size_t strlen(const char * str);
  9. ; Finds the length of a zero-terminated string of bytes, optimized for speed.
  10. ;
  11. ; Overriding standard function strlen:
  12. ; The alias ?OVR_strlen is changed to _strlen in the object file if
  13. ; it is desired to override the standard library function strlen.
  14. ;
  15. ; Calling conventions:
  16. ; Stack alignment is not required. No shadow space or red zone used.
  17. ; Called internally from strcpy and strcat without the stack being aligned.
  18. ;
  19. ; Optimization:
  20. ; Uses XMM registers to read 16 bytes at a time, aligned.
  21. ; Misaligned parts of the string are read from the nearest 16-byte boundary
  22. ; and the irrelevant part is masked out. It may read both before the beginning
  23. ; of the string and after the end, but will never load any unnecessary cache
  24. ; line and never trigger a page fault for reading from non-existing memory
  25. ; pages because it never reads past the nearest following 16-byte boundary.
  26. ; It may, though, trigger any debug watch within the same 16-byte boundary.
  27. ;
  28. ; The latest version of this file is available at:
  29. ; www.agner.org/optimize/asmexamples.zip
  30. ; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
  31. ;******************************************************************************
  default rel

  ; Both names below label the same entry point. ": function" marks the symbol
  ; type as a function. EXP() is not defined in this file (presumably it comes
  ; from defs.asm and selects the exported alias -- confirm there).
  global  A_strlen: function
  global  EXP(strlen): function

  section .text align=16

  ;-----------------------------------------------------------------------------
  ; size_t A_strlen(const char *s);        also reachable as EXP(strlen)
  ;
  ; Returns the number of bytes that precede the first zero byte of s.
  ;
  ; In:    s in rcx when WINDOWS is defined at assembly time, otherwise in rdi
  ; Out:   rax = length
  ; Clobb: rcx, rdx, xmm0, xmm1, flags (and r8 when WINDOWS is defined)
  ; Stack: not used
  ;
  ; Method: s is rounded down to a 16-byte boundary so that every load is an
  ; aligned 16-byte load. The first load may therefore cover up to 15 bytes
  ; that lie before s; their compare results are discarded by shifting the
  ; byte mask right and then left by (s mod 16). Every following block is
  ; examined whole.
  ;-----------------------------------------------------------------------------
  A_strlen:
  EXP(strlen):
          pxor    xmm0, xmm0              ; xmm0 = 16 zero bytes (compare pattern)

  %ifdef WINDOWS
  ; Rscopy names the register that still holds the unmodified s at the end.
  ; rcx is needed for the shift count, so s is parked in r8.
  %define Rscopy r8
          mov     r8, rcx
          mov     rax, rcx
  %else
  ; rdi is never written, so it doubles as the saved copy of s.
  %define Rscopy rdi
          mov     ecx, edi                ; only the low 4 bits are used below
          mov     rax, rdi
  %endif

          ; Here: rax = s, low 32 bits of rcx = low 32 bits of s.
          and     rax, -16                ; rax = s rounded down to a 16-byte boundary
          and     ecx, 15                 ; cl  = s mod 16 = bytes in this block before s

          ; First block: may start before s.
          movdqa  xmm1, [rax]             ; aligned 16-byte load
          pcmpeqb xmm1, xmm0              ; 0FFh in every byte lane that held zero
          pmovmskb edx, xmm1              ; edx bit i = lane i was zero
          shr     edx, cl                 ; drop the bits of the lanes before s ...
          shl     edx, cl                 ; ... and restore the remaining bit positions
          bsf     edx, edx                ; edx = index of lowest set bit; ZF=1 if mask was 0
          jnz     .found                  ; terminator is inside the first block

          ; Remaining blocks, 16 bytes per iteration.
          ; (Moving the bsf out of the loop and using test here would be faster
          ; for long strings on old processors, but most strings are assumed to
          ; be short, and newer processors have higher priority.)
  .next_block:
          add     rax, 16                 ; step to the next aligned block
          movdqa  xmm1, [rax]
          pcmpeqb xmm1, xmm0
          pmovmskb edx, xmm1
          bsf     edx, edx                ; ZF=1 -> no zero byte in this block
          jz      .next_block

  .found:
          ; rax = address of the block holding the terminator,
          ; rdx = byte index of the terminator inside that block
          ;       (the 32-bit bsf result is zero-extended into rdx).
          sub     rax, Rscopy             ; block address - s
          add     rax, rdx                ; + index inside block = length
          ret
  ; end of A_strlen