// xxhash_amd64.s

  1. //go:build !appengine && gc && !purego
  2. // +build !appengine
  3. // +build gc
  4. // +build !purego
  5. #include "textflag.h"
  6. // Registers:
  7. #define h AX
  8. #define d AX
  9. #define p SI // pointer to advance through b
  10. #define n DX
  11. #define end BX // loop end
  12. #define v1 R8
  13. #define v2 R9
  14. #define v3 R10
  15. #define v4 R11
  16. #define x R12
  17. #define prime1 R13
  18. #define prime2 R14
  19. #define prime4 DI
  20. #define round(acc, x) \
  21. IMULQ prime2, x \
  22. ADDQ x, acc \
  23. ROLQ $31, acc \
  24. IMULQ prime1, acc
  25. // round0 performs the operation x = round(0, x).
  26. #define round0(x) \
  27. IMULQ prime2, x \
  28. ROLQ $31, x \
  29. IMULQ prime1, x
  30. // mergeRound applies a merge round on the two registers acc and x.
  31. // It assumes that prime1, prime2, and prime4 have been loaded.
  32. #define mergeRound(acc, x) \
  33. round0(x) \
  34. XORQ x, acc \
  35. IMULQ prime1, acc \
  36. ADDQ prime4, acc
  37. // blockLoop processes as many 32-byte blocks as possible,
  38. // updating v1, v2, v3, and v4. It assumes that there is at least one block
  39. // to process.
  40. #define blockLoop() \
  41. loop: \
  42. MOVQ +0(p), x \
  43. round(v1, x) \
  44. MOVQ +8(p), x \
  45. round(v2, x) \
  46. MOVQ +16(p), x \
  47. round(v3, x) \
  48. MOVQ +24(p), x \
  49. round(v4, x) \
  50. ADDQ $32, p \
  51. CMPQ p, end \
  52. JLE loop
  53. // func Sum64(b []byte) uint64
  54. TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
  55. // Load fixed primes.
  56. MOVQ ·primes+0(SB), prime1
  57. MOVQ ·primes+8(SB), prime2
  58. MOVQ ·primes+24(SB), prime4
  59. // Load slice.
  60. MOVQ b_base+0(FP), p
  61. MOVQ b_len+8(FP), n
  62. LEAQ (p)(n*1), end
  63. // The first loop limit will be len(b)-32.
  64. SUBQ $32, end
  65. // Check whether we have at least one block.
  66. CMPQ n, $32
  67. JLT noBlocks
  68. // Set up initial state (v1, v2, v3, v4).
  69. MOVQ prime1, v1
  70. ADDQ prime2, v1
  71. MOVQ prime2, v2
  72. XORQ v3, v3
  73. XORQ v4, v4
  74. SUBQ prime1, v4
  75. blockLoop()
  76. MOVQ v1, h
  77. ROLQ $1, h
  78. MOVQ v2, x
  79. ROLQ $7, x
  80. ADDQ x, h
  81. MOVQ v3, x
  82. ROLQ $12, x
  83. ADDQ x, h
  84. MOVQ v4, x
  85. ROLQ $18, x
  86. ADDQ x, h
  87. mergeRound(h, v1)
  88. mergeRound(h, v2)
  89. mergeRound(h, v3)
  90. mergeRound(h, v4)
  91. JMP afterBlocks
  92. noBlocks:
  93. MOVQ ·primes+32(SB), h
  94. afterBlocks:
  95. ADDQ n, h
  96. ADDQ $24, end
  97. CMPQ p, end
  98. JG try4
  99. loop8:
  100. MOVQ (p), x
  101. ADDQ $8, p
  102. round0(x)
  103. XORQ x, h
  104. ROLQ $27, h
  105. IMULQ prime1, h
  106. ADDQ prime4, h
  107. CMPQ p, end
  108. JLE loop8
  109. try4:
  110. ADDQ $4, end
  111. CMPQ p, end
  112. JG try1
  113. MOVL (p), x
  114. ADDQ $4, p
  115. IMULQ prime1, x
  116. XORQ x, h
  117. ROLQ $23, h
  118. IMULQ prime2, h
  119. ADDQ ·primes+16(SB), h
  120. try1:
  121. ADDQ $4, end
  122. CMPQ p, end
  123. JGE finalize
  124. loop1:
  125. MOVBQZX (p), x
  126. ADDQ $1, p
  127. IMULQ ·primes+32(SB), x
  128. XORQ x, h
  129. ROLQ $11, h
  130. IMULQ prime1, h
  131. CMPQ p, end
  132. JL loop1
  133. finalize:
  134. MOVQ h, x
  135. SHRQ $33, x
  136. XORQ x, h
  137. IMULQ prime2, h
  138. MOVQ h, x
  139. SHRQ $29, x
  140. XORQ x, h
  141. IMULQ ·primes+16(SB), h
  142. MOVQ h, x
  143. SHRQ $32, x
  144. XORQ x, h
  145. MOVQ h, ret+24(FP)
  146. RET
  147. // func writeBlocks(d *Digest, b []byte) int
  148. TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
  149. // Load fixed primes needed for round.
  150. MOVQ ·primes+0(SB), prime1
  151. MOVQ ·primes+8(SB), prime2
  152. // Load slice.
  153. MOVQ b_base+8(FP), p
  154. MOVQ b_len+16(FP), n
  155. LEAQ (p)(n*1), end
  156. SUBQ $32, end
  157. // Load vN from d.
  158. MOVQ s+0(FP), d
  159. MOVQ 0(d), v1
  160. MOVQ 8(d), v2
  161. MOVQ 16(d), v3
  162. MOVQ 24(d), v4
  163. // We don't need to check the loop condition here; this function is
  164. // always called with at least one block of data to process.
  165. blockLoop()
  166. // Copy vN back to d.
  167. MOVQ v1, 0(d)
  168. MOVQ v2, 8(d)
  169. MOVQ v3, 16(d)
  170. MOVQ v4, 24(d)
  171. // The number of bytes written is p minus the old base pointer.
  172. SUBQ b_base+8(FP), p
  173. MOVQ p, ret+32(FP)
  174. RET