// xxhash_amd64.s: amd64 assembly implementations of Sum64 and writeBlocks.

  1. //go:build !appengine && gc && !purego && !noasm
  2. // +build !appengine
  3. // +build gc
  4. // +build !purego
  5. // +build !noasm
  6. #include "textflag.h"
  7. // Registers:
  8. #define h AX
  9. #define d AX
  10. #define p SI // pointer to advance through b
  11. #define n DX
  12. #define end BX // loop end
  13. #define v1 R8
  14. #define v2 R9
  15. #define v3 R10
  16. #define v4 R11
  17. #define x R12
  18. #define prime1 R13
  19. #define prime2 R14
  20. #define prime4 DI
  21. #define round(acc, x) \
  22. IMULQ prime2, x \
  23. ADDQ x, acc \
  24. ROLQ $31, acc \
  25. IMULQ prime1, acc
  26. // round0 performs the operation x = round(0, x).
  27. #define round0(x) \
  28. IMULQ prime2, x \
  29. ROLQ $31, x \
  30. IMULQ prime1, x
  31. // mergeRound applies a merge round on the two registers acc and x.
  32. // It assumes that prime1, prime2, and prime4 have been loaded.
  33. #define mergeRound(acc, x) \
  34. round0(x) \
  35. XORQ x, acc \
  36. IMULQ prime1, acc \
  37. ADDQ prime4, acc
  38. // blockLoop processes as many 32-byte blocks as possible,
  39. // updating v1, v2, v3, and v4. It assumes that there is at least one block
  40. // to process.
  41. #define blockLoop() \
  42. loop: \
  43. MOVQ +0(p), x \
  44. round(v1, x) \
  45. MOVQ +8(p), x \
  46. round(v2, x) \
  47. MOVQ +16(p), x \
  48. round(v3, x) \
  49. MOVQ +24(p), x \
  50. round(v4, x) \
  51. ADDQ $32, p \
  52. CMPQ p, end \
  53. JLE loop
  54. // func Sum64(b []byte) uint64
  55. TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
  56. // Load fixed primes.
  57. MOVQ ·primes+0(SB), prime1
  58. MOVQ ·primes+8(SB), prime2
  59. MOVQ ·primes+24(SB), prime4
  60. // Load slice.
  61. MOVQ b_base+0(FP), p
  62. MOVQ b_len+8(FP), n
  63. LEAQ (p)(n*1), end
  64. // The first loop limit will be len(b)-32.
  65. SUBQ $32, end
  66. // Check whether we have at least one block.
  67. CMPQ n, $32
  68. JLT noBlocks
  69. // Set up initial state (v1, v2, v3, v4).
  70. MOVQ prime1, v1
  71. ADDQ prime2, v1
  72. MOVQ prime2, v2
  73. XORQ v3, v3
  74. XORQ v4, v4
  75. SUBQ prime1, v4
  76. blockLoop()
  77. MOVQ v1, h
  78. ROLQ $1, h
  79. MOVQ v2, x
  80. ROLQ $7, x
  81. ADDQ x, h
  82. MOVQ v3, x
  83. ROLQ $12, x
  84. ADDQ x, h
  85. MOVQ v4, x
  86. ROLQ $18, x
  87. ADDQ x, h
  88. mergeRound(h, v1)
  89. mergeRound(h, v2)
  90. mergeRound(h, v3)
  91. mergeRound(h, v4)
  92. JMP afterBlocks
  93. noBlocks:
  94. MOVQ ·primes+32(SB), h
  95. afterBlocks:
  96. ADDQ n, h
  97. ADDQ $24, end
  98. CMPQ p, end
  99. JG try4
  100. loop8:
  101. MOVQ (p), x
  102. ADDQ $8, p
  103. round0(x)
  104. XORQ x, h
  105. ROLQ $27, h
  106. IMULQ prime1, h
  107. ADDQ prime4, h
  108. CMPQ p, end
  109. JLE loop8
  110. try4:
  111. ADDQ $4, end
  112. CMPQ p, end
  113. JG try1
  114. MOVL (p), x
  115. ADDQ $4, p
  116. IMULQ prime1, x
  117. XORQ x, h
  118. ROLQ $23, h
  119. IMULQ prime2, h
  120. ADDQ ·primes+16(SB), h
  121. try1:
  122. ADDQ $4, end
  123. CMPQ p, end
  124. JGE finalize
  125. loop1:
  126. MOVBQZX (p), x
  127. ADDQ $1, p
  128. IMULQ ·primes+32(SB), x
  129. XORQ x, h
  130. ROLQ $11, h
  131. IMULQ prime1, h
  132. CMPQ p, end
  133. JL loop1
  134. finalize:
  135. MOVQ h, x
  136. SHRQ $33, x
  137. XORQ x, h
  138. IMULQ prime2, h
  139. MOVQ h, x
  140. SHRQ $29, x
  141. XORQ x, h
  142. IMULQ ·primes+16(SB), h
  143. MOVQ h, x
  144. SHRQ $32, x
  145. XORQ x, h
  146. MOVQ h, ret+24(FP)
  147. RET
  148. // func writeBlocks(d *Digest, b []byte) int
  149. TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
  150. // Load fixed primes needed for round.
  151. MOVQ ·primes+0(SB), prime1
  152. MOVQ ·primes+8(SB), prime2
  153. // Load slice.
  154. MOVQ b_base+8(FP), p
  155. MOVQ b_len+16(FP), n
  156. LEAQ (p)(n*1), end
  157. SUBQ $32, end
  158. // Load vN from d.
  159. MOVQ s+0(FP), d
  160. MOVQ 0(d), v1
  161. MOVQ 8(d), v2
  162. MOVQ 16(d), v3
  163. MOVQ 24(d), v4
  164. // We don't need to check the loop condition here; this function is
  165. // always called with at least one block of data to process.
  166. blockLoop()
  167. // Copy vN back to d.
  168. MOVQ v1, 0(d)
  169. MOVQ v2, 8(d)
  170. MOVQ v3, 16(d)
  171. MOVQ v4, 24(d)
  172. // The number of bytes written is p minus the old base pointer.
  173. SUBQ b_base+8(FP), p
  174. MOVQ p, ret+32(FP)
  175. RET