// xxhash_arm64.s

  //go:build !appengine && gc && !purego && !noasm
  // +build !appengine
  // +build gc
  // +build !purego
  // +build !noasm

  // textflag.h supplies the NOSPLIT and NOFRAME flags used on the TEXT
  // directives in this file.
  #include "textflag.h"

  // Registers:
  //
  // Symbolic names for the general-purpose registers used by the routines in
  // this file.

  // Arguments, result and loop bookkeeping.
  #define digest  R1 // pointer to the Digest state (writeBlocks only)
  #define h       R2 // return value
  #define p       R3 // input pointer
  #define n       R4 // input length
  #define nblocks R5 // n / 32

  // Multiplicative constants, loaded from the primes table.
  #define prime1 R7
  #define prime2 R8
  #define prime3 R9
  #define prime4 R10
  #define prime5 R11

  // The four running accumulators updated once per 32-byte block.
  #define v1 R12
  #define v2 R13
  #define v3 R14
  #define v4 R15

  // Scratch: input words just loaded from p, and temporaries.
  #define x1 R20
  #define x2 R21
  #define x3 R22
  #define x4 R23

  // round folds one 64-bit input word x into the accumulator acc:
  //
  //   acc += x * prime2
  //   acc  = rotl(acc, 31)   (ROR by 64-31 is a rotate left by 31)
  //   acc *= prime1
  //
  // x is only read; acc is updated in place.
  #define round(acc, x) \
      MADD prime2, acc, x, acc \
      ROR  $64-31, acc         \
      MUL  prime1, acc

  // round0 performs the operation x = round(0, x).
  //
  // With a zero accumulator the multiply-add collapses to a plain multiply,
  // so this computes, in place:
  //
  //   x = rotl(x * prime2, 31) * prime1
  #define round0(x) \
      MUL prime2, x \
      ROR $64-31, x \
      MUL prime1, x

  // mergeRound mixes the value x into acc:
  //
  //   x    = round0(x)
  //   acc ^= x
  //   acc  = acc*prime1 + prime4
  //
  // Both registers are modified: x is overwritten by round0.
  #define mergeRound(acc, x) \
      round0(x)                     \
      EOR  x, acc                   \
      MADD acc, prime4, prime1, acc

  // blockLoop processes as many 32-byte blocks as possible,
  // updating v1, v2, v3, and v4. It assumes that n >= 32.
  //
  // Inputs:  p (input pointer), n (input length), v1-v4, prime1, prime2.
  // Outputs: v1-v4 updated; p advanced by 32 bytes per block consumed (the
  //          loads are post-incrementing). n is left untouched.
  // Clobbers: nblocks (counts down from n/32 to zero) and x1-x4.
  //
  // The n >= 32 requirement is not checked here. With n < 32, nblocks starts
  // at zero, the body still executes once, and the SUB/CBNZ pair then wraps
  // below zero instead of terminating, so every caller must guarantee it.
  //
  // The expansion defines the label "loop"; each TEXT body in this file
  // expands the macro exactly once.
  #define blockLoop() \
      LSR     $5, n, nblocks  \
      PCALIGN $16             \
      loop:                   \
      LDP.P   16(p), (x1, x2) \
      LDP.P   16(p), (x3, x4) \
      round(v1, x1)           \
      round(v2, x2)           \
      round(v3, x3)           \
      round(v4, x4)           \
      SUB     $1, nblocks     \
      CBNZ    nblocks, loop

  // func Sum64(b []byte) uint64
  //
  // Sum64 hashes the bytes of b and returns the 64-bit result.
  //
  // Structure:
  //   1. If len(b) >= 32, run blockLoop over all whole 32-byte blocks and fold
  //      the four accumulators into h; otherwise start from h = prime5.
  //   2. Add the total length to h.
  //   3. Consume the remaining tail. Bits 4..0 of n say whether a 16-, 8-, 4-,
  //      2- and 1-byte piece is present, so each piece is handled at most once.
  //   4. Apply the final shift/multiply mixing steps and return h.
  //
  // The frame is $0-32: no locals, and 32 bytes of arguments plus result
  // (b at 0(FP), the return value at 24(FP)).
  TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
      // One paired load fetches the two words at b_base: p and n.
      LDP b_base+0(FP), (p, n)

      LDP  ·primes+0(SB), (prime1, prime2)
      LDP  ·primes+16(SB), (prime3, prime4)
      MOVD ·primes+32(SB), prime5

      // Inputs shorter than one block skip the block loop entirely.
      CMP  $32, n
      CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
      BLT  afterLoop

      // Initial accumulator state.
      ADD  prime1, prime2, v1 // v1 = prime1 + prime2
      MOVD prime2, v2         // v2 = prime2
      MOVD $0, v3             // v3 = 0
      NEG  prime1, v4         // v4 = -prime1

      blockLoop()

      // h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
      ROR $64-1, v1, x1
      ROR $64-7, v2, x2
      ADD x1, x2
      ROR $64-12, v3, x3
      ROR $64-18, v4, x4
      ADD x3, x4
      ADD x2, x4, h

      mergeRound(h, v1)
      mergeRound(h, v2)
      mergeRound(h, v3)
      mergeRound(h, v4)

  afterLoop:
      // h += len(b). blockLoop leaves n unmodified, so this is the full length.
      ADD n, h

      // 16-byte piece: two 8-byte words, each mixed as
      //   h = rotl(h ^ round0(word), 27)*prime1 + prime4
      TBZ   $4, n, try8
      LDP.P 16(p), (x1, x2)

      round0(x1)

      // NOTE: here and below, sequencing the EOR after the ROR (using a
      // rotated register) is worth a small but measurable speedup for small
      // inputs.
      ROR  $64-27, h
      EOR  x1 @> 64-27, h, h
      MADD h, prime4, prime1, h

      round0(x2)
      ROR  $64-27, h
      EOR  x2 @> 64-27, h, h
      MADD h, prime4, prime1, h

  try8:
      // 8-byte piece: same mixing step as above, for a single word.
      TBZ    $3, n, try4
      MOVD.P 8(p), x1

      round0(x1)
      ROR  $64-27, h
      EOR  x1 @> 64-27, h, h
      MADD h, prime4, prime1, h

  try4:
      // 4-byte piece (zero-extended load):
      //   h = rotl(h ^ word*prime1, 23)*prime2 + prime3
      TBZ     $2, n, try2
      MOVWU.P 4(p), x2

      MUL  prime1, x2
      ROR  $64-23, h
      EOR  x2 @> 64-23, h, h
      MADD h, prime3, prime2, h

  try2:
      // 2-byte piece: load a halfword, split it into its low byte (x1) and
      // high byte (x2), and mix each in turn, low byte first:
      //   h = rotl(h ^ byte*prime5, 11) * prime1
      TBZ     $1, n, try1
      MOVHU.P 2(p), x3
      AND     $255, x3, x1
      LSR     $8, x3, x2

      MUL prime5, x1
      ROR $64-11, h
      EOR x1 @> 64-11, h, h
      MUL prime1, h

      MUL prime5, x2
      ROR $64-11, h
      EOR x2 @> 64-11, h, h
      MUL prime1, h

  try1:
      // Final single byte, mixed with the same per-byte step.
      TBZ   $0, n, finalize
      MOVBU (p), x4

      MUL prime5, x4
      ROR $64-11, h
      EOR x4 @> 64-11, h, h
      MUL prime1, h

  finalize:
      // Final mixing: alternate xor-with-right-shift and multiply.
      EOR h >> 33, h // h ^= h >> 33
      MUL prime2, h
      EOR h >> 29, h // h ^= h >> 29
      MUL prime3, h
      EOR h >> 32, h // h ^= h >> 32

      MOVD h, ret+24(FP)
      RET

  // func writeBlocks(d *Digest, b []byte) int
  //
  // writeBlocks loads the four accumulators from the first 32 bytes of *d,
  // runs blockLoop over b, stores the accumulators back, and returns len(b)
  // with its low five bits cleared (len(b) rounded down to a multiple of 32).
  //
  // Precondition: len(b) >= 32. blockLoop assumes this and nothing in this
  // routine checks it, so the caller must only pass at least one full block.
  //
  // Only prime1 and prime2 are loaded because the round macro uses no other
  // constants.
  TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
      LDP ·primes+0(SB), (prime1, prime2)

      // Load state. Assume v[1-4] are stored contiguously.
      MOVD d+0(FP), digest
      LDP  0(digest), (v1, v2)
      LDP  16(digest), (v3, v4)

      // p and n come from the two words stored at b_base.
      LDP b_base+8(FP), (p, n)

      blockLoop()

      // Store updated state.
      STP (v1, v2), 0(digest)
      STP (v3, v4), 16(digest)

      // Return n &^ 31. blockLoop does not modify n.
      BIC  $31, n
      MOVD n, ret+32(FP)
      RET