decompress_amd64.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. //go:build amd64 && !appengine && !noasm && gc
  2. // +build amd64,!appengine,!noasm,gc
// This file contains the specialisations of Decoder.Decompress4X
// and Decoder.Decompress1X that use an asm implementation of their main loops.
  5. package huff0
  6. import (
  7. "errors"
  8. "fmt"
  9. "github.com/klauspost/compress/internal/cpuinfo"
  10. )
// decompress4x_main_loop_amd64 is an amd64 assembler implementation
// of the Decompress4X main loop. Decompress4X calls it when tablelog > 8.
// The decoding state is passed in ctx, and the caller reads ctx.decoded
// afterwards.
//
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_main_loop_amd64 is an amd64 assembler implementation
// of the Decompress4X main loop when tablelog <= 8, which decodes 4 entries
// per loop.
// The decoding state is passed in ctx, and the caller reads ctx.decoded
// afterwards.
//
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
// fallback8BitSize is the size where using Go version is faster.
// Decompress4X compares it against cap(dst): when the table log is <= 8 and
// the destination capacity is below this value, decompress4X8bit is used
// instead of the assembler loop.
const fallback8BitSize = 800
// decompress4xContext carries the decoding state handed to the assembler
// main loops of Decompress4X.
// NOTE(review): the assembly presumably addresses these fields by offset;
// confirm against the .s files before reordering or resizing any field.
type decompress4xContext struct {
	pbr      *[4]bitReaderShifted // the four per-stream bit readers
	peekBits uint8                // set by the caller to (64 - actualTableLog) & 63
	out      *byte                // first byte of the destination buffer
	dstEvery int                  // offset in bytes between the output regions of consecutive streams
	tbl      *dEntrySingle        // first entry of the decoding table
	decoded  int                  // read back by the caller; decoded/4 is treated as bytes written per stream
	limit    *byte                // position in the first stream's output where decoding must stop
}
  33. // Decompress4X will decompress a 4X encoded stream.
  34. // The length of the supplied input must match the end of a block exactly.
  35. // The *capacity* of the dst slice must match the destination size of
  36. // the uncompressed data exactly.
  37. func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
  38. if len(d.dt.single) == 0 {
  39. return nil, errors.New("no table loaded")
  40. }
  41. if len(src) < 6+(4*1) {
  42. return nil, errors.New("input too small")
  43. }
  44. use8BitTables := d.actualTableLog <= 8
  45. if cap(dst) < fallback8BitSize && use8BitTables {
  46. return d.decompress4X8bit(dst, src)
  47. }
  48. var br [4]bitReaderShifted
  49. // Decode "jump table"
  50. start := 6
  51. for i := 0; i < 3; i++ {
  52. length := int(src[i*2]) | (int(src[i*2+1]) << 8)
  53. if start+length >= len(src) {
  54. return nil, errors.New("truncated input (or invalid offset)")
  55. }
  56. err := br[i].init(src[start : start+length])
  57. if err != nil {
  58. return nil, err
  59. }
  60. start += length
  61. }
  62. err := br[3].init(src[start:])
  63. if err != nil {
  64. return nil, err
  65. }
  66. // destination, offset to match first output
  67. dstSize := cap(dst)
  68. dst = dst[:dstSize]
  69. out := dst
  70. dstEvery := (dstSize + 3) / 4
  71. const tlSize = 1 << tableLogMax
  72. const tlMask = tlSize - 1
  73. single := d.dt.single[:tlSize]
  74. var decoded int
  75. if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
  76. ctx := decompress4xContext{
  77. pbr: &br,
  78. peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
  79. out: &out[0],
  80. dstEvery: dstEvery,
  81. tbl: &single[0],
  82. limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
  83. }
  84. if use8BitTables {
  85. decompress4x_8b_main_loop_amd64(&ctx)
  86. } else {
  87. decompress4x_main_loop_amd64(&ctx)
  88. }
  89. decoded = ctx.decoded
  90. out = out[decoded/4:]
  91. }
  92. // Decode remaining.
  93. remainBytes := dstEvery - (decoded / 4)
  94. for i := range br {
  95. offset := dstEvery * i
  96. endsAt := offset + remainBytes
  97. if endsAt > len(out) {
  98. endsAt = len(out)
  99. }
  100. br := &br[i]
  101. bitsLeft := br.remaining()
  102. for bitsLeft > 0 {
  103. br.fill()
  104. if offset >= endsAt {
  105. return nil, errors.New("corruption detected: stream overrun 4")
  106. }
  107. // Read value and increment offset.
  108. val := br.peekBitsFast(d.actualTableLog)
  109. v := single[val&tlMask].entry
  110. nBits := uint8(v)
  111. br.advance(nBits)
  112. bitsLeft -= uint(nBits)
  113. out[offset] = uint8(v >> 8)
  114. offset++
  115. }
  116. if offset != endsAt {
  117. return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
  118. }
  119. decoded += offset - dstEvery*i
  120. err = br.close()
  121. if err != nil {
  122. return nil, err
  123. }
  124. }
  125. if dstSize != decoded {
  126. return nil, errors.New("corruption detected: short output block")
  127. }
  128. return dst, nil
  129. }
// decompress1x_main_loop_amd64 is an amd64 assembler implementation
// of the Decompress1X main loop. Decompress1X uses it when
// cpuinfo.HasBMI2() reports false.
// The decoding state is passed in ctx; the result is reported in ctx.decoded.
//
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)
// decompress1x_main_loop_bmi2 is an amd64 with BMI2 assembler implementation
// of the Decompress1X main loop. Decompress1X uses it when
// cpuinfo.HasBMI2() reports true.
// The decoding state is passed in ctx; the result is reported in ctx.decoded.
//
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
// decompress1xContext carries the decoding state handed to the assembler
// main loops of Decompress1X.
// NOTE(review): the assembly presumably addresses these fields by offset;
// confirm against the .s files before reordering or resizing any field.
type decompress1xContext struct {
	pbr      *bitReaderShifted // bit reader over the compressed input
	peekBits uint8             // set by the caller to (64 - actualTableLog) & 63
	out      *byte             // first byte of the destination buffer
	outCap   int               // capacity of the destination buffer in bytes
	tbl      *dEntrySingle     // first entry of the decoding table
	decoded  int               // bytes decoded, or error_max_decoded_size_exeeded on overflow
}
// error_max_decoded_size_exeeded is the error value reported by the asm
// implementations in decompress1xContext.decoded. Decompress1X translates it
// into ErrMaxDecodedSizeExceeded.
// NOTE(review): the identifier is misspelled ("exeeded") and uses
// underscores; it is presumably referenced from the assembly, so confirm
// before renaming.
const error_max_decoded_size_exeeded = -1
  150. // Decompress1X will decompress a 1X encoded stream.
  151. // The cap of the output buffer will be the maximum decompressed size.
  152. // The length of the supplied input must match the end of a block exactly.
  153. func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
  154. if len(d.dt.single) == 0 {
  155. return nil, errors.New("no table loaded")
  156. }
  157. var br bitReaderShifted
  158. err := br.init(src)
  159. if err != nil {
  160. return dst, err
  161. }
  162. maxDecodedSize := cap(dst)
  163. dst = dst[:maxDecodedSize]
  164. const tlSize = 1 << tableLogMax
  165. const tlMask = tlSize - 1
  166. if maxDecodedSize >= 4 {
  167. ctx := decompress1xContext{
  168. pbr: &br,
  169. out: &dst[0],
  170. outCap: maxDecodedSize,
  171. peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
  172. tbl: &d.dt.single[0],
  173. }
  174. if cpuinfo.HasBMI2() {
  175. decompress1x_main_loop_bmi2(&ctx)
  176. } else {
  177. decompress1x_main_loop_amd64(&ctx)
  178. }
  179. if ctx.decoded == error_max_decoded_size_exeeded {
  180. return nil, ErrMaxDecodedSizeExceeded
  181. }
  182. dst = dst[:ctx.decoded]
  183. }
  184. // br < 8, so uint8 is fine
  185. bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
  186. for bitsLeft > 0 {
  187. br.fill()
  188. if len(dst) >= maxDecodedSize {
  189. br.close()
  190. return nil, ErrMaxDecodedSizeExceeded
  191. }
  192. v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
  193. nBits := uint8(v.entry)
  194. br.advance(nBits)
  195. bitsLeft -= nBits
  196. dst = append(dst, uint8(v.entry>>8))
  197. }
  198. return dst, br.close()
  199. }