snappy.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. // Copyright 2019+ Klaus Post. All rights reserved.
  2. // License information can be found in the LICENSE file.
  3. // Based on work by Yann Collet, released under BSD License.
  4. package zstd
  5. import (
  6. "encoding/binary"
  7. "errors"
  8. "hash/crc32"
  9. "io"
  10. "github.com/klauspost/compress/huff0"
  11. snappy "github.com/klauspost/compress/internal/snapref"
  12. )
  13. const (
  14. snappyTagLiteral = 0x00
  15. snappyTagCopy1 = 0x01
  16. snappyTagCopy2 = 0x02
  17. snappyTagCopy4 = 0x03
  18. )
  19. const (
  20. snappyChecksumSize = 4
  21. snappyMagicBody = "sNaPpY"
  22. // snappyMaxBlockSize is the maximum size of the input to encodeBlock. It is not
  23. // part of the wire format per se, but some parts of the encoder assume
  24. // that an offset fits into a uint16.
  25. //
  26. // Also, for the framing format (Writer type instead of Encode function),
  27. // https://github.com/google/snappy/blob/master/framing_format.txt says
  28. // that "the uncompressed data in a chunk must be no longer than 65536
  29. // bytes".
  30. snappyMaxBlockSize = 65536
  31. // snappyMaxEncodedLenOfMaxBlockSize equals MaxEncodedLen(snappyMaxBlockSize), but is
  32. // hard coded to be a const instead of a variable, so that obufLen can also
  33. // be a const. Their equivalence is confirmed by
  34. // TestMaxEncodedLenOfMaxBlockSize.
  35. snappyMaxEncodedLenOfMaxBlockSize = 76490
  36. )
  37. const (
  38. chunkTypeCompressedData = 0x00
  39. chunkTypeUncompressedData = 0x01
  40. chunkTypePadding = 0xfe
  41. chunkTypeStreamIdentifier = 0xff
  42. )
  43. var (
  44. // ErrSnappyCorrupt reports that the input is invalid.
  45. ErrSnappyCorrupt = errors.New("snappy: corrupt input")
  46. // ErrSnappyTooLarge reports that the uncompressed length is too large.
  47. ErrSnappyTooLarge = errors.New("snappy: decoded block is too large")
  48. // ErrSnappyUnsupported reports that the input isn't supported.
  49. ErrSnappyUnsupported = errors.New("snappy: unsupported input")
  50. errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
  51. )
  52. // SnappyConverter can read SnappyConverter-compressed streams and convert them to zstd.
  53. // Conversion is done by converting the stream directly from Snappy without intermediate
  54. // full decoding.
  55. // Therefore the compression ratio is much less than what can be done by a full decompression
  56. // and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
  57. // any errors being generated.
  58. // No CRC value is being generated and not all CRC values of the Snappy stream are checked.
  59. // However, it provides really fast recompression of Snappy streams.
  60. // The converter can be reused to avoid allocations, even after errors.
  61. type SnappyConverter struct {
  62. r io.Reader
  63. err error
  64. buf []byte
  65. block *blockEnc
  66. }
  67. // Convert the Snappy stream supplied in 'in' and write the zStandard stream to 'w'.
  68. // If any error is detected on the Snappy stream it is returned.
  69. // The number of bytes written is returned.
  70. func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
  71. initPredefined()
  72. r.err = nil
  73. r.r = in
  74. if r.block == nil {
  75. r.block = &blockEnc{}
  76. r.block.init()
  77. }
  78. r.block.initNewEncode()
  79. if len(r.buf) != snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize {
  80. r.buf = make([]byte, snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize)
  81. }
  82. r.block.litEnc.Reuse = huff0.ReusePolicyNone
  83. var written int64
  84. var readHeader bool
  85. {
  86. var header []byte
  87. var n int
  88. header, r.err = frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
  89. n, r.err = w.Write(header)
  90. if r.err != nil {
  91. return written, r.err
  92. }
  93. written += int64(n)
  94. }
  95. for {
  96. if !r.readFull(r.buf[:4], true) {
  97. // Add empty last block
  98. r.block.reset(nil)
  99. r.block.last = true
  100. err := r.block.encodeLits(r.block.literals, false)
  101. if err != nil {
  102. return written, err
  103. }
  104. n, err := w.Write(r.block.output)
  105. if err != nil {
  106. return written, err
  107. }
  108. written += int64(n)
  109. return written, r.err
  110. }
  111. chunkType := r.buf[0]
  112. if !readHeader {
  113. if chunkType != chunkTypeStreamIdentifier {
  114. println("chunkType != chunkTypeStreamIdentifier", chunkType)
  115. r.err = ErrSnappyCorrupt
  116. return written, r.err
  117. }
  118. readHeader = true
  119. }
  120. chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
  121. if chunkLen > len(r.buf) {
  122. println("chunkLen > len(r.buf)", chunkType)
  123. r.err = ErrSnappyUnsupported
  124. return written, r.err
  125. }
  126. // The chunk types are specified at
  127. // https://github.com/google/snappy/blob/master/framing_format.txt
  128. switch chunkType {
  129. case chunkTypeCompressedData:
  130. // Section 4.2. Compressed data (chunk type 0x00).
  131. if chunkLen < snappyChecksumSize {
  132. println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
  133. r.err = ErrSnappyCorrupt
  134. return written, r.err
  135. }
  136. buf := r.buf[:chunkLen]
  137. if !r.readFull(buf, false) {
  138. return written, r.err
  139. }
  140. //checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
  141. buf = buf[snappyChecksumSize:]
  142. n, hdr, err := snappyDecodedLen(buf)
  143. if err != nil {
  144. r.err = err
  145. return written, r.err
  146. }
  147. buf = buf[hdr:]
  148. if n > snappyMaxBlockSize {
  149. println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
  150. r.err = ErrSnappyCorrupt
  151. return written, r.err
  152. }
  153. r.block.reset(nil)
  154. r.block.pushOffsets()
  155. if err := decodeSnappy(r.block, buf); err != nil {
  156. r.err = err
  157. return written, r.err
  158. }
  159. if r.block.size+r.block.extraLits != n {
  160. printf("invalid size, want %d, got %d\n", n, r.block.size+r.block.extraLits)
  161. r.err = ErrSnappyCorrupt
  162. return written, r.err
  163. }
  164. err = r.block.encode(nil, false, false)
  165. switch err {
  166. case errIncompressible:
  167. r.block.popOffsets()
  168. r.block.reset(nil)
  169. r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
  170. if err != nil {
  171. return written, err
  172. }
  173. err = r.block.encodeLits(r.block.literals, false)
  174. if err != nil {
  175. return written, err
  176. }
  177. case nil:
  178. default:
  179. return written, err
  180. }
  181. n, r.err = w.Write(r.block.output)
  182. if r.err != nil {
  183. return written, err
  184. }
  185. written += int64(n)
  186. continue
  187. case chunkTypeUncompressedData:
  188. if debugEncoder {
  189. println("Uncompressed, chunklen", chunkLen)
  190. }
  191. // Section 4.3. Uncompressed data (chunk type 0x01).
  192. if chunkLen < snappyChecksumSize {
  193. println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
  194. r.err = ErrSnappyCorrupt
  195. return written, r.err
  196. }
  197. r.block.reset(nil)
  198. buf := r.buf[:snappyChecksumSize]
  199. if !r.readFull(buf, false) {
  200. return written, r.err
  201. }
  202. checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
  203. // Read directly into r.decoded instead of via r.buf.
  204. n := chunkLen - snappyChecksumSize
  205. if n > snappyMaxBlockSize {
  206. println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
  207. r.err = ErrSnappyCorrupt
  208. return written, r.err
  209. }
  210. r.block.literals = r.block.literals[:n]
  211. if !r.readFull(r.block.literals, false) {
  212. return written, r.err
  213. }
  214. if snappyCRC(r.block.literals) != checksum {
  215. println("literals crc mismatch")
  216. r.err = ErrSnappyCorrupt
  217. return written, r.err
  218. }
  219. err := r.block.encodeLits(r.block.literals, false)
  220. if err != nil {
  221. return written, err
  222. }
  223. n, r.err = w.Write(r.block.output)
  224. if r.err != nil {
  225. return written, err
  226. }
  227. written += int64(n)
  228. continue
  229. case chunkTypeStreamIdentifier:
  230. if debugEncoder {
  231. println("stream id", chunkLen, len(snappyMagicBody))
  232. }
  233. // Section 4.1. Stream identifier (chunk type 0xff).
  234. if chunkLen != len(snappyMagicBody) {
  235. println("chunkLen != len(snappyMagicBody)", chunkLen, len(snappyMagicBody))
  236. r.err = ErrSnappyCorrupt
  237. return written, r.err
  238. }
  239. if !r.readFull(r.buf[:len(snappyMagicBody)], false) {
  240. return written, r.err
  241. }
  242. for i := 0; i < len(snappyMagicBody); i++ {
  243. if r.buf[i] != snappyMagicBody[i] {
  244. println("r.buf[i] != snappyMagicBody[i]", r.buf[i], snappyMagicBody[i], i)
  245. r.err = ErrSnappyCorrupt
  246. return written, r.err
  247. }
  248. }
  249. continue
  250. }
  251. if chunkType <= 0x7f {
  252. // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
  253. println("chunkType <= 0x7f")
  254. r.err = ErrSnappyUnsupported
  255. return written, r.err
  256. }
  257. // Section 4.4 Padding (chunk type 0xfe).
  258. // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
  259. if !r.readFull(r.buf[:chunkLen], false) {
  260. return written, r.err
  261. }
  262. }
  263. }
  264. // decodeSnappy writes the decoding of src to dst. It assumes that the varint-encoded
  265. // length of the decompressed bytes has already been read.
  266. func decodeSnappy(blk *blockEnc, src []byte) error {
  267. //decodeRef(make([]byte, snappyMaxBlockSize), src)
  268. var s, length int
  269. lits := blk.extraLits
  270. var offset uint32
  271. for s < len(src) {
  272. switch src[s] & 0x03 {
  273. case snappyTagLiteral:
  274. x := uint32(src[s] >> 2)
  275. switch {
  276. case x < 60:
  277. s++
  278. case x == 60:
  279. s += 2
  280. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  281. println("uint(s) > uint(len(src)", s, src)
  282. return ErrSnappyCorrupt
  283. }
  284. x = uint32(src[s-1])
  285. case x == 61:
  286. s += 3
  287. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  288. println("uint(s) > uint(len(src)", s, src)
  289. return ErrSnappyCorrupt
  290. }
  291. x = uint32(src[s-2]) | uint32(src[s-1])<<8
  292. case x == 62:
  293. s += 4
  294. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  295. println("uint(s) > uint(len(src)", s, src)
  296. return ErrSnappyCorrupt
  297. }
  298. x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
  299. case x == 63:
  300. s += 5
  301. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  302. println("uint(s) > uint(len(src)", s, src)
  303. return ErrSnappyCorrupt
  304. }
  305. x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
  306. }
  307. if x > snappyMaxBlockSize {
  308. println("x > snappyMaxBlockSize", x, snappyMaxBlockSize)
  309. return ErrSnappyCorrupt
  310. }
  311. length = int(x) + 1
  312. if length <= 0 {
  313. println("length <= 0 ", length)
  314. return errUnsupportedLiteralLength
  315. }
  316. //if length > snappyMaxBlockSize-d || uint32(length) > len(src)-s {
  317. // return ErrSnappyCorrupt
  318. //}
  319. blk.literals = append(blk.literals, src[s:s+length]...)
  320. //println(length, "litLen")
  321. lits += length
  322. s += length
  323. continue
  324. case snappyTagCopy1:
  325. s += 2
  326. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  327. println("uint(s) > uint(len(src)", s, len(src))
  328. return ErrSnappyCorrupt
  329. }
  330. length = 4 + int(src[s-2])>>2&0x7
  331. offset = uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])
  332. case snappyTagCopy2:
  333. s += 3
  334. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  335. println("uint(s) > uint(len(src)", s, len(src))
  336. return ErrSnappyCorrupt
  337. }
  338. length = 1 + int(src[s-3])>>2
  339. offset = uint32(src[s-2]) | uint32(src[s-1])<<8
  340. case snappyTagCopy4:
  341. s += 5
  342. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  343. println("uint(s) > uint(len(src)", s, len(src))
  344. return ErrSnappyCorrupt
  345. }
  346. length = 1 + int(src[s-5])>>2
  347. offset = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
  348. }
  349. if offset <= 0 || blk.size+lits < int(offset) /*|| length > len(blk)-d */ {
  350. println("offset <= 0 || blk.size+lits < int(offset)", offset, blk.size+lits, int(offset), blk.size, lits)
  351. return ErrSnappyCorrupt
  352. }
  353. // Check if offset is one of the recent offsets.
  354. // Adjusts the output offset accordingly.
  355. // Gives a tiny bit of compression, typically around 1%.
  356. if false {
  357. offset = blk.matchOffset(offset, uint32(lits))
  358. } else {
  359. offset += 3
  360. }
  361. blk.sequences = append(blk.sequences, seq{
  362. litLen: uint32(lits),
  363. offset: offset,
  364. matchLen: uint32(length) - zstdMinMatch,
  365. })
  366. blk.size += length + lits
  367. lits = 0
  368. }
  369. blk.extraLits = lits
  370. return nil
  371. }
  372. func (r *SnappyConverter) readFull(p []byte, allowEOF bool) (ok bool) {
  373. if _, r.err = io.ReadFull(r.r, p); r.err != nil {
  374. if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
  375. r.err = ErrSnappyCorrupt
  376. }
  377. return false
  378. }
  379. return true
  380. }
  381. var crcTable = crc32.MakeTable(crc32.Castagnoli)
  382. // crc implements the checksum specified in section 3 of
  383. // https://github.com/google/snappy/blob/master/framing_format.txt
  384. func snappyCRC(b []byte) uint32 {
  385. c := crc32.Update(0, crcTable, b)
  386. return c>>15 | c<<17 + 0xa282ead8
  387. }
  388. // snappyDecodedLen returns the length of the decoded block and the number of bytes
  389. // that the length header occupied.
  390. func snappyDecodedLen(src []byte) (blockLen, headerLen int, err error) {
  391. v, n := binary.Uvarint(src)
  392. if n <= 0 || v > 0xffffffff {
  393. return 0, 0, ErrSnappyCorrupt
  394. }
  395. const wordSize = 32 << (^uint(0) >> 32 & 1)
  396. if wordSize == 32 && v > 0x7fffffff {
  397. return 0, 0, ErrSnappyTooLarge
  398. }
  399. return int(v), n, nil
  400. }