encode_better.go

// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package s2

import (
	"bytes"
	"fmt"
	"math/bits"
)
// hash4 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4(u uint64, h uint8) uint32 {
	const prime4bytes = 2654435761
	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
}

// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash5(u uint64, h uint8) uint32 {
	const prime5bytes = 889523592379
	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
}

// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash7(u uint64, h uint8) uint32 {
	const prime7bytes = 58295818150454627
	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
}

// hash8 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash8(u uint64, h uint8) uint32 {
	const prime8bytes = 0xcf1bbcdcb7a56463
	return uint32((u * prime8bytes) >> ((64 - h) & 63))
}
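
// The functions above are multiplicative hashes: the relevant low bytes of u
// are multiplied by a large odd constant and the top h bits of the product
// are kept as the table index. hash5 and hash7 first shift the input left
// (by 24 and 8 bits respectively) so that only the bytes of interest
// contribute to the high bits of the product.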

// encodeBlockBetterGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
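//
// A minimal calling sketch, for illustration only (it mirrors how the
// exported EncodeBetter wrapper prepares a block: size the destination with
// MaxEncodedLen and write the uncompressed length first):
//
//	dst := make([]byte, MaxEncodedLen(len(src)))
//	d := binary.PutUvarint(dst, uint64(len(src)))
//	d += encodeBlockBetterGo(dst[d:], src)
//	block := dst[:d]
//
// (A return of 0 means the block did not compress enough; the caller is
// expected to fall back to emitting src as literals.)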
func encodeBlockBetterGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 17
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
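	// (The output must save at least len(src)/32 bytes, about 3%, plus 6 bytes.)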
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on first attempt
	repeat := 0

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
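			// The step grows by one for every 128 bytes since the last emit,
			// so matchable data is scanned densely while incompressible
			// stretches are skipped over quickly.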
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				goto emitRemainder
			}

			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if cv == valLong {
				break
			}
			if cv == valShort {
				candidateL = candidateS
				break
			}

			// Check repeat at offset checkRep.
			const checkRep = 1

			// Minimum length of a repeat. Tested with various values.
			// While 4-5 offers improvements in some, 6 reduces
			// regressions significantly.
			const wantRepeatBytes = 6
			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
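			// Note that the `false &&` below short-circuits this repeat
			// check; it is disabled in this variant and kept for reference.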
			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + wantRepeatBytes + checkRep
				s += wantRepeatBytes + checkRep
				for s < len(src) {
					if len(src)-s < 8 {
						if src[s] == src[candidate] {
							s++
							candidate++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
				d += emitRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}

				// Index in-between
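				// Hash inward from both ends of the match, one long and one
				// short position per step, so the covered span stays indexed.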
				index0 := base + 1
				index1 := s - 2
				for index0 < index1 {
					cv0 := load64(src, index0)
					cv1 := load64(src, index1)
					lTable[hash7(cv0, lTableBits)] = uint32(index0)
					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

					lTable[hash7(cv1, lTableBits)] = uint32(index1)
					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
					index0 += 2
					index1 -= 2
				}

				cv = load64(src, s)
				continue
			}

			// Long likely matches 7, so take that.
			if uint32(cv) == uint32(valLong) {
				break
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is no better than the cost of encoding it.
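			// (A copy whose offset does not fit in 16 bits takes 5 bytes to
			// encode, so a match of 5 bytes or fewer saves nothing.)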
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if repeat == offset {
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Bail if we don't have space for more.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		// lTable could be postponed, but very minor difference.
		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index large values sparsely in between.
		// We do two starting from different offsets for speed.
		index2 := (index0 + index1 + 1) >> 1
		for index2 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
			index0 += 2
			index2 += 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}

// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin
	if len(src) < minNonLiteralBlockSize {
		return 0
	}

	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 16
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We initialize repeat to 0, so we never match on first attempt
	repeat := 0
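	// Cap how far a single step may skip, so mostly-incompressible input
	// cannot make the search race past short compressible regions.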
	const maxSkip = 100

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = (s-nextEmit)>>7 + 1
			if nextS > maxSkip {
				nextS = s + maxSkip
			} else {
				nextS += s
			}

			if nextS > sLimit {
				goto emitRemainder
			}

			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			if uint32(cv) == load32(src, candidateL) {
				break
			}

			// Check our short candidate
			if uint32(cv) == load32(src, candidateS) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is no better than the cost of encoding it.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
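		// The Snappy format has no repeat operation, so every match is
		// emitted as a full copy; repeat is still tracked for the bail
		// heuristic above.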
		d += emitCopyNoRepeat(dst[d:], offset, s-base)
		repeat = offset

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Bail if we don't have space for more.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index large values sparsely in between.
		// We do two starting from different offsets for speed.
		index2 := (index0 + index1 + 1) >> 1
		for index2 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
			index0 += 2
			index2 += 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}

// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	// Initialize the hash tables.
	const (
		// Long hash matches.
		lTableBits    = 17
		maxLTableSize = 1 << lTableBits

		// Short hash matches.
		sTableBits    = 14
		maxSTableSize = 1 << sTableBits

		maxAhead = 8 // maximum bytes ahead without checking sLimit
		debug    = false
	)

	sLimit := len(src) - inputMargin
	if sLimit > MaxDictSrcOffset-maxAhead {
		sLimit = MaxDictSrcOffset - maxAhead
	}
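	// (Past MaxDictSrcOffset the dictionary can no longer be referenced, so
	// the dict-aware loop stops there and the plain loop below takes over.)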
	if len(src) < minNonLiteralBlockSize {
		return 0
	}
	dict.initBetter()

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 6

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// With a dictionary, copies may reference the dictionary rather than
	// earlier bytes of src, so matching can start at s == 0.
	s := 0
	cv := load64(src, s)

	// Start repeat at the offset corresponding to the dictionary's default
	// repeat position.
	repeat := len(dict.dict) - dict.repeat

	// While in dict
searchDict:
	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				break searchDict
			}

			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			dictL := int(dict.betterTableLong[hashL])
			dictS := int(dict.betterTableShort[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if s != 0 {
				if cv == valLong {
					goto emitMatch
				}
				if cv == valShort {
					candidateL = candidateS
					goto emitMatch
				}
			}

			// Check dict repeat.
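			// (Only valid while the repeat distance still reaches at least
			// four bytes back into the dictionary.)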
			if repeat >= s+4 {
				candidate := len(dict.dict) - repeat + s
				if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
					// Extend back
					base := s
					for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
						i--
						base--
					}
					d += emitLiteral(dst[d:], src[nextEmit:base])
					if debug && nextEmit != base {
						fmt.Println("emitted ", base-nextEmit, "literals")
					}
					s += 4
					candidate += 4
					for candidate < len(dict.dict)-8 && s <= len(src)-8 {
						if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
							s += bits.TrailingZeros64(diff) >> 3
							break
						}
						s += 8
						candidate += 8
					}
					d += emitRepeat(dst[d:], repeat, s-base)
					if debug {
						fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
					}
					nextEmit = s
					if s >= sLimit {
						break searchDict
					}

					// Index in-between
					index0 := base + 1
					index1 := s - 2

					cv = load64(src, s)
					for index0 < index1 {
						cv0 := load64(src, index0)
						cv1 := load64(src, index1)
						lTable[hash7(cv0, lTableBits)] = uint32(index0)
						sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

						lTable[hash7(cv1, lTableBits)] = uint32(index1)
						sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
						index0 += 2
						index1 -= 2
					}
					continue
				}
			}

			// Don't try to find match at s==0
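			// (The tables are zero-initialized, so a candidate of 0 would
			// always self-match at s == 0.)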
			if s == 0 {
				cv = load64(src, nextS)
				s = nextS
				continue
			}

			// Long likely matches 7, so take that.
			if uint32(cv) == uint32(valLong) {
				goto emitMatch
			}

			// Long dict...
			if uint32(cv) == load32(dict.dict, dictL) {
				candidateL = dictL
				goto emitDict
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					goto emitMatch
				}
				// Use our short candidate.
				candidateL = candidateS
				goto emitMatch
			}
			if uint32(cv) == load32(dict.dict, dictS) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					goto emitMatch
				}
				candidateL = dictS
				goto emitDict
			}

			cv = load64(src, nextS)
			s = nextS
		}

	emitDict:
		{
			if debug {
				if load32(dict.dict, candidateL) != load32(src, s) {
					panic("dict emit mismatch")
				}
			}
			// Extend backwards.
			// The top bytes will be rechecked to get the full match.
			for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
				candidateL--
				s--
			}

			// Bail if we exceed the maximum size.
			if d+(s-nextEmit) > dstLimit {
				return 0
			}

			// A 4-byte match has been found. We'll later see if more than 4 bytes
			// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
			// them as literal bytes.
			d += emitLiteral(dst[d:], src[nextEmit:s])
			if debug && nextEmit != s {
				fmt.Println("emitted ", s-nextEmit, "literals")
			}
			{
				// Invariant: we have a 4-byte match at s, and no need to emit any
				// literal bytes prior to s.
				base := s
				offset := s + len(dict.dict) - candidateL

				// Extend the 4-byte match as long as possible.
				s += 4
				candidateL += 4
				for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
					if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidateL += 8
				}

				if repeat == offset {
					if debug {
						fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
					}
					d += emitRepeat(dst[d:], offset, s-base)
				} else {
					if debug {
						fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
					}
					// Matches longer than 64 are split.
					if s <= sLimit || s-base < 8 {
						d += emitCopy(dst[d:], offset, s-base)
					} else {
						// Split to ensure we don't start a copy within next block.
						d += emitCopy(dst[d:], offset, 4)
						d += emitRepeat(dst[d:], offset, s-base-4)
					}
					repeat = offset
				}

				if false {
					// Validate match.
					if s <= candidateL {
						panic("s <= candidate")
					}
					a := src[base:s]
					b := dict.dict[base-repeat : base-repeat+(s-base)]
					if !bytes.Equal(a, b) {
						panic("mismatch")
					}
				}

				nextEmit = s
				if s >= sLimit {
					break searchDict
				}

				if d > dstLimit {
					// Bail if we don't have space for more.
					return 0
				}

				// Index short & long
				index0 := base + 1
				index1 := s - 2

				cv0 := load64(src, index0)
				cv1 := load64(src, index1)
				lTable[hash7(cv0, lTableBits)] = uint32(index0)
				sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

				lTable[hash7(cv1, lTableBits)] = uint32(index1)
				sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
				index0 += 1
				index1 -= 1
				cv = load64(src, s)

				// Index every second long in between.
				for index0 < index1 {
					lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
					lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
					index0 += 2
					index1 -= 2
				}
			}
			continue
		}

	emitMatch:

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is no better than the cost of encoding it.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if debug && nextEmit != base {
			fmt.Println("emitted ", base-nextEmit, "literals")
		}
		if repeat == offset {
			if debug {
				fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
			}
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			if debug {
				fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
			}
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Bail if we don't have space for more.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index large values sparsely in between.
		// We do two starting from different offsets for speed.
		index2 := (index0 + index1 + 1) >> 1
		for index2 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
			index0 += 2
			index2 += 2
		}
	}

	// Search without dict:
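	// A repeat distance larger than s still points into the dictionary, and
	// the loop below can only address offsets within src, so drop it.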
	if repeat > s {
		repeat = 0
	}

	// No more dict
	sLimit = len(src) - inputMargin
	if s >= sLimit {
		goto emitRemainder
	}
	cv = load64(src, s)
	if debug {
		fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
	}

	for {
		candidateL := 0
		nextS := 0
		for {
			// Next src position to check
			nextS = s + (s-nextEmit)>>7 + 1
			if nextS > sLimit {
				goto emitRemainder
			}

			hashL := hash7(cv, lTableBits)
			hashS := hash4(cv, sTableBits)
			candidateL = int(lTable[hashL])
			candidateS := int(sTable[hashS])
			lTable[hashL] = uint32(s)
			sTable[hashS] = uint32(s)

			valLong := load64(src, candidateL)
			valShort := load64(src, candidateS)

			// If long matches at least 8 bytes, use that.
			if cv == valLong {
				break
			}
			if cv == valShort {
				candidateL = candidateS
				break
			}

			// Check repeat at offset checkRep.
			const checkRep = 1

			// Minimum length of a repeat. Tested with various values.
			// While 4-5 offers improvements in some, 6 reduces
			// regressions significantly.
			const wantRepeatBytes = 6
			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
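			// As above, the `false &&` below disables this repeat check.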
			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				candidate := s - repeat + wantRepeatBytes + checkRep
				s += wantRepeatBytes + checkRep
				for s < len(src) {
					if len(src)-s < 8 {
						if src[s] == src[candidate] {
							s++
							candidate++
							continue
						}
						break
					}
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
				d += emitRepeat(dst[d:], repeat, s-base)
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}

				// Index in-between
				index0 := base + 1
				index1 := s - 2
				for index0 < index1 {
					cv0 := load64(src, index0)
					cv1 := load64(src, index1)
					lTable[hash7(cv0, lTableBits)] = uint32(index0)
					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

					lTable[hash7(cv1, lTableBits)] = uint32(index1)
					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
					index0 += 2
					index1 -= 2
				}

				cv = load64(src, s)
				continue
			}

			// Long likely matches 7, so take that.
			if uint32(cv) == uint32(valLong) {
				break
			}

			// Check our short candidate
			if uint32(cv) == uint32(valShort) {
				// Try a long candidate at s+1
				hashL = hash7(cv>>8, lTableBits)
				candidateL = int(lTable[hashL])
				lTable[hashL] = uint32(s + 1)
				if uint32(cv>>8) == load32(src, candidateL) {
					s++
					break
				}
				// Use our short candidate.
				candidateL = candidateS
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards
		for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
			candidateL--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		base := s
		offset := base - candidateL

		// Extend the 4-byte match as long as possible.
		s += 4
		candidateL += 4
		for s < len(src) {
			if len(src)-s < 8 {
				if src[s] == src[candidateL] {
					s++
					candidateL++
					continue
				}
				break
			}
			if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
				s += bits.TrailingZeros64(diff) >> 3
				break
			}
			s += 8
			candidateL += 8
		}

		if offset > 65535 && s-base <= 5 && repeat != offset {
			// Bail if the match is no better than the cost of encoding it.
			s = nextS + 1
			if s >= sLimit {
				goto emitRemainder
			}
			cv = load64(src, s)
			continue
		}

		d += emitLiteral(dst[d:], src[nextEmit:base])
		if repeat == offset {
			d += emitRepeat(dst[d:], offset, s-base)
		} else {
			d += emitCopy(dst[d:], offset, s-base)
			repeat = offset
		}

		nextEmit = s
		if s >= sLimit {
			goto emitRemainder
		}

		if d > dstLimit {
			// Bail if we don't have space for more.
			return 0
		}

		// Index short & long
		index0 := base + 1
		index1 := s - 2

		cv0 := load64(src, index0)
		cv1 := load64(src, index1)
		lTable[hash7(cv0, lTableBits)] = uint32(index0)
		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

		lTable[hash7(cv1, lTableBits)] = uint32(index1)
		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
		index0 += 1
		index1 -= 1
		cv = load64(src, s)

		// Index large values sparsely in between.
		// We do two starting from different offsets for speed.
		index2 := (index0 + index1 + 1) >> 1
		for index2 < index1 {
			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
			lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
			index0 += 2
			index2 += 2
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}