zstd_lazy.c 101 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #include "zstd_compress_internal.h"
  11. #include "zstd_lazy.h"
  12. #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
  13. #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
  14. || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
  15. || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
  16. || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
  17. #define kLazySkippingStep 8
  18. /*-*************************************
  19. * Binary Tree search
  20. ***************************************/
  21. static
  22. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  23. void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
  24. const BYTE* ip, const BYTE* iend,
  25. U32 mls)
  26. {
  27. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  28. U32* const hashTable = ms->hashTable;
  29. U32 const hashLog = cParams->hashLog;
  30. U32* const bt = ms->chainTable;
  31. U32 const btLog = cParams->chainLog - 1;
  32. U32 const btMask = (1 << btLog) - 1;
  33. const BYTE* const base = ms->window.base;
  34. U32 const target = (U32)(ip - base);
  35. U32 idx = ms->nextToUpdate;
  36. if (idx != target)
  37. DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
  38. idx, target, ms->window.dictLimit);
  39. assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */
  40. (void)iend;
  41. assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */
  42. for ( ; idx < target ; idx++) {
  43. size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */
  44. U32 const matchIndex = hashTable[h];
  45. U32* const nextCandidatePtr = bt + 2*(idx&btMask);
  46. U32* const sortMarkPtr = nextCandidatePtr + 1;
  47. DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
  48. hashTable[h] = idx; /* Update Hash Table */
  49. *nextCandidatePtr = matchIndex; /* update BT like a chain */
  50. *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
  51. }
  52. ms->nextToUpdate = target;
  53. }
  54. /** ZSTD_insertDUBT1() :
  55. * sort one already inserted but unsorted position
  56. * assumption : curr >= btlow == (curr - btmask)
  57. * doesn't fail */
  58. static
  59. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  60. void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
  61. U32 curr, const BYTE* inputEnd,
  62. U32 nbCompares, U32 btLow,
  63. const ZSTD_dictMode_e dictMode)
  64. {
  65. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  66. U32* const bt = ms->chainTable;
  67. U32 const btLog = cParams->chainLog - 1;
  68. U32 const btMask = (1 << btLog) - 1;
  69. size_t commonLengthSmaller=0, commonLengthLarger=0;
  70. const BYTE* const base = ms->window.base;
  71. const BYTE* const dictBase = ms->window.dictBase;
  72. const U32 dictLimit = ms->window.dictLimit;
  73. const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
  74. const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
  75. const BYTE* const dictEnd = dictBase + dictLimit;
  76. const BYTE* const prefixStart = base + dictLimit;
  77. const BYTE* match;
  78. U32* smallerPtr = bt + 2*(curr&btMask);
  79. U32* largerPtr = smallerPtr + 1;
  80. U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
  81. U32 dummy32; /* to be nullified at the end */
  82. U32 const windowValid = ms->window.lowLimit;
  83. U32 const maxDistance = 1U << cParams->windowLog;
  84. U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
  85. DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
  86. curr, dictLimit, windowLow);
  87. assert(curr >= btLow);
  88. assert(ip < iend); /* condition for ZSTD_count */
  89. for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  90. U32* const nextPtr = bt + 2*(matchIndex & btMask);
  91. size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  92. assert(matchIndex < curr);
  93. /* note : all candidates are now supposed sorted,
  94. * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
  95. * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
  96. if ( (dictMode != ZSTD_extDict)
  97. || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
  98. || (curr < dictLimit) /* both in extDict */) {
  99. const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
  100. || (matchIndex+matchLength >= dictLimit)) ?
  101. base : dictBase;
  102. assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
  103. || (curr < dictLimit) );
  104. match = mBase + matchIndex;
  105. matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
  106. } else {
  107. match = dictBase + matchIndex;
  108. matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
  109. if (matchIndex+matchLength >= dictLimit)
  110. match = base + matchIndex; /* preparation for next read of match[matchLength] */
  111. }
  112. DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
  113. curr, matchIndex, (U32)matchLength);
  114. if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  115. break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
  116. }
  117. if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
  118. /* match is smaller than current */
  119. *smallerPtr = matchIndex; /* update smaller idx */
  120. commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
  121. if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */
  122. DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
  123. matchIndex, btLow, nextPtr[1]);
  124. smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */
  125. matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */
  126. } else {
  127. /* match is larger than current */
  128. *largerPtr = matchIndex;
  129. commonLengthLarger = matchLength;
  130. if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */
  131. DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
  132. matchIndex, btLow, nextPtr[0]);
  133. largerPtr = nextPtr;
  134. matchIndex = nextPtr[0];
  135. } }
  136. *smallerPtr = *largerPtr = 0;
  137. }
  138. static
  139. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  140. size_t ZSTD_DUBT_findBetterDictMatch (
  141. const ZSTD_matchState_t* ms,
  142. const BYTE* const ip, const BYTE* const iend,
  143. size_t* offsetPtr,
  144. size_t bestLength,
  145. U32 nbCompares,
  146. U32 const mls,
  147. const ZSTD_dictMode_e dictMode)
  148. {
  149. const ZSTD_matchState_t * const dms = ms->dictMatchState;
  150. const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
  151. const U32 * const dictHashTable = dms->hashTable;
  152. U32 const hashLog = dmsCParams->hashLog;
  153. size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
  154. U32 dictMatchIndex = dictHashTable[h];
  155. const BYTE* const base = ms->window.base;
  156. const BYTE* const prefixStart = base + ms->window.dictLimit;
  157. U32 const curr = (U32)(ip-base);
  158. const BYTE* const dictBase = dms->window.base;
  159. const BYTE* const dictEnd = dms->window.nextSrc;
  160. U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
  161. U32 const dictLowLimit = dms->window.lowLimit;
  162. U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
  163. U32* const dictBt = dms->chainTable;
  164. U32 const btLog = dmsCParams->chainLog - 1;
  165. U32 const btMask = (1 << btLog) - 1;
  166. U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
  167. size_t commonLengthSmaller=0, commonLengthLarger=0;
  168. (void)dictMode;
  169. assert(dictMode == ZSTD_dictMatchState);
  170. for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
  171. U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
  172. size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  173. const BYTE* match = dictBase + dictMatchIndex;
  174. matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
  175. if (dictMatchIndex+matchLength >= dictHighLimit)
  176. match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */
  177. if (matchLength > bestLength) {
  178. U32 matchIndex = dictMatchIndex + dictIndexDelta;
  179. if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
  180. DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
  181. curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
  182. bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  183. }
  184. if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
  185. break; /* drop, to guarantee consistency (miss a little bit of compression) */
  186. }
  187. }
  188. if (match[matchLength] < ip[matchLength]) {
  189. if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
  190. commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
  191. dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
  192. } else {
  193. /* match is larger than current */
  194. if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
  195. commonLengthLarger = matchLength;
  196. dictMatchIndex = nextPtr[0];
  197. }
  198. }
  199. if (bestLength >= MINMATCH) {
  200. U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
  201. DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
  202. curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
  203. }
  204. return bestLength;
  205. }
  206. static
  207. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  208. size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
  209. const BYTE* const ip, const BYTE* const iend,
  210. size_t* offBasePtr,
  211. U32 const mls,
  212. const ZSTD_dictMode_e dictMode)
  213. {
  214. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  215. U32* const hashTable = ms->hashTable;
  216. U32 const hashLog = cParams->hashLog;
  217. size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
  218. U32 matchIndex = hashTable[h];
  219. const BYTE* const base = ms->window.base;
  220. U32 const curr = (U32)(ip-base);
  221. U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
  222. U32* const bt = ms->chainTable;
  223. U32 const btLog = cParams->chainLog - 1;
  224. U32 const btMask = (1 << btLog) - 1;
  225. U32 const btLow = (btMask >= curr) ? 0 : curr - btMask;
  226. U32 const unsortLimit = MAX(btLow, windowLow);
  227. U32* nextCandidate = bt + 2*(matchIndex&btMask);
  228. U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1;
  229. U32 nbCompares = 1U << cParams->searchLog;
  230. U32 nbCandidates = nbCompares;
  231. U32 previousCandidate = 0;
  232. DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
  233. assert(ip <= iend-8); /* required for h calculation */
  234. assert(dictMode != ZSTD_dedicatedDictSearch);
  235. /* reach end of unsorted candidates list */
  236. while ( (matchIndex > unsortLimit)
  237. && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
  238. && (nbCandidates > 1) ) {
  239. DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
  240. matchIndex);
  241. *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */
  242. previousCandidate = matchIndex;
  243. matchIndex = *nextCandidate;
  244. nextCandidate = bt + 2*(matchIndex&btMask);
  245. unsortedMark = bt + 2*(matchIndex&btMask) + 1;
  246. nbCandidates --;
  247. }
  248. /* nullify last candidate if it's still unsorted
  249. * simplification, detrimental to compression ratio, beneficial for speed */
  250. if ( (matchIndex > unsortLimit)
  251. && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
  252. DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
  253. matchIndex);
  254. *nextCandidate = *unsortedMark = 0;
  255. }
  256. /* batch sort stacked candidates */
  257. matchIndex = previousCandidate;
  258. while (matchIndex) { /* will end on matchIndex == 0 */
  259. U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
  260. U32 const nextCandidateIdx = *nextCandidateIdxPtr;
  261. ZSTD_insertDUBT1(ms, matchIndex, iend,
  262. nbCandidates, unsortLimit, dictMode);
  263. matchIndex = nextCandidateIdx;
  264. nbCandidates++;
  265. }
  266. /* find longest match */
  267. { size_t commonLengthSmaller = 0, commonLengthLarger = 0;
  268. const BYTE* const dictBase = ms->window.dictBase;
  269. const U32 dictLimit = ms->window.dictLimit;
  270. const BYTE* const dictEnd = dictBase + dictLimit;
  271. const BYTE* const prefixStart = base + dictLimit;
  272. U32* smallerPtr = bt + 2*(curr&btMask);
  273. U32* largerPtr = bt + 2*(curr&btMask) + 1;
  274. U32 matchEndIdx = curr + 8 + 1;
  275. U32 dummy32; /* to be nullified at the end */
  276. size_t bestLength = 0;
  277. matchIndex = hashTable[h];
  278. hashTable[h] = curr; /* Update Hash Table */
  279. for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
  280. U32* const nextPtr = bt + 2*(matchIndex & btMask);
  281. size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
  282. const BYTE* match;
  283. if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
  284. match = base + matchIndex;
  285. matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
  286. } else {
  287. match = dictBase + matchIndex;
  288. matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
  289. if (matchIndex+matchLength >= dictLimit)
  290. match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
  291. }
  292. if (matchLength > bestLength) {
  293. if (matchLength > matchEndIdx - matchIndex)
  294. matchEndIdx = matchIndex + (U32)matchLength;
  295. if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
  296. bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  297. if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
  298. if (dictMode == ZSTD_dictMatchState) {
  299. nbCompares = 0; /* in addition to avoiding checking any
  300. * further in this loop, make sure we
  301. * skip checking in the dictionary. */
  302. }
  303. break; /* drop, to guarantee consistency (miss a little bit of compression) */
  304. }
  305. }
  306. if (match[matchLength] < ip[matchLength]) {
  307. /* match is smaller than current */
  308. *smallerPtr = matchIndex; /* update smaller idx */
  309. commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
  310. if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
  311. smallerPtr = nextPtr+1; /* new "smaller" => larger of match */
  312. matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
  313. } else {
  314. /* match is larger than current */
  315. *largerPtr = matchIndex;
  316. commonLengthLarger = matchLength;
  317. if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
  318. largerPtr = nextPtr;
  319. matchIndex = nextPtr[0];
  320. } }
  321. *smallerPtr = *largerPtr = 0;
  322. assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  323. if (dictMode == ZSTD_dictMatchState && nbCompares) {
  324. bestLength = ZSTD_DUBT_findBetterDictMatch(
  325. ms, ip, iend,
  326. offBasePtr, bestLength, nbCompares,
  327. mls, dictMode);
  328. }
  329. assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
  330. ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
  331. if (bestLength >= MINMATCH) {
  332. U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
  333. DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
  334. curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
  335. }
  336. return bestLength;
  337. }
  338. }
  339. /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
  340. FORCE_INLINE_TEMPLATE
  341. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  342. size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
  343. const BYTE* const ip, const BYTE* const iLimit,
  344. size_t* offBasePtr,
  345. const U32 mls /* template */,
  346. const ZSTD_dictMode_e dictMode)
  347. {
  348. DEBUGLOG(7, "ZSTD_BtFindBestMatch");
  349. if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
  350. ZSTD_updateDUBT(ms, ip, iLimit, mls);
  351. return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
  352. }
  353. /***********************************
  354. * Dedicated dict search
  355. ***********************************/
  356. void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
  357. {
  358. const BYTE* const base = ms->window.base;
  359. U32 const target = (U32)(ip - base);
  360. U32* const hashTable = ms->hashTable;
  361. U32* const chainTable = ms->chainTable;
  362. U32 const chainSize = 1 << ms->cParams.chainLog;
  363. U32 idx = ms->nextToUpdate;
  364. U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
  365. U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
  366. U32 const cacheSize = bucketSize - 1;
  367. U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
  368. U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
  369. /* We know the hashtable is oversized by a factor of `bucketSize`.
  370. * We are going to temporarily pretend `bucketSize == 1`, keeping only a
  371. * single entry. We will use the rest of the space to construct a temporary
  372. * chaintable.
  373. */
  374. U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
  375. U32* const tmpHashTable = hashTable;
  376. U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
  377. U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
  378. U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
  379. U32 hashIdx;
  380. assert(ms->cParams.chainLog <= 24);
  381. assert(ms->cParams.hashLog > ms->cParams.chainLog);
  382. assert(idx != 0);
  383. assert(tmpMinChain <= minChain);
  384. /* fill conventional hash table and conventional chain table */
  385. for ( ; idx < target; idx++) {
  386. U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
  387. if (idx >= tmpMinChain) {
  388. tmpChainTable[idx - tmpMinChain] = hashTable[h];
  389. }
  390. tmpHashTable[h] = idx;
  391. }
  392. /* sort chains into ddss chain table */
  393. {
  394. U32 chainPos = 0;
  395. for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
  396. U32 count;
  397. U32 countBeyondMinChain = 0;
  398. U32 i = tmpHashTable[hashIdx];
  399. for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
  400. /* skip through the chain to the first position that won't be
  401. * in the hash cache bucket */
  402. if (i < minChain) {
  403. countBeyondMinChain++;
  404. }
  405. i = tmpChainTable[i - tmpMinChain];
  406. }
  407. if (count == cacheSize) {
  408. for (count = 0; count < chainLimit;) {
  409. if (i < minChain) {
  410. if (!i || ++countBeyondMinChain > cacheSize) {
  411. /* only allow pulling `cacheSize` number of entries
  412. * into the cache or chainTable beyond `minChain`,
  413. * to replace the entries pulled out of the
  414. * chainTable into the cache. This lets us reach
  415. * back further without increasing the total number
  416. * of entries in the chainTable, guaranteeing the
  417. * DDSS chain table will fit into the space
  418. * allocated for the regular one. */
  419. break;
  420. }
  421. }
  422. chainTable[chainPos++] = i;
  423. count++;
  424. if (i < tmpMinChain) {
  425. break;
  426. }
  427. i = tmpChainTable[i - tmpMinChain];
  428. }
  429. } else {
  430. count = 0;
  431. }
  432. if (count) {
  433. tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
  434. } else {
  435. tmpHashTable[hashIdx] = 0;
  436. }
  437. }
  438. assert(chainPos <= chainSize); /* I believe this is guaranteed... */
  439. }
  440. /* move chain pointers into the last entry of each hash bucket */
  441. for (hashIdx = (1 << hashLog); hashIdx; ) {
  442. U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
  443. U32 const chainPackedPointer = tmpHashTable[hashIdx];
  444. U32 i;
  445. for (i = 0; i < cacheSize; i++) {
  446. hashTable[bucketIdx + i] = 0;
  447. }
  448. hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
  449. }
  450. /* fill the buckets of the hash table */
  451. for (idx = ms->nextToUpdate; idx < target; idx++) {
  452. U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
  453. << ZSTD_LAZY_DDSS_BUCKET_LOG;
  454. U32 i;
  455. /* Shift hash cache down 1. */
  456. for (i = cacheSize - 1; i; i--)
  457. hashTable[h + i] = hashTable[h + i - 1];
  458. hashTable[h] = idx;
  459. }
  460. ms->nextToUpdate = target;
  461. }
  462. /* Returns the longest match length found in the dedicated dict search structure.
  463. * If none are longer than the argument ml, then ml will be returned.
  464. */
  465. FORCE_INLINE_TEMPLATE
  466. size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
  467. const ZSTD_matchState_t* const dms,
  468. const BYTE* const ip, const BYTE* const iLimit,
  469. const BYTE* const prefixStart, const U32 curr,
  470. const U32 dictLimit, const size_t ddsIdx) {
  471. const U32 ddsLowestIndex = dms->window.dictLimit;
  472. const BYTE* const ddsBase = dms->window.base;
  473. const BYTE* const ddsEnd = dms->window.nextSrc;
  474. const U32 ddsSize = (U32)(ddsEnd - ddsBase);
  475. const U32 ddsIndexDelta = dictLimit - ddsSize;
  476. const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
  477. const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
  478. U32 ddsAttempt;
  479. U32 matchIndex;
  480. for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
  481. PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
  482. }
  483. {
  484. U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
  485. U32 const chainIndex = chainPackedPointer >> 8;
  486. PREFETCH_L1(&dms->chainTable[chainIndex]);
  487. }
  488. for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
  489. size_t currentMl=0;
  490. const BYTE* match;
  491. matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
  492. match = ddsBase + matchIndex;
  493. if (!matchIndex) {
  494. return ml;
  495. }
  496. /* guaranteed by table construction */
  497. (void)ddsLowestIndex;
  498. assert(matchIndex >= ddsLowestIndex);
  499. assert(match+4 <= ddsEnd);
  500. if (MEM_read32(match) == MEM_read32(ip)) {
  501. /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  502. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
  503. }
  504. /* save best solution */
  505. if (currentMl > ml) {
  506. ml = currentMl;
  507. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
  508. if (ip+currentMl == iLimit) {
  509. /* best possible, avoids read overflow on next attempt */
  510. return ml;
  511. }
  512. }
  513. }
  514. {
  515. U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
  516. U32 chainIndex = chainPackedPointer >> 8;
  517. U32 const chainLength = chainPackedPointer & 0xFF;
  518. U32 const chainAttempts = nbAttempts - ddsAttempt;
  519. U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
  520. U32 chainAttempt;
  521. for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
  522. PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
  523. }
  524. for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
  525. size_t currentMl=0;
  526. const BYTE* match;
  527. matchIndex = dms->chainTable[chainIndex];
  528. match = ddsBase + matchIndex;
  529. /* guaranteed by table construction */
  530. assert(matchIndex >= ddsLowestIndex);
  531. assert(match+4 <= ddsEnd);
  532. if (MEM_read32(match) == MEM_read32(ip)) {
  533. /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  534. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
  535. }
  536. /* save best solution */
  537. if (currentMl > ml) {
  538. ml = currentMl;
  539. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
  540. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  541. }
  542. }
  543. }
  544. return ml;
  545. }
  546. /* *********************************
  547. * Hash Chain
  548. ***********************************/
  549. #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
  550. /* Update chains up to ip (excluded)
  551. Assumption : always within prefix (i.e. not within extDict) */
  552. FORCE_INLINE_TEMPLATE
  553. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  554. U32 ZSTD_insertAndFindFirstIndex_internal(
  555. ZSTD_matchState_t* ms,
  556. const ZSTD_compressionParameters* const cParams,
  557. const BYTE* ip, U32 const mls, U32 const lazySkipping)
  558. {
  559. U32* const hashTable = ms->hashTable;
  560. const U32 hashLog = cParams->hashLog;
  561. U32* const chainTable = ms->chainTable;
  562. const U32 chainMask = (1 << cParams->chainLog) - 1;
  563. const BYTE* const base = ms->window.base;
  564. const U32 target = (U32)(ip - base);
  565. U32 idx = ms->nextToUpdate;
  566. while(idx < target) { /* catch up */
  567. size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
  568. NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
  569. hashTable[h] = idx;
  570. idx++;
  571. /* Stop inserting every position when in the lazy skipping mode. */
  572. if (lazySkipping)
  573. break;
  574. }
  575. ms->nextToUpdate = target;
  576. return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
  577. }
  578. U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
  579. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  580. return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
  581. }
  582. /* inlining is important to hardwire a hot branch (template emulation) */
  583. FORCE_INLINE_TEMPLATE
  584. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  585. size_t ZSTD_HcFindBestMatch(
  586. ZSTD_matchState_t* ms,
  587. const BYTE* const ip, const BYTE* const iLimit,
  588. size_t* offsetPtr,
  589. const U32 mls, const ZSTD_dictMode_e dictMode)
  590. {
  591. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  592. U32* const chainTable = ms->chainTable;
  593. const U32 chainSize = (1 << cParams->chainLog);
  594. const U32 chainMask = chainSize-1;
  595. const BYTE* const base = ms->window.base;
  596. const BYTE* const dictBase = ms->window.dictBase;
  597. const U32 dictLimit = ms->window.dictLimit;
  598. const BYTE* const prefixStart = base + dictLimit;
  599. const BYTE* const dictEnd = dictBase + dictLimit;
  600. const U32 curr = (U32)(ip-base);
  601. const U32 maxDistance = 1U << cParams->windowLog;
  602. const U32 lowestValid = ms->window.lowLimit;
  603. const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
  604. const U32 isDictionary = (ms->loadedDictEnd != 0);
  605. const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
  606. const U32 minChain = curr > chainSize ? curr - chainSize : 0;
  607. U32 nbAttempts = 1U << cParams->searchLog;
  608. size_t ml=4-1;
  609. const ZSTD_matchState_t* const dms = ms->dictMatchState;
  610. const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
  611. ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
  612. const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
  613. ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
  614. U32 matchIndex;
  615. if (dictMode == ZSTD_dedicatedDictSearch) {
  616. const U32* entry = &dms->hashTable[ddsIdx];
  617. PREFETCH_L1(entry);
  618. }
  619. /* HC4 match finder */
  620. matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
  621. for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
  622. size_t currentMl=0;
  623. if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  624. const BYTE* const match = base + matchIndex;
  625. assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
  626. /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
  627. if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  628. currentMl = ZSTD_count(ip, match, iLimit);
  629. } else {
  630. const BYTE* const match = dictBase + matchIndex;
  631. assert(match+4 <= dictEnd);
  632. if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  633. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
  634. }
  635. /* save best solution */
  636. if (currentMl > ml) {
  637. ml = currentMl;
  638. *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  639. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  640. }
  641. if (matchIndex <= minChain) break;
  642. matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
  643. }
  644. assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  645. if (dictMode == ZSTD_dedicatedDictSearch) {
  646. ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
  647. ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
  648. } else if (dictMode == ZSTD_dictMatchState) {
  649. const U32* const dmsChainTable = dms->chainTable;
  650. const U32 dmsChainSize = (1 << dms->cParams.chainLog);
  651. const U32 dmsChainMask = dmsChainSize - 1;
  652. const U32 dmsLowestIndex = dms->window.dictLimit;
  653. const BYTE* const dmsBase = dms->window.base;
  654. const BYTE* const dmsEnd = dms->window.nextSrc;
  655. const U32 dmsSize = (U32)(dmsEnd - dmsBase);
  656. const U32 dmsIndexDelta = dictLimit - dmsSize;
  657. const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
  658. matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
  659. for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
  660. size_t currentMl=0;
  661. const BYTE* const match = dmsBase + matchIndex;
  662. assert(match+4 <= dmsEnd);
  663. if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  664. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
  665. /* save best solution */
  666. if (currentMl > ml) {
  667. ml = currentMl;
  668. assert(curr > matchIndex + dmsIndexDelta);
  669. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
  670. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  671. }
  672. if (matchIndex <= dmsMinChain) break;
  673. matchIndex = dmsChainTable[matchIndex & dmsChainMask];
  674. }
  675. }
  676. return ml;
  677. }
  678. /* *********************************
  679. * (SIMD) Row-based matchfinder
  680. ***********************************/
  681. /* Constants for row-based hash */
  682. #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
  683. #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
  684. #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
  685. typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
  686. /* ZSTD_VecMask_next():
  687. * Starting from the LSB, returns the idx of the next non-zero bit.
  688. * Basically counting the nb of trailing zeroes.
  689. */
  690. MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
  691. return ZSTD_countTrailingZeros64(val);
  692. }
  693. /* ZSTD_row_nextIndex():
  694. * Returns the next index to insert at within a tagTable row, and updates the "head"
  695. * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
  696. */
  697. FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
  698. U32 next = (*tagRow-1) & rowMask;
  699. next += (next == 0) ? rowMask : 0; /* skip first position */
  700. *tagRow = (BYTE)next;
  701. return next;
  702. }
  703. /* ZSTD_isAligned():
  704. * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
  705. */
  706. MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
  707. assert((align & (align - 1)) == 0);
  708. return (((size_t)ptr) & (align - 1)) == 0;
  709. }
  710. /* ZSTD_row_prefetch():
  711. * Performs prefetching for the hashTable and tagTable at a given row.
  712. */
  713. FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
  714. PREFETCH_L1(hashTable + relRow);
  715. if (rowLog >= 5) {
  716. PREFETCH_L1(hashTable + relRow + 16);
  717. /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
  718. }
  719. PREFETCH_L1(tagTable + relRow);
  720. if (rowLog == 6) {
  721. PREFETCH_L1(tagTable + relRow + 32);
  722. }
  723. assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
  724. assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
  725. assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
  726. }
  727. /* ZSTD_row_fillHashCache():
  728. * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
  729. * but not beyond iLimit.
  730. */
  731. FORCE_INLINE_TEMPLATE
  732. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  733. void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
  734. U32 const rowLog, U32 const mls,
  735. U32 idx, const BYTE* const iLimit)
  736. {
  737. U32 const* const hashTable = ms->hashTable;
  738. BYTE const* const tagTable = ms->tagTable;
  739. U32 const hashLog = ms->rowHashLog;
  740. U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
  741. U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
  742. for (; idx < lim; ++idx) {
  743. U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
  744. U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  745. ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
  746. ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
  747. }
  748. DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
  749. ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
  750. ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
  751. }
  752. /* ZSTD_row_nextCachedHash():
  753. * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
  754. * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
  755. */
  756. FORCE_INLINE_TEMPLATE
  757. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  758. U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
  759. BYTE const* tagTable, BYTE const* base,
  760. U32 idx, U32 const hashLog,
  761. U32 const rowLog, U32 const mls,
  762. U64 const hashSalt)
  763. {
  764. U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
  765. U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  766. ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
  767. { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
  768. cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
  769. return hash;
  770. }
  771. }
  772. /* ZSTD_row_update_internalImpl():
  773. * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
  774. */
  775. FORCE_INLINE_TEMPLATE
  776. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  777. void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
  778. U32 updateStartIdx, U32 const updateEndIdx,
  779. U32 const mls, U32 const rowLog,
  780. U32 const rowMask, U32 const useCache)
  781. {
  782. U32* const hashTable = ms->hashTable;
  783. BYTE* const tagTable = ms->tagTable;
  784. U32 const hashLog = ms->rowHashLog;
  785. const BYTE* const base = ms->window.base;
  786. DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
  787. for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
  788. U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
  789. : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
  790. U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  791. U32* const row = hashTable + relRow;
  792. BYTE* tagRow = tagTable + relRow;
  793. U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
  794. assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
  795. tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
  796. row[pos] = updateStartIdx;
  797. }
  798. }
  799. /* ZSTD_row_update_internal():
  800. * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
  801. * Skips sections of long matches as is necessary.
  802. */
  803. FORCE_INLINE_TEMPLATE
  804. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  805. void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
  806. U32 const mls, U32 const rowLog,
  807. U32 const rowMask, U32 const useCache)
  808. {
  809. U32 idx = ms->nextToUpdate;
  810. const BYTE* const base = ms->window.base;
  811. const U32 target = (U32)(ip - base);
  812. const U32 kSkipThreshold = 384;
  813. const U32 kMaxMatchStartPositionsToUpdate = 96;
  814. const U32 kMaxMatchEndPositionsToUpdate = 32;
  815. if (useCache) {
  816. /* Only skip positions when using hash cache, i.e.
  817. * if we are loading a dict, don't skip anything.
  818. * If we decide to skip, then we only update a set number
  819. * of positions at the beginning and end of the match.
  820. */
  821. if (UNLIKELY(target - idx > kSkipThreshold)) {
  822. U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
  823. ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
  824. idx = target - kMaxMatchEndPositionsToUpdate;
  825. ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
  826. }
  827. }
  828. assert(target >= idx);
  829. ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
  830. ms->nextToUpdate = target;
  831. }
  832. /* ZSTD_row_update():
  833. * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
  834. * processing.
  835. */
  836. void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
  837. const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
  838. const U32 rowMask = (1u << rowLog) - 1;
  839. const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
  840. DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
  841. ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
  842. }
  843. /* Returns the mask width of bits group of which will be set to 1. Given not all
  844. * architectures have easy movemask instruction, this helps to iterate over
  845. * groups of bits easier and faster.
  846. */
  847. FORCE_INLINE_TEMPLATE U32
  848. ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
  849. {
  850. assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
  851. assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
  852. (void)rowEntries;
  853. #if defined(ZSTD_ARCH_ARM_NEON)
  854. /* NEON path only works for little endian */
  855. if (!MEM_isLittleEndian()) {
  856. return 1;
  857. }
  858. if (rowEntries == 16) {
  859. return 4;
  860. }
  861. if (rowEntries == 32) {
  862. return 2;
  863. }
  864. if (rowEntries == 64) {
  865. return 1;
  866. }
  867. #endif
  868. return 1;
  869. }
  #if defined(ZSTD_ARCH_X86_SSE2)
  /* ZSTD_row_getSSEMask():
   * Compares nbChunks consecutive 16-byte chunks of src against tag, and gathers the
   * per-byte results into one bit per byte (chunk 0 in the lowest 16 bits).
   * The resulting 16, 32 or 64-bit mask is rotated right by head before being returned.
   * nbChunks must be 1, 2 or 4.
   */
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
  ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
  {
      const __m128i tagSplat = _mm_set1_epi8((char)tag);
      U64 bits = 0;
      int n;

      assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
      for (n = 0; n < nbChunks; n++) {
          const __m128i tags = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*n));
          int const equalBits = _mm_movemask_epi8(_mm_cmpeq_epi8(tags, tagSplat));
          bits |= (U64)equalBits << (16*n);
      }

      switch (nbChunks) {
      case 1:
          return ZSTD_rotateRight_U16((U16)bits, head);
      case 2:
          return ZSTD_rotateRight_U32((U32)bits, head);
      default:
          assert(nbChunks == 4);
          return ZSTD_rotateRight_U64(bits, head);
      }
  }
  #endif
  #if defined(ZSTD_ARCH_ARM_NEON)
  /* ZSTD_row_getNEONMask():
   * NEON flavour of the tag comparison : compares the rowEntries tags at src against tag
   * and returns a 64-bit mask, rotated right by headGrouped.
   * Each entry is represented by a group of 64/rowEntries bits, of which a single one
   * is kept (16 and 32 entries) or which is a single bit already (64 entries).
   * rowEntries must be 16, 32 or 64.
   */
  FORCE_INLINE_TEMPLATE ZSTD_VecMask
  ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
  {
      assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);

      if (rowEntries == 16) {
          /* vshrn_n_u16 shifts every u16 lane right by 4 and keeps its 8 lower bits,
           * leaving 4 bits per compared byte. Only the highest bit of each group
           * survives the final AND with 0x88 = 0b10001000.
           */
          const uint8x16_t tags     = vld1q_u8(src);
          const uint16x8_t isEqual  = vreinterpretq_u16_u8(vceqq_u8(tags, vdupq_n_u8(tag)));
          const uint8x8_t  narrowed = vshrn_n_u16(isEqual, 4);
          const U64 bits = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
          return ZSTD_rotateRight_U64(bits, headGrouped) & 0x8888888888888888ull;
      }

      if (rowEntries == 32) {
          /* Same idea as with 16 entries, this time with 2 bits per compared byte,
           * of which the final AND with 0x55 = 0b01010101 keeps one.
           */
          const uint16x8x2_t loaded = vld2q_u16((const uint16_t*)(const void*)src);
          const uint8x16_t tagSplat = vdupq_n_u8(tag);
          const uint8x16_t half0    = vreinterpretq_u8_u16(loaded.val[0]);
          const uint8x16_t half1    = vreinterpretq_u8_u16(loaded.val[1]);
          const uint8x8_t  eq0      = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(half0, tagSplat)), 6);
          const uint8x8_t  eq1      = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(half1, tagSplat)), 6);
          const uint8x8_t  merged   = vsli_n_u8(eq0, eq1, 4);
          const U64 bits = vget_lane_u64(vreinterpret_u64_u8(merged), 0) ;
          return ZSTD_rotateRight_U64(bits, headGrouped) & 0x5555555555555555ull;
      }

      /* rowEntries == 64 : the 4 de-interleaved comparison results are folded
       * together with shift-right-and-insert, down to one bit per compared byte */
      {   const uint8x16x4_t loaded = vld4q_u8(src);
          const uint8x16_t tagSplat = vdupq_n_u8(tag);
          const uint8x16_t eq0 = vceqq_u8(loaded.val[0], tagSplat);
          const uint8x16_t eq1 = vceqq_u8(loaded.val[1], tagSplat);
          const uint8x16_t eq2 = vceqq_u8(loaded.val[2], tagSplat);
          const uint8x16_t eq3 = vceqq_u8(loaded.val[3], tagSplat);

          const uint8x16_t pair01 = vsriq_n_u8(eq1, eq0, 1);
          const uint8x16_t pair23 = vsriq_n_u8(eq3, eq2, 1);
          const uint8x16_t quad   = vsriq_n_u8(pair23, pair01, 2);
          const uint8x16_t folded = vsriq_n_u8(quad, quad, 4);
          const uint8x8_t  packed = vshrn_n_u16(vreinterpretq_u16_u8(folded), 4);
          const U64 bits = vget_lane_u64(vreinterpret_u64_u8(packed), 0);
          return ZSTD_rotateRight_U64(bits, headGrouped);
      }
  }
  #endif
  935. /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
  936. * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
  937. * matches the hash at the nth position in a row of the tagTable.
  938. * Each row is a circular buffer beginning at the value of "headGrouped". So we
  939. * must rotate the "matches" bitfield to match up with the actual layout of the
  940. * entries within the hashTable */
  941. FORCE_INLINE_TEMPLATE ZSTD_VecMask
  942. ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
  943. {
  944. const BYTE* const src = tagRow;
  945. assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
  946. assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
  947. assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
  948. #if defined(ZSTD_ARCH_X86_SSE2)
  949. return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
  950. #else /* SW or NEON-LE */
  951. # if defined(ZSTD_ARCH_ARM_NEON)
  952. /* This NEON path only works for little endian - otherwise use SWAR below */
  953. if (MEM_isLittleEndian()) {
  954. return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
  955. }
  956. # endif /* ZSTD_ARCH_ARM_NEON */
  957. /* SWAR */
  958. { const int chunkSize = sizeof(size_t);
  959. const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
  960. const size_t xFF = ~((size_t)0);
  961. const size_t x01 = xFF / 0xFF;
  962. const size_t x80 = x01 << 7;
  963. const size_t splatChar = tag * x01;
  964. ZSTD_VecMask matches = 0;
  965. int i = rowEntries - chunkSize;
  966. assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
  967. if (MEM_isLittleEndian()) { /* runtime check so have two loops */
  968. const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
  969. do {
  970. size_t chunk = MEM_readST(&src[i]);
  971. chunk ^= splatChar;
  972. chunk = (((chunk | x80) - x01) | chunk) & x80;
  973. matches <<= chunkSize;
  974. matches |= (chunk * extractMagic) >> shiftAmount;
  975. i -= chunkSize;
  976. } while (i >= 0);
  977. } else { /* big endian: reverse bits during extraction */
  978. const size_t msb = xFF ^ (xFF >> 1);
  979. const size_t extractMagic = (msb / 0x1FF) | msb;
  980. do {
  981. size_t chunk = MEM_readST(&src[i]);
  982. chunk ^= splatChar;
  983. chunk = (((chunk | x80) - x01) | chunk) & x80;
  984. matches <<= chunkSize;
  985. matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
  986. i -= chunkSize;
  987. } while (i >= 0);
  988. }
  989. matches = ~matches;
  990. if (rowEntries == 16) {
  991. return ZSTD_rotateRight_U16((U16)matches, headGrouped);
  992. } else if (rowEntries == 32) {
  993. return ZSTD_rotateRight_U32((U32)matches, headGrouped);
  994. } else {
  995. return ZSTD_rotateRight_U64((U64)matches, headGrouped);
  996. }
  997. }
  998. #endif
  999. }
  /* The high-level approach of the SIMD row based match finder is as follows:
   * - Figure out where to insert the new entry:
   *      - Generate a hash for the current input position and split it into a one-byte tag and `rowHashLog` bits of index.
   *           - The hash is salted by a value that changes on every context reset, so when the same table is reused
   *             we avoid collisions that would otherwise slow us down by introducing phantom matches.
   *      - The hashTable is effectively split into groups or "rows" of 15, 31 or 63 entries of U32, and the index determines
   *        which row to insert into.
   *      - Determine the correct position within the row to insert the entry into. Each row of 15, 31 or 63 entries can
   *        be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16, 32 or 64 bytes
   *        per row).
   * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
   *   generate a bitfield that we can cycle through to check the collisions in the hash table.
   * - Pick the longest match.
   * - Insert the tag into the equivalent row and position in the tagTable.
   */
  1015. FORCE_INLINE_TEMPLATE
  1016. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  1017. size_t ZSTD_RowFindBestMatch(
  1018. ZSTD_matchState_t* ms,
  1019. const BYTE* const ip, const BYTE* const iLimit,
  1020. size_t* offsetPtr,
  1021. const U32 mls, const ZSTD_dictMode_e dictMode,
  1022. const U32 rowLog)
  1023. {
  1024. U32* const hashTable = ms->hashTable;
  1025. BYTE* const tagTable = ms->tagTable;
  1026. U32* const hashCache = ms->hashCache;
  1027. const U32 hashLog = ms->rowHashLog;
  1028. const ZSTD_compressionParameters* const cParams = &ms->cParams;
  1029. const BYTE* const base = ms->window.base;
  1030. const BYTE* const dictBase = ms->window.dictBase;
  1031. const U32 dictLimit = ms->window.dictLimit;
  1032. const BYTE* const prefixStart = base + dictLimit;
  1033. const BYTE* const dictEnd = dictBase + dictLimit;
  1034. const U32 curr = (U32)(ip-base);
  1035. const U32 maxDistance = 1U << cParams->windowLog;
  1036. const U32 lowestValid = ms->window.lowLimit;
  1037. const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
  1038. const U32 isDictionary = (ms->loadedDictEnd != 0);
  1039. const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
  1040. const U32 rowEntries = (1U << rowLog);
  1041. const U32 rowMask = rowEntries - 1;
  1042. const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
  1043. const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
  1044. const U64 hashSalt = ms->hashSalt;
  1045. U32 nbAttempts = 1U << cappedSearchLog;
  1046. size_t ml=4-1;
  1047. U32 hash;
  /* DMS/DDS variables that may be referenced later */
  1049. const ZSTD_matchState_t* const dms = ms->dictMatchState;
  1050. /* Initialize the following variables to satisfy static analyzer */
  1051. size_t ddsIdx = 0;
  1052. U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
  1053. U32 dmsTag = 0;
  1054. U32* dmsRow = NULL;
  1055. BYTE* dmsTagRow = NULL;
  1056. if (dictMode == ZSTD_dedicatedDictSearch) {
  1057. const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
  1058. { /* Prefetch DDS hashtable entry */
  1059. ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
  1060. PREFETCH_L1(&dms->hashTable[ddsIdx]);
  1061. }
  1062. ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
  1063. }
  1064. if (dictMode == ZSTD_dictMatchState) {
  1065. /* Prefetch DMS rows */
  1066. U32* const dmsHashTable = dms->hashTable;
  1067. BYTE* const dmsTagTable = dms->tagTable;
  1068. U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
  1069. U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  1070. dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
  1071. dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
  1072. dmsRow = dmsHashTable + dmsRelRow;
  1073. ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
  1074. }
  1075. /* Update the hashTable and tagTable up to (but not including) ip */
  1076. if (!ms->lazySkipping) {
  1077. ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
  1078. hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
  1079. } else {
  1080. /* Stop inserting every position when in the lazy skipping mode.
  1081. * The hash cache is also not kept up to date in this mode.
  1082. */
  1083. hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
  1084. ms->nextToUpdate = curr;
  1085. }
  1086. ms->hashSaltEntropy += hash; /* collect salt entropy */
  1087. { /* Get the hash for ip, compute the appropriate row */
  1088. U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
  1089. U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
  1090. U32* const row = hashTable + relRow;
  1091. BYTE* tagRow = (BYTE*)(tagTable + relRow);
  1092. U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
  1093. U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
  1094. size_t numMatches = 0;
  1095. size_t currMatch = 0;
  1096. ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
  1097. /* Cycle through the matches and prefetch */
  1098. for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
  1099. U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
  1100. U32 const matchIndex = row[matchPos];
  1101. if(matchPos == 0) continue;
  1102. assert(numMatches < rowEntries);
  1103. if (matchIndex < lowLimit)
  1104. break;
  1105. if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  1106. PREFETCH_L1(base + matchIndex);
  1107. } else {
  1108. PREFETCH_L1(dictBase + matchIndex);
  1109. }
  1110. matchBuffer[numMatches++] = matchIndex;
  1111. --nbAttempts;
  1112. }
  1113. /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
  1114. in ZSTD_row_update_internal() at the next search. */
  1115. {
  1116. U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
  1117. tagRow[pos] = (BYTE)tag;
  1118. row[pos] = ms->nextToUpdate++;
  1119. }
  1120. /* Return the longest match */
  1121. for (; currMatch < numMatches; ++currMatch) {
  1122. U32 const matchIndex = matchBuffer[currMatch];
  1123. size_t currentMl=0;
  1124. assert(matchIndex < curr);
  1125. assert(matchIndex >= lowLimit);
  1126. if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
  1127. const BYTE* const match = base + matchIndex;
  1128. assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
  1129. /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
  1130. if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */
  1131. currentMl = ZSTD_count(ip, match, iLimit);
  1132. } else {
  1133. const BYTE* const match = dictBase + matchIndex;
  1134. assert(match+4 <= dictEnd);
  1135. if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
  1136. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
  1137. }
  1138. /* Save best solution */
  1139. if (currentMl > ml) {
  1140. ml = currentMl;
  1141. *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
  1142. if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
  1143. }
  1144. }
  1145. }
  1146. assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
  1147. if (dictMode == ZSTD_dedicatedDictSearch) {
  1148. ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
  1149. ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
  1150. } else if (dictMode == ZSTD_dictMatchState) {
  1151. /* TODO: Measure and potentially add prefetching to DMS */
  1152. const U32 dmsLowestIndex = dms->window.dictLimit;
  1153. const BYTE* const dmsBase = dms->window.base;
  1154. const BYTE* const dmsEnd = dms->window.nextSrc;
  1155. const U32 dmsSize = (U32)(dmsEnd - dmsBase);
  1156. const U32 dmsIndexDelta = dictLimit - dmsSize;
  1157. { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
  1158. U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
  1159. size_t numMatches = 0;
  1160. size_t currMatch = 0;
  1161. ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
  1162. for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
  1163. U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
  1164. U32 const matchIndex = dmsRow[matchPos];
  1165. if(matchPos == 0) continue;
  1166. if (matchIndex < dmsLowestIndex)
  1167. break;
  1168. PREFETCH_L1(dmsBase + matchIndex);
  1169. matchBuffer[numMatches++] = matchIndex;
  1170. --nbAttempts;
  1171. }
  1172. /* Return the longest match */
  1173. for (; currMatch < numMatches; ++currMatch) {
  1174. U32 const matchIndex = matchBuffer[currMatch];
  1175. size_t currentMl=0;
  1176. assert(matchIndex >= dmsLowestIndex);
  1177. assert(matchIndex < curr);
  1178. { const BYTE* const match = dmsBase + matchIndex;
  1179. assert(match+4 <= dmsEnd);
  1180. if (MEM_read32(match) == MEM_read32(ip))
  1181. currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
  1182. }
  1183. if (currentMl > ml) {
  1184. ml = currentMl;
  1185. assert(curr > matchIndex + dmsIndexDelta);
  1186. *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
  1187. if (ip+currentMl == iLimit) break;
  1188. }
  1189. }
  1190. }
  1191. }
  1192. return ml;
  1193. }
  /**
   * Generate search functions templated on (dictMode, mls, rowLog).
   * These functions are outlined (kept out of line) for code size & compilation time.
   * ZSTD_searchMax() dispatches to the correct implementation function.
   *
   * TODO: The start of the search function involves loading and calculating a
   *       bunch of constants from the ZSTD_matchState_t. These computations could be
   *       done in an initialization function, and saved somewhere in the match state.
   *       Then we could pass a pointer to the saved state instead of the match state,
   *       and avoid duplicate computations.
   *
   * TODO: Move the match re-winding into searchMax. This improves compression
   *       ratio, and unlocks further simplifications with the next TODO.
   *
   * TODO: Try moving the repcode search into searchMax. After the re-winding
   *       and repcode search are in searchMax, there is no more logic in the match
   *       finder loop that requires knowledge about the dictMode. So we should be
   *       able to avoid force inlining it, and we can join the extDict loop with
   *       the single segment loop. It should go in searchMax instead of its own
   *       function to avoid having multiple virtual function calls per search.
   */
  /*
   * Name builders for the outlined search functions. Token pasting yields one
   * distinct identifier per (dictMode, mls[, rowLog]) combination, e.g.
   * ZSTD_HC_SEARCH_FN(noDict, 4) -> ZSTD_HcFindBestMatch_noDict_4.
   * Each directive must start its own line with '#' as the first token,
   * otherwise the preprocessor does not treat it as a directive.
   */
  #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
  #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
  #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog

  /* Attribute set placed in front of every generated search function. */
  #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
  /*
   * Defines the outlined binary-tree search function for one (dictMode, mls)
   * pair. The generated function asserts that `mls` agrees with the clamped
   * cParams.minMatch and forwards to ZSTD_BtFindBestMatch(), passing `mls` and
   * ZSTD_<dictMode> as literal arguments.
   *
   * The closing-brace line deliberately has no continuation backslash: the
   * macro must end there, whatever line happens to follow it.
   */
  #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls)                                           \
      ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)(                      \
              ZSTD_matchState_t* ms,                                                     \
              const BYTE* ip, const BYTE* const iLimit,                                  \
              size_t* offBasePtr)                                                        \
      {                                                                                  \
          assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                           \
          return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
      }
  /*
   * Defines the outlined hash-chain search function for one (dictMode, mls)
   * pair. The generated function asserts that `mls` agrees with the clamped
   * cParams.minMatch and forwards to ZSTD_HcFindBestMatch(), passing `mls` and
   * ZSTD_<dictMode> as literal arguments.
   *
   * The closing-brace line deliberately has no continuation backslash: the
   * macro must end there, whatever line happens to follow it.
   */
  #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls)                                          \
      ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)(                     \
              ZSTD_matchState_t* ms,                                                    \
              const BYTE* ip, const BYTE* const iLimit,                                 \
              size_t* offsetPtr)                                                        \
      {                                                                                 \
          assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                          \
          return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
      }
  /*
   * Defines the outlined row-hash search function for one
   * (dictMode, mls, rowLog) triple. The generated function asserts that `mls`
   * and `rowLog` agree with the clamped cParams.minMatch / cParams.searchLog
   * and forwards to ZSTD_RowFindBestMatch(), passing `mls`, ZSTD_<dictMode>
   * and `rowLog` as literal arguments.
   *
   * The closing-brace line deliberately has no continuation backslash: the
   * macro must end there, whatever line happens to follow it.
   */
  #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)                                          \
      ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(                     \
              ZSTD_matchState_t* ms,                                                             \
              const BYTE* ip, const BYTE* const iLimit,                                          \
              size_t* offsetPtr)                                                                 \
      {                                                                                          \
          assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                                   \
          assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog);                               \
          return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
      }
  /*
   * X-macro iterators over the template parameter space.
   * Each one expands `X(...)` once per supported value, with no separator
   * between the expansions:
   *   - rowLog and mls each take the values 4, 5 and 6;
   *   - dictMode takes noDict, extDict, dictMatchState, dedicatedDictSearch.
   */

  /* X(dictMode, mls, rowLog) for every rowLog. */
  #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
      X(dictMode, mls, 4)                        \
      X(dictMode, mls, 5)                        \
      X(dictMode, mls, 6)

  /* X(dictMode, mls, rowLog) for every (mls, rowLog) pair. */
  #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
      ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4)      \
      ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5)      \
      ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)

  /* X(dictMode, mls) for every mls. */
  #define ZSTD_FOR_EACH_MLS(X, dictMode) \
      X(dictMode, 4)                     \
      X(dictMode, 5)                     \
      X(dictMode, 6)

  /* X(<extra args>, dictMode) for every dictMode; the extra arguments are
   * forwarded unchanged in front of the dictMode. */
  #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
      X(__VA_ARGS__, noDict)              \
      X(__VA_ARGS__, extDict)             \
      X(__VA_ARGS__, dictMatchState)      \
      X(__VA_ARGS__, dedicatedDictSearch)
  1264. /* Generate row search fns for each combination of (dictMode, mls, rowLog) */
  1265. ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
  1266. /* Generate binary Tree search fns for each combination of (dictMode, mls) */
  1267. ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
  1268. /* Generate hash chain search fns for each combination of (dictMode, mls) */
  1269. ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
  /* Match finder selector. ZSTD_searchMax() and the block parsers take it as a
   * templated (compile-time constant) argument. */
  typedef enum {
      search_hashChain  = 0,
      search_binaryTree = 1,
      search_rowHash    = 2
  } searchMethod_e;
  /*
   * `case` generators used inside the dispatch switches of ZSTD_searchMax().
   * Each expands to one case label that returns the result of the matching
   * outlined search function. The expansion site must have `ms`, `ip`, `iend`
   * and `offsetPtr` in scope.
   */

  /* case <mls>: call the binary-tree search function. */
  #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
      case mls:                                     \
          return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);

  /* case <mls>: call the hash-chain search function. */
  #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
      case mls:                                     \
          return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);

  /* case <rowLog>: call the row-hash search function. */
  #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
      case rowLog:                                           \
          return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
  /*
   * Switch builders for ZSTD_searchMax(). They are statement fragments, not
   * expressions: every valid case returns from the enclosing function, and the
   * fall-out paths are marked ZSTD_UNREACHABLE. The expansion site must have
   * `mls`, `rowLog` and `searchMethod` in scope.
   */

  /* switch over mls, one case per supported value, each produced by X. */
  #define ZSTD_SWITCH_MLS(X, dictMode)   \
      switch (mls) {                     \
          ZSTD_FOR_EACH_MLS(X, dictMode) \
      }

  /* One `case mls:` holding a nested switch over rowLog (row-hash only). */
  #define ZSTD_SWITCH_ROWLOG(dictMode, mls)                                    \
      case mls:                                                                \
          switch (rowLog) {                                                    \
              ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
          }                                                                    \
          ZSTD_UNREACHABLE;                                                    \
          break;

  /* Full dispatch for one dictMode: searchMethod first, then mls (and rowLog). */
  #define ZSTD_SWITCH_SEARCH_METHOD(dictMode)                       \
      switch (searchMethod) {                                       \
          case search_hashChain:                                    \
              ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
              break;                                                \
          case search_binaryTree:                                   \
              ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
              break;                                                \
          case search_rowHash:                                      \
              ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode)         \
              break;                                                \
      }                                                             \
      ZSTD_UNREACHABLE;
  1304. /**
  1305. * Searches for the longest match at @p ip.
  1306. * Dispatches to the correct implementation function based on the
  1307. * (searchMethod, dictMode, mls, rowLog). We use switch statements
  1308. * here instead of using an indirect function call through a function
  1309. * pointer because after Spectre and Meltdown mitigations, indirect
  1310. * function calls can be very costly, especially in the kernel.
  1311. *
  1312. * NOTE: dictMode and searchMethod should be templated, so those switch
  1313. * statements should be optimized out. Only the mls & rowLog switches
  1314. * should be left.
  1315. *
  1316. * @param ms The match state.
  1317. * @param ip The position to search at.
  1318. * @param iend The end of the input data.
  1319. * @param[out] offsetPtr Stores the match offset into this pointer.
  1320. * @param mls The minimum search length, in the range [4, 6].
  1321. * @param rowLog The row log (if applicable), in the range [4, 6].
  1322. * @param searchMethod The search method to use (templated).
  1323. * @param dictMode The dictMode (templated).
  1324. *
  1325. * @returns The length of the longest match found, or < mls if no match is found.
  1326. * If a match is found its offset is stored in @p offsetPtr.
  1327. */
  1328. FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
  1329. ZSTD_matchState_t* ms,
  1330. const BYTE* ip,
  1331. const BYTE* iend,
  1332. size_t* offsetPtr,
  1333. U32 const mls,
  1334. U32 const rowLog,
  1335. searchMethod_e const searchMethod,
  1336. ZSTD_dictMode_e const dictMode)
  1337. {
  1338. if (dictMode == ZSTD_noDict) {
  1339. ZSTD_SWITCH_SEARCH_METHOD(noDict)
  1340. } else if (dictMode == ZSTD_extDict) {
  1341. ZSTD_SWITCH_SEARCH_METHOD(extDict)
  1342. } else if (dictMode == ZSTD_dictMatchState) {
  1343. ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
  1344. } else if (dictMode == ZSTD_dedicatedDictSearch) {
  1345. ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
  1346. }
  1347. ZSTD_UNREACHABLE;
  1348. return 0;
  1349. }
  /* *******************************
   *  Common parser - lazy strategy
   *********************************/
  1353. FORCE_INLINE_TEMPLATE
  1354. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  1355. size_t ZSTD_compressBlock_lazy_generic(
  1356. ZSTD_matchState_t* ms, seqStore_t* seqStore,
  1357. U32 rep[ZSTD_REP_NUM],
  1358. const void* src, size_t srcSize,
  1359. const searchMethod_e searchMethod, const U32 depth,
  1360. ZSTD_dictMode_e const dictMode)
  1361. {
  1362. const BYTE* const istart = (const BYTE*)src;
  1363. const BYTE* ip = istart;
  1364. const BYTE* anchor = istart;
  1365. const BYTE* const iend = istart + srcSize;
  1366. const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
  1367. const BYTE* const base = ms->window.base;
  1368. const U32 prefixLowestIndex = ms->window.dictLimit;
  1369. const BYTE* const prefixLowest = base + prefixLowestIndex;
  1370. const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
  1371. const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
  1372. U32 offset_1 = rep[0], offset_2 = rep[1];
  1373. U32 offsetSaved1 = 0, offsetSaved2 = 0;
  1374. const int isDMS = dictMode == ZSTD_dictMatchState;
  1375. const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
  1376. const int isDxS = isDMS || isDDS;
  1377. const ZSTD_matchState_t* const dms = ms->dictMatchState;
  1378. const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
  1379. const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
  1380. const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
  1381. const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
  1382. const U32 dictIndexDelta = isDxS ?
  1383. prefixLowestIndex - (U32)(dictEnd - dictBase) :
  1384. 0;
  1385. const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
  1386. DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
  1387. ip += (dictAndPrefixLength == 0);
  1388. if (dictMode == ZSTD_noDict) {
  1389. U32 const curr = (U32)(ip - base);
  1390. U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
  1391. U32 const maxRep = curr - windowLow;
  1392. if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
  1393. if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
  1394. }
  1395. if (isDxS) {
  1396. /* dictMatchState repCode checks don't currently handle repCode == 0
  1397. * disabling. */
  1398. assert(offset_1 <= dictAndPrefixLength);
  1399. assert(offset_2 <= dictAndPrefixLength);
  1400. }
  1401. /* Reset the lazy skipping state */
  1402. ms->lazySkipping = 0;
  1403. if (searchMethod == search_rowHash) {
  1404. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1405. }
  1406. /* Match Loop */
  1407. #if defined(__GNUC__) && defined(__x86_64__)
  1408. /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
  1409. * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
  1410. */
  1411. __asm__(".p2align 5");
  1412. #endif
  1413. while (ip < ilimit) {
  1414. size_t matchLength=0;
  1415. size_t offBase = REPCODE1_TO_OFFBASE;
  1416. const BYTE* start=ip+1;
  1417. DEBUGLOG(7, "search baseline (depth 0)");
  1418. /* check repCode */
  1419. if (isDxS) {
  1420. const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
  1421. const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
  1422. && repIndex < prefixLowestIndex) ?
  1423. dictBase + (repIndex - dictIndexDelta) :
  1424. base + repIndex;
  1425. if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
  1426. && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
  1427. const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  1428. matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  1429. if (depth==0) goto _storeSequence;
  1430. }
  1431. }
  1432. if ( dictMode == ZSTD_noDict
  1433. && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
  1434. matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
  1435. if (depth==0) goto _storeSequence;
  1436. }
  1437. /* first search (depth 0) */
  1438. { size_t offbaseFound = 999999999;
  1439. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
  1440. if (ml2 > matchLength)
  1441. matchLength = ml2, start = ip, offBase = offbaseFound;
  1442. }
  1443. if (matchLength < 4) {
  1444. size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */;
  1445. ip += step;
  1446. /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
  1447. * In this mode we stop inserting every position into our tables, and only insert
  1448. * positions that we search, which is one in step positions.
  1449. * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
  1450. * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
  1451. * triggered once we've gone 2KB without finding any matches.
  1452. */
  1453. ms->lazySkipping = step > kLazySkippingStep;
  1454. continue;
  1455. }
  1456. /* let's try to find a better solution */
  1457. if (depth>=1)
  1458. while (ip<ilimit) {
  1459. DEBUGLOG(7, "search depth 1");
  1460. ip ++;
  1461. if ( (dictMode == ZSTD_noDict)
  1462. && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  1463. size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  1464. int const gain2 = (int)(mlRep * 3);
  1465. int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  1466. if ((mlRep >= 4) && (gain2 > gain1))
  1467. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1468. }
  1469. if (isDxS) {
  1470. const U32 repIndex = (U32)(ip - base) - offset_1;
  1471. const BYTE* repMatch = repIndex < prefixLowestIndex ?
  1472. dictBase + (repIndex - dictIndexDelta) :
  1473. base + repIndex;
  1474. if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
  1475. && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  1476. const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  1477. size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  1478. int const gain2 = (int)(mlRep * 3);
  1479. int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  1480. if ((mlRep >= 4) && (gain2 > gain1))
  1481. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1482. }
  1483. }
  1484. { size_t ofbCandidate=999999999;
  1485. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
  1486. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1487. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  1488. if ((ml2 >= 4) && (gain2 > gain1)) {
  1489. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1490. continue; /* search a better one */
  1491. } }
  1492. /* let's find an even better one */
  1493. if ((depth==2) && (ip<ilimit)) {
  1494. DEBUGLOG(7, "search depth 2");
  1495. ip ++;
  1496. if ( (dictMode == ZSTD_noDict)
  1497. && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
  1498. size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
  1499. int const gain2 = (int)(mlRep * 4);
  1500. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  1501. if ((mlRep >= 4) && (gain2 > gain1))
  1502. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1503. }
  1504. if (isDxS) {
  1505. const U32 repIndex = (U32)(ip - base) - offset_1;
  1506. const BYTE* repMatch = repIndex < prefixLowestIndex ?
  1507. dictBase + (repIndex - dictIndexDelta) :
  1508. base + repIndex;
  1509. if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
  1510. && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  1511. const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
  1512. size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
  1513. int const gain2 = (int)(mlRep * 4);
  1514. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  1515. if ((mlRep >= 4) && (gain2 > gain1))
  1516. matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1517. }
  1518. }
  1519. { size_t ofbCandidate=999999999;
  1520. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
  1521. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1522. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  1523. if ((ml2 >= 4) && (gain2 > gain1)) {
  1524. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1525. continue;
  1526. } } }
  1527. break; /* nothing found : store previous solution */
  1528. }
  1529. /* NOTE:
  1530. * Pay attention that `start[-value]` can lead to strange undefined behavior
  1531. * notably if `value` is unsigned, resulting in a large positive `-value`.
  1532. */
  1533. /* catch up */
  1534. if (OFFBASE_IS_OFFSET(offBase)) {
  1535. if (dictMode == ZSTD_noDict) {
  1536. while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
  1537. && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */
  1538. { start--; matchLength++; }
  1539. }
  1540. if (isDxS) {
  1541. U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  1542. const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
  1543. const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
  1544. while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
  1545. }
  1546. offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  1547. }
  1548. /* store sequence */
  1549. _storeSequence:
  1550. { size_t const litLength = (size_t)(start - anchor);
  1551. ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  1552. anchor = ip = start + matchLength;
  1553. }
  1554. if (ms->lazySkipping) {
  1555. /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
  1556. if (searchMethod == search_rowHash) {
  1557. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1558. }
  1559. ms->lazySkipping = 0;
  1560. }
  1561. /* check immediate repcode */
  1562. if (isDxS) {
  1563. while (ip <= ilimit) {
  1564. U32 const current2 = (U32)(ip-base);
  1565. U32 const repIndex = current2 - offset_2;
  1566. const BYTE* repMatch = repIndex < prefixLowestIndex ?
  1567. dictBase - dictIndexDelta + repIndex :
  1568. base + repIndex;
  1569. if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
  1570. && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
  1571. const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
  1572. matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
  1573. offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */
  1574. ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  1575. ip += matchLength;
  1576. anchor = ip;
  1577. continue;
  1578. }
  1579. break;
  1580. }
  1581. }
  1582. if (dictMode == ZSTD_noDict) {
  1583. while ( ((ip <= ilimit) & (offset_2>0))
  1584. && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
  1585. /* store sequence */
  1586. matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
  1587. offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
  1588. ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  1589. ip += matchLength;
  1590. anchor = ip;
  1591. continue; /* faster when present ... (?) */
  1592. } } }
  1593. /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
  1594. * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
  1595. offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
  1596. /* save reps for next block */
  1597. rep[0] = offset_1 ? offset_1 : offsetSaved1;
  1598. rep[1] = offset_2 ? offset_2 : offsetSaved2;
  1599. /* Return the last literals size */
  1600. return (size_t)(iend - anchor);
  1601. }
  1602. #endif /* build exclusions */
  1603. #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
  1604. size_t ZSTD_compressBlock_greedy(
  1605. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1606. void const* src, size_t srcSize)
  1607. {
  1608. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
  1609. }
  1610. size_t ZSTD_compressBlock_greedy_dictMatchState(
  1611. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1612. void const* src, size_t srcSize)
  1613. {
  1614. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
  1615. }
  1616. size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
  1617. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1618. void const* src, size_t srcSize)
  1619. {
  1620. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
  1621. }
  1622. size_t ZSTD_compressBlock_greedy_row(
  1623. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1624. void const* src, size_t srcSize)
  1625. {
  1626. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
  1627. }
  1628. size_t ZSTD_compressBlock_greedy_dictMatchState_row(
  1629. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1630. void const* src, size_t srcSize)
  1631. {
  1632. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
  1633. }
  1634. size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
  1635. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1636. void const* src, size_t srcSize)
  1637. {
  1638. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
  1639. }
  1640. #endif
  1641. #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
  1642. size_t ZSTD_compressBlock_lazy(
  1643. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1644. void const* src, size_t srcSize)
  1645. {
  1646. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
  1647. }
  1648. size_t ZSTD_compressBlock_lazy_dictMatchState(
  1649. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1650. void const* src, size_t srcSize)
  1651. {
  1652. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
  1653. }
  1654. size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
  1655. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1656. void const* src, size_t srcSize)
  1657. {
  1658. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
  1659. }
  1660. size_t ZSTD_compressBlock_lazy_row(
  1661. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1662. void const* src, size_t srcSize)
  1663. {
  1664. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
  1665. }
  1666. size_t ZSTD_compressBlock_lazy_dictMatchState_row(
  1667. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1668. void const* src, size_t srcSize)
  1669. {
  1670. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
  1671. }
  1672. size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
  1673. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1674. void const* src, size_t srcSize)
  1675. {
  1676. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
  1677. }
  1678. #endif
  1679. #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
  1680. size_t ZSTD_compressBlock_lazy2(
  1681. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1682. void const* src, size_t srcSize)
  1683. {
  1684. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
  1685. }
  1686. size_t ZSTD_compressBlock_lazy2_dictMatchState(
  1687. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1688. void const* src, size_t srcSize)
  1689. {
  1690. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
  1691. }
  1692. size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
  1693. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1694. void const* src, size_t srcSize)
  1695. {
  1696. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
  1697. }
  1698. size_t ZSTD_compressBlock_lazy2_row(
  1699. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1700. void const* src, size_t srcSize)
  1701. {
  1702. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
  1703. }
  1704. size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
  1705. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1706. void const* src, size_t srcSize)
  1707. {
  1708. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
  1709. }
  1710. size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
  1711. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1712. void const* src, size_t srcSize)
  1713. {
  1714. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
  1715. }
  1716. #endif
  1717. #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
  1718. size_t ZSTD_compressBlock_btlazy2(
  1719. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1720. void const* src, size_t srcSize)
  1721. {
  1722. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
  1723. }
  1724. size_t ZSTD_compressBlock_btlazy2_dictMatchState(
  1725. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1726. void const* src, size_t srcSize)
  1727. {
  1728. return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
  1729. }
  1730. #endif
  1731. #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
  1732. || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
  1733. || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
  1734. || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
  1735. FORCE_INLINE_TEMPLATE
  1736. ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
  1737. size_t ZSTD_compressBlock_lazy_extDict_generic(
  1738. ZSTD_matchState_t* ms, seqStore_t* seqStore,
  1739. U32 rep[ZSTD_REP_NUM],
  1740. const void* src, size_t srcSize,
  1741. const searchMethod_e searchMethod, const U32 depth)
  1742. {
  1743. const BYTE* const istart = (const BYTE*)src;
  1744. const BYTE* ip = istart;
  1745. const BYTE* anchor = istart;
  1746. const BYTE* const iend = istart + srcSize;
  1747. const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
  1748. const BYTE* const base = ms->window.base;
  1749. const U32 dictLimit = ms->window.dictLimit;
  1750. const BYTE* const prefixStart = base + dictLimit;
  1751. const BYTE* const dictBase = ms->window.dictBase;
  1752. const BYTE* const dictEnd = dictBase + dictLimit;
  1753. const BYTE* const dictStart = dictBase + ms->window.lowLimit;
  1754. const U32 windowLog = ms->cParams.windowLog;
  1755. const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
  1756. const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
  1757. U32 offset_1 = rep[0], offset_2 = rep[1];
  1758. DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
  1759. /* Reset the lazy skipping state */
  1760. ms->lazySkipping = 0;
  1761. /* init */
  1762. ip += (ip == prefixStart);
  1763. if (searchMethod == search_rowHash) {
  1764. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1765. }
  1766. /* Match Loop */
  1767. #if defined(__GNUC__) && defined(__x86_64__)
  1768. /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
  1769. * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
  1770. */
  1771. __asm__(".p2align 5");
  1772. #endif
  1773. while (ip < ilimit) {
  1774. size_t matchLength=0;
  1775. size_t offBase = REPCODE1_TO_OFFBASE;
  1776. const BYTE* start=ip+1;
  1777. U32 curr = (U32)(ip-base);
  1778. /* check repCode */
  1779. { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
  1780. const U32 repIndex = (U32)(curr+1 - offset_1);
  1781. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1782. const BYTE* const repMatch = repBase + repIndex;
  1783. if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
  1784. & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
  1785. if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
  1786. /* repcode detected we should take it */
  1787. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1788. matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1789. if (depth==0) goto _storeSequence;
  1790. } }
  1791. /* first search (depth 0) */
  1792. { size_t ofbCandidate = 999999999;
  1793. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  1794. if (ml2 > matchLength)
  1795. matchLength = ml2, start = ip, offBase = ofbCandidate;
  1796. }
  1797. if (matchLength < 4) {
  1798. size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
  1799. ip += step + 1; /* jump faster over incompressible sections */
  1800. /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
  1801. * In this mode we stop inserting every position into our tables, and only insert
  1802. * positions that we search, which is one in step positions.
  1803. * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
  1804. * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
  1805. * triggered once we've gone 2KB without finding any matches.
  1806. */
  1807. ms->lazySkipping = step > kLazySkippingStep;
  1808. continue;
  1809. }
  1810. /* let's try to find a better solution */
  1811. if (depth>=1)
  1812. while (ip<ilimit) {
  1813. ip ++;
  1814. curr++;
  1815. /* check repCode */
  1816. if (offBase) {
  1817. const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  1818. const U32 repIndex = (U32)(curr - offset_1);
  1819. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1820. const BYTE* const repMatch = repBase + repIndex;
  1821. if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
  1822. & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  1823. if (MEM_read32(ip) == MEM_read32(repMatch)) {
  1824. /* repcode detected */
  1825. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1826. size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1827. int const gain2 = (int)(repLength * 3);
  1828. int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
  1829. if ((repLength >= 4) && (gain2 > gain1))
  1830. matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1831. } }
  1832. /* search match, depth 1 */
  1833. { size_t ofbCandidate = 999999999;
  1834. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  1835. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1836. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
  1837. if ((ml2 >= 4) && (gain2 > gain1)) {
  1838. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1839. continue; /* search a better one */
  1840. } }
  1841. /* let's find an even better one */
  1842. if ((depth==2) && (ip<ilimit)) {
  1843. ip ++;
  1844. curr++;
  1845. /* check repCode */
  1846. if (offBase) {
  1847. const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
  1848. const U32 repIndex = (U32)(curr - offset_1);
  1849. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1850. const BYTE* const repMatch = repBase + repIndex;
  1851. if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
  1852. & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  1853. if (MEM_read32(ip) == MEM_read32(repMatch)) {
  1854. /* repcode detected */
  1855. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1856. size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1857. int const gain2 = (int)(repLength * 4);
  1858. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
  1859. if ((repLength >= 4) && (gain2 > gain1))
  1860. matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
  1861. } }
  1862. /* search match, depth 2 */
  1863. { size_t ofbCandidate = 999999999;
  1864. size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
  1865. int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
  1866. int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
  1867. if ((ml2 >= 4) && (gain2 > gain1)) {
  1868. matchLength = ml2, offBase = ofbCandidate, start = ip;
  1869. continue;
  1870. } } }
  1871. break; /* nothing found : store previous solution */
  1872. }
  1873. /* catch up */
  1874. if (OFFBASE_IS_OFFSET(offBase)) {
  1875. U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
  1876. const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
  1877. const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
  1878. while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
  1879. offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
  1880. }
  1881. /* store sequence */
  1882. _storeSequence:
  1883. { size_t const litLength = (size_t)(start - anchor);
  1884. ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
  1885. anchor = ip = start + matchLength;
  1886. }
  1887. if (ms->lazySkipping) {
  1888. /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
  1889. if (searchMethod == search_rowHash) {
  1890. ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
  1891. }
  1892. ms->lazySkipping = 0;
  1893. }
  1894. /* check immediate repcode */
  1895. while (ip <= ilimit) {
  1896. const U32 repCurrent = (U32)(ip-base);
  1897. const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
  1898. const U32 repIndex = repCurrent - offset_2;
  1899. const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
  1900. const BYTE* const repMatch = repBase + repIndex;
  1901. if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
  1902. & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
  1903. if (MEM_read32(ip) == MEM_read32(repMatch)) {
  1904. /* repcode detected we should take it */
  1905. const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
  1906. matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
  1907. offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */
  1908. ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
  1909. ip += matchLength;
  1910. anchor = ip;
  1911. continue; /* faster when present ... (?) */
  1912. }
  1913. break;
  1914. } }
  1915. /* Save reps for next block */
  1916. rep[0] = offset_1;
  1917. rep[1] = offset_2;
  1918. /* Return the last literals size */
  1919. return (size_t)(iend - anchor);
  1920. }
  1921. #endif /* build exclusions */
  1922. #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
  1923. size_t ZSTD_compressBlock_greedy_extDict(
  1924. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1925. void const* src, size_t srcSize)
  1926. {
  1927. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
  1928. }
  1929. size_t ZSTD_compressBlock_greedy_extDict_row(
  1930. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1931. void const* src, size_t srcSize)
  1932. {
  1933. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
  1934. }
  1935. #endif
  1936. #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
  1937. size_t ZSTD_compressBlock_lazy_extDict(
  1938. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1939. void const* src, size_t srcSize)
  1940. {
  1941. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
  1942. }
  1943. size_t ZSTD_compressBlock_lazy_extDict_row(
  1944. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1945. void const* src, size_t srcSize)
  1946. {
  1947. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
  1948. }
  1949. #endif
  1950. #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
  1951. size_t ZSTD_compressBlock_lazy2_extDict(
  1952. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1953. void const* src, size_t srcSize)
  1954. {
  1955. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
  1956. }
  1957. size_t ZSTD_compressBlock_lazy2_extDict_row(
  1958. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1959. void const* src, size_t srcSize)
  1960. {
  1961. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
  1962. }
  1963. #endif
  1964. #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
  1965. size_t ZSTD_compressBlock_btlazy2_extDict(
  1966. ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
  1967. void const* src, size_t srcSize)
  1968. {
  1969. return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
  1970. }
  1971. #endif