cover.h 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #ifndef ZDICT_STATIC_LINKING_ONLY
  11. # define ZDICT_STATIC_LINKING_ONLY
  12. #endif
  13. #include <stdio.h> /* fprintf */
  14. #include <stdlib.h> /* malloc, free, qsort */
  15. #include <string.h> /* memset */
  16. #include <time.h> /* clock */
  17. #include "../common/mem.h" /* read */
  18. #include "../common/pool.h"
  19. #include "../common/threading.h"
  20. #include "../common/zstd_internal.h" /* includes zstd.h */
  21. #include "../zdict.h"
  22. /**
  23. * COVER_best_t is used for two purposes:
  24. * 1. Synchronizing threads.
  25. * 2. Saving the best parameters and dictionary.
  26. *
  27. * All of the methods except COVER_best_init() are thread safe if zstd is
  28. * compiled with multithreaded support.
      *
      * Lifecycle, as described by the COVER_best_*() declarations in this header:
      * COVER_best_init() once, COVER_best_start() before each job is launched,
      * COVER_best_finish() when each job ends (on error or success), and finally
      * COVER_best_destroy(), which waits for liveJobs == 0 before destroying.
  29. */
  30. typedef struct COVER_best_s {
  31. ZSTD_pthread_mutex_t mutex; /* lock used for the thread synchronization described above */
  32. ZSTD_pthread_cond_t cond; /* condition variable; COVER_best_finish() signals waiters when liveJobs == 0 */
  33. size_t liveJobs; /* incremented by COVER_best_start(), decremented by COVER_best_finish() */
  34. void *dict; /* best dictionary saved so far; NOTE(review): ownership/allocation is not visible in this header */
  35. size_t dictSize; /* size of dict -- presumably in bytes; confirm in cover.c */
  36. ZDICT_cover_params_t parameters; /* parameters saved together with the best dictionary */
  37. size_t compressedSize; /* presumably the compressed size achieved by the best dictionary -- confirm in cover.c */
  38. } COVER_best_t;
  39. /**
  40. * A segment is a range in the source as well as the score of the segment.
      *
      * NOTE(review): this header does not say whether `end` is inclusive or
      * exclusive, nor how `score` is computed -- confirm against the users of
      * this type.
  41. */
  42. typedef struct {
  43. U32 begin; /* start of the range in the source */
  44. U32 end; /* end of the range in the source (bound convention not stated here) */
  45. U32 score; /* score of the segment */
  46. } COVER_segment_t;
  47. /**
  48. * Number of epochs and size of each epoch.
      * This is the return type of COVER_computeEpochs().
  49. */
  50. typedef struct {
  51. U32 num; /* number of epochs */
  52. U32 size; /* size of each epoch; NOTE(review): unit is not stated in this header */
  53. } COVER_epoch_info_t;
  54. /**
  55. * Struct used for the dictionary selection function.
      *
      * Returned by COVER_selectDict() and COVER_dictSelectionError(), tested with
      * COVER_dictSelectionIsError(), and released with COVER_dictSelectionFree().
  56. */
  57. typedef struct COVER_dictSelection {
  58. BYTE* dictContent; /* content of the selected dictionary; released via COVER_dictSelectionFree() */
  59. size_t dictSize; /* size of dictContent -- presumably in bytes; confirm */
  60. size_t totalCompressedSize; /* total compressed size, or a ZSTD error code (see COVER_dictSelectionError()) */
  61. } COVER_dictSelection_t;
  62. /**
  63. * Computes the number of epochs and the size of each epoch.
  64. * We will make sure that each epoch gets at least 10 * k bytes.
  65. *
  66. * The COVER algorithms divide the data up into epochs of equal size and
  67. * select one segment from each epoch.
  68. *
  69. * @param maxDictSize The maximum allowed dictionary size.
  70. * @param nbDmers The number of dmers we are training on.
  71. * @param k The parameter k (segment size).
  72. * @param passes The target number of passes over the dmer corpus.
  73. * More passes means a better dictionary.
      *
      * @return A COVER_epoch_info_t holding the number of epochs (`num`) and the
      * size of each epoch (`size`).
  74. */
  75. COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
  76. U32 k, U32 passes);
  77. /**
  78. * Warns the user when their corpus is too small.
      *
      * @param maxDictSize The maximum allowed dictionary size.
      * @param nbDmers The number of dmers being trained on.
      * @param displayLevel Presumably the verbosity level that controls whether
      * the warning is printed -- confirm in cover.c.
      *
      * NOTE(review): the threshold for "too small" is not visible in this header.
  79. */
  80. void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
  81. /**
  82. * Checks total compressed size of a dictionary.
      *
      * NOTE(review): the parameter descriptions below are inferred from names
      * and types only; confirm against the definition in cover.c.
      *
      * @param parameters COVER parameters, passed by value.
      * @param samplesSizes Array of sample sizes.
      * @param samples Buffer holding the samples.
      * @param offsets Presumably the offset of each sample within `samples`.
      * @param nbTrainSamples Presumably the number of samples used for training.
      * @param nbSamples Presumably the total number of samples.
      * @param dict The dictionary to evaluate.
      * @param dictBufferCapacity Presumably the size of `dict` -- confirm.
      *
      * @return Presumably the total compressed size, or a ZSTD error code on
      * failure -- confirm.
  83. */
  84. size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
  85. const size_t *samplesSizes, const BYTE *samples,
  86. size_t *offsets,
  87. size_t nbTrainSamples, size_t nbSamples,
  88. BYTE *const dict, size_t dictBufferCapacity);
  89. /**
  90. * Returns the sum of the sample sizes.
      *
      * @param samplesSizes Array of sample sizes.
      * @param nbSamples Number of samples -- presumably the number of entries of
      * `samplesSizes` that are summed.
      *
      * @return The sum, as a size_t.
  91. */
  92. size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
  93. /**
  94. * Initialize the `COVER_best_t`.
      *
      * Unlike the other COVER_best_*() functions, this one is documented as not
      * thread safe (see the COVER_best_t description).
      *
      * @param best The COVER_best_t to initialize.
  95. */
  96. void COVER_best_init(COVER_best_t *best);
  97. /**
  98. * Wait until liveJobs == 0.
      *
      * liveJobs is raised by COVER_best_start() and lowered by
      * COVER_best_finish(), which signals waiters once it reaches 0.
      *
      * @param best The COVER_best_t whose jobs are waited on.
  99. */
  100. void COVER_best_wait(COVER_best_t *best);
  101. /**
  102. * Call COVER_best_wait() and then destroy the COVER_best_t.
       *
       * @param best The COVER_best_t to destroy.
       *
       * NOTE(review): which resources are released (e.g. best->dict) is not
       * visible in this header -- confirm in cover.c.
  103. */
  104. void COVER_best_destroy(COVER_best_t *best);
  105. /**
  106. * Called when a thread is about to be launched.
  107. * Increments liveJobs.
       *
       * Each call is balanced by COVER_best_finish(), which decrements liveJobs.
       *
       * @param best The shared COVER_best_t.
  108. */
  109. void COVER_best_start(COVER_best_t *best);
  110. /**
  111. * Called when a thread finishes executing, on both error and success.
  112. * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
  113. * If this dictionary is the best so far, save it and its parameters.
       *
       * @param best The shared COVER_best_t.
       * @param parameters The parameters the finished job ran with.
       * @param selection The dictionary selection produced by the job.
       *
       * NOTE(review): how "best" is decided and whether `selection` is copied or
       * adopted is not visible in this header -- confirm in cover.c.
  114. */
  115. void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
  116. COVER_dictSelection_t selection);
  117. /**
  118. * Error function for COVER_selectDict function. Checks if the return
  119. * value is an error.
       *
       * @param selection A value returned by COVER_selectDict().
       *
       * @return Presumably non-zero when `selection` represents an error and 0
       * otherwise -- confirm in cover.c.
  120. */
  121. unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
  122. /**
  123. * Error function for COVER_selectDict function. Returns a struct where
  124. * return.totalCompressedSize is a ZSTD error.
       *
       * @param error The ZSTD error code to carry in totalCompressedSize.
       *
       * @return A COVER_dictSelection_t representing the error.
       * NOTE(review): the values of the other fields are not visible here.
  125. */
  126. COVER_dictSelection_t COVER_dictSelectionError(size_t error);
  127. /**
  128. * Always call after COVER_selectDict() is called, to free the memory used by
  129. * the newly created dictionary.
       *
       * @param selection The selection whose dictionary memory is released.
  130. */
  131. void COVER_dictSelectionFree(COVER_dictSelection_t selection);
  132. /**
  133. * Called to finalize the dictionary and select one based on whether or not
  134. * the shrink-dict flag was enabled. If enabled the dictionary used is the
  135. * smallest dictionary within a specified regression of the compressed size
  136. * from the largest dictionary.
       *
       * The result can be tested with COVER_dictSelectionIsError() and must be
       * released with COVER_dictSelectionFree().
       *
       * NOTE(review): the parameter descriptions below are inferred from names
       * and types only; confirm against the definition in cover.c.
       *
       * @param customDictContent Presumably the raw dictionary content to finalize.
       * @param dictBufferCapacity Presumably the capacity of the dictionary buffer.
       * @param dictContentSize Presumably the size of `customDictContent`.
       * @param samplesBuffer Buffer holding the samples.
       * @param samplesSizes Array of sample sizes.
       * @param nbFinalizeSamples Presumably the number of samples used to finalize.
       * @param nbCheckSamples Presumably the number of samples used to measure
       * compressed size.
       * @param nbSamples Presumably the total number of samples.
       * @param params COVER parameters; presumably carries the shrink-dict flag
       * and allowed regression.
       * @param offsets Presumably the offset of each sample within `samplesBuffer`.
       * @param totalCompressedSize Presumably the compressed size achieved before
       * selection -- confirm.
       *
       * @return The selected dictionary as a COVER_dictSelection_t.
  137. */
  138. COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
  139. size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
  140. size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);