cover.h 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #ifndef ZDICT_STATIC_LINKING_ONLY
  11. # define ZDICT_STATIC_LINKING_ONLY
  12. #endif
  13. #include "../common/threading.h" /* ZSTD_pthread_mutex_t */
  14. #include "../common/mem.h" /* U32, BYTE */
  15. #include "../zdict.h"
  16. /**
  17. * COVER_best_t is used for two purposes:
  18. * 1. Synchronizing threads.
  19. * 2. Saving the best parameters and dictionary.
  20. *
  21. * All of the methods except COVER_best_init() are thread safe if zstd is
  22. * compiled with multithreaded support.
  23. */
  24. typedef struct COVER_best_s {
  25. ZSTD_pthread_mutex_t mutex; /* lock; presumably guards the fields below -- confirm in the implementation */
  26. ZSTD_pthread_cond_t cond; /* condition variable; presumably what COVER_best_wait() blocks on -- confirm */
  27. size_t liveJobs; /* job counter: COVER_best_start() increments it, COVER_best_finish() decrements it */
  28. void *dict; /* best dictionary saved so far; ownership is not visible from this header */
  29. size_t dictSize; /* presumably the size of `dict` in bytes -- confirm */
  30. ZDICT_cover_params_t parameters; /* parameters saved together with the best dictionary */
  31. size_t compressedSize; /* presumably the compressed size achieved by `dict`, used to rank candidates -- confirm */
  32. } COVER_best_t;
  33. /**
  34. * A segment is a range in the source as well as the score of the segment.
  35. */
  36. typedef struct {
  37. U32 begin; /* start of the range; NOTE(review): units and inclusiveness are not stated in this header */
  38. U32 end; /* end of the range; NOTE(review): whether `end` is inclusive is not stated in this header */
  39. U32 score; /* score of the segment */
  40. } COVER_segment_t;
  41. /**
  42. * Number of epochs and size of each epoch, as returned by COVER_computeEpochs().
  43. */
  44. typedef struct {
  45. U32 num; /* number of epochs */
  46. U32 size; /* size of each epoch */
  47. } COVER_epoch_info_t;
  48. /**
  49. * Struct used for the dictionary selection function (COVER_selectDict()).
  50. */
  51. typedef struct COVER_dictSelection {
  52. BYTE* dictContent; /* dictionary bytes; presumably the memory released by COVER_dictSelectionFree() -- confirm */
  53. size_t dictSize; /* presumably the size of `dictContent` in bytes -- confirm */
  54. size_t totalCompressedSize; /* total compressed size; holds a ZSTD error code for values built by COVER_dictSelectionError() */
  55. } COVER_dictSelection_t;
  56. /**
  57. * Computes the number of epochs and the size of each epoch.
  58. * We will make sure that each epoch gets at least 10 * k bytes.
  59. *
  60. * The COVER algorithms divide the data up into epochs of equal size and
  61. * select one segment from each epoch.
  62. *
  63. * @param maxDictSize The maximum allowed dictionary size.
  64. * @param nbDmers The number of dmers we are training on.
  65. * @param k The parameter k (segment size).
  66. * @param passes The target number of passes over the dmer corpus.
  67. * More passes means a better dictionary.
      * @return A COVER_epoch_info_t holding the number of epochs (`num`) and
      * the size of each epoch (`size`).
  68. */
  69. COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
  70. U32 k, U32 passes);
  71. /**
  72. * Warns the user when their corpus is too small.
      *
      * @param maxDictSize Presumably the maximum allowed dictionary size, as in
      * COVER_computeEpochs() -- confirm.
      * @param nbDmers Presumably the number of dmers being trained on, as in
      * COVER_computeEpochs() -- confirm.
      * @param displayLevel Presumably the verbosity level that decides whether
      * the warning is printed -- confirm in the implementation.
  73. */
  74. void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
  75. /**
  76. * Checks total compressed size of a dictionary.
      *
      * NOTE(review): the notes below are inferred from parameter names only;
      * this header does not show the implementation, so confirm before relying
      * on them.
      * @param parameters Cover parameters associated with the dictionary.
      * @param samplesSizes Presumably one size per sample.
      * @param samples Presumably the concatenated sample data.
      * @param offsets Presumably the start offset of each sample in `samples`.
      * @param nbTrainSamples Presumably the number of samples used for training.
      * @param nbSamples Presumably the total number of samples.
      * @param dict The dictionary being evaluated.
      * @param dictBufferCapacity Presumably the size of `dict` in bytes.
      * @return Presumably the total compressed size, or a ZSTD error code.
  77. */
  78. size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
  79. const size_t *samplesSizes, const BYTE *samples,
  80. size_t *offsets,
  81. size_t nbTrainSamples, size_t nbSamples,
  82. BYTE *const dict, size_t dictBufferCapacity);
  83. /**
  84. * Returns the sum of the sample sizes.
      *
      * @param samplesSizes Array of sample sizes.
      * @param nbSamples Presumably the number of entries of `samplesSizes` to add up -- confirm.
      * @return The sum of the sample sizes.
  85. */
  86. size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
  87. /**
  88. * Initialize the `COVER_best_t`.
      * This is the one COVER_best_* function that is not documented as thread
      * safe (see the description of COVER_best_t).
      *
      * @param best The COVER_best_t to initialize.
  89. */
  90. void COVER_best_init(COVER_best_t *best);
  91. /**
  92. * Wait until liveJobs == 0, i.e. until every job registered with
      * COVER_best_start() has called COVER_best_finish().
      *
      * @param best The COVER_best_t whose `liveJobs` counter is waited on.
  93. */
  94. void COVER_best_wait(COVER_best_t *best);
  95. /**
  96. * Calls COVER_best_wait() and then destroys the COVER_best_t.
      * NOTE(review): whether this also releases the saved dictionary is not
      * visible from this header -- confirm in the implementation.
      *
      * @param best The COVER_best_t to destroy.
  97. */
  98. void COVER_best_destroy(COVER_best_t *best);
  99. /**
  100. * Called when a thread is about to be launched.
  101. * Increments liveJobs; COVER_best_finish() performs the matching decrement.
       *
       * @param best The COVER_best_t whose `liveJobs` counter is incremented.
  102. */
  103. void COVER_best_start(COVER_best_t *best);
  104. /**
  105. * Called when a thread finishes executing, both on error or success.
  106. * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
  107. * If this dictionary is the best so far save it and its parameters.
       *
       * @param best The shared COVER_best_t to update.
       * @param parameters The parameters that go with the dictionary in `selection`.
       * @param selection The dictionary produced by the finishing job.
       * NOTE(review): how "best" is measured is not visible here; presumably by
       * the compressed size carried in `selection` -- confirm.
  108. */
  109. void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
  110. COVER_dictSelection_t selection);
  111. /**
  112. * Error function for COVER_selectDict function. Checks if the return
  113. * value is an error.
       *
       * @param selection A value returned by COVER_selectDict().
       * @return Presumably non-zero when `selection` is an error (the error code
       * lives in selection.totalCompressedSize, see COVER_dictSelectionError())
       * and zero otherwise -- confirm in the implementation.
  114. */
  115. unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
  116. /**
  117. * Error function for COVER_selectDict function. Returns a struct where
  118. * return.totalCompressedSize is a ZSTD error.
       *
       * @param error Presumably the ZSTD error code to store in the result -- confirm.
       * @return A COVER_dictSelection_t whose totalCompressedSize is a ZSTD error.
  119. */
  120. COVER_dictSelection_t COVER_dictSelectionError(size_t error);
  121. /**
  122. * Always call after selectDict is called to free up used memory from
  123. * newly created dictionary.
       *
       * @param selection The value returned by COVER_selectDict(); presumably its
       * `dictContent` buffer is what gets released -- confirm.
  124. */
  125. void COVER_dictSelectionFree(COVER_dictSelection_t selection);
  126. /**
  127. * Called to finalize the dictionary and select one based on whether or not
  128. * the shrink-dict flag was enabled. If enabled the dictionary used is the
  129. * smallest dictionary within a specified regression of the compressed size
  130. * from the largest dictionary.
       *
       * The result must be released with COVER_dictSelectionFree() and can be
       * tested with COVER_dictSelectionIsError().
       *
       * NOTE(review): the parameter notes below are inferred from names only;
       * confirm against the implementation.
       * @param customDictContent Presumably the raw dictionary content to finalize.
       * @param dictBufferCapacity Presumably the capacity of the dictionary buffer in bytes.
       * @param dictContentSize Presumably the size of `customDictContent` in bytes.
       * @param samplesBuffer Presumably the concatenated sample data.
       * @param samplesSizes Presumably one size per sample.
       * @param nbFinalizeSamples Presumably the number of samples used to finalize the dictionary.
       * @param nbCheckSamples Presumably the number of samples used to measure compressed size.
       * @param nbSamples Presumably the total number of samples.
       * @param params Cover parameters; presumably where the shrink-dict flag and allowed regression are read from.
       * @param offsets Presumably the start offset of each sample in `samplesBuffer`.
       * @param totalCompressedSize Presumably a caller-supplied starting value for the compressed size.
       * @return The selected dictionary, or an error value as produced by COVER_dictSelectionError().
  131. */
  132. COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
  133. size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
  134. size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);