rrdengine.c

// SPDX-License-Identifier: GPL-3.0-or-later
#define NETDATA_RRD_INTERNALS

#include "rrdengine.h"

rrdeng_stats_t global_io_errors = 0;
rrdeng_stats_t global_fs_errors = 0;
rrdeng_stats_t rrdeng_reserved_file_descriptors = 0;
rrdeng_stats_t global_pg_cache_over_half_dirty_events = 0;
rrdeng_stats_t global_flushing_pressure_page_deletions = 0;

unsigned rrdeng_pages_per_extent = MAX_PAGES_PER_EXTENT;

#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2)
#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2)
#endif
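
/*
 * dbengine page cache pages are either malloc'd or mmap'd (optionally KSM-mergeable),
 * depending on db_engine_use_malloc. Allocation and release must use the matching
 * pair below.
 */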
void *dbengine_page_alloc() {
    void *page = NULL;
    if (unlikely(db_engine_use_malloc))
        page = mallocz(RRDENG_BLOCK_SIZE);
    else {
        page = netdata_mmap(NULL, RRDENG_BLOCK_SIZE, MAP_PRIVATE, enable_ksm);
        if(!page) fatal("Cannot allocate dbengine page cache page, with mmap()");
    }
    return page;
}

void dbengine_page_free(void *page) {
    if (unlikely(db_engine_use_malloc))
        freez(page);
    else
        munmap(page, RRDENG_BLOCK_SIZE);
}

static void sanity_check(void)
{
    BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2));

    /* Magic numbers must fit in the super-blocks */
    BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) > RRDENG_MAGIC_SZ);
    BUILD_BUG_ON(strlen(RRDENG_JF_MAGIC) > RRDENG_MAGIC_SZ);

    /* Version strings must fit in the super-blocks */
    BUILD_BUG_ON(strlen(RRDENG_DF_VER) > RRDENG_VER_SZ);
    BUILD_BUG_ON(strlen(RRDENG_JF_VER) > RRDENG_VER_SZ);

    /* Data file super-block cannot be larger than RRDENG_BLOCK_SIZE */
    BUILD_BUG_ON(RRDENG_DF_SB_PADDING_SZ < 0);

    BUILD_BUG_ON(sizeof(uuid_t) != UUID_SZ); /* check UUID size */

    /* page count must fit in 8 bits */
    BUILD_BUG_ON(MAX_PAGES_PER_EXTENT > 255);

    /* extent cache count must fit in 32 bits */
    BUILD_BUG_ON(MAX_CACHED_EXTENTS > 32);

    /* page info scratch space must be able to hold 2 32-bit integers */
    BUILD_BUG_ON(sizeof(((struct rrdeng_page_info *)0)->scratch) < 2 * sizeof(uint32_t));
}
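
/*
 * The extent cache (xt_cache) keeps recently read, uncompressed extents in memory.
 * Cached elements live in a doubly-linked replacement queue: new and recently used
 * elements are placed at the tail, so eviction candidates are searched from the head.
 */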
/* always inserts into tail */
static inline void xt_cache_replaceQ_insert(struct rrdengine_worker_config* wc,
                                            struct extent_cache_element *xt_cache_elem)
{
    struct extent_cache *xt_cache = &wc->xt_cache;

    xt_cache_elem->prev = NULL;
    xt_cache_elem->next = NULL;

    if (likely(NULL != xt_cache->replaceQ_tail)) {
        xt_cache_elem->prev = xt_cache->replaceQ_tail;
        xt_cache->replaceQ_tail->next = xt_cache_elem;
    }
    if (unlikely(NULL == xt_cache->replaceQ_head)) {
        xt_cache->replaceQ_head = xt_cache_elem;
    }
    xt_cache->replaceQ_tail = xt_cache_elem;
}

static inline void xt_cache_replaceQ_delete(struct rrdengine_worker_config* wc,
                                            struct extent_cache_element *xt_cache_elem)
{
    struct extent_cache *xt_cache = &wc->xt_cache;
    struct extent_cache_element *prev, *next;

    prev = xt_cache_elem->prev;
    next = xt_cache_elem->next;

    if (likely(NULL != prev)) {
        prev->next = next;
    }
    if (likely(NULL != next)) {
        next->prev = prev;
    }
    if (unlikely(xt_cache_elem == xt_cache->replaceQ_head)) {
        xt_cache->replaceQ_head = next;
    }
    if (unlikely(xt_cache_elem == xt_cache->replaceQ_tail)) {
        xt_cache->replaceQ_tail = prev;
    }
    xt_cache_elem->prev = xt_cache_elem->next = NULL;
}

static inline void xt_cache_replaceQ_set_hot(struct rrdengine_worker_config* wc,
                                             struct extent_cache_element *xt_cache_elem)
{
    xt_cache_replaceQ_delete(wc, xt_cache_elem);
    xt_cache_replaceQ_insert(wc, xt_cache_elem);
}
/* Returns the index of the cached extent if it was successfully inserted in the extent cache, otherwise -1 */
static int try_insert_into_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent)
{
    struct extent_cache *xt_cache = &wc->xt_cache;
    struct extent_cache_element *xt_cache_elem;
    unsigned idx;
    int ret;

    ret = find_first_zero(xt_cache->allocation_bitmap);
    if (-1 == ret || ret >= MAX_CACHED_EXTENTS) {
        for (xt_cache_elem = xt_cache->replaceQ_head ; NULL != xt_cache_elem ; xt_cache_elem = xt_cache_elem->next) {
            idx = xt_cache_elem - xt_cache->extent_array;
            if (!check_bit(xt_cache->inflight_bitmap, idx)) {
                xt_cache_replaceQ_delete(wc, xt_cache_elem);
                break;
            }
        }
        if (NULL == xt_cache_elem)
            return -1;
    } else {
        idx = (unsigned)ret;
        xt_cache_elem = &xt_cache->extent_array[idx];
    }
    xt_cache_elem->extent = extent;
    xt_cache_elem->fileno = extent->datafile->fileno;
    xt_cache_elem->inflight_io_descr = NULL;
    xt_cache_replaceQ_insert(wc, xt_cache_elem);
    modify_bit(&xt_cache->allocation_bitmap, idx, 1);

    return (int)idx;
}

/**
 * Returns 0 if the cached extent was found in the extent cache, 1 otherwise.
 * Sets *idx to point to the position of the extent inside the cache.
 **/
static uint8_t lookup_in_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent, unsigned *idx)
{
    struct extent_cache *xt_cache = &wc->xt_cache;
    struct extent_cache_element *xt_cache_elem;
    unsigned i;

    for (i = 0 ; i < MAX_CACHED_EXTENTS ; ++i) {
        xt_cache_elem = &xt_cache->extent_array[i];
        if (check_bit(xt_cache->allocation_bitmap, i) && xt_cache_elem->extent == extent &&
            xt_cache_elem->fileno == extent->datafile->fileno) {
            *idx = i;
            return 0;
        }
    }
    return 1;
}

#if 0 /* disabled code */
static void delete_from_xt_cache(struct rrdengine_worker_config* wc, unsigned idx)
{
    struct extent_cache *xt_cache = &wc->xt_cache;
    struct extent_cache_element *xt_cache_elem;

    xt_cache_elem = &xt_cache->extent_array[idx];
    xt_cache_replaceQ_delete(wc, xt_cache_elem);
    xt_cache_elem->extent = NULL;
    modify_bit(&wc->xt_cache.allocation_bitmap, idx, 0); /* invalidate it */
    modify_bit(&wc->xt_cache.inflight_bitmap, idx, 0); /* not in-flight anymore */
}
#endif
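
/*
 * Chains a read request onto the in-flight I/O descriptor of an already cached extent,
 * so that a single disk read can satisfy all readers waiting for the same extent.
 */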
void enqueue_inflight_read_to_xt_cache(struct rrdengine_worker_config* wc, unsigned idx,
                                       struct extent_io_descriptor *xt_io_descr)
{
    struct extent_cache *xt_cache = &wc->xt_cache;
    struct extent_cache_element *xt_cache_elem;
    struct extent_io_descriptor *old_next;

    xt_cache_elem = &xt_cache->extent_array[idx];
    old_next = xt_cache_elem->inflight_io_descr->next;
    xt_cache_elem->inflight_io_descr->next = xt_io_descr;
    xt_io_descr->next = old_next;
}
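
/*
 * Serves a read request from an extent that is already present in the extent cache:
 * for every requested page descriptor it copies the page payload out of the cached,
 * uncompressed extent into a freshly allocated page cache page.
 */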
void read_cached_extent_cb(struct rrdengine_worker_config* wc, unsigned idx, struct extent_io_descriptor *xt_io_descr)
{
    unsigned i, j, page_offset;
    struct rrdengine_instance *ctx = wc->ctx;
    struct rrdeng_page_descr *descr;
    struct page_cache_descr *pg_cache_descr;
    void *page;
    struct extent_info *extent = xt_io_descr->descr_array[0]->extent;

    for (i = 0 ; i < xt_io_descr->descr_count; ++i) {
        page = dbengine_page_alloc();
        descr = xt_io_descr->descr_array[i];
        for (j = 0, page_offset = 0 ; j < extent->number_of_pages ; ++j) {
            /* care, we don't hold the descriptor mutex */
            if (!uuid_compare(*extent->pages[j]->id, *descr->id) &&
                extent->pages[j]->page_length == descr->page_length &&
                extent->pages[j]->start_time == descr->start_time &&
                extent->pages[j]->end_time == descr->end_time) {
                break;
            }
            page_offset += extent->pages[j]->page_length;
        }
        /* care, we don't hold the descriptor mutex */
        (void) memcpy(page, wc->xt_cache.extent_array[idx].pages + page_offset, descr->page_length);

        rrdeng_page_descr_mutex_lock(ctx, descr);
        pg_cache_descr = descr->pg_cache_descr;
        pg_cache_descr->page = page;
        pg_cache_descr->flags |= RRD_PAGE_POPULATED;
        pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING;
        rrdeng_page_descr_mutex_unlock(ctx, descr);
        pg_cache_replaceQ_insert(ctx, descr);
        if (xt_io_descr->release_descr) {
            pg_cache_put(ctx, descr);
        } else {
            debug(D_RRDENGINE, "%s: Waking up waiters.", __func__);
            pg_cache_wake_up_waiters(ctx, descr);
        }
    }
    if (xt_io_descr->completion)
        completion_mark_complete(xt_io_descr->completion);
    freez(xt_io_descr);
}
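
/*
 * Used after a failed extent read: fills the page with "empty" slots of the proper
 * storage format for its page type, so readers see gaps instead of garbage data.
 */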
static void fill_page_with_nulls(void *page, uint32_t page_length, uint8_t type) {
    switch(type) {
        case PAGE_METRICS: {
            storage_number n = pack_storage_number(NAN, SN_FLAG_NONE);
            storage_number *array = (storage_number *)page;
            size_t slots = page_length / sizeof(n);
            for(size_t i = 0; i < slots ; i++)
                array[i] = n;
        }
        break;

        case PAGE_TIER: {
            storage_number_tier1_t n = {
                .min_value = NAN,
                .max_value = NAN,
                .sum_value = NAN,
                .count = 1,
                .anomaly_count = 0,
            };
            storage_number_tier1_t *array = (storage_number_tier1_t *)page;
            size_t slots = page_length / sizeof(n);
            for(size_t i = 0; i < slots ; i++)
                array[i] = n;
        }
        break;

        default: {
            static bool logged = false;
            if(!logged) {
                error("DBENGINE: cannot fill page with nulls on unknown page type id %d", type);
                logged = true;
            }
            memset(page, 0, page_length);
        }
    }
}
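
/*
 * libuv completion callback for an extent read. It verifies the extent CRC32,
 * decompresses the payload when needed, stores the uncompressed extent in the
 * extent cache (completing any chained in-flight readers), and then populates
 * the page cache pages of the requested descriptors. Unrequested pages of the
 * extent are populated opportunistically for locality.
 */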
void read_extent_cb(uv_fs_t* req)
{
    struct rrdengine_worker_config* wc = req->loop->data;
    struct rrdengine_instance *ctx = wc->ctx;
    struct extent_io_descriptor *xt_io_descr;
    struct rrdeng_page_descr *descr;
    struct page_cache_descr *pg_cache_descr;
    int ret;
    unsigned i, j, count;
    void *page, *uncompressed_buf = NULL;
    uint32_t payload_length, payload_offset, page_offset, uncompressed_payload_length = 0;
    uint8_t have_read_error = 0;
    /* persistent structures */
    struct rrdeng_df_extent_header *header;
    struct rrdeng_df_extent_trailer *trailer;
    uLong crc;

    xt_io_descr = req->data;
    header = xt_io_descr->buf;
    payload_length = header->payload_length;
    count = header->number_of_pages;
    payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count;
    trailer = xt_io_descr->buf + xt_io_descr->bytes - sizeof(*trailer);

    if (req->result < 0) {
        struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
        ++ctx->stats.io_errors;
        rrd_stat_atomic_add(&global_io_errors, 1);
        have_read_error = 1;
        error("%s: uv_fs_read - %s - extent at offset %"PRIu64"(%u) in datafile %u-%u.", __func__,
              uv_strerror((int)req->result), xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
        goto after_crc_check;
    }
    crc = crc32(0L, Z_NULL, 0);
    crc = crc32(crc, xt_io_descr->buf, xt_io_descr->bytes - sizeof(*trailer));
    ret = crc32cmp(trailer->checksum, crc);
#ifdef NETDATA_INTERNAL_CHECKS
    {
        struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
        debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: %s", __func__,
              xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno, ret ? "FAILED" : "SUCCEEDED");
    }
#endif
    if (unlikely(ret)) {
        struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
        ++ctx->stats.io_errors;
        rrd_stat_atomic_add(&global_io_errors, 1);
        have_read_error = 1;
        error("%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: FAILED", __func__,
              xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
    }

after_crc_check:
    if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) {
        uncompressed_payload_length = 0;
        for (i = 0 ; i < count ; ++i) {
            uncompressed_payload_length += header->descr[i].page_length;
        }
        uncompressed_buf = mallocz(uncompressed_payload_length);
        ret = LZ4_decompress_safe(xt_io_descr->buf + payload_offset, uncompressed_buf,
                                  payload_length, uncompressed_payload_length);
        ctx->stats.before_decompress_bytes += payload_length;
        ctx->stats.after_decompress_bytes += ret;
        debug(D_RRDENGINE, "LZ4 decompressed %u bytes to %d bytes.", payload_length, ret);
        /* care, we don't hold the descriptor mutex */
    }
    {
        uint8_t xt_is_cached = 0;
        unsigned xt_idx;
        struct extent_info *extent = xt_io_descr->descr_array[0]->extent;

        xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx);
        if (xt_is_cached && check_bit(wc->xt_cache.inflight_bitmap, xt_idx)) {
            struct extent_cache *xt_cache = &wc->xt_cache;
            struct extent_cache_element *xt_cache_elem = &xt_cache->extent_array[xt_idx];
            struct extent_io_descriptor *curr, *next;

            if (have_read_error) {
                memset(xt_cache_elem->pages, 0, sizeof(xt_cache_elem->pages));
            } else if (RRD_NO_COMPRESSION == header->compression_algorithm) {
                (void)memcpy(xt_cache_elem->pages, xt_io_descr->buf + payload_offset, payload_length);
            } else {
                (void)memcpy(xt_cache_elem->pages, uncompressed_buf, uncompressed_payload_length);
            }
            /* complete all connected in-flight read requests */
            for (curr = xt_cache_elem->inflight_io_descr->next ; curr ; curr = next) {
                next = curr->next;
                read_cached_extent_cb(wc, xt_idx, curr);
            }
            xt_cache_elem->inflight_io_descr = NULL;
            modify_bit(&xt_cache->inflight_bitmap, xt_idx, 0); /* not in-flight anymore */
        }
    }
    for (i = 0, page_offset = 0; i < count; page_offset += header->descr[i++].page_length) {
        uint8_t is_prefetched_page;
        descr = NULL;
        for (j = 0 ; j < xt_io_descr->descr_count; ++j) {
            struct rrdeng_page_descr *descrj;

            descrj = xt_io_descr->descr_array[j];
            /* care, we don't hold the descriptor mutex */
            if (!uuid_compare(*(uuid_t *) header->descr[i].uuid, *descrj->id) &&
                header->descr[i].page_length == descrj->page_length &&
                header->descr[i].start_time == descrj->start_time &&
                header->descr[i].end_time == descrj->end_time) {
                descr = descrj;
                break;
            }
        }
        is_prefetched_page = 0;
        if (!descr) { /* This extent page has not been requested. Try populating it for locality (best effort). */
            descr = pg_cache_lookup_unpopulated_and_lock(ctx, (uuid_t *)header->descr[i].uuid,
                                                         header->descr[i].start_time);
            if (!descr)
                continue; /* Failed to reserve a suitable page */
            is_prefetched_page = 1;
        }
        page = dbengine_page_alloc();

        /* care, we don't hold the descriptor mutex */
        if (have_read_error) {
            fill_page_with_nulls(page, descr->page_length, descr->type);
        } else if (RRD_NO_COMPRESSION == header->compression_algorithm) {
            (void) memcpy(page, xt_io_descr->buf + payload_offset + page_offset, descr->page_length);
        } else {
            (void) memcpy(page, uncompressed_buf + page_offset, descr->page_length);
        }
        rrdeng_page_descr_mutex_lock(ctx, descr);
        pg_cache_descr = descr->pg_cache_descr;
        pg_cache_descr->page = page;
        pg_cache_descr->flags |= RRD_PAGE_POPULATED;
        pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING;
        rrdeng_page_descr_mutex_unlock(ctx, descr);
        pg_cache_replaceQ_insert(ctx, descr);
        if (xt_io_descr->release_descr || is_prefetched_page) {
            pg_cache_put(ctx, descr);
        } else {
            debug(D_RRDENGINE, "%s: Waking up waiters.", __func__);
            pg_cache_wake_up_waiters(ctx, descr);
        }
    }
    if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) {
        freez(uncompressed_buf);
    }
    if (xt_io_descr->completion)
        completion_mark_complete(xt_io_descr->completion);
    uv_fs_req_cleanup(req);
    free(xt_io_descr->buf);
    freez(xt_io_descr);
}
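
/*
 * Issues an asynchronous read of the extent that backs the given page descriptors.
 * If the extent is already cached it is served immediately (or chained onto the
 * in-flight read); otherwise a cache slot is reserved and the extent is read from
 * the datafile with uv_fs_read(), completing in read_extent_cb().
 */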
static void do_read_extent(struct rrdengine_worker_config* wc,
                           struct rrdeng_page_descr **descr,
                           unsigned count,
                           uint8_t release_descr)
{
    struct rrdengine_instance *ctx = wc->ctx;
    struct page_cache_descr *pg_cache_descr;
    int ret;
    unsigned i, size_bytes, pos, real_io_size;
//  uint32_t payload_length;
    struct extent_io_descriptor *xt_io_descr;
    struct rrdengine_datafile *datafile;
    struct extent_info *extent = descr[0]->extent;
    uint8_t xt_is_cached = 0, xt_is_inflight = 0;
    unsigned xt_idx;

    datafile = extent->datafile;
    pos = extent->offset;
    size_bytes = extent->size;

    xt_io_descr = callocz(1, sizeof(*xt_io_descr));
    for (i = 0 ; i < count; ++i) {
        rrdeng_page_descr_mutex_lock(ctx, descr[i]);
        pg_cache_descr = descr[i]->pg_cache_descr;
        pg_cache_descr->flags |= RRD_PAGE_READ_PENDING;
//      payload_length = descr[i]->page_length;
        rrdeng_page_descr_mutex_unlock(ctx, descr[i]);

        xt_io_descr->descr_array[i] = descr[i];
    }
    xt_io_descr->descr_count = count;
    xt_io_descr->bytes = size_bytes;
    xt_io_descr->pos = pos;
    xt_io_descr->req.data = xt_io_descr;
    xt_io_descr->completion = NULL;
    /* xt_io_descr->descr_commit_idx_array[0] */
    xt_io_descr->release_descr = release_descr;

    xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx);
    if (xt_is_cached) {
        xt_cache_replaceQ_set_hot(wc, &wc->xt_cache.extent_array[xt_idx]);
        xt_is_inflight = check_bit(wc->xt_cache.inflight_bitmap, xt_idx);
        if (xt_is_inflight) {
            enqueue_inflight_read_to_xt_cache(wc, xt_idx, xt_io_descr);
            return;
        }
        return read_cached_extent_cb(wc, xt_idx, xt_io_descr);
    } else {
        ret = try_insert_into_xt_cache(wc, extent);
        if (-1 != ret) {
            xt_idx = (unsigned)ret;
            modify_bit(&wc->xt_cache.inflight_bitmap, xt_idx, 1);
            wc->xt_cache.extent_array[xt_idx].inflight_io_descr = xt_io_descr;
        }
    }

    ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes));
    if (unlikely(ret)) {
        fatal("posix_memalign:%s", strerror(ret));
        /* freez(xt_io_descr);
        return;*/
    }
    real_io_size = ALIGN_BYTES_CEILING(size_bytes);
    xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size);
    ret = uv_fs_read(wc->loop, &xt_io_descr->req, datafile->file, &xt_io_descr->iov, 1, pos, read_extent_cb);
    fatal_assert(-1 != ret);
    ctx->stats.io_read_bytes += real_io_size;
    ++ctx->stats.io_read_requests;
    ctx->stats.io_read_extent_bytes += real_io_size;
    ++ctx->stats.io_read_extents;
    ctx->stats.pg_cache_backfills += count;
}
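
/*
 * Records a STORE_DATA transaction for the extent in the journal file (write-ahead log):
 * the transaction carries the extent offset, size and the descriptors of the pages it
 * contains, protected by a CRC32 trailer.
 */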
static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent_io_descriptor *xt_io_descr)
{
    struct rrdengine_instance *ctx = wc->ctx;
    unsigned count, payload_length, descr_size, size_bytes;
    void *buf;
    /* persistent structures */
    struct rrdeng_df_extent_header *df_header;
    struct rrdeng_jf_transaction_header *jf_header;
    struct rrdeng_jf_store_data *jf_metric_data;
    struct rrdeng_jf_transaction_trailer *jf_trailer;
    uLong crc;

    df_header = xt_io_descr->buf;
    count = df_header->number_of_pages;
    descr_size = sizeof(*jf_metric_data->descr) * count;
    payload_length = sizeof(*jf_metric_data) + descr_size;
    size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer);

    buf = wal_get_transaction_buffer(wc, size_bytes);

    jf_header = buf;
    jf_header->type = STORE_DATA;
    jf_header->reserved = 0;
    jf_header->id = ctx->commit_log.transaction_id++;
    jf_header->payload_length = payload_length;

    jf_metric_data = buf + sizeof(*jf_header);
    jf_metric_data->extent_offset = xt_io_descr->pos;
    jf_metric_data->extent_size = xt_io_descr->bytes;
    jf_metric_data->number_of_pages = count;
    memcpy(jf_metric_data->descr, df_header->descr, descr_size);

    jf_trailer = buf + sizeof(*jf_header) + payload_length;
    crc = crc32(0L, Z_NULL, 0);
    crc = crc32(crc, buf, sizeof(*jf_header) + payload_length);
    crc32set(jf_trailer->checksum, crc);
}

static void do_commit_transaction(struct rrdengine_worker_config* wc, uint8_t type, void *data)
{
    switch (type) {
        case STORE_DATA:
            commit_data_extent(wc, (struct extent_io_descriptor *)data);
            break;
        default:
            fatal_assert(type == STORE_DATA);
            break;
    }
}

static void after_invalidate_oldest_committed(struct rrdengine_worker_config* wc)
{
    int error;

    error = uv_thread_join(wc->now_invalidating_dirty_pages);
    if (error) {
        error("uv_thread_join(): %s", uv_strerror(error));
    }
    freez(wc->now_invalidating_dirty_pages);
    wc->now_invalidating_dirty_pages = NULL;
    wc->cleanup_thread_invalidating_dirty_pages = 0;
}
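
/*
 * Runs in a separate thread under page cache pressure: it repeatedly picks the oldest
 * committed (dirty) page that is not already being written, removes it from the
 * committed page index and punches a hole in the page cache, until the number of
 * committed pages drops below the hard limit.
 */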
static void invalidate_oldest_committed(void *arg)
{
    struct rrdengine_instance *ctx = arg;
    struct rrdengine_worker_config *wc = &ctx->worker_config;
    struct page_cache *pg_cache = &ctx->pg_cache;
    int ret;
    struct rrdeng_page_descr *descr;
    struct page_cache_descr *pg_cache_descr;
    Pvoid_t *PValue;
    Word_t Index;
    unsigned nr_committed_pages;

    do {
        uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
        for (Index = 0,
             PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
             descr = unlikely(NULL == PValue) ? NULL : *PValue;
             descr != NULL;
             PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
             descr = unlikely(NULL == PValue) ? NULL : *PValue) {
            fatal_assert(0 != descr->page_length);

            rrdeng_page_descr_mutex_lock(ctx, descr);
            pg_cache_descr = descr->pg_cache_descr;
            if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING) && pg_cache_try_get_unsafe(descr, 1)) {
                rrdeng_page_descr_mutex_unlock(ctx, descr);

                ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0);
                fatal_assert(1 == ret);
                break;
            }
            rrdeng_page_descr_mutex_unlock(ctx, descr);
        }
        uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);

        if (!descr) {
            info("Failed to invalidate any dirty pages to relieve page cache pressure.");
            goto out;
        }
        pg_cache_punch_hole(ctx, descr, 1, 1, NULL);

        uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
        nr_committed_pages = --pg_cache->committed_page_index.nr_committed_pages;
        uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);
        rrd_stat_atomic_add(&ctx->stats.flushing_pressure_page_deletions, 1);
        rrd_stat_atomic_add(&global_flushing_pressure_page_deletions, 1);
    } while (nr_committed_pages >= pg_cache_committed_hard_limit(ctx));
out:
    wc->cleanup_thread_invalidating_dirty_pages = 1;
    /* wake up event loop */
    fatal_assert(0 == uv_async_send(&wc->async));
}

void rrdeng_invalidate_oldest_committed(struct rrdengine_worker_config* wc)
{
    struct rrdengine_instance *ctx = wc->ctx;
    struct page_cache *pg_cache = &ctx->pg_cache;
    unsigned nr_committed_pages;
    int error;

    if (unlikely(ctx->quiesce != NO_QUIESCE)) /* Shutting down */
        return;

    uv_rwlock_rdlock(&pg_cache->committed_page_index.lock);
    nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages;
    uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock);

    if (nr_committed_pages >= pg_cache_committed_hard_limit(ctx)) {
        /* delete the oldest page in memory */
        if (wc->now_invalidating_dirty_pages) {
            /* already deleting a page */
            return;
        }
        errno = 0;
        error("Failed to flush dirty buffers quickly enough in dbengine instance \"%s\". "
              "Metric data are being deleted, please reduce disk load or use a faster disk.", ctx->dbfiles_path);

        wc->now_invalidating_dirty_pages = mallocz(sizeof(*wc->now_invalidating_dirty_pages));
        wc->cleanup_thread_invalidating_dirty_pages = 0;

        error = uv_thread_create(wc->now_invalidating_dirty_pages, invalidate_oldest_committed, ctx);
        if (error) {
            error("uv_thread_create(): %s", uv_strerror(error));
            freez(wc->now_invalidating_dirty_pages);
            wc->now_invalidating_dirty_pages = NULL;
        }
    }
}
void flush_pages_cb(uv_fs_t* req)
{
    struct rrdengine_worker_config* wc = req->loop->data;
    struct rrdengine_instance *ctx = wc->ctx;
    struct page_cache *pg_cache = &ctx->pg_cache;
    struct extent_io_descriptor *xt_io_descr;
    struct rrdeng_page_descr *descr;
    struct page_cache_descr *pg_cache_descr;
    unsigned i, count;

    xt_io_descr = req->data;
    if (req->result < 0) {
        ++ctx->stats.io_errors;
        rrd_stat_atomic_add(&global_io_errors, 1);
        error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result));
    }
#ifdef NETDATA_INTERNAL_CHECKS
    {
        struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
        debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was written to datafile %u-%u. Waking up waiters.",
              __func__, xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
    }
#endif
    count = xt_io_descr->descr_count;
    for (i = 0 ; i < count ; ++i) {
        /* care, we don't hold the descriptor mutex */
        descr = xt_io_descr->descr_array[i];
        pg_cache_replaceQ_insert(ctx, descr);

        rrdeng_page_descr_mutex_lock(ctx, descr);
        pg_cache_descr = descr->pg_cache_descr;
        pg_cache_descr->flags &= ~(RRD_PAGE_DIRTY | RRD_PAGE_WRITE_PENDING);
        /* wake up waiters, care no reference being held */
        pg_cache_wake_up_waiters_unsafe(descr);
        rrdeng_page_descr_mutex_unlock(ctx, descr);
    }
    if (xt_io_descr->completion)
        completion_mark_complete(xt_io_descr->completion);
    uv_fs_req_cleanup(req);
    free(xt_io_descr->buf);
    freez(xt_io_descr);

    uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
    pg_cache->committed_page_index.nr_committed_pages -= count;
    uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);
    wc->inflight_dirty_pages -= count;
}
/*
 * completion must be NULL or valid.
 * Returns 0 when no flushing can take place.
 * Returns datafile bytes to be written on successful flushing initiation.
 */
static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct completion *completion)
{
    struct rrdengine_instance *ctx = wc->ctx;
    struct page_cache *pg_cache = &ctx->pg_cache;
    int ret;
    int compressed_size, max_compressed_size = 0;
    unsigned i, count, size_bytes, pos, real_io_size;
    uint32_t uncompressed_payload_length, payload_offset;
    struct rrdeng_page_descr *descr, *eligible_pages[MAX_PAGES_PER_EXTENT];
    struct page_cache_descr *pg_cache_descr;
    struct extent_io_descriptor *xt_io_descr;
    void *compressed_buf = NULL;
    Word_t descr_commit_idx_array[MAX_PAGES_PER_EXTENT];
    Pvoid_t *PValue;
    Word_t Index;
    uint8_t compression_algorithm = ctx->global_compress_alg;
    struct extent_info *extent;
    struct rrdengine_datafile *datafile;
    /* persistent structures */
    struct rrdeng_df_extent_header *header;
    struct rrdeng_df_extent_trailer *trailer;
    uLong crc;

    if (force) {
        debug(D_RRDENGINE, "Asynchronous flushing of extent has been forced by page pressure.");
    }
    uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
    for (Index = 0, count = 0, uncompressed_payload_length = 0,
         PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
         descr = unlikely(NULL == PValue) ? NULL : *PValue ;
         descr != NULL && count != rrdeng_pages_per_extent ;
         PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
         descr = unlikely(NULL == PValue) ? NULL : *PValue) {
        uint8_t page_write_pending;

        fatal_assert(0 != descr->page_length);
        page_write_pending = 0;

        rrdeng_page_descr_mutex_lock(ctx, descr);
        pg_cache_descr = descr->pg_cache_descr;
        if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING)) {
            page_write_pending = 1;
            /* care, no reference being held */
            pg_cache_descr->flags |= RRD_PAGE_WRITE_PENDING;
            uncompressed_payload_length += descr->page_length;
            descr_commit_idx_array[count] = Index;
            eligible_pages[count++] = descr;
        }
        rrdeng_page_descr_mutex_unlock(ctx, descr);

        if (page_write_pending) {
            ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0);
            fatal_assert(1 == ret);
        }
    }
    uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);

    if (!count) {
        debug(D_RRDENGINE, "%s: no pages eligible for flushing.", __func__);
        if (completion)
            completion_mark_complete(completion);
        return 0;
    }
    wc->inflight_dirty_pages += count;

    xt_io_descr = mallocz(sizeof(*xt_io_descr));
    payload_offset = sizeof(*header) + count * sizeof(header->descr[0]);
    switch (compression_algorithm) {
        case RRD_NO_COMPRESSION:
            size_bytes = payload_offset + uncompressed_payload_length + sizeof(*trailer);
            break;
        default: /* Compress */
            fatal_assert(uncompressed_payload_length < LZ4_MAX_INPUT_SIZE);
            max_compressed_size = LZ4_compressBound(uncompressed_payload_length);
            compressed_buf = mallocz(max_compressed_size);
            size_bytes = payload_offset + MAX(uncompressed_payload_length, (unsigned)max_compressed_size) + sizeof(*trailer);
            break;
    }
    ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes));
    if (unlikely(ret)) {
        fatal("posix_memalign:%s", strerror(ret));
        /* freez(xt_io_descr);*/
    }
    memset(xt_io_descr->buf, 0, ALIGN_BYTES_CEILING(size_bytes));
    (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct rrdeng_page_descr *) * count);
    xt_io_descr->descr_count = count;

    pos = 0;
    header = xt_io_descr->buf;
    header->compression_algorithm = compression_algorithm;
    header->number_of_pages = count;
    pos += sizeof(*header);

    extent = mallocz(sizeof(*extent) + count * sizeof(extent->pages[0]));
    datafile = ctx->datafiles.last;  /* TODO: check for exceeded size quota */
    extent->offset = datafile->pos;
    extent->number_of_pages = count;
    extent->datafile = datafile;
    extent->next = NULL;

    for (i = 0 ; i < count ; ++i) {
        /* This is here for performance reasons */
        xt_io_descr->descr_commit_idx_array[i] = descr_commit_idx_array[i];

        descr = xt_io_descr->descr_array[i];
        header->descr[i].type = descr->type;
        uuid_copy(*(uuid_t *)header->descr[i].uuid, *descr->id);
        header->descr[i].page_length = descr->page_length;
        header->descr[i].start_time = descr->start_time;
        header->descr[i].end_time = descr->end_time;
        pos += sizeof(header->descr[i]);
    }
    for (i = 0 ; i < count ; ++i) {
        descr = xt_io_descr->descr_array[i];
        /* care, we don't hold the descriptor mutex */
        (void) memcpy(xt_io_descr->buf + pos, descr->pg_cache_descr->page, descr->page_length);
        descr->extent = extent;
        extent->pages[i] = descr;
        pos += descr->page_length;
    }
    df_extent_insert(extent);

    switch (compression_algorithm) {
        case RRD_NO_COMPRESSION:
            header->payload_length = uncompressed_payload_length;
            break;
        default: /* Compress */
            compressed_size = LZ4_compress_default(xt_io_descr->buf + payload_offset, compressed_buf,
                                                   uncompressed_payload_length, max_compressed_size);
            ctx->stats.before_compress_bytes += uncompressed_payload_length;
            ctx->stats.after_compress_bytes += compressed_size;
            debug(D_RRDENGINE, "LZ4 compressed %"PRIu32" bytes to %d bytes.", uncompressed_payload_length, compressed_size);
            (void) memcpy(xt_io_descr->buf + payload_offset, compressed_buf, compressed_size);
            freez(compressed_buf);
            size_bytes = payload_offset + compressed_size + sizeof(*trailer);
            header->payload_length = compressed_size;
            break;
    }
    extent->size = size_bytes;
    xt_io_descr->bytes = size_bytes;
    xt_io_descr->pos = datafile->pos;
    xt_io_descr->req.data = xt_io_descr;
    xt_io_descr->completion = completion;

    trailer = xt_io_descr->buf + size_bytes - sizeof(*trailer);
    crc = crc32(0L, Z_NULL, 0);
    crc = crc32(crc, xt_io_descr->buf, size_bytes - sizeof(*trailer));
    crc32set(trailer->checksum, crc);

    real_io_size = ALIGN_BYTES_CEILING(size_bytes);
    xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size);
    ret = uv_fs_write(wc->loop, &xt_io_descr->req, datafile->file, &xt_io_descr->iov, 1, datafile->pos, flush_pages_cb);
    fatal_assert(-1 != ret);
    ctx->stats.io_write_bytes += real_io_size;
    ++ctx->stats.io_write_requests;
    ctx->stats.io_write_extent_bytes += real_io_size;
    ++ctx->stats.io_write_extents;
    do_commit_transaction(wc, STORE_DATA, xt_io_descr);
    datafile->pos += ALIGN_BYTES_CEILING(size_bytes);
    ctx->disk_space += ALIGN_BYTES_CEILING(size_bytes);
    rrdeng_test_quota(wc);

    return ALIGN_BYTES_CEILING(size_bytes);
}
static void after_delete_old_data(struct rrdengine_worker_config* wc)
{
    struct rrdengine_instance *ctx = wc->ctx;
    struct rrdengine_datafile *datafile;
    struct rrdengine_journalfile *journalfile;
    unsigned deleted_bytes, journalfile_bytes, datafile_bytes;
    int ret, error;
    char path[RRDENG_PATH_MAX];

    datafile = ctx->datafiles.first;
    journalfile = datafile->journalfile;
    datafile_bytes = datafile->pos;
    journalfile_bytes = journalfile->pos;
    deleted_bytes = 0;

    info("Deleting data and journal file pair.");
    datafile_list_delete(ctx, datafile);
    ret = destroy_journal_file(journalfile, datafile);
    if (!ret) {
        generate_journalfilepath(datafile, path, sizeof(path));
        info("Deleted journal file \"%s\".", path);
        deleted_bytes += journalfile_bytes;
    }
    ret = destroy_data_file(datafile);
    if (!ret) {
        generate_datafilepath(datafile, path, sizeof(path));
        info("Deleted data file \"%s\".", path);
        deleted_bytes += datafile_bytes;
    }
    freez(journalfile);
    freez(datafile);
    ctx->disk_space -= deleted_bytes;
    info("Reclaimed %u bytes of disk space.", deleted_bytes);

    error = uv_thread_join(wc->now_deleting_files);
    if (error) {
        error("uv_thread_join(): %s", uv_strerror(error));
    }
    freez(wc->now_deleting_files);
    /* unfreeze command processing */
    wc->now_deleting_files = NULL;
    wc->cleanup_thread_deleting_files = 0;

    aclk_data_rotated();
    rrdcontext_db_rotation();

    /* interrupt event loop */
    uv_stop(wc->loop);
}
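
/*
 * Runs in a separate thread during database rotation: walks every extent of the oldest
 * datafile, punches holes for its pages in the page cache and, when a metric becomes
 * empty, asks the metadata log to delete the corresponding dimension.
 */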
static void delete_old_data(void *arg)
{
    struct rrdengine_instance *ctx = arg;
    struct rrdengine_worker_config* wc = &ctx->worker_config;
    struct rrdengine_datafile *datafile;
    struct extent_info *extent, *next;
    struct rrdeng_page_descr *descr;
    unsigned count, i;
    uint8_t can_delete_metric;
    uuid_t metric_id;

    /* Safe to use since it will be deleted after we are done */
    datafile = ctx->datafiles.first;

    for (extent = datafile->extents.first ; extent != NULL ; extent = next) {
        count = extent->number_of_pages;
        for (i = 0 ; i < count ; ++i) {
            descr = extent->pages[i];
            can_delete_metric = pg_cache_punch_hole(ctx, descr, 0, 0, &metric_id);
            if (unlikely(can_delete_metric && ctx->metalog_ctx->initialized)) {
                /*
                 * If the metric is empty, has no active writers and if the metadata log has been initialized then
                 * attempt to delete the corresponding netdata dimension.
                 */
                metalog_delete_dimension_by_uuid(ctx->metalog_ctx, &metric_id);
            }
        }
        next = extent->next;
        freez(extent);
    }
    wc->cleanup_thread_deleting_files = 1;
    /* wake up event loop */
    fatal_assert(0 == uv_async_send(&wc->async));
}
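
/*
 * Enforces the disk space quota: rotates to a new datafile/journalfile pair when the
 * active datafile reaches its target size, and spawns delete_old_data() to reclaim the
 * oldest pair when the instance runs out of space.
 */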
void rrdeng_test_quota(struct rrdengine_worker_config* wc)
{
    struct rrdengine_instance *ctx = wc->ctx;
    struct rrdengine_datafile *datafile;
    unsigned current_size, target_size;
    uint8_t out_of_space, only_one_datafile;
    int ret, error;

    out_of_space = 0;
    /* Do not allow the pinned pages to exceed the disk space quota to avoid deadlocks */
    if (unlikely(ctx->disk_space > MAX(ctx->max_disk_space, 2 * ctx->metric_API_max_producers * RRDENG_BLOCK_SIZE))) {
        out_of_space = 1;
    }
    datafile = ctx->datafiles.last;
    current_size = datafile->pos;
    target_size = ctx->max_disk_space / TARGET_DATAFILES;
    target_size = MIN(target_size, MAX_DATAFILE_SIZE);
    target_size = MAX(target_size, MIN_DATAFILE_SIZE);
    only_one_datafile = (datafile == ctx->datafiles.first) ? 1 : 0;
    if (unlikely(current_size >= target_size || (out_of_space && only_one_datafile))) {
        /* Finalize data and journal file and create a new pair */
        wal_flush_transaction_buffer(wc);
        ret = create_new_datafile_pair(ctx, 1, ctx->last_fileno + 1);
        if (likely(!ret)) {
            ++ctx->last_fileno;
        }
    }
    if (unlikely(out_of_space && NO_QUIESCE == ctx->quiesce)) {
        /* delete old data */
        if (wc->now_deleting_files) {
            /* already deleting data */
            return;
        }
        if (NULL == ctx->datafiles.first->next) {
            error("Cannot delete data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\""
                  " to reclaim space, there are no other file pairs left.",
                  ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno);
            return;
        }
        info("Deleting data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\".",
             ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno);
        wc->now_deleting_files = mallocz(sizeof(*wc->now_deleting_files));
        wc->cleanup_thread_deleting_files = 0;

        error = uv_thread_create(wc->now_deleting_files, delete_old_data, ctx);
        if (error) {
            error("uv_thread_create(): %s", uv_strerror(error));
            freez(wc->now_deleting_files);
            wc->now_deleting_files = NULL;
        }
    }
}

static inline int rrdeng_threads_alive(struct rrdengine_worker_config* wc)
{
    if (wc->now_invalidating_dirty_pages || wc->now_deleting_files) {
        return 1;
    }
    return 0;
}

static void rrdeng_cleanup_finished_threads(struct rrdengine_worker_config* wc)
{
    struct rrdengine_instance *ctx = wc->ctx;

    if (unlikely(wc->cleanup_thread_invalidating_dirty_pages)) {
        after_invalidate_oldest_committed(wc);
    }
    if (unlikely(wc->cleanup_thread_deleting_files)) {
        after_delete_old_data(wc);
    }
    if (unlikely(SET_QUIESCE == ctx->quiesce && !rrdeng_threads_alive(wc))) {
        ctx->quiesce = QUIESCED;
        completion_mark_complete(&ctx->rrdengine_completion);
    }
}

/* return 0 on success */
int init_rrd_files(struct rrdengine_instance *ctx)
{
    return init_data_files(ctx);
}

void finalize_rrd_files(struct rrdengine_instance *ctx)
{
    return finalize_data_files(ctx);
}

void rrdeng_init_cmd_queue(struct rrdengine_worker_config* wc)
{
    wc->cmd_queue.head = wc->cmd_queue.tail = 0;
    wc->queue_size = 0;
    fatal_assert(0 == uv_cond_init(&wc->cmd_cond));
    fatal_assert(0 == uv_mutex_init(&wc->cmd_mutex));
}
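
/*
 * The command queue is a fixed-size circular buffer shared between API threads
 * (producers) and the event loop (consumer). Producers block while the queue is
 * full; every enqueue wakes the event loop through the async handle.
 */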
void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd)
{
    unsigned queue_size;

    /* wait for free space in queue */
    uv_mutex_lock(&wc->cmd_mutex);
    while ((queue_size = wc->queue_size) == RRDENG_CMD_Q_MAX_SIZE) {
        uv_cond_wait(&wc->cmd_cond, &wc->cmd_mutex);
    }
    fatal_assert(queue_size < RRDENG_CMD_Q_MAX_SIZE);
    /* enqueue command */
    wc->cmd_queue.cmd_array[wc->cmd_queue.tail] = *cmd;
    wc->cmd_queue.tail = wc->cmd_queue.tail != RRDENG_CMD_Q_MAX_SIZE - 1 ?
                         wc->cmd_queue.tail + 1 : 0;
    wc->queue_size = queue_size + 1;
    uv_mutex_unlock(&wc->cmd_mutex);

    /* wake up event loop */
    fatal_assert(0 == uv_async_send(&wc->async));
}

struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc)
{
    struct rrdeng_cmd ret;
    unsigned queue_size;

    uv_mutex_lock(&wc->cmd_mutex);
    queue_size = wc->queue_size;
    if (queue_size == 0) {
        ret.opcode = RRDENG_NOOP;
    } else {
        /* dequeue command */
        ret = wc->cmd_queue.cmd_array[wc->cmd_queue.head];
        if (queue_size == 1) {
            wc->cmd_queue.head = wc->cmd_queue.tail = 0;
        } else {
            wc->cmd_queue.head = wc->cmd_queue.head != RRDENG_CMD_Q_MAX_SIZE - 1 ?
                                 wc->cmd_queue.head + 1 : 0;
        }
        wc->queue_size = queue_size - 1;
        /* wake up producers */
        uv_cond_signal(&wc->cmd_cond);
    }
    uv_mutex_unlock(&wc->cmd_mutex);

    return ret;
}

static void load_configuration_dynamic(void)
{
    unsigned read_num = (unsigned)config_get_number(CONFIG_SECTION_DB, "dbengine pages per extent", MAX_PAGES_PER_EXTENT);
    if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT)
        rrdeng_pages_per_extent = read_num;
    else {
        error("Invalid dbengine pages per extent %u given. Using %u.", read_num, rrdeng_pages_per_extent);
        config_set_number(CONFIG_SECTION_DB, "dbengine pages per extent", rrdeng_pages_per_extent);
    }
}

void async_cb(uv_async_t *handle)
{
    uv_stop(handle->loop);
    uv_update_time(handle->loop);
    debug(D_RRDENGINE, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle));
}

/* Flushes dirty pages when timer expires */
#define TIMER_PERIOD_MS (1000)
void timer_cb(uv_timer_t* handle)
{
    worker_is_busy(RRDENG_MAX_OPCODE + 1);

    struct rrdengine_worker_config* wc = handle->data;
    struct rrdengine_instance *ctx = wc->ctx;

    uv_stop(handle->loop);
    uv_update_time(handle->loop);
    if (unlikely(!ctx->metalog_ctx->initialized)) {
        worker_is_idle();
        return; /* Wait for the metadata log to initialize */
    }
    rrdeng_test_quota(wc);
    debug(D_RRDENGINE, "%s: timeout reached.", __func__);
    if (likely(!wc->now_deleting_files && !wc->now_invalidating_dirty_pages)) {
        /* There is free space so we can write to disk and we are not actively deleting dirty buffers */
        struct page_cache *pg_cache = &ctx->pg_cache;
        unsigned long total_bytes, bytes_written, nr_committed_pages, bytes_to_write = 0, producers, low_watermark,
                      high_watermark;

        uv_rwlock_rdlock(&pg_cache->committed_page_index.lock);
        nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages;
        uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock);
        producers = ctx->metric_API_max_producers;
        /* are flushable pages more than 25% of the maximum page cache size */
        high_watermark = (ctx->max_cache_pages * 25LLU) / 100;
        low_watermark = (ctx->max_cache_pages * 5LLU) / 100; /* 5%, must be smaller than high_watermark */

        /* Flush more pages only if disk can keep up */
        if (wc->inflight_dirty_pages < high_watermark + producers) {
            if (nr_committed_pages > producers &&
                /* committed to be written pages are more than the produced number */
                nr_committed_pages - producers > high_watermark) {
                /* Flushing speed must increase to stop page cache from filling with dirty pages */
                bytes_to_write = (nr_committed_pages - producers - low_watermark) * RRDENG_BLOCK_SIZE;
            }
            bytes_to_write = MAX(DATAFILE_IDEAL_IO_SIZE, bytes_to_write);

            debug(D_RRDENGINE, "Flushing pages to disk.");
            for (total_bytes = bytes_written = do_flush_pages(wc, 0, NULL) ;
                 bytes_written && (total_bytes < bytes_to_write) ;
                 total_bytes += bytes_written) {
                bytes_written = do_flush_pages(wc, 0, NULL);
            }
        }
    }
    load_configuration_dynamic();
#ifdef NETDATA_INTERNAL_CHECKS
    {
        char buf[4096];
        debug(D_RRDENGINE, "%s", get_rrdeng_statistics(wc->ctx, buf, sizeof(buf)));
    }
#endif
    worker_is_idle();
}

#define MAX_CMD_BATCH_SIZE (256)
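
/*
 * Main event loop of the dbengine worker thread. It drains the command queue in batches
 * of up to MAX_CMD_BATCH_SIZE, dispatches opcodes (reads, commits, flushes, quiesce,
 * shutdown), and runs the libuv handles (timer, async) until shutdown is requested and
 * all helper threads have finished.
 */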
void rrdeng_worker(void* arg)
{
    worker_register("DBENGINE");
    worker_register_job_name(RRDENG_NOOP, "noop");
    worker_register_job_name(RRDENG_READ_PAGE, "page read");
    worker_register_job_name(RRDENG_READ_EXTENT, "extent read");
    worker_register_job_name(RRDENG_COMMIT_PAGE, "commit");
    worker_register_job_name(RRDENG_FLUSH_PAGES, "flush");
    worker_register_job_name(RRDENG_SHUTDOWN, "shutdown");
    worker_register_job_name(RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, "page lru");
    worker_register_job_name(RRDENG_QUIESCE, "quiesce");
    worker_register_job_name(RRDENG_MAX_OPCODE, "cleanup");
    worker_register_job_name(RRDENG_MAX_OPCODE + 1, "timer");

    struct rrdengine_worker_config* wc = arg;
    struct rrdengine_instance *ctx = wc->ctx;
    uv_loop_t* loop;
    int shutdown, ret;
    enum rrdeng_opcode opcode;
    uv_timer_t timer_req;
    struct rrdeng_cmd cmd;
    unsigned cmd_batch_size;

    rrdeng_init_cmd_queue(wc);

    loop = wc->loop = mallocz(sizeof(uv_loop_t));
    ret = uv_loop_init(loop);
    if (ret) {
        error("uv_loop_init(): %s", uv_strerror(ret));
        goto error_after_loop_init;
    }
    loop->data = wc;

    ret = uv_async_init(wc->loop, &wc->async, async_cb);
    if (ret) {
        error("uv_async_init(): %s", uv_strerror(ret));
        goto error_after_async_init;
    }
    wc->async.data = wc;

    wc->now_deleting_files = NULL;
    wc->cleanup_thread_deleting_files = 0;

    wc->now_invalidating_dirty_pages = NULL;
    wc->cleanup_thread_invalidating_dirty_pages = 0;
    wc->inflight_dirty_pages = 0;

    /* dirty page flushing timer */
    ret = uv_timer_init(loop, &timer_req);
    if (ret) {
        error("uv_timer_init(): %s", uv_strerror(ret));
        goto error_after_timer_init;
    }
    timer_req.data = wc;

    wc->error = 0;
    /* wake up initialization thread */
    completion_mark_complete(&ctx->rrdengine_completion);

    fatal_assert(0 == uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS));
    shutdown = 0;
    int set_name = 0;
    while (likely(shutdown == 0 || rrdeng_threads_alive(wc))) {
        worker_is_idle();
        uv_run(loop, UV_RUN_DEFAULT);
        worker_is_busy(RRDENG_MAX_OPCODE);
        rrdeng_cleanup_finished_threads(wc);

        /* wait for commands */
        cmd_batch_size = 0;
        do {
            /*
             * Avoid starving the loop when there are too many commands coming in.
             * timer_cb will interrupt the loop again to allow serving more commands.
             */
            if (unlikely(cmd_batch_size >= MAX_CMD_BATCH_SIZE))
                break;
            cmd = rrdeng_deq_cmd(wc);
            opcode = cmd.opcode;
            ++cmd_batch_size;

            if(likely(opcode != RRDENG_NOOP))
                worker_is_busy(opcode);

            switch (opcode) {
                case RRDENG_NOOP:
                    /* the command queue was empty, do nothing */
                    break;
                case RRDENG_SHUTDOWN:
                    shutdown = 1;
                    break;
                case RRDENG_QUIESCE:
                    ctx->drop_metrics_under_page_cache_pressure = 0;
                    ctx->quiesce = SET_QUIESCE;
                    fatal_assert(0 == uv_timer_stop(&timer_req));
                    uv_close((uv_handle_t *)&timer_req, NULL);
                    while (do_flush_pages(wc, 1, NULL)) {
                        ; /* Force flushing of all committed pages. */
                    }
                    wal_flush_transaction_buffer(wc);
                    if (!rrdeng_threads_alive(wc)) {
                        ctx->quiesce = QUIESCED;
                        completion_mark_complete(&ctx->rrdengine_completion);
                    }
                    break;
                case RRDENG_READ_PAGE:
                    do_read_extent(wc, &cmd.read_page.page_cache_descr, 1, 0);
                    break;
                case RRDENG_READ_EXTENT:
                    do_read_extent(wc, cmd.read_extent.page_cache_descr, cmd.read_extent.page_count, 1);
                    if (unlikely(!set_name)) {
                        set_name = 1;
                        uv_thread_set_name_np(ctx->worker_config.thread, "DBENGINE");
                    }
                    break;
                case RRDENG_COMMIT_PAGE:
                    do_commit_transaction(wc, STORE_DATA, NULL);
                    break;
                case RRDENG_FLUSH_PAGES:
                    if (wc->now_invalidating_dirty_pages) {
                        /* Do not flush if the disk cannot keep up */
                        completion_mark_complete(cmd.completion);
                    } else {
                        (void)do_flush_pages(wc, 1, cmd.completion);
                    }
                    break;
                case RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE:
                    rrdeng_invalidate_oldest_committed(wc);
                    break;
                default:
                    debug(D_RRDENGINE, "%s: default.", __func__);
                    break;
            }
        } while (opcode != RRDENG_NOOP);
    }

    /* cleanup operations of the event loop */
    info("Shutting down RRD engine event loop.");

    /*
     * uv_async_send after uv_close does not seem to crash in linux at the moment,
     * it is however undocumented behaviour and we need to be aware if this becomes
     * an issue in the future.
     */
    uv_close((uv_handle_t *)&wc->async, NULL);

    while (do_flush_pages(wc, 1, NULL)) {
        ; /* Force flushing of all committed pages. */
    }
    wal_flush_transaction_buffer(wc);
    uv_run(loop, UV_RUN_DEFAULT);

    info("Shutting down RRD engine event loop complete.");
    /* TODO: don't let the API block by waiting to enqueue commands */
    uv_cond_destroy(&wc->cmd_cond);
/*  uv_mutex_destroy(&wc->cmd_mutex); */
    fatal_assert(0 == uv_loop_close(loop));
    freez(loop);

    worker_unregister();
    return;

error_after_timer_init:
    uv_close((uv_handle_t *)&wc->async, NULL);
error_after_async_init:
    fatal_assert(0 == uv_loop_close(loop));
error_after_loop_init:
    freez(loop);
    wc->error = UV_EAGAIN;
    /* wake up initialization thread */
    completion_mark_complete(&ctx->rrdengine_completion);
    worker_unregister();
}
/* C entry point for development purposes
 * make "LDFLAGS=-errdengine_main"
 */
void rrdengine_main(void)
{
    int ret;
    struct rrdengine_instance *ctx;

    sanity_check();
    ret = rrdeng_init(NULL, &ctx, "/tmp", RRDENG_MIN_PAGE_CACHE_SIZE_MB, RRDENG_MIN_DISK_SPACE_MB, 0);
    if (ret) {
        exit(ret);
    }
    rrdeng_exit(ctx);
    fprintf(stderr, "Hello world!");
    exit(0);
}