rrdengine.c

  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #define NETDATA_RRD_INTERNALS
  3. #include "rrdengine.h"
  4. rrdeng_stats_t global_io_errors = 0;
  5. rrdeng_stats_t global_fs_errors = 0;
  6. rrdeng_stats_t rrdeng_reserved_file_descriptors = 0;
  7. rrdeng_stats_t global_pg_cache_over_half_dirty_events = 0;
  8. rrdeng_stats_t global_flushing_pressure_page_deletions = 0;
  9. static unsigned pages_per_extent = MAX_PAGES_PER_EXTENT;
  10. #if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2)
  11. #error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2)
  12. #endif
  13. void *dbengine_page_alloc() {
  14. void *page = netdata_mmap(NULL, RRDENG_BLOCK_SIZE, MAP_PRIVATE, enable_ksm);
  15. if(!page) fatal("Cannot allocate dbengine page cache page, with mmap()");
  16. return page;
  17. }
  18. void dbengine_page_free(void *page) {
  19. munmap(page, RRDENG_BLOCK_SIZE);
  20. }
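/*
 * Page buffers are fixed-size RRDENG_BLOCK_SIZE allocations taken straight from
 * the kernel via netdata_mmap() (passing the enable_ksm setting through) and
 * they must be returned with dbengine_page_free(), i.e. munmap(), never free().
 * A rough usage sketch, assuming a payload that fits in one page:
 *
 *     void *page = dbengine_page_alloc();          // RRDENG_BLOCK_SIZE bytes
 *     memcpy(page, payload, payload_length);       // payload_length <= RRDENG_BLOCK_SIZE
 *     ...
 *     dbengine_page_free(page);
 */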
  21. static void sanity_check(void)
  22. {
  23. BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2));
  24. /* Magic numbers must fit in the super-blocks */
  25. BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) > RRDENG_MAGIC_SZ);
  26. BUILD_BUG_ON(strlen(RRDENG_JF_MAGIC) > RRDENG_MAGIC_SZ);
  27. /* Version strings must fit in the super-blocks */
  28. BUILD_BUG_ON(strlen(RRDENG_DF_VER) > RRDENG_VER_SZ);
  29. BUILD_BUG_ON(strlen(RRDENG_JF_VER) > RRDENG_VER_SZ);
  30. /* Data file super-block cannot be larger than RRDENG_BLOCK_SIZE */
  31. BUILD_BUG_ON(RRDENG_DF_SB_PADDING_SZ < 0);
  32. BUILD_BUG_ON(sizeof(uuid_t) != UUID_SZ); /* check UUID size */
  33. /* page count must fit in 8 bits */
  34. BUILD_BUG_ON(MAX_PAGES_PER_EXTENT > 255);
  35. /* the number of cached extents must fit in the 32-bit allocation/in-flight bitmaps (one bit per extent) */
  36. BUILD_BUG_ON(MAX_CACHED_EXTENTS > 32);
  37. /* page info scratch space must be able to hold 2 32-bit integers */
  38. BUILD_BUG_ON(sizeof(((struct rrdeng_page_info *)0)->scratch) < 2 * sizeof(uint32_t));
  39. }
  40. /* Always inserts at the tail of the replacement queue. */
  41. static inline void xt_cache_replaceQ_insert(struct rrdengine_worker_config* wc,
  42. struct extent_cache_element *xt_cache_elem)
  43. {
  44. struct extent_cache *xt_cache = &wc->xt_cache;
  45. xt_cache_elem->prev = NULL;
  46. xt_cache_elem->next = NULL;
  47. if (likely(NULL != xt_cache->replaceQ_tail)) {
  48. xt_cache_elem->prev = xt_cache->replaceQ_tail;
  49. xt_cache->replaceQ_tail->next = xt_cache_elem;
  50. }
  51. if (unlikely(NULL == xt_cache->replaceQ_head)) {
  52. xt_cache->replaceQ_head = xt_cache_elem;
  53. }
  54. xt_cache->replaceQ_tail = xt_cache_elem;
  55. }
  56. static inline void xt_cache_replaceQ_delete(struct rrdengine_worker_config* wc,
  57. struct extent_cache_element *xt_cache_elem)
  58. {
  59. struct extent_cache *xt_cache = &wc->xt_cache;
  60. struct extent_cache_element *prev, *next;
  61. prev = xt_cache_elem->prev;
  62. next = xt_cache_elem->next;
  63. if (likely(NULL != prev)) {
  64. prev->next = next;
  65. }
  66. if (likely(NULL != next)) {
  67. next->prev = prev;
  68. }
  69. if (unlikely(xt_cache_elem == xt_cache->replaceQ_head)) {
  70. xt_cache->replaceQ_head = next;
  71. }
  72. if (unlikely(xt_cache_elem == xt_cache->replaceQ_tail)) {
  73. xt_cache->replaceQ_tail = prev;
  74. }
  75. xt_cache_elem->prev = xt_cache_elem->next = NULL;
  76. }
  77. static inline void xt_cache_replaceQ_set_hot(struct rrdengine_worker_config* wc,
  78. struct extent_cache_element *xt_cache_elem)
  79. {
  80. xt_cache_replaceQ_delete(wc, xt_cache_elem);
  81. xt_cache_replaceQ_insert(wc, xt_cache_elem);
  82. }
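/*
 * The extent cache replacement queue is a doubly-linked LRU list: elements are
 * always (re)inserted at the tail, so the head holds the least recently used
 * entry and is the preferred eviction candidate. xt_cache_replaceQ_set_hot()
 * is just delete-plus-insert, i.e. a move-to-tail on access.
 */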
  83. /* Returns the index of the cached extent if it was successfully inserted in the extent cache, otherwise -1 */
  84. static int try_insert_into_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent)
  85. {
  86. struct extent_cache *xt_cache = &wc->xt_cache;
  87. struct extent_cache_element *xt_cache_elem;
  88. unsigned idx;
  89. int ret;
  90. ret = find_first_zero(xt_cache->allocation_bitmap);
  91. if (-1 == ret || ret >= MAX_CACHED_EXTENTS) {
  92. for (xt_cache_elem = xt_cache->replaceQ_head ; NULL != xt_cache_elem ; xt_cache_elem = xt_cache_elem->next) {
  93. idx = xt_cache_elem - xt_cache->extent_array;
  94. if (!check_bit(xt_cache->inflight_bitmap, idx)) {
  95. xt_cache_replaceQ_delete(wc, xt_cache_elem);
  96. break;
  97. }
  98. }
  99. if (NULL == xt_cache_elem)
  100. return -1;
  101. } else {
  102. idx = (unsigned)ret;
  103. xt_cache_elem = &xt_cache->extent_array[idx];
  104. }
  105. xt_cache_elem->extent = extent;
  106. xt_cache_elem->fileno = extent->datafile->fileno;
  107. xt_cache_elem->inflight_io_descr = NULL;
  108. xt_cache_replaceQ_insert(wc, xt_cache_elem);
  109. modify_bit(&xt_cache->allocation_bitmap, idx, 1);
  110. return (int)idx;
  111. }
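/*
 * Slot selection above happens in two steps: find_first_zero() on the
 * allocation bitmap locates a free slot; if the cache is full, the LRU scan
 * from replaceQ_head evicts the first element that is not marked in-flight,
 * so extents with reads still pending are never recycled under their readers.
 */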
  112. /**
  113. * Returns 0 if the extent was found in the extent cache, 1 otherwise.
  114. * On success, sets *idx to the position of the extent inside the cache.
  115. **/
  116. static uint8_t lookup_in_xt_cache(struct rrdengine_worker_config* wc, struct extent_info *extent, unsigned *idx)
  117. {
  118. struct extent_cache *xt_cache = &wc->xt_cache;
  119. struct extent_cache_element *xt_cache_elem;
  120. unsigned i;
  121. for (i = 0 ; i < MAX_CACHED_EXTENTS ; ++i) {
  122. xt_cache_elem = &xt_cache->extent_array[i];
  123. if (check_bit(xt_cache->allocation_bitmap, i) && xt_cache_elem->extent == extent &&
  124. xt_cache_elem->fileno == extent->datafile->fileno) {
  125. *idx = i;
  126. return 0;
  127. }
  128. }
  129. return 1;
  130. }
  131. #if 0 /* disabled code */
  132. static void delete_from_xt_cache(struct rrdengine_worker_config* wc, unsigned idx)
  133. {
  134. struct extent_cache *xt_cache = &wc->xt_cache;
  135. struct extent_cache_element *xt_cache_elem;
  136. xt_cache_elem = &xt_cache->extent_array[idx];
  137. xt_cache_replaceQ_delete(wc, xt_cache_elem);
  138. xt_cache_elem->extent = NULL;
  139. modify_bit(&wc->xt_cache.allocation_bitmap, idx, 0); /* invalidate it */
  140. modify_bit(&wc->xt_cache.inflight_bitmap, idx, 0); /* not in-flight anymore */
  141. }
  142. #endif
  143. void enqueue_inflight_read_to_xt_cache(struct rrdengine_worker_config* wc, unsigned idx,
  144. struct extent_io_descriptor *xt_io_descr)
  145. {
  146. struct extent_cache *xt_cache = &wc->xt_cache;
  147. struct extent_cache_element *xt_cache_elem;
  148. struct extent_io_descriptor *old_next;
  149. xt_cache_elem = &xt_cache->extent_array[idx];
  150. old_next = xt_cache_elem->inflight_io_descr->next;
  151. xt_cache_elem->inflight_io_descr->next = xt_io_descr;
  152. xt_io_descr->next = old_next;
  153. }
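/*
 * While an extent read is in flight, additional readers of the same extent do
 * not issue their own disk I/O. Their extent_io_descriptor is chained onto the
 * in-flight descriptor's list here, and read_extent_cb() later walks that list
 * and serves every queued request from the single buffered copy.
 */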
  154. void read_cached_extent_cb(struct rrdengine_worker_config* wc, unsigned idx, struct extent_io_descriptor *xt_io_descr)
  155. {
  156. unsigned i, j, page_offset;
  157. struct rrdengine_instance *ctx = wc->ctx;
  158. struct rrdeng_page_descr *descr;
  159. struct page_cache_descr *pg_cache_descr;
  160. void *page;
  161. struct extent_info *extent = xt_io_descr->descr_array[0]->extent;
  162. for (i = 0 ; i < xt_io_descr->descr_count; ++i) {
  163. page = dbengine_page_alloc();
  164. descr = xt_io_descr->descr_array[i];
  165. for (j = 0, page_offset = 0 ; j < extent->number_of_pages ; ++j) {
  166. /* care, we don't hold the descriptor mutex */
  167. if (!uuid_compare(*extent->pages[j]->id, *descr->id) &&
  168. extent->pages[j]->page_length == descr->page_length &&
  169. extent->pages[j]->start_time == descr->start_time &&
  170. extent->pages[j]->end_time == descr->end_time) {
  171. break;
  172. }
  173. page_offset += extent->pages[j]->page_length;
  174. }
  175. /* care, we don't hold the descriptor mutex */
  176. (void) memcpy(page, wc->xt_cache.extent_array[idx].pages + page_offset, descr->page_length);
  177. rrdeng_page_descr_mutex_lock(ctx, descr);
  178. pg_cache_descr = descr->pg_cache_descr;
  179. pg_cache_descr->page = page;
  180. pg_cache_descr->flags |= RRD_PAGE_POPULATED;
  181. pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING;
  182. rrdeng_page_descr_mutex_unlock(ctx, descr);
  183. pg_cache_replaceQ_insert(ctx, descr);
  184. if (xt_io_descr->release_descr) {
  185. pg_cache_put(ctx, descr);
  186. } else {
  187. debug(D_RRDENGINE, "%s: Waking up waiters.", __func__);
  188. pg_cache_wake_up_waiters(ctx, descr);
  189. }
  190. }
  191. if (xt_io_descr->completion)
  192. completion_mark_complete(xt_io_descr->completion);
  193. freez(xt_io_descr);
  194. }
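/*
 * Serving a page from the extent cache: the page offset inside the cached
 * (uncompressed) extent payload is recovered by walking the extent's page list
 * and summing page_length values until the matching descriptor (same UUID,
 * length and time range) is found; page_length bytes are then copied into a
 * freshly allocated page cache page and the waiters are released.
 */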
  195. void read_extent_cb(uv_fs_t* req)
  196. {
  197. struct rrdengine_worker_config* wc = req->loop->data;
  198. struct rrdengine_instance *ctx = wc->ctx;
  199. struct extent_io_descriptor *xt_io_descr;
  200. struct rrdeng_page_descr *descr;
  201. struct page_cache_descr *pg_cache_descr;
  202. int ret;
  203. unsigned i, j, count;
  204. void *page, *uncompressed_buf = NULL;
  205. uint32_t payload_length, payload_offset, page_offset, uncompressed_payload_length = 0;
  206. uint8_t have_read_error = 0;
  207. /* persistent structures */
  208. struct rrdeng_df_extent_header *header;
  209. struct rrdeng_df_extent_trailer *trailer;
  210. uLong crc;
  211. xt_io_descr = req->data;
  212. header = xt_io_descr->buf;
  213. payload_length = header->payload_length;
  214. count = header->number_of_pages;
  215. payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count;
  216. trailer = xt_io_descr->buf + xt_io_descr->bytes - sizeof(*trailer);
  217. if (req->result < 0) {
  218. struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
  219. ++ctx->stats.io_errors;
  220. rrd_stat_atomic_add(&global_io_errors, 1);
  221. have_read_error = 1;
  222. error("%s: uv_fs_read - %s - extent at offset %"PRIu64"(%u) in datafile %u-%u.", __func__,
  223. uv_strerror((int)req->result), xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
  224. goto after_crc_check;
  225. }
  226. crc = crc32(0L, Z_NULL, 0);
  227. crc = crc32(crc, xt_io_descr->buf, xt_io_descr->bytes - sizeof(*trailer));
  228. ret = crc32cmp(trailer->checksum, crc);
  229. #ifdef NETDATA_INTERNAL_CHECKS
  230. {
  231. struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
  232. debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: %s", __func__,
  233. xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno, ret ? "FAILED" : "SUCCEEDED");
  234. }
  235. #endif
  236. if (unlikely(ret)) {
  237. struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
  238. ++ctx->stats.io_errors;
  239. rrd_stat_atomic_add(&global_io_errors, 1);
  240. have_read_error = 1;
  241. error("%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: FAILED", __func__,
  242. xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
  243. }
  244. after_crc_check:
  245. if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) {
  246. uncompressed_payload_length = 0;
  247. for (i = 0 ; i < count ; ++i) {
  248. uncompressed_payload_length += header->descr[i].page_length;
  249. }
  250. uncompressed_buf = mallocz(uncompressed_payload_length);
  251. ret = LZ4_decompress_safe(xt_io_descr->buf + payload_offset, uncompressed_buf,
  252. payload_length, uncompressed_payload_length);
  253. ctx->stats.before_decompress_bytes += payload_length;
  254. ctx->stats.after_decompress_bytes += ret;
  255. debug(D_RRDENGINE, "LZ4 decompressed %u bytes to %d bytes.", payload_length, ret);
  256. /* care, we don't hold the descriptor mutex */
  257. }
  258. {
  259. uint8_t xt_is_cached = 0;
  260. unsigned xt_idx;
  261. struct extent_info *extent = xt_io_descr->descr_array[0]->extent;
  262. xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx);
  263. if (xt_is_cached && check_bit(wc->xt_cache.inflight_bitmap, xt_idx)) {
  264. struct extent_cache *xt_cache = &wc->xt_cache;
  265. struct extent_cache_element *xt_cache_elem = &xt_cache->extent_array[xt_idx];
  266. struct extent_io_descriptor *curr, *next;
  267. if (have_read_error) {
  268. memset(xt_cache_elem->pages, 0, sizeof(xt_cache_elem->pages));
  269. } else if (RRD_NO_COMPRESSION == header->compression_algorithm) {
  270. (void)memcpy(xt_cache_elem->pages, xt_io_descr->buf + payload_offset, payload_length);
  271. } else {
  272. (void)memcpy(xt_cache_elem->pages, uncompressed_buf, uncompressed_payload_length);
  273. }
  274. /* complete all connected in-flight read requests */
  275. for (curr = xt_cache_elem->inflight_io_descr->next ; curr ; curr = next) {
  276. next = curr->next;
  277. read_cached_extent_cb(wc, xt_idx, curr);
  278. }
  279. xt_cache_elem->inflight_io_descr = NULL;
  280. modify_bit(&xt_cache->inflight_bitmap, xt_idx, 0); /* not in-flight anymore */
  281. }
  282. }
  283. for (i = 0, page_offset = 0; i < count; page_offset += header->descr[i++].page_length) {
  284. uint8_t is_prefetched_page;
  285. descr = NULL;
  286. for (j = 0 ; j < xt_io_descr->descr_count; ++j) {
  287. struct rrdeng_page_descr *descrj;
  288. descrj = xt_io_descr->descr_array[j];
  289. /* care, we don't hold the descriptor mutex */
  290. if (!uuid_compare(*(uuid_t *) header->descr[i].uuid, *descrj->id) &&
  291. header->descr[i].page_length == descrj->page_length &&
  292. header->descr[i].start_time == descrj->start_time &&
  293. header->descr[i].end_time == descrj->end_time) {
  294. descr = descrj;
  295. break;
  296. }
  297. }
  298. is_prefetched_page = 0;
  299. if (!descr) { /* This extent page has not been requested. Try populating it for locality (best effort). */
  300. descr = pg_cache_lookup_unpopulated_and_lock(ctx, (uuid_t *)header->descr[i].uuid,
  301. header->descr[i].start_time);
  302. if (!descr)
  303. continue; /* Failed to reserve a suitable page */
  304. is_prefetched_page = 1;
  305. }
  306. page = dbengine_page_alloc();
  307. /* care, we don't hold the descriptor mutex */
  308. if (have_read_error) {
  309. /* Fill the page with empty slots; this relies on SN_EMPTY_SLOT being 0 so that memset() produces valid empty values. */
  310. memset(page, SN_EMPTY_SLOT, descr->page_length);
  311. } else if (RRD_NO_COMPRESSION == header->compression_algorithm) {
  312. (void) memcpy(page, xt_io_descr->buf + payload_offset + page_offset, descr->page_length);
  313. } else {
  314. (void) memcpy(page, uncompressed_buf + page_offset, descr->page_length);
  315. }
  316. rrdeng_page_descr_mutex_lock(ctx, descr);
  317. pg_cache_descr = descr->pg_cache_descr;
  318. pg_cache_descr->page = page;
  319. pg_cache_descr->flags |= RRD_PAGE_POPULATED;
  320. pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING;
  321. rrdeng_page_descr_mutex_unlock(ctx, descr);
  322. pg_cache_replaceQ_insert(ctx, descr);
  323. if (xt_io_descr->release_descr || is_prefetched_page) {
  324. pg_cache_put(ctx, descr);
  325. } else {
  326. debug(D_RRDENGINE, "%s: Waking up waiters.", __func__);
  327. pg_cache_wake_up_waiters(ctx, descr);
  328. }
  329. }
  330. if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) {
  331. freez(uncompressed_buf);
  332. }
  333. if (xt_io_descr->completion)
  334. completion_mark_complete(xt_io_descr->completion);
  335. uv_fs_req_cleanup(req);
  336. free(xt_io_descr->buf);
  337. freez(xt_io_descr);
  338. }
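/*
 * On-disk extent layout as consumed above (struct sizes come from rrdengine.h):
 *
 *     rrdeng_df_extent_header        payload_length, compression, page count
 *     descr[0 .. number_of_pages-1]  per-page UUID, page_length, start/end time
 *     payload                        raw or LZ4-compressed concatenation of pages
 *     rrdeng_df_extent_trailer       CRC32 of everything before it
 *
 * On a read or CRC error the affected pages are filled with SN_EMPTY_SLOT
 * values instead of leaving their waiters blocked.
 */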
  339. static void do_read_extent(struct rrdengine_worker_config* wc,
  340. struct rrdeng_page_descr **descr,
  341. unsigned count,
  342. uint8_t release_descr)
  343. {
  344. struct rrdengine_instance *ctx = wc->ctx;
  345. struct page_cache_descr *pg_cache_descr;
  346. int ret;
  347. unsigned i, size_bytes, pos, real_io_size;
  348. // uint32_t payload_length;
  349. struct extent_io_descriptor *xt_io_descr;
  350. struct rrdengine_datafile *datafile;
  351. struct extent_info *extent = descr[0]->extent;
  352. uint8_t xt_is_cached = 0, xt_is_inflight = 0;
  353. unsigned xt_idx;
  354. datafile = extent->datafile;
  355. pos = extent->offset;
  356. size_bytes = extent->size;
  357. xt_io_descr = callocz(1, sizeof(*xt_io_descr));
  358. for (i = 0 ; i < count; ++i) {
  359. rrdeng_page_descr_mutex_lock(ctx, descr[i]);
  360. pg_cache_descr = descr[i]->pg_cache_descr;
  361. pg_cache_descr->flags |= RRD_PAGE_READ_PENDING;
  362. // payload_length = descr[i]->page_length;
  363. rrdeng_page_descr_mutex_unlock(ctx, descr[i]);
  364. xt_io_descr->descr_array[i] = descr[i];
  365. }
  366. xt_io_descr->descr_count = count;
  367. xt_io_descr->bytes = size_bytes;
  368. xt_io_descr->pos = pos;
  369. xt_io_descr->req.data = xt_io_descr;
  370. xt_io_descr->completion = NULL;
  371. /* xt_io_descr->descr_commit_idx_array[0] */
  372. xt_io_descr->release_descr = release_descr;
  373. xt_is_cached = !lookup_in_xt_cache(wc, extent, &xt_idx);
  374. if (xt_is_cached) {
  375. xt_cache_replaceQ_set_hot(wc, &wc->xt_cache.extent_array[xt_idx]);
  376. xt_is_inflight = check_bit(wc->xt_cache.inflight_bitmap, xt_idx);
  377. if (xt_is_inflight) {
  378. enqueue_inflight_read_to_xt_cache(wc, xt_idx, xt_io_descr);
  379. return;
  380. }
  381. return read_cached_extent_cb(wc, xt_idx, xt_io_descr);
  382. } else {
  383. ret = try_insert_into_xt_cache(wc, extent);
  384. if (-1 != ret) {
  385. xt_idx = (unsigned)ret;
  386. modify_bit(&wc->xt_cache.inflight_bitmap, xt_idx, 1);
  387. wc->xt_cache.extent_array[xt_idx].inflight_io_descr = xt_io_descr;
  388. }
  389. }
  390. ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes));
  391. if (unlikely(ret)) {
  392. fatal("posix_memalign:%s", strerror(ret));
  393. /* freez(xt_io_descr);
  394. return;*/
  395. }
  396. real_io_size = ALIGN_BYTES_CEILING(size_bytes);
  397. xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size);
  398. ret = uv_fs_read(wc->loop, &xt_io_descr->req, datafile->file, &xt_io_descr->iov, 1, pos, read_extent_cb);
  399. fatal_assert(-1 != ret);
  400. ctx->stats.io_read_bytes += real_io_size;
  401. ++ctx->stats.io_read_requests;
  402. ctx->stats.io_read_extent_bytes += real_io_size;
  403. ++ctx->stats.io_read_extents;
  404. ctx->stats.pg_cache_backfills += count;
  405. }
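/*
 * Extent reads use a buffer padded to ALIGN_BYTES_CEILING(size) and allocated
 * with posix_memalign() on an RRDFILE_ALIGNMENT boundary, presumably to satisfy
 * the alignment requirements of datafiles opened for direct I/O; the I/O
 * statistics therefore account for real_io_size rather than the logical extent
 * size.
 */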
  406. static void commit_data_extent(struct rrdengine_worker_config* wc, struct extent_io_descriptor *xt_io_descr)
  407. {
  408. struct rrdengine_instance *ctx = wc->ctx;
  409. unsigned count, payload_length, descr_size, size_bytes;
  410. void *buf;
  411. /* persistent structures */
  412. struct rrdeng_df_extent_header *df_header;
  413. struct rrdeng_jf_transaction_header *jf_header;
  414. struct rrdeng_jf_store_data *jf_metric_data;
  415. struct rrdeng_jf_transaction_trailer *jf_trailer;
  416. uLong crc;
  417. df_header = xt_io_descr->buf;
  418. count = df_header->number_of_pages;
  419. descr_size = sizeof(*jf_metric_data->descr) * count;
  420. payload_length = sizeof(*jf_metric_data) + descr_size;
  421. size_bytes = sizeof(*jf_header) + payload_length + sizeof(*jf_trailer);
  422. buf = wal_get_transaction_buffer(wc, size_bytes);
  423. jf_header = buf;
  424. jf_header->type = STORE_DATA;
  425. jf_header->reserved = 0;
  426. jf_header->id = ctx->commit_log.transaction_id++;
  427. jf_header->payload_length = payload_length;
  428. jf_metric_data = buf + sizeof(*jf_header);
  429. jf_metric_data->extent_offset = xt_io_descr->pos;
  430. jf_metric_data->extent_size = xt_io_descr->bytes;
  431. jf_metric_data->number_of_pages = count;
  432. memcpy(jf_metric_data->descr, df_header->descr, descr_size);
  433. jf_trailer = buf + sizeof(*jf_header) + payload_length;
  434. crc = crc32(0L, Z_NULL, 0);
  435. crc = crc32(crc, buf, sizeof(*jf_header) + payload_length);
  436. crc32set(jf_trailer->checksum, crc);
  437. }
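/*
 * Journal transaction layout produced above, mirroring the data file extent:
 *
 *     rrdeng_jf_transaction_header    type = STORE_DATA, transaction id, payload_length
 *     rrdeng_jf_store_data            extent offset, extent size, number_of_pages
 *     descr[0 .. count-1]             copied verbatim from the extent header
 *     rrdeng_jf_transaction_trailer   CRC32 of header plus payload
 *
 * The buffer comes from wal_get_transaction_buffer() and reaches the journal
 * file when wal_flush_transaction_buffer() is called (on rotation, quiesce or
 * shutdown, see below).
 */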
  438. static void do_commit_transaction(struct rrdengine_worker_config* wc, uint8_t type, void *data)
  439. {
  440. switch (type) {
  441. case STORE_DATA:
  442. commit_data_extent(wc, (struct extent_io_descriptor *)data);
  443. break;
  444. default:
  445. fatal_assert(type == STORE_DATA);
  446. break;
  447. }
  448. }
  449. static void after_invalidate_oldest_committed(struct rrdengine_worker_config* wc)
  450. {
  451. int error;
  452. error = uv_thread_join(wc->now_invalidating_dirty_pages);
  453. if (error) {
  454. error("uv_thread_join(): %s", uv_strerror(error));
  455. }
  456. freez(wc->now_invalidating_dirty_pages);
  457. wc->now_invalidating_dirty_pages = NULL;
  458. wc->cleanup_thread_invalidating_dirty_pages = 0;
  459. }
  460. static void invalidate_oldest_committed(void *arg)
  461. {
  462. struct rrdengine_instance *ctx = arg;
  463. struct rrdengine_worker_config *wc = &ctx->worker_config;
  464. struct page_cache *pg_cache = &ctx->pg_cache;
  465. int ret;
  466. struct rrdeng_page_descr *descr;
  467. struct page_cache_descr *pg_cache_descr;
  468. Pvoid_t *PValue;
  469. Word_t Index;
  470. unsigned nr_committed_pages;
  471. do {
  472. uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
  473. for (Index = 0,
  474. PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
  475. descr = unlikely(NULL == PValue) ? NULL : *PValue;
  476. descr != NULL;
  477. PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
  478. descr = unlikely(NULL == PValue) ? NULL : *PValue) {
  479. fatal_assert(0 != descr->page_length);
  480. rrdeng_page_descr_mutex_lock(ctx, descr);
  481. pg_cache_descr = descr->pg_cache_descr;
  482. if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING) && pg_cache_try_get_unsafe(descr, 1)) {
  483. rrdeng_page_descr_mutex_unlock(ctx, descr);
  484. ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0);
  485. fatal_assert(1 == ret);
  486. break;
  487. }
  488. rrdeng_page_descr_mutex_unlock(ctx, descr);
  489. }
  490. uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);
  491. if (!descr) {
  492. info("Failed to invalidate any dirty pages to relieve page cache pressure.");
  493. goto out;
  494. }
  495. pg_cache_punch_hole(ctx, descr, 1, 1, NULL);
  496. uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
  497. nr_committed_pages = --pg_cache->committed_page_index.nr_committed_pages;
  498. uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);
  499. rrd_stat_atomic_add(&ctx->stats.flushing_pressure_page_deletions, 1);
  500. rrd_stat_atomic_add(&global_flushing_pressure_page_deletions, 1);
  501. } while (nr_committed_pages >= pg_cache_committed_hard_limit(ctx));
  502. out:
  503. wc->cleanup_thread_invalidating_dirty_pages = 1;
  504. /* wake up event loop */
  505. fatal_assert(0 == uv_async_send(&wc->async));
  506. }
  507. void rrdeng_invalidate_oldest_committed(struct rrdengine_worker_config* wc)
  508. {
  509. struct rrdengine_instance *ctx = wc->ctx;
  510. struct page_cache *pg_cache = &ctx->pg_cache;
  511. unsigned nr_committed_pages;
  512. int error;
  513. if (unlikely(ctx->quiesce != NO_QUIESCE)) /* Shutting down */
  514. return;
  515. uv_rwlock_rdlock(&pg_cache->committed_page_index.lock);
  516. nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages;
  517. uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock);
  518. if (nr_committed_pages >= pg_cache_committed_hard_limit(ctx)) {
  519. /* delete the oldest page in memory */
  520. if (wc->now_invalidating_dirty_pages) {
  521. /* already deleting a page */
  522. return;
  523. }
  524. errno = 0;
  525. error("Failed to flush dirty buffers quickly enough in dbengine instance \"%s\". "
  526. "Metric data are being deleted, please reduce disk load or use a faster disk.", ctx->dbfiles_path);
  527. wc->now_invalidating_dirty_pages = mallocz(sizeof(*wc->now_invalidating_dirty_pages));
  528. wc->cleanup_thread_invalidating_dirty_pages = 0;
  529. error = uv_thread_create(wc->now_invalidating_dirty_pages, invalidate_oldest_committed, ctx);
  530. if (error) {
  531. error("uv_thread_create(): %s", uv_strerror(error));
  532. freez(wc->now_invalidating_dirty_pages);
  533. wc->now_invalidating_dirty_pages = NULL;
  534. }
  535. }
  536. }
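/*
 * Page cache pressure handling: once the number of committed (dirty) pages
 * reaches pg_cache_committed_hard_limit(), flushing has fallen behind and data
 * starts being dropped. invalidate_oldest_committed() runs in a separate uv
 * thread, repeatedly picking a committed page that is not write-pending,
 * removing it from the committed index and punching it out of the page cache
 * until the count falls back below the hard limit.
 */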
  537. void flush_pages_cb(uv_fs_t* req)
  538. {
  539. struct rrdengine_worker_config* wc = req->loop->data;
  540. struct rrdengine_instance *ctx = wc->ctx;
  541. struct page_cache *pg_cache = &ctx->pg_cache;
  542. struct extent_io_descriptor *xt_io_descr;
  543. struct rrdeng_page_descr *descr;
  544. struct page_cache_descr *pg_cache_descr;
  545. unsigned i, count;
  546. xt_io_descr = req->data;
  547. if (req->result < 0) {
  548. ++ctx->stats.io_errors;
  549. rrd_stat_atomic_add(&global_io_errors, 1);
  550. error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result));
  551. }
  552. #ifdef NETDATA_INTERNAL_CHECKS
  553. {
  554. struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
  555. debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was written to datafile %u-%u. Waking up waiters.",
  556. __func__, xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
  557. }
  558. #endif
  559. count = xt_io_descr->descr_count;
  560. for (i = 0 ; i < count ; ++i) {
  561. /* care, we don't hold the descriptor mutex */
  562. descr = xt_io_descr->descr_array[i];
  563. pg_cache_replaceQ_insert(ctx, descr);
  564. rrdeng_page_descr_mutex_lock(ctx, descr);
  565. pg_cache_descr = descr->pg_cache_descr;
  566. pg_cache_descr->flags &= ~(RRD_PAGE_DIRTY | RRD_PAGE_WRITE_PENDING);
  567. /* wake up waiters, care no reference being held */
  568. pg_cache_wake_up_waiters_unsafe(descr);
  569. rrdeng_page_descr_mutex_unlock(ctx, descr);
  570. }
  571. if (xt_io_descr->completion)
  572. completion_mark_complete(xt_io_descr->completion);
  573. uv_fs_req_cleanup(req);
  574. free(xt_io_descr->buf);
  575. freez(xt_io_descr);
  576. uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
  577. pg_cache->committed_page_index.nr_committed_pages -= count;
  578. uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);
  579. wc->inflight_dirty_pages -= count;
  580. }
  581. /*
  582. * completion must be NULL or valid.
  583. * Returns 0 when no flushing can take place.
  584. * Returns datafile bytes to be written on successful flushing initiation.
  585. */
  586. static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct completion *completion)
  587. {
  588. struct rrdengine_instance *ctx = wc->ctx;
  589. struct page_cache *pg_cache = &ctx->pg_cache;
  590. int ret;
  591. int compressed_size, max_compressed_size = 0;
  592. unsigned i, count, size_bytes, pos, real_io_size;
  593. uint32_t uncompressed_payload_length, payload_offset;
  594. struct rrdeng_page_descr *descr, *eligible_pages[MAX_PAGES_PER_EXTENT];
  595. struct page_cache_descr *pg_cache_descr;
  596. struct extent_io_descriptor *xt_io_descr;
  597. void *compressed_buf = NULL;
  598. Word_t descr_commit_idx_array[MAX_PAGES_PER_EXTENT];
  599. Pvoid_t *PValue;
  600. Word_t Index;
  601. uint8_t compression_algorithm = ctx->global_compress_alg;
  602. struct extent_info *extent;
  603. struct rrdengine_datafile *datafile;
  604. /* persistent structures */
  605. struct rrdeng_df_extent_header *header;
  606. struct rrdeng_df_extent_trailer *trailer;
  607. uLong crc;
  608. if (force) {
  609. debug(D_RRDENGINE, "Asynchronous flushing of extent has been forced by page pressure.");
  610. }
  611. uv_rwlock_wrlock(&pg_cache->committed_page_index.lock);
  612. for (Index = 0, count = 0, uncompressed_payload_length = 0,
  613. PValue = JudyLFirst(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
  614. descr = unlikely(NULL == PValue) ? NULL : *PValue ;
  615. descr != NULL && count != pages_per_extent ;
  616. PValue = JudyLNext(pg_cache->committed_page_index.JudyL_array, &Index, PJE0),
  617. descr = unlikely(NULL == PValue) ? NULL : *PValue) {
  618. uint8_t page_write_pending;
  619. fatal_assert(0 != descr->page_length);
  620. page_write_pending = 0;
  621. rrdeng_page_descr_mutex_lock(ctx, descr);
  622. pg_cache_descr = descr->pg_cache_descr;
  623. if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING)) {
  624. page_write_pending = 1;
  625. /* care, no reference being held */
  626. pg_cache_descr->flags |= RRD_PAGE_WRITE_PENDING;
  627. uncompressed_payload_length += descr->page_length;
  628. descr_commit_idx_array[count] = Index;
  629. eligible_pages[count++] = descr;
  630. }
  631. rrdeng_page_descr_mutex_unlock(ctx, descr);
  632. if (page_write_pending) {
  633. ret = JudyLDel(&pg_cache->committed_page_index.JudyL_array, Index, PJE0);
  634. fatal_assert(1 == ret);
  635. }
  636. }
  637. uv_rwlock_wrunlock(&pg_cache->committed_page_index.lock);
  638. if (!count) {
  639. debug(D_RRDENGINE, "%s: no pages eligible for flushing.", __func__);
  640. if (completion)
  641. completion_mark_complete(completion);
  642. return 0;
  643. }
  644. wc->inflight_dirty_pages += count;
  645. xt_io_descr = mallocz(sizeof(*xt_io_descr));
  646. payload_offset = sizeof(*header) + count * sizeof(header->descr[0]);
  647. switch (compression_algorithm) {
  648. case RRD_NO_COMPRESSION:
  649. size_bytes = payload_offset + uncompressed_payload_length + sizeof(*trailer);
  650. break;
  651. default: /* Compress */
  652. fatal_assert(uncompressed_payload_length < LZ4_MAX_INPUT_SIZE);
  653. max_compressed_size = LZ4_compressBound(uncompressed_payload_length);
  654. compressed_buf = mallocz(max_compressed_size);
  655. size_bytes = payload_offset + MAX(uncompressed_payload_length, (unsigned)max_compressed_size) + sizeof(*trailer);
  656. break;
  657. }
  658. ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes));
  659. if (unlikely(ret)) {
  660. fatal("posix_memalign:%s", strerror(ret));
  661. /* freez(xt_io_descr);*/
  662. }
  663. memset(xt_io_descr->buf, 0, ALIGN_BYTES_CEILING(size_bytes));
  664. (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct rrdeng_page_descr *) * count);
  665. xt_io_descr->descr_count = count;
  666. pos = 0;
  667. header = xt_io_descr->buf;
  668. header->compression_algorithm = compression_algorithm;
  669. header->number_of_pages = count;
  670. pos += sizeof(*header);
  671. extent = mallocz(sizeof(*extent) + count * sizeof(extent->pages[0]));
  672. datafile = ctx->datafiles.last; /* TODO: check for exceeded size quota */
  673. extent->offset = datafile->pos;
  674. extent->number_of_pages = count;
  675. extent->datafile = datafile;
  676. extent->next = NULL;
  677. for (i = 0 ; i < count ; ++i) {
  678. /* This is here for performance reasons */
  679. xt_io_descr->descr_commit_idx_array[i] = descr_commit_idx_array[i];
  680. descr = xt_io_descr->descr_array[i];
  681. header->descr[i].type = PAGE_METRICS;
  682. uuid_copy(*(uuid_t *)header->descr[i].uuid, *descr->id);
  683. header->descr[i].page_length = descr->page_length;
  684. header->descr[i].start_time = descr->start_time;
  685. header->descr[i].end_time = descr->end_time;
  686. pos += sizeof(header->descr[i]);
  687. }
  688. for (i = 0 ; i < count ; ++i) {
  689. descr = xt_io_descr->descr_array[i];
  690. /* care, we don't hold the descriptor mutex */
  691. (void) memcpy(xt_io_descr->buf + pos, descr->pg_cache_descr->page, descr->page_length);
  692. descr->extent = extent;
  693. extent->pages[i] = descr;
  694. pos += descr->page_length;
  695. }
  696. df_extent_insert(extent);
  697. switch (compression_algorithm) {
  698. case RRD_NO_COMPRESSION:
  699. header->payload_length = uncompressed_payload_length;
  700. break;
  701. default: /* Compress */
  702. compressed_size = LZ4_compress_default(xt_io_descr->buf + payload_offset, compressed_buf,
  703. uncompressed_payload_length, max_compressed_size);
  704. ctx->stats.before_compress_bytes += uncompressed_payload_length;
  705. ctx->stats.after_compress_bytes += compressed_size;
  706. debug(D_RRDENGINE, "LZ4 compressed %"PRIu32" bytes to %d bytes.", uncompressed_payload_length, compressed_size);
  707. (void) memcpy(xt_io_descr->buf + payload_offset, compressed_buf, compressed_size);
  708. freez(compressed_buf);
  709. size_bytes = payload_offset + compressed_size + sizeof(*trailer);
  710. header->payload_length = compressed_size;
  711. break;
  712. }
  713. extent->size = size_bytes;
  714. xt_io_descr->bytes = size_bytes;
  715. xt_io_descr->pos = datafile->pos;
  716. xt_io_descr->req.data = xt_io_descr;
  717. xt_io_descr->completion = completion;
  718. trailer = xt_io_descr->buf + size_bytes - sizeof(*trailer);
  719. crc = crc32(0L, Z_NULL, 0);
  720. crc = crc32(crc, xt_io_descr->buf, size_bytes - sizeof(*trailer));
  721. crc32set(trailer->checksum, crc);
  722. real_io_size = ALIGN_BYTES_CEILING(size_bytes);
  723. xt_io_descr->iov = uv_buf_init((void *)xt_io_descr->buf, real_io_size);
  724. ret = uv_fs_write(wc->loop, &xt_io_descr->req, datafile->file, &xt_io_descr->iov, 1, datafile->pos, flush_pages_cb);
  725. fatal_assert(-1 != ret);
  726. ctx->stats.io_write_bytes += real_io_size;
  727. ++ctx->stats.io_write_requests;
  728. ctx->stats.io_write_extent_bytes += real_io_size;
  729. ++ctx->stats.io_write_extents;
  730. do_commit_transaction(wc, STORE_DATA, xt_io_descr);
  731. datafile->pos += ALIGN_BYTES_CEILING(size_bytes);
  732. ctx->disk_space += ALIGN_BYTES_CEILING(size_bytes);
  733. rrdeng_test_quota(wc);
  734. return ALIGN_BYTES_CEILING(size_bytes);
  735. }
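/*
 * do_flush_pages() assembles one extent per call: up to pages_per_extent dirty
 * pages are marked write-pending, removed from the committed index and packed
 * behind a header. The bytes written are, roughly,
 *
 *     size_bytes = sizeof(header) + count * sizeof(header->descr[0])
 *                  + payload (uncompressed, or LZ4-compressed)
 *                  + sizeof(trailer)
 *
 * rounded up with ALIGN_BYTES_CEILING() before being handed to uv_fs_write()
 * at the current end of the active datafile; the matching journal transaction
 * is committed right after the write is queued.
 */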
  736. static void after_delete_old_data(struct rrdengine_worker_config* wc)
  737. {
  738. struct rrdengine_instance *ctx = wc->ctx;
  739. struct rrdengine_datafile *datafile;
  740. struct rrdengine_journalfile *journalfile;
  741. unsigned deleted_bytes, journalfile_bytes, datafile_bytes;
  742. int ret, error;
  743. char path[RRDENG_PATH_MAX];
  744. datafile = ctx->datafiles.first;
  745. journalfile = datafile->journalfile;
  746. datafile_bytes = datafile->pos;
  747. journalfile_bytes = journalfile->pos;
  748. deleted_bytes = 0;
  749. info("Deleting data and journal file pair.");
  750. datafile_list_delete(ctx, datafile);
  751. ret = destroy_journal_file(journalfile, datafile);
  752. if (!ret) {
  753. generate_journalfilepath(datafile, path, sizeof(path));
  754. info("Deleted journal file \"%s\".", path);
  755. deleted_bytes += journalfile_bytes;
  756. }
  757. ret = destroy_data_file(datafile);
  758. if (!ret) {
  759. generate_datafilepath(datafile, path, sizeof(path));
  760. info("Deleted data file \"%s\".", path);
  761. deleted_bytes += datafile_bytes;
  762. }
  763. freez(journalfile);
  764. freez(datafile);
  765. ctx->disk_space -= deleted_bytes;
  766. info("Reclaimed %u bytes of disk space.", deleted_bytes);
  767. error = uv_thread_join(wc->now_deleting_files);
  768. if (error) {
  769. error("uv_thread_join(): %s", uv_strerror(error));
  770. }
  771. freez(wc->now_deleting_files);
  772. /* unfreeze command processing */
  773. wc->now_deleting_files = NULL;
  774. wc->cleanup_thread_deleting_files = 0;
  775. aclk_data_rotated();
  776. /* interrupt event loop */
  777. uv_stop(wc->loop);
  778. }
  779. static void delete_old_data(void *arg)
  780. {
  781. struct rrdengine_instance *ctx = arg;
  782. struct rrdengine_worker_config* wc = &ctx->worker_config;
  783. struct rrdengine_datafile *datafile;
  784. struct extent_info *extent, *next;
  785. struct rrdeng_page_descr *descr;
  786. unsigned count, i;
  787. uint8_t can_delete_metric;
  788. uuid_t metric_id;
  789. /* Safe to use since it will be deleted after we are done */
  790. datafile = ctx->datafiles.first;
  791. for (extent = datafile->extents.first ; extent != NULL ; extent = next) {
  792. count = extent->number_of_pages;
  793. for (i = 0 ; i < count ; ++i) {
  794. descr = extent->pages[i];
  795. can_delete_metric = pg_cache_punch_hole(ctx, descr, 0, 0, &metric_id);
  796. if (unlikely(can_delete_metric && ctx->metalog_ctx->initialized)) {
  797. /*
  798. * If the metric is empty, has no active writers and if the metadata log has been initialized then
  799. * attempt to delete the corresponding netdata dimension.
  800. */
  801. metalog_delete_dimension_by_uuid(ctx->metalog_ctx, &metric_id);
  802. }
  803. }
  804. next = extent->next;
  805. freez(extent);
  806. }
  807. wc->cleanup_thread_deleting_files = 1;
  808. /* wake up event loop */
  809. fatal_assert(0 == uv_async_send(&wc->async));
  810. }
  811. void rrdeng_test_quota(struct rrdengine_worker_config* wc)
  812. {
  813. struct rrdengine_instance *ctx = wc->ctx;
  814. struct rrdengine_datafile *datafile;
  815. unsigned current_size, target_size;
  816. uint8_t out_of_space, only_one_datafile;
  817. int ret, error;
  818. out_of_space = 0;
  819. /* Do not allow the pinned pages to exceed the disk space quota to avoid deadlocks */
  820. if (unlikely(ctx->disk_space > MAX(ctx->max_disk_space, 2 * ctx->metric_API_max_producers * RRDENG_BLOCK_SIZE))) {
  821. out_of_space = 1;
  822. }
  823. datafile = ctx->datafiles.last;
  824. current_size = datafile->pos;
  825. target_size = ctx->max_disk_space / TARGET_DATAFILES;
  826. target_size = MIN(target_size, MAX_DATAFILE_SIZE);
  827. target_size = MAX(target_size, MIN_DATAFILE_SIZE);
  828. only_one_datafile = (datafile == ctx->datafiles.first) ? 1 : 0;
  829. if (unlikely(current_size >= target_size || (out_of_space && only_one_datafile))) {
  830. /* Finalize data and journal file and create a new pair */
  831. wal_flush_transaction_buffer(wc);
  832. ret = create_new_datafile_pair(ctx, 1, ctx->last_fileno + 1);
  833. if (likely(!ret)) {
  834. ++ctx->last_fileno;
  835. }
  836. }
  837. if (unlikely(out_of_space && NO_QUIESCE == ctx->quiesce)) {
  838. /* delete old data */
  839. if (wc->now_deleting_files) {
  840. /* already deleting data */
  841. return;
  842. }
  843. if (NULL == ctx->datafiles.first->next) {
  844. error("Cannot delete data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\""
  845. " to reclaim space, there are no other file pairs left.",
  846. ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno);
  847. return;
  848. }
  849. info("Deleting data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\".",
  850. ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno);
  851. wc->now_deleting_files = mallocz(sizeof(*wc->now_deleting_files));
  852. wc->cleanup_thread_deleting_files = 0;
  853. error = uv_thread_create(wc->now_deleting_files, delete_old_data, ctx);
  854. if (error) {
  855. error("uv_thread_create(): %s", uv_strerror(error));
  856. freez(wc->now_deleting_files);
  857. wc->now_deleting_files = NULL;
  858. }
  859. }
  860. }
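/*
 * Rotation policy: a new datafile/journalfile pair is started once the active
 * datafile reaches target_size, where
 *
 *     target_size = clamp(max_disk_space / TARGET_DATAFILES,
 *                         MIN_DATAFILE_SIZE, MAX_DATAFILE_SIZE)
 *
 * and the oldest pair is deleted in a separate thread when the disk space
 * quota (adjusted upwards for pinned producer pages) is exceeded, provided at
 * least one other pair remains.
 */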
  861. static inline int rrdeng_threads_alive(struct rrdengine_worker_config* wc)
  862. {
  863. if (wc->now_invalidating_dirty_pages || wc->now_deleting_files) {
  864. return 1;
  865. }
  866. return 0;
  867. }
  868. static void rrdeng_cleanup_finished_threads(struct rrdengine_worker_config* wc)
  869. {
  870. struct rrdengine_instance *ctx = wc->ctx;
  871. if (unlikely(wc->cleanup_thread_invalidating_dirty_pages)) {
  872. after_invalidate_oldest_committed(wc);
  873. }
  874. if (unlikely(wc->cleanup_thread_deleting_files)) {
  875. after_delete_old_data(wc);
  876. }
  877. if (unlikely(SET_QUIESCE == ctx->quiesce && !rrdeng_threads_alive(wc))) {
  878. ctx->quiesce = QUIESCED;
  879. completion_mark_complete(&ctx->rrdengine_completion);
  880. }
  881. }
  882. /* return 0 on success */
  883. int init_rrd_files(struct rrdengine_instance *ctx)
  884. {
  885. return init_data_files(ctx);
  886. }
  887. void finalize_rrd_files(struct rrdengine_instance *ctx)
  888. {
  889. return finalize_data_files(ctx);
  890. }
  891. void rrdeng_init_cmd_queue(struct rrdengine_worker_config* wc)
  892. {
  893. wc->cmd_queue.head = wc->cmd_queue.tail = 0;
  894. wc->queue_size = 0;
  895. fatal_assert(0 == uv_cond_init(&wc->cmd_cond));
  896. fatal_assert(0 == uv_mutex_init(&wc->cmd_mutex));
  897. }
  898. void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd)
  899. {
  900. unsigned queue_size;
  901. /* wait for free space in queue */
  902. uv_mutex_lock(&wc->cmd_mutex);
  903. while ((queue_size = wc->queue_size) == RRDENG_CMD_Q_MAX_SIZE) {
  904. uv_cond_wait(&wc->cmd_cond, &wc->cmd_mutex);
  905. }
  906. fatal_assert(queue_size < RRDENG_CMD_Q_MAX_SIZE);
  907. /* enqueue command */
  908. wc->cmd_queue.cmd_array[wc->cmd_queue.tail] = *cmd;
  909. wc->cmd_queue.tail = wc->cmd_queue.tail != RRDENG_CMD_Q_MAX_SIZE - 1 ?
  910. wc->cmd_queue.tail + 1 : 0;
  911. wc->queue_size = queue_size + 1;
  912. uv_mutex_unlock(&wc->cmd_mutex);
  913. /* wake up event loop */
  914. fatal_assert(0 == uv_async_send(&wc->async));
  915. }
  916. struct rrdeng_cmd rrdeng_deq_cmd(struct rrdengine_worker_config* wc)
  917. {
  918. struct rrdeng_cmd ret;
  919. unsigned queue_size;
  920. uv_mutex_lock(&wc->cmd_mutex);
  921. queue_size = wc->queue_size;
  922. if (queue_size == 0) {
  923. ret.opcode = RRDENG_NOOP;
  924. } else {
  925. /* dequeue command */
  926. ret = wc->cmd_queue.cmd_array[wc->cmd_queue.head];
  927. if (queue_size == 1) {
  928. wc->cmd_queue.head = wc->cmd_queue.tail = 0;
  929. } else {
  930. wc->cmd_queue.head = wc->cmd_queue.head != RRDENG_CMD_Q_MAX_SIZE - 1 ?
  931. wc->cmd_queue.head + 1 : 0;
  932. }
  933. wc->queue_size = queue_size - 1;
  934. /* wake up producers */
  935. uv_cond_signal(&wc->cmd_cond);
  936. }
  937. uv_mutex_unlock(&wc->cmd_mutex);
  938. return ret;
  939. }
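/*
 * The command queue is a fixed-size ring buffer of RRDENG_CMD_Q_MAX_SIZE
 * entries protected by cmd_mutex. Head and tail advance with the same
 * wrap-around step, equivalent to
 *
 *     index = (index + 1) % RRDENG_CMD_Q_MAX_SIZE;
 *
 * Producers block on cmd_cond while the queue is full and are signalled by the
 * consumer after every dequeue; an empty queue is reported as RRDENG_NOOP.
 */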
  940. static void load_configuration_dynamic(void)
  941. {
  942. unsigned read_num;
  943. static int printed_error = 0;
  944. read_num = (unsigned) config_get_number(CONFIG_SECTION_GLOBAL, "dbengine extent pages",
  945. MAX_PAGES_PER_EXTENT);
  946. if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) {
  947. pages_per_extent = read_num;
  948. } else if (!printed_error) {
  949. printed_error = 1;
  950. error("Invalid dbengine extent pages %u given. Defaulting to %u.", read_num, pages_per_extent);
  951. }
  952. }
  953. void async_cb(uv_async_t *handle)
  954. {
  955. uv_stop(handle->loop);
  956. uv_update_time(handle->loop);
  957. debug(D_RRDENGINE, "%s called, active=%d.", __func__, uv_is_active((uv_handle_t *)handle));
  958. }
  959. /* Flushes dirty pages when timer expires */
  960. #define TIMER_PERIOD_MS (1000)
  961. void timer_cb(uv_timer_t* handle)
  962. {
  963. worker_is_busy(RRDENG_MAX_OPCODE + 1);
  964. struct rrdengine_worker_config* wc = handle->data;
  965. struct rrdengine_instance *ctx = wc->ctx;
  966. uv_stop(handle->loop);
  967. uv_update_time(handle->loop);
  968. if (unlikely(!ctx->metalog_ctx->initialized)) {
  969. worker_is_idle();
  970. return; /* Wait for the metadata log to initialize */
  971. }
  972. rrdeng_test_quota(wc);
  973. debug(D_RRDENGINE, "%s: timeout reached.", __func__);
  974. if (likely(!wc->now_deleting_files && !wc->now_invalidating_dirty_pages)) {
  975. /* There is free space so we can write to disk and we are not actively deleting dirty buffers */
  976. struct page_cache *pg_cache = &ctx->pg_cache;
  977. unsigned long total_bytes, bytes_written, nr_committed_pages, bytes_to_write = 0, producers, low_watermark,
  978. high_watermark;
  979. uv_rwlock_rdlock(&pg_cache->committed_page_index.lock);
  980. nr_committed_pages = pg_cache->committed_page_index.nr_committed_pages;
  981. uv_rwlock_rdunlock(&pg_cache->committed_page_index.lock);
  982. producers = ctx->metric_API_max_producers;
  984. /* high watermark: flushable pages exceed 25% of the maximum page cache size */
  984. high_watermark = (ctx->max_cache_pages * 25LLU) / 100;
  985. low_watermark = (ctx->max_cache_pages * 5LLU) / 100; /* 5%, must be smaller than high_watermark */
  986. /* Flush more pages only if disk can keep up */
  987. if (wc->inflight_dirty_pages < high_watermark + producers) {
  988. if (nr_committed_pages > producers &&
  989. /* committed to be written pages are more than the produced number */
  990. nr_committed_pages - producers > high_watermark) {
  991. /* Flushing speed must increase to stop page cache from filling with dirty pages */
  992. bytes_to_write = (nr_committed_pages - producers - low_watermark) * RRDENG_BLOCK_SIZE;
  993. }
  994. bytes_to_write = MAX(DATAFILE_IDEAL_IO_SIZE, bytes_to_write);
  995. debug(D_RRDENGINE, "Flushing pages to disk.");
  996. for (total_bytes = bytes_written = do_flush_pages(wc, 0, NULL);
  997. bytes_written && (total_bytes < bytes_to_write);
  998. total_bytes += bytes_written) {
  999. bytes_written = do_flush_pages(wc, 0, NULL);
  1000. }
  1001. }
  1002. }
  1003. load_configuration_dynamic();
  1004. #ifdef NETDATA_INTERNAL_CHECKS
  1005. {
  1006. char buf[4096];
  1007. debug(D_RRDENGINE, "%s", get_rrdeng_statistics(wc->ctx, buf, sizeof(buf)));
  1008. }
  1009. #endif
  1010. worker_is_idle();
  1011. }
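/*
 * Flush pacing in timer_cb(): with the high watermark at 25% and the low
 * watermark at 5% of max_cache_pages, flushing is skipped while
 * inflight_dirty_pages reaches or exceeds high_watermark + producers, and
 * otherwise writes enough extents per tick to push the committed backlog back
 * towards the low watermark. As a purely illustrative example, with
 * max_cache_pages = 1000 and producers = 100, flushing speeds up once more
 * than 350 committed pages are pending, targeting
 * (nr_committed_pages - 100 - 50) * RRDENG_BLOCK_SIZE bytes, and never less
 * than DATAFILE_IDEAL_IO_SIZE.
 */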
  1012. #define MAX_CMD_BATCH_SIZE (256)
  1013. void rrdeng_worker(void* arg)
  1014. {
  1015. worker_register("DBENGINE");
  1016. worker_register_job_name(RRDENG_NOOP, "noop");
  1017. worker_register_job_name(RRDENG_READ_PAGE, "page read");
  1018. worker_register_job_name(RRDENG_READ_EXTENT, "extent read");
  1019. worker_register_job_name(RRDENG_COMMIT_PAGE, "commit");
  1020. worker_register_job_name(RRDENG_FLUSH_PAGES, "flush");
  1021. worker_register_job_name(RRDENG_SHUTDOWN, "shutdown");
  1022. worker_register_job_name(RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, "page lru");
  1023. worker_register_job_name(RRDENG_QUIESCE, "quiesce");
  1024. worker_register_job_name(RRDENG_MAX_OPCODE, "cleanup");
  1025. worker_register_job_name(RRDENG_MAX_OPCODE + 1, "timer");
  1026. struct rrdengine_worker_config* wc = arg;
  1027. struct rrdengine_instance *ctx = wc->ctx;
  1028. uv_loop_t* loop;
  1029. int shutdown, ret;
  1030. enum rrdeng_opcode opcode;
  1031. uv_timer_t timer_req;
  1032. struct rrdeng_cmd cmd;
  1033. unsigned cmd_batch_size;
  1034. rrdeng_init_cmd_queue(wc);
  1035. loop = wc->loop = mallocz(sizeof(uv_loop_t));
  1036. ret = uv_loop_init(loop);
  1037. if (ret) {
  1038. error("uv_loop_init(): %s", uv_strerror(ret));
  1039. goto error_after_loop_init;
  1040. }
  1041. loop->data = wc;
  1042. ret = uv_async_init(wc->loop, &wc->async, async_cb);
  1043. if (ret) {
  1044. error("uv_async_init(): %s", uv_strerror(ret));
  1045. goto error_after_async_init;
  1046. }
  1047. wc->async.data = wc;
  1048. wc->now_deleting_files = NULL;
  1049. wc->cleanup_thread_deleting_files = 0;
  1050. wc->now_invalidating_dirty_pages = NULL;
  1051. wc->cleanup_thread_invalidating_dirty_pages = 0;
  1052. wc->inflight_dirty_pages = 0;
  1053. /* dirty page flushing timer */
  1054. ret = uv_timer_init(loop, &timer_req);
  1055. if (ret) {
  1056. error("uv_timer_init(): %s", uv_strerror(ret));
  1057. goto error_after_timer_init;
  1058. }
  1059. timer_req.data = wc;
  1060. wc->error = 0;
  1061. /* wake up initialization thread */
  1062. completion_mark_complete(&ctx->rrdengine_completion);
  1063. fatal_assert(0 == uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS));
  1064. shutdown = 0;
  1065. int set_name = 0;
  1066. while (likely(shutdown == 0 || rrdeng_threads_alive(wc))) {
  1067. worker_is_idle();
  1068. uv_run(loop, UV_RUN_DEFAULT);
  1069. worker_is_busy(RRDENG_MAX_OPCODE);
  1070. rrdeng_cleanup_finished_threads(wc);
  1071. /* wait for commands */
  1072. cmd_batch_size = 0;
  1073. do {
  1074. /*
  1075. * Avoid starving the loop when there are too many commands coming in.
  1076. * timer_cb will interrupt the loop again to allow serving more commands.
  1077. */
  1078. if (unlikely(cmd_batch_size >= MAX_CMD_BATCH_SIZE))
  1079. break;
  1080. cmd = rrdeng_deq_cmd(wc);
  1081. opcode = cmd.opcode;
  1082. ++cmd_batch_size;
  1083. if(likely(opcode != RRDENG_NOOP))
  1084. worker_is_busy(opcode);
  1085. switch (opcode) {
  1086. case RRDENG_NOOP:
  1087. /* the command queue was empty, do nothing */
  1088. break;
  1089. case RRDENG_SHUTDOWN:
  1090. shutdown = 1;
  1091. break;
  1092. case RRDENG_QUIESCE:
  1093. ctx->drop_metrics_under_page_cache_pressure = 0;
  1094. ctx->quiesce = SET_QUIESCE;
  1095. fatal_assert(0 == uv_timer_stop(&timer_req));
  1096. uv_close((uv_handle_t *)&timer_req, NULL);
  1097. while (do_flush_pages(wc, 1, NULL)) {
  1098. ; /* Force flushing of all committed pages. */
  1099. }
  1100. wal_flush_transaction_buffer(wc);
  1101. if (!rrdeng_threads_alive(wc)) {
  1102. ctx->quiesce = QUIESCED;
  1103. completion_mark_complete(&ctx->rrdengine_completion);
  1104. }
  1105. break;
  1106. case RRDENG_READ_PAGE:
  1107. do_read_extent(wc, &cmd.read_page.page_cache_descr, 1, 0);
  1108. break;
  1109. case RRDENG_READ_EXTENT:
  1110. do_read_extent(wc, cmd.read_extent.page_cache_descr, cmd.read_extent.page_count, 1);
  1111. if (unlikely(!set_name)) {
  1112. set_name = 1;
  1113. uv_thread_set_name_np(ctx->worker_config.thread, "DBENGINE");
  1114. }
  1115. break;
  1116. case RRDENG_COMMIT_PAGE:
  1117. do_commit_transaction(wc, STORE_DATA, NULL);
  1118. break;
  1119. case RRDENG_FLUSH_PAGES: {
  1120. if (wc->now_invalidating_dirty_pages) {
  1121. /* Do not flush if the disk cannot keep up */
  1122. completion_mark_complete(cmd.completion);
  1123. } else {
  1124. (void)do_flush_pages(wc, 1, cmd.completion);
  1125. }
  1126. break;
  1127. }
  1128. case RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE:
  1129. rrdeng_invalidate_oldest_committed(wc);
  1130. break;
  1131. default:
  1132. debug(D_RRDENGINE, "%s: default.", __func__);
  1133. break;
  1134. }
  1135. } while (opcode != RRDENG_NOOP);
  1136. }
  1137. /* cleanup operations of the event loop */
  1138. info("Shutting down RRD engine event loop.");
  1139. /*
  1140. * uv_async_send() after uv_close() does not seem to crash on Linux at the moment;
  1141. * this is however undocumented behaviour, and we need to be aware of it in case
  1142. * it becomes an issue in the future.
  1143. */
  1144. uv_close((uv_handle_t *)&wc->async, NULL);
  1145. while (do_flush_pages(wc, 1, NULL)) {
  1146. ; /* Force flushing of all committed pages. */
  1147. }
  1148. wal_flush_transaction_buffer(wc);
  1149. uv_run(loop, UV_RUN_DEFAULT);
  1150. info("Shutting down RRD engine event loop complete.");
  1151. /* TODO: don't let the API block by waiting to enqueue commands */
  1152. uv_cond_destroy(&wc->cmd_cond);
  1153. /* uv_mutex_destroy(&wc->cmd_mutex); */
  1154. fatal_assert(0 == uv_loop_close(loop));
  1155. freez(loop);
  1156. worker_unregister();
  1157. return;
  1158. error_after_timer_init:
  1159. uv_close((uv_handle_t *)&wc->async, NULL);
  1160. error_after_async_init:
  1161. fatal_assert(0 == uv_loop_close(loop));
  1162. error_after_loop_init:
  1163. freez(loop);
  1164. wc->error = UV_EAGAIN;
  1165. /* wake up initialization thread */
  1166. completion_mark_complete(&ctx->rrdengine_completion);
  1167. worker_unregister();
  1168. }
  1169. /* C entry point for development purposes:
  1170. * make "LDFLAGS=-errdengine_main"   (i.e. -e rrdengine_main: use rrdengine_main as the program entry point)
  1171. */
  1172. void rrdengine_main(void)
  1173. {
  1174. int ret;
  1175. struct rrdengine_instance *ctx;
  1176. sanity_check();
  1177. ret = rrdeng_init(NULL, &ctx, "/tmp", RRDENG_MIN_PAGE_CACHE_SIZE_MB, RRDENG_MIN_DISK_SPACE_MB);
  1178. if (ret) {
  1179. exit(ret);
  1180. }
  1181. rrdeng_exit(ctx);
  1182. fprintf(stderr, "Hello world!");
  1183. exit(0);
  1184. }