receiver.c 36 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "rrdpush.h"
  3. #include "web/server/h2o/http_server.h"
  4. extern struct config stream_config;
  5. void receiver_state_free(struct receiver_state *rpt) {
  6. freez(rpt->key);
  7. freez(rpt->hostname);
  8. freez(rpt->registry_hostname);
  9. freez(rpt->machine_guid);
  10. freez(rpt->os);
  11. freez(rpt->timezone);
  12. freez(rpt->abbrev_timezone);
  13. freez(rpt->tags);
  14. freez(rpt->client_ip);
  15. freez(rpt->client_port);
  16. freez(rpt->program_name);
  17. freez(rpt->program_version);
  18. #ifdef ENABLE_HTTPS
  19. netdata_ssl_close(&rpt->ssl);
  20. #endif
  21. if(rpt->fd != -1) {
  22. internal_error(true, "closing socket...");
  23. close(rpt->fd);
  24. }
  25. rrdpush_decompressor_destroy(&rpt->decompressor);
  26. if(rpt->system_info)
  27. rrdhost_system_info_free(rpt->system_info);
  28. __atomic_sub_fetch(&netdata_buffers_statistics.rrdhost_receivers, sizeof(*rpt), __ATOMIC_RELAXED);
  29. freez(rpt);
  30. }
  31. #include "collectors/plugins.d/pluginsd_parser.h"
  32. // IMPORTANT: to add workers, you have to edit WORKER_PARSER_FIRST_JOB accordingly
  33. #define WORKER_RECEIVER_JOB_BYTES_READ (WORKER_PARSER_FIRST_JOB - 1)
  34. #define WORKER_RECEIVER_JOB_BYTES_UNCOMPRESSED (WORKER_PARSER_FIRST_JOB - 2)
  35. // this has to be the same at parser.h
  36. #define WORKER_RECEIVER_JOB_REPLICATION_COMPLETION (WORKER_PARSER_FIRST_JOB - 3)
  37. #if WORKER_PARSER_FIRST_JOB < 1
  38. #error The define WORKER_PARSER_FIRST_JOB needs to be at least 1
  39. #endif
  40. static inline int read_stream(struct receiver_state *r, char* buffer, size_t size) {
  41. if(unlikely(!size)) {
  42. internal_error(true, "%s() asked to read zero bytes", __FUNCTION__);
  43. return 0;
  44. }
  45. #ifdef ENABLE_H2O
  46. if (is_h2o_rrdpush(r))
  47. return (int)h2o_stream_read(r->h2o_ctx, buffer, size);
  48. #endif
  49. int tries = 100;
  50. ssize_t bytes_read;
  51. do {
  52. errno = 0;
  53. #ifdef ENABLE_HTTPS
  54. if (SSL_connection(&r->ssl))
  55. bytes_read = netdata_ssl_read(&r->ssl, buffer, size);
  56. else
  57. bytes_read = read(r->fd, buffer, size);
  58. #else
  59. bytes_read = read(r->fd, buffer, size);
  60. #endif
  61. } while(bytes_read < 0 && errno == EINTR && tries--);
  62. if((bytes_read == 0 || bytes_read == -1) && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINPROGRESS)) {
  63. netdata_log_error("STREAM: %s(): timeout while waiting for data on socket!", __FUNCTION__);
  64. bytes_read = -3;
  65. }
  66. else if (bytes_read == 0) {
  67. netdata_log_error("STREAM: %s(): EOF while reading data from socket!", __FUNCTION__);
  68. bytes_read = -1;
  69. }
  70. else if (bytes_read < 0) {
  71. netdata_log_error("STREAM: %s() failed to read from socket!", __FUNCTION__);
  72. bytes_read = -2;
  73. }
  74. return (int)bytes_read;
  75. }
  76. static inline STREAM_HANDSHAKE read_stream_error_to_reason(int code) {
  77. if(code > 0)
  78. return 0;
  79. switch(code) {
  80. case 0:
  81. // asked to read zero bytes
  82. return STREAM_HANDSHAKE_DISCONNECT_NOT_SUFFICIENT_READ_BUFFER;
  83. case -1:
  84. // EOF
  85. return STREAM_HANDSHAKE_DISCONNECT_SOCKET_EOF;
  86. case -2:
  87. // failed to read
  88. return STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED;
  89. case -3:
  90. // timeout
  91. return STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_TIMEOUT;
  92. default:
  93. // anything else
  94. return STREAM_HANDSHAKE_DISCONNECT_UNKNOWN_SOCKET_READ_ERROR;
  95. }
  96. }
  97. static inline bool receiver_read_uncompressed(struct receiver_state *r, STREAM_HANDSHAKE *reason) {
  98. #ifdef NETDATA_INTERNAL_CHECKS
  99. if(r->reader.read_buffer[r->reader.read_len] != '\0')
  100. fatal("%s(): read_buffer does not start with zero", __FUNCTION__ );
  101. #endif
  102. int bytes_read = read_stream(r, r->reader.read_buffer + r->reader.read_len, sizeof(r->reader.read_buffer) - r->reader.read_len - 1);
  103. if(unlikely(bytes_read <= 0)) {
  104. *reason = read_stream_error_to_reason(bytes_read);
  105. return false;
  106. }
  107. worker_set_metric(WORKER_RECEIVER_JOB_BYTES_READ, (NETDATA_DOUBLE)bytes_read);
  108. worker_set_metric(WORKER_RECEIVER_JOB_BYTES_UNCOMPRESSED, (NETDATA_DOUBLE)bytes_read);
  109. r->reader.read_len += bytes_read;
  110. r->reader.read_buffer[r->reader.read_len] = '\0';
  111. return true;
  112. }
  113. static inline bool receiver_read_compressed(struct receiver_state *r, STREAM_HANDSHAKE *reason) {
  114. internal_fatal(r->reader.read_buffer[r->reader.read_len] != '\0',
  115. "%s: read_buffer does not start with zero #2", __FUNCTION__ );
  116. // first use any available uncompressed data
  117. if (likely(rrdpush_decompressed_bytes_in_buffer(&r->decompressor))) {
  118. size_t available = sizeof(r->reader.read_buffer) - r->reader.read_len - 1;
  119. if (likely(available)) {
  120. size_t len = rrdpush_decompressor_get(&r->decompressor, r->reader.read_buffer + r->reader.read_len, available);
  121. if (unlikely(!len)) {
  122. internal_error(true, "decompressor returned zero length #1");
  123. return false;
  124. }
  125. r->reader.read_len += (int)len;
  126. r->reader.read_buffer[r->reader.read_len] = '\0';
  127. }
  128. else
  129. internal_fatal(true, "The line to read is too big! Already have %zd bytes in read_buffer.", r->reader.read_len);
  130. return true;
  131. }
  132. // no decompressed data available
  133. // read the compression signature of the next block
  134. if(unlikely(r->reader.read_len + r->decompressor.signature_size > sizeof(r->reader.read_buffer) - 1)) {
  135. internal_error(true, "The last incomplete line does not leave enough room for the next compression header! "
  136. "Already have %zd bytes in read_buffer.", r->reader.read_len);
  137. return false;
  138. }
  139. // read the compression signature from the stream
  140. // we have to do a loop here, because read_stream() may return less than the data we need
  141. int bytes_read = 0;
  142. do {
  143. int ret = read_stream(r, r->reader.read_buffer + r->reader.read_len + bytes_read, r->decompressor.signature_size - bytes_read);
  144. if (unlikely(ret <= 0)) {
  145. *reason = read_stream_error_to_reason(ret);
  146. return false;
  147. }
  148. bytes_read += ret;
  149. } while(unlikely(bytes_read < (int)r->decompressor.signature_size));
  150. worker_set_metric(WORKER_RECEIVER_JOB_BYTES_READ, (NETDATA_DOUBLE)bytes_read);
  151. if(unlikely(bytes_read != (int)r->decompressor.signature_size))
  152. fatal("read %d bytes, but expected compression signature of size %zu", bytes_read, r->decompressor.signature_size);
  153. size_t compressed_message_size = rrdpush_decompressor_start(&r->decompressor, r->reader.read_buffer + r->reader.read_len, bytes_read);
  154. if (unlikely(!compressed_message_size)) {
  155. internal_error(true, "multiplexed uncompressed data in compressed stream!");
  156. r->reader.read_len += bytes_read;
  157. r->reader.read_buffer[r->reader.read_len] = '\0';
  158. return true;
  159. }
  160. if(unlikely(compressed_message_size > COMPRESSION_MAX_MSG_SIZE)) {
  161. netdata_log_error("received a compressed message of %zu bytes, which is bigger than the max compressed message size supported of %zu. Ignoring message.",
  162. compressed_message_size, (size_t)COMPRESSION_MAX_MSG_SIZE);
  163. return false;
  164. }
  165. // delete compression header from our read buffer
  166. r->reader.read_buffer[r->reader.read_len] = '\0';
  167. // Read the entire compressed block of compressed data
  168. char compressed[compressed_message_size];
  169. size_t compressed_bytes_read = 0;
  170. do {
  171. size_t start = compressed_bytes_read;
  172. size_t remaining = compressed_message_size - start;
  173. int last_read_bytes = read_stream(r, &compressed[start], remaining);
  174. if (unlikely(last_read_bytes <= 0)) {
  175. *reason = read_stream_error_to_reason(last_read_bytes);
  176. return false;
  177. }
  178. compressed_bytes_read += last_read_bytes;
  179. } while(unlikely(compressed_message_size > compressed_bytes_read));
  180. worker_set_metric(WORKER_RECEIVER_JOB_BYTES_READ, (NETDATA_DOUBLE)compressed_bytes_read);
  181. // decompress the compressed block
  182. size_t bytes_to_parse = rrdpush_decompress(&r->decompressor, compressed, compressed_bytes_read);
  183. if (unlikely(!bytes_to_parse)) {
  184. internal_error(true, "no bytes to parse.");
  185. return false;
  186. }
  187. worker_set_metric(WORKER_RECEIVER_JOB_BYTES_UNCOMPRESSED, (NETDATA_DOUBLE)bytes_to_parse);
  188. // fill read buffer with decompressed data
  189. size_t len = (int) rrdpush_decompressor_get(&r->decompressor, r->reader.read_buffer + r->reader.read_len, sizeof(r->reader.read_buffer) - r->reader.read_len - 1);
  190. if (unlikely(!len)) {
  191. internal_error(true, "decompressor returned zero length #2");
  192. return false;
  193. }
  194. r->reader.read_len += (int)len;
  195. r->reader.read_buffer[r->reader.read_len] = '\0';
  196. return true;
  197. }
  198. bool plugin_is_enabled(struct plugind *cd);
  199. static void receiver_set_exit_reason(struct receiver_state *rpt, STREAM_HANDSHAKE reason, bool force) {
  200. if(force || !rpt->exit.reason)
  201. rpt->exit.reason = reason;
  202. }
  203. static inline bool receiver_should_stop(struct receiver_state *rpt) {
  204. static __thread size_t counter = 0;
  205. if(unlikely(rpt->exit.shutdown)) {
  206. receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SHUTDOWN, false);
  207. return true;
  208. }
  209. if(unlikely(!service_running(SERVICE_STREAMING))) {
  210. receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_NETDATA_EXIT, false);
  211. return true;
  212. }
  213. if(unlikely((counter++ % 1000) == 0)) {
  214. // check every 1000 lines read
  215. netdata_thread_testcancel();
  216. rpt->last_msg_t = now_monotonic_sec();
  217. }
  218. return false;
  219. }
  220. static size_t streaming_parser(struct receiver_state *rpt, struct plugind *cd, int fd, void *ssl) {
  221. size_t result = 0;
  222. PARSER *parser = NULL;
  223. {
  224. PARSER_USER_OBJECT user = {
  225. .enabled = plugin_is_enabled(cd),
  226. .host = rpt->host,
  227. .opaque = rpt,
  228. .cd = cd,
  229. .trust_durations = 1,
  230. .capabilities = rpt->capabilities,
  231. };
  232. parser = parser_init(&user, NULL, NULL, fd, PARSER_INPUT_SPLIT, ssl);
  233. }
  234. #ifdef ENABLE_H2O
  235. parser->h2o_ctx = rpt->h2o_ctx;
  236. #endif
  237. pluginsd_keywords_init(parser, PARSER_INIT_STREAMING);
  238. rrd_collector_started();
  239. // this keeps the parser with its current value
  240. // so, parser needs to be allocated before pushing it
  241. netdata_thread_cleanup_push(pluginsd_process_thread_cleanup, parser);
  242. {
  243. bool compressed_connection = rrdpush_decompression_initialize(rpt);
  244. buffered_reader_init(&rpt->reader);
  245. #ifdef NETDATA_LOG_STREAM_RECEIVE
  246. {
  247. char filename[FILENAME_MAX + 1];
  248. snprintfz(filename, FILENAME_MAX, "/tmp/stream-receiver-%s.txt", rpt->host ? rrdhost_hostname(
  249. rpt->host) : "unknown"
  250. );
  251. parser->user.stream_log_fp = fopen(filename, "w");
  252. parser->user.stream_log_repertoire = PARSER_REP_METADATA;
  253. }
  254. #endif
  255. CLEAN_BUFFER *buffer = buffer_create(sizeof(rpt->reader.read_buffer), NULL);
  256. ND_LOG_STACK lgs[] = {
  257. ND_LOG_FIELD_CB(NDF_REQUEST, line_splitter_reconstruct_line, &parser->line),
  258. ND_LOG_FIELD_CB(NDF_NIDL_NODE, parser_reconstruct_node, parser),
  259. ND_LOG_FIELD_CB(NDF_NIDL_INSTANCE, parser_reconstruct_instance, parser),
  260. ND_LOG_FIELD_CB(NDF_NIDL_CONTEXT, parser_reconstruct_context, parser),
  261. ND_LOG_FIELD_END(),
  262. };
  263. ND_LOG_STACK_PUSH(lgs);
  264. while(!receiver_should_stop(rpt)) {
  265. if(!buffered_reader_next_line(&rpt->reader, buffer)) {
  266. STREAM_HANDSHAKE reason = STREAM_HANDSHAKE_DISCONNECT_UNKNOWN_SOCKET_READ_ERROR;
  267. bool have_new_data = compressed_connection ? receiver_read_compressed(rpt, &reason)
  268. : receiver_read_uncompressed(rpt, &reason);
  269. if(unlikely(!have_new_data)) {
  270. receiver_set_exit_reason(rpt, reason, false);
  271. break;
  272. }
  273. continue;
  274. }
  275. if(unlikely(parser_action(parser, buffer->buffer))) {
  276. receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_FAILED, false);
  277. break;
  278. }
  279. buffer->len = 0;
  280. buffer->buffer[0] = '\0';
  281. }
  282. result = parser->user.data_collections_count;
  283. }
  284. // free parser with the pop function
  285. netdata_thread_cleanup_pop(1);
  286. return result;
  287. }
  288. static void rrdpush_receiver_replication_reset(RRDHOST *host) {
  289. RRDSET *st;
  290. rrdset_foreach_read(st, host) {
  291. rrdset_flag_clear(st, RRDSET_FLAG_RECEIVER_REPLICATION_IN_PROGRESS);
  292. rrdset_flag_set(st, RRDSET_FLAG_RECEIVER_REPLICATION_FINISHED);
  293. }
  294. rrdset_foreach_done(st);
  295. rrdhost_receiver_replicating_charts_zero(host);
  296. }
  297. static bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) {
  298. bool signal_rrdcontext = false;
  299. bool set_this = false;
  300. netdata_mutex_lock(&host->receiver_lock);
  301. if (!host->receiver) {
  302. rrdhost_flag_clear(host, RRDHOST_FLAG_ORPHAN);
  303. host->rrdpush_receiver_connection_counter++;
  304. __atomic_add_fetch(&localhost->connected_children_count, 1, __ATOMIC_RELAXED);
  305. host->receiver = rpt;
  306. rpt->host = host;
  307. host->child_connect_time = now_realtime_sec();
  308. host->child_disconnected_time = 0;
  309. host->child_last_chart_command = 0;
  310. host->trigger_chart_obsoletion_check = 1;
  311. if (rpt->config.health_enabled != CONFIG_BOOLEAN_NO) {
  312. if (rpt->config.alarms_delay > 0) {
  313. host->health.health_delay_up_to = now_realtime_sec() + rpt->config.alarms_delay;
  314. nd_log(NDLS_DAEMON, NDLP_DEBUG,
  315. "[%s]: Postponing health checks for %" PRId64 " seconds, because it was just connected.",
  316. rrdhost_hostname(host),
  317. (int64_t) rpt->config.alarms_delay);
  318. }
  319. }
  320. host->health_log.health_log_history = rpt->config.alarms_history;
  321. // this is a test
  322. // if(rpt->hops <= host->sender->hops)
  323. // rrdpush_sender_thread_stop(host, "HOPS MISMATCH", false);
  324. signal_rrdcontext = true;
  325. rrdpush_receiver_replication_reset(host);
  326. rrdhost_flag_clear(rpt->host, RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED);
  327. aclk_queue_node_info(rpt->host, true);
  328. rrdpush_reset_destinations_postpone_time(host);
  329. set_this = true;
  330. }
  331. netdata_mutex_unlock(&host->receiver_lock);
  332. if(signal_rrdcontext)
  333. rrdcontext_host_child_connected(host);
  334. return set_this;
  335. }
  336. static void rrdhost_clear_receiver(struct receiver_state *rpt) {
  337. bool signal_rrdcontext = false;
  338. RRDHOST *host = rpt->host;
  339. if(host) {
  340. netdata_mutex_lock(&host->receiver_lock);
  341. // Make sure that we detach this thread and don't kill a freshly arriving receiver
  342. if(host->receiver == rpt) {
  343. __atomic_sub_fetch(&localhost->connected_children_count, 1, __ATOMIC_RELAXED);
  344. rrdhost_flag_set(rpt->host, RRDHOST_FLAG_RRDPUSH_RECEIVER_DISCONNECTED);
  345. host->trigger_chart_obsoletion_check = 0;
  346. host->child_connect_time = 0;
  347. host->child_disconnected_time = now_realtime_sec();
  348. if (rpt->config.health_enabled == CONFIG_BOOLEAN_AUTO)
  349. host->health.health_enabled = 0;
  350. rrdpush_sender_thread_stop(host, STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT, false);
  351. signal_rrdcontext = true;
  352. rrdpush_receiver_replication_reset(host);
  353. rrdhost_flag_set(host, RRDHOST_FLAG_ORPHAN);
  354. host->receiver = NULL;
  355. host->rrdpush_last_receiver_exit_reason = rpt->exit.reason;
  356. }
  357. netdata_mutex_unlock(&host->receiver_lock);
  358. if(signal_rrdcontext)
  359. rrdcontext_host_child_disconnected(host);
  360. rrdpush_reset_destinations_postpone_time(host);
  361. }
  362. }
  363. bool stop_streaming_receiver(RRDHOST *host, STREAM_HANDSHAKE reason) {
  364. bool ret = false;
  365. netdata_mutex_lock(&host->receiver_lock);
  366. if(host->receiver) {
  367. if(!host->receiver->exit.shutdown) {
  368. host->receiver->exit.shutdown = true;
  369. receiver_set_exit_reason(host->receiver, reason, true);
  370. shutdown(host->receiver->fd, SHUT_RDWR);
  371. }
  372. netdata_thread_cancel(host->receiver->thread);
  373. }
  374. int count = 2000;
  375. while (host->receiver && count-- > 0) {
  376. netdata_mutex_unlock(&host->receiver_lock);
  377. // let the lock for the receiver thread to exit
  378. sleep_usec(1 * USEC_PER_MS);
  379. netdata_mutex_lock(&host->receiver_lock);
  380. }
  381. if(host->receiver)
  382. netdata_log_error("STREAM '%s' [receive from [%s]:%s]: "
  383. "thread %d takes too long to stop, giving up..."
  384. , rrdhost_hostname(host)
  385. , host->receiver->client_ip, host->receiver->client_port
  386. , host->receiver->tid);
  387. else
  388. ret = true;
  389. netdata_mutex_unlock(&host->receiver_lock);
  390. return ret;
  391. }
  392. static void rrdpush_send_error_on_taken_over_connection(struct receiver_state *rpt, const char *msg) {
  393. (void) send_timeout(
  394. #ifdef ENABLE_HTTPS
  395. &rpt->ssl,
  396. #endif
  397. rpt->fd,
  398. (char *)msg,
  399. strlen(msg),
  400. 0,
  401. 5);
  402. }
  403. void rrdpush_receive_log_status(struct receiver_state *rpt, const char *msg, const char *status, ND_LOG_FIELD_PRIORITY priority) {
  404. // this function may be called BEFORE we spawn the receiver thread
  405. // so, we need to add the fields again (it does not harm)
  406. ND_LOG_STACK lgs[] = {
  407. ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->client_ip),
  408. ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->client_port),
  409. ND_LOG_FIELD_TXT(NDF_NIDL_NODE, (rpt->hostname && *rpt->hostname) ? rpt->hostname : ""),
  410. ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, status),
  411. ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &streaming_from_child_msgid),
  412. ND_LOG_FIELD_END(),
  413. };
  414. ND_LOG_STACK_PUSH(lgs);
  415. nd_log(NDLS_ACCESS, priority, "api_key:'%s' machine_guid:'%s' msg:'%s'"
  416. , (rpt->key && *rpt->key)? rpt->key : ""
  417. , (rpt->machine_guid && *rpt->machine_guid) ? rpt->machine_guid : ""
  418. , msg);
  419. nd_log(NDLS_DAEMON, priority, "STREAM_RECEIVER for '%s': %s %s%s%s"
  420. , (rpt->hostname && *rpt->hostname) ? rpt->hostname : ""
  421. , msg
  422. , rpt->exit.reason != STREAM_HANDSHAKE_NEVER?" (":""
  423. , stream_handshake_error_to_string(rpt->exit.reason)
  424. , rpt->exit.reason != STREAM_HANDSHAKE_NEVER?")":""
  425. );
  426. }
  427. static void rrdpush_receive(struct receiver_state *rpt)
  428. {
  429. rpt->config.mode = default_rrd_memory_mode;
  430. rpt->config.history = default_rrd_history_entries;
  431. rpt->config.health_enabled = (int)default_health_enabled;
  432. rpt->config.alarms_delay = 60;
  433. rpt->config.alarms_history = HEALTH_LOG_DEFAULT_HISTORY;
  434. rpt->config.rrdpush_enabled = (int)default_rrdpush_enabled;
  435. rpt->config.rrdpush_destination = default_rrdpush_destination;
  436. rpt->config.rrdpush_api_key = default_rrdpush_api_key;
  437. rpt->config.rrdpush_send_charts_matching = default_rrdpush_send_charts_matching;
  438. rpt->config.rrdpush_enable_replication = default_rrdpush_enable_replication;
  439. rpt->config.rrdpush_seconds_to_replicate = default_rrdpush_seconds_to_replicate;
  440. rpt->config.rrdpush_replication_step = default_rrdpush_replication_step;
  441. rpt->config.update_every = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "update every", rpt->config.update_every);
  442. if(rpt->config.update_every < 0) rpt->config.update_every = 1;
  443. rpt->config.history = (int)appconfig_get_number(&stream_config, rpt->key, "default history", rpt->config.history);
  444. rpt->config.history = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "history", rpt->config.history);
  445. if(rpt->config.history < 5) rpt->config.history = 5;
  446. rpt->config.mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->key, "default memory mode", rrd_memory_mode_name(rpt->config.mode)));
  447. rpt->config.mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->machine_guid, "memory mode", rrd_memory_mode_name(rpt->config.mode)));
  448. if (unlikely(rpt->config.mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled)) {
  449. netdata_log_error("STREAM '%s' [receive from %s:%s]: "
  450. "dbengine is not enabled, falling back to default."
  451. , rpt->hostname
  452. , rpt->client_ip, rpt->client_port
  453. );
  454. rpt->config.mode = default_rrd_memory_mode;
  455. }
  456. rpt->config.health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->key, "health enabled by default", rpt->config.health_enabled);
  457. rpt->config.health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->machine_guid, "health enabled", rpt->config.health_enabled);
  458. rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->key, "default postpone alarms on connect seconds", rpt->config.alarms_delay);
  459. rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->machine_guid, "postpone alarms on connect seconds", rpt->config.alarms_delay);
  460. rpt->config.alarms_history = appconfig_get_number(&stream_config, rpt->key, "default health log history", rpt->config.alarms_history);
  461. rpt->config.alarms_history = appconfig_get_number(&stream_config, rpt->machine_guid, "health log history", rpt->config.alarms_history);
  462. rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->key, "default proxy enabled", rpt->config.rrdpush_enabled);
  463. rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->machine_guid, "proxy enabled", rpt->config.rrdpush_enabled);
  464. rpt->config.rrdpush_destination = appconfig_get(&stream_config, rpt->key, "default proxy destination", rpt->config.rrdpush_destination);
  465. rpt->config.rrdpush_destination = appconfig_get(&stream_config, rpt->machine_guid, "proxy destination", rpt->config.rrdpush_destination);
  466. rpt->config.rrdpush_api_key = appconfig_get(&stream_config, rpt->key, "default proxy api key", rpt->config.rrdpush_api_key);
  467. rpt->config.rrdpush_api_key = appconfig_get(&stream_config, rpt->machine_guid, "proxy api key", rpt->config.rrdpush_api_key);
  468. rpt->config.rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->key, "default proxy send charts matching", rpt->config.rrdpush_send_charts_matching);
  469. rpt->config.rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->machine_guid, "proxy send charts matching", rpt->config.rrdpush_send_charts_matching);
  470. rpt->config.rrdpush_enable_replication = appconfig_get_boolean(&stream_config, rpt->key, "enable replication", rpt->config.rrdpush_enable_replication);
  471. rpt->config.rrdpush_enable_replication = appconfig_get_boolean(&stream_config, rpt->machine_guid, "enable replication", rpt->config.rrdpush_enable_replication);
  472. rpt->config.rrdpush_seconds_to_replicate = appconfig_get_number(&stream_config, rpt->key, "seconds to replicate", rpt->config.rrdpush_seconds_to_replicate);
  473. rpt->config.rrdpush_seconds_to_replicate = appconfig_get_number(&stream_config, rpt->machine_guid, "seconds to replicate", rpt->config.rrdpush_seconds_to_replicate);
  474. rpt->config.rrdpush_replication_step = appconfig_get_number(&stream_config, rpt->key, "seconds per replication step", rpt->config.rrdpush_replication_step);
  475. rpt->config.rrdpush_replication_step = appconfig_get_number(&stream_config, rpt->machine_guid, "seconds per replication step", rpt->config.rrdpush_replication_step);
  476. rpt->config.rrdpush_compression = default_rrdpush_compression_enabled;
  477. rpt->config.rrdpush_compression = appconfig_get_boolean(&stream_config, rpt->key, "enable compression", rpt->config.rrdpush_compression);
  478. rpt->config.rrdpush_compression = appconfig_get_boolean(&stream_config, rpt->machine_guid, "enable compression", rpt->config.rrdpush_compression);
  479. bool is_ephemeral = false;
  480. is_ephemeral = appconfig_get_boolean(&stream_config, rpt->key, "is ephemeral node", CONFIG_BOOLEAN_NO);
  481. is_ephemeral = appconfig_get_boolean(&stream_config, rpt->machine_guid, "is ephemeral node", is_ephemeral);
  482. if(rpt->config.rrdpush_compression) {
  483. char *order = appconfig_get(&stream_config, rpt->key, "compression algorithms order", RRDPUSH_COMPRESSION_ALGORITHMS_ORDER);
  484. order = appconfig_get(&stream_config, rpt->machine_guid, "compression algorithms order", order);
  485. rrdpush_parse_compression_order(rpt, order);
  486. }
  487. (void)appconfig_set_default(&stream_config, rpt->machine_guid, "host tags", (rpt->tags)?rpt->tags:"");
  488. // find the host for this receiver
  489. {
  490. // this will also update the host with our system_info
  491. RRDHOST *host = rrdhost_find_or_create(
  492. rpt->hostname,
  493. rpt->registry_hostname,
  494. rpt->machine_guid,
  495. rpt->os,
  496. rpt->timezone,
  497. rpt->abbrev_timezone,
  498. rpt->utc_offset,
  499. rpt->tags,
  500. rpt->program_name,
  501. rpt->program_version,
  502. rpt->config.update_every,
  503. rpt->config.history,
  504. rpt->config.mode,
  505. (unsigned int)(rpt->config.health_enabled != CONFIG_BOOLEAN_NO),
  506. (unsigned int)(rpt->config.rrdpush_enabled && rpt->config.rrdpush_destination &&
  507. *rpt->config.rrdpush_destination && rpt->config.rrdpush_api_key &&
  508. *rpt->config.rrdpush_api_key),
  509. rpt->config.rrdpush_destination,
  510. rpt->config.rrdpush_api_key,
  511. rpt->config.rrdpush_send_charts_matching,
  512. rpt->config.rrdpush_enable_replication,
  513. rpt->config.rrdpush_seconds_to_replicate,
  514. rpt->config.rrdpush_replication_step,
  515. rpt->system_info,
  516. 0);
  517. if(!host) {
  518. rrdpush_receive_log_status(
  519. rpt,"failed to find/create host structure, rejecting connection",
  520. RRDPUSH_STATUS_INTERNAL_SERVER_ERROR, NDLP_ERR);
  521. rrdpush_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INTERNAL_ERROR);
  522. goto cleanup;
  523. }
  524. if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD))) {
  525. rrdpush_receive_log_status(
  526. rpt, "host is initializing, retry later",
  527. RRDPUSH_STATUS_INITIALIZATION_IN_PROGRESS, NDLP_NOTICE);
  528. rrdpush_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INITIALIZATION);
  529. goto cleanup;
  530. }
  531. // system_info has been consumed by the host structure
  532. rpt->system_info = NULL;
  533. if(!rrdhost_set_receiver(host, rpt)) {
  534. rrdpush_receive_log_status(
  535. rpt, "host is already served by another receiver",
  536. RRDPUSH_STATUS_DUPLICATE_RECEIVER, NDLP_INFO);
  537. rrdpush_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_ALREADY_STREAMING);
  538. goto cleanup;
  539. }
  540. }
  541. #ifdef NETDATA_INTERNAL_CHECKS
  542. netdata_log_info("STREAM '%s' [receive from [%s]:%s]: "
  543. "client willing to stream metrics for host '%s' with machine_guid '%s': "
  544. "update every = %d, history = %d, memory mode = %s, health %s,%s tags '%s'"
  545. , rpt->hostname
  546. , rpt->client_ip
  547. , rpt->client_port
  548. , rrdhost_hostname(rpt->host)
  549. , rpt->host->machine_guid
  550. , rpt->host->rrd_update_every
  551. , rpt->host->rrd_history_entries
  552. , rrd_memory_mode_name(rpt->host->rrd_memory_mode)
  553. , (rpt->config.health_enabled == CONFIG_BOOLEAN_NO)?"disabled":((rpt->config.health_enabled == CONFIG_BOOLEAN_YES)?"enabled":"auto")
  554. #ifdef ENABLE_HTTPS
  555. , (rpt->ssl.conn != NULL) ? " SSL," : ""
  556. #else
  557. , ""
  558. #endif
  559. , rrdhost_tags(rpt->host)
  560. );
  561. #endif // NETDATA_INTERNAL_CHECKS
  562. struct plugind cd = {
  563. .update_every = default_rrd_update_every,
  564. .unsafe = {
  565. .spinlock = NETDATA_SPINLOCK_INITIALIZER,
  566. .running = true,
  567. .enabled = true,
  568. },
  569. .started_t = now_realtime_sec(),
  570. };
  571. // put the client IP and port into the buffers used by plugins.d
  572. snprintfz(cd.id, CONFIG_MAX_NAME, "%s:%s", rpt->client_ip, rpt->client_port);
  573. snprintfz(cd.filename, FILENAME_MAX, "%s:%s", rpt->client_ip, rpt->client_port);
  574. snprintfz(cd.fullfilename, FILENAME_MAX, "%s:%s", rpt->client_ip, rpt->client_port);
  575. snprintfz(cd.cmd, PLUGINSD_CMD_MAX, "%s:%s", rpt->client_ip, rpt->client_port);
  576. rrdpush_select_receiver_compression_algorithm(rpt);
  577. {
  578. // netdata_log_info("STREAM %s [receive from [%s]:%s]: initializing communication...", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port);
  579. char initial_response[HTTP_HEADER_SIZE];
  580. if (stream_has_capability(rpt, STREAM_CAP_VCAPS)) {
  581. log_receiver_capabilities(rpt);
  582. sprintf(initial_response, "%s%u", START_STREAMING_PROMPT_VN, rpt->capabilities);
  583. }
  584. else if (stream_has_capability(rpt, STREAM_CAP_VN)) {
  585. log_receiver_capabilities(rpt);
  586. sprintf(initial_response, "%s%d", START_STREAMING_PROMPT_VN, stream_capabilities_to_vn(rpt->capabilities));
  587. }
  588. else if (stream_has_capability(rpt, STREAM_CAP_V2)) {
  589. log_receiver_capabilities(rpt);
  590. sprintf(initial_response, "%s", START_STREAMING_PROMPT_V2);
  591. }
  592. else { // stream_has_capability(rpt, STREAM_CAP_V1)
  593. log_receiver_capabilities(rpt);
  594. sprintf(initial_response, "%s", START_STREAMING_PROMPT_V1);
  595. }
  596. netdata_log_debug(D_STREAM, "Initial response to %s: %s", rpt->client_ip, initial_response);
  597. #ifdef ENABLE_H2O
  598. if (is_h2o_rrdpush(rpt)) {
  599. h2o_stream_write(rpt->h2o_ctx, initial_response, strlen(initial_response));
  600. } else {
  601. #endif
  602. ssize_t bytes_sent = send_timeout(
  603. #ifdef ENABLE_HTTPS
  604. &rpt->ssl,
  605. #endif
  606. rpt->fd, initial_response, strlen(initial_response), 0, 60);
  607. if(bytes_sent != (ssize_t)strlen(initial_response)) {
  608. internal_error(true, "Cannot send response, got %zd bytes, expecting %zu bytes", bytes_sent, strlen(initial_response));
  609. rrdpush_receive_log_status(
  610. rpt, "cannot reply back, dropping connection",
  611. RRDPUSH_STATUS_CANT_REPLY, NDLP_ERR);
  612. goto cleanup;
  613. }
  614. #ifdef ENABLE_H2O
  615. }
  616. #endif
  617. }
  618. #ifdef ENABLE_H2O
  619. unless_h2o_rrdpush(rpt)
  620. #endif
  621. {
  622. // remove the non-blocking flag from the socket
  623. if(sock_delnonblock(rpt->fd) < 0)
  624. netdata_log_error("STREAM '%s' [receive from [%s]:%s]: "
  625. "cannot remove the non-blocking flag from socket %d"
  626. , rrdhost_hostname(rpt->host)
  627. , rpt->client_ip, rpt->client_port
  628. , rpt->fd);
  629. struct timeval timeout;
  630. timeout.tv_sec = 600;
  631. timeout.tv_usec = 0;
  632. if (unlikely(setsockopt(rpt->fd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof timeout) != 0))
  633. netdata_log_error("STREAM '%s' [receive from [%s]:%s]: "
  634. "cannot set timeout for socket %d"
  635. , rrdhost_hostname(rpt->host)
  636. , rpt->client_ip, rpt->client_port
  637. , rpt->fd);
  638. }
  639. rrdpush_receive_log_status(
  640. rpt, "connected and ready to receive data",
  641. RRDPUSH_STATUS_CONNECTED, NDLP_INFO);
  642. #ifdef ENABLE_ACLK
  643. // in case we have cloud connection we inform cloud
  644. // new child connected
  645. if (netdata_cloud_enabled)
  646. aclk_host_state_update(rpt->host, 1, 1);
  647. #endif
  648. rrdhost_set_is_parent_label();
  649. if (is_ephemeral)
  650. rrdhost_option_set(rpt->host, RRDHOST_OPTION_EPHEMERAL_HOST);
  651. // let it reconnect to parent immediately
  652. rrdpush_reset_destinations_postpone_time(rpt->host);
  653. size_t count = streaming_parser(rpt, &cd, rpt->fd,
  654. #ifdef ENABLE_HTTPS
  655. (rpt->ssl.conn) ? &rpt->ssl : NULL
  656. #else
  657. NULL
  658. #endif
  659. );
  660. receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_PARSER_EXIT, false);
  661. {
  662. char msg[100 + 1];
  663. snprintfz(msg, sizeof(msg) - 1, "disconnected (completed %zu updates)", count);
  664. rrdpush_receive_log_status(
  665. rpt, msg,
  666. RRDPUSH_STATUS_DISCONNECTED, NDLP_WARNING);
  667. }
  668. #ifdef ENABLE_ACLK
  669. // in case we have cloud connection we inform cloud
  670. // a child disconnected
  671. if (netdata_cloud_enabled)
  672. aclk_host_state_update(rpt->host, 0, 1);
  673. #endif
  674. cleanup:
  675. ;
  676. }
  677. static void rrdpush_receiver_thread_cleanup(void *ptr) {
  678. struct receiver_state *rpt = (struct receiver_state *) ptr;
  679. worker_unregister();
  680. rrdhost_clear_receiver(rpt);
  681. netdata_log_info("STREAM '%s' [receive from [%s]:%s]: "
  682. "receive thread ended (task id %d)"
  683. , rpt->hostname ? rpt->hostname : "-"
  684. , rpt->client_ip ? rpt->client_ip : "-", rpt->client_port ? rpt->client_port : "-"
  685. , gettid());
  686. receiver_state_free(rpt);
  687. rrdhost_set_is_parent_label();
  688. }
  689. static bool stream_receiver_log_capabilities(BUFFER *wb, void *ptr) {
  690. struct receiver_state *rpt = ptr;
  691. if(!rpt)
  692. return false;
  693. stream_capabilities_to_string(wb, rpt->capabilities);
  694. return true;
  695. }
  696. static bool stream_receiver_log_transport(BUFFER *wb, void *ptr) {
  697. struct receiver_state *rpt = ptr;
  698. if(!rpt)
  699. return false;
  700. #ifdef ENABLE_HTTPS
  701. buffer_strcat(wb, SSL_connection(&rpt->ssl) ? "https" : "http");
  702. #else
  703. buffer_strcat(wb, "http");
  704. #endif
  705. return true;
  706. }
  707. void *rrdpush_receiver_thread(void *ptr) {
  708. netdata_thread_cleanup_push(rrdpush_receiver_thread_cleanup, ptr);
  709. {
  710. worker_register("STREAMRCV");
  711. worker_register_job_custom_metric(WORKER_RECEIVER_JOB_BYTES_READ,
  712. "received bytes", "bytes/s",
  713. WORKER_METRIC_INCREMENT);
  714. worker_register_job_custom_metric(WORKER_RECEIVER_JOB_BYTES_UNCOMPRESSED,
  715. "uncompressed bytes", "bytes/s",
  716. WORKER_METRIC_INCREMENT);
  717. worker_register_job_custom_metric(WORKER_RECEIVER_JOB_REPLICATION_COMPLETION,
  718. "replication completion", "%",
  719. WORKER_METRIC_ABSOLUTE);
  720. struct receiver_state *rpt = (struct receiver_state *) ptr;
  721. rpt->tid = gettid();
  722. ND_LOG_STACK lgs[] = {
  723. ND_LOG_FIELD_TXT(NDF_SRC_IP, rpt->client_ip),
  724. ND_LOG_FIELD_TXT(NDF_SRC_PORT, rpt->client_port),
  725. ND_LOG_FIELD_TXT(NDF_NIDL_NODE, rpt->hostname),
  726. ND_LOG_FIELD_CB(NDF_SRC_TRANSPORT, stream_receiver_log_transport, rpt),
  727. ND_LOG_FIELD_CB(NDF_SRC_CAPABILITIES, stream_receiver_log_capabilities, rpt),
  728. ND_LOG_FIELD_END(),
  729. };
  730. ND_LOG_STACK_PUSH(lgs);
  731. netdata_log_info("STREAM %s [%s]:%s: receive thread started", rpt->hostname, rpt->client_ip
  732. , rpt->client_port);
  733. rrdpush_receive(rpt);
  734. }
  735. netdata_thread_cleanup_pop(1);
  736. return NULL;
  737. }