sender.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "rrdpush.h"
  3. #define WORKER_SENDER_JOB_CONNECT 0
  4. #define WORKER_SENDER_JOB_PIPE_READ 1
  5. #define WORKER_SENDER_JOB_SOCKET_RECEIVE 2
  6. #define WORKER_SENDER_JOB_EXECUTE 3
  7. #define WORKER_SENDER_JOB_SOCKET_SEND 4
  8. #define WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE 5
  9. #define WORKER_SENDER_JOB_DISCONNECT_OVERFLOW 6
  10. #define WORKER_SENDER_JOB_DISCONNECT_TIMEOUT 7
  11. #define WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR 8
  12. #define WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR 9
  13. #define WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR 10
  14. #define WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED 11
  15. #define WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR 12
  16. #define WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR 13
  17. #define WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION 14
  18. #if WORKER_UTILIZATION_MAX_JOB_TYPES < 15
  19. #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 15
  20. #endif
  // Globals defined in other translation units and used by the sender.

  // streaming configuration, queried with appconfig_get_number()
  extern struct config stream_config;

  // SSL settings for the streaming connection
  extern int netdata_use_ssl_on_stream;
  extern char *netdata_ssl_ca_file;
  extern char *netdata_ssl_ca_path;
  25. // Collector thread starting a transmission
  26. void sender_start(struct sender_state *s) {
  27. netdata_mutex_lock(&s->mutex);
  28. buffer_flush(s->build);
  29. }
  30. static inline void rrdpush_sender_thread_close_socket(RRDHOST *host);
  31. #ifdef ENABLE_COMPRESSION
  32. /*
  33. * In case of stream compression buffer oveflow
  34. * Inform the user through the error log file and
  35. * deactivate compression by downgrading the stream protocol.
  36. */
  37. static inline void deactivate_compression(struct sender_state *s) {
  38. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION);
  39. error("STREAM_COMPRESSION: Deactivating compression to avoid stream corruption");
  40. default_compression_enabled = 0;
  41. s->rrdpush_compression = 0;
  42. s->version = STREAM_VERSION_CLABELS;
  43. error("STREAM_COMPRESSION %s [send to %s]: Restarting connection without compression", s->host->hostname, s->connected_to);
  44. rrdpush_sender_thread_close_socket(s->host);
  45. }
  46. #endif
  47. // Collector thread finishing a transmission
  48. void sender_commit(struct sender_state *s) {
  49. char *src = (char *)buffer_tostring(s->host->sender->build);
  50. size_t src_len = s->host->sender->build->len;
  51. #ifdef ENABLE_COMPRESSION
  52. if (src && src_len) {
  53. if (s->compressor && s->rrdpush_compression) {
  54. src_len = s->compressor->compress(s->compressor, src, src_len, &src);
  55. if (!src_len) {
  56. deactivate_compression(s);
  57. buffer_flush(s->build);
  58. netdata_mutex_unlock(&s->mutex);
  59. return;
  60. }
  61. }
  62. if(cbuffer_add_unsafe(s->host->sender->buffer, src, src_len))
  63. s->overflow = 1;
  64. }
  65. #else
  66. if(cbuffer_add_unsafe(s->host->sender->buffer, src, src_len))
  67. s->overflow = 1;
  68. #endif
  69. buffer_flush(s->build);
  70. netdata_mutex_unlock(&s->mutex);
  71. }
  72. static inline void rrdpush_sender_thread_close_socket(RRDHOST *host) {
  73. host->rrdpush_sender_connected = 0;
  74. if(host->rrdpush_sender_socket != -1) {
  75. close(host->rrdpush_sender_socket);
  76. host->rrdpush_sender_socket = -1;
  77. }
  78. }
  79. static inline void rrdpush_sender_add_host_variable_to_buffer_nolock(RRDHOST *host, RRDVAR *rv) {
  80. calculated_number *value = (calculated_number *)rv->value;
  81. buffer_sprintf(
  82. host->sender->build
  83. , "VARIABLE HOST %s = " CALCULATED_NUMBER_FORMAT "\n"
  84. , rv->name
  85. , *value
  86. );
  87. debug(D_STREAM, "RRDVAR pushed HOST VARIABLE %s = " CALCULATED_NUMBER_FORMAT, rv->name, *value);
  88. }
  89. void rrdpush_sender_send_this_host_variable_now(RRDHOST *host, RRDVAR *rv) {
  90. if(host->rrdpush_send_enabled && host->rrdpush_sender_spawn && host->rrdpush_sender_connected) {
  91. sender_start(host->sender);
  92. rrdpush_sender_add_host_variable_to_buffer_nolock(host, rv);
  93. sender_commit(host->sender);
  94. }
  95. }
  96. static int rrdpush_sender_thread_custom_host_variables_callback(void *rrdvar_ptr, void *host_ptr) {
  97. RRDVAR *rv = (RRDVAR *)rrdvar_ptr;
  98. RRDHOST *host = (RRDHOST *)host_ptr;
  99. if(unlikely(rv->options & RRDVAR_OPTION_CUSTOM_HOST_VAR && rv->type == RRDVAR_TYPE_CALCULATED)) {
  100. rrdpush_sender_add_host_variable_to_buffer_nolock(host, rv);
  101. // return 1, so that the traversal will return the number of variables sent
  102. return 1;
  103. }
  104. // returning a negative number will break the traversal
  105. return 0;
  106. }
  107. static void rrdpush_sender_thread_send_custom_host_variables(RRDHOST *host) {
  108. sender_start(host->sender);
  109. int ret = rrdvar_callback_for_all_host_variables(host, rrdpush_sender_thread_custom_host_variables_callback, host);
  110. (void)ret;
  111. sender_commit(host->sender);
  112. debug(D_STREAM, "RRDVAR sent %d VARIABLES", ret);
  113. }
  114. // resets all the chart, so that their definitions
  115. // will be resent to the central netdata
  116. static void rrdpush_sender_thread_reset_all_charts(RRDHOST *host) {
  117. rrdhost_rdlock(host);
  118. RRDSET *st;
  119. rrdset_foreach_read(st, host) {
  120. rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED);
  121. st->upstream_resync_time = 0;
  122. rrdset_rdlock(st);
  123. RRDDIM *rd;
  124. rrddim_foreach_read(rd, st)
  125. rd->exposed = 0;
  126. rrdset_unlock(st);
  127. }
  128. rrdhost_unlock(host);
  129. }
  130. static inline void rrdpush_sender_thread_data_flush(RRDHOST *host) {
  131. netdata_mutex_lock(&host->sender->mutex);
  132. size_t len = cbuffer_next_unsafe(host->sender->buffer, NULL);
  133. if (len)
  134. error("STREAM %s [send]: discarding %zu bytes of metrics already in the buffer.", host->hostname, len);
  135. cbuffer_remove_unsafe(host->sender->buffer, len);
  136. netdata_mutex_unlock(&host->sender->mutex);
  137. rrdpush_sender_thread_reset_all_charts(host);
  138. rrdpush_sender_thread_send_custom_host_variables(host);
  139. }
  140. static inline void rrdpush_set_flags_to_newest_stream(RRDHOST *host) {
  141. host->labels.labels_flag |= LABEL_FLAG_UPDATE_STREAM;
  142. host->labels.labels_flag &= ~LABEL_FLAG_STOP_STREAM;
  143. }
  144. void rrdpush_encode_variable(stream_encoded_t *se, RRDHOST *host)
  145. {
  146. se->os_name = (host->system_info->host_os_name)?url_encode(host->system_info->host_os_name):"";
  147. se->os_id = (host->system_info->host_os_id)?url_encode(host->system_info->host_os_id):"";
  148. se->os_version = (host->system_info->host_os_version)?url_encode(host->system_info->host_os_version):"";
  149. se->kernel_name = (host->system_info->kernel_name)?url_encode(host->system_info->kernel_name):"";
  150. se->kernel_version = (host->system_info->kernel_version)?url_encode(host->system_info->kernel_version):"";
  151. }
  152. void rrdpush_clean_encoded(stream_encoded_t *se)
  153. {
  154. if (se->os_name)
  155. freez(se->os_name);
  156. if (se->os_id)
  157. freez(se->os_id);
  158. if (se->os_version)
  159. freez(se->os_version);
  160. if (se->kernel_name)
  161. freez(se->kernel_name);
  162. if (se->kernel_version)
  163. freez(se->kernel_version);
  164. }
  165. static inline long int parse_stream_version_for_errors(char *http)
  166. {
  167. if (!memcmp(http, START_STREAMING_ERROR_SAME_LOCALHOST, sizeof(START_STREAMING_ERROR_SAME_LOCALHOST)))
  168. return -2;
  169. else if (!memcmp(http, START_STREAMING_ERROR_ALREADY_STREAMING, sizeof(START_STREAMING_ERROR_ALREADY_STREAMING)))
  170. return -3;
  171. else if (!memcmp(http, START_STREAMING_ERROR_NOT_PERMITTED, sizeof(START_STREAMING_ERROR_NOT_PERMITTED)))
  172. return -4;
  173. else
  174. return -1;
  175. }
  176. static inline long int parse_stream_version(RRDHOST *host, char *http)
  177. {
  178. long int stream_version = -1;
  179. int answer = -1;
  180. char *stream_version_start = strchr(http, '=');
  181. if (stream_version_start) {
  182. stream_version_start++;
  183. stream_version = strtol(stream_version_start, NULL, 10);
  184. answer = memcmp(http, START_STREAMING_PROMPT_VN, (size_t)(stream_version_start - http));
  185. if (!answer) {
  186. rrdpush_set_flags_to_newest_stream(host);
  187. }
  188. } else {
  189. answer = memcmp(http, START_STREAMING_PROMPT_V2, strlen(START_STREAMING_PROMPT_V2));
  190. if (!answer) {
  191. stream_version = 1;
  192. rrdpush_set_flags_to_newest_stream(host);
  193. } else {
  194. answer = memcmp(http, START_STREAMING_PROMPT, strlen(START_STREAMING_PROMPT));
  195. if (!answer) {
  196. stream_version = 0;
  197. host->labels.labels_flag |= LABEL_FLAG_STOP_STREAM;
  198. host->labels.labels_flag &= ~LABEL_FLAG_UPDATE_STREAM;
  199. }
  200. else {
  201. stream_version = parse_stream_version_for_errors(http);
  202. }
  203. }
  204. }
  205. return stream_version;
  206. }
  207. static int rrdpush_sender_thread_connect_to_parent(RRDHOST *host, int default_port, int timeout,
  208. struct sender_state *s) {
  209. struct timeval tv = {
  210. .tv_sec = timeout,
  211. .tv_usec = 0
  212. };
  213. // make sure the socket is closed
  214. rrdpush_sender_thread_close_socket(host);
  215. debug(D_STREAM, "STREAM: Attempting to connect...");
  216. info("STREAM %s [send to %s]: connecting...", host->hostname, host->rrdpush_send_destination);
  217. host->rrdpush_sender_socket = connect_to_one_of_destinations(
  218. host->destinations
  219. , default_port
  220. , &tv
  221. , &s->reconnects_counter
  222. , s->connected_to
  223. , sizeof(s->connected_to)-1
  224. , &host->destination
  225. );
  226. if(unlikely(host->rrdpush_sender_socket == -1)) {
  227. error("STREAM %s [send to %s]: failed to connect", host->hostname, host->rrdpush_send_destination);
  228. return 0;
  229. }
  230. info("STREAM %s [send to %s]: initializing communication...", host->hostname, s->connected_to);
  231. #ifdef ENABLE_HTTPS
  232. if( netdata_client_ctx ){
  233. host->ssl.flags = NETDATA_SSL_START;
  234. if (!host->ssl.conn){
  235. host->ssl.conn = SSL_new(netdata_client_ctx);
  236. if(!host->ssl.conn){
  237. error("Failed to allocate SSL structure.");
  238. host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE;
  239. }
  240. }
  241. else{
  242. SSL_clear(host->ssl.conn);
  243. }
  244. if (host->ssl.conn)
  245. {
  246. if (SSL_set_fd(host->ssl.conn, host->rrdpush_sender_socket) != 1) {
  247. error("Failed to set the socket to the SSL on socket fd %d.", host->rrdpush_sender_socket);
  248. host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE;
  249. } else{
  250. host->ssl.flags = NETDATA_SSL_HANDSHAKE_COMPLETE;
  251. }
  252. }
  253. }
  254. else {
  255. host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE;
  256. }
  257. #endif
  258. #ifdef ENABLE_COMPRESSION
  259. // Negotiate stream VERSION_CLABELS if stream compression is not supported
  260. s->rrdpush_compression = (default_compression_enabled && (s->version >= STREAM_VERSION_COMPRESSION));
  261. if(!s->rrdpush_compression)
  262. s->version = STREAM_VERSION_CLABELS;
  263. #endif //ENABLE_COMPRESSION
  264. /* TODO: During the implementation of #7265 switch the set of variables to HOST_* and CONTAINER_* if the
  265. version negotiation resulted in a high enough version.
  266. */
  267. stream_encoded_t se;
  268. rrdpush_encode_variable(&se, host);
  269. char http[HTTP_HEADER_SIZE + 1];
  270. int eol = snprintfz(http, HTTP_HEADER_SIZE,
  271. "STREAM "
  272. "key=%s"
  273. "&hostname=%s"
  274. "&registry_hostname=%s"
  275. "&machine_guid=%s"
  276. "&update_every=%d"
  277. "&os=%s"
  278. "&timezone=%s"
  279. "&abbrev_timezone=%s"
  280. "&utc_offset=%d"
  281. "&hops=%d"
  282. "&ml_capable=%d"
  283. "&ml_enabled=%d"
  284. "&mc_version=%d"
  285. "&tags=%s"
  286. "&ver=%d"
  287. "&NETDATA_INSTANCE_CLOUD_TYPE=%s"
  288. "&NETDATA_INSTANCE_CLOUD_INSTANCE_TYPE=%s"
  289. "&NETDATA_INSTANCE_CLOUD_INSTANCE_REGION=%s"
  290. "&NETDATA_SYSTEM_OS_NAME=%s"
  291. "&NETDATA_SYSTEM_OS_ID=%s"
  292. "&NETDATA_SYSTEM_OS_ID_LIKE=%s"
  293. "&NETDATA_SYSTEM_OS_VERSION=%s"
  294. "&NETDATA_SYSTEM_OS_VERSION_ID=%s"
  295. "&NETDATA_SYSTEM_OS_DETECTION=%s"
  296. "&NETDATA_HOST_IS_K8S_NODE=%s"
  297. "&NETDATA_SYSTEM_KERNEL_NAME=%s"
  298. "&NETDATA_SYSTEM_KERNEL_VERSION=%s"
  299. "&NETDATA_SYSTEM_ARCHITECTURE=%s"
  300. "&NETDATA_SYSTEM_VIRTUALIZATION=%s"
  301. "&NETDATA_SYSTEM_VIRT_DETECTION=%s"
  302. "&NETDATA_SYSTEM_CONTAINER=%s"
  303. "&NETDATA_SYSTEM_CONTAINER_DETECTION=%s"
  304. "&NETDATA_CONTAINER_OS_NAME=%s"
  305. "&NETDATA_CONTAINER_OS_ID=%s"
  306. "&NETDATA_CONTAINER_OS_ID_LIKE=%s"
  307. "&NETDATA_CONTAINER_OS_VERSION=%s"
  308. "&NETDATA_CONTAINER_OS_VERSION_ID=%s"
  309. "&NETDATA_CONTAINER_OS_DETECTION=%s"
  310. "&NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT=%s"
  311. "&NETDATA_SYSTEM_CPU_FREQ=%s"
  312. "&NETDATA_SYSTEM_TOTAL_RAM=%s"
  313. "&NETDATA_SYSTEM_TOTAL_DISK_SIZE=%s"
  314. "&NETDATA_PROTOCOL_VERSION=%s"
  315. " HTTP/1.1\r\n"
  316. "User-Agent: %s/%s\r\n"
  317. "Accept: */*\r\n\r\n"
  318. , host->rrdpush_send_api_key
  319. , host->hostname
  320. , host->registry_hostname
  321. , host->machine_guid
  322. , default_rrd_update_every
  323. , host->os
  324. , host->timezone
  325. , host->abbrev_timezone
  326. , host->utc_offset
  327. , host->system_info->hops + 1
  328. , host->system_info->ml_capable
  329. , host->system_info->ml_enabled
  330. , host->system_info->mc_version
  331. , (host->tags) ? host->tags : ""
  332. , s->version
  333. , (host->system_info->cloud_provider_type) ? host->system_info->cloud_provider_type : ""
  334. , (host->system_info->cloud_instance_type) ? host->system_info->cloud_instance_type : ""
  335. , (host->system_info->cloud_instance_region) ? host->system_info->cloud_instance_region : ""
  336. , se.os_name
  337. , se.os_id
  338. , (host->system_info->host_os_id_like) ? host->system_info->host_os_id_like : ""
  339. , se.os_version
  340. , (host->system_info->host_os_version_id) ? host->system_info->host_os_version_id : ""
  341. , (host->system_info->host_os_detection) ? host->system_info->host_os_detection : ""
  342. , (host->system_info->is_k8s_node) ? host->system_info->is_k8s_node : ""
  343. , se.kernel_name
  344. , se.kernel_version
  345. , (host->system_info->architecture) ? host->system_info->architecture : ""
  346. , (host->system_info->virtualization) ? host->system_info->virtualization : ""
  347. , (host->system_info->virt_detection) ? host->system_info->virt_detection : ""
  348. , (host->system_info->container) ? host->system_info->container : ""
  349. , (host->system_info->container_detection) ? host->system_info->container_detection : ""
  350. , (host->system_info->container_os_name) ? host->system_info->container_os_name : ""
  351. , (host->system_info->container_os_id) ? host->system_info->container_os_id : ""
  352. , (host->system_info->container_os_id_like) ? host->system_info->container_os_id_like : ""
  353. , (host->system_info->container_os_version) ? host->system_info->container_os_version : ""
  354. , (host->system_info->container_os_version_id) ? host->system_info->container_os_version_id : ""
  355. , (host->system_info->container_os_detection) ? host->system_info->container_os_detection : ""
  356. , (host->system_info->host_cores) ? host->system_info->host_cores : ""
  357. , (host->system_info->host_cpu_freq) ? host->system_info->host_cpu_freq : ""
  358. , (host->system_info->host_ram_total) ? host->system_info->host_ram_total : ""
  359. , (host->system_info->host_disk_space) ? host->system_info->host_disk_space : ""
  360. , STREAMING_PROTOCOL_VERSION
  361. , host->program_name
  362. , host->program_version
  363. );
  364. http[eol] = 0x00;
  365. rrdpush_clean_encoded(&se);
  366. #ifdef ENABLE_HTTPS
  367. if (!host->ssl.flags) {
  368. ERR_clear_error();
  369. SSL_set_connect_state(host->ssl.conn);
  370. int err = SSL_connect(host->ssl.conn);
  371. if (err != 1){
  372. err = SSL_get_error(host->ssl.conn, err);
  373. error("SSL cannot connect with the server: %s ",ERR_error_string((long)SSL_get_error(host->ssl.conn,err),NULL));
  374. if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) {
  375. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR);
  376. rrdpush_sender_thread_close_socket(host);
  377. if (host->destination->next)
  378. host->destination->disabled_no_proper_reply = 1;
  379. return 0;
  380. }else {
  381. host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE;
  382. }
  383. }
  384. else {
  385. if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) {
  386. if (netdata_validate_server == NETDATA_SSL_VALID_CERTIFICATE) {
  387. if ( security_test_certificate(host->ssl.conn)) {
  388. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR);
  389. error("Closing the stream connection, because the server SSL certificate is not valid.");
  390. rrdpush_sender_thread_close_socket(host);
  391. if (host->destination->next)
  392. host->destination->disabled_no_proper_reply = 1;
  393. return 0;
  394. }
  395. }
  396. }
  397. }
  398. }
  399. if(send_timeout(&host->ssl,host->rrdpush_sender_socket, http, strlen(http), 0, timeout) == -1) {
  400. #else
  401. if(send_timeout(host->rrdpush_sender_socket, http, strlen(http), 0, timeout) == -1) {
  402. #endif
  403. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT);
  404. error("STREAM %s [send to %s]: failed to send HTTP header to remote netdata.", host->hostname, s->connected_to);
  405. rrdpush_sender_thread_close_socket(host);
  406. return 0;
  407. }
  408. info("STREAM %s [send to %s]: waiting response from remote netdata...", host->hostname, s->connected_to);
  409. ssize_t received;
  410. #ifdef ENABLE_HTTPS
  411. received = recv_timeout(&host->ssl,host->rrdpush_sender_socket, http, HTTP_HEADER_SIZE, 0, timeout);
  412. if(received == -1) {
  413. #else
  414. received = recv_timeout(host->rrdpush_sender_socket, http, HTTP_HEADER_SIZE, 0, timeout);
  415. if(received == -1) {
  416. #endif
  417. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT);
  418. error("STREAM %s [send to %s]: remote netdata does not respond.", host->hostname, s->connected_to);
  419. rrdpush_sender_thread_close_socket(host);
  420. return 0;
  421. }
  422. http[received] = '\0';
  423. debug(D_STREAM, "Response to sender from far end: %s", http);
  424. int32_t version = (int32_t)parse_stream_version(host, http);
  425. if(version == -1) {
  426. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE);
  427. error("STREAM %s [send to %s]: server is not replying properly (is it a netdata?).", host->hostname, s->connected_to);
  428. rrdpush_sender_thread_close_socket(host);
  429. //catch other reject reasons and force to check other destinations
  430. if (host->destination->next)
  431. host->destination->disabled_no_proper_reply = 1;
  432. return 0;
  433. }
  434. else if(version == -2) {
  435. error("STREAM %s [send to %s]: remote server is the localhost for [%s].", host->hostname, s->connected_to, host->hostname);
  436. rrdpush_sender_thread_close_socket(host);
  437. host->destination->disabled_because_of_localhost = 1;
  438. return 0;
  439. }
  440. else if(version == -3) {
  441. error("STREAM %s [send to %s]: remote server already receives metrics for [%s].", host->hostname, s->connected_to, host->hostname);
  442. rrdpush_sender_thread_close_socket(host);
  443. host->destination->disabled_already_streaming = now_realtime_sec();
  444. return 0;
  445. }
  446. else if(version == -4) {
  447. error("STREAM %s [send to %s]: remote server denied access for [%s].", host->hostname, s->connected_to, host->hostname);
  448. rrdpush_sender_thread_close_socket(host);
  449. if (host->destination->next)
  450. host->destination->disabled_because_of_denied_access = 1;
  451. return 0;
  452. }
  453. s->version = version;
  454. #ifdef ENABLE_COMPRESSION
  455. s->rrdpush_compression = (s->rrdpush_compression && (s->version >= STREAM_VERSION_COMPRESSION));
  456. if(s->rrdpush_compression)
  457. {
  458. // parent supports compression
  459. if(s->compressor)
  460. s->compressor->reset(s->compressor);
  461. }
  462. else {
  463. //parent does not support compression or has compression disabled
  464. debug(D_STREAM, "Stream is uncompressed! One of the agents (%s <-> %s) does not support compression OR compression is disabled.", s->connected_to, s->host->hostname);
  465. infoerr("Stream is uncompressed! One of the agents (%s <-> %s) does not support compression OR compression is disabled.", s->connected_to, s->host->hostname);
  466. s->version = STREAM_VERSION_CLABELS;
  467. }
  468. #endif //ENABLE_COMPRESSION
  469. info("STREAM %s [send to %s]: established communication with a parent using protocol version %d - ready to send metrics..."
  470. , host->hostname
  471. , s->connected_to
  472. , s->version);
  473. if(sock_setnonblock(host->rrdpush_sender_socket) < 0)
  474. error("STREAM %s [send to %s]: cannot set non-blocking mode for socket.", host->hostname, s->connected_to);
  475. if(sock_enlarge_out(host->rrdpush_sender_socket) < 0)
  476. error("STREAM %s [send to %s]: cannot enlarge the socket buffer.", host->hostname, s->connected_to);
  477. debug(D_STREAM, "STREAM: Connected on fd %d...", host->rrdpush_sender_socket);
  478. return 1;
  479. }
  480. static void attempt_to_connect(struct sender_state *state)
  481. {
  482. state->send_attempts = 0;
  483. if(rrdpush_sender_thread_connect_to_parent(state->host, state->default_port, state->timeout, state)) {
  484. state->last_sent_t = now_monotonic_sec();
  485. // reset the buffer, to properly send charts and metrics
  486. rrdpush_sender_thread_data_flush(state->host);
  487. // send from the beginning
  488. state->begin = 0;
  489. // make sure the next reconnection will be immediate
  490. state->not_connected_loops = 0;
  491. // reset the bytes we have sent for this session
  492. state->sent_bytes_on_this_connection = 0;
  493. // let the data collection threads know we are ready
  494. state->host->rrdpush_sender_connected = 1;
  495. }
  496. else {
  497. // increase the failed connections counter
  498. state->not_connected_loops++;
  499. // reset the number of bytes sent
  500. state->sent_bytes_on_this_connection = 0;
  501. // slow re-connection on repeating errors
  502. sleep_usec(USEC_PER_SEC * state->reconnect_delay); // seconds
  503. }
  504. }
  505. // TCP window is open and we have data to transmit.
  506. void attempt_to_send(struct sender_state *s) {
  507. rrdpush_send_labels(s->host);
  508. #ifdef NETDATA_INTERNAL_CHECKS
  509. struct circular_buffer *cb = s->buffer;
  510. #endif
  511. netdata_thread_disable_cancelability();
  512. netdata_mutex_lock(&s->mutex);
  513. char *chunk;
  514. size_t outstanding = cbuffer_next_unsafe(s->buffer, &chunk);
  515. debug(D_STREAM, "STREAM: Sending data. Buffer r=%zu w=%zu s=%zu, next chunk=%zu", cb->read, cb->write, cb->size, outstanding);
  516. ssize_t ret;
  517. #ifdef ENABLE_HTTPS
  518. SSL *conn = s->host->ssl.conn ;
  519. if(conn && !s->host->ssl.flags) {
  520. ret = SSL_write(conn, chunk, outstanding);
  521. } else {
  522. ret = send(s->host->rrdpush_sender_socket, chunk, outstanding, MSG_DONTWAIT);
  523. }
  524. #else
  525. ret = send(s->host->rrdpush_sender_socket, chunk, outstanding, MSG_DONTWAIT);
  526. #endif
  527. if (likely(ret > 0)) {
  528. cbuffer_remove_unsafe(s->buffer, ret);
  529. s->sent_bytes_on_this_connection += ret;
  530. s->sent_bytes += ret;
  531. debug(D_STREAM, "STREAM %s [send to %s]: Sent %zd bytes", s->host->hostname, s->connected_to, ret);
  532. s->last_sent_t = now_monotonic_sec();
  533. }
  534. else if (ret == -1 && (errno == EAGAIN || errno == EINTR || errno == EWOULDBLOCK))
  535. debug(D_STREAM, "STREAM %s [send to %s]: unavailable after polling POLLOUT", s->host->hostname, s->connected_to);
  536. else if (ret == -1) {
  537. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR);
  538. debug(D_STREAM, "STREAM: Send failed - closing socket...");
  539. error("STREAM %s [send to %s]: failed to send metrics - closing connection - we have sent %zu bytes on this connection.", s->host->hostname, s->connected_to, s->sent_bytes_on_this_connection);
  540. rrdpush_sender_thread_close_socket(s->host);
  541. }
  542. else {
  543. debug(D_STREAM, "STREAM: send() returned 0 -> no error but no transmission");
  544. }
  545. netdata_mutex_unlock(&s->mutex);
  546. netdata_thread_enable_cancelability();
  547. }
  548. void attempt_read(struct sender_state *s) {
  549. int ret;
  550. #ifdef ENABLE_HTTPS
  551. if (s->host->ssl.conn && !s->host->stream_ssl.flags) {
  552. ERR_clear_error();
  553. int desired = sizeof(s->read_buffer) - s->read_len - 1;
  554. ret = SSL_read(s->host->ssl.conn, s->read_buffer, desired);
  555. if (ret > 0 ) {
  556. s->read_len += ret;
  557. return;
  558. }
  559. int sslerrno = SSL_get_error(s->host->ssl.conn, desired);
  560. if (sslerrno == SSL_ERROR_WANT_READ || sslerrno == SSL_ERROR_WANT_WRITE)
  561. return;
  562. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR);
  563. u_long err;
  564. char buf[256];
  565. while ((err = ERR_get_error()) != 0) {
  566. ERR_error_string_n(err, buf, sizeof(buf));
  567. error("STREAM %s [send to %s] ssl error: %s", s->host->hostname, s->connected_to, buf);
  568. }
  569. error("Restarting connection");
  570. rrdpush_sender_thread_close_socket(s->host);
  571. return;
  572. }
  573. #endif
  574. ret = recv(s->host->rrdpush_sender_socket, s->read_buffer + s->read_len, sizeof(s->read_buffer) - s->read_len - 1,MSG_DONTWAIT);
  575. if (ret>0) {
  576. s->read_len += ret;
  577. return;
  578. }
  579. debug(D_STREAM, "Socket was POLLIN, but req %zu bytes gave %d", sizeof(s->read_buffer) - s->read_len - 1, ret);
  580. if (ret<0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR))
  581. return;
  582. if (ret==0) {
  583. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED);
  584. error("STREAM %s [send to %s]: connection closed by far end. Restarting connection", s->host->hostname, s->connected_to);
  585. }
  586. else {
  587. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR);
  588. error("STREAM %s [send to %s]: error during receive (%d). Restarting connection", s->host->hostname, s->connected_to, ret);
  589. }
  590. rrdpush_sender_thread_close_socket(s->host);
  591. }
  592. // This is just a placeholder until the gap filling state machine is inserted
  593. void execute_commands(struct sender_state *s) {
  594. char *start = s->read_buffer, *end = &s->read_buffer[s->read_len], *newline;
  595. *end = 0;
  596. while( start<end && (newline=strchr(start, '\n')) ) {
  597. *newline = 0;
  598. info("STREAM %s [send to %s] received command over connection: %s", s->host->hostname, s->connected_to, start);
  599. start = newline+1;
  600. }
  601. if (start<end) {
  602. memmove(s->read_buffer, start, end-start);
  603. s->read_len = end-start;
  604. }
  605. }
  606. static void rrdpush_sender_thread_cleanup_callback(void *ptr) {
  607. worker_unregister();
  608. RRDHOST *host = (RRDHOST *)ptr;
  609. netdata_mutex_lock(&host->sender->mutex);
  610. info("STREAM %s [send]: sending thread cleans up...", host->hostname);
  611. rrdpush_sender_thread_close_socket(host);
  612. // close the pipe
  613. if(host->rrdpush_sender_pipe[PIPE_READ] != -1) {
  614. close(host->rrdpush_sender_pipe[PIPE_READ]);
  615. host->rrdpush_sender_pipe[PIPE_READ] = -1;
  616. }
  617. if(host->rrdpush_sender_pipe[PIPE_WRITE] != -1) {
  618. close(host->rrdpush_sender_pipe[PIPE_WRITE]);
  619. host->rrdpush_sender_pipe[PIPE_WRITE] = -1;
  620. }
  621. if(!host->rrdpush_sender_join) {
  622. info("STREAM %s [send]: sending thread detaches itself.", host->hostname);
  623. netdata_thread_detach(netdata_thread_self());
  624. }
  625. host->rrdpush_sender_spawn = 0;
  626. info("STREAM %s [send]: sending thread now exits.", host->hostname);
  627. netdata_mutex_unlock(&host->sender->mutex);
  628. }
  629. void sender_init(struct sender_state *s, RRDHOST *parent) {
  630. memset(s, 0, sizeof(*s));
  631. s->host = parent;
  632. s->buffer = cbuffer_new(1024, 1024*1024);
  633. s->build = buffer_create(1);
  634. #ifdef ENABLE_COMPRESSION
  635. s->rrdpush_compression = default_compression_enabled;
  636. if (default_compression_enabled)
  637. s->compressor = create_compressor();
  638. #endif
  639. netdata_mutex_init(&s->mutex);
  640. }
  641. void *rrdpush_sender_thread(void *ptr) {
  642. struct sender_state *s = ptr;
  643. s->task_id = gettid();
  644. if(!s->host->rrdpush_send_enabled || !s->host->rrdpush_send_destination ||
  645. !*s->host->rrdpush_send_destination || !s->host->rrdpush_send_api_key ||
  646. !*s->host->rrdpush_send_api_key) {
  647. error("STREAM %s [send]: thread created (task id %d), but host has streaming disabled.",
  648. s->host->hostname, s->task_id);
  649. return NULL;
  650. }
  651. #ifdef ENABLE_HTTPS
  652. if (netdata_use_ssl_on_stream & NETDATA_SSL_FORCE ){
  653. security_start_ssl(NETDATA_SSL_CONTEXT_STREAMING);
  654. security_location_for_context(netdata_client_ctx, netdata_ssl_ca_file, netdata_ssl_ca_path);
  655. }
  656. #endif
  657. info("STREAM %s [send]: thread created (task id %d)", s->host->hostname, s->task_id);
  658. s->timeout = (int)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "timeout seconds", 60);
  659. s->default_port = (int)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "default port", 19999);
  660. s->buffer->max_size =
  661. (size_t)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "buffer size bytes", 1024 * 1024 * 10);
  662. s->reconnect_delay =
  663. (unsigned int)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "reconnect delay seconds", 5);
  664. remote_clock_resync_iterations = (unsigned int)appconfig_get_number(
  665. &stream_config, CONFIG_SECTION_STREAM,
  666. "initial clock resync iterations",
  667. remote_clock_resync_iterations); // TODO: REMOVE FOR SLEW / GAPFILLING
  668. // initialize rrdpush globals
  669. s->host->rrdpush_sender_connected = 0;
  670. if(pipe(s->host->rrdpush_sender_pipe) == -1) {
  671. error("STREAM %s [send]: cannot create required pipe. DISABLING STREAMING THREAD", s->host->hostname);
  672. return NULL;
  673. }
  674. s->version = STREAMING_PROTOCOL_CURRENT_VERSION;
  675. enum {
  676. Collector,
  677. Socket
  678. };
  679. struct pollfd fds[2];
  680. fds[Collector].fd = s->host->rrdpush_sender_pipe[PIPE_READ];
  681. fds[Collector].events = POLLIN;
  682. worker_register("STREAMSND");
  683. worker_register_job_name(WORKER_SENDER_JOB_CONNECT, "connect");
  684. worker_register_job_name(WORKER_SENDER_JOB_PIPE_READ, "pipe read");
  685. worker_register_job_name(WORKER_SENDER_JOB_SOCKET_RECEIVE, "receive");
  686. worker_register_job_name(WORKER_SENDER_JOB_EXECUTE, "execute");
  687. worker_register_job_name(WORKER_SENDER_JOB_SOCKET_SEND, "send");
  688. // disconnection reasons
  689. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT, "disconnect timeout");
  690. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR, "disconnect poll error");
  691. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR, "disconnect socket error");
  692. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW, "disconnect overflow");
  693. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SSL_ERROR, "disconnect ssl error");
  694. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_PARENT_CLOSED, "disconnect parent closed");
  695. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR, "disconnect receive error");
  696. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR, "disconnect send error");
  697. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_NO_COMPRESSION, "disconnect no compression");
  698. worker_register_job_name(WORKER_SENDER_JOB_DISCONNECT_BAD_HANDSHAKE, "disconnect bad handshake");
  699. netdata_thread_cleanup_push(rrdpush_sender_thread_cleanup_callback, s->host);
  700. for(; s->host->rrdpush_send_enabled && !netdata_exit ;) {
  701. // check for outstanding cancellation requests
  702. netdata_thread_testcancel();
  703. // The connection attempt blocks (after which we use the socket in nonblocking)
  704. if(unlikely(s->host->rrdpush_sender_socket == -1)) {
  705. worker_is_busy(WORKER_SENDER_JOB_CONNECT);
  706. s->overflow = 0;
  707. s->read_len = 0;
  708. s->buffer->read = 0;
  709. s->buffer->write = 0;
  710. attempt_to_connect(s);
  711. if (s->version >= VERSION_GAP_FILLING) {
  712. time_t now = now_realtime_sec();
  713. sender_start(s);
  714. buffer_sprintf(s->build, "TIMESTAMP %"PRId64"", (int64_t)now);
  715. sender_commit(s);
  716. }
  717. rrdpush_claimed_id(s->host);
  718. continue;
  719. }
  720. // If the TCP window never opened then something is wrong, restart connection
  721. if(unlikely(now_monotonic_sec() - s->last_sent_t > s->timeout)) {
  722. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_TIMEOUT);
  723. error("STREAM %s [send to %s]: could not send metrics for %d seconds - closing connection - we have sent %zu bytes on this connection via %zu send attempts.", s->host->hostname, s->connected_to, s->timeout, s->sent_bytes_on_this_connection, s->send_attempts);
  724. rrdpush_sender_thread_close_socket(s->host);
  725. continue;
  726. }
  727. worker_is_idle();
  728. // Wait until buffer opens in the socket or a rrdset_done_push wakes us
  729. fds[Collector].revents = 0;
  730. fds[Socket].revents = 0;
  731. fds[Socket].fd = s->host->rrdpush_sender_socket;
  732. netdata_mutex_lock(&s->mutex);
  733. char *chunk;
  734. size_t outstanding = cbuffer_next_unsafe(s->host->sender->buffer, &chunk);
  735. chunk = NULL; // Do not cache pointer outside of region - could be invalidated
  736. netdata_mutex_unlock(&s->mutex);
  737. if(outstanding) {
  738. s->send_attempts++;
  739. fds[Socket].events = POLLIN | POLLOUT;
  740. }
  741. else {
  742. fds[Socket].events = POLLIN;
  743. }
  744. int retval = poll(fds, 2, 1000);
  745. debug(D_STREAM, "STREAM: poll() finished collector=%d socket=%d (current chunk %zu bytes)...",
  746. fds[Collector].revents, fds[Socket].revents, outstanding);
  747. if(unlikely(netdata_exit)) break;
  748. // Spurious wake-ups without error - loop again
  749. if (retval == 0 || ((retval == -1) && (errno == EAGAIN || errno == EINTR))) {
  750. debug(D_STREAM, "Spurious wakeup");
  751. continue;
  752. }
  753. // Only errors from poll() are internal, but try restarting the connection
  754. if(unlikely(retval == -1)) {
  755. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_POLL_ERROR);
  756. error("STREAM %s [send to %s]: failed to poll(). Closing socket.", s->host->hostname, s->connected_to);
  757. rrdpush_sender_thread_close_socket(s->host);
  758. continue;
  759. }
  760. // If the collector woke us up then empty the pipe to remove the signal
  761. if (fds[Collector].revents & POLLIN || fds[Collector].revents & POLLPRI) {
  762. worker_is_busy(WORKER_SENDER_JOB_PIPE_READ);
  763. debug(D_STREAM, "STREAM: Data added to send buffer (current buffer chunk %zu bytes)...", outstanding);
  764. char buffer[1000 + 1];
  765. if (read(s->host->rrdpush_sender_pipe[PIPE_READ], buffer, 1000) == -1)
  766. error("STREAM %s [send to %s]: cannot read from internal pipe.", s->host->hostname, s->connected_to);
  767. }
  768. // Read as much as possible to fill the buffer, split into full lines for execution.
  769. if (fds[Socket].revents & POLLIN) {
  770. worker_is_busy(WORKER_SENDER_JOB_SOCKET_RECEIVE);
  771. attempt_read(s);
  772. }
  773. worker_is_busy(WORKER_SENDER_JOB_EXECUTE);
  774. execute_commands(s);
  775. // If we have data and have seen the TCP window open then try to close it by a transmission.
  776. if (outstanding && fds[Socket].revents & POLLOUT) {
  777. worker_is_busy(WORKER_SENDER_JOB_SOCKET_SEND);
  778. attempt_to_send(s);
  779. }
  780. // TODO-GAPS - why do we only check this on the socket, not the pipe?
  781. if (outstanding) {
  782. char *error = NULL;
  783. if (unlikely(fds[Socket].revents & POLLERR))
  784. error = "socket reports errors (POLLERR)";
  785. else if (unlikely(fds[Socket].revents & POLLHUP))
  786. error = "connection closed by remote end (POLLHUP)";
  787. else if (unlikely(fds[Socket].revents & POLLNVAL))
  788. error = "connection is invalid (POLLNVAL)";
  789. if(unlikely(error)) {
  790. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKER_ERROR);
  791. error("STREAM %s [send to %s]: restart stream because %s - %zu bytes transmitted.", s->host->hostname,
  792. s->connected_to, error, s->sent_bytes_on_this_connection);
  793. rrdpush_sender_thread_close_socket(s->host);
  794. }
  795. }
  796. // protection from overflow
  797. if (s->overflow) {
  798. worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_OVERFLOW);
  799. errno = 0;
  800. error("STREAM %s [send to %s]: buffer full (%zu-bytes) after %zu bytes. Restarting connection",
  801. s->host->hostname, s->connected_to, s->buffer->size, s->sent_bytes_on_this_connection);
  802. rrdpush_sender_thread_close_socket(s->host);
  803. }
  804. }
  805. netdata_thread_cleanup_pop(1);
  806. return NULL;
  807. }