aclk_rx_msgs.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "aclk_rx_msgs.h"
  3. #include "aclk_stats.h"
  4. #include "aclk_query_queue.h"
  5. #include "aclk.h"
  6. #include "schema-wrappers/proto_2_json.h"
  // Byte sequence CR LF CR LF; the text following its first occurrence in a
  // raw 'cmd' message is treated as the embedded request.
  #define ACLK_V2_PAYLOAD_SEPARATOR "\x0D\x0A\x0D\x0A"

  // Embedded requests are accepted only when they begin with this prefix.
  #define ACLK_CLOUD_REQ_V2_PREFIX "GET /"

  // Requests with a "version" lower than this are rejected by the HTTP handler.
  #define ACLK_V_COMPRESSION 2
  /*
   * Fields collected from the JSON dictionary of a cloud 'cmd' message.
   * The string members are heap copies made while parsing.
   */
  struct aclk_request {
      char *type_id;        // value of the "type" key
      char *msg_id;         // value of the "msg-id" key
      char *callback_topic; // value of the "callback-topic" key
      char *payload;        // URL-decoded value of the "payload" key
      int version;          // value of the "version" key
      int timeout;          // value of the "timeout" key
      int min_version;      // value of the "min-version" key
      int max_version;      // value of the "max-version" key
  };
  20. static int cloud_to_agent_parse(JSON_ENTRY *e)
  21. {
  22. struct aclk_request *data = e->callback_data;
  23. switch (e->type) {
  24. case JSON_OBJECT:
  25. case JSON_ARRAY:
  26. break;
  27. case JSON_STRING:
  28. if (!strcmp(e->name, "msg-id")) {
  29. data->msg_id = strdupz(e->data.string);
  30. break;
  31. }
  32. if (!strcmp(e->name, "type")) {
  33. data->type_id = strdupz(e->data.string);
  34. break;
  35. }
  36. if (!strcmp(e->name, "callback-topic")) {
  37. data->callback_topic = strdupz(e->data.string);
  38. break;
  39. }
  40. if (!strcmp(e->name, "payload")) {
  41. if (likely(e->data.string)) {
  42. size_t len = strlen(e->data.string);
  43. data->payload = mallocz(len+1);
  44. if (!url_decode_r(data->payload, e->data.string, len + 1))
  45. strcpy(data->payload, e->data.string);
  46. }
  47. break;
  48. }
  49. break;
  50. case JSON_NUMBER:
  51. if (!strcmp(e->name, "version")) {
  52. data->version = (int)e->data.number;
  53. break;
  54. }
  55. if (!strcmp(e->name, "timeout")) {
  56. data->timeout = (int)e->data.number;
  57. break;
  58. }
  59. if (!strcmp(e->name, "min-version")) {
  60. data->min_version = (int)e->data.number;
  61. break;
  62. }
  63. if (!strcmp(e->name, "max-version")) {
  64. data->max_version = (int)e->data.number;
  65. break;
  66. }
  67. break;
  68. case JSON_BOOLEAN:
  69. break;
  70. case JSON_NULL:
  71. break;
  72. }
  73. return 0;
  74. }
  75. static inline int aclk_extract_v2_data(char *payload, char **data)
  76. {
  77. char* ptr = strstr(payload, ACLK_V2_PAYLOAD_SEPARATOR);
  78. if(!ptr)
  79. return 1;
  80. ptr += strlen(ACLK_V2_PAYLOAD_SEPARATOR);
  81. *data = strdupz(ptr);
  82. return 0;
  83. }
  84. static inline int aclk_v2_payload_get_query(const char *payload, char **query_url)
  85. {
  86. const char *start, *end;
  87. // TODO better check of URL
  88. if(strncmp(payload, ACLK_CLOUD_REQ_V2_PREFIX, strlen(ACLK_CLOUD_REQ_V2_PREFIX))) {
  89. errno = 0;
  90. error("Only accepting requests that start with \"%s\" from CLOUD.", ACLK_CLOUD_REQ_V2_PREFIX);
  91. return 1;
  92. }
  93. start = payload + 4;
  94. if(!(end = strstr(payload, " HTTP/1.1\x0D\x0A"))) {
  95. errno = 0;
  96. error("Doesn't look like HTTP GET request.");
  97. return 1;
  98. }
  99. *query_url = mallocz((end - start) + 1);
  100. strncpyz(*query_url, start, end - start);
  101. return 0;
  102. }
  103. static int aclk_handle_cloud_http_request_v2(struct aclk_request *cloud_to_agent, char *raw_payload)
  104. {
  105. aclk_query_t query;
  106. errno = 0;
  107. if (cloud_to_agent->version < ACLK_V_COMPRESSION) {
  108. error(
  109. "This handler cannot reply to request with version older than %d, received %d.",
  110. ACLK_V_COMPRESSION,
  111. cloud_to_agent->version);
  112. return 1;
  113. }
  114. query = aclk_query_new(HTTP_API_V2);
  115. if (unlikely(aclk_extract_v2_data(raw_payload, &query->data.http_api_v2.payload))) {
  116. error("Error extracting payload expected after the JSON dictionary.");
  117. goto error;
  118. }
  119. if (unlikely(aclk_v2_payload_get_query(query->data.http_api_v2.payload, &query->dedup_id))) {
  120. error("Could not extract payload from query");
  121. goto error;
  122. }
  123. if (unlikely(!cloud_to_agent->callback_topic)) {
  124. error("Missing callback_topic");
  125. goto error;
  126. }
  127. if (unlikely(!cloud_to_agent->msg_id)) {
  128. error("Missing msg_id");
  129. goto error;
  130. }
  131. // aclk_queue_query takes ownership of data pointer
  132. query->callback_topic = cloud_to_agent->callback_topic;
  133. query->timeout = cloud_to_agent->timeout;
  134. // for clarity and code readability as when we process the request
  135. // it would be strange to get URL from `dedup_id`
  136. query->data.http_api_v2.query = query->dedup_id;
  137. query->msg_id = cloud_to_agent->msg_id;
  138. aclk_queue_query(query);
  139. return 0;
  140. error:
  141. aclk_query_free(query);
  142. return 1;
  143. }
  144. int aclk_handle_cloud_cmd_message(char *payload)
  145. {
  146. struct aclk_request cloud_to_agent;
  147. memset(&cloud_to_agent, 0, sizeof(struct aclk_request));
  148. if (unlikely(!payload)) {
  149. error_report("ACLK incoming 'cmd' message is empty");
  150. return 1;
  151. }
  152. debug(D_ACLK, "ACLK incoming 'cmd' message (%s)", payload);
  153. int rc = json_parse(payload, &cloud_to_agent, cloud_to_agent_parse);
  154. if (unlikely(rc != JSON_OK)) {
  155. error_report("Malformed json request (%s)", payload);
  156. goto err_cleanup;
  157. }
  158. if (!cloud_to_agent.type_id) {
  159. error_report("Cloud message is missing compulsory key \"type\"");
  160. goto err_cleanup;
  161. }
  162. // Originally we were expecting to have multiple types of 'cmd' message,
  163. // but after the new protocol was designed we will ever only have 'http'
  164. if (strcmp(cloud_to_agent.type_id, "http")) {
  165. error_report("Only 'http' cmd message is supported");
  166. goto err_cleanup;
  167. }
  168. if (likely(!aclk_handle_cloud_http_request_v2(&cloud_to_agent, payload))) {
  169. // aclk_handle_cloud_request takes ownership of the pointers
  170. // (to avoid copying) in case of success
  171. freez(cloud_to_agent.type_id);
  172. return 0;
  173. }
  174. err_cleanup:
  175. if (cloud_to_agent.payload)
  176. freez(cloud_to_agent.payload);
  177. if (cloud_to_agent.type_id)
  178. freez(cloud_to_agent.type_id);
  179. if (cloud_to_agent.msg_id)
  180. freez(cloud_to_agent.msg_id);
  181. if (cloud_to_agent.callback_topic)
  182. freez(cloud_to_agent.callback_topic);
  183. return 1;
  184. }
  // Type used to store the hash of a message name.
  typedef uint32_t simple_hash_t;

  // Signature shared by all incoming message handlers: receives the message
  // bytes and their length, returns 0 on success and non-zero on error.
  typedef int (*rx_msg_handler)(const char *msg, size_t msg_len);
  /*
   * Handler for the legacy 'cmd' message.
   *
   * The incoming bytes are not NUL-terminated, so they are copied into a
   * temporary C string before being passed to aclk_handle_cloud_cmd_message().
   *
   * Returns 0 on success, 1 when the message could not be handled.
   */
  int handle_old_proto_cmd(const char *msg, size_t msg_len)
  {
      // msg is binary payload in all other cases
      // however in this message from old legacy cloud
      // we have to convert it to C string
      char *text = mallocz(msg_len + 1);
      memcpy(text, msg, msg_len);
      text[msg_len] = 0;

      int failed = (aclk_handle_cloud_cmd_message(text) != 0);

      freez(text);
      return failed;
  }
  202. int create_node_instance_result(const char *msg, size_t msg_len)
  203. {
  204. node_instance_creation_result_t res = parse_create_node_instance_result(msg, msg_len);
  205. if (!res.machine_guid || !res.node_id) {
  206. error_report("Error parsing CreateNodeInstanceResult");
  207. freez(res.machine_guid);
  208. freez(res.node_id);
  209. return 1;
  210. }
  211. debug(D_ACLK, "CreateNodeInstanceResult: guid:%s nodeid:%s", res.machine_guid, res.node_id);
  212. uuid_t host_id, node_id;
  213. if (uuid_parse(res.machine_guid, host_id)) {
  214. error("Error parsing machine_guid provided by CreateNodeInstanceResult");
  215. freez(res.machine_guid);
  216. freez(res.node_id);
  217. return 1;
  218. }
  219. if (uuid_parse(res.node_id, node_id)) {
  220. error("Error parsing node_id provided by CreateNodeInstanceResult");
  221. freez(res.machine_guid);
  222. freez(res.node_id);
  223. return 1;
  224. }
  225. update_node_id(&host_id, &node_id);
  226. aclk_query_t query = aclk_query_new(NODE_STATE_UPDATE);
  227. node_instance_connection_t node_state_update = {
  228. .hops = 1,
  229. .live = 0,
  230. .queryable = 1,
  231. .session_id = aclk_session_newarch,
  232. .node_id = res.node_id
  233. };
  234. RRDHOST *host = rrdhost_find_by_guid(res.machine_guid, 0);
  235. if (host) {
  236. // not all host must have RRDHOST struct created for them
  237. // if they never connected during runtime of agent
  238. if (host == localhost) {
  239. node_state_update.live = 1;
  240. node_state_update.hops = 0;
  241. } else {
  242. netdata_mutex_lock(&host->receiver_lock);
  243. node_state_update.live = (host->receiver != NULL);
  244. netdata_mutex_unlock(&host->receiver_lock);
  245. node_state_update.hops = host->system_info->hops;
  246. }
  247. }
  248. struct capability caps[] = {
  249. { .name = "proto", .version = 1, .enabled = 1 },
  250. { .name = "ml", .version = ml_capable(localhost), .enabled = host ? ml_enabled(host) : 0 },
  251. { .name = "mc", .version = enable_metric_correlations ? metric_correlations_version : 0, .enabled = enable_metric_correlations },
  252. { .name = "ctx", .version = 1, .enabled = 1 },
  253. { .name = NULL, .version = 0, .enabled = 0 }
  254. };
  255. node_state_update.capabilities = caps;
  256. rrdhost_aclk_state_lock(localhost);
  257. node_state_update.claim_id = localhost->aclk_state.claimed_id;
  258. query->data.bin_payload.payload = generate_node_instance_connection(&query->data.bin_payload.size, &node_state_update);
  259. rrdhost_aclk_state_unlock(localhost);
  260. query->data.bin_payload.msg_name = "UpdateNodeInstanceConnection";
  261. query->data.bin_payload.topic = ACLK_TOPICID_NODE_CONN;
  262. aclk_queue_query(query);
  263. freez(res.node_id);
  264. freez(res.machine_guid);
  265. return 0;
  266. }
  /*
   * Handler for "SendNodeInstances". The message content is not inspected;
   * the handler only calls aclk_send_node_instances() and returns 0.
   */
  int send_node_instances(const char *msg, size_t msg_len)
  {
      // both parameters exist only to satisfy the rx_msg_handler signature
      UNUSED(msg_len);
      UNUSED(msg);

      aclk_send_node_instances();

      return 0;
  }
  274. int stream_charts_and_dimensions(const char *msg, size_t msg_len)
  275. {
  276. aclk_ctx_based = 0;
  277. stream_charts_and_dims_t res = parse_stream_charts_and_dims(msg, msg_len);
  278. if (!res.claim_id || !res.node_id) {
  279. error("Error parsing StreamChartsAndDimensions msg");
  280. freez(res.claim_id);
  281. freez(res.node_id);
  282. return 1;
  283. }
  284. chart_batch_id = res.batch_id;
  285. aclk_start_streaming(res.node_id, res.seq_id, res.seq_id_created_at.tv_sec, res.batch_id);
  286. freez(res.claim_id);
  287. freez(res.node_id);
  288. return 0;
  289. }
  290. int charts_and_dimensions_ack(const char *msg, size_t msg_len)
  291. {
  292. chart_and_dim_ack_t res = parse_chart_and_dimensions_ack(msg, msg_len);
  293. if (!res.claim_id || !res.node_id) {
  294. error("Error parsing StreamChartsAndDimensions msg");
  295. freez(res.claim_id);
  296. freez(res.node_id);
  297. return 1;
  298. }
  299. aclk_ack_chart_sequence_id(res.node_id, res.last_seq_id);
  300. freez(res.claim_id);
  301. freez(res.node_id);
  302. return 0;
  303. }
  304. int update_chart_configs(const char *msg, size_t msg_len)
  305. {
  306. struct update_chart_config res = parse_update_chart_config(msg, msg_len);
  307. if (!res.claim_id || !res.node_id || !res.hashes)
  308. error("Error parsing UpdateChartConfigs msg");
  309. else
  310. aclk_get_chart_config(res.hashes);
  311. destroy_update_chart_config(&res);
  312. return 0;
  313. }
  314. int start_alarm_streaming(const char *msg, size_t msg_len)
  315. {
  316. struct start_alarm_streaming res = parse_start_alarm_streaming(msg, msg_len);
  317. if (!res.node_id || !res.batch_id) {
  318. error("Error parsing StartAlarmStreaming");
  319. freez(res.node_id);
  320. return 1;
  321. }
  322. aclk_start_alert_streaming(res.node_id, res.batch_id, res.start_seq_id);
  323. freez(res.node_id);
  324. return 0;
  325. }
  /*
   * Handler for "SendAlarmLogHealth".
   *
   * Sends the alarm health log of the node whose id is carried in the message.
   *
   * Returns 0 on success, 1 when no node id could be parsed.
   */
  int send_alarm_log_health(const char *msg, size_t msg_len)
  {
      char *node_id = parse_send_alarm_log_health(msg, msg_len);

      if (node_id) {
          aclk_send_alarm_health_log(node_id);
          freez(node_id);
          return 0;
      }

      error("Error parsing SendAlarmLogHealth");
      return 1;
  }
  /*
   * Handler for "SendAlarmConfiguration".
   *
   * Sends the alarm configuration identified by the hash in the message.
   *
   * Returns 0 on success, 1 when the hash is missing or an empty string.
   */
  int send_alarm_configuration(const char *msg, size_t msg_len)
  {
      char *config_hash = parse_send_alarm_configuration(msg, msg_len);
      int usable = (config_hash && *config_hash);

      if (usable)
          aclk_send_alarm_configuration(config_hash);
      else
          error("Error parsing SendAlarmConfiguration");

      freez(config_hash);
      return usable ? 0 : 1;
  }
  349. int send_alarm_snapshot(const char *msg, size_t msg_len)
  350. {
  351. struct send_alarm_snapshot *sas = parse_send_alarm_snapshot(msg, msg_len);
  352. if (!sas->node_id || !sas->claim_id) {
  353. error("Error parsing SendAlarmSnapshot");
  354. destroy_send_alarm_snapshot(sas);
  355. return 1;
  356. }
  357. aclk_process_send_alarm_snapshot(sas->node_id, sas->claim_id, sas->snapshot_id, sas->sequence_id);
  358. destroy_send_alarm_snapshot(sas);
  359. return 0;
  360. }
  361. int handle_disconnect_req(const char *msg, size_t msg_len)
  362. {
  363. struct disconnect_cmd *cmd = parse_disconnect_cmd(msg, msg_len);
  364. if (!cmd)
  365. return 1;
  366. if (cmd->permaban) {
  367. error("Cloud Banned This Agent!");
  368. aclk_disable_runtime = 1;
  369. }
  370. info("Cloud requested disconnect (EC=%u, \"%s\")", (unsigned int)cmd->error_code, cmd->error_description);
  371. if (cmd->reconnect_after_s > 0) {
  372. aclk_block_until = now_monotonic_sec() + cmd->reconnect_after_s;
  373. info(
  374. "Cloud asks not to reconnect for %u seconds. We shall honor that request",
  375. (unsigned int)cmd->reconnect_after_s);
  376. }
  377. disconnect_req = 1;
  378. freez(cmd->error_description);
  379. freez(cmd);
  380. return 0;
  381. }
  382. int contexts_checkpoint(const char *msg, size_t msg_len)
  383. {
  384. aclk_ctx_based = 1;
  385. struct ctxs_checkpoint *cmd = parse_ctxs_checkpoint(msg, msg_len);
  386. if (!cmd)
  387. return 1;
  388. rrdcontext_hub_checkpoint_command(cmd);
  389. freez(cmd->claim_id);
  390. freez(cmd->node_id);
  391. freez(cmd);
  392. return 0;
  393. }
  394. int stop_streaming_contexts(const char *msg, size_t msg_len)
  395. {
  396. if (!aclk_ctx_based) {
  397. error_report("Received StopStreamingContexts message but context based communication was not enabled (Cloud violated the protocol). Ignoring message");
  398. return 1;
  399. }
  400. struct stop_streaming_ctxs *cmd = parse_stop_streaming_ctxs(msg, msg_len);
  401. if (!cmd)
  402. return 1;
  403. rrdcontext_hub_stop_streaming_command(cmd);
  404. freez(cmd->claim_id);
  405. freez(cmd->node_id);
  406. freez(cmd);
  407. return 0;
  408. }
  409. typedef struct {
  410. const char *name;
  411. simple_hash_t name_hash;
  412. rx_msg_handler fnc;
  413. } new_cloud_rx_msg_t;
  414. new_cloud_rx_msg_t rx_msgs[] = {
  415. { .name = "cmd", .name_hash = 0, .fnc = handle_old_proto_cmd },
  416. { .name = "CreateNodeInstanceResult", .name_hash = 0, .fnc = create_node_instance_result },
  417. { .name = "SendNodeInstances", .name_hash = 0, .fnc = send_node_instances },
  418. { .name = "StreamChartsAndDimensions", .name_hash = 0, .fnc = stream_charts_and_dimensions },
  419. { .name = "ChartsAndDimensionsAck", .name_hash = 0, .fnc = charts_and_dimensions_ack },
  420. { .name = "UpdateChartConfigs", .name_hash = 0, .fnc = update_chart_configs },
  421. { .name = "StartAlarmStreaming", .name_hash = 0, .fnc = start_alarm_streaming },
  422. { .name = "SendAlarmLogHealth", .name_hash = 0, .fnc = send_alarm_log_health },
  423. { .name = "SendAlarmConfiguration", .name_hash = 0, .fnc = send_alarm_configuration },
  424. { .name = "SendAlarmSnapshot", .name_hash = 0, .fnc = send_alarm_snapshot },
  425. { .name = "DisconnectReq", .name_hash = 0, .fnc = handle_disconnect_req },
  426. { .name = "ContextsCheckpoint", .name_hash = 0, .fnc = contexts_checkpoint },
  427. { .name = "StopStreamingContexts", .name_hash = 0, .fnc = stop_streaming_contexts },
  428. { .name = NULL, .name_hash = 0, .fnc = NULL },
  429. };
  430. new_cloud_rx_msg_t *find_rx_handler_by_hash(simple_hash_t hash)
  431. {
  432. // we can afford to not compare strings after hash match
  433. // because we check for collisions at initialization in
  434. // aclk_init_rx_msg_handlers()
  435. for (int i = 0; rx_msgs[i].fnc; i++) {
  436. if (rx_msgs[i].name_hash == hash)
  437. return &rx_msgs[i];
  438. }
  439. return NULL;
  440. }
  441. const char *rx_handler_get_name(size_t i)
  442. {
  443. return rx_msgs[i].name;
  444. }
  445. unsigned int aclk_init_rx_msg_handlers(void)
  446. {
  447. int i;
  448. for (i = 0; rx_msgs[i].fnc; i++) {
  449. simple_hash_t hash = simple_hash(rx_msgs[i].name);
  450. new_cloud_rx_msg_t *hdl = find_rx_handler_by_hash(hash);
  451. if (unlikely(hdl)) {
  452. // the list of message names changes only by changing
  453. // the source code, therefore fatal is appropriate
  454. fatal("Hash collision. Choose better hash. Added '%s' clashes with existing '%s'", rx_msgs[i].name, hdl->name);
  455. }
  456. rx_msgs[i].name_hash = hash;
  457. }
  458. return i;
  459. }
  460. void aclk_handle_new_cloud_msg(const char *message_type, const char *msg, size_t msg_len, const char *topic)
  461. {
  462. if (aclk_stats_enabled) {
  463. ACLK_STATS_LOCK;
  464. aclk_metrics_per_sample.cloud_req_recvd++;
  465. ACLK_STATS_UNLOCK;
  466. }
  467. new_cloud_rx_msg_t *msg_descriptor = find_rx_handler_by_hash(simple_hash(message_type));
  468. debug(D_ACLK, "Got message named '%s' from cloud", message_type);
  469. if (unlikely(!msg_descriptor)) {
  470. error("Do not know how to handle message of type '%s'. Ignoring", message_type);
  471. if (aclk_stats_enabled) {
  472. ACLK_STATS_LOCK;
  473. aclk_metrics_per_sample.cloud_req_err++;
  474. ACLK_STATS_UNLOCK;
  475. }
  476. return;
  477. }
  478. #ifdef NETDATA_INTERNAL_CHECKS
  479. if (!strncmp(message_type, "cmd", strlen("cmd"))) {
  480. log_aclk_message_bin(msg, msg_len, 0, topic, msg_descriptor->name);
  481. } else {
  482. char *json = protomsg_to_json(msg, msg_len, msg_descriptor->name);
  483. log_aclk_message_bin(json, strlen(json), 0, topic, msg_descriptor->name);
  484. freez(json);
  485. }
  486. #endif
  487. if (aclk_stats_enabled) {
  488. ACLK_STATS_LOCK;
  489. aclk_proto_rx_msgs_sample[msg_descriptor-rx_msgs]++;
  490. ACLK_STATS_UNLOCK;
  491. }
  492. if (msg_descriptor->fnc(msg, msg_len)) {
  493. error("Error processing message of type '%s'", message_type);
  494. if (aclk_stats_enabled) {
  495. ACLK_STATS_LOCK;
  496. aclk_metrics_per_sample.cloud_req_err++;
  497. ACLK_STATS_UNLOCK;
  498. }
  499. return;
  500. }
  501. }