aclk_rx_msgs.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "aclk_rx_msgs.h"
  3. #include "aclk_stats.h"
  4. #include "aclk_query_queue.h"
  5. #include "aclk.h"
  6. #include "aclk_capas.h"
  7. #include "aclk_query.h"
  8. #include "schema-wrappers/proto_2_json.h"
  // Marker searched for in an incoming 'cmd' message: the data the request
  // handler works on is whatever follows this CR LF CR LF sequence.
  #define ACLK_V2_PAYLOAD_SEPARATOR "\r\n\r\n"
  // Oldest request version the HTTP request handler in this file accepts.
  #define ACLK_V_COMPRESSION 2
  // Values collected by cloud_to_agent_parse() from the JSON part of a
  // cloud 'cmd' message. The name of the JSON key feeding each field is
  // given next to it.
  struct aclk_request {
      // copies of JSON string values (released with freez() by their owner)
      char *type_id;          // "type"
      char *msg_id;           // "msg-id"
      char *callback_topic;   // "callback-topic"
      char *payload;          // "payload" (URL-decoded when decoding succeeds)

      // JSON numbers converted to int
      int version;            // "version"
      int timeout;            // "timeout"
      int min_version;        // "min-version"
      int max_version;        // "max-version"
  };
  21. static int cloud_to_agent_parse(JSON_ENTRY *e)
  22. {
  23. struct aclk_request *data = e->callback_data;
  24. switch (e->type) {
  25. case JSON_OBJECT:
  26. case JSON_ARRAY:
  27. break;
  28. case JSON_STRING:
  29. if (!strcmp(e->name, "msg-id")) {
  30. data->msg_id = strdupz(e->data.string);
  31. break;
  32. }
  33. if (!strcmp(e->name, "type")) {
  34. data->type_id = strdupz(e->data.string);
  35. break;
  36. }
  37. if (!strcmp(e->name, "callback-topic")) {
  38. data->callback_topic = strdupz(e->data.string);
  39. break;
  40. }
  41. if (!strcmp(e->name, "payload")) {
  42. if (likely(e->data.string)) {
  43. size_t len = strlen(e->data.string);
  44. data->payload = mallocz(len+1);
  45. if (!url_decode_r(data->payload, e->data.string, len + 1))
  46. strcpy(data->payload, e->data.string);
  47. }
  48. break;
  49. }
  50. break;
  51. case JSON_NUMBER:
  52. if (!strcmp(e->name, "version")) {
  53. data->version = (int)e->data.number;
  54. break;
  55. }
  56. if (!strcmp(e->name, "timeout")) {
  57. data->timeout = (int)e->data.number;
  58. break;
  59. }
  60. if (!strcmp(e->name, "min-version")) {
  61. data->min_version = (int)e->data.number;
  62. break;
  63. }
  64. if (!strcmp(e->name, "max-version")) {
  65. data->max_version = (int)e->data.number;
  66. break;
  67. }
  68. break;
  69. case JSON_BOOLEAN:
  70. break;
  71. case JSON_NULL:
  72. break;
  73. }
  74. return 0;
  75. }
  76. static inline int aclk_extract_v2_data(char *payload, char **data)
  77. {
  78. char* ptr = strstr(payload, ACLK_V2_PAYLOAD_SEPARATOR);
  79. if(!ptr)
  80. return 1;
  81. ptr += strlen(ACLK_V2_PAYLOAD_SEPARATOR);
  82. *data = strdupz(ptr);
  83. return 0;
  84. }
  85. static inline int aclk_v2_payload_get_query(const char *payload, char **query_url)
  86. {
  87. const char *start, *end;
  88. if(strncmp(payload, "GET /", 5) == 0 || strncmp(payload, "PUT /", 5) == 0)
  89. start = payload + 4;
  90. else if(strncmp(payload, "POST /", 6) == 0)
  91. start = payload + 5;
  92. else if(strncmp(payload, "DELETE /", 8) == 0)
  93. start = payload + 7;
  94. else {
  95. errno = 0;
  96. netdata_log_error("Only accepting requests that start with GET, POST, PUT, DELETE from CLOUD.");
  97. return 1;
  98. }
  99. if(!(end = strstr(payload, HTTP_1_1 HTTP_ENDL))) {
  100. errno = 0;
  101. netdata_log_error("Doesn't look like HTTP GET request.");
  102. return 1;
  103. }
  104. *query_url = mallocz((end - start) + 1);
  105. strncpyz(*query_url, start, end - start);
  106. return 0;
  107. }
  108. static int aclk_handle_cloud_http_request_v2(struct aclk_request *cloud_to_agent, char *raw_payload)
  109. {
  110. aclk_query_t query;
  111. errno = 0;
  112. if (cloud_to_agent->version < ACLK_V_COMPRESSION) {
  113. netdata_log_error(
  114. "This handler cannot reply to request with version older than %d, received %d.",
  115. ACLK_V_COMPRESSION,
  116. cloud_to_agent->version);
  117. return 1;
  118. }
  119. query = aclk_query_new(HTTP_API_V2);
  120. if (unlikely(aclk_extract_v2_data(raw_payload, &query->data.http_api_v2.payload))) {
  121. netdata_log_error("Error extracting payload expected after the JSON dictionary.");
  122. goto error;
  123. }
  124. if (unlikely(aclk_v2_payload_get_query(query->data.http_api_v2.payload, &query->dedup_id))) {
  125. netdata_log_error("Could not extract payload from query");
  126. goto error;
  127. }
  128. if (unlikely(!cloud_to_agent->callback_topic)) {
  129. netdata_log_error("Missing callback_topic");
  130. goto error;
  131. }
  132. if (unlikely(!cloud_to_agent->msg_id)) {
  133. netdata_log_error("Missing msg_id");
  134. goto error;
  135. }
  136. // aclk_queue_query takes ownership of data pointer
  137. query->callback_topic = cloud_to_agent->callback_topic;
  138. query->timeout = cloud_to_agent->timeout;
  139. // for clarity and code readability as when we process the request
  140. // it would be strange to get URL from `dedup_id`
  141. query->data.http_api_v2.query = query->dedup_id;
  142. query->msg_id = cloud_to_agent->msg_id;
  143. aclk_queue_query(query);
  144. return 0;
  145. error:
  146. aclk_query_free(query);
  147. return 1;
  148. }
  149. int aclk_handle_cloud_cmd_message(char *payload)
  150. {
  151. struct aclk_request cloud_to_agent;
  152. memset(&cloud_to_agent, 0, sizeof(struct aclk_request));
  153. if (unlikely(!payload)) {
  154. error_report("ACLK incoming 'cmd' message is empty");
  155. return 1;
  156. }
  157. netdata_log_debug(D_ACLK, "ACLK incoming 'cmd' message (%s)", payload);
  158. int rc = json_parse(payload, &cloud_to_agent, cloud_to_agent_parse);
  159. if (unlikely(rc != JSON_OK)) {
  160. error_report("Malformed json request (%s)", payload);
  161. goto err_cleanup;
  162. }
  163. if (!cloud_to_agent.type_id) {
  164. error_report("Cloud message is missing compulsory key \"type\"");
  165. goto err_cleanup;
  166. }
  167. // Originally we were expecting to have multiple types of 'cmd' message,
  168. // but after the new protocol was designed we will ever only have 'http'
  169. if (strcmp(cloud_to_agent.type_id, "http") != 0) {
  170. error_report("Only 'http' cmd message is supported");
  171. goto err_cleanup;
  172. }
  173. if (likely(!aclk_handle_cloud_http_request_v2(&cloud_to_agent, payload))) {
  174. // aclk_handle_cloud_request takes ownership of the pointers
  175. // (to avoid copying) in case of success
  176. freez(cloud_to_agent.type_id);
  177. return 0;
  178. }
  179. err_cleanup:
  180. if (cloud_to_agent.payload)
  181. freez(cloud_to_agent.payload);
  182. if (cloud_to_agent.type_id)
  183. freez(cloud_to_agent.type_id);
  184. if (cloud_to_agent.msg_id)
  185. freez(cloud_to_agent.msg_id);
  186. if (cloud_to_agent.callback_topic)
  187. freez(cloud_to_agent.callback_topic);
  188. return 1;
  189. }
  // Hash of a message name, as produced by simple_hash().
  typedef uint32_t simple_hash_t;

  // Handler of one cloud message type. Receives the message body and its
  // length; a non-zero return value is treated as a processing error.
  typedef int (*rx_msg_handler)(const char *msg, size_t msg_len);
  // Handler of the legacy "cmd" message.
  //
  // Unlike the other message types, whose body is a binary payload, this
  // one is text, so it is copied into a NUL-terminated buffer before being
  // passed to aclk_handle_cloud_cmd_message().
  //
  // Returns 0 on success, 1 when the message could not be handled.
  int handle_old_proto_cmd(const char *msg, size_t msg_len)
  {
      char *text = mallocz(msg_len + 1);
      memcpy(text, msg, msg_len);
      text[msg_len] = '\0';

      int failed = aclk_handle_cloud_cmd_message(text) != 0;

      freez(text);
      return failed;
  }
  207. int create_node_instance_result(const char *msg, size_t msg_len)
  208. {
  209. node_instance_creation_result_t res = parse_create_node_instance_result(msg, msg_len);
  210. if (!res.machine_guid || !res.node_id) {
  211. error_report("Error parsing CreateNodeInstanceResult");
  212. freez(res.machine_guid);
  213. freez(res.node_id);
  214. return 1;
  215. }
  216. netdata_log_debug(D_ACLK, "CreateNodeInstanceResult: guid:%s nodeid:%s", res.machine_guid, res.node_id);
  217. uuid_t host_id, node_id;
  218. if (uuid_parse(res.machine_guid, host_id)) {
  219. netdata_log_error("Error parsing machine_guid provided by CreateNodeInstanceResult");
  220. freez(res.machine_guid);
  221. freez(res.node_id);
  222. return 1;
  223. }
  224. if (uuid_parse(res.node_id, node_id)) {
  225. netdata_log_error("Error parsing node_id provided by CreateNodeInstanceResult");
  226. freez(res.machine_guid);
  227. freez(res.node_id);
  228. return 1;
  229. }
  230. update_node_id(&host_id, &node_id);
  231. aclk_query_t query = aclk_query_new(NODE_STATE_UPDATE);
  232. node_instance_connection_t node_state_update = {
  233. .hops = 1,
  234. .live = 0,
  235. .queryable = 1,
  236. .session_id = aclk_session_newarch,
  237. .node_id = res.node_id,
  238. .capabilities = NULL
  239. };
  240. RRDHOST *host = rrdhost_find_by_guid(res.machine_guid);
  241. if (likely(host)) {
  242. if (host == localhost) {
  243. node_state_update.live = 1;
  244. node_state_update.hops = 0;
  245. } else {
  246. node_state_update.live = (!rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN));
  247. node_state_update.hops = host->system_info->hops;
  248. }
  249. node_state_update.capabilities = aclk_get_node_instance_capas(host);
  250. }
  251. rrdhost_aclk_state_lock(localhost);
  252. node_state_update.claim_id = localhost->aclk_state.claimed_id;
  253. query->data.bin_payload.payload = generate_node_instance_connection(&query->data.bin_payload.size, &node_state_update);
  254. rrdhost_aclk_state_unlock(localhost);
  255. freez((void *)node_state_update.capabilities);
  256. query->data.bin_payload.msg_name = "UpdateNodeInstanceConnection";
  257. query->data.bin_payload.topic = ACLK_TOPICID_NODE_CONN;
  258. aclk_queue_query(query);
  259. freez(res.node_id);
  260. freez(res.machine_guid);
  261. return 0;
  262. }
  // Handler of the SendNodeInstances message: the message body is not
  // inspected, the request is simply forwarded to aclk_send_node_instances().
  // Always returns 0.
  int send_node_instances(const char *msg, size_t msg_len)
  {
      (void)msg;
      (void)msg_len;

      aclk_send_node_instances();
      return 0;
  }
  // Handler of the obsolete StreamChartsAndDimensions message: the body is
  // ignored and the reception is only reported. Always returns 0.
  int stream_charts_and_dimensions(const char *msg, size_t msg_len)
  {
      (void)msg;
      (void)msg_len;

      error_report("Received obsolete StreamChartsAndDimensions msg");
      return 0;
  }
  // Handler of the obsolete ChartsAndDimensionsAck message: the body is
  // ignored and the reception is only reported. Always returns 0.
  //
  // The report names the message exactly as it is registered in the
  // handler table, so the log can be matched to the message type.
  int charts_and_dimensions_ack(const char *msg, size_t msg_len)
  {
      UNUSED(msg);
      UNUSED(msg_len);

      error_report("Received obsolete ChartsAndDimensionsAck msg");
      return 0;
  }
  // Handler of the obsolete UpdateChartConfigs message: the body is ignored
  // and the reception is only reported. Always returns 0.
  int update_chart_configs(const char *msg, size_t msg_len)
  {
      (void)msg;
      (void)msg_len;

      error_report("Received obsolete UpdateChartConfigs msg");
      return 0;
  }
  291. int start_alarm_streaming(const char *msg, size_t msg_len)
  292. {
  293. struct start_alarm_streaming res = parse_start_alarm_streaming(msg, msg_len);
  294. if (!res.node_id) {
  295. netdata_log_error("Error parsing StartAlarmStreaming");
  296. return 1;
  297. }
  298. aclk_start_alert_streaming(res.node_id, res.resets);
  299. freez(res.node_id);
  300. return 0;
  301. }
  302. int send_alarm_checkpoint(const char *msg, size_t msg_len)
  303. {
  304. struct send_alarm_checkpoint sac = parse_send_alarm_checkpoint(msg, msg_len);
  305. if (!sac.node_id || !sac.claim_id) {
  306. netdata_log_error("Error parsing SendAlarmCheckpoint");
  307. freez(sac.node_id);
  308. freez(sac.claim_id);
  309. return 1;
  310. }
  311. aclk_send_alarm_checkpoint(sac.node_id, sac.claim_id);
  312. freez(sac.node_id);
  313. freez(sac.claim_id);
  314. return 0;
  315. }
  // Handler of the SendAlarmConfiguration message: passes the configuration
  // hash from the message to aclk_send_alarm_configuration().
  // Returns 0 on success, 1 when the hash is missing or empty.
  int send_alarm_configuration(const char *msg, size_t msg_len)
  {
      int rc = 1;
      char *hash = parse_send_alarm_configuration(msg, msg_len);

      if (hash && *hash) {
          aclk_send_alarm_configuration(hash);
          rc = 0;
      }
      else {
          netdata_log_error("Error parsing SendAlarmConfiguration");
      }

      freez(hash);
      return rc;
  }
  328. int send_alarm_snapshot(const char *msg, size_t msg_len)
  329. {
  330. struct send_alarm_snapshot *sas = parse_send_alarm_snapshot(msg, msg_len);
  331. if (!sas->node_id || !sas->claim_id || !sas->snapshot_uuid) {
  332. netdata_log_error("Error parsing SendAlarmSnapshot");
  333. destroy_send_alarm_snapshot(sas);
  334. return 1;
  335. }
  336. aclk_process_send_alarm_snapshot(sas->node_id, sas->claim_id, sas->snapshot_uuid);
  337. destroy_send_alarm_snapshot(sas);
  338. return 0;
  339. }
  340. int handle_disconnect_req(const char *msg, size_t msg_len)
  341. {
  342. struct disconnect_cmd *cmd = parse_disconnect_cmd(msg, msg_len);
  343. if (!cmd)
  344. return 1;
  345. if (cmd->permaban) {
  346. netdata_log_error("Cloud Banned This Agent!");
  347. aclk_disable_runtime = 1;
  348. }
  349. netdata_log_info("Cloud requested disconnect (EC=%u, \"%s\")", (unsigned int)cmd->error_code, cmd->error_description);
  350. if (cmd->reconnect_after_s > 0) {
  351. aclk_block_until = now_monotonic_sec() + cmd->reconnect_after_s;
  352. netdata_log_info(
  353. "Cloud asks not to reconnect for %u seconds. We shall honor that request",
  354. (unsigned int)cmd->reconnect_after_s);
  355. }
  356. disconnect_req = 1;
  357. freez(cmd->error_description);
  358. freez(cmd);
  359. return 0;
  360. }
  361. int contexts_checkpoint(const char *msg, size_t msg_len)
  362. {
  363. aclk_ctx_based = 1;
  364. struct ctxs_checkpoint *cmd = parse_ctxs_checkpoint(msg, msg_len);
  365. if (!cmd)
  366. return 1;
  367. rrdcontext_hub_checkpoint_command(cmd);
  368. freez(cmd->claim_id);
  369. freez(cmd->node_id);
  370. freez(cmd);
  371. return 0;
  372. }
  373. int stop_streaming_contexts(const char *msg, size_t msg_len)
  374. {
  375. if (!aclk_ctx_based) {
  376. error_report("Received StopStreamingContexts message but context based communication was not enabled (Cloud violated the protocol). Ignoring message");
  377. return 1;
  378. }
  379. struct stop_streaming_ctxs *cmd = parse_stop_streaming_ctxs(msg, msg_len);
  380. if (!cmd)
  381. return 1;
  382. rrdcontext_hub_stop_streaming_command(cmd);
  383. freez(cmd->claim_id);
  384. freez(cmd->node_id);
  385. freez(cmd);
  386. return 0;
  387. }
  388. int cancel_pending_req(const char *msg, size_t msg_len)
  389. {
  390. struct aclk_cancel_pending_req cmd = {.request_id = NULL, .trace_id = NULL};
  391. if(parse_cancel_pending_req(msg, msg_len, &cmd)) {
  392. error_report("Error parsing CancelPendingReq");
  393. return 1;
  394. }
  395. nd_log(NDLS_ACCESS, NDLP_NOTICE, "ACLK CancelPendingRequest REQ: %s, cloud trace-id: %s", cmd.request_id, cmd.trace_id);
  396. if (mark_pending_req_cancelled(cmd.request_id))
  397. error_report("CancelPending Request for %s failed. No such pending request.", cmd.request_id);
  398. free_cancel_pending_req(&cmd);
  399. return 0;
  400. }
  401. typedef struct {
  402. const char *name;
  403. simple_hash_t name_hash;
  404. rx_msg_handler fnc;
  405. } new_cloud_rx_msg_t;
  406. new_cloud_rx_msg_t rx_msgs[] = {
  407. { .name = "cmd", .name_hash = 0, .fnc = handle_old_proto_cmd },
  408. { .name = "CreateNodeInstanceResult", .name_hash = 0, .fnc = create_node_instance_result },
  409. { .name = "SendNodeInstances", .name_hash = 0, .fnc = send_node_instances },
  410. { .name = "StreamChartsAndDimensions", .name_hash = 0, .fnc = stream_charts_and_dimensions },
  411. { .name = "ChartsAndDimensionsAck", .name_hash = 0, .fnc = charts_and_dimensions_ack },
  412. { .name = "UpdateChartConfigs", .name_hash = 0, .fnc = update_chart_configs },
  413. { .name = "StartAlarmStreaming", .name_hash = 0, .fnc = start_alarm_streaming },
  414. { .name = "SendAlarmCheckpoint", .name_hash = 0, .fnc = send_alarm_checkpoint },
  415. { .name = "SendAlarmConfiguration", .name_hash = 0, .fnc = send_alarm_configuration },
  416. { .name = "SendAlarmSnapshot", .name_hash = 0, .fnc = send_alarm_snapshot },
  417. { .name = "DisconnectReq", .name_hash = 0, .fnc = handle_disconnect_req },
  418. { .name = "ContextsCheckpoint", .name_hash = 0, .fnc = contexts_checkpoint },
  419. { .name = "StopStreamingContexts", .name_hash = 0, .fnc = stop_streaming_contexts },
  420. { .name = "CancelPendingRequest", .name_hash = 0, .fnc = cancel_pending_req },
  421. { .name = NULL, .name_hash = 0, .fnc = NULL },
  422. };
  423. new_cloud_rx_msg_t *find_rx_handler_by_hash(simple_hash_t hash)
  424. {
  425. // we can afford to not compare strings after hash match
  426. // because we check for collisions at initialization in
  427. // aclk_init_rx_msg_handlers()
  428. for (int i = 0; rx_msgs[i].fnc; i++) {
  429. if (rx_msgs[i].name_hash == hash)
  430. return &rx_msgs[i];
  431. }
  432. return NULL;
  433. }
  434. const char *rx_handler_get_name(size_t i)
  435. {
  436. return rx_msgs[i].name;
  437. }
  438. unsigned int aclk_init_rx_msg_handlers(void)
  439. {
  440. int i;
  441. for (i = 0; rx_msgs[i].fnc; i++) {
  442. simple_hash_t hash = simple_hash(rx_msgs[i].name);
  443. new_cloud_rx_msg_t *hdl = find_rx_handler_by_hash(hash);
  444. if (unlikely(hdl)) {
  445. // the list of message names changes only by changing
  446. // the source code, therefore fatal is appropriate
  447. fatal("Hash collision. Choose better hash. Added '%s' clashes with existing '%s'", rx_msgs[i].name, hdl->name);
  448. }
  449. rx_msgs[i].name_hash = hash;
  450. }
  451. return i;
  452. }
  453. void aclk_handle_new_cloud_msg(const char *message_type, const char *msg, size_t msg_len, const char *topic __maybe_unused)
  454. {
  455. if (aclk_stats_enabled) {
  456. ACLK_STATS_LOCK;
  457. aclk_metrics_per_sample.cloud_req_recvd++;
  458. ACLK_STATS_UNLOCK;
  459. }
  460. new_cloud_rx_msg_t *msg_descriptor = find_rx_handler_by_hash(simple_hash(message_type));
  461. netdata_log_debug(D_ACLK, "Got message named '%s' from cloud", message_type);
  462. if (unlikely(!msg_descriptor)) {
  463. netdata_log_error("Do not know how to handle message of type '%s'. Ignoring", message_type);
  464. if (aclk_stats_enabled) {
  465. ACLK_STATS_LOCK;
  466. aclk_metrics_per_sample.cloud_req_err++;
  467. ACLK_STATS_UNLOCK;
  468. }
  469. return;
  470. }
  471. if (aclklog_enabled) {
  472. if (!strncmp(message_type, "cmd", strlen("cmd"))) {
  473. log_aclk_message_bin(msg, msg_len, 0, topic, msg_descriptor->name);
  474. } else {
  475. char *json = protomsg_to_json(msg, msg_len, msg_descriptor->name);
  476. log_aclk_message_bin(json, strlen(json), 0, topic, msg_descriptor->name);
  477. freez(json);
  478. }
  479. }
  480. if (aclk_stats_enabled) {
  481. ACLK_STATS_LOCK;
  482. aclk_proto_rx_msgs_sample[msg_descriptor-rx_msgs]++;
  483. ACLK_STATS_UNLOCK;
  484. }
  485. if (msg_descriptor->fnc(msg, msg_len)) {
  486. netdata_log_error("Error processing message of type '%s'", message_type);
  487. if (aclk_stats_enabled) {
  488. ACLK_STATS_LOCK;
  489. aclk_metrics_per_sample.cloud_req_err++;
  490. ACLK_STATS_UNLOCK;
  491. }
  492. return;
  493. }
  494. }