aclk_rx_msgs.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "aclk_rx_msgs.h"
  3. #include "aclk_stats.h"
  4. #include "aclk_query_queue.h"
  5. #include "aclk.h"
  6. #define ACLK_V2_PAYLOAD_SEPARATOR "\x0D\x0A\x0D\x0A"
  7. #define ACLK_CLOUD_REQ_V2_PREFIX "GET /"
  8. #define ACLK_V_COMPRESSION 2
  9. struct aclk_request {
  10. char *type_id;
  11. char *msg_id;
  12. char *callback_topic;
  13. char *payload;
  14. int version;
  15. int timeout;
  16. int min_version;
  17. int max_version;
  18. };
  19. static int cloud_to_agent_parse(JSON_ENTRY *e)
  20. {
  21. struct aclk_request *data = e->callback_data;
  22. switch (e->type) {
  23. case JSON_OBJECT:
  24. case JSON_ARRAY:
  25. break;
  26. case JSON_STRING:
  27. if (!strcmp(e->name, "msg-id")) {
  28. data->msg_id = strdupz(e->data.string);
  29. break;
  30. }
  31. if (!strcmp(e->name, "type")) {
  32. data->type_id = strdupz(e->data.string);
  33. break;
  34. }
  35. if (!strcmp(e->name, "callback-topic")) {
  36. data->callback_topic = strdupz(e->data.string);
  37. break;
  38. }
  39. if (!strcmp(e->name, "payload")) {
  40. if (likely(e->data.string)) {
  41. size_t len = strlen(e->data.string);
  42. data->payload = mallocz(len+1);
  43. if (!url_decode_r(data->payload, e->data.string, len + 1))
  44. strcpy(data->payload, e->data.string);
  45. }
  46. break;
  47. }
  48. break;
  49. case JSON_NUMBER:
  50. if (!strcmp(e->name, "version")) {
  51. data->version = e->data.number;
  52. break;
  53. }
  54. if (!strcmp(e->name, "timeout")) {
  55. data->timeout = e->data.number;
  56. break;
  57. }
  58. if (!strcmp(e->name, "min-version")) {
  59. data->min_version = e->data.number;
  60. break;
  61. }
  62. if (!strcmp(e->name, "max-version")) {
  63. data->max_version = e->data.number;
  64. break;
  65. }
  66. break;
  67. case JSON_BOOLEAN:
  68. break;
  69. case JSON_NULL:
  70. break;
  71. }
  72. return 0;
  73. }
  74. static inline int aclk_extract_v2_data(char *payload, char **data)
  75. {
  76. char* ptr = strstr(payload, ACLK_V2_PAYLOAD_SEPARATOR);
  77. if(!ptr)
  78. return 1;
  79. ptr += strlen(ACLK_V2_PAYLOAD_SEPARATOR);
  80. *data = strdupz(ptr);
  81. return 0;
  82. }
  83. static inline int aclk_v2_payload_get_query(const char *payload, char **query_url)
  84. {
  85. const char *start, *end;
  86. // TODO better check of URL
  87. if(strncmp(payload, ACLK_CLOUD_REQ_V2_PREFIX, strlen(ACLK_CLOUD_REQ_V2_PREFIX))) {
  88. errno = 0;
  89. error("Only accepting requests that start with \"%s\" from CLOUD.", ACLK_CLOUD_REQ_V2_PREFIX);
  90. return 1;
  91. }
  92. start = payload + 4;
  93. if(!(end = strstr(payload, " HTTP/1.1\x0D\x0A"))) {
  94. errno = 0;
  95. error("Doesn't look like HTTP GET request.");
  96. return 1;
  97. }
  98. *query_url = mallocz((end - start) + 1);
  99. strncpyz(*query_url, start, end - start);
  100. return 0;
  101. }
  102. #define HTTP_CHECK_AGENT_INITIALIZED() ACLK_SHARED_STATE_LOCK;\
  103. if (unlikely(aclk_shared_state.agent_state == ACLK_HOST_INITIALIZING)) {\
  104. debug(D_ACLK, "Ignoring \"http\" cloud request; agent not in stable state");\
  105. ACLK_SHARED_STATE_UNLOCK;\
  106. return 1;\
  107. }\
  108. ACLK_SHARED_STATE_UNLOCK;
  109. static int aclk_handle_cloud_http_request_v2(struct aclk_request *cloud_to_agent, char *raw_payload)
  110. {
  111. if (!aclk_use_new_cloud_arch) {
  112. HTTP_CHECK_AGENT_INITIALIZED();
  113. }
  114. aclk_query_t query;
  115. errno = 0;
  116. if (cloud_to_agent->version < ACLK_V_COMPRESSION) {
  117. error(
  118. "This handler cannot reply to request with version older than %d, received %d.",
  119. ACLK_V_COMPRESSION,
  120. cloud_to_agent->version);
  121. return 1;
  122. }
  123. query = aclk_query_new(HTTP_API_V2);
  124. if (unlikely(aclk_extract_v2_data(raw_payload, &query->data.http_api_v2.payload))) {
  125. error("Error extracting payload expected after the JSON dictionary.");
  126. goto error;
  127. }
  128. if (unlikely(aclk_v2_payload_get_query(query->data.http_api_v2.payload, &query->dedup_id))) {
  129. error("Could not extract payload from query");
  130. goto error;
  131. }
  132. if (unlikely(!cloud_to_agent->callback_topic)) {
  133. error("Missing callback_topic");
  134. goto error;
  135. }
  136. if (unlikely(!cloud_to_agent->msg_id)) {
  137. error("Missing msg_id");
  138. goto error;
  139. }
  140. // aclk_queue_query takes ownership of data pointer
  141. query->callback_topic = cloud_to_agent->callback_topic;
  142. query->timeout = cloud_to_agent->timeout;
  143. // for clarity and code readability as when we process the request
  144. // it would be strange to get URL from `dedup_id`
  145. query->data.http_api_v2.query = query->dedup_id;
  146. query->msg_id = cloud_to_agent->msg_id;
  147. aclk_queue_query(query);
  148. return 0;
  149. error:
  150. aclk_query_free(query);
  151. return 1;
  152. }
  153. int aclk_handle_cloud_cmd_message(char *payload)
  154. {
  155. struct aclk_request cloud_to_agent;
  156. memset(&cloud_to_agent, 0, sizeof(struct aclk_request));
  157. if (unlikely(!payload)) {
  158. error_report("ACLK incoming 'cmd' message is empty");
  159. return 1;
  160. }
  161. debug(D_ACLK, "ACLK incoming 'cmd' message (%s)", payload);
  162. int rc = json_parse(payload, &cloud_to_agent, cloud_to_agent_parse);
  163. if (unlikely(rc != JSON_OK)) {
  164. error_report("Malformed json request (%s)", payload);
  165. goto err_cleanup;
  166. }
  167. if (!cloud_to_agent.type_id) {
  168. error_report("Cloud message is missing compulsory key \"type\"");
  169. goto err_cleanup;
  170. }
  171. // Originally we were expecting to have multiple types of 'cmd' message,
  172. // but after the new protocol was designed we will ever only have 'http'
  173. if (strcmp(cloud_to_agent.type_id, "http")) {
  174. error_report("Only 'http' cmd message is supported");
  175. goto err_cleanup;
  176. }
  177. if (likely(!aclk_handle_cloud_http_request_v2(&cloud_to_agent, payload))) {
  178. // aclk_handle_cloud_request takes ownership of the pointers
  179. // (to avoid copying) in case of success
  180. freez(cloud_to_agent.type_id);
  181. return 0;
  182. }
  183. err_cleanup:
  184. if (cloud_to_agent.payload)
  185. freez(cloud_to_agent.payload);
  186. if (cloud_to_agent.type_id)
  187. freez(cloud_to_agent.type_id);
  188. if (cloud_to_agent.msg_id)
  189. freez(cloud_to_agent.msg_id);
  190. if (cloud_to_agent.callback_topic)
  191. freez(cloud_to_agent.callback_topic);
  192. return 1;
  193. }
  194. #ifdef ENABLE_NEW_CLOUD_PROTOCOL
  195. typedef uint32_t simple_hash_t;
  196. typedef int(*rx_msg_handler)(const char *msg, size_t msg_len);
  197. int handle_old_proto_cmd(const char *msg, size_t msg_len)
  198. {
  199. // msg is binary payload in all other cases
  200. // however in this message from old legacy cloud
  201. // we have to convert it to C string
  202. char *str = mallocz(msg_len+1);
  203. memcpy(str, msg, msg_len);
  204. str[msg_len] = 0;
  205. if (aclk_handle_cloud_cmd_message(str)) {
  206. freez(str);
  207. return 1;
  208. }
  209. freez(str);
  210. return 0;
  211. }
  212. int create_node_instance_result(const char *msg, size_t msg_len)
  213. {
  214. node_instance_creation_result_t res = parse_create_node_instance_result(msg, msg_len);
  215. if (!res.machine_guid || !res.node_id) {
  216. error_report("Error parsing CreateNodeInstanceResult");
  217. freez(res.machine_guid);
  218. freez(res.node_id);
  219. return 1;
  220. }
  221. debug(D_ACLK, "CreateNodeInstanceResult: guid:%s nodeid:%s", res.machine_guid, res.node_id);
  222. uuid_t host_id, node_id;
  223. if (uuid_parse(res.machine_guid, host_id)) {
  224. error("Error parsing machine_guid provided by CreateNodeInstanceResult");
  225. freez(res.machine_guid);
  226. freez(res.node_id);
  227. return 1;
  228. }
  229. if (uuid_parse(res.node_id, node_id)) {
  230. error("Error parsing node_id provided by CreateNodeInstanceResult");
  231. freez(res.machine_guid);
  232. freez(res.node_id);
  233. return 1;
  234. }
  235. update_node_id(&host_id, &node_id);
  236. aclk_query_t query = aclk_query_new(NODE_STATE_UPDATE);
  237. node_instance_connection_t node_state_update = {
  238. .hops = 1,
  239. .live = 0,
  240. .queryable = 1,
  241. .session_id = aclk_session_newarch,
  242. .node_id = res.node_id
  243. };
  244. RRDHOST *host = rrdhost_find_by_guid(res.machine_guid, 0);
  245. if (host) {
  246. // not all host must have RRDHOST struct created for them
  247. // if they never connected during runtime of agent
  248. if (host == localhost) {
  249. node_state_update.live = 1;
  250. node_state_update.hops = 0;
  251. } else {
  252. netdata_mutex_lock(&host->receiver_lock);
  253. node_state_update.live = (host->receiver != NULL);
  254. netdata_mutex_unlock(&host->receiver_lock);
  255. node_state_update.hops = host->system_info->hops;
  256. }
  257. }
  258. rrdhost_aclk_state_lock(localhost);
  259. node_state_update.claim_id = localhost->aclk_state.claimed_id;
  260. query->data.bin_payload.payload = generate_node_instance_connection(&query->data.bin_payload.size, &node_state_update);
  261. rrdhost_aclk_state_unlock(localhost);
  262. query->data.bin_payload.msg_name = "UpdateNodeInstanceConnection";
  263. query->data.bin_payload.topic = ACLK_TOPICID_NODE_CONN;
  264. aclk_queue_query(query);
  265. freez(res.node_id);
  266. freez(res.machine_guid);
  267. return 0;
  268. }
  269. int send_node_instances(const char *msg, size_t msg_len)
  270. {
  271. UNUSED(msg);
  272. UNUSED(msg_len);
  273. aclk_send_node_instances();
  274. return 0;
  275. }
  276. int stream_charts_and_dimensions(const char *msg, size_t msg_len)
  277. {
  278. stream_charts_and_dims_t res = parse_stream_charts_and_dims(msg, msg_len);
  279. if (!res.claim_id || !res.node_id) {
  280. error("Error parsing StreamChartsAndDimensions msg");
  281. freez(res.claim_id);
  282. freez(res.node_id);
  283. return 1;
  284. }
  285. chart_batch_id = res.batch_id;
  286. aclk_start_streaming(res.node_id, res.seq_id, res.seq_id_created_at.tv_sec, res.batch_id);
  287. freez(res.claim_id);
  288. freez(res.node_id);
  289. return 0;
  290. }
  291. int charts_and_dimensions_ack(const char *msg, size_t msg_len)
  292. {
  293. chart_and_dim_ack_t res = parse_chart_and_dimensions_ack(msg, msg_len);
  294. if (!res.claim_id || !res.node_id) {
  295. error("Error parsing StreamChartsAndDimensions msg");
  296. freez(res.claim_id);
  297. freez(res.node_id);
  298. return 1;
  299. }
  300. aclk_ack_chart_sequence_id(res.node_id, res.last_seq_id);
  301. freez(res.claim_id);
  302. freez(res.node_id);
  303. return 0;
  304. }
  305. int update_chart_configs(const char *msg, size_t msg_len)
  306. {
  307. struct update_chart_config res = parse_update_chart_config(msg, msg_len);
  308. if (!res.claim_id || !res.node_id || !res.hashes)
  309. error("Error parsing UpdateChartConfigs msg");
  310. else
  311. aclk_get_chart_config(res.hashes);
  312. destroy_update_chart_config(&res);
  313. return 0;
  314. }
  315. int start_alarm_streaming(const char *msg, size_t msg_len)
  316. {
  317. struct start_alarm_streaming res = parse_start_alarm_streaming(msg, msg_len);
  318. if (!res.node_id || !res.batch_id) {
  319. error("Error parsing StartAlarmStreaming");
  320. freez(res.node_id);
  321. return 1;
  322. }
  323. aclk_start_alert_streaming(res.node_id, res.batch_id, res.start_seq_id);
  324. freez(res.node_id);
  325. return 0;
  326. }
  327. int send_alarm_log_health(const char *msg, size_t msg_len)
  328. {
  329. char *node_id = parse_send_alarm_log_health(msg, msg_len);
  330. if (!node_id) {
  331. error("Error parsing SendAlarmLogHealth");
  332. return 1;
  333. }
  334. aclk_send_alarm_health_log(node_id);
  335. freez(node_id);
  336. return 0;
  337. }
  338. int send_alarm_configuration(const char *msg, size_t msg_len)
  339. {
  340. char *config_hash = parse_send_alarm_configuration(msg, msg_len);
  341. if (!config_hash || !*config_hash) {
  342. error("Error parsing SendAlarmConfiguration");
  343. freez(config_hash);
  344. return 1;
  345. }
  346. aclk_send_alarm_configuration(config_hash);
  347. freez(config_hash);
  348. return 0;
  349. }
  350. int send_alarm_snapshot(const char *msg, size_t msg_len)
  351. {
  352. struct send_alarm_snapshot *sas = parse_send_alarm_snapshot(msg, msg_len);
  353. if (!sas->node_id || !sas->claim_id) {
  354. error("Error parsing SendAlarmSnapshot");
  355. destroy_send_alarm_snapshot(sas);
  356. return 1;
  357. }
  358. aclk_process_send_alarm_snapshot(sas->node_id, sas->claim_id, sas->snapshot_id, sas->sequence_id);
  359. destroy_send_alarm_snapshot(sas);
  360. return 0;
  361. }
  362. int handle_disconnect_req(const char *msg, size_t msg_len)
  363. {
  364. struct disconnect_cmd *cmd = parse_disconnect_cmd(msg, msg_len);
  365. if (!cmd)
  366. return 1;
  367. if (cmd->permaban) {
  368. error("Cloud Banned This Agent!");
  369. aclk_disable_runtime = 1;
  370. }
  371. info("Cloud requested disconnect (EC=%u, \"%s\")", (unsigned int)cmd->error_code, cmd->error_description);
  372. if (cmd->reconnect_after_s > 0) {
  373. aclk_block_until = now_monotonic_sec() + cmd->reconnect_after_s;
  374. info(
  375. "Cloud asks not to reconnect for %u seconds. We shall honor that request",
  376. (unsigned int)cmd->reconnect_after_s);
  377. }
  378. disconnect_req = 1;
  379. freez(cmd->error_description);
  380. freez(cmd);
  381. return 0;
  382. }
  383. typedef struct {
  384. const char *name;
  385. simple_hash_t name_hash;
  386. rx_msg_handler fnc;
  387. } new_cloud_rx_msg_t;
  388. new_cloud_rx_msg_t rx_msgs[] = {
  389. { .name = "cmd", .name_hash = 0, .fnc = handle_old_proto_cmd },
  390. { .name = "CreateNodeInstanceResult", .name_hash = 0, .fnc = create_node_instance_result },
  391. { .name = "SendNodeInstances", .name_hash = 0, .fnc = send_node_instances },
  392. { .name = "StreamChartsAndDimensions", .name_hash = 0, .fnc = stream_charts_and_dimensions },
  393. { .name = "ChartsAndDimensionsAck", .name_hash = 0, .fnc = charts_and_dimensions_ack },
  394. { .name = "UpdateChartConfigs", .name_hash = 0, .fnc = update_chart_configs },
  395. { .name = "StartAlarmStreaming", .name_hash = 0, .fnc = start_alarm_streaming },
  396. { .name = "SendAlarmLogHealth", .name_hash = 0, .fnc = send_alarm_log_health },
  397. { .name = "SendAlarmConfiguration", .name_hash = 0, .fnc = send_alarm_configuration },
  398. { .name = "SendAlarmSnapshot", .name_hash = 0, .fnc = send_alarm_snapshot },
  399. { .name = "DisconnectReq", .name_hash = 0, .fnc = handle_disconnect_req },
  400. { .name = NULL, .name_hash = 0, .fnc = NULL },
  401. };
  402. new_cloud_rx_msg_t *find_rx_handler_by_hash(simple_hash_t hash)
  403. {
  404. // we can afford to not compare strings after hash match
  405. // because we check for collisions at initialization in
  406. // aclk_init_rx_msg_handlers()
  407. for (int i = 0; rx_msgs[i].fnc; i++) {
  408. if (rx_msgs[i].name_hash == hash)
  409. return &rx_msgs[i];
  410. }
  411. return NULL;
  412. }
  413. const char *rx_handler_get_name(size_t i)
  414. {
  415. return rx_msgs[i].name;
  416. }
  417. unsigned int aclk_init_rx_msg_handlers(void)
  418. {
  419. int i;
  420. for (i = 0; rx_msgs[i].fnc; i++) {
  421. simple_hash_t hash = simple_hash(rx_msgs[i].name);
  422. new_cloud_rx_msg_t *hdl = find_rx_handler_by_hash(hash);
  423. if (unlikely(hdl)) {
  424. // the list of message names changes only by changing
  425. // the source code, therefore fatal is appropriate
  426. fatal("Hash collision. Choose better hash. Added '%s' clashes with existing '%s'", rx_msgs[i].name, hdl->name);
  427. }
  428. rx_msgs[i].name_hash = hash;
  429. }
  430. return i;
  431. }
  432. void aclk_handle_new_cloud_msg(const char *message_type, const char *msg, size_t msg_len)
  433. {
  434. if (aclk_stats_enabled) {
  435. ACLK_STATS_LOCK;
  436. aclk_metrics_per_sample.cloud_req_recvd++;
  437. ACLK_STATS_UNLOCK;
  438. }
  439. new_cloud_rx_msg_t *msg_descriptor = find_rx_handler_by_hash(simple_hash(message_type));
  440. debug(D_ACLK, "Got message named '%s' from cloud", message_type);
  441. if (unlikely(!msg_descriptor)) {
  442. error("Do not know how to handle message of type '%s'. Ignoring", message_type);
  443. if (aclk_stats_enabled) {
  444. ACLK_STATS_LOCK;
  445. aclk_metrics_per_sample.cloud_req_err++;
  446. ACLK_STATS_UNLOCK;
  447. }
  448. return;
  449. }
  450. if (aclk_stats_enabled) {
  451. ACLK_STATS_LOCK;
  452. aclk_proto_rx_msgs_sample[msg_descriptor-rx_msgs]++;
  453. ACLK_STATS_UNLOCK;
  454. }
  455. if (msg_descriptor->fnc(msg, msg_len)) {
  456. error("Error processing message of type '%s'", message_type);
  457. if (aclk_stats_enabled) {
  458. ACLK_STATS_LOCK;
  459. aclk_metrics_per_sample.cloud_req_err++;
  460. ACLK_STATS_UNLOCK;
  461. }
  462. return;
  463. }
  464. }
  465. #endif