aclk_rx_msgs.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "aclk_rx_msgs.h"
  3. #include "aclk_stats.h"
  4. #include "aclk_query_queue.h"
  5. #include "aclk.h"
  6. #define ACLK_V2_PAYLOAD_SEPARATOR "\x0D\x0A\x0D\x0A"
  7. #define ACLK_CLOUD_REQ_V2_PREFIX "GET /"
  8. #define ACLK_V_COMPRESSION 2
  9. struct aclk_request {
  10. char *type_id;
  11. char *msg_id;
  12. char *callback_topic;
  13. char *payload;
  14. int version;
  15. int min_version;
  16. int max_version;
  17. };
  18. static int cloud_to_agent_parse(JSON_ENTRY *e)
  19. {
  20. struct aclk_request *data = e->callback_data;
  21. switch (e->type) {
  22. case JSON_OBJECT:
  23. case JSON_ARRAY:
  24. break;
  25. case JSON_STRING:
  26. if (!strcmp(e->name, "msg-id")) {
  27. data->msg_id = strdupz(e->data.string);
  28. break;
  29. }
  30. if (!strcmp(e->name, "type")) {
  31. data->type_id = strdupz(e->data.string);
  32. break;
  33. }
  34. if (!strcmp(e->name, "callback-topic")) {
  35. data->callback_topic = strdupz(e->data.string);
  36. break;
  37. }
  38. if (!strcmp(e->name, "payload")) {
  39. if (likely(e->data.string)) {
  40. size_t len = strlen(e->data.string);
  41. data->payload = mallocz(len+1);
  42. if (!url_decode_r(data->payload, e->data.string, len + 1))
  43. strcpy(data->payload, e->data.string);
  44. }
  45. break;
  46. }
  47. break;
  48. case JSON_NUMBER:
  49. if (!strcmp(e->name, "version")) {
  50. data->version = e->data.number;
  51. break;
  52. }
  53. if (!strcmp(e->name, "min-version")) {
  54. data->min_version = e->data.number;
  55. break;
  56. }
  57. if (!strcmp(e->name, "max-version")) {
  58. data->max_version = e->data.number;
  59. break;
  60. }
  61. break;
  62. case JSON_BOOLEAN:
  63. break;
  64. case JSON_NULL:
  65. break;
  66. }
  67. return 0;
  68. }
  69. static inline int aclk_extract_v2_data(char *payload, char **data)
  70. {
  71. char* ptr = strstr(payload, ACLK_V2_PAYLOAD_SEPARATOR);
  72. if(!ptr)
  73. return 1;
  74. ptr += strlen(ACLK_V2_PAYLOAD_SEPARATOR);
  75. *data = strdupz(ptr);
  76. return 0;
  77. }
  78. static inline int aclk_v2_payload_get_query(const char *payload, char **query_url)
  79. {
  80. const char *start, *end;
  81. // TODO better check of URL
  82. if(strncmp(payload, ACLK_CLOUD_REQ_V2_PREFIX, strlen(ACLK_CLOUD_REQ_V2_PREFIX))) {
  83. errno = 0;
  84. error("Only accepting requests that start with \"%s\" from CLOUD.", ACLK_CLOUD_REQ_V2_PREFIX);
  85. return 1;
  86. }
  87. start = payload + 4;
  88. if(!(end = strstr(payload, " HTTP/1.1\x0D\x0A"))) {
  89. errno = 0;
  90. error("Doesn't look like HTTP GET request.");
  91. return 1;
  92. }
  93. *query_url = mallocz((end - start) + 1);
  94. strncpyz(*query_url, start, end - start);
  95. return 0;
  96. }
  97. #define HTTP_CHECK_AGENT_INITIALIZED() ACLK_SHARED_STATE_LOCK;\
  98. if (unlikely(aclk_shared_state.agent_state == ACLK_HOST_INITIALIZING)) {\
  99. debug(D_ACLK, "Ignoring \"http\" cloud request; agent not in stable state");\
  100. ACLK_SHARED_STATE_UNLOCK;\
  101. return 1;\
  102. }\
  103. ACLK_SHARED_STATE_UNLOCK;
  104. static int aclk_handle_cloud_http_request_v2(struct aclk_request *cloud_to_agent, char *raw_payload)
  105. {
  106. if (!aclk_use_new_cloud_arch) {
  107. HTTP_CHECK_AGENT_INITIALIZED();
  108. }
  109. aclk_query_t query;
  110. errno = 0;
  111. if (cloud_to_agent->version < ACLK_V_COMPRESSION) {
  112. error(
  113. "This handler cannot reply to request with version older than %d, received %d.",
  114. ACLK_V_COMPRESSION,
  115. cloud_to_agent->version);
  116. return 1;
  117. }
  118. query = aclk_query_new(HTTP_API_V2);
  119. if (unlikely(aclk_extract_v2_data(raw_payload, &query->data.http_api_v2.payload))) {
  120. error("Error extracting payload expected after the JSON dictionary.");
  121. goto error;
  122. }
  123. if (unlikely(aclk_v2_payload_get_query(query->data.http_api_v2.payload, &query->dedup_id))) {
  124. error("Could not extract payload from query");
  125. goto error;
  126. }
  127. if (unlikely(!cloud_to_agent->callback_topic)) {
  128. error("Missing callback_topic");
  129. goto error;
  130. }
  131. if (unlikely(!cloud_to_agent->msg_id)) {
  132. error("Missing msg_id");
  133. goto error;
  134. }
  135. // aclk_queue_query takes ownership of data pointer
  136. query->callback_topic = cloud_to_agent->callback_topic;
  137. // for clarity and code readability as when we process the request
  138. // it would be strange to get URL from `dedup_id`
  139. query->data.http_api_v2.query = query->dedup_id;
  140. query->msg_id = cloud_to_agent->msg_id;
  141. aclk_queue_query(query);
  142. return 0;
  143. error:
  144. aclk_query_free(query);
  145. return 1;
  146. }
  147. int aclk_handle_cloud_cmd_message(char *payload)
  148. {
  149. struct aclk_request cloud_to_agent;
  150. memset(&cloud_to_agent, 0, sizeof(struct aclk_request));
  151. if (unlikely(!payload)) {
  152. error_report("ACLK incoming 'cmd' message is empty");
  153. return 1;
  154. }
  155. debug(D_ACLK, "ACLK incoming 'cmd' message (%s)", payload);
  156. int rc = json_parse(payload, &cloud_to_agent, cloud_to_agent_parse);
  157. if (unlikely(rc != JSON_OK)) {
  158. error_report("Malformed json request (%s)", payload);
  159. goto err_cleanup;
  160. }
  161. if (!cloud_to_agent.type_id) {
  162. error_report("Cloud message is missing compulsory key \"type\"");
  163. goto err_cleanup;
  164. }
  165. // Originally we were expecting to have multiple types of 'cmd' message,
  166. // but after the new protocol was designed we will ever only have 'http'
  167. if (strcmp(cloud_to_agent.type_id, "http")) {
  168. error_report("Only 'http' cmd message is supported");
  169. goto err_cleanup;
  170. }
  171. if (likely(!aclk_handle_cloud_http_request_v2(&cloud_to_agent, payload))) {
  172. // aclk_handle_cloud_request takes ownership of the pointers
  173. // (to avoid copying) in case of success
  174. freez(cloud_to_agent.type_id);
  175. return 0;
  176. }
  177. err_cleanup:
  178. if (cloud_to_agent.payload)
  179. freez(cloud_to_agent.payload);
  180. if (cloud_to_agent.type_id)
  181. freez(cloud_to_agent.type_id);
  182. if (cloud_to_agent.msg_id)
  183. freez(cloud_to_agent.msg_id);
  184. if (cloud_to_agent.callback_topic)
  185. freez(cloud_to_agent.callback_topic);
  186. return 1;
  187. }
  188. #ifdef ENABLE_NEW_CLOUD_PROTOCOL
  189. typedef uint32_t simple_hash_t;
  190. typedef int(*rx_msg_handler)(const char *msg, size_t msg_len);
  191. int handle_old_proto_cmd(const char *msg, size_t msg_len)
  192. {
  193. // msg is binary payload in all other cases
  194. // however in this message from old legacy cloud
  195. // we have to convert it to C string
  196. char *str = mallocz(msg_len+1);
  197. memcpy(str, msg, msg_len);
  198. str[msg_len] = 0;
  199. if (aclk_handle_cloud_cmd_message(str)) {
  200. freez(str);
  201. return 1;
  202. }
  203. freez(str);
  204. return 0;
  205. }
  206. int create_node_instance_result(const char *msg, size_t msg_len)
  207. {
  208. node_instance_creation_result_t res = parse_create_node_instance_result(msg, msg_len);
  209. if (!res.machine_guid || !res.node_id) {
  210. error_report("Error parsing CreateNodeInstanceResult");
  211. freez(res.machine_guid);
  212. freez(res.node_id);
  213. return 1;
  214. }
  215. debug(D_ACLK, "CreateNodeInstanceResult: guid:%s nodeid:%s", res.machine_guid, res.node_id);
  216. uuid_t host_id, node_id;
  217. if (uuid_parse(res.machine_guid, host_id)) {
  218. error("Error parsing machine_guid provided by CreateNodeInstanceResult");
  219. freez(res.machine_guid);
  220. freez(res.node_id);
  221. return 1;
  222. }
  223. if (uuid_parse(res.node_id, node_id)) {
  224. error("Error parsing node_id provided by CreateNodeInstanceResult");
  225. freez(res.machine_guid);
  226. freez(res.node_id);
  227. return 1;
  228. }
  229. update_node_id(&host_id, &node_id);
  230. aclk_query_t query = aclk_query_new(NODE_STATE_UPDATE);
  231. query->data.node_update.hops = 1; //TODO - real hop count instead of hardcoded
  232. rrdhost_aclk_state_lock(localhost);
  233. query->data.node_update.claim_id = strdupz(localhost->aclk_state.claimed_id);
  234. rrdhost_aclk_state_unlock(localhost);
  235. RRDHOST *host = rrdhost_find_by_guid(res.machine_guid, 0);
  236. query->data.node_update.live = 0;
  237. if (host) {
  238. // not all host must have RRDHOST struct created for them
  239. // if they never connected during runtime of agent
  240. if (host == localhost) {
  241. query->data.node_update.live = 1;
  242. query->data.node_update.hops = 0;
  243. } else {
  244. netdata_mutex_lock(&host->receiver_lock);
  245. query->data.node_update.live = (host->receiver != NULL);
  246. netdata_mutex_unlock(&host->receiver_lock);
  247. query->data.node_update.hops = host->system_info->hops;
  248. }
  249. }
  250. query->data.node_update.node_id = res.node_id; // aclk_query_free will free it
  251. query->data.node_update.queryable = 1;
  252. query->data.node_update.session_id = aclk_session_newarch;
  253. aclk_queue_query(query);
  254. freez(res.machine_guid);
  255. return 0;
  256. }
  257. int send_node_instances(const char *msg, size_t msg_len)
  258. {
  259. UNUSED(msg);
  260. UNUSED(msg_len);
  261. aclk_send_node_instances();
  262. return 0;
  263. }
  264. int stream_charts_and_dimensions(const char *msg, size_t msg_len)
  265. {
  266. stream_charts_and_dims_t res = parse_stream_charts_and_dims(msg, msg_len);
  267. if (!res.claim_id || !res.node_id) {
  268. error("Error parsing StreamChartsAndDimensions msg");
  269. freez(res.claim_id);
  270. freez(res.node_id);
  271. return 1;
  272. }
  273. chart_batch_id = res.batch_id;
  274. aclk_start_streaming(res.node_id, res.seq_id, res.seq_id_created_at.tv_sec, res.batch_id);
  275. freez(res.claim_id);
  276. freez(res.node_id);
  277. return 0;
  278. }
  279. int charts_and_dimensions_ack(const char *msg, size_t msg_len)
  280. {
  281. chart_and_dim_ack_t res = parse_chart_and_dimensions_ack(msg, msg_len);
  282. if (!res.claim_id || !res.node_id) {
  283. error("Error parsing StreamChartsAndDimensions msg");
  284. freez(res.claim_id);
  285. freez(res.node_id);
  286. return 1;
  287. }
  288. aclk_ack_chart_sequence_id(res.node_id, res.last_seq_id);
  289. freez(res.claim_id);
  290. freez(res.node_id);
  291. return 0;
  292. }
  293. int update_chart_configs(const char *msg, size_t msg_len)
  294. {
  295. struct update_chart_config res = parse_update_chart_config(msg, msg_len);
  296. if (!res.claim_id || !res.node_id || !res.hashes)
  297. error("Error parsing UpdateChartConfigs msg");
  298. else
  299. aclk_get_chart_config(res.hashes);
  300. destroy_update_chart_config(&res);
  301. return 0;
  302. }
  303. int start_alarm_streaming(const char *msg, size_t msg_len)
  304. {
  305. struct start_alarm_streaming res = parse_start_alarm_streaming(msg, msg_len);
  306. if (!res.node_id || !res.batch_id) {
  307. error("Error parsing StartAlarmStreaming");
  308. freez(res.node_id);
  309. return 1;
  310. }
  311. aclk_start_alert_streaming(res.node_id, res.batch_id, res.start_seq_id);
  312. freez(res.node_id);
  313. return 0;
  314. }
  315. int send_alarm_log_health(const char *msg, size_t msg_len)
  316. {
  317. char *node_id = parse_send_alarm_log_health(msg, msg_len);
  318. if (!node_id) {
  319. error("Error parsing SendAlarmLogHealth");
  320. return 1;
  321. }
  322. aclk_send_alarm_health_log(node_id);
  323. freez(node_id);
  324. return 0;
  325. }
  326. int send_alarm_configuration(const char *msg, size_t msg_len)
  327. {
  328. char *config_hash = parse_send_alarm_configuration(msg, msg_len);
  329. if (!config_hash || !*config_hash) {
  330. error("Error parsing SendAlarmConfiguration");
  331. freez(config_hash);
  332. return 1;
  333. }
  334. aclk_send_alarm_configuration(config_hash);
  335. freez(config_hash);
  336. return 0;
  337. }
  338. int send_alarm_snapshot(const char *msg, size_t msg_len)
  339. {
  340. struct send_alarm_snapshot *sas = parse_send_alarm_snapshot(msg, msg_len);
  341. if (!sas->node_id || !sas->claim_id) {
  342. error("Error parsing SendAlarmSnapshot");
  343. destroy_send_alarm_snapshot(sas);
  344. return 1;
  345. }
  346. aclk_process_send_alarm_snapshot(sas->node_id, sas->claim_id, sas->snapshot_id, sas->sequence_id);
  347. destroy_send_alarm_snapshot(sas);
  348. return 0;
  349. }
  350. int handle_disconnect_req(const char *msg, size_t msg_len)
  351. {
  352. struct disconnect_cmd *cmd = parse_disconnect_cmd(msg, msg_len);
  353. if (!cmd)
  354. return 1;
  355. if (cmd->permaban) {
  356. error("Cloud Banned This Agent!");
  357. aclk_disable_runtime = 1;
  358. }
  359. info("Cloud requested disconnect (EC=%u, \"%s\")", (unsigned int)cmd->error_code, cmd->error_description);
  360. if (cmd->reconnect_after_s > 0) {
  361. aclk_block_until = now_monotonic_sec() + cmd->reconnect_after_s;
  362. info(
  363. "Cloud asks not to reconnect for %u seconds. We shall honor that request",
  364. (unsigned int)cmd->reconnect_after_s);
  365. }
  366. disconnect_req = 1;
  367. freez(cmd->error_description);
  368. freez(cmd);
  369. return 0;
  370. }
  371. typedef struct {
  372. const char *name;
  373. simple_hash_t name_hash;
  374. rx_msg_handler fnc;
  375. } new_cloud_rx_msg_t;
  376. new_cloud_rx_msg_t rx_msgs[] = {
  377. { .name = "cmd", .name_hash = 0, .fnc = handle_old_proto_cmd },
  378. { .name = "CreateNodeInstanceResult", .name_hash = 0, .fnc = create_node_instance_result },
  379. { .name = "SendNodeInstances", .name_hash = 0, .fnc = send_node_instances },
  380. { .name = "StreamChartsAndDimensions", .name_hash = 0, .fnc = stream_charts_and_dimensions },
  381. { .name = "ChartsAndDimensionsAck", .name_hash = 0, .fnc = charts_and_dimensions_ack },
  382. { .name = "UpdateChartConfigs", .name_hash = 0, .fnc = update_chart_configs },
  383. { .name = "StartAlarmStreaming", .name_hash = 0, .fnc = start_alarm_streaming },
  384. { .name = "SendAlarmLogHealth", .name_hash = 0, .fnc = send_alarm_log_health },
  385. { .name = "SendAlarmConfiguration", .name_hash = 0, .fnc = send_alarm_configuration },
  386. { .name = "SendAlarmSnapshot", .name_hash = 0, .fnc = send_alarm_snapshot },
  387. { .name = "DisconnectReq", .name_hash = 0, .fnc = handle_disconnect_req },
  388. { .name = NULL, .name_hash = 0, .fnc = NULL },
  389. };
  390. new_cloud_rx_msg_t *find_rx_handler_by_hash(simple_hash_t hash)
  391. {
  392. // we can afford to not compare strings after hash match
  393. // because we check for collisions at initialization in
  394. // aclk_init_rx_msg_handlers()
  395. for (int i = 0; rx_msgs[i].fnc; i++) {
  396. if (rx_msgs[i].name_hash == hash)
  397. return &rx_msgs[i];
  398. }
  399. return NULL;
  400. }
  401. const char *rx_handler_get_name(size_t i)
  402. {
  403. return rx_msgs[i].name;
  404. }
  405. unsigned int aclk_init_rx_msg_handlers(void)
  406. {
  407. int i;
  408. for (i = 0; rx_msgs[i].fnc; i++) {
  409. simple_hash_t hash = simple_hash(rx_msgs[i].name);
  410. new_cloud_rx_msg_t *hdl = find_rx_handler_by_hash(hash);
  411. if (unlikely(hdl)) {
  412. // the list of message names changes only by changing
  413. // the source code, therefore fatal is appropriate
  414. fatal("Hash collision. Choose better hash. Added '%s' clashes with existing '%s'", rx_msgs[i].name, hdl->name);
  415. }
  416. rx_msgs[i].name_hash = hash;
  417. }
  418. return i;
  419. }
  420. void aclk_handle_new_cloud_msg(const char *message_type, const char *msg, size_t msg_len)
  421. {
  422. if (aclk_stats_enabled) {
  423. ACLK_STATS_LOCK;
  424. aclk_metrics_per_sample.cloud_req_recvd++;
  425. ACLK_STATS_UNLOCK;
  426. }
  427. new_cloud_rx_msg_t *msg_descriptor = find_rx_handler_by_hash(simple_hash(message_type));
  428. debug(D_ACLK, "Got message named '%s' from cloud", message_type);
  429. if (unlikely(!msg_descriptor)) {
  430. error("Do not know how to handle message of type '%s'. Ignoring", message_type);
  431. if (aclk_stats_enabled) {
  432. ACLK_STATS_LOCK;
  433. aclk_metrics_per_sample.cloud_req_err++;
  434. ACLK_STATS_UNLOCK;
  435. }
  436. return;
  437. }
  438. if (aclk_stats_enabled) {
  439. ACLK_STATS_LOCK;
  440. aclk_proto_rx_msgs_sample[msg_descriptor-rx_msgs]++;
  441. ACLK_STATS_UNLOCK;
  442. }
  443. if (msg_descriptor->fnc(msg, msg_len)) {
  444. error("Error processing message of type '%s'", message_type);
  445. if (aclk_stats_enabled) {
  446. ACLK_STATS_LOCK;
  447. aclk_metrics_per_sample.cloud_req_err++;
  448. ACLK_STATS_UNLOCK;
  449. }
  450. return;
  451. }
  452. }
  453. #endif