aclk_query.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "aclk_query.h"
  3. #include "aclk_stats.h"
  4. #include "aclk_tx_msgs.h"
  5. #include "../../web/server/web_client_cache.h"
  /* HTTP request header name for the accepted content encodings. */
  #define WEB_HDR_ACCEPT_ENC "Accept-Encoding:"

  /*
   * Wake-up channel of the query threads: they block on query_cond_wait
   * (with query_lock_wait held) between two passes over the query queue.
   */
  pthread_mutex_t query_lock_wait = PTHREAD_MUTEX_INITIALIZER;
  pthread_cond_t query_cond_wait = PTHREAD_COND_INITIALIZER;

  /* Shorthands to take / release the mutex paired with query_cond_wait. */
  #define QUERY_THREAD_LOCK (pthread_mutex_lock(&query_lock_wait))
  #define QUERY_THREAD_UNLOCK (pthread_mutex_unlock(&query_lock_wait))
  /*
   * One node per HTTP request currently being executed on behalf of the cloud.
   * Nodes form a singly linked list rooted at pending_req_list_head.
   */
  struct pending_req_list {
      const char *msg_id;             // borrowed pointer to the request's message id (never freed by the list code)
      uint32_t hash;                  // simple_hash(msg_id), compared before the strcmp() on lookups
      int canceled;                   // set to 1 by mark_pending_req_cancelled(), polled by the interrupt callback

      struct pending_req_list *next;  // next node, NULL at the tail
  };

  /* List root (newest request first) and the mutex guarding every list walk / update. */
  static struct pending_req_list *pending_req_list_head;
  static pthread_mutex_t pending_req_list_lock = PTHREAD_MUTEX_INITIALIZER;
  19. static struct pending_req_list *pending_req_list_add(const char *msg_id)
  20. {
  21. struct pending_req_list *new = callocz(1, sizeof(struct pending_req_list));
  22. new->msg_id = msg_id;
  23. new->hash = simple_hash(msg_id);
  24. pthread_mutex_lock(&pending_req_list_lock);
  25. new->next = pending_req_list_head;
  26. pending_req_list_head = new;
  27. pthread_mutex_unlock(&pending_req_list_lock);
  28. return new;
  29. }
  30. void pending_req_list_rm(const char *msg_id)
  31. {
  32. uint32_t hash = simple_hash(msg_id);
  33. struct pending_req_list *prev = NULL;
  34. pthread_mutex_lock(&pending_req_list_lock);
  35. struct pending_req_list *curr = pending_req_list_head;
  36. while (curr) {
  37. if (curr->hash == hash && strcmp(curr->msg_id, msg_id) == 0) {
  38. if (prev)
  39. prev->next = curr->next;
  40. else
  41. pending_req_list_head = curr->next;
  42. freez(curr);
  43. break;
  44. }
  45. prev = curr;
  46. curr = curr->next;
  47. }
  48. pthread_mutex_unlock(&pending_req_list_lock);
  49. }
  50. int mark_pending_req_cancelled(const char *msg_id)
  51. {
  52. uint32_t hash = simple_hash(msg_id);
  53. pthread_mutex_lock(&pending_req_list_lock);
  54. struct pending_req_list *curr = pending_req_list_head;
  55. while (curr) {
  56. if (curr->hash == hash && strcmp(curr->msg_id, msg_id) == 0) {
  57. curr->canceled = 1;
  58. pthread_mutex_unlock(&pending_req_list_lock);
  59. return 0;
  60. }
  61. curr = curr->next;
  62. }
  63. pthread_mutex_unlock(&pending_req_list_lock);
  64. return 1;
  65. }
  66. static bool aclk_web_client_interrupt_cb(struct web_client *w __maybe_unused, void *data)
  67. {
  68. struct pending_req_list *req = (struct pending_req_list *)data;
  69. return req->canceled;
  70. }
  71. static int http_api_v2(struct aclk_query_thread *query_thr, aclk_query_t query) {
  72. ND_LOG_STACK lgs[] = {
  73. ND_LOG_FIELD_TXT(NDF_SRC_TRANSPORT, "aclk"),
  74. ND_LOG_FIELD_END(),
  75. };
  76. ND_LOG_STACK_PUSH(lgs);
  77. int retval = 0;
  78. BUFFER *local_buffer = NULL;
  79. size_t size = 0;
  80. size_t sent = 0;
  81. usec_t dt_ut = 0;
  82. int z_ret;
  83. BUFFER *z_buffer = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_aclk);
  84. struct web_client *w = web_client_get_from_cache();
  85. web_client_set_conn_cloud(w);
  86. w->acl = HTTP_ACL_ACLK;
  87. w->access = HTTP_ACCESS_MEMBER; // the minimum access level for all requests from netdata cloud
  88. web_client_flags_clear_auth(w);
  89. web_client_flag_set(w, WEB_CLIENT_FLAG_AUTH_CLOUD);
  90. w->mode = HTTP_REQUEST_MODE_GET;
  91. w->timings.tv_in = query->created_tv;
  92. w->interrupt.callback = aclk_web_client_interrupt_cb;
  93. w->interrupt.callback_data = pending_req_list_add(query->msg_id);
  94. buffer_flush(w->response.data);
  95. buffer_strcat(w->response.data, query->data.http_api_v2.payload);
  96. HTTP_VALIDATION validation = http_request_validate(w);
  97. if(validation != HTTP_VALIDATION_OK) {
  98. nd_log(NDLS_ACCESS, NDLP_ERR, "ACLK received request is not valid, code %d", validation);
  99. retval = 1;
  100. w->response.code = HTTP_RESP_BAD_REQUEST;
  101. w->response.code = (short)aclk_http_msg_v2(query_thr->client, query->callback_topic, query->msg_id,
  102. dt_ut, query->created, w->response.code,
  103. NULL, 0);
  104. goto cleanup;
  105. }
  106. web_client_timeout_checkpoint_set(w, query->timeout);
  107. if(web_client_timeout_checkpoint_and_check(w, &dt_ut)) {
  108. nd_log(NDLS_ACCESS, NDLP_ERR,
  109. "QUERY CANCELED: QUEUE TIME EXCEEDED %llu ms (LIMIT %d ms)",
  110. dt_ut / USEC_PER_MS, query->timeout);
  111. retval = 1;
  112. w->response.code = HTTP_RESP_SERVICE_UNAVAILABLE;
  113. aclk_http_msg_v2_err(query_thr->client, query->callback_topic, query->msg_id, w->response.code, CLOUD_EC_SND_TIMEOUT, CLOUD_EMSG_SND_TIMEOUT, NULL, 0);
  114. goto cleanup;
  115. }
  116. char *path = (char *)buffer_tostring(w->url_path_decoded);
  117. if (aclk_stats_enabled) {
  118. char *url_path_endpoint = strrchr(path, '/');
  119. ACLK_STATS_LOCK;
  120. int stat_idx = aclk_cloud_req_http_type_to_idx(url_path_endpoint ? url_path_endpoint + 1 : "other");
  121. aclk_metrics_per_sample.cloud_req_http_by_type[stat_idx]++;
  122. ACLK_STATS_UNLOCK;
  123. }
  124. w->response.code = (short)web_client_api_request_with_node_selection(localhost, w, path);
  125. web_client_timeout_checkpoint_response_ready(w, &dt_ut);
  126. if (aclk_stats_enabled) {
  127. ACLK_STATS_LOCK;
  128. aclk_metrics_per_sample.cloud_q_process_total += dt_ut;
  129. aclk_metrics_per_sample.cloud_q_process_count++;
  130. if (aclk_metrics_per_sample.cloud_q_process_max < dt_ut)
  131. aclk_metrics_per_sample.cloud_q_process_max = dt_ut;
  132. ACLK_STATS_UNLOCK;
  133. }
  134. size = w->response.data->len;
  135. sent = size;
  136. if (w->response.data->len && w->response.zinitialized) {
  137. w->response.zstream.next_in = (Bytef *)w->response.data->buffer;
  138. w->response.zstream.avail_in = w->response.data->len;
  139. do {
  140. w->response.zstream.avail_out = NETDATA_WEB_RESPONSE_ZLIB_CHUNK_SIZE;
  141. w->response.zstream.next_out = w->response.zbuffer;
  142. z_ret = deflate(&w->response.zstream, Z_FINISH);
  143. if(z_ret < 0) {
  144. if(w->response.zstream.msg)
  145. netdata_log_error("Error compressing body. ZLIB error: \"%s\"", w->response.zstream.msg);
  146. else
  147. netdata_log_error("Unknown error during zlib compression.");
  148. retval = 1;
  149. w->response.code = 500;
  150. aclk_http_msg_v2_err(query_thr->client, query->callback_topic, query->msg_id, w->response.code, CLOUD_EC_ZLIB_ERROR, CLOUD_EMSG_ZLIB_ERROR, NULL, 0);
  151. goto cleanup;
  152. }
  153. int bytes_to_cpy = NETDATA_WEB_RESPONSE_ZLIB_CHUNK_SIZE - w->response.zstream.avail_out;
  154. buffer_need_bytes(z_buffer, bytes_to_cpy);
  155. memcpy(&z_buffer->buffer[z_buffer->len], w->response.zbuffer, bytes_to_cpy);
  156. z_buffer->len += bytes_to_cpy;
  157. } while(z_ret != Z_STREAM_END);
  158. // so that web_client_build_http_header
  159. // puts correct content length into header
  160. buffer_free(w->response.data);
  161. w->response.data = z_buffer;
  162. z_buffer = NULL;
  163. }
  164. web_client_build_http_header(w);
  165. local_buffer = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_aclk);
  166. local_buffer->content_type = CT_APPLICATION_JSON;
  167. buffer_strcat(local_buffer, w->response.header_output->buffer);
  168. if (w->response.data->len) {
  169. if (w->response.zinitialized) {
  170. buffer_need_bytes(local_buffer, w->response.data->len);
  171. memcpy(&local_buffer->buffer[local_buffer->len], w->response.data->buffer, w->response.data->len);
  172. local_buffer->len += w->response.data->len;
  173. sent = sent - size + w->response.data->len;
  174. } else {
  175. buffer_strcat(local_buffer, w->response.data->buffer);
  176. }
  177. }
  178. // send msg.
  179. w->response.code = (short)aclk_http_msg_v2(query_thr->client, query->callback_topic, query->msg_id,
  180. dt_ut, query->created, w->response.code,
  181. local_buffer->buffer, local_buffer->len);
  182. cleanup:
  183. web_client_log_completed_request(w, false);
  184. web_client_release_to_cache(w);
  185. pending_req_list_rm(query->msg_id);
  186. buffer_free(z_buffer);
  187. buffer_free(local_buffer);
  188. return retval;
  189. }
  190. static int send_bin_msg(struct aclk_query_thread *query_thr, aclk_query_t query)
  191. {
  192. // this will be simplified when legacy support is removed
  193. aclk_send_bin_message_subtopic_pid(query_thr->client, query->data.bin_payload.payload, query->data.bin_payload.size, query->data.bin_payload.topic, query->data.bin_payload.msg_name);
  194. return 0;
  195. }
  196. const char *aclk_query_get_name(aclk_query_type_t qt, int unknown_ok)
  197. {
  198. switch (qt) {
  199. case HTTP_API_V2: return "http_api_request_v2";
  200. case REGISTER_NODE: return "register_node";
  201. case NODE_STATE_UPDATE: return "node_state_update";
  202. case CHART_DIMS_UPDATE: return "chart_and_dim_update";
  203. case CHART_CONFIG_UPDATED: return "chart_config_updated";
  204. case CHART_RESET: return "reset_chart_messages";
  205. case RETENTION_UPDATED: return "update_retention_info";
  206. case UPDATE_NODE_INFO: return "update_node_info";
  207. case ALARM_PROVIDE_CHECKPOINT: return "alarm_checkpoint";
  208. case ALARM_PROVIDE_CFG: return "provide_alarm_config";
  209. case ALARM_SNAPSHOT: return "alarm_snapshot";
  210. case UPDATE_NODE_COLLECTORS: return "update_node_collectors";
  211. case PROTO_BIN_MESSAGE: return "generic_binary_proto_message";
  212. default:
  213. if (!unknown_ok)
  214. error_report("Unknown query type used %d", (int) qt);
  215. return "unknown";
  216. }
  217. }
  218. static void aclk_query_process_msg(struct aclk_query_thread *query_thr, aclk_query_t query)
  219. {
  220. if (query->type == UNKNOWN || query->type >= ACLK_QUERY_TYPE_COUNT) {
  221. error_report("Unknown query in query queue. %u", query->type);
  222. aclk_query_free(query);
  223. return;
  224. }
  225. worker_is_busy(query->type);
  226. if (query->type == HTTP_API_V2) {
  227. netdata_log_debug(D_ACLK, "Processing Queued Message of type: \"http_api_request_v2\"");
  228. http_api_v2(query_thr, query);
  229. } else {
  230. netdata_log_debug(D_ACLK, "Processing Queued Message of type: \"%s\"", query->data.bin_payload.msg_name);
  231. send_bin_msg(query_thr, query);
  232. }
  233. if (aclk_stats_enabled) {
  234. ACLK_STATS_LOCK;
  235. aclk_metrics_per_sample.queries_dispatched++;
  236. aclk_queries_per_thread[query_thr->idx]++;
  237. aclk_metrics_per_sample.queries_per_type[query->type]++;
  238. ACLK_STATS_UNLOCK;
  239. }
  240. aclk_query_free(query);
  241. worker_is_idle();
  242. }
  243. /* Processes messages from queue. Compete for work with other threads
  244. */
  245. int aclk_query_process_msgs(struct aclk_query_thread *query_thr)
  246. {
  247. aclk_query_t query;
  248. while ((query = aclk_queue_pop()))
  249. aclk_query_process_msg(query_thr, query);
  250. return 0;
  251. }
  252. static void worker_aclk_register(void) {
  253. worker_register("ACLKQUERY");
  254. for (int i = 1; i < ACLK_QUERY_TYPE_COUNT; i++) {
  255. worker_register_job_name(i, aclk_query_get_name(i, 0));
  256. }
  257. }
  /*
   * Cancel hook handed to service_register(): wakes every thread blocked on
   * the condition variable passed in as data.
   */
  static void aclk_query_request_cancel(void *data)
  {
      pthread_cond_t *wakeup = data;

      pthread_cond_broadcast(wakeup);
  }
  262. /**
  263. * Main query processing thread
  264. */
  265. void *aclk_query_main_thread(void *ptr)
  266. {
  267. worker_aclk_register();
  268. struct aclk_query_thread *query_thr = ptr;
  269. service_register(SERVICE_THREAD_TYPE_NETDATA, aclk_query_request_cancel, NULL, &query_cond_wait, false);
  270. while (service_running(SERVICE_ACLK | ABILITY_DATA_QUERIES)) {
  271. aclk_query_process_msgs(query_thr);
  272. worker_is_idle();
  273. QUERY_THREAD_LOCK;
  274. if (unlikely(pthread_cond_wait(&query_cond_wait, &query_lock_wait)))
  275. sleep_usec(USEC_PER_SEC * 1);
  276. QUERY_THREAD_UNLOCK;
  277. }
  278. worker_unregister();
  279. return NULL;
  280. }
  281. #define TASK_LEN_MAX 22
  282. void aclk_query_threads_start(struct aclk_query_threads *query_threads, mqtt_wss_client client)
  283. {
  284. netdata_log_info("Starting %d query threads.", query_threads->count);
  285. char thread_name[TASK_LEN_MAX];
  286. query_threads->thread_list = callocz(query_threads->count, sizeof(struct aclk_query_thread));
  287. for (int i = 0; i < query_threads->count; i++) {
  288. query_threads->thread_list[i].idx = i; //thread needs to know its index for statistics
  289. query_threads->thread_list[i].client = client;
  290. if(unlikely(snprintfz(thread_name, TASK_LEN_MAX, "ACLK_QRY[%d]", i) < 0))
  291. netdata_log_error("snprintf encoding error");
  292. netdata_thread_create(
  293. &query_threads->thread_list[i].thread, thread_name, NETDATA_THREAD_OPTION_JOINABLE, aclk_query_main_thread,
  294. &query_threads->thread_list[i]);
  295. }
  296. }
  297. void aclk_query_threads_cleanup(struct aclk_query_threads *query_threads)
  298. {
  299. if (query_threads && query_threads->thread_list) {
  300. for (int i = 0; i < query_threads->count; i++) {
  301. netdata_thread_join(query_threads->thread_list[i].thread, NULL);
  302. }
  303. freez(query_threads->thread_list);
  304. }
  305. aclk_queue_lock();
  306. aclk_queue_flush();
  307. }