worker_utilization.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383
  1. #include "worker_utilization.h"
  // Markers stored in worker->last_action to record what the worker thread did last.
  enum {
      WORKER_IDLE = 'I', // the worker reported it is idle
      WORKER_BUSY = 'B', // the worker reported it started a job
  };
  4. struct worker_job_type {
  5. STRING *name;
  6. STRING *units;
  7. // statistics controlled variables
  8. size_t statistics_last_jobs_started;
  9. usec_t statistics_last_busy_time;
  10. NETDATA_DOUBLE statistics_last_custom_value;
  11. // worker controlled variables
  12. volatile size_t worker_jobs_started;
  13. volatile usec_t worker_busy_time;
  14. WORKER_METRIC_TYPE type;
  15. NETDATA_DOUBLE custom_value;
  16. };
  17. struct worker {
  18. pid_t pid;
  19. const char *tag;
  20. const char *workname;
  21. // statistics controlled variables
  22. volatile usec_t statistics_last_checkpoint;
  23. size_t statistics_last_jobs_started;
  24. usec_t statistics_last_busy_time;
  25. // the worker controlled variables
  26. size_t worker_max_job_id;
  27. volatile size_t job_id;
  28. volatile size_t jobs_started;
  29. volatile usec_t busy_time;
  30. volatile usec_t last_action_timestamp;
  31. volatile char last_action;
  32. struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES];
  33. struct worker *next;
  34. struct worker *prev;
  35. };
  36. struct workers_workname { // this is what we add to JudyHS
  37. SPINLOCK spinlock;
  38. struct worker *base;
  39. };
  40. static struct workers_globals {
  41. SPINLOCK spinlock;
  42. Pvoid_t worknames_JudyHS;
  43. size_t memory;
  44. } workers_globals = { // workers globals, the base of all worknames
  45. .spinlock = NETDATA_SPINLOCK_INITIALIZER, // a lock for the worknames index
  46. .worknames_JudyHS = NULL, // the worknames index
  47. };
  48. static __thread struct worker *worker = NULL; // the current thread worker
  49. static inline usec_t worker_now_monotonic_usec(void) {
  50. #ifdef NETDATA_WITHOUT_WORKERS_LATENCY
  51. return 0;
  52. #else
  53. return now_monotonic_usec();
  54. #endif
  55. }
  56. size_t workers_allocated_memory(void) {
  57. spinlock_lock(&workers_globals.spinlock);
  58. size_t memory = workers_globals.memory;
  59. spinlock_unlock(&workers_globals.spinlock);
  60. return memory;
  61. }
  62. void worker_register(const char *name) {
  63. if(unlikely(worker)) return;
  64. worker = callocz(1, sizeof(struct worker));
  65. worker->pid = gettid();
  66. worker->tag = strdupz(netdata_thread_tag());
  67. worker->workname = strdupz(name);
  68. usec_t now = worker_now_monotonic_usec();
  69. worker->statistics_last_checkpoint = now;
  70. worker->last_action_timestamp = now;
  71. worker->last_action = WORKER_IDLE;
  72. size_t name_size = strlen(name) + 1;
  73. spinlock_lock(&workers_globals.spinlock);
  74. workers_globals.memory += sizeof(struct worker) + strlen(worker->tag) + 1 + strlen(worker->workname) + 1;
  75. Pvoid_t *PValue = JudyHSIns(&workers_globals.worknames_JudyHS, (void *)name, name_size, PJE0);
  76. struct workers_workname *workname = *PValue;
  77. if(!workname) {
  78. workname = mallocz(sizeof(struct workers_workname));
  79. spinlock_init(&workname->spinlock);
  80. workname->base = NULL;
  81. *PValue = workname;
  82. workers_globals.memory += sizeof(struct workers_workname) + JUDYHS_INDEX_SIZE_ESTIMATE(name_size);
  83. }
  84. spinlock_lock(&workname->spinlock);
  85. DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(workname->base, worker, prev, next);
  86. spinlock_unlock(&workname->spinlock);
  87. spinlock_unlock(&workers_globals.spinlock);
  88. }
  89. void worker_register_job_custom_metric(size_t job_id, const char *name, const char *units, WORKER_METRIC_TYPE type) {
  90. if(unlikely(!worker)) return;
  91. if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES)) {
  92. netdata_log_error("WORKER_UTILIZATION: job_id %zu is too big. Max is %zu", job_id, (size_t)(WORKER_UTILIZATION_MAX_JOB_TYPES - 1));
  93. return;
  94. }
  95. if(job_id > worker->worker_max_job_id)
  96. worker->worker_max_job_id = job_id;
  97. if(worker->per_job_type[job_id].name) {
  98. if(strcmp(string2str(worker->per_job_type[job_id].name), name) != 0 || worker->per_job_type[job_id].type != type || strcmp(string2str(worker->per_job_type[job_id].units), units) != 0)
  99. netdata_log_error("WORKER_UTILIZATION: duplicate job registration: worker '%s' job id %zu is '%s', ignoring the later '%s'", worker->workname, job_id, string2str(worker->per_job_type[job_id].name), name);
  100. return;
  101. }
  102. worker->per_job_type[job_id].name = string_strdupz(name);
  103. worker->per_job_type[job_id].units = string_strdupz(units);
  104. worker->per_job_type[job_id].type = type;
  105. }
  106. void worker_register_job_name(size_t job_id, const char *name) {
  107. worker_register_job_custom_metric(job_id, name, "", WORKER_METRIC_IDLE_BUSY);
  108. }
  109. void worker_unregister(void) {
  110. if(unlikely(!worker)) return;
  111. size_t workname_size = strlen(worker->workname) + 1;
  112. spinlock_lock(&workers_globals.spinlock);
  113. Pvoid_t *PValue = JudyHSGet(workers_globals.worknames_JudyHS, (void *)worker->workname, workname_size);
  114. if(PValue) {
  115. struct workers_workname *workname = *PValue;
  116. spinlock_lock(&workname->spinlock);
  117. DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(workname->base, worker, prev, next);
  118. spinlock_unlock(&workname->spinlock);
  119. if(!workname->base) {
  120. JudyHSDel(&workers_globals.worknames_JudyHS, (void *) worker->workname, workname_size, PJE0);
  121. freez(workname);
  122. workers_globals.memory -= sizeof(struct workers_workname) + JUDYHS_INDEX_SIZE_ESTIMATE(workname_size);
  123. }
  124. }
  125. workers_globals.memory -= sizeof(struct worker) + strlen(worker->tag) + 1 + strlen(worker->workname) + 1;
  126. spinlock_unlock(&workers_globals.spinlock);
  127. for(int i = 0; i < WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) {
  128. string_freez(worker->per_job_type[i].name);
  129. string_freez(worker->per_job_type[i].units);
  130. }
  131. freez((void *)worker->tag);
  132. freez((void *)worker->workname);
  133. freez(worker);
  134. worker = NULL;
  135. }
  136. static inline void worker_is_idle_with_time(usec_t now) {
  137. usec_t delta = now - worker->last_action_timestamp;
  138. worker->busy_time += delta;
  139. worker->per_job_type[worker->job_id].worker_busy_time += delta;
  140. // the worker was busy
  141. // set it to idle before we set the timestamp
  142. worker->last_action = WORKER_IDLE;
  143. if(likely(worker->last_action_timestamp < now))
  144. worker->last_action_timestamp = now;
  145. }
  146. void worker_is_idle(void) {
  147. if(unlikely(!worker || worker->last_action != WORKER_BUSY)) return;
  148. worker_is_idle_with_time(worker_now_monotonic_usec());
  149. }
  150. void worker_is_busy(size_t job_id) {
  151. if(unlikely(!worker || job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES))
  152. return;
  153. usec_t now = worker_now_monotonic_usec();
  154. if(worker->last_action == WORKER_BUSY)
  155. worker_is_idle_with_time(now);
  156. // the worker was idle
  157. // set the timestamp and then set it to busy
  158. worker->job_id = job_id;
  159. worker->per_job_type[job_id].worker_jobs_started++;
  160. worker->jobs_started++;
  161. worker->last_action_timestamp = now;
  162. worker->last_action = WORKER_BUSY;
  163. }
  164. void worker_set_metric(size_t job_id, NETDATA_DOUBLE value) {
  165. if(unlikely(!worker)) return;
  166. if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES))
  167. return;
  168. switch(worker->per_job_type[job_id].type) {
  169. case WORKER_METRIC_INCREMENT:
  170. worker->per_job_type[job_id].custom_value += value;
  171. break;
  172. case WORKER_METRIC_INCREMENTAL_TOTAL:
  173. case WORKER_METRIC_ABSOLUTE:
  174. default:
  175. worker->per_job_type[job_id].custom_value = value;
  176. break;
  177. }
  178. }
  179. // statistics interface
  180. void workers_foreach(const char *name, void (*callback)(
  181. void *data
  182. , pid_t pid
  183. , const char *thread_tag
  184. , size_t max_job_id
  185. , size_t utilization_usec
  186. , size_t duration_usec
  187. , size_t jobs_started, size_t is_running
  188. , STRING **job_types_names
  189. , STRING **job_types_units
  190. , WORKER_METRIC_TYPE *job_metric_types
  191. , size_t *job_types_jobs_started
  192. , usec_t *job_types_busy_time
  193. , NETDATA_DOUBLE *job_custom_values
  194. )
  195. , void *data) {
  196. spinlock_lock(&workers_globals.spinlock);
  197. usec_t busy_time, delta;
  198. size_t i, jobs_started, jobs_running;
  199. size_t workname_size = strlen(name) + 1;
  200. struct workers_workname *workname;
  201. Pvoid_t *PValue = JudyHSGet(workers_globals.worknames_JudyHS, (void *)name, workname_size);
  202. if(PValue) {
  203. workname = *PValue;
  204. spinlock_lock(&workname->spinlock);
  205. }
  206. else
  207. workname = NULL;
  208. spinlock_unlock(&workers_globals.spinlock);
  209. if(!workname)
  210. return;
  211. struct worker *p;
  212. DOUBLE_LINKED_LIST_FOREACH_FORWARD(workname->base, p, prev, next) {
  213. usec_t now = worker_now_monotonic_usec();
  214. // find per job type statistics
  215. STRING *per_job_type_name[WORKER_UTILIZATION_MAX_JOB_TYPES];
  216. STRING *per_job_type_units[WORKER_UTILIZATION_MAX_JOB_TYPES];
  217. WORKER_METRIC_TYPE per_job_metric_type[WORKER_UTILIZATION_MAX_JOB_TYPES];
  218. size_t per_job_type_jobs_started[WORKER_UTILIZATION_MAX_JOB_TYPES];
  219. usec_t per_job_type_busy_time[WORKER_UTILIZATION_MAX_JOB_TYPES];
  220. NETDATA_DOUBLE per_job_custom_values[WORKER_UTILIZATION_MAX_JOB_TYPES];
  221. size_t max_job_id = p->worker_max_job_id;
  222. for(i = 0; i <= max_job_id ;i++) {
  223. per_job_type_name[i] = p->per_job_type[i].name;
  224. per_job_type_units[i] = p->per_job_type[i].units;
  225. per_job_metric_type[i] = p->per_job_type[i].type;
  226. switch(p->per_job_type[i].type) {
  227. default:
  228. case WORKER_METRIC_EMPTY: {
  229. per_job_type_jobs_started[i] = 0;
  230. per_job_type_busy_time[i] = 0;
  231. per_job_custom_values[i] = NAN;
  232. break;
  233. }
  234. case WORKER_METRIC_IDLE_BUSY: {
  235. size_t tmp_jobs_started = p->per_job_type[i].worker_jobs_started;
  236. per_job_type_jobs_started[i] = tmp_jobs_started - p->per_job_type[i].statistics_last_jobs_started;
  237. p->per_job_type[i].statistics_last_jobs_started = tmp_jobs_started;
  238. usec_t tmp_busy_time = p->per_job_type[i].worker_busy_time;
  239. per_job_type_busy_time[i] = tmp_busy_time - p->per_job_type[i].statistics_last_busy_time;
  240. p->per_job_type[i].statistics_last_busy_time = tmp_busy_time;
  241. per_job_custom_values[i] = NAN;
  242. break;
  243. }
  244. case WORKER_METRIC_ABSOLUTE: {
  245. per_job_type_jobs_started[i] = 0;
  246. per_job_type_busy_time[i] = 0;
  247. per_job_custom_values[i] = p->per_job_type[i].custom_value;
  248. break;
  249. }
  250. case WORKER_METRIC_INCREMENTAL_TOTAL:
  251. case WORKER_METRIC_INCREMENT: {
  252. per_job_type_jobs_started[i] = 0;
  253. per_job_type_busy_time[i] = 0;
  254. NETDATA_DOUBLE tmp_custom_value = p->per_job_type[i].custom_value;
  255. per_job_custom_values[i] = tmp_custom_value - p->per_job_type[i].statistics_last_custom_value;
  256. p->per_job_type[i].statistics_last_custom_value = tmp_custom_value;
  257. break;
  258. }
  259. }
  260. }
  261. // get a copy of the worker variables
  262. size_t worker_job_id = p->job_id;
  263. usec_t worker_busy_time = p->busy_time;
  264. size_t worker_jobs_started = p->jobs_started;
  265. char worker_last_action = p->last_action;
  266. usec_t worker_last_action_timestamp = p->last_action_timestamp;
  267. delta = now - p->statistics_last_checkpoint;
  268. p->statistics_last_checkpoint = now;
  269. // this is the only variable both the worker thread and the statistics thread are writing
  270. // we set this only when the worker is busy, so that the worker will not
  271. // accumulate all the busy time, but only the time after the point we collected statistics
  272. if(worker_last_action == WORKER_BUSY && p->last_action_timestamp == worker_last_action_timestamp && p->last_action == WORKER_BUSY)
  273. p->last_action_timestamp = now;
  274. // calculate delta busy time
  275. busy_time = worker_busy_time - p->statistics_last_busy_time;
  276. p->statistics_last_busy_time = worker_busy_time;
  277. // calculate delta jobs done
  278. jobs_started = worker_jobs_started - p->statistics_last_jobs_started;
  279. p->statistics_last_jobs_started = worker_jobs_started;
  280. jobs_running = 0;
  281. if(worker_last_action == WORKER_BUSY) {
  282. // the worker is still busy with something
  283. // let's add that busy time to the reported one
  284. usec_t dt = now - worker_last_action_timestamp;
  285. busy_time += dt;
  286. per_job_type_busy_time[worker_job_id] += dt;
  287. jobs_running = 1;
  288. }
  289. callback(data
  290. , p->pid
  291. , p->tag
  292. , max_job_id
  293. , busy_time
  294. , delta
  295. , jobs_started
  296. , jobs_running
  297. , per_job_type_name
  298. , per_job_type_units
  299. , per_job_metric_type
  300. , per_job_type_jobs_started
  301. , per_job_type_busy_time
  302. , per_job_custom_values
  303. );
  304. }
  305. spinlock_unlock(&workname->spinlock);
  306. }