signals.c 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. #include "common.h"
  // Non-zero once signals_init() has detected that the process runs as pid 1
  // and has enabled child reaping. Static storage, so it starts at 0.
  static int reaper_enabled;
  // What to do when a signal listed in signals_waiting[] is received.
  typedef enum signal_action {
      NETDATA_SIGNAL_END_OF_LIST   = 0, // terminator entry of signals_waiting[]
      NETDATA_SIGNAL_IGNORE        = 1, // signals_init() installs SIG_IGN for it
      NETDATA_SIGNAL_EXIT_CLEANLY  = 2, // signals_handle() cleans up and exits with 0
      NETDATA_SIGNAL_SAVE_DATABASE = 3, // signals_handle() runs CMD_SAVE_DATABASE
      NETDATA_SIGNAL_REOPEN_LOGS   = 4, // signals_handle() runs CMD_REOPEN_LOGS
      NETDATA_SIGNAL_RELOAD_HEALTH = 5, // signals_handle() runs CMD_RELOAD_HEALTH
      NETDATA_SIGNAL_FATAL         = 6, // reported on stderr by the handler, then fatal()
      NETDATA_SIGNAL_CHILD         = 7, // signals_handle() reaps pending children
  } SIGNAL_ACTION;
  14. static struct {
  15. int signo; // the signal
  16. const char *name; // the name of the signal
  17. size_t count; // the number of signals received
  18. SIGNAL_ACTION action; // the action to take
  19. } signals_waiting[] = {
  20. { SIGPIPE, "SIGPIPE", 0, NETDATA_SIGNAL_IGNORE },
  21. { SIGINT , "SIGINT", 0, NETDATA_SIGNAL_EXIT_CLEANLY },
  22. { SIGQUIT, "SIGQUIT", 0, NETDATA_SIGNAL_EXIT_CLEANLY },
  23. { SIGTERM, "SIGTERM", 0, NETDATA_SIGNAL_EXIT_CLEANLY },
  24. { SIGHUP, "SIGHUP", 0, NETDATA_SIGNAL_REOPEN_LOGS },
  25. { SIGUSR1, "SIGUSR1", 0, NETDATA_SIGNAL_SAVE_DATABASE },
  26. { SIGUSR2, "SIGUSR2", 0, NETDATA_SIGNAL_RELOAD_HEALTH },
  27. { SIGBUS, "SIGBUS", 0, NETDATA_SIGNAL_FATAL },
  28. { SIGCHLD, "SIGCHLD", 0, NETDATA_SIGNAL_CHILD },
  29. // terminator
  30. { 0, "NONE", 0, NETDATA_SIGNAL_END_OF_LIST }
  31. };
  32. static void signal_handler(int signo) {
  33. // find the entry in the list
  34. int i;
  35. for(i = 0; signals_waiting[i].action != NETDATA_SIGNAL_END_OF_LIST ; i++) {
  36. if(unlikely(signals_waiting[i].signo == signo)) {
  37. signals_waiting[i].count++;
  38. if(signals_waiting[i].action == NETDATA_SIGNAL_FATAL) {
  39. char buffer[200 + 1];
  40. snprintfz(buffer, 200, "\nSIGNAL HANDLER: received: %s. Oops! This is bad!\n", signals_waiting[i].name);
  41. if(write(STDERR_FILENO, buffer, strlen(buffer)) == -1) {
  42. // nothing to do - we cannot write but there is no way to complain about it
  43. ;
  44. }
  45. }
  46. return;
  47. }
  48. }
  49. }
  50. void signals_block(void) {
  51. sigset_t sigset;
  52. sigfillset(&sigset);
  53. if(pthread_sigmask(SIG_BLOCK, &sigset, NULL) == -1)
  54. error("SIGNAL: Could not block signals for threads");
  55. }
  56. void signals_unblock(void) {
  57. sigset_t sigset;
  58. sigfillset(&sigset);
  59. if(pthread_sigmask(SIG_UNBLOCK, &sigset, NULL) == -1) {
  60. error("SIGNAL: Could not unblock signals for threads");
  61. }
  62. }
  63. void signals_init(void) {
  64. // Catch signals which we want to use
  65. struct sigaction sa;
  66. sa.sa_flags = 0;
  67. // Enable process tracking / reaper if running as init (pid == 1).
  68. // This prevents zombie processes when running in a container.
  69. if (getpid() == 1) {
  70. info("SIGNAL: Enabling reaper");
  71. netdata_popen_tracking_init();
  72. reaper_enabled = 1;
  73. } else {
  74. info("SIGNAL: Not enabling reaper");
  75. }
  76. // ignore all signals while we run in a signal handler
  77. sigfillset(&sa.sa_mask);
  78. int i;
  79. for (i = 0; signals_waiting[i].action != NETDATA_SIGNAL_END_OF_LIST; i++) {
  80. switch (signals_waiting[i].action) {
  81. case NETDATA_SIGNAL_IGNORE:
  82. sa.sa_handler = SIG_IGN;
  83. break;
  84. case NETDATA_SIGNAL_CHILD:
  85. if (reaper_enabled == 0)
  86. continue;
  87. // FALLTHROUGH
  88. default:
  89. sa.sa_handler = signal_handler;
  90. break;
  91. }
  92. if(sigaction(signals_waiting[i].signo, &sa, NULL) == -1)
  93. error("SIGNAL: Failed to change signal handler for: %s", signals_waiting[i].name);
  94. }
  95. }
  96. void signals_restore_SIGCHLD(void)
  97. {
  98. struct sigaction sa;
  99. if (reaper_enabled == 0)
  100. return;
  101. sa.sa_flags = 0;
  102. sigfillset(&sa.sa_mask);
  103. sa.sa_handler = signal_handler;
  104. if(sigaction(SIGCHLD, &sa, NULL) == -1)
  105. error("SIGNAL: Failed to change signal handler for: SIGCHLD");
  106. }
  107. void signals_reset(void) {
  108. struct sigaction sa;
  109. sigemptyset(&sa.sa_mask);
  110. sa.sa_handler = SIG_DFL;
  111. sa.sa_flags = 0;
  112. int i;
  113. for (i = 0; signals_waiting[i].action != NETDATA_SIGNAL_END_OF_LIST; i++) {
  114. if(sigaction(signals_waiting[i].signo, &sa, NULL) == -1)
  115. error("SIGNAL: Failed to reset signal handler for: %s", signals_waiting[i].name);
  116. }
  117. if (reaper_enabled == 1)
  118. netdata_popen_tracking_cleanup();
  119. }
  120. // reap_child reaps the child identified by pid.
  121. static void reap_child(pid_t pid) {
  122. siginfo_t i;
  123. errno = 0;
  124. debug(D_CHILDS, "SIGNAL: Reaping pid: %d...", pid);
  125. if (waitid(P_PID, (id_t)pid, &i, WEXITED|WNOHANG) == -1) {
  126. if (errno != ECHILD)
  127. error("SIGNAL: Failed to wait for: %d", pid);
  128. else
  129. debug(D_CHILDS, "SIGNAL: Already reaped: %d", pid);
  130. return;
  131. } else if (i.si_pid == 0) {
  132. // Process didn't exit, this shouldn't happen.
  133. return;
  134. }
  135. switch (i.si_code) {
  136. case CLD_EXITED:
  137. debug(D_CHILDS, "SIGNAL: Child %d exited: %d", pid, i.si_status);
  138. break;
  139. case CLD_KILLED:
  140. debug(D_CHILDS, "SIGNAL: Child %d killed by signal: %d", pid, i.si_status);
  141. break;
  142. case CLD_DUMPED:
  143. debug(D_CHILDS, "SIGNAL: Child %d dumped core by signal: %d", pid, i.si_status);
  144. break;
  145. case CLD_STOPPED:
  146. debug(D_CHILDS, "SIGNAL: Child %d stopped by signal: %d", pid, i.si_status);
  147. break;
  148. case CLD_TRAPPED:
  149. debug(D_CHILDS, "SIGNAL: Child %d trapped by signal: %d", pid, i.si_status);
  150. break;
  151. case CLD_CONTINUED:
  152. debug(D_CHILDS, "SIGNAL: Child %d continued by signal: %d", pid, i.si_status);
  153. break;
  154. default:
  155. debug(D_CHILDS, "SIGNAL: Child %d gave us a SIGCHLD with code %d and status %d.", pid, i.si_code, i.si_status);
  156. }
  157. }
  158. // reap_children reaps all pending children which are not managed by myp.
  159. static void reap_children() {
  160. siginfo_t i;
  161. while (1 == 1) {
  162. // Identify which process caused the signal so we can determine
  163. // if we need to reap a re-parented process.
  164. i.si_pid = 0;
  165. if (waitid(P_ALL, (id_t)0, &i, WEXITED|WNOHANG|WNOWAIT) == -1) {
  166. if (errno != ECHILD) // This shouldn't happen with WNOHANG but does.
  167. error("SIGNAL: Failed to wait");
  168. return;
  169. } else if (i.si_pid == 0) {
  170. // No child exited.
  171. return;
  172. } else if (netdata_popen_tracking_pid_shoud_be_reaped(i.si_pid) == 0) {
  173. // myp managed, sleep for a short time to avoid busy wait while
  174. // this is handled by myp.
  175. usleep(10000);
  176. } else {
  177. // Unknown process, likely a re-parented child, reap it.
  178. reap_child(i.si_pid);
  179. }
  180. }
  181. }
  182. void signals_handle(void) {
  183. while(1) {
  184. // pause() causes the calling process (or thread) to sleep until a signal
  185. // is delivered that either terminates the process or causes the invocation
  186. // of a signal-catching function.
  187. if(pause() == -1 && errno == EINTR) {
  188. // loop once, but keep looping while signals are coming in
  189. // this is needed because a few operations may take some time
  190. // so we need to check for new signals before pausing again
  191. int found = 1;
  192. while(found) {
  193. found = 0;
  194. // execute the actions of the signals
  195. int i;
  196. for (i = 0; signals_waiting[i].action != NETDATA_SIGNAL_END_OF_LIST; i++) {
  197. if (signals_waiting[i].count) {
  198. found = 1;
  199. signals_waiting[i].count = 0;
  200. const char *name = signals_waiting[i].name;
  201. switch (signals_waiting[i].action) {
  202. case NETDATA_SIGNAL_RELOAD_HEALTH:
  203. error_log_limit_unlimited();
  204. info("SIGNAL: Received %s. Reloading HEALTH configuration...", name);
  205. error_log_limit_reset();
  206. execute_command(CMD_RELOAD_HEALTH, NULL, NULL);
  207. break;
  208. case NETDATA_SIGNAL_SAVE_DATABASE:
  209. error_log_limit_unlimited();
  210. info("SIGNAL: Received %s. Saving databases...", name);
  211. error_log_limit_reset();
  212. execute_command(CMD_SAVE_DATABASE, NULL, NULL);
  213. break;
  214. case NETDATA_SIGNAL_REOPEN_LOGS:
  215. error_log_limit_unlimited();
  216. info("SIGNAL: Received %s. Reopening all log files...", name);
  217. error_log_limit_reset();
  218. execute_command(CMD_REOPEN_LOGS, NULL, NULL);
  219. break;
  220. case NETDATA_SIGNAL_EXIT_CLEANLY:
  221. error_log_limit_unlimited();
  222. info("SIGNAL: Received %s. Cleaning up to exit...", name);
  223. commands_exit();
  224. netdata_cleanup_and_exit(0);
  225. exit(0);
  226. break;
  227. case NETDATA_SIGNAL_FATAL:
  228. fatal("SIGNAL: Received %s. netdata now exits.", name);
  229. break;
  230. case NETDATA_SIGNAL_CHILD:
  231. debug(D_CHILDS, "SIGNAL: Received %s. Reaping...", name);
  232. reap_children();
  233. break;
  234. default:
  235. info("SIGNAL: Received %s. No signal handler configured. Ignoring it.", name);
  236. break;
  237. }
  238. }
  239. }
  240. }
  241. }
  242. else
  243. error("SIGNAL: pause() returned but it was not interrupted by a signal.");
  244. }
  245. }