Browse Source

Introduce a random sleep in the Netdata updater (#9079)

* Introduce a random sleep in the Netdata updater

* Only sleep if we're not a tty (e.g: cron) and use a random interval between 30m-60m

* Set lower bound to 1s

* Disable random sleep / netdata-updater splay in lifecycle tests
James Mills 4 years ago
parent
commit
cea8a3fcbb
2 changed files with 13 additions and 2 deletions
  1. 10 2
      packaging/installer/netdata-updater.sh
  2. 3 0
      tests/updater_checks.bats

+ 10 - 2
packaging/installer/netdata-updater.sh

@@ -185,8 +185,8 @@ update() {
       do_not_start="--dont-start-it"
     fi
 
-    if [ -n "${NETDATA_SELECTED_DASHBOARD}" ] ; then
-        env="NETDATA_SELECTED_DASHBOARD=${NETDATA_SELECTED_DASHBOARD}"
+    if [ -n "${NETDATA_SELECTED_DASHBOARD}" ]; then
+      env="NETDATA_SELECTED_DASHBOARD=${NETDATA_SELECTED_DASHBOARD}"
     fi
 
     info "Re-installing netdata..."
@@ -210,6 +210,14 @@ tmpdir=
 
 trap cleanup EXIT
 
+# Random sleep to alleviate the stampede effect of Agents upgrading
+# and disconnecting/reconnecting at the same time (or near to).
+# But only if stdout is not a terminal (tty), e.g. when run from cron.
+# Randomly sleep between 1s and 60m
+if [ ! -t 1 ]; then
+  sleep $(((RANDOM % 3600) + 1))s
+fi
+
 # Usually stored in /etc/netdata/.environment
 : "${ENVIRONMENT_FILE:=THIS_SHOULD_BE_REPLACED_BY_INSTALLER_SCRIPT}"
 

+ 3 - 0
tests/updater_checks.bats

@@ -56,6 +56,9 @@ setup() {
 	# Run the updater, with the override so that it uses the local repo we have at hand
 	# Try to run the installed, if any, otherwise just run the one from the repo
 	export NETDATA_LOCAL_TARBAL_OVERRIDE="${PWD}"
+	# Disable the random sleep / splay in netdata-updater, which exists to avoid the
+	# stampede effect of many agents (dis|re)connecting all at once to Netdata Cloud
+	unset RANDOM; export RANDOM=0
 	/etc/cron.daily/netdata-updater || ./packaging/installer/netdata-updater.sh
 	! grep "new_installation" "${ENV}"
 }