netdata-claim.sh.in 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. #!/usr/bin/env bash
  2. # netdata
  3. # real-time performance and health monitoring, done right!
  4. # (C) 2023 Netdata Inc.
  5. # SPDX-License-Identifier: GPL-3.0-or-later
  6. # Exit code: 0 - Success
  7. # Exit code: 1 - Unknown argument
  8. # Exit code: 2 - Problems with claiming working directory
  9. # Exit code: 3 - Missing dependencies
  10. # Exit code: 4 - Failure to connect to endpoint
  11. # Exit code: 5 - The CLI didn't work
  12. # Exit code: 6 - Wrong user
  13. # Exit code: 7 - Unknown HTTP error message
  14. #
  15. # OK: Agent claimed successfully
  16. # HTTP Status code: 204
  17. # Exit code: 0
  18. #
  19. # Unknown HTTP error message
  20. # HTTP Status code: 422
  21. # Exit code: 7
  22. ERROR_KEYS[7]="None"
  23. ERROR_MESSAGES[7]="Unknown HTTP error message"
  24. # Error: The agent id is invalid; it does not fulfill the constraints
  25. # HTTP Status code: 422
  26. # Exit code: 8
  27. ERROR_KEYS[8]="ErrInvalidNodeID"
  28. ERROR_MESSAGES[8]="invalid node id"
  29. # Error: The agent hostname is invalid; it does not fulfill the constraints
  30. # HTTP Status code: 422
  31. # Exit code: 9
  32. ERROR_KEYS[9]="ErrInvalidNodeName"
  33. ERROR_MESSAGES[9]="invalid node name"
  34. # Error: At least one of the given rooms ids is invalid; it does not fulfill the constraints
  35. # HTTP Status code: 422
  36. # Exit code: 10
  37. ERROR_KEYS[10]="ErrInvalidRoomID"
  38. ERROR_MESSAGES[10]="invalid room id"
  39. # Error: Invalid public key; the public key is empty or not present
  40. # HTTP Status code: 422
  41. # Exit code: 11
  42. ERROR_KEYS[11]="ErrInvalidPublicKey"
  43. ERROR_MESSAGES[11]="invalid public key"
  44. #
  45. # Error: Expired, missing or invalid token
  46. # HTTP Status code: 403
  47. # Exit code: 12
  48. ERROR_KEYS[12]="ErrForbidden"
  49. ERROR_MESSAGES[12]="token expired/token not found/invalid token"
  50. # Error: Duplicate agent id; an agent with the same id is already registered in the cloud
  51. # HTTP Status code: 409
  52. # Exit code: 13
  53. ERROR_KEYS[13]="ErrAlreadyClaimed"
  54. ERROR_MESSAGES[13]="already claimed"
  55. # Error: The node claiming process is still in progress.
  56. # HTTP Status code: 102
  57. # Exit code: 14
  58. ERROR_KEYS[14]="ErrProcessingClaim"
  59. ERROR_MESSAGES[14]="processing claiming"
  60. # Error: Internal server error. Any other unexpected error (DB problems, etc.)
  61. # HTTP Status code: 500
  62. # Exit code: 15
  63. ERROR_KEYS[15]="ErrInternalServerError"
  64. ERROR_MESSAGES[15]="Internal Server Error"
  65. # Error: There was a timeout processing the claim.
  66. # HTTP Status code: 504
  67. # Exit code: 16
  68. ERROR_KEYS[16]="ErrGatewayTimeout"
  69. ERROR_MESSAGES[16]="Gateway Timeout"
  70. # Error: The service cannot handle the claiming request at this time.
  71. # HTTP Status code: 503
  72. # Exit code: 17
  73. ERROR_KEYS[17]="ErrServiceUnavailable"
  74. ERROR_MESSAGES[17]="Service Unavailable"
  75. # Exit code: 18 - Agent unique id is not generated yet.
  76. NETDATA_RUNNING=1
  77. get_config_value() {
  78. conf_file="${1}"
  79. section="${2}"
  80. key_name="${3}"
  81. if [ "${NETDATA_RUNNING}" -eq 1 ]; then
  82. config_result=$(@sbindir_POST@/netdatacli 2>/dev/null read-config "$conf_file|$section|$key_name"; exit $?)
  83. result="$?"
  84. if [ "${result}" -ne 0 ]; then
  85. echo >&2 "Unable to communicate with Netdata daemon, querying config from disk instead."
  86. NETDATA_RUNNING=0
  87. fi
  88. fi
  89. if [ "${NETDATA_RUNNING}" -eq 0 ]; then
  90. config_result=$(@sbindir_POST@/netdata 2>/dev/null -W get2 "$conf_file" "$section" "$key_name" unknown_default)
  91. fi
  92. echo "$config_result"
  93. }
  94. if command -v curl >/dev/null 2>&1 ; then
  95. URLTOOL="curl"
  96. elif command -v wget >/dev/null 2>&1 ; then
  97. URLTOOL="wget"
  98. else
  99. echo >&2 "I need curl or wget to proceed, but neither is available on this system."
  100. exit 3
  101. fi
  102. if ! command -v openssl >/dev/null 2>&1 ; then
  103. echo >&2 "I need openssl to proceed, but it is not available on this system."
  104. exit 3
  105. fi
  106. # shellcheck disable=SC2050
  107. if [ "@enable_cloud_POST@" = "no" ]; then
  108. echo >&2 "This agent was built with --disable-cloud and cannot be claimed"
  109. exit 3
  110. fi
  111. # shellcheck disable=SC2050
  112. if [ "@enable_aclk_POST@" != "yes" ]; then
  113. echo >&2 "This agent was built without the dependencies for Cloud and cannot be claimed"
  114. exit 3
  115. fi
  116. # -----------------------------------------------------------------------------
  117. # defaults to allow running this script by hand
  118. [ -z "${NETDATA_VARLIB_DIR}" ] && NETDATA_VARLIB_DIR="@varlibdir_POST@"
  119. MACHINE_GUID_FILE="@registrydir_POST@/netdata.public.unique.id"
  120. CLAIMING_DIR="${NETDATA_VARLIB_DIR}/cloud.d"
  121. TOKEN="unknown"
  122. URL_BASE=$(get_config_value cloud global "cloud base url")
  123. [ -z "$URL_BASE" ] && URL_BASE="https://app.netdata.cloud" # Cover post-install with --dont-start
  124. ID="unknown"
  125. ROOMS=""
  126. [ -z "$HOSTNAME" ] && HOSTNAME=$(hostname)
  127. CLOUD_CERTIFICATE_FILE="${CLAIMING_DIR}/cloud_fullchain.pem"
  128. VERBOSE=0
  129. INSECURE=0
  130. RELOAD=1
  131. NETDATA_USER=$(get_config_value netdata global "run as user")
  132. [ -z "$EUID" ] && EUID="$(id -u)"
  133. gen_id() {
  134. local id
  135. if command -v uuidgen > /dev/null 2>&1; then
  136. id="$(uuidgen | tr '[:upper:]' '[:lower:]')"
  137. elif [ -r /proc/sys/kernel/random/uuid ]; then
  138. id="$(cat /proc/sys/kernel/random/uuid)"
  139. else
  140. echo >&2 "Unable to generate machine ID."
  141. exit 18
  142. fi
  143. if [ "${id}" = "8a795b0c-2311-11e6-8563-000c295076a6" ] || [ "${id}" = "4aed1458-1c3e-11e6-a53f-000c290fc8f5" ]; then
  144. gen_id
  145. else
  146. echo "${id}"
  147. fi
  148. }
  149. # get the MACHINE_GUID by default
  150. if [ -r "${MACHINE_GUID_FILE}" ]; then
  151. ID="$(cat "${MACHINE_GUID_FILE}")"
  152. MGUID=$ID
  153. elif [ -f "${MACHINE_GUID_FILE}" ]; then
  154. echo >&2 "netdata.public.unique.id is not readable. Please make sure you have rights to read it (Filename: ${MACHINE_GUID_FILE})."
  155. exit 18
  156. else
  157. if mkdir -p "${MACHINE_GUID_FILE%/*}" && echo -n "$(gen_id)" > "${MACHINE_GUID_FILE}"; then
  158. ID="$(cat "${MACHINE_GUID_FILE}")"
  159. MGUID=$ID
  160. else
  161. echo >&2 "Failed to write new machine GUID. Please make sure you have rights to write to ${MACHINE_GUID_FILE}."
  162. exit 18
  163. fi
  164. fi
  165. # get token from file
  166. if [ -r "${CLAIMING_DIR}/token" ]; then
  167. TOKEN="$(cat "${CLAIMING_DIR}/token")"
  168. fi
  169. # get rooms from file
  170. if [ -r "${CLAIMING_DIR}/rooms" ]; then
  171. ROOMS="$(cat "${CLAIMING_DIR}/rooms")"
  172. fi
  173. variable_to_set=
  174. for arg in "$@"
  175. do
  176. if [ -z "$variable_to_set" ]; then
  177. case $arg in
  178. --claim-token) variable_to_set="TOKEN" ;;
  179. --claim-rooms) variable_to_set="ROOMS" ;;
  180. --claim-url) variable_to_set="URL_BASE" ;;
  181. -token=*) TOKEN=${arg:7} ;;
  182. -url=*) [ -n "${arg:5}" ] && URL_BASE=${arg:5} ;;
  183. -id=*) ID=$(echo "${arg:4}" | tr '[:upper:]' '[:lower:]');;
  184. -rooms=*) ROOMS=${arg:7} ;;
  185. -hostname=*) HOSTNAME=${arg:10} ;;
  186. -verbose) VERBOSE=1 ;;
  187. -insecure) INSECURE=1 ;;
  188. -proxy=*) PROXY=${arg:7} ;;
  189. -noproxy) NOPROXY=yes ;;
  190. -noreload) RELOAD=0 ;;
  191. -user=*) NETDATA_USER=${arg:6} ;;
  192. -daemon-not-running) NETDATA_RUNNING=0 ;;
  193. *) echo >&2 "Unknown argument ${arg}"
  194. exit 1 ;;
  195. esac
  196. else
  197. case "$variable_to_set" in
  198. TOKEN) TOKEN="$arg" ;;
  199. ROOMS) ROOMS="$arg" ;;
  200. URL_BASE) URL_BASE="$arg" ;;
  201. esac
  202. variable_to_set=
  203. fi
  204. shift 1
  205. done
  206. if [ "$EUID" != "0" ] && [ "$(whoami)" != "$NETDATA_USER" ]; then
  207. echo >&2 "This script must be run by the $NETDATA_USER user account"
  208. exit 6
  209. fi
  210. # if curl not installed give warning SOCKS can't be used
  211. if [[ "${URLTOOL}" != "curl" && "${PROXY:0:5}" = socks ]] ; then
  212. echo >&2 "wget doesn't support SOCKS. Please install curl or disable SOCKS proxy."
  213. exit 1
  214. fi
  215. echo >&2 "Token: ****************"
  216. echo >&2 "Base URL: $URL_BASE"
  217. echo >&2 "Id: $ID"
  218. echo >&2 "Rooms: $ROOMS"
  219. echo >&2 "Hostname: $HOSTNAME"
  220. echo >&2 "Proxy: $PROXY"
  221. echo >&2 "Netdata user: $NETDATA_USER"
  222. # create the claiming directory for this user
  223. if [ ! -d "${CLAIMING_DIR}" ] ; then
  224. mkdir -p "${CLAIMING_DIR}" && chmod 0770 "${CLAIMING_DIR}"
  225. # shellcheck disable=SC2181
  226. if [ $? -ne 0 ] ; then
  227. echo >&2 "Failed to create claiming working directory ${CLAIMING_DIR}"
  228. exit 2
  229. fi
  230. fi
  231. if [ ! -w "${CLAIMING_DIR}" ] ; then
  232. echo >&2 "No write permission in claiming working directory ${CLAIMING_DIR}"
  233. exit 2
  234. fi
  235. if [ ! -f "${CLAIMING_DIR}/private.pem" ] ; then
  236. echo >&2 "Generating private/public key for the first time."
  237. if ! openssl genrsa -out "${CLAIMING_DIR}/private.pem" 2048 ; then
  238. echo >&2 "Failed to generate private/public key pair."
  239. exit 2
  240. fi
  241. fi
  242. if [ ! -f "${CLAIMING_DIR}/public.pem" ] ; then
  243. echo >&2 "Extracting public key from private key."
  244. if ! openssl rsa -in "${CLAIMING_DIR}/private.pem" -outform PEM -pubout -out "${CLAIMING_DIR}/public.pem" ; then
  245. echo >&2 "Failed to extract public key."
  246. exit 2
  247. fi
  248. fi
  249. TARGET_URL="${URL_BASE%/}/api/v1/spaces/nodes/${ID}"
  250. # shellcheck disable=SC2002
  251. KEY=$(cat "${CLAIMING_DIR}/public.pem" | tr '\n' '!' | sed -e 's/!/\\n/g')
  252. # shellcheck disable=SC2001
  253. [ -n "$ROOMS" ] && ROOMS=\"$(echo "$ROOMS" | sed s'/,/", "/g')\"
  254. cat > "${CLAIMING_DIR}/tmpin.txt" <<EMBED_JSON
  255. {
  256. "node": {
  257. "id": "$ID",
  258. "hostname": "$HOSTNAME"
  259. },
  260. "token": "$TOKEN",
  261. "rooms" : [ $ROOMS ],
  262. "publicKey" : "$KEY",
  263. "mGUID" : "$MGUID"
  264. }
  265. EMBED_JSON
  266. if [ "${VERBOSE}" == 1 ] ; then
  267. echo "Request to server:"
  268. cat "${CLAIMING_DIR}/tmpin.txt"
  269. fi
  270. if [ "${URLTOOL}" = "curl" ] ; then
  271. URLCOMMAND="curl --connect-timeout 30 --retry 0 -s -i -X PUT -d \"@${CLAIMING_DIR}/tmpin.txt\""
  272. if [ "${NOPROXY}" = "yes" ] ; then
  273. URLCOMMAND="${URLCOMMAND} -x \"\""
  274. elif [ -n "${PROXY}" ] ; then
  275. URLCOMMAND="${URLCOMMAND} -x \"${PROXY}\""
  276. fi
  277. else
  278. URLCOMMAND="wget -T 15 -O - -q --server-response --content-on-error=on --method=PUT \
  279. --body-file=\"${CLAIMING_DIR}/tmpin.txt\""
  280. if [ "${NOPROXY}" = "yes" ] ; then
  281. URLCOMMAND="${URLCOMMAND} --no-proxy"
  282. elif [ "${PROXY:0:4}" = http ] ; then
  283. URLCOMMAND="export http_proxy=${PROXY}; ${URLCOMMAND}"
  284. fi
  285. fi
  286. if [ "${INSECURE}" == 1 ] ; then
  287. if [ "${URLTOOL}" = "curl" ] ; then
  288. URLCOMMAND="${URLCOMMAND} --insecure"
  289. else
  290. URLCOMMAND="${URLCOMMAND} --no-check-certificate"
  291. fi
  292. fi
  293. if [ -r "${CLOUD_CERTIFICATE_FILE}" ] ; then
  294. if [ "${URLTOOL}" = "curl" ] ; then
  295. URLCOMMAND="${URLCOMMAND} --cacert \"${CLOUD_CERTIFICATE_FILE}\""
  296. else
  297. URLCOMMAND="${URLCOMMAND} --ca-certificate \"${CLOUD_CERTIFICATE_FILE}\""
  298. fi
  299. fi
  300. if [ "${VERBOSE}" == 1 ]; then
  301. echo "${URLCOMMAND} \"${TARGET_URL}\""
  302. fi
  303. attempt_contact () {
  304. if [ "${URLTOOL}" = "curl" ] ; then
  305. eval "${URLCOMMAND} \"${TARGET_URL}\"" >"${CLAIMING_DIR}/tmpout.txt"
  306. else
  307. eval "${URLCOMMAND} \"${TARGET_URL}\"" >"${CLAIMING_DIR}/tmpout.txt" 2>&1
  308. fi
  309. URLCOMMAND_EXIT_CODE=$?
  310. if [ "${URLTOOL}" = "wget" ] && [ "${URLCOMMAND_EXIT_CODE}" -eq 8 ] ; then
  311. # We consider the server issuing an error response a successful attempt at communicating
  312. URLCOMMAND_EXIT_CODE=0
  313. fi
  314. # Check if URLCOMMAND connected and received reply
  315. if [ "${URLCOMMAND_EXIT_CODE}" -ne 0 ] ; then
  316. echo >&2 "Failed to connect to ${URL_BASE}, return code ${URLCOMMAND_EXIT_CODE}"
  317. rm -f "${CLAIMING_DIR}/tmpout.txt"
  318. return 4
  319. fi
  320. if [ "${VERBOSE}" == 1 ] ; then
  321. echo "Response from server:"
  322. cat "${CLAIMING_DIR}/tmpout.txt"
  323. fi
  324. return 0
  325. }
  326. for i in {1..3}
  327. do
  328. if attempt_contact ; then
  329. echo "Connection attempt $i successful"
  330. break
  331. fi
  332. echo "Connection attempt $i failed. Retry in ${i}s."
  333. if [ "$i" -eq 5 ] ; then
  334. rm -f "${CLAIMING_DIR}/tmpin.txt"
  335. exit 4
  336. fi
  337. sleep "$i"
  338. done
  339. rm -f "${CLAIMING_DIR}/tmpin.txt"
  340. ERROR_KEY=$(grep "\"errorMsgKey\":" "${CLAIMING_DIR}/tmpout.txt" | awk -F "errorMsgKey\":\"" '{print $2}' | awk -F "\"" '{print $1}')
  341. case ${ERROR_KEY} in
  342. "ErrInvalidNodeID") EXIT_CODE=8 ;;
  343. "ErrInvalidNodeName") EXIT_CODE=9 ;;
  344. "ErrInvalidRoomID") EXIT_CODE=10 ;;
  345. "ErrInvalidPublicKey") EXIT_CODE=11 ;;
  346. "ErrForbidden") EXIT_CODE=12 ;;
  347. "ErrAlreadyClaimed") EXIT_CODE=13 ;;
  348. "ErrProcessingClaim") EXIT_CODE=14 ;;
  349. "ErrInternalServerError") EXIT_CODE=15 ;;
  350. "ErrGatewayTimeout") EXIT_CODE=16 ;;
  351. "ErrServiceUnavailable") EXIT_CODE=17 ;;
  352. *) EXIT_CODE=7 ;;
  353. esac
  354. HTTP_STATUS_CODE=$(grep "HTTP" "${CLAIMING_DIR}/tmpout.txt" | tail -1 | awk -F " " '{print $2}')
  355. if [ "${HTTP_STATUS_CODE}" = "204" ] ; then
  356. EXIT_CODE=0
  357. fi
  358. if [ "${HTTP_STATUS_CODE}" = "204" ] || [ "${ERROR_KEY}" = "ErrAlreadyClaimed" ] ; then
  359. rm -f "${CLAIMING_DIR}/tmpout.txt"
  360. if [ "${HTTP_STATUS_CODE}" = "204" ] ; then
  361. echo -n "${ID}" >"${CLAIMING_DIR}/claimed_id" || (echo >&2 "Claiming failed"; set -e; exit 2)
  362. fi
  363. rm -f "${CLAIMING_DIR}/token" || (echo >&2 "Claiming failed"; set -e; exit 2)
  364. # Rewrite the cloud.conf on the disk
  365. cat > "$CLAIMING_DIR/cloud.conf" <<HERE_DOC
  366. [global]
  367. enabled = yes
  368. cloud base url = $URL_BASE
  369. HERE_DOC
  370. if [ "$EUID" == "0" ]; then
  371. chown -R "${NETDATA_USER}:${NETDATA_USER}" "${CLAIMING_DIR}" || (echo >&2 "Claiming failed"; set -e; exit 2)
  372. fi
  373. if [ "${RELOAD}" == "0" ] ; then
  374. exit $EXIT_CODE
  375. fi
  376. if [ -z "${PROXY}" ]; then
  377. PROXYMSG=""
  378. else
  379. PROXYMSG="You have attempted to claim this node through a proxy - please update your the proxy setting in your netdata.conf to ${PROXY}. "
  380. fi
  381. # Update cloud.conf in the agent memory
  382. @sbindir_POST@/netdatacli write-config 'cloud|global|enabled|yes' && \
  383. @sbindir_POST@/netdatacli write-config "cloud|global|cloud base url|$URL_BASE" && \
  384. @sbindir_POST@/netdatacli reload-claiming-state && \
  385. if [ "${HTTP_STATUS_CODE}" = "204" ] ; then
  386. echo >&2 "${PROXYMSG}Node was successfully claimed."
  387. else
  388. echo >&2 "The agent cloud base url is set to the url provided."
  389. echo >&2 "The cloud may have different credentials already registered for this agent ID and it cannot be reclaimed under different credentials for security reasons. If you are unable to connect use -id=\$(uuidgen) to overwrite this agent ID with a fresh value if the original credentials cannot be restored."
  390. echo >&2 "${PROXYMSG}Failed to claim node with the following error message:\"${ERROR_MESSAGES[$EXIT_CODE]}\""
  391. fi && exit $EXIT_CODE
  392. if [ "${ERROR_KEY}" = "ErrAlreadyClaimed" ] ; then
  393. echo >&2 "The cloud may have different credentials already registered for this agent ID and it cannot be reclaimed under different credentials for security reasons. If you are unable to connect use -id=\$(uuidgen) to overwrite this agent ID with a fresh value if the original credentials cannot be restored."
  394. echo >&2 "${PROXYMSG}Failed to claim node with the following error message:\"${ERROR_MESSAGES[$EXIT_CODE]}\""
  395. exit $EXIT_CODE
  396. fi
  397. echo >&2 "${PROXYMSG}The claim was successful but the agent could not be notified ($?)- it requires a restart to connect to the cloud."
  398. [ "$NETDATA_RUNNING" -eq 0 ] && exit 0 || exit 5
  399. fi
  400. echo >&2 "Failed to claim node with the following error message:\"${ERROR_MESSAGES[$EXIT_CODE]}\""
  401. if [ "${VERBOSE}" == 1 ]; then
  402. echo >&2 "Error key was:\"${ERROR_KEYS[$EXIT_CODE]}\""
  403. fi
  404. rm -f "${CLAIMING_DIR}/tmpout.txt"
  405. exit $EXIT_CODE