netdata-claim.sh.in 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. #!/usr/bin/env bash
  2. # netdata
  3. # real-time performance and health monitoring, done right!
  4. # (C) 2017 Costa Tsaousis <costa@tsaousis.gr>
  5. # SPDX-License-Identifier: GPL-3.0-or-later
  6. # Exit code: 0 - Success
  7. # Exit code: 1 - Unknown argument
  8. # Exit code: 2 - Problems with claiming working directory
  9. # Exit code: 3 - Missing dependencies
  10. # Exit code: 4 - Failure to connect to endpoint
  11. # Exit code: 5 - The CLI didn't work
  12. # Exit code: 6 - Wrong user
  13. # Exit code: 7 - Unknown HTTP error message
  14. #
  15. # OK: Agent claimed successfully
  16. # HTTP Status code: 204
  17. # Exit code: 0
  18. #
  19. # Unknown HTTP error message
  20. # HTTP Status code: 422
  21. # Exit code: 7
  22. ERROR_KEYS[7]="None"
  23. ERROR_MESSAGES[7]="Unknown HTTP error message"
  24. # Error: The agent id is invalid; it does not fulfill the constraints
  25. # HTTP Status code: 422
  26. # Exit code: 8
  27. ERROR_KEYS[8]="ErrInvalidNodeID"
  28. ERROR_MESSAGES[8]="invalid node id"
  29. # Error: The agent hostname is invalid; it does not fulfill the constraints
  30. # HTTP Status code: 422
  31. # Exit code: 9
  32. ERROR_KEYS[9]="ErrInvalidNodeName"
  33. ERROR_MESSAGES[9]="invalid node name"
  34. # Error: At least one of the given rooms ids is invalid; it does not fulfill the constraints
  35. # HTTP Status code: 422
  36. # Exit code: 10
  37. ERROR_KEYS[10]="ErrInvalidRoomID"
  38. ERROR_MESSAGES[10]="invalid room id"
  39. # Error: Invalid public key; the public key is empty or not present
  40. # HTTP Status code: 422
  41. # Exit code: 11
  42. ERROR_KEYS[11]="ErrInvalidPublicKey"
  43. ERROR_MESSAGES[11]="invalid public key"
  44. #
  45. # Error: Expired, missing or invalid token
  46. # HTTP Status code: 403
  47. # Exit code: 12
  48. ERROR_KEYS[12]="ErrForbidden"
  49. ERROR_MESSAGES[12]="token expired/token not found/invalid token"
  50. # Error: Duplicate agent id; an agent with the same id is already registered in the cloud
  51. # HTTP Status code: 409
  52. # Exit code: 13
  53. ERROR_KEYS[13]="ErrAlreadyClaimed"
  54. ERROR_MESSAGES[13]="already claimed"
  55. # Error: The node claiming process is still in progress.
  56. # HTTP Status code: 102
  57. # Exit code: 14
  58. ERROR_KEYS[14]="ErrProcessingClaim"
  59. ERROR_MESSAGES[14]="processing claiming"
  60. # Error: Internal server error. Any other unexpected error (DB problems, etc.)
  61. # HTTP Status code: 500
  62. # Exit code: 15
  63. ERROR_KEYS[15]="ErrInternalServerError"
  64. ERROR_MESSAGES[15]="Internal Server Error"
  65. # Error: There was a timeout processing the claim.
  66. # HTTP Status code: 504
  67. # Exit code: 16
  68. ERROR_KEYS[16]="ErrGatewayTimeout"
  69. ERROR_MESSAGES[16]="Gateway Timeout"
  70. # Error: The service cannot handle the claiming request at this time.
  71. # HTTP Status code: 503
  72. # Exit code: 17
  73. ERROR_KEYS[17]="ErrServiceUnavailable"
  74. ERROR_MESSAGES[17]="Service Unavailable"
  75. # Exit code: 18 - Agent unique id is not generated yet.
  76. NETDATA_RUNNING=1
  77. get_config_value() {
  78. conf_file="${1}"
  79. section="${2}"
  80. key_name="${3}"
  81. if [ "${NETDATA_RUNNING}" -eq 1 ]; then
  82. config_result=$(@sbindir_POST@/netdatacli 2>/dev/null read-config "$conf_file|$section|$key_name"; exit $?)
  83. result="$?"
  84. if [ "${result}" -ne 0 ]; then
  85. echo >&2 "Unable to communicate with Netdata daemon, querying config from disk instead."
  86. NETDATA_RUNNING=0
  87. fi
  88. fi
  89. if [ "${NETDATA_RUNNING}" -eq 0 ]; then
  90. config_result=$(@sbindir_POST@/netdata 2>/dev/null -W get2 "$conf_file" "$section" "$key_name" unknown_default)
  91. fi
  92. echo "$config_result"
  93. }
  94. if command -v curl >/dev/null 2>&1 ; then
  95. URLTOOL="curl"
  96. elif command -v wget >/dev/null 2>&1 ; then
  97. URLTOOL="wget"
  98. else
  99. echo >&2 "I need curl or wget to proceed, but neither is available on this system."
  100. exit 3
  101. fi
  102. if ! command -v openssl >/dev/null 2>&1 ; then
  103. echo >&2 "I need openssl to proceed, but it is not available on this system."
  104. exit 3
  105. fi
  106. # shellcheck disable=SC2050
  107. if [ "@enable_cloud_POST@" = "no" ]; then
  108. echo >&2 "This agent was built with --disable-cloud and cannot be claimed"
  109. exit 3
  110. fi
  111. # shellcheck disable=SC2050
  112. if [ "@enable_aclk_POST@" != "yes" ]; then
  113. echo >&2 "This agent was built without the dependencies for Cloud and cannot be claimed"
  114. exit 3
  115. fi
  116. # -----------------------------------------------------------------------------
  117. # defaults to allow running this script by hand
  118. [ -z "${NETDATA_VARLIB_DIR}" ] && NETDATA_VARLIB_DIR="@varlibdir_POST@"
  119. MACHINE_GUID_FILE="@registrydir_POST@/netdata.public.unique.id"
  120. CLAIMING_DIR="${NETDATA_VARLIB_DIR}/cloud.d"
  121. TOKEN="unknown"
  122. URL_BASE=$(get_config_value cloud global "cloud base url")
  123. [ -z "$URL_BASE" ] && URL_BASE="https://api.netdata.cloud" # Cover post-install with --dont-start
  124. ID="unknown"
  125. ROOMS=""
  126. [ -z "$HOSTNAME" ] && HOSTNAME=$(hostname)
  127. CLOUD_CERTIFICATE_FILE="${CLAIMING_DIR}/cloud_fullchain.pem"
  128. VERBOSE=0
  129. INSECURE=0
  130. RELOAD=1
  131. NETDATA_USER=$(get_config_value netdata global "run as user")
  132. [ -z "$EUID" ] && EUID="$(id -u)"
  133. gen_id() {
  134. local id
  135. if command -v uuidgen > /dev/null 2>&1; then
  136. id="$(uuidgen | tr '[:upper:]' '[:lower:]')"
  137. elif [ -r /proc/sys/kernel/random/uuid ]; then
  138. id="$(cat /proc/sys/kernel/random/uuid)"
  139. else
  140. echo >&2 "Unable to generate machine ID."
  141. exit 18
  142. fi
  143. if [ "${id}" = "8a795b0c-2311-11e6-8563-000c295076a6" ] || [ "${id}" = "4aed1458-1c3e-11e6-a53f-000c290fc8f5" ]; then
  144. gen_id
  145. else
  146. echo "${id}"
  147. fi
  148. }
  149. # get the MACHINE_GUID by default
  150. if [ -r "${MACHINE_GUID_FILE}" ]; then
  151. ID="$(cat "${MACHINE_GUID_FILE}")"
  152. MGUID=$ID
  153. elif [ -f "${MACHINE_GUID_FILE}" ]; then
  154. echo >&2 "netdata.public.unique.id is not readable. Please make sure you have rights to read it (Filename: ${MACHINE_GUID_FILE})."
  155. exit 18
  156. else
  157. if mkdir -p "${MACHINE_GUID_FILE%/*}" && /bin/echo -n "$(gen_id)" > "${MACHINE_GUID_FILE}"; then
  158. ID="$(cat "${MACHINE_GUID_FILE}")"
  159. MGUID=$ID
  160. else
  161. echo >&2 "Failed to write new machine GUID. Please make sure you have rights to write to ${MACHINE_GUID_FILE}."
  162. exit 18
  163. fi
  164. fi
  165. # get token from file
  166. if [ -r "${CLAIMING_DIR}/token" ]; then
  167. TOKEN="$(cat "${CLAIMING_DIR}/token")"
  168. fi
  169. # get rooms from file
  170. if [ -r "${CLAIMING_DIR}/rooms" ]; then
  171. ROOMS="$(cat "${CLAIMING_DIR}/rooms")"
  172. fi
  173. for arg in "$@"
  174. do
  175. case $arg in
  176. -token=*) TOKEN=${arg:7} ;;
  177. -url=*) [ -n "${arg:5}" ] && URL_BASE=${arg:5} ;;
  178. -id=*) ID=$(echo "${arg:4}" | tr '[:upper:]' '[:lower:]');;
  179. -rooms=*) ROOMS=${arg:7} ;;
  180. -hostname=*) HOSTNAME=${arg:10} ;;
  181. -verbose) VERBOSE=1 ;;
  182. -insecure) INSECURE=1 ;;
  183. -proxy=*) PROXY=${arg:7} ;;
  184. -noproxy) NOPROXY=yes ;;
  185. -noreload) RELOAD=0 ;;
  186. -user=*) NETDATA_USER=${arg:6} ;;
  187. -daemon-not-running) NETDATA_RUNNING=0 ;;
  188. *) echo >&2 "Unknown argument ${arg}"
  189. exit 1 ;;
  190. esac
  191. shift 1
  192. done
  193. if [ "$EUID" != "0" ] && [ "$(whoami)" != "$NETDATA_USER" ]; then
  194. echo >&2 "This script must be run by the $NETDATA_USER user account"
  195. exit 6
  196. fi
  197. # if curl not installed give warning SOCKS can't be used
  198. if [[ "${URLTOOL}" != "curl" && "${PROXY:0:5}" = socks ]] ; then
  199. echo >&2 "wget doesn't support SOCKS. Please install curl or disable SOCKS proxy."
  200. exit 1
  201. fi
  202. echo >&2 "Token: ****************"
  203. echo >&2 "Base URL: $URL_BASE"
  204. echo >&2 "Id: $ID"
  205. echo >&2 "Rooms: $ROOMS"
  206. echo >&2 "Hostname: $HOSTNAME"
  207. echo >&2 "Proxy: $PROXY"
  208. echo >&2 "Netdata user: $NETDATA_USER"
  209. # create the claiming directory for this user
  210. if [ ! -d "${CLAIMING_DIR}" ] ; then
  211. mkdir -p "${CLAIMING_DIR}" && chmod 0770 "${CLAIMING_DIR}"
  212. # shellcheck disable=SC2181
  213. if [ $? -ne 0 ] ; then
  214. echo >&2 "Failed to create claiming working directory ${CLAIMING_DIR}"
  215. exit 2
  216. fi
  217. fi
  218. if [ ! -w "${CLAIMING_DIR}" ] ; then
  219. echo >&2 "No write permission in claiming working directory ${CLAIMING_DIR}"
  220. exit 2
  221. fi
  222. if [ ! -f "${CLAIMING_DIR}/private.pem" ] ; then
  223. echo >&2 "Generating private/public key for the first time."
  224. if ! openssl genrsa -out "${CLAIMING_DIR}/private.pem" 2048 ; then
  225. echo >&2 "Failed to generate private/public key pair."
  226. exit 2
  227. fi
  228. fi
  229. if [ ! -f "${CLAIMING_DIR}/public.pem" ] ; then
  230. echo >&2 "Extracting public key from private key."
  231. if ! openssl rsa -in "${CLAIMING_DIR}/private.pem" -outform PEM -pubout -out "${CLAIMING_DIR}/public.pem" ; then
  232. echo >&2 "Failed to extract public key."
  233. exit 2
  234. fi
  235. fi
  236. TARGET_URL="${URL_BASE%/}/api/v1/spaces/nodes/${ID}"
  237. # shellcheck disable=SC2002
  238. KEY=$(cat "${CLAIMING_DIR}/public.pem" | tr '\n' '!' | sed -e 's/!/\\n/g')
  239. # shellcheck disable=SC2001
  240. [ -n "$ROOMS" ] && ROOMS=\"$(echo "$ROOMS" | sed s'/,/", "/g')\"
  241. cat > "${CLAIMING_DIR}/tmpin.txt" <<EMBED_JSON
  242. {
  243. "node": {
  244. "id": "$ID",
  245. "hostname": "$HOSTNAME"
  246. },
  247. "token": "$TOKEN",
  248. "rooms" : [ $ROOMS ],
  249. "publicKey" : "$KEY",
  250. "mGUID" : "$MGUID"
  251. }
  252. EMBED_JSON
  253. if [ "${VERBOSE}" == 1 ] ; then
  254. echo "Request to server:"
  255. cat "${CLAIMING_DIR}/tmpin.txt"
  256. fi
  257. if [ "${URLTOOL}" = "curl" ] ; then
  258. URLCOMMAND="curl --connect-timeout 30 --retry 0 -s -i -X PUT -d \"@${CLAIMING_DIR}/tmpin.txt\""
  259. if [ "${NOPROXY}" = "yes" ] ; then
  260. URLCOMMAND="${URLCOMMAND} -x \"\""
  261. elif [ -n "${PROXY}" ] ; then
  262. URLCOMMAND="${URLCOMMAND} -x \"${PROXY}\""
  263. fi
  264. else
  265. URLCOMMAND="wget -T 15 -O - -q --server-response --content-on-error=on --method=PUT \
  266. --body-file=\"${CLAIMING_DIR}/tmpin.txt\""
  267. if [ "${NOPROXY}" = "yes" ] ; then
  268. URLCOMMAND="${URLCOMMAND} --no-proxy"
  269. elif [ "${PROXY:0:4}" = http ] ; then
  270. URLCOMMAND="export http_proxy=${PROXY}; ${URLCOMMAND}"
  271. fi
  272. fi
  273. if [ "${INSECURE}" == 1 ] ; then
  274. if [ "${URLTOOL}" = "curl" ] ; then
  275. URLCOMMAND="${URLCOMMAND} --insecure"
  276. else
  277. URLCOMMAND="${URLCOMMAND} --no-check-certificate"
  278. fi
  279. fi
  280. if [ -r "${CLOUD_CERTIFICATE_FILE}" ] ; then
  281. if [ "${URLTOOL}" = "curl" ] ; then
  282. URLCOMMAND="${URLCOMMAND} --cacert \"${CLOUD_CERTIFICATE_FILE}\""
  283. else
  284. URLCOMMAND="${URLCOMMAND} --ca-certificate \"${CLOUD_CERTIFICATE_FILE}\""
  285. fi
  286. fi
  287. if [ "${VERBOSE}" == 1 ]; then
  288. echo "${URLCOMMAND} \"${TARGET_URL}\""
  289. fi
  290. attempt_contact () {
  291. if [ "${URLTOOL}" = "curl" ] ; then
  292. eval "${URLCOMMAND} \"${TARGET_URL}\"" >"${CLAIMING_DIR}/tmpout.txt"
  293. else
  294. eval "${URLCOMMAND} \"${TARGET_URL}\"" >"${CLAIMING_DIR}/tmpout.txt" 2>&1
  295. fi
  296. URLCOMMAND_EXIT_CODE=$?
  297. if [ "${URLTOOL}" = "wget" ] && [ "${URLCOMMAND_EXIT_CODE}" -eq 8 ] ; then
  298. # We consider the server issuing an error response a successful attempt at communicating
  299. URLCOMMAND_EXIT_CODE=0
  300. fi
  301. # Check if URLCOMMAND connected and received reply
  302. if [ "${URLCOMMAND_EXIT_CODE}" -ne 0 ] ; then
  303. echo >&2 "Failed to connect to ${URL_BASE}, return code ${URLCOMMAND_EXIT_CODE}"
  304. rm -f "${CLAIMING_DIR}/tmpout.txt"
  305. return 4
  306. fi
  307. if [ "${VERBOSE}" == 1 ] ; then
  308. echo "Response from server:"
  309. cat "${CLAIMING_DIR}/tmpout.txt"
  310. fi
  311. return 0
  312. }
  313. for i in {1..3}
  314. do
  315. if attempt_contact ; then
  316. echo "Connection attempt $i successful"
  317. break
  318. fi
  319. echo "Connection attempt $i failed. Retry in ${i}s."
  320. if [ "$i" -eq 5 ] ; then
  321. rm -f "${CLAIMING_DIR}/tmpin.txt"
  322. exit 4
  323. fi
  324. sleep "$i"
  325. done
  326. rm -f "${CLAIMING_DIR}/tmpin.txt"
  327. ERROR_KEY=$(grep "\"errorMsgKey\":" "${CLAIMING_DIR}/tmpout.txt" | awk -F "errorMsgKey\":\"" '{print $2}' | awk -F "\"" '{print $1}')
  328. case ${ERROR_KEY} in
  329. "ErrInvalidNodeID") EXIT_CODE=8 ;;
  330. "ErrInvalidNodeName") EXIT_CODE=9 ;;
  331. "ErrInvalidRoomID") EXIT_CODE=10 ;;
  332. "ErrInvalidPublicKey") EXIT_CODE=11 ;;
  333. "ErrForbidden") EXIT_CODE=12 ;;
  334. "ErrAlreadyClaimed") EXIT_CODE=13 ;;
  335. "ErrProcessingClaim") EXIT_CODE=14 ;;
  336. "ErrInternalServerError") EXIT_CODE=15 ;;
  337. "ErrGatewayTimeout") EXIT_CODE=16 ;;
  338. "ErrServiceUnavailable") EXIT_CODE=17 ;;
  339. *) EXIT_CODE=7 ;;
  340. esac
  341. HTTP_STATUS_CODE=$(grep "HTTP" "${CLAIMING_DIR}/tmpout.txt" | tail -1 | awk -F " " '{print $2}')
  342. if [ "${HTTP_STATUS_CODE}" = "204" ] ; then
  343. EXIT_CODE=0
  344. fi
  345. if [ "${HTTP_STATUS_CODE}" = "204" ] || [ "${ERROR_KEY}" = "ErrAlreadyClaimed" ] ; then
  346. rm -f "${CLAIMING_DIR}/tmpout.txt"
  347. if [ "${HTTP_STATUS_CODE}" = "204" ] ; then
  348. echo -n "${ID}" >"${CLAIMING_DIR}/claimed_id" || (echo >&2 "Claiming failed"; set -e; exit 2)
  349. fi
  350. rm -f "${CLAIMING_DIR}/token" || (echo >&2 "Claiming failed"; set -e; exit 2)
  351. # Rewrite the cloud.conf on the disk
  352. cat > "$CLAIMING_DIR/cloud.conf" <<HERE_DOC
  353. [global]
  354. enabled = yes
  355. cloud base url = $URL_BASE
  356. HERE_DOC
  357. if [ "$EUID" == "0" ]; then
  358. chown -R "${NETDATA_USER}:${NETDATA_USER}" ${CLAIMING_DIR} || (echo >&2 "Claiming failed"; set -e; exit 2)
  359. fi
  360. if [ "${RELOAD}" == "0" ] ; then
  361. exit $EXIT_CODE
  362. fi
  363. if [ -z "${PROXY}" ]; then
  364. PROXYMSG=""
  365. else
  366. PROXYMSG="You have attempted to claim this node through a proxy - please update your the proxy setting in your netdata.conf to ${PROXY}. "
  367. fi
  368. # Update cloud.conf in the agent memory
  369. @sbindir_POST@/netdatacli write-config 'cloud|global|enabled|yes' && \
  370. @sbindir_POST@/netdatacli write-config "cloud|global|cloud base url|$URL_BASE" && \
  371. @sbindir_POST@/netdatacli reload-claiming-state && \
  372. if [ "${HTTP_STATUS_CODE}" = "204" ] ; then
  373. echo >&2 "${PROXYMSG}Node was successfully claimed."
  374. else
  375. echo >&2 "The agent cloud base url is set to the url provided."
  376. echo >&2 "The cloud may have different credentials already registered for this agent ID and it cannot be reclaimed under different credentials for security reasons. If you are unable to connect use -id=\$(uuidgen) to overwrite this agent ID with a fresh value if the original credentials cannot be restored."
  377. echo >&2 "${PROXYMSG}Failed to claim node with the following error message:\"${ERROR_MESSAGES[$EXIT_CODE]}\""
  378. fi && exit $EXIT_CODE
  379. if [ "${ERROR_KEY}" = "ErrAlreadyClaimed" ] ; then
  380. echo >&2 "The cloud may have different credentials already registered for this agent ID and it cannot be reclaimed under different credentials for security reasons. If you are unable to connect use -id=\$(uuidgen) to overwrite this agent ID with a fresh value if the original credentials cannot be restored."
  381. echo >&2 "${PROXYMSG}Failed to claim node with the following error message:\"${ERROR_MESSAGES[$EXIT_CODE]}\""
  382. exit $EXIT_CODE
  383. fi
  384. echo >&2 "${PROXYMSG}The claim was successful but the agent could not be notified ($?)- it requires a restart to connect to the cloud."
  385. [ "$NETDATA_RUNNING" -eq 0 ] && exit 0 || exit 5
  386. fi
  387. echo >&2 "Failed to claim node with the following error message:\"${ERROR_MESSAGES[$EXIT_CODE]}\""
  388. if [ "${VERBOSE}" == 1 ]; then
  389. echo >&2 "Error key was:\"${ERROR_KEYS[$EXIT_CODE]}\""
  390. fi
  391. rm -f "${CLAIMING_DIR}/tmpout.txt"
  392. exit $EXIT_CODE