devservices_healthcheck.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. from __future__ import annotations
  2. import argparse
  3. import os
  4. import subprocess
  5. import time
  6. from collections.abc import Sequence
  7. from typing import Callable
  class HealthcheckError(Exception):
      """Raised when a devservice container is not running or fails its health probe."""
  class HealthCheck:
      """Describe how to verify that one devservice container is up and healthy.

      Instances are plain records plus one helper, ``check_container``, that asks
      Docker whether the container is currently running.
      """

      def __init__(
          self,
          service_id: str,
          container_name: str,
          check_by_default: bool,
          check: Callable[[], object] | None = None,
          deps: list[str] | None = None,
          retries: int = 3,
          timeout_secs: int = 5,
      ):
          """Store the health-check configuration.

          Args:
              service_id: identifier of the service this check belongs to.
              container_name: name of the Docker container to inspect.
              check_by_default: whether the service is checked when the caller
                  does not name any service explicitly.
              check: optional zero-argument probe run after the container is
                  confirmed running; ``None`` means no additional probe.
              deps: ids of services that must be checked before this one;
                  a falsy value is stored as an empty list.
              retries: attempt count handed to the retry helper.
              timeout_secs: delay in seconds handed to the retry helper.
          """
          # Identity of the service and of its container.
          self.service_id, self.container_name = service_id, container_name
          # Selection and ordering information.
          self.check_by_default = check_by_default
          self.deps = deps or []
          # Optional probe and its retry policy.
          self.check = check
          self.retries, self.timeout_secs = retries, timeout_secs

      def check_container(self) -> None:
          """Raise ``HealthcheckError`` unless Docker reports the container as running.

          The container state is read from the stdout of ``docker container
          inspect``; any output other than ``running`` (including empty output)
          counts as "not running". The command's exit status is not checked.
          """
          inspect_cmd = (
              "docker",
              "container",
              "inspect",
              "-f",
              "{{.State.Status}}",
              self.container_name,
          )
          result = subprocess.run(inspect_cmd, capture_output=True, text=True)
          status = result.stdout.strip()
          if status == "running":
              return
          raise HealthcheckError(f"Container '{self.container_name}' is not running.")
  def check_kafka() -> None:
      """Probe Kafka by listing topics from inside the ``sentry_kafka`` container.

      Runs ``kafka-topics --bootstrap-server 127.0.0.1:9092 --list`` through
      ``docker exec``. The command's output is not captured, so it goes to the
      caller's stdout/stderr.

      Raises:
          subprocess.CalledProcessError: if the command exits with a non-zero
              status (``check=True``).
      """
      subprocess.run(
          (
              "docker",
              "exec",
              "sentry_kafka",
              "kafka-topics",
              "--bootstrap-server",
              "127.0.0.1:9092",
              "--list",
          ),
          check=True,
      )
  def check_postgres() -> None:
      """Probe Postgres by running ``pg_isready`` inside the ``sentry_postgres`` container.

      Raises:
          subprocess.CalledProcessError: if the command exits with a non-zero
              status (``check=True``).
      """
      pg_isready_cmd = ("docker", "exec", "sentry_postgres", "pg_isready", "-U", "postgres")
      subprocess.run(pg_isready_cmd, check=True)
  # Available health checks, keyed by service id.
  all_service_healthchecks = {
      "postgres": HealthCheck(
          service_id="postgres",
          container_name="sentry_postgres",
          check_by_default=True,
          check=check_postgres,
      ),
      "kafka": HealthCheck(
          service_id="kafka",
          container_name="sentry_kafka",
          # Evaluated once at import time: Kafka is a default check only when
          # the NEED_KAFKA environment variable is exactly "true".
          check_by_default=os.getenv("NEED_KAFKA") == "true",
          check=check_kafka,
      ),
  }
  def run_with_retries(cmd: Callable[[], object], retries: int, timeout: int) -> None:
      """Call ``cmd`` until it succeeds or the attempt budget is used up.

      Args:
          cmd: zero-argument callable; it succeeds by returning without raising.
          retries: total number of attempts (not the number of *extra*
              attempts). Values below 1 are treated as 1 so that a check is
              never reported as passing without having been run.
          timeout: seconds to sleep after a failed attempt before the next one.

      Raises:
          HealthcheckError: if the final attempt raised ``HealthcheckError`` or
              ``subprocess.CalledProcessError``; the original exception is
              attached as the cause. Any other exception type propagates
              immediately without a retry.
      """
      # Guard against retries <= 0, which would otherwise skip the loop entirely
      # and return as if the command had succeeded.
      attempts = max(retries, 1)
      for attempt in range(1, attempts + 1):
          try:
              cmd()
          except (HealthcheckError, subprocess.CalledProcessError) as e:
              if attempt == attempts:
                  print(f"Command failed, no more retries: {e}")
                  raise HealthcheckError(f"Command failed: {e}") from e
              print(f"Command failed, retrying in {timeout}s (attempt {attempt+1} of {attempts})...")
              time.sleep(timeout)
          else:
              return
  def get_services_to_check(id: str) -> list[str]:
      """Return the service ids to check for ``id``, dependencies first.

      Each dependency is expanded recursively, so the result lists every
      transitive dependency before the service that needs it and ends with
      ``id`` itself. Duplicates are not removed here.

      Raises:
          KeyError: if ``id`` (or one of its dependencies) has no entry in
              ``all_service_healthchecks``.
      """
      healthcheck = all_service_healthchecks[id]
      ordered = [
          service
          for dep_id in healthcheck.deps
          for service in get_services_to_check(dep_id)
      ]
      ordered.append(id)
      return ordered
  def check_health(service_ids: list[str]) -> None:
      """Verify that the given services and their dependencies are running and healthy.

      For each service (dependencies first, duplicates removed) this first
      confirms the container is running and then, if the service defines a
      ``check`` probe, runs that probe. Both steps go through
      ``run_with_retries`` with the service's own retry settings. Progress is
      printed to stdout.

      Args:
          service_ids: ids of the services to check; an empty list is a no-op.

      Raises:
          HealthcheckError: on the first service whose container is not running
              or whose probe fails. The message includes the commands to restart
              it, and the underlying failure is attached as the cause.
          KeyError: if a service id is not present in ``all_service_healthchecks``.
      """
      checks = [
          check_id for service_id in service_ids for check_id in get_services_to_check(service_id)
      ]
      # dict.fromkeys is used to remove duplicates while maintaining order
      unique_checks = list(dict.fromkeys(checks))
      # The "restart all services" hint is the same for every service, so build
      # it once instead of on every loop iteration.
      ls = " ".join(unique_checks)

      for name in unique_checks:
          print(f"Checking service {name}")
          hc = all_service_healthchecks[name]

          print(f"Checking '{hc.container_name}' is running...")
          try:
              run_with_retries(hc.check_container, hc.retries, hc.timeout_secs)
          except HealthcheckError as err:
              raise HealthcheckError(
                  f"Container '{hc.container_name}' is not running.\n"
                  f" Start service: sentry devservices up {hc.service_id}\n"
                  f" Restart all services: sentry devservices down {ls} && sentry devservices up {ls}"
              ) from err

          if hc.check is not None:
              print(f"Checking '{hc.container_name}' container health...")
              try:
                  run_with_retries(hc.check, hc.retries, hc.timeout_secs)
              except HealthcheckError as err:
                  raise HealthcheckError(
                      f"Container '{hc.container_name}' does not appear to be healthy.\n"
                      f" Restart service: sentry devservices down {hc.service_id} && sentry devservices up {hc.service_id}\n"
                      f" Restart all services: sentry devservices down {ls} && sentry devservices up {ls}"
                  ) from err
  def main(argv: Sequence[str] | None = None) -> None:
      """Command-line entry point: parse ``--service`` options and run the checks.

      Args:
          argv: argument list to parse; ``None`` is passed straight through to
              ``ArgumentParser.parse_args``.

      Raises:
          SystemExit: wrapping the ``HealthcheckError`` when any check fails.
      """
      parser = argparse.ArgumentParser()
      parser.add_argument(
          "--service",
          action="append",
          # Iterating a dict yields its (already unique) keys in insertion order.
          choices=list(all_service_healthchecks),
          help=(
              "The services you wish to check on. "
              "Defaults to all services that are checked by default."
          ),
      )

      args = parser.parse_args(argv)

      healthchecks = args.service
      if healthchecks is None:
          # No --service given: fall back to the services flagged check_by_default.
          healthchecks = [k for k, v in all_service_healthchecks.items() if v.check_by_default]

      try:
          check_health(healthchecks)
      except HealthcheckError as e:
          raise SystemExit(e)
  # Run the health checks only when executed as a script, not when imported.
  if __name__ == "__main__":
      main()