devservices_healthcheck.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. from __future__ import annotations
  2. import argparse
  3. import os
  4. import subprocess
  5. import time
  6. from collections.abc import Sequence
  7. from typing import Callable
  class HealthcheckError(Exception):
      """Raised when a service container is not running or its health probe keeps failing."""
  class HealthCheck:
      """Describes how to verify that one devservice is up.

      Attributes are plain data filled in by the constructor:
      ``service_id`` and ``container_name`` identify the service and its
      docker container, ``check_by_default`` says whether the service is
      checked when none is requested explicitly, ``check`` is an optional
      zero-argument health probe, ``deps`` lists service ids that must be
      checked first, and ``retries`` / ``timeout_secs`` tune the retry loop.
      """

      def __init__(
          self,
          service_id: str,
          container_name: str,
          check_by_default: bool,
          check: Callable[[], object] | None = None,
          deps: list[str] | None = None,
          retries: int = 3,
          timeout_secs: int = 5,
      ):
          self.service_id = service_id
          self.container_name = container_name
          self.check_by_default = check_by_default
          self.check = check
          # A missing (or empty) dependency list becomes a fresh empty list.
          self.deps = deps if deps else []
          self.retries = retries
          self.timeout_secs = timeout_secs

      def check_container(self) -> None:
          """Raise ``HealthcheckError`` unless docker reports the container as running.

          Only the stdout of ``docker container inspect`` is examined, so any
          output other than ``running`` (including no output at all) counts as
          "not running".
          """
          inspect_cmd = (
              "docker",
              "container",
              "inspect",
              "-f",
              "{{.State.Status}}",
              self.container_name,
          )
          result = subprocess.run(inspect_cmd, capture_output=True, text=True)
          status = result.stdout.strip()
          if status == "running":
              return
          raise HealthcheckError(f"Container '{self.container_name}' is not running.")
  def check_kafka() -> None:
      """Probe Kafka by listing its topics from inside the ``sentry_kafka`` container.

      The ZooKeeper address comes from the ``ZK_HOST`` environment variable
      and falls back to ``127.0.0.1:2181``.

      Raises:
          subprocess.CalledProcessError: If the ``docker exec`` command exits
              with a non-zero status (``check=True``).
      """
      subprocess.run(
          (
              "docker",
              "exec",
              "sentry_kafka",
              "kafka-topics",
              "--zookeeper",
              # TODO: sentry_zookeeper:2181 doesn't work in CI, but 127.0.0.1 doesn't work locally
              os.environ.get("ZK_HOST", "127.0.0.1:2181"),
              "--list",
          ),
          check=True,
      )
  def check_postgres() -> None:
      """Probe Postgres by running ``pg_isready`` inside the ``sentry_postgres`` container.

      Raises:
          subprocess.CalledProcessError: If the command exits with a non-zero
              status (``check=True``).
      """
      pg_isready_cmd = ("docker", "exec", "sentry_postgres", "pg_isready", "-U", "postgres")
      subprocess.run(pg_isready_cmd, check=True)
  # Available health checks, keyed by service id.
  # Kafka and ZooKeeper are only checked by default when NEED_KAFKA is "true";
  # the environment variable is read once, when this module is imported.
  all_service_healthchecks = {
      "postgres": HealthCheck(
          service_id="postgres",
          container_name="sentry_postgres",
          check_by_default=True,
          check=check_postgres,
      ),
      "kafka": HealthCheck(
          service_id="kafka",
          container_name="sentry_kafka",
          check_by_default=os.getenv("NEED_KAFKA") == "true",
          check=check_kafka,
          deps=["zookeeper"],
      ),
      "zookeeper": HealthCheck(
          service_id="zookeeper",
          container_name="sentry_zookeeper",
          check_by_default=os.getenv("NEED_KAFKA") == "true",
      ),
  }
  def run_with_retries(cmd: Callable[[], object], retries: int, timeout: int) -> None:
      """Call ``cmd`` until it succeeds or the attempts are used up.

      Args:
          cmd: Zero-argument callable. It signals failure by raising
              ``HealthcheckError`` or ``subprocess.CalledProcessError``; any
              other exception propagates immediately without a retry.
          retries: Total number of attempts. Values below 1 are treated as 1 so
              the command is always executed at least once.
          timeout: Seconds to sleep after a failed attempt before the next one.

      Raises:
          HealthcheckError: If the last attempt fails; chained to the
              underlying error.
      """
      # A non-positive count would skip the loop entirely and report success
      # without ever running the command, so force at least one attempt.
      attempts = max(retries, 1)
      for attempt in range(1, attempts + 1):
          try:
              cmd()
          except (HealthcheckError, subprocess.CalledProcessError) as e:
              if attempt == attempts:
                  print(f"Command failed, no more retries: {e}")
                  raise HealthcheckError(f"Command failed: {e}") from e
              print(
                  f"Command failed, retrying in {timeout}s (attempt {attempt + 1} of {attempts})..."
              )
              time.sleep(timeout)
          else:
              return
  def get_services_to_check(id: str) -> list[str]:
      """Return the service ids to check for ``id``, dependencies first.

      Dependencies are expanded recursively and ``id`` itself comes last. The
      result may contain duplicates when several services share a dependency.

      Raises:
          KeyError: If ``id`` (or one of its dependencies) is not a key of
              ``all_service_healthchecks``.
      """
      ordered: list[str] = []
      for dependency in all_service_healthchecks[id].deps:
          ordered.extend(get_services_to_check(dependency))
      ordered.append(id)
      return ordered
  def check_health(service_ids: list[str]) -> None:
      """Check every requested service, together with its dependencies.

      For each service the container must first be reported as running; if
      the service defines a ``check`` probe, that probe is run as well. Both
      steps go through ``run_with_retries`` with the service's own retry
      settings. Checking stops at the first service that fails.

      Args:
          service_ids: Ids of the services to check (keys of
              ``all_service_healthchecks``).

      Raises:
          HealthcheckError: If a container is not running or its probe keeps
              failing. The message includes restart hints and the error is
              chained to the failure that triggered it.
      """
      checks = [
          check_id for service_id in service_ids for check_id in get_services_to_check(service_id)
      ]
      # dict.fromkeys is used to remove duplicates while maintaining order
      unique_checks = list(dict.fromkeys(checks))
      # Same for every service, so build it once; only used in the restart hints.
      ls = " ".join(unique_checks)

      for name in unique_checks:
          print(f"Checking service {name}")
          hc = all_service_healthchecks[name]

          print(f"Checking '{hc.container_name}' is running...")
          try:
              run_with_retries(hc.check_container, hc.retries, hc.timeout_secs)
          except HealthcheckError as e:
              raise HealthcheckError(
                  f"Container '{hc.container_name}' is not running.\n"
                  f" Start service: sentry devservices up {hc.service_id}\n"
                  f" Restart all services: sentry devservices down {ls} && sentry devservices up {ls}"
              ) from e

          if hc.check is not None:
              print(f"Checking '{hc.container_name}' container health...")
              try:
                  run_with_retries(hc.check, hc.retries, hc.timeout_secs)
              except HealthcheckError as e:
                  raise HealthcheckError(
                      f"Container '{hc.container_name}' does not appear to be healthy.\n"
                      f" Restart service: sentry devservices down {hc.service_id} && sentry devservices up {hc.service_id}\n"
                      f" Restart all services: sentry devservices down {ls} && sentry devservices up {ls}"
                  ) from e
  def main(argv: Sequence[str] | None = None) -> None:
      """Command-line entry point: parse ``--service`` options and run the checks.

      ``--service`` may be given several times. When it is omitted, only the
      services whose ``check_by_default`` flag is set are checked.

      Args:
          argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

      Raises:
          SystemExit: With the failure message if any health check fails.
      """
      parser = argparse.ArgumentParser()
      parser.add_argument(
          "--service",
          action="append",
          choices=list(all_service_healthchecks),
          # The default is NOT every service: only those flagged check_by_default.
          help="The services you wish to check on. Defaults to the services that are checked by default.",
      )

      args = parser.parse_args(argv)

      healthchecks = args.service
      if healthchecks is None:
          healthchecks = [k for k, v in all_service_healthchecks.items() if v.check_by_default]

      try:
          check_health(healthchecks)
      except HealthcheckError as e:
          # Passing the exception makes the interpreter print its message and exit with status 1.
          raise SystemExit(e)


  if __name__ == "__main__":
      main()