devservices_healthcheck.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. from __future__ import annotations
  2. import argparse
  3. import os
  4. import subprocess
  5. import time
  6. from collections.abc import Sequence
  7. from typing import Callable
  8. class HealthcheckError(Exception):
  9. pass
  10. class HealthCheck:
  11. def __init__(
  12. self,
  13. service_id: str,
  14. container_name: str,
  15. check_by_default: bool,
  16. check: Callable[[], object] | None = None,
  17. deps: list[str] | None = None,
  18. retries: int = 3,
  19. timeout_secs: int = 5,
  20. ):
  21. self.service_id = service_id
  22. self.container_name = container_name
  23. self.check_by_default = check_by_default
  24. self.check = check
  25. self.deps = deps or []
  26. self.retries = retries
  27. self.timeout_secs = timeout_secs
  28. def check_container(self) -> None:
  29. response = subprocess.run(
  30. ("docker", "container", "inspect", "-f", "{{.State.Status}}", self.container_name),
  31. capture_output=True,
  32. text=True,
  33. )
  34. if response.stdout.strip() != "running":
  35. raise HealthcheckError(f"Container '{self.container_name}' is not running.")
  36. def check_kafka():
  37. subprocess.run(
  38. (
  39. "docker",
  40. "exec",
  41. "sentry_kafka",
  42. "kafka-topics",
  43. "--zookeeper",
  44. "sentry_zookeeper:2181",
  45. "--list",
  46. ),
  47. check=True,
  48. )
  49. def check_postgres() -> None:
  50. subprocess.run(
  51. ("docker", "exec", "sentry_postgres", "pg_isready", "-U", "postgres"), check=True
  52. )
  53. # Available health checks
  54. all_service_healthchecks = {
  55. "postgres": HealthCheck(
  56. "postgres",
  57. "sentry_postgres",
  58. True,
  59. check_postgres,
  60. ),
  61. "kafka": HealthCheck(
  62. "kafka",
  63. "sentry_kafka",
  64. os.getenv("NEED_KAFKA") == "true",
  65. check_kafka,
  66. deps=["zookeeper"],
  67. ),
  68. "zookeeper": HealthCheck(
  69. "zookeeper",
  70. "sentry_zookeeper",
  71. os.getenv("NEED_KAFKA") == "true",
  72. ),
  73. }
  74. def run_with_retries(cmd: Callable[[], object], retries: int, timeout: int) -> None:
  75. for retry in range(1, retries + 1):
  76. try:
  77. cmd()
  78. except (HealthcheckError, subprocess.CalledProcessError) as e:
  79. if retry == retries:
  80. print(f"Command failed, no more retries: {e}")
  81. raise HealthcheckError(f"Command failed: {e}")
  82. else:
  83. print(f"Command failed, retrying in {timeout}s (attempt {retry+1} of {retries})...")
  84. time.sleep(timeout)
  85. else:
  86. return
  87. def get_services_to_check(id: str) -> list[str]:
  88. checks = []
  89. hc = all_service_healthchecks[id]
  90. for dep in hc.deps:
  91. dep_checks = get_services_to_check(dep)
  92. for d in dep_checks:
  93. checks.append(d)
  94. checks.append(id)
  95. return checks
  96. def check_health(service_ids: list[str]) -> None:
  97. checks = [
  98. check_id for service_id in service_ids for check_id in get_services_to_check(service_id)
  99. ]
  100. # dict.fromkeys is used to remove duplicates while maintaining order
  101. unique_checks = list(dict.fromkeys(checks))
  102. for name in unique_checks:
  103. print(f"Checking service {name}")
  104. hc = all_service_healthchecks[name]
  105. print(f"Checking '{hc.container_name}' is running...")
  106. ls = " ".join(unique_checks)
  107. try:
  108. run_with_retries(hc.check_container, hc.retries, hc.timeout_secs)
  109. except HealthcheckError:
  110. raise HealthcheckError(
  111. f"Container '{hc.container_name}' is not running.\n"
  112. f" Start service: sentry devservices up {hc.service_id}\n"
  113. f" Restart all services: sentry devservices down {ls} && sentry devservices up {ls}"
  114. )
  115. if hc.check is not None:
  116. print(f"Checking '{hc.container_name}' container health...")
  117. try:
  118. run_with_retries(hc.check, hc.retries, hc.timeout_secs)
  119. except HealthcheckError:
  120. raise HealthcheckError(
  121. f"Container '{hc.container_name}' does not appear to be healthy.\n"
  122. f" Restart service: sentry devservices down {hc.service_id} && sentry devservices up {hc.service_id}\n"
  123. f" Restart all services: sentry devservices down {ls} && sentry devservices up {ls}"
  124. )
  125. def main(argv: Sequence[str] | None = None) -> None:
  126. parser = argparse.ArgumentParser()
  127. parser.add_argument(
  128. "--service",
  129. action="append",
  130. choices=list(dict.fromkeys(all_service_healthchecks)),
  131. help="The services you wish to check on. Defaults to all services.",
  132. )
  133. args = parser.parse_args(argv)
  134. healthchecks = args.service
  135. if healthchecks is None:
  136. healthchecks = [k for k, v in all_service_healthchecks.items() if v.check_by_default]
  137. try:
  138. check_health(healthchecks)
  139. except HealthcheckError as e:
  140. raise SystemExit(e)
# Run the health checks only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()