
feat(dev): Multi-threaded devservices and move PG health-check out of Docker (#29824)

josh · 3 years ago · parent commit a8e890c8e8

+ 2 - 0
.github/actions/setup-sentry/action.yml

@@ -206,3 +206,5 @@ runs:
         fi
 
         docker ps -a
+
+        ./scripts/devservices-healthcheck.sh
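
This CI step invokes the new health-check script right after `docker ps -a`, so a Postgres that never becomes ready fails the job with an explicit message instead of relying on Docker's own health status. The same script can also be run by hand for local troubleshooting, e.g. `./scripts/devservices-healthcheck.sh` after `sentry devservices up postgres` (a hypothetical local invocation, not part of this change).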

+ 33 - 0
scripts/devservices-healthcheck.sh

@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# This is used for healthchecking because docker running healthchecks
+# keeps the CPU too warm and if a container becomes unhealthy, there's no
+# notification system or anything that would make it more easily visible.
+
+# In the future, more healthchecks should be added to make it more helpful
+# as a troubleshooting script. Right now this is just used by CI.
+
+# TODO: iterate through all sentry_ containers,
+#       and warn on anything that's expected to run but isn't running.
+# TODO: spawn subprocess for each container,
+#       specify distinct retries and other parameters.
+#       would also be nice to have a single ^C work.
+
+try=1
+to=3
+start=0
+delay=5
+
+sleep "$start"
+while (( $try <= $to )); do
+    echo "Checking health of postgres (try ${try} of ${to})..."
+    if docker exec sentry_postgres pg_isready -U postgres; then
+        break
+    fi
+    if (( $try == $to )); then
+        echo "Exceeded retries."
+        exit 1
+    fi
+    try=$(($try + 1))
+    sleep "$delay"
+done
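
The TODOs above hint at a broader check that loops over every sentry_ container. A minimal bash sketch of that idea, not part of this commit (the name filter and state handling are assumptions):

# Warn on any sentry_ container that exists but is not currently running.
for name in $(docker ps -a --filter "name=sentry_" --format "{{.Names}}"); do
    state=$(docker inspect --format "{{.State.Status}}" "$name")
    if [ "$state" != "running" ]; then
        echo "warning: ${name} is not running (state: ${state})"
    fi
done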

+ 0 - 6
src/sentry/conf/server.py

@@ -1755,12 +1755,6 @@ SENTRY_DEVSERVICES = {
                 "max_wal_senders=1",
             ],
             "entrypoint": "/cdc/postgres-entrypoint.sh" if settings.SENTRY_USE_CDC_DEV else None,
-            "healthcheck": {
-                "test": ["CMD", "pg_isready", "-U", "postgres"],
-                "interval": 30000000000,  # Test every 30 seconds (in ns).
-                "timeout": 5000000000,  # Time we should expect the test to take.
-                "retries": 3,
-            },
         }
     ),
     "zookeeper": lambda settings, options: (

+ 66 - 74
src/sentry/runner/commands/devservices.py

@@ -1,11 +1,10 @@
 import os
 import signal
 import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import click
 
-from sentry.utils.compat import map
-
 # Work around a stupid docker issue: https://github.com/docker/for-mac/issues/5025
 RAW_SOCKET_HACK_PATH = os.path.expanduser(
     "~/Library/Containers/com.docker.docker/Data/docker.raw.sock"
@@ -25,17 +24,6 @@ def get_docker_client():
         raise click.ClickException("Make sure Docker is running.")
 
 
-def get_docker_low_level_client():
-    import docker
-
-    client = docker.APIClient()
-    try:
-        client.ping()
-        return client
-    except Exception:
-        raise click.ClickException("Make sure Docker is running.")
-
-
 def get_or_create(client, thing, name):
     from docker.errors import NotFound
 
@@ -66,37 +54,6 @@ def retryable_pull(client, image, max_attempts=5):
         break
 
 
-def wait_for_healthcheck(low_level_client, container_name, healthcheck_options):
-    # healthcheck_options should be the dictionary for docker-py.
-
-    # Convert ns -> s, float in both py2 + 3.
-    healthcheck_timeout = healthcheck_options["timeout"] / 1000.0 ** 3
-    healthcheck_interval = healthcheck_options["interval"] / 1000.0 ** 3
-    healthcheck_retries = healthcheck_options["retries"]
-
-    # This is the maximum elapsed timeout.
-    timeout = healthcheck_retries * (healthcheck_interval + healthcheck_timeout)
-
-    # And as for delay, polling is sort of cheap so we can do it quite often.
-    # Important to note that the interval also defines the initial delay,
-    # so the first polls will likely fail.
-    delay = 0.25
-
-    health_status = None
-    start_time = time.time()
-
-    while time.time() - start_time < timeout:
-        resp = low_level_client.inspect_container(container_name)
-        health_status = resp["State"]["Health"]["Status"]
-        if health_status == "healthy":
-            return
-        time.sleep(delay)
-
-    raise click.ClickException(
-        f"Timed out waiting for {container_name}: healthcheck status {health_status}"
-    )
-
-
 def ensure_interface(ports):
     # If there is no interface specified, make sure the
     # default interface is 127.0.0.1
@@ -117,7 +74,6 @@ def devservices(ctx):
     Do not use in production!
     """
     ctx.obj["client"] = get_docker_client()
-    ctx.obj["low_level_client"] = get_docker_low_level_client()
 
     # Disable backend validation so no devservices commands depend on like,
     # redis to be already running.
@@ -150,7 +106,6 @@ def attach(ctx, project, fast, service):
 
     container = _start_service(
         ctx.obj["client"],
-        ctx.obj["low_level_client"],
         service,
         containers,
         project,
@@ -233,10 +188,32 @@ def up(ctx, services, project, exclude, fast, skip_only_if):
 
     get_or_create(ctx.obj["client"], "network", project)
 
-    for name in selected_services:
-        _start_service(
-            ctx.obj["client"], ctx.obj["low_level_client"], name, containers, project, fast=fast
-        )
+    with ThreadPoolExecutor(max_workers=len(selected_services)) as executor:
+        futures = []
+        for name in selected_services:
+            futures.append(
+                executor.submit(
+                    _start_service,
+                    ctx.obj["client"],
+                    name,
+                    containers,
+                    project,
+                    fast=fast,
+                )
+            )
+        for future in as_completed(futures):
+            # If there was an exception, reraising it here to the main thread
+            # will not terminate the whole python process. We'd like to report
+            # on this exception and stop as fast as possible, so terminate
+            # ourselves. I believe (without verification) that the OS is now
+            # free to cleanup these threads, but not sure if they'll remain running
+            # in the background. What matters most is that we regain control
+            # of the terminal.
+            e = future.exception()
+            if e:
+                click.echo(e)
+                me = os.getpid()
+                os.kill(me, signal.SIGTERM)
 
 
 def _prepare_containers(project, skip_only_if=False, silent=False):
@@ -272,9 +249,7 @@ def _prepare_containers(project, skip_only_if=False, silent=False):
     return containers
 
 
-def _start_service(
-    client, low_level_client, name, containers, project, fast=False, always_start=False
-):
+def _start_service(client, name, containers, project, fast=False, always_start=False):
     from django.conf import settings
 
     from docker.errors import NotFound
@@ -295,7 +270,7 @@ def _start_service(
     pull = options.pop("pull", False)
     if not fast:
         if pull:
-            click.secho("> Pulling image '%s'" % options["image"], err=True, fg="green")
+            click.secho(f"> Pulling image '{options['image']}'", fg="green")
             retryable_pull(client, options["image"])
         else:
             # We want to make sure to pull everything on the first time,
@@ -303,7 +278,7 @@ def _start_service(
             try:
                 client.images.get(options["image"])
             except NotFound:
-                click.secho("> Pulling image '%s'" % options["image"], err=True, fg="green")
+                click.secho(f"> Pulling image '{options['image']}'", fg="green")
                 retryable_pull(client, options["image"])
 
     for mount in list(options.get("volumes", {}).keys()):
@@ -329,8 +304,7 @@ def _start_service(
     # should ONLY be started via the latter, which sets `always_start`.
     if with_devserver and not always_start:
         click.secho(
-            "> Not starting container '%s' because it should be started on-demand with devserver."
-            % options["name"],
+            f"> Not starting container '{options['name']}' because it should be started on-demand with devserver.",
             fg="yellow",
         )
         # XXX: if always_start=False, do not expect to have a container returned 100% of the time.
@@ -355,29 +329,22 @@ def _start_service(
         if should_reuse_container:
             click.secho(
                 f"> Starting EXISTING container '{container.name}' {listening}",
-                err=True,
                 fg="yellow",
             )
             # Note that if the container is already running, this will noop.
             # This makes repeated `devservices up` quite fast.
             container.start()
-            healthcheck_options = options.get("healthcheck")
-            if healthcheck_options:
-                wait_for_healthcheck(low_level_client, container.name, healthcheck_options)
             return container
 
-        click.secho("> Stopping container '%s'" % container.name, err=True, fg="yellow")
+        click.secho(f"> Stopping container '{container.name}'", fg="yellow")
         container.stop()
-        click.secho("> Removing container '%s'" % container.name, err=True, fg="yellow")
+        click.secho(f"> Removing container '{container.name}'", fg="yellow")
         container.remove()
 
-    click.secho("> Creating container '%s'" % options["name"], err=True, fg="yellow")
+    click.secho(f"> Creating container '{options['name']}'", fg="yellow")
     container = client.containers.create(**options)
-    click.secho(f"> Starting container '{container.name}' {listening}", err=True, fg="yellow")
+    click.secho(f"> Starting container '{container.name}' {listening}", fg="yellow")
     container.start()
-    healthcheck_options = options.get("healthcheck")
-    if healthcheck_options:
-        wait_for_healthcheck(low_level_client, container.name, healthcheck_options)
     return container
 
 
@@ -393,15 +360,40 @@ def down(ctx, project, service):
     The default is everything, however you may pass positional arguments to specify
     an explicit list of services to bring down.
     """
-    prefix = project + "_"
-
     # TODO: make more like devservices rm
 
+    def _down(container):
+        click.secho(f"> Stopping '{container.name}' container", fg="red")
+        container.stop()
+        click.secho(f"> Stopped '{container.name}' container", fg="red")
+
+    containers = []
+    prefix = f"{project}_"
+
     for container in ctx.obj["client"].containers.list(all=True):
-        if container.name.startswith(prefix):
-            if not service or container.name[len(prefix) :] in service:
-                click.secho("> Stopping '%s' container" % container.name, err=True, fg="red")
-                container.stop()
+        if not container.name.startswith(prefix):
+            continue
+        if service and not container.name[len(prefix) :] in service:
+            continue
+        containers.append(container)
+
+    with ThreadPoolExecutor(max_workers=len(containers)) as executor:
+        futures = []
+        for container in containers:
+            futures.append(executor.submit(_down, container))
+        for future in as_completed(futures):
+            # If there was an exception, reraising it here to the main thread
+            # will not terminate the whole python process. We'd like to report
+            # on this exception and stop as fast as possible, so terminate
+            # ourselves. I believe (without verification) that the OS is now
+            # free to cleanup these threads, but not sure if they'll remain running
+            # in the background. What matters most is that we regain control
+            # of the terminal.
+            e = future.exception()
+            if e:
+                click.echo(e)
+                me = os.getpid()
+                os.kill(me, signal.SIGTERM)
 
 
 @devservices.command()
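
Both `up` and `down` now fan their per-container work out to a thread pool and, on the first failure, report the exception and send SIGTERM to the current process rather than re-raising in the main thread. A self-contained sketch of that pattern (the service names and worker body are placeholders, not Sentry code):

import os
import signal
from concurrent.futures import ThreadPoolExecutor, as_completed

def start_one(name):
    # Stand-in for _start_service / the _down helper; raise here to simulate a failure.
    print(f"starting {name}")

services = ["postgres", "redis", "kafka"]

with ThreadPoolExecutor(max_workers=len(services)) as executor:
    futures = [executor.submit(start_one, name) for name in services]
    for future in as_completed(futures):
        # Re-raising in the main thread would not stop sibling workers,
        # so report the error and terminate the whole process instead.
        e = future.exception()
        if e:
            print(e)
            os.kill(os.getpid(), signal.SIGTERM)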