
feat(dev): Multi-threaded devservices and move PG health-check out of Docker (#29824)

josh · 3 years ago · parent commit a8e890c8e8

+ 2 - 0
.github/actions/setup-sentry/action.yml

@@ -206,3 +206,5 @@ runs:
         fi
 
         docker ps -a
+
+        ./scripts/devservices-healthcheck.sh
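
This CI step invokes the new health-check script right after `docker ps -a`, so a Postgres that never becomes ready fails the job with an explicit message instead of relying on Docker's own health status. The same script can also be run by hand for local troubleshooting, e.g. `./scripts/devservices-healthcheck.sh` after `sentry devservices up postgres` (a hypothetical local invocation, not part of this change).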

+ 33 - 0
scripts/devservices-healthcheck.sh

@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# This is used for healthchecking because docker running healthchecks
+# keeps the CPU too warm and if a container becomes unhealthy, there's no
+# notification system or anything that would make it more easily visible.
+
+# In the future, more healthchecks should be added to make it more helpful
+# as a troubleshooting script. Right now this is just used by CI.
+
+# TODO: iterate through all sentry_ containers,
+#       and warn on anything that's expected to run but isn't running.
+# TODO: spawn subprocess for each container,
+#       specify distinct retries and other parameters.
+#       would also be nice to have a single ^C work.
+
+try=1
+to=3
+start=0
+delay=5
+
+sleep "$start"
+while (( $try <= $to )); do
+    echo "Checking health of postgres (try ${try} of ${to})..."
+    if docker exec sentry_postgres pg_isready -U postgres; then
+        break
+    fi
+    if (( $try == $to )); then
+        echo "Exceeded retries."
+        exit 1
+    fi
+    try=$(($try + 1))
+    sleep "$delay"
+done
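
The TODOs above hint at a broader check that loops over every sentry_ container. A minimal bash sketch of that idea, not part of this commit (the name filter and state handling are assumptions):

# Warn on any sentry_ container that exists but is not currently running.
for name in $(docker ps -a --filter "name=sentry_" --format "{{.Names}}"); do
    state=$(docker inspect --format "{{.State.Status}}" "$name")
    if [ "$state" != "running" ]; then
        echo "warning: ${name} is not running (state: ${state})"
    fi
done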

+ 0 - 6
src/sentry/conf/server.py

@@ -1755,12 +1755,6 @@ SENTRY_DEVSERVICES = {
                 "max_wal_senders=1",
             ],
             "entrypoint": "/cdc/postgres-entrypoint.sh" if settings.SENTRY_USE_CDC_DEV else None,
-            "healthcheck": {
-                "test": ["CMD", "pg_isready", "-U", "postgres"],
-                "interval": 30000000000,  # Test every 30 seconds (in ns).
-                "timeout": 5000000000,  # Time we should expect the test to take.
-                "retries": 3,
-            },
         }
     ),
     "zookeeper": lambda settings, options: (

+ 66 - 74
src/sentry/runner/commands/devservices.py

@@ -1,11 +1,10 @@
 import os
 import signal
 import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import click
 
-from sentry.utils.compat import map
-
 # Work around a stupid docker issue: https://github.com/docker/for-mac/issues/5025
 RAW_SOCKET_HACK_PATH = os.path.expanduser(
     "~/Library/Containers/com.docker.docker/Data/docker.raw.sock"
@@ -25,17 +24,6 @@ def get_docker_client():
         raise click.ClickException("Make sure Docker is running.")
 
 
-def get_docker_low_level_client():
-    import docker
-
-    client = docker.APIClient()
-    try:
-        client.ping()
-        return client
-    except Exception:
-        raise click.ClickException("Make sure Docker is running.")
-
-
 def get_or_create(client, thing, name):
     from docker.errors import NotFound
 
@@ -66,37 +54,6 @@ def retryable_pull(client, image, max_attempts=5):
         break
 
 
-def wait_for_healthcheck(low_level_client, container_name, healthcheck_options):
-    # healthcheck_options should be the dictionary for docker-py.
-
-    # Convert ns -> s, float in both py2 + 3.
-    healthcheck_timeout = healthcheck_options["timeout"] / 1000.0 ** 3
-    healthcheck_interval = healthcheck_options["interval"] / 1000.0 ** 3
-    healthcheck_retries = healthcheck_options["retries"]
-
-    # This is the maximum elapsed timeout.
-    timeout = healthcheck_retries * (healthcheck_interval + healthcheck_timeout)
-
-    # And as for delay, polling is sort of cheap so we can do it quite often.
-    # Important to note that the interval also defines the initial delay,
-    # so the first polls will likely fail.
-    delay = 0.25
-
-    health_status = None
-    start_time = time.time()
-
-    while time.time() - start_time < timeout:
-        resp = low_level_client.inspect_container(container_name)
-        health_status = resp["State"]["Health"]["Status"]
-        if health_status == "healthy":
-            return
-        time.sleep(delay)
-
-    raise click.ClickException(
-        f"Timed out waiting for {container_name}: healthcheck status {health_status}"
-    )
-
-
 def ensure_interface(ports):
     # If there is no interface specified, make sure the
     # default interface is 127.0.0.1
@@ -117,7 +74,6 @@ def devservices(ctx):
     Do not use in production!
     """
     ctx.obj["client"] = get_docker_client()
-    ctx.obj["low_level_client"] = get_docker_low_level_client()
 
     # Disable backend validation so no devservices commands depend on like,
     # redis to be already running.
@@ -150,7 +106,6 @@ def attach(ctx, project, fast, service):
 
     container = _start_service(
         ctx.obj["client"],
-        ctx.obj["low_level_client"],
         service,
         containers,
         project,
@@ -233,10 +188,32 @@ def up(ctx, services, project, exclude, fast, skip_only_if):
 
     get_or_create(ctx.obj["client"], "network", project)
 
-    for name in selected_services:
-        _start_service(
-            ctx.obj["client"], ctx.obj["low_level_client"], name, containers, project, fast=fast
-        )
+    with ThreadPoolExecutor(max_workers=len(selected_services)) as executor:
+        futures = []
+        for name in selected_services:
+            futures.append(
+                executor.submit(
+                    _start_service,
+                    ctx.obj["client"],
+                    name,
+                    containers,
+                    project,
+                    fast=fast,
+                )
+            )
+        for future in as_completed(futures):
+            # If there was an exception, reraising it here to the main thread
+            # will not terminate the whole python process. We'd like to report
+            # on this exception and stop as fast as possible, so terminate
+            # ourselves. I believe (without verification) that the OS is now
+            # free to cleanup these threads, but not sure if they'll remain running
+            # in the background. What matters most is that we regain control
+            # of the terminal.
+            e = future.exception()
+            if e:
+                click.echo(e)
+                me = os.getpid()
+                os.kill(me, signal.SIGTERM)
 
 
 def _prepare_containers(project, skip_only_if=False, silent=False):
@@ -272,9 +249,7 @@ def _prepare_containers(project, skip_only_if=False, silent=False):
     return containers
 
 
-def _start_service(
-    client, low_level_client, name, containers, project, fast=False, always_start=False
-):
+def _start_service(client, name, containers, project, fast=False, always_start=False):
     from django.conf import settings
 
     from docker.errors import NotFound
@@ -295,7 +270,7 @@ def _start_service(
     pull = options.pop("pull", False)
     if not fast:
         if pull:
-            click.secho("> Pulling image '%s'" % options["image"], err=True, fg="green")
+            click.secho(f"> Pulling image '{options['image']}'", fg="green")
             retryable_pull(client, options["image"])
         else:
             # We want to make sure to pull everything on the first time,
@@ -303,7 +278,7 @@ def _start_service(
             try:
                 client.images.get(options["image"])
             except NotFound:
-                click.secho("> Pulling image '%s'" % options["image"], err=True, fg="green")
+                click.secho(f"> Pulling image '{options['image']}'", fg="green")
                 retryable_pull(client, options["image"])
 
     for mount in list(options.get("volumes", {}).keys()):
@@ -329,8 +304,7 @@ def _start_service(
     # should ONLY be started via the latter, which sets `always_start`.
     if with_devserver and not always_start:
         click.secho(
-            "> Not starting container '%s' because it should be started on-demand with devserver."
-            % options["name"],
+            f"> Not starting container '{options['name']}' because it should be started on-demand with devserver.",
             fg="yellow",
         )
         # XXX: if always_start=False, do not expect to have a container returned 100% of the time.
@@ -355,29 +329,22 @@ def _start_service(
         if should_reuse_container:
             click.secho(
                 f"> Starting EXISTING container '{container.name}' {listening}",
-                err=True,
                 fg="yellow",
             )
             # Note that if the container is already running, this will noop.
             # This makes repeated `devservices up` quite fast.
             container.start()
-            healthcheck_options = options.get("healthcheck")
-            if healthcheck_options:
-                wait_for_healthcheck(low_level_client, container.name, healthcheck_options)
             return container
 
-        click.secho("> Stopping container '%s'" % container.name, err=True, fg="yellow")
+        click.secho(f"> Stopping container '{container.name}'", fg="yellow")
         container.stop()
-        click.secho("> Removing container '%s'" % container.name, err=True, fg="yellow")
+        click.secho(f"> Removing container '{container.name}'", fg="yellow")
         container.remove()
 
-    click.secho("> Creating container '%s'" % options["name"], err=True, fg="yellow")
+    click.secho(f"> Creating container '{options['name']}'", fg="yellow")
     container = client.containers.create(**options)
-    click.secho(f"> Starting container '{container.name}' {listening}", err=True, fg="yellow")
+    click.secho(f"> Starting container '{container.name}' {listening}", fg="yellow")
     container.start()
-    healthcheck_options = options.get("healthcheck")
-    if healthcheck_options:
-        wait_for_healthcheck(low_level_client, container.name, healthcheck_options)
     return container
 
 
@@ -393,15 +360,40 @@ def down(ctx, project, service):
     The default is everything, however you may pass positional arguments to specify
     an explicit list of services to bring down.
     """
-    prefix = project + "_"
-
     # TODO: make more like devservices rm
 
+    def _down(container):
+        click.secho(f"> Stopping '{container.name}' container", fg="red")
+        container.stop()
+        click.secho(f"> Stopped '{container.name}' container", fg="red")
+
+    containers = []
+    prefix = f"{project}_"
+
     for container in ctx.obj["client"].containers.list(all=True):
-        if container.name.startswith(prefix):
-            if not service or container.name[len(prefix) :] in service:
-                click.secho("> Stopping '%s' container" % container.name, err=True, fg="red")
-                container.stop()
+        if not container.name.startswith(prefix):
+            continue
+        if service and not container.name[len(prefix) :] in service:
+            continue
+        containers.append(container)
+
+    with ThreadPoolExecutor(max_workers=len(containers)) as executor:
+        futures = []
+        for container in containers:
+            futures.append(executor.submit(_down, container))
+        for future in as_completed(futures):
+            # If there was an exception, reraising it here to the main thread
+            # will not terminate the whole python process. We'd like to report
+            # on this exception and stop as fast as possible, so terminate
+            # ourselves. I believe (without verification) that the OS is now
+            # free to cleanup these threads, but not sure if they'll remain running
+            # in the background. What matters most is that we regain control
+            # of the terminal.
+            e = future.exception()
+            if e:
+                click.echo(e)
+                me = os.getpid()
+                os.kill(me, signal.SIGTERM)
 
 
 @devservices.command()
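
Both `up` and `down` now fan their per-container work out to a thread pool and, on the first failure, report the exception and send SIGTERM to the current process rather than re-raising in the main thread. A self-contained sketch of that pattern (the service names and worker body are placeholders, not Sentry code):

import os
import signal
from concurrent.futures import ThreadPoolExecutor, as_completed

def start_one(name):
    # Stand-in for _start_service / the _down helper; raise here to simulate a failure.
    print(f"starting {name}")

services = ["postgres", "redis", "kafka"]

with ThreadPoolExecutor(max_workers=len(services)) as executor:
    futures = [executor.submit(start_one, name) for name in services]
    for future in as_completed(futures):
        # Re-raising in the main thread would not stop sibling workers,
        # so report the error and terminate the whole process instead.
        e = future.exception()
        if e:
            print(e)
            os.kill(os.getpid(), signal.SIGTERM)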