@@ -1,8 +1,21 @@
+from threading import Thread
+from time import sleep
+from typing import List, Tuple
 from urllib.parse import urlparse
 
+import sentry_sdk
 from django.conf import settings
 from django.utils.functional import cached_property
 
+from sentry import options
+from sentry.utils import redis
+
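+# queues whose backlog we monitor for backpressure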
+QUEUES = ["profiles.process"]
+
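+# prefix for the per-queue "unhealthy" marker keys in Redis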
+KEY_NAME = "unhealthy-queues"
+
 
 class RedisBackend:
     def __init__(self, broker_url):
@@ -38,7 +51,8 @@ class AmqpBackend:
             host="%s:%d" % (host, port),
             userid=dsn.username,
             password=dsn.password,
-            virtual_host=dsn.path[1:],
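+            # an empty URL path means the default vhost, "/"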
+            virtual_host=dsn.path[1:] or "/",
         )
 
     def get_conn(self):
@@ -94,3 +108,94 @@ try:
     backend = get_backend_for_broker(settings.BROKER_URL)
 except KeyError:
     backend = None
+
+
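+# health markers are kept in a dedicated Redis cluster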
+queue_monitoring_cluster = redis.redis_clusters.get(settings.SENTRY_QUEUE_MONITORING_REDIS_CLUSTER)
+
+
+def _unhealthy_queue_key(queue_name: str) -> str:
+    return f"{KEY_NAME}:{queue_name}"
+
+
+def is_queue_healthy(queue_name: str) -> bool:
+    """Checks whether the given queue is healthy by looking it up in Redis.
+
+    NB: If the queue is not found in Redis, it is assumed to be healthy.
+    This behavior might change in the future.
+    """
+
+    if not options.get("backpressure.monitor_queues.enable"):
+        return True
+    # the marker key exists only while the queue is unhealthy
+    try:
+        healthy = not queue_monitoring_cluster.exists(_unhealthy_queue_key(queue_name))
+    except Exception:
+        healthy = False
+    return healthy
+
+
+def _is_healthy(queue_size: int) -> bool:
+    return queue_size < options.get("backpressure.monitor_queues.unhealthy_threshold")
+
+
+def _update_queue_stats(redis_cluster, queue_health: List[Tuple[str, bool]]) -> None:
+    unhealthy = [queue for (queue, is_unhealthy) in queue_health if is_unhealthy]
+    if unhealthy:
+        # report the list of unhealthy queues to Sentry
+        with sentry_sdk.push_scope() as scope:
+            scope.set_extra("unhealthy_queues", unhealthy)
+            sentry_sdk.capture_message("RabbitMQ queues are exceeding size threshold")
+
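+    # write all markers in one atomic round-trip; the 60s TTL lets a stale
+    # "unhealthy" marker expire on its own if the monitor stops running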
+    with redis_cluster.pipeline(transaction=True) as pipeline:
+        for (queue, is_unhealthy) in queue_health:
+            if is_unhealthy:
+                pipeline.set(_unhealthy_queue_key(queue), "1", ex=60)
+            else:
+                pipeline.delete(_unhealthy_queue_key(queue))
+        pipeline.execute()
+
+
+def _run_queue_stats_updater(redis_cluster: str) -> None:
+    # bonus points if we manage to use asyncio and launch all tasks at once,
+    # in case we have many queues to check
+    cluster = redis.redis_clusters.get(redis_cluster)
+
+    queue_history = {queue: 0 for queue in QUEUES}
+    while True:
+        if not options.get("backpressure.monitor_queues.enable"):
+            sleep(10)
+            continue
+
+        try:
+            new_sizes = backend.bulk_get_sizes(QUEUES)
+            for (queue, size) in new_sizes:
+                if _is_healthy(size):
+                    queue_history[queue] = 0
+                else:
+                    queue_history[queue] += 1
+        except Exception:
+            # if there was an error getting queue sizes from RabbitMQ, assume
+            # all queues are unhealthy
+            for queue in QUEUES:
+                queue_history[queue] += 1
+
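+        # flag a queue as unhealthy only once it has been over the size
+        # threshold for `strike_threshold` consecutive checks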
+        strike_threshold = options.get("backpressure.monitor_queues.strike_threshold")
+        queue_health = [(queue, count >= strike_threshold) for (queue, count) in queue_history.items()]
+        _update_queue_stats(cluster, queue_health)
+        sleep(options.get("backpressure.monitor_queues.check_interval"))
+
+
+def monitor_queues():
+    if backend is None:
+        return
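+    # run the update loop in a background thread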
+    queue_stats_updater_process = Thread(
+        target=_run_queue_stats_updater,
+        args=(settings.SENTRY_QUEUE_MONITORING_REDIS_CLUSTER,),
+    )
+    queue_stats_updater_process.start()