Browse Source

feat(crons): Add QuotaConfig for check-in rate-limits (#61310)

This is a revert of f741b50a73b9c2f4c40a45e81eb53d5bc13aa175.

This introduces a `QuotaConfig` per-project to limit monitor check-ins.

This is done by taking the number of MonitorEnvironment's for a project
and using that as the per-minute rate-limit for monitor check-ins.

 - There is a multiplication factor which is used to give some leeway to
   how many check-ins can happen per project. This is needed since
   otherwise if you have 10 monitors in a project, you could only create
   exactly 10 check-ins in a minute. Which in some cases may not be
   enough (if a check-in happens slightly late / early etc etc)

   To avoid this the `ALLOWED_CHECK_INS_PER_MONITOR` constant
   is used as a multiplier on the number of MonitorEnvironment's.

   This constant is set to 5 check-ins per monitor environment per
   minute. Meaning if you have 10 monitors, you can create 50 check-ins
   per minute.

 - There is a minimum per-minute rate-limit value (ALLOWED_MINIMUM).
   If there were 0 MonitorEnvironment's this would introduce a
   rate-limit of 0, effectively allowing no-check-ins ever. Check-ins
   are required to genearte MonitorEnvironment's. This constant is used
   to specify the minimum for the per minute rate limit

Fixes https://github.com/getsentry/sentry/issues/60303
Evan Purkhiser 1 year ago
parent
commit
b31a1b293e

+ 2 - 0
src/sentry/conf/server.py

@@ -1661,6 +1661,8 @@ SENTRY_FEATURES: dict[str, bool | None] = {
     "organizations:mobile-cpu-memory-in-transactions": False,
     # Enable Monitors (Crons) view
     "organizations:monitors": False,
+    # Enable rate-limiting via relay for Monitors (crons)
+    "organizations:monitors-quota-rate-limit": False,
     # Enables higher limit for alert rules
     "organizations:more-slow-alerts": False,
     # Enables region provisioning for individual users

+ 1 - 0
src/sentry/features/__init__.py

@@ -149,6 +149,7 @@ default_manager.add("organizations:mobile-cpu-memory-in-transactions", Organizat
 default_manager.add("organizations:mobile-ttid-ttfd-contribution", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
 default_manager.add("organizations:mobile-vitals", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
 default_manager.add("organizations:monitors", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
+default_manager.add("organizations:monitors-quota-rate-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
 default_manager.add("organizations:more-slow-alerts", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
 default_manager.add("organizations:new-page-filter", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
 default_manager.add("organizations:new-weekly-report", OrganizationFeature, FeatureHandlerStrategy.REMOTE)

+ 10 - 2
src/sentry/monitors/models.py

@@ -540,6 +540,8 @@ class MonitorEnvironmentManager(BaseManager["MonitorEnvironment"]):
     def ensure_environment(
         self, project: Project, monitor: Monitor, environment_name: str | None
     ) -> MonitorEnvironment:
+        from sentry.monitors.rate_limit import update_monitor_quota
+
         if not environment_name:
             environment_name = "production"
 
@@ -549,9 +551,15 @@ class MonitorEnvironmentManager(BaseManager["MonitorEnvironment"]):
         # TODO: assume these objects exist once backfill is completed
         environment = Environment.get_or_create(project=project, name=environment_name)
 
-        return MonitorEnvironment.objects.get_or_create(
+        monitor_env, created = MonitorEnvironment.objects.get_or_create(
             monitor=monitor, environment=environment, defaults={"status": MonitorStatus.ACTIVE}
-        )[0]
+        )
+
+        # recompute per-project monitor check-in rate limit quota
+        if created:
+            update_monitor_quota(monitor_env)
+
+        return monitor_env
 
 
 @region_silo_only_model

+ 100 - 0
src/sentry/monitors/rate_limit.py

@@ -0,0 +1,100 @@
+from typing import Optional, Tuple
+
+from django.core.cache import cache
+
+from sentry import features
+from sentry.models.organization import Organization
+from sentry.models.project import Project
+from sentry.monitors.models import MonitorEnvironment
+from sentry.tasks.relay import schedule_invalidate_project_config
+
+# This module is used by the Quotas system to inform relay of the allowed
+# number of check-ins that can be sent per project during the QUOTA_WINDOW.
+#
+# In an ideal world we would be rate-limiting monitors on the key of
+# (organization, monitor_slug, environment). Unfortunately due to how Relay
+# handles quota rate-limiting we're at best able to rate-limit per project.
+#
+# > Relay cannot do rate-limiting like this because it would need to fully
+# > unmarshal the event payload to rate-limit on the monitor slug. The project_id
+# > is available in the payload headers.
+#
+# To allow the monitors system to take-advantage of Relay's quota system to be
+# used for rate-limiting, we can rate-limit per project by providing relay with
+# a quota for the DataCategory.MONITOR that is calculated based on how many
+# monitors the project has.
+#
+# There are some caveats to this:
+#
+# - For upserted monitors we need to give some amount of minimum quota, to
+#   allow monitors to be created even when there are 0 monitors within a
+#   project (since otherwise it would be completely rate-limited)
+#
+# - It is possible that one monitor may saturate the quota, thus rate-limiting
+#   all other monitors within that project. This is an accepted side-effect of
+#   doing rate-limiting this way.
+#
+# - We give each monitor a 'grace' number of allowed check-ins. Ideally each
+#   monitor should only be checking in once per minute, so the quota could just
+#   be the number of monitors in the project. However,
+
+
+# Monitor check-in limits are per minute. This maps to the smallest check-in
+# window that we support.
+QUOTA_WINDOW = 60
+
+# Determines how many check-ins per-minute will be allowed per monitor. This is
+# used when computing the QuotaConfig for the DataCategory.MONITOR (check-ins)
+#
+# These are the 'grace' check-ins as described above.
+ALLOWED_CHECK_INS_PER_MONITOR = 5
+
+# The minimum rate-limit per project for the DataCategory.MONITOR. This value
+# should be high enough that it allows for a large number of monitors to be
+# upserted without hitting the project rate-limit.
+ALLOWED_MINIMUM = 50
+
+
+def get_project_monitor_quota(
+    project: Project,
+    cache_bust=False,
+) -> Tuple[Optional[int], Optional[int]]:
+    """
+    Determines the rate-limit for monitor check-ins across a particular
+    project.
+
+    :return: A (limit, window) tuple. (None, None) indicates no rate-limit
+    """
+    organization = Organization.objects.get_from_cache(id=project.organization_id)
+
+    if not features.has("organizations:monitors-quota-rate-limit", organization=organization):
+        return (None, None)
+
+    limit = None
+    cache_key = f"project:{project.id}:monitor-env-count"
+
+    # Cache rate-limit computation. This function will be called often by the
+    # Quotas system.
+    if not cache_bust:
+        limit = cache.get(cache_key)
+
+    if limit is None:
+        monitor_count = MonitorEnvironment.objects.filter(monitor__project_id=project.id).count()
+        limit = monitor_count * ALLOWED_CHECK_INS_PER_MONITOR
+        cache.set(cache_key, limit, 600)
+
+    return (ALLOWED_MINIMUM + limit, QUOTA_WINDOW)
+
+
+def update_monitor_quota(monitor_env: MonitorEnvironment):
+    """
+    When new monitor environments are created we recompute the per-project
+    monitor check-in rate limit QuotaConfig in relay.
+    """
+    project = Project.objects.get_from_cache(id=monitor_env.monitor.project_id)
+
+    get_project_monitor_quota(project, cache_bust=True)
+    schedule_invalidate_project_config(
+        project_id=project.id,
+        trigger="monitors:monitor_created",
+    )

+ 5 - 0
src/sentry/quotas/base.py

@@ -457,6 +457,11 @@ class Quota(Service):
                     reason_code=reason_codes[quota.scope],
                 )
 
+    def get_monitor_quota(self, project):
+        from sentry.monitors.rate_limit import get_project_monitor_quota
+
+        return get_project_monitor_quota(project)
+
     def get_project_quota(self, project):
         from sentry.models.options.organization_option import OrganizationOption
         from sentry.models.organization import Organization

+ 16 - 0
src/sentry/quotas/redis.py

@@ -87,6 +87,22 @@ class RedisQuota(Quota):
                     )
                 )
 
+        with sentry_sdk.start_span(op="redis.get_quotas.get_monitor_quota") as span:
+            span.set_tag("project.id", project.id)
+            mrlquota = self.get_monitor_quota(project)
+            if mrlquota[0] is not None:
+                results.append(
+                    QuotaConfig(
+                        id="mrl",
+                        limit=mrlquota[0],
+                        window=mrlquota[1],
+                        scope=QuotaScope.PROJECT,
+                        scope_id=project.id,
+                        categories=[DataCategory.MONITOR],
+                        reason_code="monitor_rate_limit",
+                    )
+                )
+
         if key and not keys:
             keys = [key]
         elif not keys:

+ 75 - 0
tests/sentry/monitors/test_rate_limit.py

@@ -0,0 +1,75 @@
+from unittest import mock
+
+from sentry.monitors.models import (
+    Monitor,
+    MonitorEnvironment,
+    MonitorStatus,
+    MonitorType,
+    ScheduleType,
+)
+from sentry.monitors.rate_limit import get_project_monitor_quota
+from sentry.testutils.cases import TestCase
+from sentry.testutils.helpers.features import with_feature
+
+
+@mock.patch("sentry.monitors.rate_limit.QUOTA_WINDOW", 45)
+@mock.patch("sentry.monitors.rate_limit.ALLOWED_CHECK_INS_PER_MONITOR", 2)
+@mock.patch("sentry.monitors.rate_limit.ALLOWED_MINIMUM", 5)
+class MonitorRateLimit(TestCase):
+    def test_disabled(self):
+        """
+        Disabled without feature flag
+        """
+        limit, window = get_project_monitor_quota(self.project)
+        assert limit is None
+        assert window is None
+
+    @with_feature("organizations:monitors-quota-rate-limit")
+    def test_minimum(self):
+        """
+        Without any monitor environments we'll always return ALLOWED_MINIMUM.
+        """
+        limit, window = get_project_monitor_quota(self.project)
+        assert limit == 5
+        assert window == 45
+
+    @with_feature("organizations:monitors-quota-rate-limit")
+    def test_computed_from_environments(self):
+        """
+        Validate that the quota is computed from the total number of monitor
+        environments in a project.
+        """
+        monitor = Monitor.objects.create(
+            organization_id=self.organization.id,
+            project_id=self.project.id,
+            type=MonitorType.CRON_JOB,
+            config={"schedule": "*/5 * * * *", "schedule_type": ScheduleType.CRONTAB},
+        )
+
+        mon_env_count = 10
+        for i in range(mon_env_count):
+            env = self.create_environment(self.project, name=f"test-{i}")
+            MonitorEnvironment.objects.create(
+                monitor=monitor,
+                environment=env,
+                status=MonitorStatus.OK,
+            )
+
+        # The rate limit is per project, create another monitor in a different
+        # project to validate it is not counted
+        project2 = self.create_project(organization=self.organization)
+        monitor2 = Monitor.objects.create(
+            organization_id=self.organization.id,
+            project_id=project2.id,
+            type=MonitorType.CRON_JOB,
+            config={"schedule": "*/5 * * * *", "schedule_type": ScheduleType.CRONTAB},
+        )
+        MonitorEnvironment.objects.create(
+            monitor=monitor2,
+            environment=self.environment,
+            status=MonitorStatus.OK,
+        )
+
+        limit, window = get_project_monitor_quota(self.project, cache_bust=True)
+        assert limit == 25
+        assert window == 45

+ 33 - 4
tests/sentry/quotas/test_redis.py

@@ -76,6 +76,7 @@ class RedisQuotaTest(TestCase):
         # AssertionError: reject-all quotas cannot be tracked
         self.get_project_quota.return_value = (100, 10)
         self.get_organization_quota.return_value = (1000, 10)
+        self.get_monitor_quota.return_value = (15, 60)
 
         # A negative quota means reject-all.
         self.organization.update_option("project-abuse-quota.error-limit", -1)
@@ -208,6 +209,7 @@ class RedisQuotaTest(TestCase):
         # AssertionError: reject-all quotas cannot be tracked
         self.get_project_quota.return_value = (100, 10)
         self.get_organization_quota.return_value = (1000, 10)
+        self.get_monitor_quota.return_value = (15, 60)
 
         self.organization.update_option("project-abuse-quota.transaction-limit", 600)
         with self.feature({"organizations:transaction-metrics-extraction": False}):
@@ -235,9 +237,17 @@ class RedisQuotaTest(TestCase):
         ) as self.get_organization_quota:
             yield
 
+    @pytest.fixture(autouse=True)
+    def _patch_get_monitor_quota(self):
+        with mock.patch.object(
+            RedisQuota, "get_monitor_quota", return_value=(0, 60)
+        ) as self.get_monitor_quota:
+            yield
+
     def test_uses_defined_quotas(self):
         self.get_project_quota.return_value = (200, 60)
         self.get_organization_quota.return_value = (300, 60)
+        self.get_monitor_quota.return_value = (15, 60)
         quotas = self.quota.get_quotas(self.project)
 
         assert quotas[0].id == "p"
@@ -250,6 +260,11 @@ class RedisQuotaTest(TestCase):
         assert quotas[1].scope_id == str(self.organization.id)
         assert quotas[1].limit == 300
         assert quotas[1].window == 60
+        assert quotas[2].id == "mrl"
+        assert quotas[2].scope == QuotaScope.PROJECT
+        assert quotas[2].scope_id == str(self.project.id)
+        assert quotas[2].limit == 15
+        assert quotas[2].window == 60
 
     @mock.patch("sentry.quotas.redis.is_rate_limited")
     @mock.patch.object(RedisQuota, "get_quotas", return_value=[])
@@ -262,12 +277,14 @@ class RedisQuotaTest(TestCase):
     def test_is_not_limited_without_rejections(self, is_rate_limited):
         self.get_organization_quota.return_value = (100, 60)
         self.get_project_quota.return_value = (200, 60)
+        self.get_monitor_quota.return_value = (15, 60)
         assert not self.quota.is_rate_limited(self.project).is_limited
 
     @mock.patch("sentry.quotas.redis.is_rate_limited", return_value=(True, False))
     def test_is_limited_on_rejections(self, is_rate_limited):
         self.get_organization_quota.return_value = (100, 60)
         self.get_project_quota.return_value = (200, 60)
+        self.get_monitor_quota.return_value = (15, 60)
         assert self.quota.is_rate_limited(self.project).is_limited
 
     @mock.patch.object(RedisQuota, "get_quotas")
@@ -323,6 +340,7 @@ class RedisQuotaTest(TestCase):
 
         self.get_project_quota.return_value = (200, 60)
         self.get_organization_quota.return_value = (300, 60)
+        self.get_monitor_quota.return_value = (15, 60)
 
         n = 10
         for _ in range(n):
@@ -336,9 +354,13 @@ class RedisQuotaTest(TestCase):
 
         usage = self.quota.get_usage(self.project.organization_id, all_quotas, timestamp=timestamp)
 
-        # Only quotas with an ID are counted in Redis (via this ID). Assume the
-        # count for these quotas and None for the others.
-        assert usage == [n if q.id else None for q in quotas] + [0, 0]
+        assert usage == [
+            n,  # project quota is consumed
+            n,  # organization quota is consumed
+            0,  # monitor quota is not consumed
+            0,  # unlimited quota is not consumed
+            0,  # dummy quota is not consumed
+        ]
 
     @mock.patch.object(RedisQuota, "get_quotas")
     def test_refund_defaults(self, mock_get_quotas):
@@ -443,6 +465,7 @@ class RedisQuotaTest(TestCase):
 
         self.get_project_quota.return_value = (200, 60)
         self.get_organization_quota.return_value = (300, 60)
+        self.get_monitor_quota.return_value = (15, 60)
 
         n = 10
         for _ in range(n):
@@ -461,4 +484,10 @@ class RedisQuotaTest(TestCase):
         # Only quotas with an ID are counted in Redis (via this ID). Assume the
         # count for these quotas and None for the others.
         # The ``- 1`` is because we refunded once.
-        assert usage == [n - 1 if q.id else None for q in quotas] + [0, 0]
+        assert usage == [
+            n - 1,  # project quota has been refunded one
+            n - 1,  # organization quota has been refunded one
+            0,  # monitor quota was not consumed
+            0,  # unlimited quota was not consumed
+            0,  # dummy quota was not consumed
+        ]