Browse Source

ref(severity): fallback for Snuba failures (#62173)

Re-revert of #61553 with fix for backwards compatibility.

Resolves SENTRY-2BAG
Isabella Enriquez 1 year ago
parent
commit
5f444595c1
2 changed files with 218 additions and 58 deletions
  1. 83 34
      src/sentry/issues/issue_velocity.py
  2. 135 24
      tests/sentry/issues/test_issue_velocity.py

+ 83 - 34
src/sentry/issues/issue_velocity.py

@@ -26,7 +26,7 @@ from snuba_sdk import (
 
 from sentry.snuba.dataset import Dataset, EntityKey
 from sentry.tasks.post_process import locks
-from sentry.utils import json
+from sentry.utils import metrics
 from sentry.utils.locking import UnableToAcquireLock
 from sentry.utils.redis import redis_clusters
 
@@ -43,10 +43,13 @@ WEEK_IN_HOURS = 7 * 24
 
 # for redis operations
 DEFAULT_TTL = 48 * 60 * 60  # 2 days
-NONE_TTL = 10 * 60  # 10 minutes
+FALLBACK_TTL = 10 * 60  # 10 minutes; TTL for storing temporary values while we can't query Snuba
 THRESHOLD_KEY = "new-issue-escalation-threshold:{project_id}"
-STALE_DATE_KEY = "new-issue-escalation-threshold-stale-date:{project_id}"
-DATE_FORMAT = "%Y%m%d"
+STALE_DATE_KEY = "new-issue-escalation-threshold-stale-date:v2:{project_id}"
+LEGACY_STALE_DATE_KEY = "new-issue-escalation-threshold-stale-date:{project_id}"
+STRING_TO_DATETIME = "%Y-%m-%d %H:%M:%S.%f"
+LEGACY_STRING_TO_DATETIME = "%Y%m%d"
+TIME_TO_USE_EXISTING_THRESHOLD = 24 * 60 * 60  # 1 day
 
 
 def calculate_threshold(project: Project) -> Optional[float]:
@@ -133,47 +136,81 @@ def calculate_threshold(project: Project) -> Optional[float]:
         tenant_ids={"referrer": REFERRER, "organization_id": project.organization.id},
     )
 
-    result = raw_snql_query(request, referrer=REFERRER)["data"]
-    if len(result) == 0:
-        return None
-
     try:
-        return result[0][THRESHOLD_QUANTILE["name"]]
+        result = raw_snql_query(request, referrer=REFERRER)["data"]
     except Exception:
         logger.exception(
-            "Unexpected shape for threshold query results",
-            extra={"project_id": project.id, "result": json.dumps(result)},
+            "sentry.issues.issue_velocity.calculate_threshold.error",
+            extra={"project_id": project.id},
         )
         return None
 
+    return result[0][THRESHOLD_QUANTILE["name"]]
 
-def update_threshold(project: Project, threshold_key: str, stale_date_key: str) -> float:
+
+def update_threshold(
+    project: Project,
+    threshold_key: str,
+    stale_date_key: str,
+    stale_threshold: Optional[float] = None,
+) -> float:
     """
     Runs the calculation for the threshold and saves it and the date it is last updated to Redis.
-    If the threshold is NaN, we save it as 0 at the normal TTL. If the threshold is None (due to errors
-    in the query or any other reason), we save it as 0 at the shortened TTL.
+    If the threshold is NaN, we save it as 0 at the normal TTL. If the threshold is `None` (due to
+    Snuba errors), we call the fallback method.
     """
     threshold = calculate_threshold(project)
-    ttl = DEFAULT_TTL
     if threshold is None:
-        logger.error(
-            "Velocity threshold couldn't be calculated, query returned nothing",
-            extra={"project_id": project.id},
-        )
-        threshold = 0
-        ttl = NONE_TTL
-    elif math.isnan(threshold):  # indicates there were no valid events to base the calculation
+        return fallback_to_stale_or_zero(threshold_key, stale_date_key, stale_threshold)
+    if math.isnan(threshold):
         threshold = 0
 
+    ttl = DEFAULT_TTL
     client = get_redis_client()
     with client.pipeline() as p:
         p.set(threshold_key, threshold, ex=ttl)
-        p.set(stale_date_key, convert_date_to_int(datetime.utcnow()), ex=ttl),
+        p.set(stale_date_key, str(datetime.utcnow()), ex=ttl)
         p.execute()
-
+    metrics.incr("issues.update_new_escalation_threshold", tags={"useFallback": False})
     return threshold
 
 
+def fallback_to_stale_or_zero(
+    threshold_key: str, stale_date_key: str, stale_threshold: Optional[float]
+) -> float:
+    """
+    Returns the backup threshold for when the current threshold can't be calculated. If we have a
+    stale threshold, its stale date in Redis is extended to make it usable for the next ten minutes,
+    while TTL is maintained. Otherwise, we save a value of 0 with a stale date and TTL of ten minutes
+    into the future, and return 0 (so issues in this project do not escalate during this time).
+    """
+    ttl = FALLBACK_TTL
+    # stale date = now - (how long a threshold stays valid) + (delay before we retry the Snuba query)
+    stale_date = (
+        datetime.utcnow()
+        - timedelta(seconds=TIME_TO_USE_EXISTING_THRESHOLD)
+        + timedelta(seconds=FALLBACK_TTL)
+    )
+    client = get_redis_client()
+    with client.pipeline() as p:
+        p.watch(threshold_key)
+        existing_ttl = p.ttl(threshold_key)  # get the ttl of the stale threshold
+        if stale_threshold is not None and isinstance(existing_ttl, int) and existing_ttl > 0:
+            ttl = existing_ttl
+        else:
+            # if the stale threshold doesn't exist, doesn't have an expiry, or is exactly expired
+            # in redis, don't use it; fall back to zero
+            stale_threshold = 0
+        p.multi()
+
+        if stale_threshold == 0:
+            p.set(threshold_key, stale_threshold, ex=ttl)
+        p.set(stale_date_key, str(stale_date), ex=ttl)
+        p.execute()
+    metrics.incr("issues.update_new_escalation_threshold", tags={"useFallback": True})
+    return stale_threshold
+
+
 def get_latest_threshold(project: Project) -> float:
     """
     Returns the most up-to-date threshold for the project, re-calculating if outdated or non-existent.
@@ -182,12 +219,27 @@ def get_latest_threshold(project: Project) -> float:
     keys = [
         THRESHOLD_KEY.format(project_id=project.id),
         STALE_DATE_KEY.format(project_id=project.id),
+        LEGACY_STALE_DATE_KEY.format(project_id=project.id),
     ]
     client = get_redis_client()
     cache_results = client.mget(keys)  # returns None if key is nonexistent
-    threshold, stale_date = cache_results[0], cache_results[1]
-    now = convert_date_to_int(datetime.utcnow())
-    if (stale_date and int(stale_date) < now) or stale_date is None or threshold is None:
+    threshold = cache_results[0]
+    stale_date = None
+    if cache_results[1] is not None:
+        stale_date = datetime.strptime(cache_results[1], STRING_TO_DATETIME)
+    elif cache_results[2] is not None:  # for backwards compatibility
+        # TODO(isabella): remove the legacy format once it is no longer being used
+        stale_date = datetime.strptime(cache_results[2], LEGACY_STRING_TO_DATETIME)
+        logger.info(
+            "issue_velocity.get_latest_threshold.legacy_date_format",
+            extra={"org_id": project.organization.id, "project_id": project.id},
+        )
+    now = datetime.utcnow()
+    if (
+        stale_date is None
+        or threshold is None
+        or (now - stale_date).total_seconds() > TIME_TO_USE_EXISTING_THRESHOLD
+    ):
         lock = locks.get(
             f"calculate_project_thresholds:{project.id}",
             duration=10,
@@ -195,11 +247,12 @@ def get_latest_threshold(project: Project) -> float:
         )
         try:
             with lock.acquire():
-                threshold = update_threshold(project, keys[0], keys[1])
-        except UnableToAcquireLock as error:  # another process is already updating
+                threshold = update_threshold(project, keys[0], keys[1], threshold)
+        except UnableToAcquireLock:  # another process is already updating
             logger.warning(
                 "issue_velocity.get_latest_threshold.unable_to_acquire_lock",
-                extra={"org_id": project.organization.id, "project_id": project.id, "error": error},
+                exc_info=True,
+                extra={"org_id": project.organization.id, "project_id": project.id},
             )
             threshold = float(threshold) if threshold else 0  # use stale value if possible
     else:
@@ -210,7 +263,3 @@ def get_latest_threshold(project: Project) -> float:
 
 def get_redis_client() -> RedisCluster | StrictRedis:
     return redis_clusters.get(settings.SENTRY_ESCALATION_THRESHOLDS_REDIS_CLUSTER)
-
-
-def convert_date_to_int(date: datetime) -> int:
-    return int(date.strftime(DATE_FORMAT))

+ 135 - 24
tests/sentry/issues/test_issue_velocity.py

@@ -5,24 +5,28 @@ from unittest.mock import patch
 from django.utils import timezone
 
 from sentry.issues.issue_velocity import (
-    DATE_FORMAT,
     DEFAULT_TTL,
-    NONE_TTL,
+    FALLBACK_TTL,
+    LEGACY_STALE_DATE_KEY,
     STALE_DATE_KEY,
+    STRING_TO_DATETIME,
     THRESHOLD_KEY,
+    TIME_TO_USE_EXISTING_THRESHOLD,
     calculate_threshold,
-    convert_date_to_int,
+    fallback_to_stale_or_zero,
     get_latest_threshold,
     get_redis_client,
     update_threshold,
 )
 from sentry.tasks.post_process import locks
 from sentry.testutils.cases import SnubaTestCase, TestCase
+from sentry.testutils.silo import region_silo_test
 from tests.sentry.issues.test_utils import SearchIssueTestMixin
 
 WEEK_IN_HOURS = 7 * 24
 
 
+@region_silo_test
 class IssueVelocityTests(TestCase, SnubaTestCase, SearchIssueTestMixin):
     def setUp(self):
         self.now = timezone.now()
@@ -150,9 +154,7 @@ class IssueVelocityTests(TestCase, SnubaTestCase, SearchIssueTestMixin):
         """
         redis_client = get_redis_client()
         redis_client.set(THRESHOLD_KEY.format(project_id=self.project.id), 0.1)
-        redis_client.set(
-            STALE_DATE_KEY.format(project_id=self.project.id), convert_date_to_int(self.utcnow)
-        )
+        redis_client.set(STALE_DATE_KEY.format(project_id=self.project.id), str(self.utcnow))
         threshold = get_latest_threshold(self.project)
         mock_update.assert_not_called()
         assert threshold == 0.1
@@ -166,7 +168,7 @@ class IssueVelocityTests(TestCase, SnubaTestCase, SearchIssueTestMixin):
         redis_client.set(THRESHOLD_KEY.format(project_id=self.project.id), 1.2)
         redis_client.set(
             STALE_DATE_KEY.format(project_id=self.project.id),
-            convert_date_to_int(self.utcnow - timedelta(days=1)),
+            str(self.utcnow - timedelta(days=1)),
         )
         mock_update.return_value = 1.5
         assert get_latest_threshold(self.project) == 1.5
@@ -188,7 +190,7 @@ class IssueVelocityTests(TestCase, SnubaTestCase, SearchIssueTestMixin):
         redis_client.set(THRESHOLD_KEY.format(project_id=self.project.id), 0.7)
         redis_client.set(
             STALE_DATE_KEY.format(project_id=self.project.id),
-            convert_date_to_int(self.utcnow - timedelta(days=1)),
+            str(self.utcnow - timedelta(days=1)),
         )
 
         lock = locks.get(
@@ -216,6 +218,21 @@ class IssueVelocityTests(TestCase, SnubaTestCase, SearchIssueTestMixin):
             mock_update.assert_not_called()
             assert threshold == 0
 
+    @patch("sentry.issues.issue_velocity.calculate_threshold", return_value=2)
+    def test_legacy_date_format_compatibility(self, mock_calculation):
+        """Tests that the logic does not break if a stale date was stored with the legacy format."""
+        redis_client = get_redis_client()
+        redis_client.set(THRESHOLD_KEY.format(project_id=self.project.id), 1)
+        redis_client.set(LEGACY_STALE_DATE_KEY.format(project_id=self.project.id), 20231220)
+        threshold = get_latest_threshold(self.project)
+        assert threshold == 2
+
+        # the legacy stale date key is not updated but the current version of the stale date key is
+        assert (
+            redis_client.get(LEGACY_STALE_DATE_KEY.format(project_id=self.project.id)) == "20231220"
+        )
+        assert redis_client.get(STALE_DATE_KEY.format(project_id=self.project.id)) is not None
+
     @patch("sentry.issues.issue_velocity.calculate_threshold")
     def test_update_threshold_simple(self, mock_calculation):
         """
@@ -225,26 +242,37 @@ class IssueVelocityTests(TestCase, SnubaTestCase, SearchIssueTestMixin):
         threshold = update_threshold(self.project.id, "threshold-key", "date-key")
         assert threshold == 5
         redis_client = get_redis_client()
-        assert redis_client.mget(["threshold-key", "date-key"]) == [
-            "5",
-            self.utcnow.strftime(DATE_FORMAT),
-        ]
+        assert redis_client.get("threshold-key") == "5"
+        stored_date = redis_client.get("date-key")
+        assert isinstance(stored_date, str)
+        # self.utcnow and the datetime.utcnow() used in the update method may differ by a few milliseconds, so we can't do a direct comparison
+        assert (
+            0
+            <= (datetime.strptime(stored_date, STRING_TO_DATETIME) - self.utcnow).total_seconds()
+            < 1
+        )
         assert redis_client.ttl("threshold-key") == DEFAULT_TTL
+        assert redis_client.ttl("date-key") == DEFAULT_TTL
+
+    @patch("sentry.issues.issue_velocity.calculate_threshold")
+    def test_update_threshold_with_stale(self, mock_calculation):
+        """
+        Tests that we return the stale threshold if the calculation method returns None.
+        """
+        mock_calculation.return_value = None
+        redis_client = get_redis_client()
+        redis_client.set("threshold-key", 0.5, ex=86400)
+
+        assert update_threshold(self.project, "threshold-key", "date-key", 0.5) == 0.5
 
     @patch("sentry.issues.issue_velocity.calculate_threshold")
     def test_update_threshold_none(self, mock_calculation):
         """
-        Tests that we return 0 and save a threshold for a shorter amount of time than the default
-        if the calculation returned None.
+        Tests that we return 0 if the calculation method returns None and we don't have a stale
+        threshold.
         """
         mock_calculation.return_value = None
         assert update_threshold(self.project, "threshold-key", "date-key") == 0
-        redis_client = get_redis_client()
-        assert redis_client.mget(["threshold-key", "date-key"]) == [
-            "0",
-            self.utcnow.strftime(DATE_FORMAT),
-        ]
-        assert redis_client.ttl("threshold-key") == NONE_TTL
 
     @patch("sentry.issues.issue_velocity.calculate_threshold")
     def test_update_threshold_nan(self, mock_calculation):
@@ -254,8 +282,91 @@ class IssueVelocityTests(TestCase, SnubaTestCase, SearchIssueTestMixin):
         mock_calculation.return_value = float("nan")
         assert update_threshold(self.project, "threshold-key", "date-key") == 0
         redis_client = get_redis_client()
-        assert redis_client.mget(["threshold-key", "date-key"]) == [
-            "0",
-            self.utcnow.strftime(DATE_FORMAT),
-        ]
+        assert redis_client.get("threshold-key") == "0"
+        stored_date = redis_client.get("date-key")
+        assert isinstance(stored_date, str)
+        assert (
+            0
+            <= (datetime.strptime(stored_date, STRING_TO_DATETIME) - self.utcnow).total_seconds()
+            < 1
+        )
         assert redis_client.ttl("threshold-key") == DEFAULT_TTL
+
+    def test_fallback_to_stale(self):
+        """
+        Tests that we return the stale threshold and maintain its TTL, and update the stale date to
+        make the threshold usable for the next ten minutes as a fallback.
+        """
+        redis_client = get_redis_client()
+        redis_client.set("threshold-key", 0.5, ex=86400)
+
+        assert fallback_to_stale_or_zero("threshold-key", "date-key", 0.5) == 0.5
+        assert redis_client.get("threshold-key") == "0.5"
+        stored_date = redis_client.get("date-key")
+        assert isinstance(stored_date, str)
+        assert (
+            0
+            <= (
+                datetime.strptime(stored_date, STRING_TO_DATETIME)
+                - (
+                    self.utcnow
+                    - timedelta(seconds=TIME_TO_USE_EXISTING_THRESHOLD)
+                    + timedelta(seconds=FALLBACK_TTL)
+                )
+            ).total_seconds()
+            < 1
+        )
+
+        assert redis_client.ttl("threshold-key") == 86400
+        assert redis_client.ttl("date-key") == 86400
+
+    def test_fallback_to_zero(self):
+        """
+        Tests that we return 0 and store it in Redis for the next ten minutes as a fallback if we
+        do not have a stale threshold.
+        """
+        assert fallback_to_stale_or_zero("threshold-key", "date-key", None) == 0
+        redis_client = get_redis_client()
+        assert redis_client.get("threshold-key") == "0"
+        stored_date = redis_client.get("date-key")
+        assert isinstance(stored_date, str)
+        assert (
+            0
+            <= (
+                datetime.strptime(stored_date, STRING_TO_DATETIME)
+                - (
+                    self.utcnow
+                    - timedelta(seconds=TIME_TO_USE_EXISTING_THRESHOLD)
+                    + timedelta(seconds=FALLBACK_TTL)
+                )
+            ).total_seconds()
+            < 1
+        )
+        assert redis_client.ttl("threshold-key") == FALLBACK_TTL
+        assert redis_client.ttl("date-key") == FALLBACK_TTL
+
+    def test_fallback_to_stale_zero_ttl(self):
+        """
+        Tests that we return 0 and store it in Redis for the next ten minutes as a fallback if our
+        stale threshold has a TTL <= 0.
+        """
+        redis_client = get_redis_client()
+        assert fallback_to_stale_or_zero("threshold-key", "date-key", 0.5) == 0
+        assert redis_client.get("threshold-key") == "0"
+        stored_date = redis_client.get("date-key")
+        assert isinstance(stored_date, str)
+        assert (
+            0
+            <= (
+                datetime.strptime(stored_date, STRING_TO_DATETIME)
+                - (
+                    self.utcnow
+                    - timedelta(seconds=TIME_TO_USE_EXISTING_THRESHOLD)
+                    + timedelta(seconds=FALLBACK_TTL)
+                )
+            ).total_seconds()
+            < 1
+        )
+
+        assert redis_client.ttl("threshold-key") == FALLBACK_TTL
+        assert redis_client.ttl("date-key") == FALLBACK_TTL