Browse Source

fix(metrics) Prevent negative crash_free_rate value to be returned to the client (#73353)

Sometimes due to the problems in other parts of the system, data gets
corrupted and `crash_free_rate` is returned as negative value (which
should never happen)

This solution guards against corrupted data by returning 0 if the
calculated `crash_free_rate` is less than 0

Fixes: https://github.com/getsentry/sentry/issues/73172

---------

Signed-off-by: Vjeran Grozdanic <vjerangrozdanic@sentry.io>
Vjeran Grozdanić 3 days ago
parent
commit
b2cd1452ec

+ 4 - 0
src/sentry/api/endpoints/organization_release_health_data.py

@@ -11,6 +11,7 @@ from sentry.exceptions import InvalidParams
 from sentry.sentry_metrics.use_case_utils import get_use_case_id
 from sentry.snuba.metrics import DerivedMetricException, QueryDefinition, get_series
 from sentry.types.ratelimit import RateLimit, RateLimitCategory
+from sentry.utils import metrics
 from sentry.utils.cursors import Cursor, CursorResult
 
 
@@ -59,6 +60,9 @@ class OrganizationReleaseHealthDataEndpoint(OrganizationEndpoint):
                     use_case_id=get_use_case_id(request),
                     tenant_ids={"organization_id": organization.id},
                 )
+                # due to possible data corruption crash_free_rate value can be less than 0,
+                # which is not valid behavior, so those values have to be bottom capped at 0
+                metrics.ensure_non_negative_crash_free_rate_value(data, request, organization)
                 data["query"] = query.query
             except (
                 InvalidParams,

+ 57 - 0
src/sentry/utils/metrics.py

@@ -11,7 +11,9 @@ from random import random
 from threading import Thread
 from typing import Any, TypeVar
 
+import sentry_sdk
 from django.conf import settings
+from rest_framework.request import Request
 
 from sentry.metrics.base import MetricsBackend, MutableTags, Tags
 from sentry.metrics.middleware import MiddlewareWrapper, add_global_tags, global_tags
@@ -28,6 +30,7 @@ __all__ = [
     "gauge",
     "backend",
     "MutableTags",
+    "ensure_non_negative_crash_free_rate_value",
 ]
 
 
@@ -259,3 +262,57 @@ def event(
     except Exception:
         logger = logging.getLogger("sentry.errors")
         logger.exception("Unable to record backend metric")
+
+
+def ensure_non_negative_crash_free_rate_value(
+    data: Any, request: Request, organization, CRASH_FREE_RATE_METRIC_KEY="session.crash_free_rate"
+):
+    """
+    Ensures that crash_free_rate metric will never have negative
+    value returned to the customer by replacing all the negative values with 0.
+    Negative value of crash_free_metric can happen due to the
+    corrupted data that is used to calculate the metric
+    (see: https://github.com/getsentry/sentry/issues/73172)
+
+    Example format of data argument:
+    {
+        ...
+        "groups" : [
+            ...
+            "series": {..., "session.crash_free_rate": [..., None, 0.35]},
+            "totals": {..., "session.crash_free_rate": 0.35}
+        ]
+    }
+    """
+    groups = data["groups"]
+    for group in groups:
+        if "series" in group:
+            series = group["series"]
+            if CRASH_FREE_RATE_METRIC_KEY in series:
+                for i, value in enumerate(series[CRASH_FREE_RATE_METRIC_KEY]):
+                    try:
+                        value = float(value)
+                        if value < 0:
+                            with sentry_sdk.push_scope() as scope:
+                                scope.set_tag("organization", organization.id)
+                                scope.set_extra("crash_free_rate_in_series", value)
+                                scope.set_extra("request_query_params", request.query_params)
+                                sentry_sdk.capture_message("crash_free_rate in series is negative")
+                            series[CRASH_FREE_RATE_METRIC_KEY][i] = 0
+                    except TypeError:
+                        # value is not a number
+                        continue
+
+        if "totals" in group:
+            totals = group["totals"]
+            if (
+                CRASH_FREE_RATE_METRIC_KEY in totals
+                and totals[CRASH_FREE_RATE_METRIC_KEY] is not None
+                and totals[CRASH_FREE_RATE_METRIC_KEY] < 0
+            ):
+                with sentry_sdk.push_scope() as scope:
+                    scope.set_tag("organization", organization.id)
+                    scope.set_extra("crash_free_rate", totals[CRASH_FREE_RATE_METRIC_KEY])
+                    scope.set_extra("request_query_params", request.query_params)
+                    sentry_sdk.capture_message("crash_free_rate is negative")
+                totals[CRASH_FREE_RATE_METRIC_KEY] = 0

+ 56 - 0
tests/sentry/api/endpoints/test_organization_release_health_data.py

@@ -2413,3 +2413,59 @@ class DerivedMetricsDataTest(MetricsAPIBaseTestCase):
             "totals": {"p50(session.duration)": 6.0},
             "series": {"p50(session.duration)": [6.0]},
         }
+
+    def test_do_not_return_negative_crash_free_rate_value_to_the_customer(self):
+        """
+        Bug: https://github.com/getsentry/sentry/issues/73172
+        Assert that negative value is never returned to the user, even
+        in case when there is a problem with the data, instead of negative value we return 0.
+        This problem happens during ingestion when 'e:session/crashed' metric is
+        ingested, but 'e:session/all' for some reason is not. That would cause
+        crash_free_rate value to be negative since it is calculated as:
+
+        crash_free_rate = 1 - (count('e:session/crashed') / count('e:session/all'))
+        """
+        # ingesting 'e:session/all' and 'e:session/crashed'
+        self.build_and_store_session(
+            project_id=self.project.id,
+            minutes_before_now=1,
+            status="crashed",
+        )
+
+        # manually ingesting only 'e:session/crashed' metric
+        # to make sure that there are more 'e:session/crashed' metrics ingested
+        # than 'e:session/all'
+        for i in range(2):
+            session = self.build_session(
+                started=self.adjust_timestamp(
+                    self.now
+                    - timedelta(
+                        minutes=i,
+                    )
+                ).timestamp()
+            )
+            # ingesting only 'e:session/crashed'
+            self.store_metric(
+                self.organization.id,
+                self.project.id,
+                "counter",
+                SessionMRI.RAW_SESSION.value,
+                {"session.status": "crashed"},
+                int(session["started"]),
+                +1,
+                use_case_id=UseCaseID.SESSIONS,
+            )
+
+        response = self.get_success_response(
+            self.organization.slug,
+            field=["session.crash_free_rate", "session.all", "session.crashed"],
+            statsPeriod="6m",
+            interval="1m",
+        )
+
+        group = response.data["groups"][0]
+        assert group["totals"]["session.all"] == 1.0
+        assert group["totals"]["session.crashed"] == 3.0
+        assert group["totals"]["session.crash_free_rate"] == 0.0
+        for value in group["series"]["session.crash_free_rate"]:
+            assert value is None or value >= 0