Browse Source

ref(seer grouping): Use new CircuitBreaker class, take 2 (#74898)

This is a redo of https://github.com/getsentry/sentry/pull/74563 (which got reverted), with the difference that the circuit breaker is now reinstantiated before each use, rather than being instantiated once at the module level. (The problem with that setup was that the module turns out to initialize before our database connections are established, meaning we only get the hardcoded default config rather than the one provided by the options automator.)
Katie Byers 7 months ago
parent
commit
2d46de8ac0

+ 1 - 0
src/sentry/conf/server.py

@@ -3435,6 +3435,7 @@ SEER_PROJECT_GROUPING_RECORDS_DELETE_URL = (
 SEER_HASH_GROUPING_RECORDS_DELETE_URL = (
     f"/{SEER_SIMILARITY_MODEL_VERSION}/issues/similar-issues/grouping-record/delete-by-hash"
 )
+SEER_SIMILARITY_CIRCUIT_BREAKER_KEY = "seer.similarity"
 
 SIMILARITY_BACKFILL_COHORT_MAP: dict[str, list[int]] = {}
 

+ 9 - 31
src/sentry/event_manager.py

@@ -129,9 +129,7 @@ from sentry.utils.cache import cache_key_for_event
 from sentry.utils.circuit_breaker import (
     ERROR_COUNT_CACHE_KEY,
     CircuitBreakerPassthrough,
-    CircuitBreakerTripped,
     circuit_breaker_activated,
-    with_circuit_breaker,
 )
 from sentry.utils.dates import to_datetime
 from sentry.utils.event import has_event_minified_stack_trace, has_stacktrace, is_handled
@@ -1538,13 +1536,18 @@ def _save_aggregate(
                 seer_matched_group = None
 
                 if should_call_seer_for_grouping(event, primary_hashes):
+                    metrics.incr(
+                        "grouping.similarity.did_call_seer",
+                        # TODO: Consider lowering this (in all the spots this metric is
+                        # collected) once we roll Seer grouping out more widely
+                        sample_rate=1.0,
+                        tags={"call_made": True, "blocker": "none"},
+                    )
                     try:
                         # If the `projects:similarity-embeddings-grouping` feature is disabled,
                         # we'll still get back result metadata, but `seer_matched_group` will be None
-                        seer_response_data, seer_matched_group = with_circuit_breaker(
-                            "event_manager.get_seer_similar_issues",
-                            lambda: get_seer_similar_issues(event, primary_hashes),
-                            options.get("seer.similarity.circuit-breaker-config"),
+                        seer_response_data, seer_matched_group = get_seer_similar_issues(
+                            event, primary_hashes
                         )
                         event.data["seer_similarity"] = seer_response_data
 
@@ -1555,33 +1558,8 @@ def _save_aggregate(
                                 "seer_similarity"
                             ] = seer_response_data
 
-                        metrics.incr(
-                            "grouping.similarity.did_call_seer",
-                            # TODO: Consider lowering this (in all the spots this metric is
-                            # collected) once we roll Seer grouping out more widely
-                            sample_rate=1.0,
-                            tags={"call_made": True, "blocker": "none"},
-                        )
-
-                    except CircuitBreakerTripped:
-                        # TODO: Do we want to include all of the conditions which cause us to log a
-                        # `grouping.similarity.seer_call_blocked` metric (here and in
-                        # `should_call_seer_for_grouping`) under a single outcome tag on the span
-                        # and timer metric below and in `record_calculation_metric_with_result`
-                        # (also below)? Right now they just fall into the `new_group` bucket.
-                        metrics.incr(
-                            "grouping.similarity.did_call_seer",
-                            sample_rate=1.0,
-                            tags={"call_made": False, "blocker": "circuit-breaker"},
-                        )
-
                     # Insurance - in theory we shouldn't ever land here
                     except Exception as e:
-                        metrics.incr(
-                            "grouping.similarity.did_call_seer",
-                            sample_rate=1.0,
-                            tags={"call_made": True, "blocker": "none"},
-                        )
                         sentry_sdk.capture_exception(
                             e, tags={"event": event.event_id, "project": project.id}
                         )

+ 34 - 6
src/sentry/grouping/ingest/seer.py

@@ -1,6 +1,8 @@
 import logging
 from dataclasses import asdict
 
+from django.conf import settings
+
 from sentry import features, options
 from sentry import ratelimits as ratelimiter
 from sentry.eventstore.models import Event
@@ -17,6 +19,7 @@ from sentry.seer.similarity.utils import (
     killswitch_enabled,
 )
 from sentry.utils import metrics
+from sentry.utils.circuit_breaker2 import CircuitBreaker
 from sentry.utils.safe import get_path
 
 logger = logging.getLogger("sentry.events.grouping")
@@ -45,12 +48,11 @@ def should_call_seer_for_grouping(event: Event, primary_hashes: CalculatedHashes
     # (Checking the rate limit for calling Seer also increments the counter of how many times we've
     # tried to call it, and if we fail any of the other checks, it shouldn't count as an attempt.
     # Thus we only want to run the rate limit check if every other check has already succeeded.)
-    #
-    # Note: The circuit breaker check which might naturally be here alongside its killswitch
-    # and rate limiting friends instead happens in the `with_circuit_breaker` helper used where
-    # `get_seer_similar_issues` is actually called. (It has to be there in order for it to track
-    # errors arising from that call.)
-    if killswitch_enabled(project.id, event) or _ratelimiting_enabled(event, project):
+    if (
+        killswitch_enabled(project.id, event)
+        or _circuit_breaker_broken(event, project)
+        or _ratelimiting_enabled(event, project)
+    ):
         return False
 
     return True
@@ -155,6 +157,32 @@ def _ratelimiting_enabled(event: Event, project: Project) -> bool:
     return False
 
 
+def _circuit_breaker_broken(event: Event, project: Project) -> bool:
+    breaker_config = options.get("seer.similarity.circuit-breaker-config")
+    circuit_breaker = CircuitBreaker(settings.SEER_SIMILARITY_CIRCUIT_BREAKER_KEY, breaker_config)
+    circuit_broken = not circuit_breaker.should_allow_request()
+
+    if circuit_broken:
+        logger.warning(
+            "should_call_seer_for_grouping.broken_circuit_breaker",
+            extra={
+                "event_id": event.event_id,
+                "project_id": project.id,
+                **breaker_config,
+            },
+        )
+        metrics.incr(
+            "grouping.similarity.broken_circuit_breaker",
+        )
+        metrics.incr(
+            "grouping.similarity.did_call_seer",
+            sample_rate=1.0,
+            tags={"call_made": False, "blocker": "circuit-breaker"},
+        )
+
+    return circuit_broken
+
+
 def get_seer_similar_issues(
     event: Event,
     primary_hashes: CalculatedHashes,

+ 12 - 3
src/sentry/options/defaults.py

@@ -892,12 +892,21 @@ register(
     flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
 )
 
+# TODO: The default error limit here was estimated based on EA traffic. (In an average 10 min
+# period, there are roughly 35K events without matching hashes. About 2% of orgs are EA, so for
+# simplicity, assume 2% of those events are from EA orgs. If we're willing to tolerate up to a 95%
+# failure rate, then we need 35K * 0.02 * 0.95 events to fail to trip the breaker.)
+#
+# When we GA, we should multiply both the limits by 50 (to remove the 2% part of the current
+# calculation), and remove this TODO.
 register(
     "seer.similarity.circuit-breaker-config",
     type=Dict,
-    # TODO: For now we're using the defaults for everything but `allow_passthrough`. We may want to
-    # revisit that choice in the future.
-    default={"allow_passthrough": True},
+    default={
+        "error_limit": 666,
+        "error_limit_window": 600,  # 10 min
+        "broken_state_duration": 300,  # 5 min
+    },
     flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
 )
 

+ 16 - 1
src/sentry/seer/similarity/similar_issues.py

@@ -3,7 +3,12 @@ import logging
 from django.conf import settings
 from urllib3.exceptions import MaxRetryError, TimeoutError
 
-from sentry.conf.server import SEER_MAX_GROUPING_DISTANCE, SEER_SIMILAR_ISSUES_URL
+from sentry import options
+from sentry.conf.server import (
+    SEER_MAX_GROUPING_DISTANCE,
+    SEER_SIMILAR_ISSUES_URL,
+    SEER_SIMILARITY_CIRCUIT_BREAKER_KEY,
+)
 from sentry.models.grouphash import GroupHash
 from sentry.net.http import connection_from_url
 from sentry.seer.signed_seer_api import make_signed_seer_api_request
@@ -15,6 +20,7 @@ from sentry.seer.similarity.types import (
 )
 from sentry.tasks.delete_seer_grouping_records import delete_seer_grouping_records_by_hash
 from sentry.utils import json, metrics
+from sentry.utils.circuit_breaker2 import CircuitBreaker
 from sentry.utils.json import JSONDecodeError, apply_key_filter
 
 logger = logging.getLogger(__name__)
@@ -95,6 +101,11 @@ def get_similarity_data_from_seer(
         )
         return []
 
+    circuit_breaker = CircuitBreaker(
+        SEER_SIMILARITY_CIRCUIT_BREAKER_KEY,
+        options.get("seer.similarity.circuit-breaker-config"),
+    )
+
     try:
         response = make_signed_seer_api_request(
             seer_grouping_connection_pool,
@@ -111,6 +122,7 @@ def get_similarity_data_from_seer(
             sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
             tags={**metric_tags, "outcome": "error", "error": type(e).__name__},
         )
+        circuit_breaker.record_error()
         return []
 
     metric_tags["response_status"] = response.status
@@ -137,6 +149,9 @@ def get_similarity_data_from_seer(
             },
         )
 
+        if response.status >= 500:
+            circuit_breaker.record_error()
+
         return []
 
     try:

+ 0 - 18
tests/sentry/event_manager/grouping/test_seer_grouping.py

@@ -10,7 +10,6 @@ from sentry.testutils.helpers import Feature
 from sentry.testutils.helpers.eventprocessing import save_new_event
 from sentry.testutils.helpers.features import with_feature
 from sentry.testutils.pytest.mocking import capture_results
-from sentry.utils.circuit_breaker import with_circuit_breaker
 from sentry.utils.types import NonNone
 
 
@@ -85,23 +84,6 @@ class SeerEventManagerGroupingTest(TestCase):
                 assert get_seer_similar_issues_return_values[0][1] == existing_event.group
                 assert new_event.group_id == existing_event.group_id
 
-    @patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True)
-    @patch("sentry.event_manager.with_circuit_breaker", wraps=with_circuit_breaker)
-    @patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None))
-    def test_obeys_circult_breaker(
-        self, mock_get_seer_similar_issues: MagicMock, mock_with_circuit_breaker: MagicMock, _
-    ):
-        with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=True):
-            save_new_event({"message": "Dogs are great!"}, self.project)
-            assert mock_with_circuit_breaker.call_count == 1
-            assert mock_get_seer_similar_issues.call_count == 1
-
-        with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=False):
-            save_new_event({"message": "Adopt don't shop"}, self.project)
-
-            assert mock_with_circuit_breaker.call_count == 2  # increased
-            assert mock_get_seer_similar_issues.call_count == 1  # didn't increase
-
     @patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True)
     @patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None))
     def test_calls_seer_if_no_group_found(self, mock_get_seer_similar_issues: MagicMock, _):

+ 12 - 0
tests/sentry/grouping/ingest/test_seer.py

@@ -121,6 +121,18 @@ class ShouldCallSeerTest(TestCase):
                     is expected_result
                 )
 
+    @with_feature("projects:similarity-embeddings-grouping")
+    def test_obeys_circuit_breaker(self):
+        for request_allowed, expected_result in [(True, True), (False, False)]:
+            with patch(
+                "sentry.grouping.ingest.seer.CircuitBreaker.should_allow_request",
+                return_value=request_allowed,
+            ):
+                assert (
+                    should_call_seer_for_grouping(self.event, self.primary_hashes)
+                    is expected_result
+                )
+
     @with_feature("projects:similarity-embeddings-grouping")
     def test_obeys_customized_fingerprint_check(self):
         default_fingerprint_event = Event(

+ 50 - 15
tests/sentry/seer/similarity/test_similar_issues.py

@@ -96,9 +96,15 @@ class GetSimilarityDataFromSeerTest(TestCase):
             tags={"response_status": 200, "outcome": "no_similar_groups"},
         )
 
+    @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
     @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
     @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
-    def test_bad_response_data(self, mock_seer_request: MagicMock, mock_metrics_incr: MagicMock):
+    def test_bad_response_data(
+        self,
+        mock_seer_request: MagicMock,
+        mock_metrics_incr: MagicMock,
+        mock_record_circuit_breaker_error: MagicMock,
+    ):
         cases: list[tuple[Any, str]] = [
             (None, "AttributeError"),
             ([], "AttributeError"),
@@ -139,14 +145,20 @@ class GetSimilarityDataFromSeerTest(TestCase):
                 sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
                 tags={"response_status": 200, "outcome": "error", "error": expected_error},
             )
+            assert mock_record_circuit_breaker_error.call_count == 0
 
             mock_metrics_incr.reset_mock()
 
+    @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
     @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
     @mock.patch("sentry.seer.similarity.similar_issues.logger")
     @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
     def test_redirect(
-        self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock
+        self,
+        mock_seer_request: MagicMock,
+        mock_logger: MagicMock,
+        mock_metrics_incr: MagicMock,
+        mock_record_circuit_breaker_error: MagicMock,
     ):
         mock_seer_request.return_value = HTTPResponse(
             status=308, headers={"location": "/new/and/improved/endpoint/"}
@@ -161,12 +173,18 @@ class GetSimilarityDataFromSeerTest(TestCase):
             sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
             tags={"response_status": 308, "outcome": "error", "error": "Redirect"},
         )
+        assert mock_record_circuit_breaker_error.call_count == 0
 
+    @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
     @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
     @mock.patch("sentry.seer.similarity.similar_issues.logger")
     @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
     def test_request_error(
-        self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock
+        self,
+        mock_seer_request: MagicMock,
+        mock_logger: MagicMock,
+        mock_metrics_incr: MagicMock,
+        mock_record_circuit_breaker_error: MagicMock,
     ):
         for request_error, expected_error_tag in [
             (TimeoutError, "TimeoutError"),
@@ -192,25 +210,42 @@ class GetSimilarityDataFromSeerTest(TestCase):
                 sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
                 tags={"outcome": "error", "error": expected_error_tag},
             )
+            assert mock_record_circuit_breaker_error.call_count == 1
+
+            mock_logger.warning.reset_mock()
+            mock_metrics_incr.reset_mock()
+            mock_record_circuit_breaker_error.reset_mock()
 
+    @mock.patch("sentry.grouping.ingest.seer.CircuitBreaker.record_error")
     @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
     @mock.patch("sentry.seer.similarity.similar_issues.logger")
     @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
     def test_error_status(
-        self, mock_seer_request: MagicMock, mock_logger: MagicMock, mock_metrics_incr: MagicMock
+        self,
+        mock_seer_request: MagicMock,
+        mock_logger: MagicMock,
+        mock_metrics_incr: MagicMock,
+        mock_record_circuit_breaker_error: MagicMock,
     ):
-        mock_seer_request.return_value = HTTPResponse("No soup for you", status=403)
+        for response, status, counts_for_circuit_breaker in [
+            ("No soup for you", 403, False),
+            ("No soup, period", 500, True),
+        ]:
+            mock_seer_request.return_value = HTTPResponse(response, status=status)
 
-        assert get_similarity_data_from_seer(self.request_params) == []
-        mock_logger.error.assert_called_with(
-            f"Received 403 when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.",
-            extra={"response_data": "No soup for you"},
-        )
-        mock_metrics_incr.assert_any_call(
-            "seer.similar_issues_request",
-            sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
-            tags={"response_status": 403, "outcome": "error", "error": "RequestError"},
-        )
+            assert get_similarity_data_from_seer(self.request_params) == []
+            mock_logger.error.assert_called_with(
+                f"Received {status} when calling Seer endpoint {SEER_SIMILAR_ISSUES_URL}.",
+                extra={"response_data": response},
+            )
+            mock_metrics_incr.assert_any_call(
+                "seer.similar_issues_request",
+                sample_rate=SIMILARITY_REQUEST_METRIC_SAMPLE_RATE,
+                tags={"response_status": status, "outcome": "error", "error": "RequestError"},
+            )
+            assert mock_record_circuit_breaker_error.call_count == (
+                1 if counts_for_circuit_breaker else 0
+            )
 
     @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
     def test_returns_sorted_results(self, mock_seer_request: MagicMock):