Browse Source

feat(perf-issues): Add tag and metric before slow db issue creation (#42985)

Ash Anand 2 years ago
parent
commit
4a4c252e8a

+ 1 - 1
fixtures/events/performance_problems/n-plus-one-in-django-index-view.json

@@ -168,7 +168,7 @@
       "same_process_as_parent": true
     },
     {
-      "timestamp": 1661959871.418989,
+      "timestamp": 1661957871.249481,
       "start_timestamp": 1661957871.249481,
       "exclusive_time": 169.507981,
       "description": "\n                SELECT VERSION(),\n                       @@sql_mode,\n                       @@default_storage_engine,\n                       @@sql_auto_is_null,\n                       @@lower_case_table_names,\n                       CONVERT_TZ('2001-01-01 01:00:00', 'UTC', 'UTC') IS NOT NULL\n            ",

+ 73 - 0
fixtures/events/performance_problems/slow-db-spans.json

@@ -0,0 +1,73 @@
+{
+  "event_id": "ab095b616af4459098d29ade067108d6",
+  "project": 82372,
+  "release": "16.1.6",
+  "dist": null,
+  "platform": "ruby",
+  "message": "",
+  "datetime": "2022-11-23T20:40:03.938939+00:00",
+  "_metrics": {"bytes.ingested.event": 7876, "bytes.stored.event": 9222},
+  "culprit": "/pokedex/",
+  "environment": "production",
+  "location": "/pokedex/",
+  "contexts": {
+    "trace": {
+      "trace_id": "10d0b72df0fe4392a6788bce71ec2028",
+      "span_id": "1756e116945a4360",
+      "parent_span_id": "d71f841b69164c33",
+      "op": "http.server",
+      "status": "ok",
+      "type": "trace"
+    }
+  },
+  "spans": [
+    {
+      "timestamp": 1669236003.406264,
+      "start_timestamp": 1669236000.221315,
+      "exclusive_time": 3184.949159,
+      "description": "SELECT pokemon FROM pokedex",
+      "op": "db",
+      "span_id": "481bffa126c31edd",
+      "parent_span_id": "877546cc52754d21",
+      "trace_id": "3aa614f2d9a44204bd737312244e70fa",
+      "tags": {},
+      "data": {"connection_id": null},
+      "hash": "c6cca113a59f8486"
+    },
+    {
+      "timestamp": 1669236003.937326,
+      "start_timestamp": 1669236003.406517,
+      "exclusive_time": 530.808926,
+      "description": "SELECT pokemon FROM pokedex WHERE pokemon.name = 'pikachu'",
+      "op": "db",
+      "span_id": "a8d0ffadce91d103",
+      "parent_span_id": "877546cc52754d21",
+      "trace_id": "3aa614f2d9a44204bd737312244e70fa",
+      "tags": {},
+      "data": {"connection_id": null},
+      "hash": "95fb0da5a66a4936"
+    },
+    {
+      "timestamp": 1669236003.938425,
+      "start_timestamp": 1669236003.937986,
+      "exclusive_time": 0.439167,
+      "description": "do something in redis idk",
+      "op": "db.redis.command",
+      "span_id": "2949ee0f744ca1e3",
+      "parent_span_id": "877546cc52754d21",
+      "trace_id": "3aa614f2d9a44204bd737312244e70fa",
+      "tags": {},
+      "data": {
+        "server": "some-server:4200"
+      },
+      "hash": "ac2e1279e1642d6f"
+    }
+  ],
+  "start_timestamp": 1669236000.211091,
+  "timestamp": 1669236003.938939,
+  "title": "/pokedex/",
+  "transaction": "/pokedex/",
+  "transaction_info": {"source": "unknown"},
+  "type": "transaction",
+  "version": "7"
+}

+ 39 - 12
src/sentry/event_manager.py

@@ -2312,6 +2312,7 @@ def _save_aggregate_performance(jobs: Sequence[PerformanceJob], projects: Projec
 
                     new_grouphashes = new_grouphashes - groups_to_ignore
 
+                delete_slow_span_group_hashes(new_grouphashes, performance_problems)
                 new_grouphashes_count = len(new_grouphashes)
 
                 with metrics.timer("performance.performance_issue.check_write_limits"):
@@ -2421,19 +2422,45 @@ def _save_aggregate_performance(jobs: Sequence[PerformanceJob], projects: Projec
 
 @metrics.wraps("performance.performance_issue.should_create_group", sample_rate=1.0)
 def should_create_group(client: Any, grouphash: str, type: GroupType) -> bool:
-    times_seen = client.incr(f"grouphash:{grouphash}")
-    metrics.incr(
-        "performance.performance_issue.grouphash_counted",
-        tags={"times_seen": times_seen, "group_type": GROUP_TYPE_TO_TEXT.get(type, "Unknown Type")},
-        sample_rate=1.0,
-    )
+    with sentry_sdk.start_span(op="event_manager.should_create_group") as span:
+        times_seen = client.incr(f"grouphash:{grouphash}")
+        metrics.incr(
+            "performance.performance_issue.grouphash_counted",
+            tags={
+                "times_seen": times_seen,
+                "group_type": GROUP_TYPE_TO_TEXT.get(type, "Unknown Type"),
+            },
+            sample_rate=1.0,
+        )
 
-    if times_seen >= GROUPHASH_IGNORE_LIMIT_MAP.get(type, DEFAULT_GROUPHASH_IGNORE_LIMIT):
-        client.delete(grouphash)
-        return True
-    else:
-        client.expire(grouphash, 60 * 60 * 24)  # 24 hour expiration from last seen
-        return False
+        if times_seen >= GROUPHASH_IGNORE_LIMIT_MAP.get(type, DEFAULT_GROUPHASH_IGNORE_LIMIT):
+            client.delete(grouphash)
+            metrics.incr(
+                "performance.performance_issue.issue_will_be_created",
+                tags={"group_type": type.name},
+                sample_rate=1.0,
+            )
+
+            # TODO: Experimental tag to indicate when a Slow DB Issue would have been created.
+            # This is in place to confirm whether the above rate limit is effective in preventing spikes
+            # and inaccuracies.
+            if type == GroupType.PERFORMANCE_SLOW_SPAN:
+                if span.containing_transaction:
+                    span.containing_transaction.set_tag("_will_create_slow_db_issue", "true")
+                return False
+
+            return True
+        else:
+            client.expire(grouphash, 60 * 60 * 24)  # 24 hour expiration from last seen
+            return False
+
+
+def delete_slow_span_group_hashes(
+    group_hashes: set[str], performance_problems: Sequence[PerformanceProblem]
+) -> None:
+    for problem in performance_problems:
+        if problem.type == GroupType.PERFORMANCE_SLOW_SPAN and problem.fingerprint in group_hashes:
+            group_hashes.remove(problem.fingerprint)
 
 
 @metrics.wraps("event_manager.save_transaction_events")

+ 17 - 0
src/sentry/utils/performance_issues/performance_detection.py

@@ -79,6 +79,11 @@ DETECTOR_TYPE_ISSUE_CREATION_TO_SYSTEM_OPTION = {
     DetectorType.N_PLUS_ONE_DB_QUERIES_EXTENDED: "performance.issues.n_plus_one_db_ext.problem-creation",
     DetectorType.CONSECUTIVE_DB_OP: "performance.issues.consecutive_db.problem-creation",
     DetectorType.N_PLUS_ONE_API_CALLS: "performance.issues.n_plus_one_api_calls.problem-creation",
+    # NOTE: Slow Span issues are not allowed to be created yet. The addition of this line is temporary so that we can
+    # record some metrics for issues of this type that *should* be created; we won't actually create any of these issues
+    # at the moment. This is handled within `event_manager.py` before the issue gets created.
+    # TODO: Remove this once we've verified that only high-quality issues would be created, and not during event spikes.
+    DetectorType.SLOW_SPAN: "performance.issues.slow_span.problem-creation",
 }
 
 
@@ -532,6 +537,18 @@ class SlowSpanDetector(PerformanceDetector):
                 offender_span_ids=spans_involved,
             )
 
+    # TODO: Temporarily returns True, but issues of this type will not actually be created yet.
+    def is_creation_allowed_for_organization(self, organization: Optional[Organization]) -> bool:
+        return True
+
+    # TODO: Temporarily returns True, but issues of this type will not actually be created yet.
+    def is_creation_allowed_for_project(self, project: Optional[Project]) -> bool:
+        return True
+
+    # TODO: Temporarily returns True, but issues of this type will not actually be created yet.
+    def is_creation_allowed_for_system(self) -> bool:
+        return True
+
     @classmethod
     def is_span_eligible(cls, span: Span) -> bool:
         description = span.get("description", None)

+ 2 - 2
tests/sentry/api/endpoints/test_group_tagkey_details.py

@@ -44,7 +44,7 @@ class GroupTagDetailsTest(APITestCase, SnubaTestCase):
                 "start_timestamp": iso_format(before_now(minutes=1)),
                 "tags": {"foo": "bar", "biz": "baz"},
                 "release": "releaseme",
-                "fingerprint": [f"{GroupType.PERFORMANCE_SLOW_SPAN.value}-group1"],
+                "fingerprint": [f"{GroupType.PERFORMANCE_RENDER_BLOCKING_ASSET_SPAN.value}-group1"],
             },
             project_id=self.project.id,
         )
@@ -56,7 +56,7 @@ class GroupTagDetailsTest(APITestCase, SnubaTestCase):
                 "start_timestamp": iso_format(before_now(minutes=2)),
                 "tags": {"foo": "quux"},
                 "release": "releaseme",
-                "fingerprint": [f"{GroupType.PERFORMANCE_SLOW_SPAN.value}-group1"],
+                "fingerprint": [f"{GroupType.PERFORMANCE_RENDER_BLOCKING_ASSET_SPAN.value}-group1"],
             },
             project_id=self.project.id,
         )

+ 1 - 1
tests/sentry/api/endpoints/test_group_tagkey_values.py

@@ -45,7 +45,7 @@ class GroupTagKeyValuesTest(APITestCase, SnubaTestCase):
                 "timestamp": iso_format(before_now(minutes=1)),
                 "start_timestamp": iso_format(before_now(minutes=1)),
                 "tags": {key: value},
-                "fingerprint": [f"{GroupType.PERFORMANCE_SLOW_SPAN.value}-group1"],
+                "fingerprint": [f"{GroupType.PERFORMANCE_RENDER_BLOCKING_ASSET_SPAN.value}-group1"],
             },
             project_id=self.project.id,
         )

+ 2 - 2
tests/sentry/api/endpoints/test_group_tags.py

@@ -86,7 +86,7 @@ class GroupTagsTest(APITestCase, SnubaTestCase):
                 "start_timestamp": iso_format(before_now(minutes=1)),
                 "tags": {"foo": "bar", "biz": "baz"},
                 "release": "releaseme",
-                "fingerprint": [f"{GroupType.PERFORMANCE_SLOW_SPAN.value}-group1"],
+                "fingerprint": [f"{GroupType.PERFORMANCE_RENDER_BLOCKING_ASSET_SPAN.value}-group1"],
             },
             project_id=self.project.id,
         )
@@ -98,7 +98,7 @@ class GroupTagsTest(APITestCase, SnubaTestCase):
                 "start_timestamp": iso_format(before_now(minutes=2)),
                 "tags": {"foo": "quux"},
                 "release": "releaseme",
-                "fingerprint": [f"{GroupType.PERFORMANCE_SLOW_SPAN.value}-group1"],
+                "fingerprint": [f"{GroupType.PERFORMANCE_RENDER_BLOCKING_ASSET_SPAN.value}-group1"],
             },
             project_id=self.project.id,
         )

+ 43 - 0
tests/sentry/event_manager/test_event_manager.py

@@ -2435,6 +2435,49 @@ class EventManagerTest(TestCase, SnubaTestCase, EventManagerTestMixin):
             assert data2["hashes"] == []
             assert data3["hashes"] == [expected_hash]
 
+    @override_options(
+        {
+            "performance.issues.slow_span.problem-creation": 1.0,
+            "performance_issue_creation_rate": 1.0,
+        }
+    )
+    @override_settings(SENTRY_PERFORMANCE_ISSUES_REDUCE_NOISE=True)
+    def test_perf_issue_slow_db_issue_not_created(self):
+        self.project.update_option("sentry:performance_issue_creation_rate", 1.0)
+
+        with mock.patch("sentry_sdk.tracing.Span.containing_transaction") as transaction:
+            last_event = None
+
+            for _ in range(100):
+                manager = EventManager(make_event(**get_event("slow-db-spans")))
+                manager.normalize()
+                event = manager.save(self.project.id)
+                last_event = event
+
+            # The group should not be created, but there should be a tag on the transaction
+            assert len(last_event.groups) == 0
+            transaction.set_tag.assert_called_with("_will_create_slow_db_issue", "true")
+
+    @override_options(
+        {
+            "performance.issues.slow_span.problem-creation": 1.0,
+            "performance_issue_creation_rate": 1.0,
+        }
+    )
+    @override_settings(SENTRY_PERFORMANCE_ISSUES_REDUCE_NOISE=False)
+    def test_perf_issue_slow_db_issue_not_created_with_noise_flag_false(self):
+        self.project.update_option("sentry:performance_issue_creation_rate", 1.0)
+
+        last_event = None
+
+        for _ in range(100):
+            manager = EventManager(make_event(**get_event("slow-db-spans")))
+            manager.normalize()
+            event = manager.save(self.project.id)
+            last_event = event
+
+        assert len(last_event.groups) == 0
+
 
 class AutoAssociateCommitTest(TestCase, EventManagerTestMixin):
     def setUp(self):

+ 1 - 1
tests/sentry/eventstore/snuba/test_backend.py

@@ -63,7 +63,7 @@ class SnubaEventStorageTest(TestCase, SnubaTestCase):
 
         event_data_2 = load_data(
             platform="transaction",
-            fingerprint=[f"{GroupType.PERFORMANCE_SLOW_SPAN.value}-group3"],
+            fingerprint=[f"{GroupType.PERFORMANCE_RENDER_BLOCKING_ASSET_SPAN.value}-group3"],
         )
         event_data_2["timestamp"] = iso_format(before_now(seconds=30))
         event_data_2["start_timestamp"] = iso_format(before_now(seconds=31))

+ 1 - 1
tests/sentry/rules/filters/test_issue_category.py

@@ -53,7 +53,7 @@ class IssueCategoryFilterPerformanceTest(
         tx_event = self.store_transaction(
             self.project.id,
             "test_transaction_category",
-            [f"{GroupType.PERFORMANCE_SLOW_SPAN.value}-group1"],
+            [f"{GroupType.PERFORMANCE_RENDER_BLOCKING_ASSET_SPAN.value}-group1"],
         )
 
         group_events = list(tx_event.build_group_events())

Some files were not shown because too many files changed in this diff