Browse Source

ref(perf): Add duplicate detection on group hash (#37787)

This uses the existing grouping strategy hash already on the spans for http spans, since we have more refined rules around auto-parameterizing the description etc. If this works, it's also a better fingerprint to use since it allows us to directly query span.groups etc. on a transaction (eg. suspect spans).

The one issue with this approach is that it only works after manager.save is called, which is where span groups gets calculated, so if we need to propagate data into the transaction based on detection we'll have to figure something else out (or move this to 'save')
Kev 2 years ago
parent
commit
08535455ff

+ 60 - 0
src/sentry/utils/performance_issues/performance_detection.py

@@ -20,6 +20,7 @@ PerformanceIssues = Dict[str, Any]
 
 class DetectorType(Enum):
     SLOW_SPAN = "slow_span"
+    DUPLICATE_SPANS_HASH = "dupes_hash"  # Have to stay within tag key length limits
     DUPLICATE_SPANS = "duplicates"
     SEQUENTIAL_SLOW_SPANS = "sequential"
     LONG_TASK_SPANS = "long_task"
@@ -54,6 +55,13 @@ def get_default_detection_settings():
                 "allowed_span_ops": ["db", "http"],
             }
         ],
+        DetectorType.DUPLICATE_SPANS_HASH: [
+            {
+                "count": 5,
+                "cumulative_duration": 500.0,  # ms
+                "allowed_span_ops": ["http"],
+            },
+        ],
         DetectorType.SEQUENTIAL_SLOW_SPANS: [
             {
                 "count": 3,
@@ -87,6 +95,7 @@ def _detect_performance_issue(data: Event, sdk_span: Any):
     detection_settings = get_default_detection_settings()
     detectors = {
         DetectorType.DUPLICATE_SPANS: DuplicateSpanDetector(detection_settings),
+        DetectorType.DUPLICATE_SPANS_HASH: DuplicateSpanHashDetector(detection_settings),
         DetectorType.SLOW_SPAN: SlowSpanDetector(detection_settings),
         DetectorType.SEQUENTIAL_SLOW_SPANS: SequentialSlowSpanDetector(detection_settings),
         DetectorType.LONG_TASK_SPANS: LongTaskSpanDetector(detection_settings),
@@ -222,6 +231,53 @@ class DuplicateSpanDetector(PerformanceDetector):
                 )
 
 
+class DuplicateSpanHashDetector(PerformanceDetector):
+    """
+    Broadly check for duplicate spans.
+    Uses the span grouping strategy hash to potentially detect duplicate spans more accurately.
+    """
+
+    __slots__ = ("cumulative_durations", "duplicate_spans_involved", "stored_issues")
+
+    settings_key = DetectorType.DUPLICATE_SPANS_HASH
+
+    def init(self):
+        self.cumulative_durations = {}
+        self.duplicate_spans_involved = {}
+        self.stored_issues = {}
+
+    def visit_span(self, span: Span):
+        settings_for_span = self.settings_for_span(span)
+        if not settings_for_span:
+            return
+        op, span_id, op_prefix, span_duration, settings = settings_for_span
+        duplicate_count_threshold = settings.get("count")
+        duplicate_duration_threshold = settings.get("cumulative_duration")
+
+        hash = span.get("hash", None)
+        if not hash:
+            return
+
+        self.cumulative_durations[hash] = (
+            self.cumulative_durations.get(hash, timedelta(0)) + span_duration
+        )
+
+        if hash not in self.duplicate_spans_involved:
+            self.duplicate_spans_involved[hash] = []
+
+        self.duplicate_spans_involved[hash] += [span_id]
+        duplicate_spans_counts = len(self.duplicate_spans_involved[hash])
+
+        if not self.stored_issues.get(hash, False):
+            if duplicate_spans_counts >= duplicate_count_threshold and self.cumulative_durations[
+                hash
+            ] >= timedelta(milliseconds=duplicate_duration_threshold):
+                spans_involved = self.duplicate_spans_involved[hash]
+                self.stored_issues[hash] = PerformanceSpanIssue(
+                    span_id, op_prefix, spans_involved, hash
+                )
+
+
 class SlowSpanDetector(PerformanceDetector):
     """
     Check for slow spans in a certain type of span.op (eg. slow db spans)
@@ -382,6 +438,10 @@ def report_metrics_for_detectors(
             continue
 
         first_issue = detected_issues[detected_issue_keys[0]]
+        if first_issue.fingerprint:
+            sdk_span.containing_transaction.set_tag(
+                f"_pi_{detector_key}_fp", first_issue.fingerprint
+            )
         sdk_span.containing_transaction.set_tag(f"_pi_{detector_key}", first_issue.span_id)
         metrics.incr(
             f"performance.performance_issue.{detector_key}",

+ 3 - 2
src/sentry/utils/performance_issues/performance_span_issue.py

@@ -2,12 +2,13 @@ from typing import List
 
 
 class PerformanceSpanIssue:
-    __slots__ = ("span_id", "allowed_op", "spans_involved")
+    __slots__ = ("span_id", "allowed_op", "spans_involved", "fingerprint")
     """
     A class representing a detected performance issue caused by a performance span
     """
 
-    def __init__(self, span_id: str, allowed_op: str, spans_involved: List[str]):
+    def __init__(self, span_id: str, allowed_op: str, spans_involved: List[str], fingerprint=""):
         self.span_id = span_id
         self.allowed_op = allowed_op
         self.spans_involved = spans_involved
+        self.fingerprint = fingerprint

+ 57 - 90
tests/sentry/utils/performance_issues/test_performance_detection.py

@@ -24,9 +24,9 @@ def modify_span_start(obj, start):
     return obj
 
 
-def create_span(op, duration=100.0, desc="SELECT count() FROM table WHERE id = %s"):
+def create_span(op, duration=100.0, desc="SELECT count() FROM table WHERE id = %s", hash=""):
     return modify_span_duration(
-        SpanBuilder().with_op(op).with_description(desc).build(),
+        SpanBuilder().with_op(op).with_description(desc).with_hash(hash).build(),
         duration,
     )
 
@@ -81,6 +81,44 @@ class PerformanceDetectionTest(unittest.TestCase):
             ]
         )
 
+    def test_calls_detect_duplicate_hash(self):
+        no_duplicate_event = create_event(
+            [create_span("http", 100.0, "http://example.com/slow?q=1", "")] * 4
+            + [create_span("http", 100.0, "http://example.com/slow?q=2", "")]
+        )
+        duplicate_event = create_event(
+            [create_span("http", 100.0, "http://example.com/slow?q=1", "abcdef")] * 4
+            + [create_span("http", 100.0, "http://example.com/slow?q=2", "abcdef")]
+        )
+
+        sdk_span_mock = Mock()
+
+        _detect_performance_issue(no_duplicate_event, sdk_span_mock)
+        assert sdk_span_mock.containing_transaction.set_tag.call_count == 0
+
+        _detect_performance_issue(duplicate_event, sdk_span_mock)
+        assert sdk_span_mock.containing_transaction.set_tag.call_count == 4
+        sdk_span_mock.containing_transaction.set_tag.assert_has_calls(
+            [
+                call(
+                    "_pi_all_issue_count",
+                    1,
+                ),
+                call(
+                    "_pi_transaction",
+                    "aaaaaaaaaaaaaaaa",
+                ),
+                call(
+                    "_pi_dupes_hash_fp",
+                    "abcdef",
+                ),
+                call(
+                    "_pi_dupes_hash",
+                    "bbbbbbbbbbbbbbbb",
+                ),
+            ]
+        )
+
     def test_calls_detect_slow_span(self):
         no_slow_span_event = create_event([create_span("db", 999.0)] * 1)
         slow_span_event = create_event([create_span("db", 1001.0)] * 1)
@@ -152,64 +190,15 @@ class PerformanceDetectionTest(unittest.TestCase):
         assert sdk_span_mock.containing_transaction.set_tag.call_count == 3
 
     def test_calls_detect_sequential(self):
-        no_sequential_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder()
-                    .with_op("db")
-                    .with_description("SELECT count() FROM table WHERE id = %s")
-                    .build(),
-                    999.0,
-                )
-            ]
-            * 4,
-        }
-        sequential_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder()
-                    .with_op("db")
-                    .with_description("SELECT count() FROM table WHERE id = %s")
-                    .build(),
-                    999.0,
-                ),
-            ]
-            * 2
+        no_sequential_event = create_event([create_span("db", 999.0)] * 4)
+        sequential_event = create_event(
+            [create_span("db", 999.0)] * 2
             + [
-                modify_span_start(
-                    modify_span_duration(
-                        SpanBuilder()
-                        .with_op("db")
-                        .with_description("SELECT count() FROM table WHERE id = %s")
-                        .build(),
-                        999.0,
-                    ),
-                    1000.0,
-                ),
-                modify_span_start(
-                    modify_span_duration(
-                        SpanBuilder()
-                        .with_op("db")
-                        .with_description("SELECT count() FROM table WHERE id = %s")
-                        .build(),
-                        999.0,
-                    ),
-                    2000.0,
-                ),
-                modify_span_start(
-                    modify_span_duration(
-                        SpanBuilder()
-                        .with_op("db")
-                        .with_description("SELECT count() FROM table WHERE id = %s")
-                        .build(),
-                        999.0,
-                    ),
-                    3000.0,
-                ),
-            ],
-        }
+                modify_span_start(create_span("db", 999.0), 1000.0),
+                modify_span_start(create_span("db", 999.0), 2000.0),
+                modify_span_start(create_span("db", 999.0), 3000.0),
+            ]
+        )
 
         sdk_span_mock = Mock()
 
@@ -240,37 +229,15 @@ class PerformanceDetectionTest(unittest.TestCase):
         )
 
     def test_calls_detect_long_task(self):
-        tolerable_long_task_spans_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder().with_op("ui.long-task").with_description("Long Task").build(),
-                    50.0,
-                )
-            ]
-            * 3,
-        }
-
-        long_task_span_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder().with_op("ui.long-task").with_description("Long Task").build(),
-                    550.0,
-                )
-            ],
-        }
-
-        multiple_long_task_span_event = {
-            "event_id": "c" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder().with_op("ui.long-task").with_description("Long Task").build(),
-                    50.0,
-                )
-            ]
-            * 11,
-        }
+        tolerable_long_task_spans_event = create_event(
+            [create_span("ui.long-task", 50.0, "Long Task")] * 3, "a" * 16
+        )
+        long_task_span_event = create_event(
+            [create_span("ui.long-task", 550.0, "Long Task")], "a" * 16
+        )
+        multiple_long_task_span_event = create_event(
+            [create_span("ui.long-task", 50.0, "Long Task")] * 11, "c" * 16
+        )
 
         sdk_span_mock = Mock()