Browse Source

ref(perf): Split threshold for perf issues (#37758)

* ref(perf): Split threshold for perf issues

This allows for multiple settings per detector class, which allows us to set different settings for different span ops and more.

Other:
- Also moved this class out of tasks for clarity to address an earlier comments in PRs
- Cleaned up the tests a bit with some helper functions
- Move the detected span issue object into it's own file for clarity
- Added the directory to codeowners so we don't have to manually add for review
Kev 2 years ago
parent
commit
7bc55e1ccd

+ 1 - 0
.github/CODEOWNERS

@@ -28,6 +28,7 @@
 /src/sentry/quotas/                @getsentry/owners-ingest
 /src/sentry/relay/                 @getsentry/owners-ingest
 /src/sentry/utils/data_filters.py  @getsentry/owners-ingest
+/src/sentry/utils/performance_issues/   @getsentry/performance
 /src/sentry/web/api.py             @getsentry/owners-ingest
 /src/sentry/scripts/quotas/        @getsentry/owners-ingest
 /src/sentry/scripts/tsdb/          @getsentry/owners-ingest

+ 2 - 2
src/sentry/tasks/store.py

@@ -17,12 +17,12 @@ from sentry.eventstore.processing.base import Event
 from sentry.killswitches import killswitch_matches_context
 from sentry.models import Activity, Organization, Project, ProjectOption
 from sentry.stacktraces.processing import process_stacktraces, should_process_for_stacktraces
-from sentry.tasks import performance_detection
 from sentry.tasks.base import instrumented_task
 from sentry.types.activity import ActivityType
 from sentry.utils import metrics
 from sentry.utils.canonical import CANONICAL_TYPES, CanonicalKeyDict
 from sentry.utils.dates import to_datetime
+from sentry.utils.performance_issues.performance_detection import detect_performance_issue
 from sentry.utils.safe import safe_execute
 from sentry.utils.sdk import set_current_event_project
 
@@ -703,7 +703,7 @@ def _do_save_event(
                 with metrics.timer("tasks.store.do_save_event.write_processing_cache"):
                     processing.event_processing_store.store(data)
                 if event_type == "transaction":
-                    performance_detection.detect_performance_issue(data)
+                    detect_performance_issue(data)
         except HashDiscarded:
             # Delete the event payload from cache since it won't show up in post-processing.
             if cache_key:

+ 0 - 0
src/sentry/utils/performance_issues/__init__.py


+ 76 - 76
src/sentry/tasks/performance_detection.py → src/sentry/utils/performance_issues/performance_detection.py

@@ -11,6 +11,8 @@ from sentry import options
 from sentry.eventstore.processing.base import Event
 from sentry.utils import metrics
 
+from .performance_span_issue import PerformanceSpanIssue
+
 Span = Dict[str, Any]
 TransactionSpans = List[Span]
 PerformanceIssues = Dict[str, Any]
@@ -42,32 +44,45 @@ def detect_performance_issue(data: Event):
 
 # Gets some of the thresholds to perform performance detection. Can be made configurable later.
 # Thresholds are in milliseconds.
+# Allowed span ops are allowed span prefixes. (eg. 'http' would work for a span with 'http.client' as it's op)
 def get_default_detection_settings():
     return {
-        DetectorType.DUPLICATE_SPANS: {
-            "count": 5,
-            "cumulative_duration": 500.0,  # ms
-            "allowed_span_ops": ["db", "http"],
-        },
-        DetectorType.SEQUENTIAL_SLOW_SPANS: {
-            "count": 3,
-            "cumulative_duration": 1200.0,  # ms
-            "allowed_span_ops": ["db", "http", "ui"],
-        },
-        DetectorType.SLOW_SPAN: {
-            "duration_threshold": 1000.0,  # ms
-            "allowed_span_ops": ["db", "http"],
-        },
-        DetectorType.LONG_TASK_SPANS: {
-            "cumulative_duration": 500.0,  # ms
-            "allowed_span_ops": ["ui.long-task", "ui.sentry.long-task"],
-        },
+        DetectorType.DUPLICATE_SPANS: [
+            {
+                "count": 5,
+                "cumulative_duration": 500.0,  # ms
+                "allowed_span_ops": ["db", "http"],
+            }
+        ],
+        DetectorType.SEQUENTIAL_SLOW_SPANS: [
+            {
+                "count": 3,
+                "cumulative_duration": 1200.0,  # ms
+                "allowed_span_ops": ["db", "http", "ui"],
+            }
+        ],
+        DetectorType.SLOW_SPAN: [
+            {
+                "duration_threshold": 1000.0,  # ms
+                "allowed_span_ops": ["db"],
+            },
+            {
+                "duration_threshold": 2000.0,  # ms
+                "allowed_span_ops": ["http"],
+            },
+        ],
+        DetectorType.LONG_TASK_SPANS: [
+            {
+                "cumulative_duration": 500.0,  # ms
+                "allowed_span_ops": ["ui.long-task", "ui.sentry.long-task"],
+            }
+        ],
     }
 
 
 def _detect_performance_issue(data: Event, sdk_span: Any):
     event_id = data.get("event_id", None)
-    spans: TransactionSpans = data.get("spans", [])
+    spans = data.get("spans", [])
 
     detection_settings = get_default_detection_settings()
     detectors = {
@@ -114,18 +129,6 @@ def get_span_duration(span: Span):
     )
 
 
-class PerformanceSpanIssue:
-    __slots__ = ("span_id", "allowed_op", "spans_involved")
-    """
-    A class representing a detected performance issue caused by a performance span
-    """
-
-    def __init__(self, span_id: str, allowed_op: str, spans_involved: List[str]):
-        self.span_id = span_id
-        self.allowed_op = allowed_op
-        self.spans_involved = spans_involved
-
-
 class PerformanceDetector(ABC):
     """
     Classes of this type have their visit functions called as the event is walked once and will store a performance issue if one is detected.
@@ -139,12 +142,25 @@ class PerformanceDetector(ABC):
     def init(self):
         raise NotImplementedError
 
-    def span_op_allowed(self, span_op: str):
-        allowed_span_ops = self.settings.get("allowed_span_ops", [])
+    def find_span_prefix(self, settings, span_op: str):
+        allowed_span_ops = settings.get("allowed_span_ops", [])
         if len(allowed_span_ops) <= 0:
             return True
         return next((op for op in allowed_span_ops if span_op.startswith(op)), False)
 
+    def settings_for_span(self, span: Span):
+        op = span.get("op", None)
+        span_id = span.get("span_id", None)
+        if not op or not span_id:
+            return None
+
+        span_duration = get_span_duration(span)
+        for setting in self.settings:
+            op_prefix = self.find_span_prefix(setting, op)
+            if op_prefix:
+                return op, span_id, op_prefix, span_duration, setting
+        return None
+
     @property
     @abstractmethod
     def settings_key(self) -> DetectorType:
@@ -175,22 +191,17 @@ class DuplicateSpanDetector(PerformanceDetector):
         self.stored_issues = {}
 
     def visit_span(self, span: Span):
-        op = span.get("op", None)
-        span_id = span.get("span_id", None)
-        if not op or not span_id:
+        settings_for_span = self.settings_for_span(span)
+        if not settings_for_span:
             return
+        op, span_id, op_prefix, span_duration, settings = settings_for_span
+        duplicate_count_threshold = settings.get("count")
+        duplicate_duration_threshold = settings.get("cumulative_duration")
 
         fingerprint = fingerprint_span(span)
-
-        duplicate_count_threshold = self.settings.get("count")
-        duplicate_duration_threshold = self.settings.get("cumulative_duration")
-        allowed_op = self.span_op_allowed(op)
-
-        if not fingerprint or not allowed_op:
+        if not fingerprint:
             return
 
-        span_duration = get_span_duration(span)
-
         self.cumulative_durations[fingerprint] = (
             self.cumulative_durations.get(fingerprint, timedelta(0)) + span_duration
         )
@@ -207,7 +218,7 @@ class DuplicateSpanDetector(PerformanceDetector):
             ] >= timedelta(milliseconds=duplicate_duration_threshold):
                 spans_involved = self.duplicate_spans_involved[fingerprint]
                 self.stored_issues[fingerprint] = PerformanceSpanIssue(
-                    span_id, allowed_op, spans_involved
+                    span_id, op_prefix, spans_involved
                 )
 
 
@@ -224,27 +235,23 @@ class SlowSpanDetector(PerformanceDetector):
         self.stored_issues = {}
 
     def visit_span(self, span: Span):
-        op = span.get("op", None)
-        span_id = span.get("span_id", None)
-        if not op or not span_id:
+        settings_for_span = self.settings_for_span(span)
+        if not settings_for_span:
             return
+        op, span_id, op_prefix, span_duration, settings = settings_for_span
+        duration_threshold = settings.get("duration_threshold")
 
         fingerprint = fingerprint_span(span)
 
-        slow_span_duration_threshold = self.settings.get("duration_threshold")
-        allowed_op = self.span_op_allowed(op)
-
-        if not fingerprint or not allowed_op:
+        if not fingerprint:
             return
 
-        span_duration = get_span_duration(span)
-
         if span_duration >= timedelta(
-            milliseconds=slow_span_duration_threshold
+            milliseconds=duration_threshold
         ) and not self.stored_issues.get(fingerprint, False):
             spans_involved = [span_id]
             self.stored_issues[fingerprint] = PerformanceSpanIssue(
-                span_id, allowed_op, spans_involved
+                span_id, op_prefix, spans_involved
             )
 
 
@@ -265,21 +272,17 @@ class SequentialSlowSpanDetector(PerformanceDetector):
         self.last_span_seen = {}
 
     def visit_span(self, span: Span):
-        op = span.get("op", None)
-        span_id = span.get("span_id", None)
-        if not op or not span_id:
+        settings_for_span = self.settings_for_span(span)
+        if not settings_for_span:
             return
+        op, span_id, op_prefix, span_duration, settings = settings_for_span
+        duration_threshold = settings.get("cumulative_duration")
+        count_threshold = settings.get("count")
 
         fingerprint = fingerprint_span_op(span)
-
-        count_threshold = self.settings.get("count")
-        duration_threshold = self.settings.get("cumulative_duration")
-        allowed_op = self.span_op_allowed(op)
-
-        if not fingerprint or not allowed_op:
+        if not fingerprint:
             return
 
-        span_duration = get_span_duration(span)
         span_end = timedelta(seconds=span.get("timestamp", 0))
 
         if fingerprint not in self.spans_involved:
@@ -313,7 +316,7 @@ class SequentialSlowSpanDetector(PerformanceDetector):
             ] >= timedelta(milliseconds=duration_threshold):
                 spans_involved = self.spans_involved[fingerprint]
                 self.stored_issues[fingerprint] = PerformanceSpanIssue(
-                    span_id, allowed_op, spans_involved
+                    span_id, op_prefix, spans_involved
                 )
 
 
@@ -332,17 +335,14 @@ class LongTaskSpanDetector(PerformanceDetector):
         self.stored_issues = {}
 
     def visit_span(self, span: Span):
-        op = span.get("op", None)
-        span_id = span.get("span_id", None)
-        if not op or not span_id:
+        settings_for_span = self.settings_for_span(span)
+        if not settings_for_span:
             return
+        op, span_id, op_prefix, span_duration, settings = settings_for_span
+        duration_threshold = settings.get("cumulative_duration")
 
         fingerprint = fingerprint_span(span)
-
-        duration_threshold = self.settings.get("cumulative_duration")
-        allowed_op = self.span_op_allowed(op)
-
-        if not fingerprint or not allowed_op:
+        if not fingerprint:
             return
 
         span_duration = get_span_duration(span)
@@ -351,7 +351,7 @@ class LongTaskSpanDetector(PerformanceDetector):
 
         if self.cumulative_duration >= timedelta(milliseconds=duration_threshold):
             self.stored_issues[fingerprint] = PerformanceSpanIssue(
-                span_id, allowed_op, self.spans_involved
+                span_id, op_prefix, self.spans_involved
             )
 
 

+ 13 - 0
src/sentry/utils/performance_issues/performance_span_issue.py

@@ -0,0 +1,13 @@
+from typing import List
+
+
+class PerformanceSpanIssue:
+    __slots__ = ("span_id", "allowed_op", "spans_involved")
+    """
+    A class representing a detected performance issue caused by a performance span
+    """
+
+    def __init__(self, span_id: str, allowed_op: str, spans_involved: List[str]):
+        self.span_id = span_id
+        self.allowed_op = allowed_op
+        self.spans_involved = spans_involved

+ 38 - 89
tests/sentry/tasks/test_performance_detection.py → tests/sentry/utils/performance_issues/test_performance_detection.py

@@ -1,8 +1,11 @@
 import unittest
 from unittest.mock import Mock, call, patch
 
-from sentry.tasks.performance_detection import _detect_performance_issue, detect_performance_issue
 from sentry.testutils.helpers import override_options
+from sentry.utils.performance_issues.performance_detection import (
+    _detect_performance_issue,
+    detect_performance_issue,
+)
 from tests.sentry.spans.grouping.test_strategy import SpanBuilder
 
 
@@ -21,14 +24,25 @@ def modify_span_start(obj, start):
     return obj
 
 
+def create_span(op, duration=100.0, desc="SELECT count() FROM table WHERE id = %s"):
+    return modify_span_duration(
+        SpanBuilder().with_op(op).with_description(desc).build(),
+        duration,
+    )
+
+
+def create_event(spans, event_id="a" * 16):
+    return {"event_id": event_id, "spans": spans}
+
+
 class PerformanceDetectionTest(unittest.TestCase):
-    @patch("sentry.tasks.performance_detection._detect_performance_issue")
+    @patch("sentry.utils.performance_issues.performance_detection._detect_performance_issue")
     def test_options_disabled(self, mock):
         event = {}
         detect_performance_issue(event)
         assert mock.call_count == 0
 
-    @patch("sentry.tasks.performance_detection._detect_performance_issue")
+    @patch("sentry.utils.performance_issues.performance_detection._detect_performance_issue")
     def test_options_enabled(self, mock):
         event = {}
         with override_options({"store.use-ingest-performance-detection-only": 1.0}):
@@ -36,43 +50,9 @@ class PerformanceDetectionTest(unittest.TestCase):
         assert mock.call_count == 1
 
     def test_calls_detect_duplicate(self):
-        no_duplicate_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder()
-                    .with_op("db")
-                    .with_description("SELECT count() FROM table WHERE id = %s")
-                    .build(),
-                    100.0,
-                )
-            ]
-            * 4,
-        }
-        duplicate_not_allowed_op_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder().with_op("random").with_description("example").build(),
-                    100.0,
-                )
-            ]
-            * 5,
-        }
-
-        duplicate_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder()
-                    .with_op("db")
-                    .with_description("SELECT count() FROM table WHERE id = %s")
-                    .build(),
-                    100.0,
-                )
-            ]
-            * 5,
-        }
+        no_duplicate_event = create_event([create_span("db")] * 4)
+        duplicate_not_allowed_op_event = create_event([create_span("random", 100.0)] * 5)
+        duplicate_event = create_event([create_span("db")] * 5)
 
         sdk_span_mock = Mock()
 
@@ -102,42 +82,9 @@ class PerformanceDetectionTest(unittest.TestCase):
         )
 
     def test_calls_detect_slow_span(self):
-        no_slow_span_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder()
-                    .with_op("db")
-                    .with_description("SELECT count() FROM table WHERE id = %s")
-                    .build(),
-                    999.0,
-                )
-            ]
-            * 1,
-        }
-        slow_span_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder()
-                    .with_op("db")
-                    .with_description("SELECT count() FROM table WHERE id = %s")
-                    .build(),
-                    1001.0,
-                )
-            ]
-            * 1,
-        }
-        slow_not_allowed_op_span_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder().with_op("random").with_description("example").build(),
-                    1001.0,
-                )
-            ]
-            * 1,
-        }
+        no_slow_span_event = create_event([create_span("db", 999.0)] * 1)
+        slow_span_event = create_event([create_span("db", 1001.0)] * 1)
+        slow_not_allowed_op_span_event = create_event([create_span("random", 1001.0, "example")])
 
         sdk_span_mock = Mock()
 
@@ -167,19 +114,7 @@ class PerformanceDetectionTest(unittest.TestCase):
         )
 
     def test_calls_partial_span_op_allowed(self):
-        span_event = {
-            "event_id": "a" * 16,
-            "spans": [
-                modify_span_duration(
-                    SpanBuilder()
-                    .with_op("http.client")
-                    .with_description("http://example.com")
-                    .build(),
-                    1001.0,
-                )
-            ]
-            * 1,
-        }
+        span_event = create_event([create_span("http.client", 2001.0, "http://example.com")] * 1)
 
         sdk_span_mock = Mock()
 
@@ -202,6 +137,20 @@ class PerformanceDetectionTest(unittest.TestCase):
             ]
         )
 
+    def test_calls_slow_span_threshold(self):
+        http_span_event = create_event(
+            [create_span("http.client", 1001.0, "http://example.com")] * 1
+        )
+        db_span_event = create_event([create_span("db.query", 1001.0)] * 1)
+
+        sdk_span_mock = Mock()
+
+        _detect_performance_issue(http_span_event, sdk_span_mock)
+        assert sdk_span_mock.containing_transaction.set_tag.call_count == 0
+
+        _detect_performance_issue(db_span_event, sdk_span_mock)
+        assert sdk_span_mock.containing_transaction.set_tag.call_count == 3
+
     def test_calls_detect_sequential(self):
         no_sequential_event = {
             "event_id": "a" * 16,