Browse Source

feat(trace): Add performance issues to trace response (#45728)

- This adds a new performance_issues key to the trace response so that
we can show perf issues in both the trace navigator and the trace view
- Closes #45729
William Mak 2 years ago
parent
commit
6134984f6d

+ 111 - 17
src/sentry/api/endpoints/organization_events_trace.py

@@ -22,16 +22,18 @@ from django.http import Http404, HttpRequest, HttpResponse
 from rest_framework.exceptions import ParseError
 from rest_framework.response import Response
 from sentry_relay.consts import SPAN_STATUS_CODE_TO_NAME
+from snuba_sdk import AliasedExpression, Column, Function
 
-from sentry import eventstore, features
+from sentry import constants, eventstore, features
 from sentry.api.base import region_silo_endpoint
 from sentry.api.bases import NoProjects, OrganizationEventsV2EndpointBase
 from sentry.api.serializers.models.event import get_tags_with_meta
 from sentry.eventstore.models import Event
-from sentry.models import Organization
+from sentry.models import Group, Organization
 from sentry.search.events.builder import QueryBuilder
 from sentry.snuba import discover
 from sentry.utils.numbers import format_grouped_length
+from sentry.utils.performance_issues.performance_detection import EventPerformanceProblem
 from sentry.utils.sdk import set_measurement
 from sentry.utils.snuba import Dataset, bulk_snql_query
 from sentry.utils.validators import INVALID_ID_DETAILS, is_event_id
@@ -56,6 +58,7 @@ SnubaTransaction = TypedDict(
         "root": str,
         "project.id": int,
         "project": str,
+        "issue.ids": List[int],
     },
 )
 SnubaError = TypedDict(
@@ -84,6 +87,17 @@ class TraceError(TypedDict):
     level: str
 
 
+class TracePerformanceIssue(TypedDict):
+    event_id: str
+    issue_id: int
+    span: List[str]
+    suspect_spans: List[str]
+    project_id: int
+    project_slug: str
+    title: str
+    level: str
+
+
 LightResponse = TypedDict(
     "LightResponse",
     {
@@ -98,6 +112,7 @@ LightResponse = TypedDict(
         "parent_event_id": Optional[str],
         "generation": Optional[int],
         "errors": List[TraceError],
+        "performance_issues": List[TracePerformanceIssue],
     },
 )
 FullResponse = TypedDict(
@@ -115,6 +130,7 @@ FullResponse = TypedDict(
         "profile_id": Optional[str],
         "generation": Optional[int],
         "errors": List[TraceError],
+        "performance_issues": List[TracePerformanceIssue],
         "timestamp": str,
         "start_timestamp": str,
         # Any because children are more FullResponse objects
@@ -130,18 +146,79 @@ FullResponse = TypedDict(
 
 class TraceEvent:
     def __init__(
-        self, event: SnubaTransaction, parent: Optional[str], generation: Optional[int]
+        self,
+        event: SnubaTransaction,
+        parent: Optional[str],
+        generation: Optional[int],
+        light: bool = False,
     ) -> None:
         self.event: SnubaTransaction = event
         self.errors: List[TraceError] = []
         self.children: List[TraceEvent] = []
+        self.performance_issues: List[TracePerformanceIssue] = []
 
         # Can be None on the light trace when we don't know the parent
         self.parent_event_id: Optional[str] = parent
         self.generation: Optional[int] = generation
 
         # Added as required because getting the nodestore_event is expensive
-        self.nodestore_event: Optional[Event] = None
+        self._nodestore_event: Optional[Event] = None
+        self.fetched_nodestore: bool = False
+        self.load_performance_issues(light)
+
+    @property
+    def nodestore_event(self) -> Optional[Event]:
+        with sentry_sdk.start_span(op="nodestore", description="get_event_by_id"):
+            if self._nodestore_event is None and not self.fetched_nodestore:
+                self.fetched_nodestore = True
+                self._nodestore_event = eventstore.get_event_by_id(
+                    self.event["project.id"], self.event["id"]
+                )
+        return self._nodestore_event
+
+    def load_performance_issues(self, light: bool) -> None:
+        """Doesn't get suspect spans, since we don't need that for the light view"""
+        for group_id in self.event["issue.ids"]:
+            group = Group.objects.filter(id=group_id, project=self.event["project.id"]).first()
+            if group is None:
+                continue
+
+            suspect_spans: List[str] = []
+            if light:
+                # This value doesn't matter for the light view
+                span = [self.event["trace.span"]]
+            else:
+                if self.nodestore_event is not None:
+                    hashes = self.nodestore_event.get_hashes().hashes
+                    problems = [
+                        eventproblem.problem
+                        for eventproblem in EventPerformanceProblem.fetch_multi(
+                            [(self.nodestore_event, event_hash) for event_hash in hashes]
+                        )
+                    ]
+                    unique_spans: Set[str] = set()
+                    for problem in problems:
+                        unique_spans = unique_spans.union(problem.parent_span_ids)
+                    span = list(unique_spans)
+                    for event_span in self.nodestore_event.data.get("spans", []):
+                        for problem in problems:
+                            if event_span.get("span_id") in problem.offender_span_ids:
+                                suspect_spans.append(event_span.get("span_id"))
+                else:
+                    span = [self.event["trace.span"]]
+
+            self.performance_issues.append(
+                {
+                    "event_id": self.event["id"],
+                    "issue_id": group_id,
+                    "span": span,
+                    "suspect_spans": suspect_spans,
+                    "project_id": self.event["project.id"],
+                    "project_slug": self.event["project"],
+                    "title": group.title,
+                    "level": constants.LOG_LEVELS[group.level],
+                }
+            )
 
     def to_dict(self) -> LightResponse:
         return {
@@ -157,6 +234,7 @@ class TraceEvent:
             "parent_event_id": self.parent_event_id,
             "generation": self.generation,
             "errors": self.errors,
+            "performance_issues": self.performance_issues,
         }
 
     def full_dict(self, detailed: bool = False) -> FullResponse:
@@ -185,7 +263,7 @@ class TraceEvent:
                 result["tags"], result["_meta"]["tags"] = get_tags_with_meta(self.nodestore_event)
         # Only add children that have nodestore events, which may be missing if we're pruning for trace navigator
         result["children"] = [
-            child.full_dict(detailed) for child in self.children if child.nodestore_event
+            child.full_dict(detailed) for child in self.children if child.fetched_nodestore
         ]
         return result
 
@@ -203,7 +281,7 @@ def is_root(item: SnubaTransaction) -> bool:
 
 
 def child_sort_key(item: TraceEvent) -> List[int]:
-    if item.nodestore_event:
+    if item.fetched_nodestore and item.nodestore_event is not None:
         return [
             item.nodestore_event.data["start_timestamp"],
             item.nodestore_event.data["timestamp"],
@@ -213,6 +291,21 @@ def child_sort_key(item: TraceEvent) -> List[int]:
         return [0]
 
 
+def count_performance_issues(trace_id: str, params: Mapping[str, str]) -> int:
+    transaction_query = QueryBuilder(
+        Dataset.Transactions,
+        params,
+        query=f"trace:{trace_id}",
+        selected_columns=[],
+        limit=MAX_TRACE_SIZE,
+    )
+    transaction_query.columns.append(
+        Function("sum", [Function("length", [Column("group_ids")])], "total_groups")
+    )
+    count = transaction_query.run_query("api.trace-view.count-performance-issues")
+    return cast(int, count["data"][0].get("total_groups", 0))
+
+
 def query_trace_data(
     trace_id: str, params: Mapping[str, str]
 ) -> Tuple[Sequence[SnubaTransaction], Sequence[SnubaError]]:
@@ -238,6 +331,7 @@ def query_trace_data(
         orderby=["-root", "timestamp", "id"],
         limit=MAX_TRACE_SIZE,
     )
+    transaction_query.columns.append(AliasedExpression(Column("group_ids"), "issue.ids"))
     error_query = QueryBuilder(
         Dataset.Events,
         params,
@@ -480,12 +574,13 @@ class OrganizationEventsTraceLightEndpoint(OrganizationEventsTraceEndpointBase):
                                     root,
                                     None,
                                     0,
+                                    True,
                                 )
                             )
                             current_generation = 1
                             break
 
-            current_event = TraceEvent(snuba_event, root_id, current_generation)
+            current_event = TraceEvent(snuba_event, root_id, current_generation, True)
             trace_results.append(current_event)
 
             spans: NodeSpans = nodestore_event.data.get("spans", [])
@@ -515,6 +610,7 @@ class OrganizationEventsTraceLightEndpoint(OrganizationEventsTraceEndpointBase):
                                     if current_event.generation is not None
                                     else None
                                 ),
+                                True,
                             )
                             for child_event in child_events
                         ]
@@ -607,16 +703,11 @@ class OrganizationEventsTraceEndpoint(OrganizationEventsTraceEndpointBase):
                             del parent_map[to_remove["trace.parent_span"]]
                     to_check = deque()
 
-                # This is faster than doing a call to get_events, since get_event_by_id only makes a call to snuba
-                # when non transaction events are included.
-                with sentry_sdk.start_span(op="nodestore", description="get_event_by_id"):
-                    nodestore_event = eventstore.get_event_by_id(
-                        current_event["project.id"], current_event["id"]
-                    )
-
-                previous_event.nodestore_event = nodestore_event
-
-                spans: NodeSpans = nodestore_event.data.get("spans", [])
+                spans: NodeSpans = (
+                    previous_event.nodestore_event.data.get("spans", [])
+                    if previous_event.nodestore_event
+                    else []
+                )
 
                 # Need to include the transaction as a span as well
                 #
@@ -725,6 +816,8 @@ class OrganizationEventsTraceMetaEndpoint(OrganizationEventsTraceEndpointBase):
             )
             if len(result["data"]) == 0:
                 return Response(status=404)
+            # Merge the result back into the first query
+            result["data"][0]["performance_issues"] = count_performance_issues(trace_id, params)
         return Response(self.serialize(result["data"][0]))
 
     @staticmethod
@@ -734,4 +827,5 @@ class OrganizationEventsTraceMetaEndpoint(OrganizationEventsTraceEndpointBase):
             "projects": results.get("projects") or 0,
             "transactions": results.get("transactions") or 0,
             "errors": results.get("errors") or 0,
+            "performance_issues": results.get("performance_issues") or 0,
         }

+ 4 - 0
src/sentry/snuba/discover.py

@@ -155,6 +155,7 @@ def query(
     has_metrics=False,
     use_metrics_layer=False,
     skip_tag_resolution=False,
+    extra_columns=None,
 ) -> EventsResponse:
     """
     High-level API for doing arbitrary user queries against events.
@@ -212,6 +213,9 @@ def query(
     )
     if conditions is not None:
         builder.add_conditions(conditions)
+    if extra_columns is not None:
+        builder.columns.extend(extra_columns)
+
     result = builder.process_results(builder.run_query(referrer))
     result["meta"]["tips"] = transform_tips(builder.tips)
     return result

+ 22 - 2
tests/snuba/api/endpoints/test_organization_events_trace.py

@@ -4,6 +4,7 @@ from uuid import uuid4
 import pytest
 from django.urls import NoReverseMatch, reverse
 
+from sentry import options
 from sentry.testutils import APITestCase, SnubaTestCase
 from sentry.testutils.helpers.datetime import before_now, iso_format
 from sentry.testutils.silo import region_silo_test
@@ -11,7 +12,10 @@ from sentry.utils.samples import load_data
 
 
 class OrganizationEventsTraceEndpointBase(APITestCase, SnubaTestCase):
-    FEATURES = ["organizations:performance-view"]
+    FEATURES = [
+        "organizations:performance-view",
+        "organizations:performance-file-io-main-thread-detector",
+    ]
 
     def get_start_end(self, duration):
         return self.day_ago, self.day_ago + timedelta(milliseconds=duration)
@@ -27,6 +31,7 @@ class OrganizationEventsTraceEndpointBase(APITestCase, SnubaTestCase):
         duration=4000,
         span_id=None,
         measurements=None,
+        file_io_performance_issue=False,
         **kwargs,
     ):
         start, end = self.get_start_end(duration)
@@ -46,7 +51,13 @@ class OrganizationEventsTraceEndpointBase(APITestCase, SnubaTestCase):
                 data["measurements"][key]["value"] = value
         if tags is not None:
             data["tags"] = tags
-        return self.store_event(data, project_id=project_id, **kwargs)
+        if file_io_performance_issue:
+            span = data["spans"][0]
+            if "data" not in span:
+                span["data"] = {}
+            span["data"].update({"duration": 1, "blocked_main_thread": True})
+        with self.feature(self.FEATURES):
+            return self.store_event(data, project_id=project_id, **kwargs)
 
     def setUp(self):
         """
@@ -62,6 +73,8 @@ class OrganizationEventsTraceEndpointBase(APITestCase, SnubaTestCase):
                 gen2-2
         """
         super().setUp()
+        options.set("performance.issues.all.problem-detection", 1.0)
+        options.set("performance.issues.file_io_main_thread.problem-creation", 1.0)
         self.login_as(user=self.user)
 
         self.day_ago = before_now(days=1).replace(hour=10, minute=0, second=0, microsecond=0)
@@ -92,6 +105,7 @@ class OrganizationEventsTraceEndpointBase(APITestCase, SnubaTestCase):
                 "fcp": 750,
             },
             parent_span_id=None,
+            file_io_performance_issue=True,
             project_id=self.project.id,
             duration=3000,
         )
@@ -636,6 +650,8 @@ class OrganizationEventsTraceEndpointTest(OrganizationEventsTraceEndpointBase):
         assert root["generation"] == 0
         assert root["transaction.duration"] == 3000
         assert len(root["children"]) == 3
+        assert len(root["performance_issues"]) == 1
+        assert root["performance_issues"][0]["suspect_spans"][0] == self.root_span_ids[0]
 
         for i, gen1 in enumerate(root["children"]):
             self.assert_event(gen1, self.gen1_events[i], f"gen1_{i}")
@@ -1187,6 +1203,7 @@ class OrganizationEventsTraceMetaEndpointTest(OrganizationEventsTraceEndpointBas
         assert data["projects"] == 0
         assert data["transactions"] == 0
         assert data["errors"] == 0
+        assert data["performance_issues"] == 0
 
         # Invalid trace id
         with pytest.raises(NoReverseMatch):
@@ -1211,6 +1228,7 @@ class OrganizationEventsTraceMetaEndpointTest(OrganizationEventsTraceEndpointBas
         assert data["projects"] == 4
         assert data["transactions"] == 8
         assert data["errors"] == 0
+        assert data["performance_issues"] == 1
 
     def test_with_errors(self):
         self.load_trace()
@@ -1226,6 +1244,7 @@ class OrganizationEventsTraceMetaEndpointTest(OrganizationEventsTraceEndpointBas
         assert data["projects"] == 4
         assert data["transactions"] == 8
         assert data["errors"] == 2
+        assert data["performance_issues"] == 1
 
     def test_with_default(self):
         self.load_trace()
@@ -1241,3 +1260,4 @@ class OrganizationEventsTraceMetaEndpointTest(OrganizationEventsTraceEndpointBas
         assert data["projects"] == 4
         assert data["transactions"] == 8
         assert data["errors"] == 1
+        assert data["performance_issues"] == 1