Browse Source

feat(rca): Query required data for span analysis (#56653)

Uses the QueryBuilder to fetch the expected data in the span analysis.
Since the span analysis hasn't been merged yet, I'm just returning
the data in the endpoint.

Closes [#54372](https://github.com/getsentry/sentry/issues/54372)
Nar Saynorath 1 year ago
parent
commit
d553493acc

+ 71 - 6
src/sentry/api/endpoints/organization_events_root_cause_analysis.py

@@ -1,10 +1,68 @@
 from rest_framework.response import Response
+from snuba_sdk import Column, Function, LimitBy
 
 from sentry import features
 from sentry.api.api_publish_status import ApiPublishStatus
 from sentry.api.base import region_silo_endpoint
 from sentry.api.bases.organization_events import OrganizationEventsEndpointBase
+from sentry.search.events.builder import QueryBuilder
+from sentry.search.events.types import QueryBuilderConfig
+from sentry.search.utils import parse_datetime_string
+from sentry.snuba.dataset import Dataset
 from sentry.snuba.metrics_performance import query as metrics_query
+from sentry.utils.snuba import raw_snql_query
+
+DEFAULT_LIMIT = 50
+
+
+def query_spans(transaction, regression_breakpoint, params, limit):
+    selected_columns = [
+        "count(span_id) as span_count",
+        "sumArray(spans_exclusive_time) as total_span_self_time",
+        "array_join(spans_op) as span_op",
+        "array_join(spans_group) as span_group",
+        # want a single event id to fetch from nodestore for the span description
+        "any(id) as sample_event_id",
+    ]
+
+    builder = QueryBuilder(
+        dataset=Dataset.Discover,
+        params=params,
+        selected_columns=selected_columns,
+        equations=[],
+        query=f"transaction:{transaction}",
+        orderby=["span_op", "span_group", "total_span_self_time"],
+        limit=limit,
+        config=QueryBuilderConfig(
+            auto_aggregations=True,
+            use_aggregate_conditions=True,
+            functions_acl=[
+                "array_join",
+                "sumArray",
+                "percentileArray",
+            ],
+        ),
+    )
+
+    builder.columns.append(
+        Function(
+            "if",
+            [
+                Function("greaterOrEquals", [Column("timestamp"), regression_breakpoint]),
+                "after",
+                "before",
+            ],
+            "period",
+        )
+    )
+    builder.columns.append(Function("countDistinct", [Column("event_id")], "transaction_count"))
+    builder.groupby.append(Column("period"))
+    builder.limitby = LimitBy([Column("period")], limit // 2)
+
+    snql_query = builder.get_snql_query()
+    results = raw_snql_query(snql_query, "api.organization-events-root-cause-analysis")
+
+    return results.get("data", [])
 
 
 @region_silo_endpoint
@@ -21,15 +79,17 @@ class OrganizationEventsRootCauseAnalysisEndpoint(OrganizationEventsEndpointBase
         ):
             return Response(status=404)
 
-        root_cause_results = {}
-
+        # TODO: Extract this into a custom serializer to handle validation
         transaction_name = request.GET.get("transaction")
         project_id = request.GET.get("project")
-        if not transaction_name or not project_id:
+        regression_breakpoint = request.GET.get("breakpoint")
+        if not transaction_name or not project_id or not regression_breakpoint:
             # Project ID is required to ensure the events we query for are
             # the same transaction
             return Response(status=400)
 
+        regression_breakpoint = parse_datetime_string(regression_breakpoint)
+
         params = self.get_snuba_params(request, organization)
 
         with self.handle_query_errors():
@@ -43,6 +103,11 @@ class OrganizationEventsRootCauseAnalysisEndpoint(OrganizationEventsEndpointBase
         if transaction_count_query["data"][0]["count"] == 0:
             return Response(status=400, data="Transaction not found")
 
-        # TODO: This is only a temporary stub for surfacing RCA data
-        root_cause_results["transaction_count"] = transaction_count_query["data"][0]["count"]
-        return Response(status=200, data=root_cause_results)
+        results = query_spans(
+            transaction=transaction_name,
+            regression_breakpoint=regression_breakpoint,
+            params=params,
+            limit=int(request.GET.get("per_page", DEFAULT_LIMIT)),
+        )
+
+        return Response(results, status=200)

+ 300 - 5
tests/sentry/api/endpoints/test_organization_root_cause_analysis.py

@@ -1,10 +1,13 @@
+from datetime import timedelta
+
 import pytest
 from django.urls import reverse
 
 from sentry.snuba.metrics.naming_layer.mri import TransactionMRI
 from sentry.testutils.cases import MetricsAPIBaseTestCase
-from sentry.testutils.helpers.datetime import freeze_time
+from sentry.testutils.helpers.datetime import freeze_time, iso_format
 from sentry.testutils.silo import region_silo_test
+from sentry.utils.samples import load_data
 
 ROOT_CAUSE_FEATURE_FLAG = "organizations:statistical-detectors-root-cause-analysis"
 
@@ -31,10 +34,39 @@ class OrganizationRootCauseAnalysisTest(MetricsAPIBaseTestCase):
             project_id=self.project.id,
             value=1,
         )
+        self.trace_id = "a" * 32
 
     @property
     def now(self):
-        return MetricsAPIBaseTestCase.MOCK_DATETIME
+        return MetricsAPIBaseTestCase.MOCK_DATETIME.replace(tzinfo=None)
+
+    def create_transaction(
+        self,
+        transaction,
+        trace_id,
+        span_id,
+        parent_span_id,
+        spans,
+        project_id,
+        start_timestamp,
+        duration,
+        transaction_id=None,
+    ):
+        timestamp = start_timestamp + timedelta(milliseconds=duration)
+
+        data = load_data(
+            "transaction",
+            trace=trace_id,
+            span_id=span_id,
+            spans=spans,
+            start_timestamp=start_timestamp,
+            timestamp=timestamp,
+        )
+        if transaction_id is not None:
+            data["event_id"] = transaction_id
+        data["transaction"] = transaction
+        data["contexts"]["trace"]["parent_span_id"] = parent_span_id
+        return self.store_event(data, project_id=project_id)
 
     def test_404s_without_feature_flag(self):
         response = self.client.get(self.url, format="json")
@@ -42,17 +74,30 @@ class OrganizationRootCauseAnalysisTest(MetricsAPIBaseTestCase):
 
     def test_transaction_name_required(self):
         with self.feature(FEATURES):
-            response = self.client.get(self.url, format="json")
+            response = self.client.get(
+                self.url,
+                format="json",
+                data={
+                    "project": self.project.id,
+                    "breakpoint": (self.now - timedelta(days=1)).isoformat(),
+                },
+            )
 
         assert response.status_code == 400, response.content
 
     def test_project_id_required(self):
         with self.feature(FEATURES):
-            response = self.client.get(self.url, format="json", data={"transaction": "foo"})
+            response = self.client.get(
+                self.url,
+                format="json",
+                data={
+                    "transaction": "foo",
+                },
+            )
 
         assert response.status_code == 400, response.content
 
-    def test_transaction_must_exist(self):
+    def test_breakpoint_required(self):
         with self.feature(FEATURES):
             response = self.client.get(
                 self.url,
@@ -60,6 +105,22 @@ class OrganizationRootCauseAnalysisTest(MetricsAPIBaseTestCase):
                 data={"transaction": "foo", "project": self.project.id},
             )
 
+        assert response.status_code == 400, response.content
+
+    def test_transaction_must_exist(self):
+        with self.feature(FEATURES):
+            response = self.client.get(
+                self.url,
+                format="json",
+                data={
+                    "transaction": "foo",
+                    "project": self.project.id,
+                    "breakpoint": self.now - timedelta(days=1),
+                    "start": self.now - timedelta(days=3),
+                    "end": self.now,
+                },
+            )
+
         assert response.status_code == 200, response.content
 
         with self.feature(FEATURES):
@@ -69,7 +130,241 @@ class OrganizationRootCauseAnalysisTest(MetricsAPIBaseTestCase):
                 data={
                     "transaction": "does not exist",
                     "project": self.project.id,
+                    "breakpoint": self.now - timedelta(days=1),
+                    "start": self.now - timedelta(days=3),
+                    "end": self.now,
                 },
             )
 
         assert response.status_code == 400, response.content
+
+    # TODO: Enable this test when adding a serializer to handle validation
+    # def test_breakpoint_must_be_in_the_past(self):
+    #     with self.feature(FEATURES):
+    #         response = self.client.get(
+    #             self.url,
+    #             format="json",
+    #             data={
+    #                 "transaction": "foo",
+    #                 "project": self.project.id,
+    #                 "breakpoint": (self.now + timedelta(days=1)).isoformat(),
+    #             },
+    #         )
+
+    #     assert response.status_code == 400, response.content
+
+    def test_returns_counts_of_spans_before_and_after_breakpoint(self):
+        before_timestamp = self.now - timedelta(days=2)
+        before_span = {
+            "parent_span_id": "a" * 16,
+            "span_id": "e" * 16,
+            "start_timestamp": iso_format(before_timestamp),
+            "timestamp": iso_format(before_timestamp),
+            "op": "django.middleware",
+            "description": "middleware span",
+            "exclusive_time": 60.0,
+        }
+
+        # before
+        self.create_transaction(
+            transaction="foo",
+            trace_id=self.trace_id,
+            span_id="a" * 16,
+            parent_span_id="b" * 16,
+            spans=[before_span],
+            project_id=self.project.id,
+            start_timestamp=before_timestamp,
+            duration=60,
+        )
+        self.create_transaction(
+            transaction="foo",
+            trace_id=self.trace_id,
+            span_id="b" * 16,
+            parent_span_id="b" * 16,
+            spans=[{**before_span, "op": "db", "description": "db span"}],
+            project_id=self.project.id,
+            start_timestamp=before_timestamp,
+            duration=60,
+        )
+
+        # after
+        after_timestamp = self.now - timedelta(hours=1)
+        self.create_transaction(
+            transaction="foo",
+            trace_id=self.trace_id,
+            span_id="c" * 16,
+            parent_span_id="d" * 16,
+            spans=[
+                {
+                    "parent_span_id": "e" * 16,
+                    "span_id": "f" * 16,
+                    "start_timestamp": iso_format(after_timestamp),
+                    "timestamp": iso_format(after_timestamp),
+                    "op": "django.middleware",
+                    "description": "middleware span",
+                    "exclusive_time": 40.0,
+                },
+                {
+                    "parent_span_id": "1" * 16,
+                    "span_id": "2" * 16,
+                    "start_timestamp": iso_format(after_timestamp),
+                    "timestamp": iso_format(after_timestamp),
+                    "op": "django.middleware",
+                    "description": "middleware span",
+                    "exclusive_time": 60.0,
+                },
+                {
+                    "parent_span_id": "1" * 16,
+                    "span_id": "3" * 16,
+                    "start_timestamp": iso_format(after_timestamp),
+                    "timestamp": iso_format(after_timestamp),
+                    "op": "django.middleware",
+                    "description": "middleware span",
+                    "exclusive_time": 60.0,
+                },
+            ],
+            project_id=self.project.id,
+            start_timestamp=after_timestamp,
+            duration=100,
+        )
+
+        with self.feature(FEATURES):
+            response = self.client.get(
+                self.url,
+                format="json",
+                data={
+                    "transaction": "foo",
+                    "project": self.project.id,
+                    "breakpoint": self.now - timedelta(days=1),
+                    "start": self.now - timedelta(days=3),
+                    "end": self.now,
+                },
+            )
+
+        assert response.status_code == 200, response.content
+
+        # Check that sample IDs are gathered, but remove them from the data
+        # for checking since they are randomized
+        assert all("sample_event_id" in row for row in response.data)
+        for row in response.data:
+            del row["sample_event_id"]
+        assert response.data == [
+            {
+                "period": "before",
+                "span_count": 1,
+                "span_group": "5ad8c5a1e8d0e5f7",
+                "span_op": "db",
+                "total_span_self_time": 60.0,
+                "transaction_count": 1,
+            },
+            {
+                "span_group": "2b9cbb96dbf59baa",
+                "span_op": "django.middleware",
+                "period": "before",
+                "total_span_self_time": 60.0,
+                "span_count": 1,
+                "transaction_count": 1,
+            },
+            {
+                "span_group": "2b9cbb96dbf59baa",
+                "span_op": "django.middleware",
+                "period": "after",
+                "total_span_self_time": 160.0,
+                "span_count": 3,
+                "transaction_count": 1,
+            },
+        ]
+
+    def test_results_per_period_are_limited(self):
+        # Before
+        self.create_transaction(
+            transaction="foo",
+            trace_id=self.trace_id,
+            span_id="a" * 16,
+            parent_span_id="b" * 16,
+            spans=[
+                {
+                    "parent_span_id": "a" * 16,
+                    "span_id": "e" * 16,
+                    "start_timestamp": iso_format(self.now - timedelta(days=2)),
+                    "timestamp": iso_format(self.now - timedelta(days=2)),
+                    "op": "django.middleware",
+                    "description": "middleware span",
+                    "exclusive_time": 60.0,
+                }
+            ],
+            project_id=self.project.id,
+            start_timestamp=self.now - timedelta(days=2),
+            duration=60,
+        )
+
+        # After
+        self.create_transaction(
+            transaction="foo",
+            trace_id=self.trace_id,
+            span_id="a" * 16,
+            parent_span_id="b" * 16,
+            spans=[
+                {
+                    "parent_span_id": "a" * 16,
+                    "span_id": "e" * 16,
+                    "start_timestamp": iso_format(self.now - timedelta(hours=1)),
+                    "timestamp": iso_format(self.now - timedelta(hours=1)),
+                    "op": "django.middleware",
+                    "description": "middleware span",
+                    "exclusive_time": 100.0,
+                },
+                {
+                    "parent_span_id": "a" * 16,
+                    "span_id": "f" * 16,
+                    "start_timestamp": iso_format(self.now - timedelta(hours=1)),
+                    "timestamp": iso_format(self.now - timedelta(hours=1)),
+                    "op": "db",
+                    "description": "db",
+                    "exclusive_time": 100.0,
+                },
+            ],
+            project_id=self.project.id,
+            start_timestamp=self.now - timedelta(hours=1),
+            duration=200,
+        )
+
+        with self.feature(FEATURES):
+            response = self.client.get(
+                self.url,
+                format="json",
+                data={
+                    "transaction": "foo",
+                    "project": self.project.id,
+                    "breakpoint": self.now - timedelta(days=1),
+                    "start": self.now - timedelta(days=3),
+                    "end": self.now,
+                    # Force a small page size and verify the 2 spans from After
+                    # don't dominate the results
+                    "per_page": 2,
+                },
+            )
+
+        assert response.status_code == 200, response.content
+
+        for row in response.data:
+            del row["sample_event_id"]
+
+        assert response.data == [
+            {
+                "period": "after",
+                "span_count": 1,
+                "span_group": "d77d5e503ad1439f",
+                "span_op": "db",
+                "total_span_self_time": 100.0,
+                "transaction_count": 1,
+            },
+            {
+                "span_group": "2b9cbb96dbf59baa",
+                "span_op": "django.middleware",
+                "period": "before",
+                "total_span_self_time": 60.0,
+                "span_count": 1,
+                "transaction_count": 1,
+            },
+        ]