Browse Source

perf(issues): improve adjacent_events query (#79365)

Right now the a `adjacent_events` query does a really big scan in order
to find the adjacent events given an event id. It does this because the
old query does not specify an order by which can take advantage of the
primary key
`ORDER BY (project_id, toStartOfDay(timestamp), primary_hash,
cityHash64(event_id))`

This PR adds a new function `get_adjacent_event_ids_snql` which uses
SnQL and adds a order by clause. We observe in the snuba admin tool this
new query takes only 2 seconds and scans hundreds of MBs of data as
opposed to the old query which scans 10+ GB on large issues.

`get_adjacent_event_ids_snql` is not general purpose as its only used in
one place.

see ticket below for more info:
Fixes https://github.com/getsentry/team-issues/issues/42
Josh Ferge 4 months ago
parent
commit
d8ff0cb911

+ 11 - 0
src/sentry/eventstore/base.py

@@ -130,6 +130,7 @@ class EventStorage(Service):
         "get_events_snql",
         "get_unfetched_events",
         "get_adjacent_event_ids",
+        "get_adjacent_event_ids_snql",
         "bind_nodes",
         "get_unfetched_transactions",
     )
@@ -272,6 +273,16 @@ class EventStorage(Service):
         """
         raise NotImplementedError
 
+    def get_adjacent_event_ids_snql(
+        self,
+        organization_id: int,
+        project_id: int,
+        group_id: int,
+        environments: list[str],
+        event: Event | GroupEvent,
+    ):
+        raise NotImplementedError
+
     def get_adjacent_event_ids(self, event, filter):
         """
         Gets the previous and next event IDs given a current event and some conditions/filters.

+ 109 - 1
src/sentry/eventstore/snuba/backend.py

@@ -29,7 +29,7 @@ from sentry.models.group import Group
 from sentry.snuba.dataset import Dataset
 from sentry.snuba.events import Columns
 from sentry.utils import snuba
-from sentry.utils.snuba import DATASETS, _prepare_start_end, raw_snql_query
+from sentry.utils.snuba import DATASETS, _prepare_start_end, bulk_snuba_queries, raw_snql_query
 from sentry.utils.validators import normalize_event_id
 
 EVENT_ID = Columns.EVENT_ID.value.alias
@@ -451,6 +451,114 @@ class SnubaEventStorage(EventStorage):
         else:
             return Dataset.Discover
 
+    def get_adjacent_event_ids_snql(
+        self, organization_id, project_id, group_id, environments, event
+    ):
+        """
+        Utility function for grabbing an event's adjascent events,
+        which are the ones with the closest timestamps before and after.
+        This function is only used in project_event_details at the moment,
+        so it's interface is tailored to that. We use SnQL and use the project_id
+        and toStartOfDay(timestamp) to efficently scan our table
+        """
+        dataset = self._get_dataset_for_event(event)
+        app_id = "eventstore"
+        referrer = "eventstore.get_next_or_prev_event_id_snql"
+        tenant_ids = {"organization_id": organization_id}
+        environment_conditions = []
+        if environments:
+            environment_conditions.append(Condition(Column("environment"), Op.IN, environments))
+
+        def make_constant_conditions():
+            environment_conditions = []
+            if environments:
+                environment_conditions.append(Condition(Column("environment"), Op.IN, environments))
+
+            group_conditions = []
+            if group_id:
+                group_conditions.append(Condition(Column("group_id"), Op.EQ, group_id))
+            project_conditions = [Condition(Column("project_id"), Op.EQ, project_id)]
+            return [
+                *environment_conditions,
+                *group_conditions,
+                *project_conditions,
+            ]
+
+        def make_prev_timestamp_conditions(event):
+            return [
+                Condition(
+                    Column(DATASETS[dataset][Columns.TIMESTAMP.value.alias]),
+                    Op.GTE,
+                    event.datetime - timedelta(days=100),
+                ),
+                Condition(
+                    Column(DATASETS[dataset][Columns.TIMESTAMP.value.alias]),
+                    Op.LT,
+                    event.datetime + timedelta(seconds=1),
+                ),
+                Condition(Column("event_id"), Op.LT, event.event_id),
+            ]
+
+        def make_next_timestamp_conditions(event):
+            return [
+                Condition(
+                    Column(DATASETS[dataset][Columns.TIMESTAMP.value.alias]),
+                    Op.LT,
+                    event.datetime + timedelta(days=100),
+                ),
+                Condition(
+                    Column(DATASETS[dataset][Columns.TIMESTAMP.value.alias]), Op.GTE, event.datetime
+                ),
+                Condition(Column("event_id"), Op.GT, event.event_id),
+            ]
+
+        def make_request(is_prev):
+            order_by_direction = Direction.DESC if is_prev else Direction.ASC
+            conditions = make_constant_conditions()
+            conditions.extend(
+                make_prev_timestamp_conditions(event)
+                if is_prev
+                else make_next_timestamp_conditions(event)
+            )
+            return Request(
+                dataset=dataset.value,
+                app_id=app_id,
+                query=Query(
+                    match=Entity(dataset.value),
+                    select=[Column("event_id"), Column("project_id")],
+                    where=conditions,
+                    orderby=[
+                        OrderBy(
+                            Column("project_id"),
+                            direction=order_by_direction,
+                        ),
+                        OrderBy(
+                            Function("toStartOfDay", [Column("timestamp")]),
+                            direction=order_by_direction,
+                        ),
+                        OrderBy(
+                            Column("timestamp"),
+                            direction=order_by_direction,
+                        ),
+                        OrderBy(
+                            Column("event_id"),
+                            direction=order_by_direction,
+                        ),
+                    ],
+                    limit=Limit(1),
+                ),
+                tenant_ids=tenant_ids,
+            )
+
+        snql_request_prev = make_request(is_prev=True)
+        snql_request_next = make_request(is_prev=False)
+
+        bulk_snql_results = bulk_snuba_queries(
+            [snql_request_prev, snql_request_next], referrer=referrer
+        )
+        event_ids = [self.__get_event_id_from_result(result) for result in bulk_snql_results]
+        return event_ids
+
     def get_adjacent_event_ids(self, event, filter):
         """
         Returns (project_id, event_id) of a previous event given a current event

+ 20 - 11
src/sentry/issues/endpoints/project_event_details.py

@@ -4,7 +4,7 @@ from typing import Any
 from rest_framework.request import Request
 from rest_framework.response import Response
 
-from sentry import eventstore
+from sentry import eventstore, options
 from sentry.api.api_owners import ApiOwner
 from sentry.api.api_publish_status import ApiPublishStatus
 from sentry.api.base import region_silo_endpoint
@@ -37,16 +37,25 @@ def wrap_event_response(
     prev_event_id = None
 
     if event.group_id:
-        conditions = []
-        if environments:
-            conditions.append(["environment", "IN", environments])
-        _filter = eventstore.Filter(
-            conditions=conditions,
-            project_ids=[event.project_id],
-            group_ids=[event.group_id],
-        )
-
-        prev_ids, next_ids = eventstore.backend.get_adjacent_event_ids(event, filter=_filter)
+        if options.get("eventstore.adjacent_event_ids_use_snql"):
+            prev_ids, next_ids = eventstore.backend.get_adjacent_event_ids_snql(
+                organization_id=event.organization.id,
+                project_id=event.project_id,
+                group_id=event.group_id,
+                environments=environments,
+                event=event,
+            )
+        else:
+            conditions = []
+            if environments:
+                conditions.append(["environment", "IN", environments])
+            _filter = eventstore.Filter(
+                conditions=conditions,
+                project_ids=[event.project_id],
+                group_ids=[event.group_id],
+            )
+
+            prev_ids, next_ids = eventstore.backend.get_adjacent_event_ids(event, filter=_filter)
 
         next_event_id = next_ids[1] if next_ids else None
         prev_event_id = prev_ids[1] if prev_ids else None

+ 7 - 0
src/sentry/options/defaults.py

@@ -2794,3 +2794,10 @@ register(
     default=True,
     flags=FLAG_AUTOMATOR_MODIFIABLE,
 )
+
+register(
+    "eventstore.adjacent_event_ids_use_snql",
+    type=Bool,
+    default=False,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)

+ 93 - 0
tests/sentry/eventstore/snuba/test_backend.py

@@ -321,3 +321,96 @@ class SnubaEventStorageTest(TestCase, SnubaTestCase, PerformanceIssueTestCase):
 
         assert prev_event is None
         assert next_event == (str(self.project2.id), self.transaction_event_3.event_id)
+
+    def test_get_adjacent_event_ids_snql(self):
+        project = self.create_project()
+        event1 = self.store_event(
+            data={
+                "event_id": "a" * 32,
+                "type": "default",
+                "platform": "python",
+                "fingerprint": ["group"],
+                "timestamp": self.two_min_ago,
+            },
+            project_id=project.id,
+        )
+        event2 = self.store_event(
+            data={
+                "event_id": "b" * 32,
+                "type": "default",
+                "platform": "python",
+                "fingerprint": ["group"],
+                "timestamp": self.min_ago,
+            },
+            project_id=project.id,
+        )
+        event3 = self.store_event(
+            data={
+                "event_id": "c" * 32,
+                "type": "default",
+                "platform": "python",
+                "fingerprint": ["group"],
+                "timestamp": before_now(minutes=0).isoformat(),
+            },
+            project_id=project.id,
+        )
+        prev_ids, next_ids = self.eventstore.get_adjacent_event_ids_snql(
+            organization_id=event2.organization.id,
+            project_id=event2.project_id,
+            group_id=event2.group_id,
+            environments=[],
+            event=event2,
+        )
+
+        assert prev_ids == (str(event1.project_id), event1.event_id)
+        assert next_ids == (str(event3.project_id), event3.event_id)
+
+    def test_adjacent_event_ids_same_timestamp_snql(self):
+        project = self.create_project()
+        event1 = self.store_event(
+            data={
+                "event_id": "a" * 32,
+                "type": "default",
+                "platform": "python",
+                "fingerprint": ["group"],
+                "timestamp": self.min_ago,
+            },
+            project_id=project.id,
+        )
+        event2 = self.store_event(
+            data={
+                "event_id": "b" * 32,
+                "type": "default",
+                "platform": "python",
+                "fingerprint": ["group"],
+                "timestamp": self.min_ago,
+            },
+            project_id=project.id,
+        )
+
+        # the 2 events should be in the same group
+        assert event1.group_id == event2.group_id
+        # the 2 events should have the same timestamp
+        assert event1.datetime == event2.datetime
+
+        prev_ids, next_ids = self.eventstore.get_adjacent_event_ids_snql(
+            organization_id=event1.organization.id,
+            project_id=event1.project_id,
+            group_id=event1.group_id,
+            environments=[],
+            event=event1,
+        )
+
+        assert prev_ids is None
+        assert next_ids == (str(project.id), event2.event_id)
+
+        prev_ids, next_ids = self.eventstore.get_adjacent_event_ids_snql(
+            organization_id=event2.organization.id,
+            project_id=event2.project_id,
+            group_id=event2.group_id,
+            environments=[],
+            event=event2,
+        )
+
+        assert prev_ids == (str(project.id), event1.event_id)
+        assert next_ids is None