Browse Source

fix(similarity-embedding): Skip groups with no events in record backfill (#71659)

Skip and log groups with no events when running grouping record backfill
Jodi Jang 9 months ago
parent
commit
11a12efdbc

+ 17 - 0
src/sentry/tasks/backfill_seer_grouping_records.py

@@ -205,6 +205,23 @@ def backfill_seer_grouping_records(
 
     if result and result[0].get("data"):
         rows: list[GroupEventRow] = result[0]["data"]
+
+        # Log every group that has no events in snuba and drop it from this batch
+        if len(rows) != len(group_id_batch):
+            row_group_ids = {row["group_id"] for row in rows}
+            for group_id in list(group_id_batch):  # snapshot: the batch is mutated below
+                if group_id not in row_group_ids:
+                    logger.info(
+                        "tasks.backfill_seer_grouping_records.no_snuba_event",
+                        extra={
+                            "organization_id": project.organization.id,
+                            "project_id": project_id,
+                            "group_id": group_id,
+                        },
+                    )
+                    group_id_batch.remove(group_id)
+                    del group_id_message_batch_filtered[group_id]
+
         group_hashes = GroupHash.objects.filter(
             project_id=project.id, group_id__in=group_id_batch
         ).distinct("group_id")

+ 78 - 1
tests/sentry/tasks/test_backfill_seer_grouping_records.py

@@ -1,13 +1,16 @@
 import copy
 from collections.abc import Mapping
+from datetime import datetime, timedelta
 from random import choice
 from string import ascii_uppercase
 from typing import Any
-from unittest.mock import patch
+from unittest.mock import call, patch
 
 import pytest
 from django.conf import settings
 from google.api_core.exceptions import DeadlineExceeded, ServiceUnavailable
+from snuba_sdk import Column, Condition, Entity, Function, Op, Query, Request
+from snuba_sdk.orderby import Direction, OrderBy
 
 from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION
 from sentry.grouping.grouping_info import get_grouping_info
@@ -17,6 +20,8 @@ from sentry.models.grouphash import GroupHash
 from sentry.seer.similarity.backfill import CreateGroupingRecordData
 from sentry.seer.similarity.types import RawSeerSimilarIssueData
 from sentry.seer.similarity.utils import get_stacktrace_string
+from sentry.snuba.dataset import Dataset
+from sentry.snuba.referrer import Referrer
 from sentry.tasks.backfill_seer_grouping_records import (
     GroupStacktraceData,
     backfill_seer_grouping_records,
@@ -32,6 +37,7 @@ from sentry.testutils.helpers.features import with_feature
 from sentry.testutils.helpers.task_runner import TaskRunner
 from sentry.testutils.pytest.fixtures import django_db_all
 from sentry.utils import json, redis
+from sentry.utils.snuba import bulk_snuba_queries
 
 EXCEPTION = {
     "values": [
@@ -984,3 +990,74 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
         # Assert metadata was not set for groups that will be deleted
         for group in Group.objects.filter(project_id=self.project.id, id__in=deleted_group_ids):
             assert group.data["metadata"].get("seer_similarity") is None
+
+    @with_feature("projects:similarity-embeddings-backfill")
+    @patch("sentry.tasks.backfill_seer_grouping_records.logger")
+    @patch("sentry.tasks.backfill_seer_grouping_records.bulk_snuba_queries")
+    @patch("sentry.tasks.backfill_seer_grouping_records.post_bulk_grouping_records")
+    def test_backfill_seer_grouping_records_no_events(
+        self, mock_post_bulk_grouping_records, mock_snuba_queries, mock_logger
+    ):
+        """
+        Groups with no events in snuba are skipped, get no seer metadata, and are logged.
+        """
+        mock_post_bulk_grouping_records.return_value = {"success": True, "groups_with_neighbor": {}}
+
+        # Mock the snuba response by running an unpatched query that omits the lowest-id group
+        group_ids_minus_first = Group.objects.filter(project_id=self.project.id).order_by("id")[1:]
+        group_id_batch = [group.id for group in group_ids_minus_first]
+        time_now = datetime.now()  # naive timestamp bounding the 90-day query window below
+        events_entity = Entity("events", alias="events")
+        query = Query(
+            match=events_entity,
+            select=[
+                Column("group_id"),
+                Function("max", [Column("event_id")], "event_id"),
+            ],
+            groupby=[Column("group_id")],
+            where=[
+                Condition(Column("project_id"), Op.EQ, self.project.id),
+                Condition(Column("group_id"), Op.IN, group_id_batch),
+                Condition(
+                    Column("timestamp", entity=events_entity), Op.GTE, time_now - timedelta(days=90)
+                ),
+                Condition(Column("timestamp", entity=events_entity), Op.LT, time_now),
+            ],
+            orderby=[OrderBy(Column("group_id"), Direction.ASC)],
+        )
+
+        request = Request(
+            dataset=Dataset.Events.value,
+            app_id=Referrer.GROUPING_RECORDS_BACKFILL_REFERRER.value,
+            query=query,
+            tenant_ids={
+                "referrer": Referrer.GROUPING_RECORDS_BACKFILL_REFERRER.value,
+                "cross_org_query": 1,
+            },
+        )
+
+        result = bulk_snuba_queries(
+            [request], referrer=Referrer.GROUPING_RECORDS_BACKFILL_REFERRER.value
+        )
+        mock_snuba_queries.return_value = result  # what the task's patched snuba call returns
+
+        with TaskRunner():
+            backfill_seer_grouping_records(self.project.id, None)
+
+        for group in Group.objects.filter(project_id=self.project.id).order_by("id")[1:]:
+            assert group.data["metadata"].get("seer_similarity") is not None
+
+        # The group absent from the mocked response has no seer metadata and its skip is logged
+        group_no_events = Group.objects.filter(project_id=self.project.id).order_by("id")[0]
+        assert group_no_events.data["metadata"].get("seer_similarity") is None
+        assert (
+            call(
+                "tasks.backfill_seer_grouping_records.no_snuba_event",
+                extra={
+                    "organization_id": self.project.organization.id,
+                    "project_id": self.project.id,
+                    "group_id": group_no_events.id,
+                },
+            )
+            in mock_logger.info.call_args_list
+        )