
fix(similarity-embedding): Fix backfill filtering and other logic (#71754)

Remove the code that sets metadata for groups where times_seen = 1
When nodestore raises an exception, log it, then re-raise
When nodestore returns no data, log it, then continue with the next batch of the backfill
Fix the ordering to sort by times_seen descending, then id
Jodi Jang, 9 months ago
commit 3111e7ce0b

src/sentry/tasks/backfill_seer_grouping_records.py  +59 -51

@@ -110,22 +110,6 @@ def backfill_seer_grouping_records(
         )
         return
 
-    if last_processed_index == 0:
-        # Set the metadata of groups where times_seen = 1
-        # Do not set the version number, so we can consider it for future backfills later
-        groups_seen_once = Group.objects.filter(
-            project_id=project_id, type=ErrorGroupType.type_id, times_seen=1
-        )
-        for group in groups_seen_once:
-            seer_similarity_seen_once = {"times_seen_once": True}
-            if group.data.get("metadata"):
-                group.data["metadata"]["seer_similarity"] = seer_similarity_seen_once
-            else:
-                group.data["metadata"] = {"seer_similarity": seer_similarity_seen_once}
-
-        if not dry_run:
-            Group.objects.bulk_update(groups_seen_once, ["data"])
-
     group_id_message_data = (
         Group.objects.filter(
             project_id=project.id,
@@ -135,8 +119,7 @@ def backfill_seer_grouping_records(
         )
         .exclude(status__in=[GroupStatus.PENDING_DELETION, GroupStatus.DELETION_IN_PROGRESS])
         .values_list("id", "message", "data")
-        .order_by("times_seen")
-        .order_by("id")
+        .order_by("-times_seen", "id")
     )
 
     batch_size = options.get("embeddings-grouping.seer.backfill-batch-size")
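
Note on the ordering fix above: in Django, each order_by() call replaces any previous ordering rather than appending to it, so the old chained form sorted by id alone and silently dropped the times_seen ordering. A minimal sketch of the difference:

    # Chained calls do not stack: the second order_by() replaces the
    # first, so this sorts by id only.
    Group.objects.order_by("times_seen").order_by("id")

    # A single call with multiple fields applies both: times_seen
    # descending first, then id ascending as a tie-breaker.
    Group.objects.order_by("-times_seen", "id")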
@@ -236,11 +219,19 @@ def backfill_seer_grouping_records(
             project, rows, group_id_message_batch_filtered, group_hashes_dict
         )
 
-        # If nodestore is down, we should stop
+        # If nodestore returns no data
         if data["data"] == [] and data["stacktrace_list"] == []:
             logger.info(
-                "backfill_seer_grouping_records.no_data",
-                extra={"project_id": project.id},
+                "tasks.backfill_seer_grouping_records.no_data",
+                extra={"project_id": project.id, "group_id_batch": json.dumps(group_id_batch)},
+            )
+            call_next_backfill(
+                batch_end_index,
+                project_id,
+                redis_client,
+                len(group_id_message_data),
+                group_id_batch[-1],
+                dry_run,
             )
             return
 
@@ -297,35 +288,14 @@ def backfill_seer_grouping_records(
                     extra={"project_id": project.id, "num_updated": num_updated},
                 )
 
-            last_processed_index = batch_end_index
-            redis_client.set(
-                f"{make_backfill_redis_key(project_id)}",
-                last_processed_index if last_processed_index is not None else 0,
-                ex=60 * 60 * 24 * 7,
+            call_next_backfill(
+                batch_end_index,
+                project_id,
+                redis_client,
+                len(group_id_message_data),
+                group_id_batch[-1],
+                dry_run,
             )
-
-            if last_processed_index and last_processed_index < len(group_id_message_data):
-                logger.info(
-                    "calling next backfill task",
-                    extra={
-                        "project_id": project.id,
-                        "last_processed_index": last_processed_index,
-                        "last_processed_group_id": group_id_batch[-1],
-                        "dry_run": dry_run,
-                    },
-                )
-                backfill_seer_grouping_records.apply_async(
-                    args=[project.id, last_processed_index, dry_run],
-                )
-            else:
-                logger.info(
-                    "reached the end of the group id list",
-                    extra={
-                        "project_id": project.id,
-                        "last_processed_index": last_processed_index,
-                        "dry_run": dry_run,
-                    },
-                )
         else:
             # If seer is down, we should stop
             logger.info(
@@ -408,7 +378,6 @@ def lookup_group_data_stacktrace_bulk(
             try:
                 bulk_data = nodestore.backend.get_multi(list(node_id_to_group_data.keys()))
             except (ServiceUnavailable, DeadlineExceeded) as e:
-                bulk_data = {}
                 extra = {
                     "organization_id": project.organization.id,
                     "project_id": project.id,
@@ -419,6 +388,7 @@ def lookup_group_data_stacktrace_bulk(
                     "tasks.backfill_seer_grouping_records.bulk_event_lookup_exception",
                     extra=extra,
                 )
+                raise
 
     group_data = []
     stacktrace_strings = []
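
The removed fallback (bulk_data = {}) previously swallowed nodestore outages and let the backfill continue with empty data; the added bare raise makes the task fail loudly instead. A self-contained sketch of the log-then-raise pattern, with get_bulk_events and backend as hypothetical stand-ins for the real lookup helper and nodestore backend:

    import logging

    from google.api_core.exceptions import DeadlineExceeded, ServiceUnavailable

    logger = logging.getLogger(__name__)

    def get_bulk_events(backend, node_ids):
        try:
            return backend.get_multi(node_ids)
        except (ServiceUnavailable, DeadlineExceeded):
            # logger.exception() logs at ERROR level and attaches the
            # active traceback to the record automatically.
            logger.exception("bulk_event_lookup_exception")
            # A bare raise re-raises the in-flight exception with its
            # original traceback intact.
            raise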
@@ -481,7 +451,6 @@ def lookup_group_data_stacktrace_single(
             try:
                 event = lookup_event(project_id=project_id, event_id=event_id, group_id=group_id)
             except (ServiceUnavailable, DeadlineExceeded) as e:
-                event = None
                 extra = {
                     "organization_id": project.organization.id,
                     "project_id": project.id,
@@ -492,6 +461,7 @@ def lookup_group_data_stacktrace_single(
                 logger.exception(
                     "tasks.backfill_seer_grouping_records.event_lookup_exception", extra=extra
                 )
+                raise
 
     if event and event.data and event.data.get("exception"):
         with sentry_sdk.start_transaction(op="embeddings_grouping.get_latest_event"):
@@ -554,3 +524,41 @@ def delete_seer_grouping_records(
         for group in groups_with_seer_metadata:
             del group.data["metadata"]["seer_similarity"]
         Group.objects.bulk_update(groups_with_seer_metadata, ["data"])
+
+
+def call_next_backfill(
+    last_processed_index: int,
+    project_id: int,
+    redis_client: RedisCluster | StrictRedis,
+    len_group_id_batch_unfiltered: int,
+    last_group_id: int,
+    dry_run: bool,
+):
+    redis_client.set(
+        f"{make_backfill_redis_key(project_id)}",
+        last_processed_index if last_processed_index is not None else 0,
+        ex=60 * 60 * 24 * 7,
+    )
+
+    if last_processed_index and last_processed_index < len_group_id_batch_unfiltered:
+        logger.info(
+            "calling next backfill task",
+            extra={
+                "project_id": project_id,
+                "last_processed_index": last_processed_index,
+                "last_processed_group_id": last_group_id,
+                "dry_run": dry_run,
+            },
+        )
+        backfill_seer_grouping_records.apply_async(
+            args=[project_id, last_processed_index, dry_run],
+        )
+    else:
+        logger.info(
+            "reached the end of the group id list",
+            extra={
+                "project_id": project_id,
+                "last_processed_index": last_processed_index,
+                "dry_run": dry_run,
+            },
+        )
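
call_next_backfill now owns the checkpoint-and-chain logic shared by the success path and the new empty-batch path: it writes last_processed_index to Redis with a 7-day expiry (ex=60 * 60 * 24 * 7 seconds), then either enqueues the next task or logs that the group list is exhausted. A toy model of the batch chain this produces (batch size and names hypothetical; the real code checkpoints to Redis and enqueues the next task via apply_async instead of recursing):

    def run_batch(last_processed_index: int, total: int, batch_size: int = 20) -> None:
        """Toy stand-in for one task invocation."""
        batch_end_index = min(last_processed_index + batch_size, total)
        # ... process group ids [last_processed_index, batch_end_index) ...
        if batch_end_index < total:
            run_batch(batch_end_index, total, batch_size)
        else:
            print("reached the end of the group id list")

    run_batch(0, total=50)  # batches [0, 20), [20, 40), [40, 50), then done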

tests/sentry/tasks/test_backfill_seer_grouping_records.py  +76 -45

@@ -4,7 +4,7 @@ from datetime import UTC, datetime, timedelta
 from random import choice
 from string import ascii_uppercase
 from typing import Any
-from unittest.mock import call, patch
+from unittest.mock import ANY, call, patch
 
 import pytest
 from django.conf import settings
@@ -177,10 +177,16 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
         assert group_data == expected_group_data
         assert stacktrace_string == EXCEPTION_STACKTRACE_STRING
 
-    @patch("sentry.tasks.backfill_seer_grouping_records.lookup_event")
+    @patch("time.sleep", return_value=None)
     @patch("sentry.tasks.backfill_seer_grouping_records.logger")
-    def test_lookup_group_data_stacktrace_single_exceptions(self, mock_logger, mock_lookup_event):
-        """Test cases where ServiceUnavailable and DeadlineExceeded exceptions occur"""
+    @patch("sentry.tasks.backfill_seer_grouping_records.lookup_event")
+    def test_lookup_group_data_stacktrace_single_exceptions(
+        self, mock_lookup_event, mock_logger, mock_sleep
+    ):
+        """
+        Test when ServiceUnavailable and DeadlineExceeded exceptions occur, that we stop the
+        backfill
+        """
         exceptions = [
             ServiceUnavailable(message="Service Unavailable"),
             DeadlineExceeded(message="Deadline Exceeded"),
@@ -189,24 +195,24 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
 
         for exception in exceptions:
             mock_lookup_event.side_effect = exception
-            group_data, stacktrace_string = lookup_group_data_stacktrace_single(
-                self.project,
-                event.event_id,
-                event.group_id,
-                event.group.message,
-                self.group_hashes[event.group.id],
-            )
-            assert (group_data, stacktrace_string) == (None, "")
-            mock_logger.exception.assert_called_with(
-                "tasks.backfill_seer_grouping_records.event_lookup_exception",
-                extra={
-                    "organization_id": self.project.organization.id,
-                    "project_id": self.project.id,
-                    "group_id": event.group.id,
-                    "event_id": event.event_id,
-                    "error": exception.message,
-                },
-            )
+            with pytest.raises(Exception):
+                lookup_group_data_stacktrace_single(
+                    self.project,
+                    event.event_id,
+                    event.group_id,
+                    event.group.message,
+                    self.group_hashes[event.group.id],
+                )
+                mock_logger.exception.assert_called_with(
+                    "tasks.backfill_seer_grouping_records.event_lookup_exception",
+                    extra={
+                        "organization_id": self.project.organization.id,
+                        "project_id": self.project.id,
+                        "group_id": event.group.id,
+                        "event_id": event.event_id,
+                        "error": exception.message,
+                    },
+                )
 
     def test_lookup_group_data_stacktrace_single_not_stacktrace_grouping(self):
         """Test that no data is returned if the group did not use the stacktrace to determine grouping"""
@@ -264,14 +270,14 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
         )
 
     @patch("time.sleep", return_value=None)
-    @patch("sentry.nodestore.backend.get_multi")
     @patch("sentry.tasks.backfill_seer_grouping_records.logger")
+    @patch("sentry.nodestore.backend.get_multi")
     def test_lookup_group_data_stacktrace_bulk_exceptions(
-        self, mock_logger, mock_get_multi, mock_sleep
+        self, mock_get_multi, mock_logger, mock_sleep
     ):
         """
         Test cases where ServiceUnavailable or DeadlineExceeded exceptions occur in bulk data
-        lookup
+        lookup, that the backfill stops
         """
         exceptions = [
             ServiceUnavailable(message="Service Unavailable"),
@@ -281,24 +287,17 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
 
         for exception in exceptions:
             mock_get_multi.side_effect = exception
-            (
-                bulk_event_ids,
-                invalid_event_ids,
-                bulk_group_data_stacktraces,
-            ) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
-            assert bulk_event_ids == set()
-            assert invalid_event_ids == set()
-            assert bulk_group_data_stacktraces["data"] == []
-            assert bulk_group_data_stacktraces["stacktrace_list"] == []
-            mock_logger.exception.assert_called_with(
-                "tasks.backfill_seer_grouping_records.bulk_event_lookup_exception",
-                extra={
-                    "organization_id": self.project.organization.id,
-                    "project_id": self.project.id,
-                    "group_data": json.dumps(rows),
-                    "error": exception.message,
-                },
-            )
+            with pytest.raises(Exception):
+                lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
+                mock_logger.exception.assert_called_with(
+                    "tasks.backfill_seer_grouping_records.bulk_event_lookup_exception",
+                    extra={
+                        "organization_id": self.project.organization.id,
+                        "project_id": self.project.id,
+                        "group_data": json.dumps(rows),
+                        "error": exception.message,
+                    },
+                )
 
     def test_lookup_group_data_stacktrace_bulk_not_stacktrace_grouping(self):
         """
@@ -697,7 +696,7 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
         self, mock_post_bulk_grouping_records
     ):
         """
-        Test that different metadata is set for groups where times_seen > 1 and times_seen == 1.
+        Test that groups where times_seen == 1 are not included.
         """
         mock_post_bulk_grouping_records.return_value = {"success": True, "groups_with_neighbor": {}}
 
@@ -725,7 +724,7 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
                     "request_hash": self.group_hashes[group.id],
                 }
             else:
-                assert group.data["metadata"].get("seer_similarity") == {"times_seen_once": True}
+                assert group.data["metadata"].get("seer_similarity") is None
 
         redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
         last_processed_index = int(redis_client.get(make_backfill_redis_key(self.project.id)) or 0)
@@ -1100,3 +1099,35 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
         # Assert metadata was not set for groups that is 90 days old
         old_group = Group.objects.filter(project_id=self.project.id, id=old_group_id).first()
         assert old_group.data["metadata"].get("seer_similarity") is None
+
+    @with_feature("projects:similarity-embeddings-backfill")
+    @patch("sentry.tasks.backfill_seer_grouping_records.logger")
+    @patch(
+        "sentry.tasks.backfill_seer_grouping_records.lookup_group_data_stacktrace_bulk_with_fallback"
+    )
+    @patch("sentry.tasks.backfill_seer_grouping_records.call_next_backfill")
+    def test_backfill_seer_grouping_records_empty_nodestore(
+        self,
+        mock_call_next_backfill,
+        mock_lookup_group_data_stacktrace_bulk_with_fallback,
+        mock_logger,
+    ):
+        mock_lookup_group_data_stacktrace_bulk_with_fallback.return_value = GroupStacktraceData(
+            data=[], stacktrace_list=[]
+        )
+
+        with TaskRunner():
+            backfill_seer_grouping_records(self.project.id, None)
+
+        groups = Group.objects.all()
+        groups_len = len(groups)
+        mock_logger.info.assert_called_with(
+            "tasks.backfill_seer_grouping_records.no_data",
+            extra={
+                "project_id": self.project.id,
+                "group_id_batch": json.dumps([group.id for group in groups]),
+            },
+        )
+        mock_call_next_backfill.assert_called_with(
+            groups_len, self.project.id, ANY, groups_len, groups[groups_len - 1].id, False
+        )
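
The new test passes unittest.mock.ANY for the redis_client argument, since the concrete client is constructed inside the task and cannot be referenced from the test. ANY compares equal to anything, which keeps the assertion focused on the arguments that matter. A small self-contained example:

    from unittest.mock import ANY, MagicMock

    callback = MagicMock()
    callback(42, object(), False)

    # ANY == anything, so the opaque middle argument does not need to
    # be reconstructed in the assertion.
    callback.assert_called_with(42, ANY, False)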