Browse Source

ref(similarity): Add option to skip processed and chosen projects in backfill (#75084)

Allow projects that have already been backfilled, and projects
listed manually in `skip_project_ids`, to be skipped.
Jodi Jang 7 months ago
parent
commit
e0350cf886

+ 11 - 1
src/sentry/api/endpoints/project_backfill_similar_issues_embeddings_records.py

@@ -34,6 +34,8 @@ class ProjectBackfillSimilarIssuesEmbeddingsRecords(ProjectEndpoint):
         last_processed_id = None
         only_delete = False
         enable_ingestion = False
+        skip_processed_projects = False
+        skip_project_ids = None
 
         if request.data.get("last_processed_id"):
             last_processed_id = int(request.data["last_processed_id"])
@@ -42,12 +44,20 @@ class ProjectBackfillSimilarIssuesEmbeddingsRecords(ProjectEndpoint):
             only_delete = True
 
         if request.data.get("enable_ingestion"):
-            enable_ingestion = request.data["enable_ingestion"] == "true"
+            enable_ingestion = True
+
+        if request.data.get("skip_processed_projects"):
+            skip_processed_projects = True
+
+        if request.data.get("skip_project_ids"):
+            skip_project_ids = request.data["skip_project_ids"]
 
         backfill_seer_grouping_records_for_project.delay(
             current_project_id=project.id,
             last_processed_group_id_input=last_processed_id,
             only_delete=only_delete,
             enable_ingestion=enable_ingestion,
+            skip_processed_projects=skip_processed_projects,
+            skip_project_ids=skip_project_ids,
         )
         return Response(status=204)

+ 26 - 0
src/sentry/tasks/embeddings_grouping/backfill_seer_grouping_records_for_project.py

@@ -45,6 +45,8 @@ def backfill_seer_grouping_records_for_project(
     last_processed_project_index_input: int | None = None,
     only_delete: bool = False,
     enable_ingestion: bool = False,
+    skip_processed_projects: bool = False,
+    skip_project_ids: list[int] | None = None,
     *args: Any,
     **kwargs: Any,
 ) -> None:
@@ -99,6 +101,30 @@ def backfill_seer_grouping_records_for_project(
         )
         return
 
+    is_project_processed = (
+        skip_processed_projects
+        and project.get_option("sentry:similarity_backfill_completed") is not None
+    )
+    is_project_skipped = skip_project_ids and project.id in skip_project_ids
+    if is_project_processed or is_project_skipped:
+        logger.info(
+            "backfill_seer_grouping_records.project_skipped",
+            extra={
+                "project_id": current_project_id,
+                "project_already_processed": is_project_processed,
+                "project_manually_skipped": is_project_skipped,
+            },
+        )
+        call_next_backfill(
+            last_processed_group_id=None,
+            project_id=current_project_id,
+            last_processed_project_index=last_processed_project_index,
+            cohort=cohort,
+            only_delete=only_delete,
+            enable_ingestion=enable_ingestion,
+        )
+        return
+
     if only_delete:
         delete_seer_grouping_records(current_project_id)
         logger.info(

+ 58 - 0
tests/sentry/api/endpoints/test_project_backfill_similar_issues_embeddings_records.py

@@ -51,6 +51,8 @@ class ProjectBackfillSimilarIssuesEmbeddingsRecordsTest(APITestCase):
             last_processed_group_id_input=None,
             only_delete=False,
             enable_ingestion=False,
+            skip_processed_projects=False,
+            skip_project_ids=None,
         )
 
     @patch(
@@ -68,6 +70,8 @@ class ProjectBackfillSimilarIssuesEmbeddingsRecordsTest(APITestCase):
             last_processed_group_id_input=None,
             only_delete=False,
             enable_ingestion=False,
+            skip_processed_projects=False,
+            skip_project_ids=None,
         )
 
     @patch(
@@ -88,6 +92,8 @@ class ProjectBackfillSimilarIssuesEmbeddingsRecordsTest(APITestCase):
             last_processed_group_id_input=8,
             only_delete=False,
             enable_ingestion=False,
+            skip_processed_projects=False,
+            skip_project_ids=None,
         )
 
     @patch(
@@ -110,6 +116,8 @@ class ProjectBackfillSimilarIssuesEmbeddingsRecordsTest(APITestCase):
             last_processed_group_id_input=8,
             only_delete=True,
             enable_ingestion=False,
+            skip_processed_projects=False,
+            skip_project_ids=None,
         )
 
     @patch(
@@ -132,4 +140,54 @@ class ProjectBackfillSimilarIssuesEmbeddingsRecordsTest(APITestCase):
             last_processed_group_id_input=8,
             only_delete=False,
             enable_ingestion=True,
+            skip_processed_projects=False,
+            skip_project_ids=None,
+        )
+
+    @patch(
+        "sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.is_active_superuser",
+        return_value=True,
+    )
+    @patch(
+        "sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.backfill_seer_grouping_records_for_project.delay"
+    )
+    @with_feature("projects:similarity-embeddings-backfill")
+    def test_post_success_skip_processed_projects(
+        self, mock_backfill_seer_grouping_records, mock_is_active_superuser
+    ):
+        response = self.client.post(
+            self.url, data={"last_processed_id": "8", "skip_processed_projects": "true"}
+        )
+        assert response.status_code == 204, response.content
+        mock_backfill_seer_grouping_records.assert_called_with(
+            current_project_id=self.project.id,
+            last_processed_group_id_input=8,
+            only_delete=False,
+            enable_ingestion=False,
+            skip_processed_projects=True,
+            skip_project_ids=None,
+        )
+
+    @patch(
+        "sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.is_active_superuser",
+        return_value=True,
+    )
+    @patch(
+        "sentry.api.endpoints.project_backfill_similar_issues_embeddings_records.backfill_seer_grouping_records_for_project.delay"
+    )
+    @with_feature("projects:similarity-embeddings-backfill")
+    def test_post_success_skip_project_ids(
+        self, mock_backfill_seer_grouping_records, mock_is_active_superuser
+    ):
+        response = self.client.post(
+            self.url, data={"last_processed_id": "8", "skip_project_ids": [1]}
+        )
+        assert response.status_code == 204, response.content
+        mock_backfill_seer_grouping_records.assert_called_with(
+            current_project_id=self.project.id,
+            last_processed_group_id_input=8,
+            only_delete=False,
+            enable_ingestion=False,
+            skip_processed_projects=False,
+            skip_project_ids=[1],
         )

+ 120 - 2
tests/sentry/tasks/test_backfill_seer_grouping_records.py

@@ -1,4 +1,5 @@
 import copy
+import time
 from collections.abc import Mapping
 from datetime import UTC, datetime, timedelta
 from random import choice
@@ -1519,10 +1520,9 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
         assert self.project.get_option("sentry:similarity_backfill_completed") is not None
 
     @with_feature("projects:similarity-embeddings-backfill")
-    @patch("sentry.tasks.embeddings_grouping.utils.logger")
     @patch("sentry.tasks.embeddings_grouping.utils.post_bulk_grouping_records")
     def test_backfill_seer_grouping_records_no_enable_ingestion(
-        self, mock_post_bulk_grouping_records, mock_logger
+        self, mock_post_bulk_grouping_records
     ):
         """
         Test that when the enable_ingestion flag is False, the project option is not set.
@@ -1540,3 +1540,121 @@ class TestBackfillSeerGroupingRecords(SnubaTestCase, TestCase):
             }
 
         assert self.project.get_option("sentry:similarity_backfill_completed") is None
+
+    @with_feature("projects:similarity-embeddings-backfill")
+    @patch("sentry.tasks.embeddings_grouping.backfill_seer_grouping_records_for_project.logger")
+    def test_backfill_seer_grouping_records_skip_project_already_processed(self, mock_logger):
+        """
+        Test that projects that have a backfill completed project option are skipped when passed
+        the skip_processed_projects flag.
+        """
+        self.project.update_option("sentry:similarity_backfill_completed", int(time.time()))
+        with TaskRunner():
+            backfill_seer_grouping_records_for_project(
+                self.project.id, None, skip_processed_projects=True
+            )
+
+        expected_call_args_list = [
+            call(
+                "backfill_seer_grouping_records",
+                extra={
+                    "current_project_id": self.project.id,
+                    "last_processed_group_id": None,
+                    "cohort": None,
+                    "last_processed_project_index": None,
+                    "only_delete": False,
+                },
+            ),
+            call(
+                "backfill_seer_grouping_records.project_skipped",
+                extra={
+                    "project_id": self.project.id,
+                    "project_already_processed": True,
+                    "project_manually_skipped": None,
+                },
+            ),
+            call("backfill finished, no cohort", extra={"project_id": self.project.id}),
+        ]
+        assert mock_logger.info.call_args_list == expected_call_args_list
+
+    @with_feature("projects:similarity-embeddings-backfill")
+    @patch("sentry.tasks.embeddings_grouping.backfill_seer_grouping_records_for_project.logger")
+    @patch("sentry.tasks.embeddings_grouping.utils.post_bulk_grouping_records")
+    def test_backfill_seer_grouping_records_reprocess_project_already_processed(
+        self, mock_post_bulk_grouping_records, mock_logger
+    ):
+        """
+        Test that projects that have a backfill completed project option are not skipped when not
+        passed the skip_processed_projects flag.
+        """
+        mock_post_bulk_grouping_records.return_value = {"success": True, "groups_with_neighbor": {}}
+        self.project.update_option("sentry:similarity_backfill_completed", int(time.time()))
+        with TaskRunner():
+            backfill_seer_grouping_records_for_project(self.project.id, None)
+
+        last_group_id = sorted(
+            [group.id for group in Group.objects.filter(project_id=self.project.id)]
+        )[0]
+        expected_call_args_list = [
+            call(
+                "backfill_seer_grouping_records",
+                extra={
+                    "current_project_id": self.project.id,
+                    "last_processed_group_id": None,
+                    "cohort": None,
+                    "last_processed_project_index": None,
+                    "only_delete": False,
+                },
+            ),
+            call("about to call next backfill", extra={"project_id": self.project.id}),
+            call(
+                "calling next backfill task",
+                extra={"project_id": self.project.id, "last_processed_group_id": last_group_id},
+            ),
+            call(
+                "backfill_seer_grouping_records",
+                extra={
+                    "current_project_id": self.project.id,
+                    "last_processed_group_id": last_group_id,
+                    "cohort": None,
+                    "last_processed_project_index": 0,
+                    "only_delete": False,
+                },
+            ),
+            call("backfill finished, no cohort", extra={"project_id": self.project.id}),
+        ]
+        assert mock_logger.info.call_args_list == expected_call_args_list
+
+    @with_feature("projects:similarity-embeddings-backfill")
+    @patch("sentry.tasks.embeddings_grouping.backfill_seer_grouping_records_for_project.logger")
+    def test_backfill_seer_grouping_records_manually_skip_project(self, mock_logger):
+        """
+        Test that project ids that are included in the skip_project_ids field are skipped.
+        """
+        with TaskRunner():
+            backfill_seer_grouping_records_for_project(
+                self.project.id, None, skip_project_ids=[self.project.id]
+            )
+
+        expected_call_args_list = [
+            call(
+                "backfill_seer_grouping_records",
+                extra={
+                    "current_project_id": self.project.id,
+                    "last_processed_group_id": None,
+                    "cohort": None,
+                    "last_processed_project_index": None,
+                    "only_delete": False,
+                },
+            ),
+            call(
+                "backfill_seer_grouping_records.project_skipped",
+                extra={
+                    "project_id": self.project.id,
+                    "project_already_processed": False,
+                    "project_manually_skipped": True,
+                },
+            ),
+            call("backfill finished, no cohort", extra={"project_id": self.project.id}),
+        ]
+        assert mock_logger.info.call_args_list == expected_call_args_list