Browse Source

fix(similarity): Filter same group out of response (#80195)

Filter out hashes from similar issues results that correspond to the
requesting group's hashes
Jodi Jang 4 months ago
parent
commit
cd40799988

+ 14 - 6
src/sentry/issues/endpoints/group_similar_issues_embeddings.py

@@ -14,6 +14,7 @@ from sentry.api.bases.group import GroupEndpoint
 from sentry.api.serializers import serialize
 from sentry.grouping.grouping_info import get_grouping_info
 from sentry.models.group import Group
+from sentry.models.grouphash import GroupHash
 from sentry.seer.similarity.similar_issues import get_similarity_data_from_seer
 from sentry.seer.similarity.types import SeerSimilarIssueData, SimilarIssuesEmbeddingsRequest
 from sentry.seer.similarity.utils import (
@@ -40,22 +41,29 @@ class GroupSimilarIssuesEmbeddingsEndpoint(GroupEndpoint):
         "GET": ApiPublishStatus.PRIVATE,
     }
 
+    def get_group_hashes_for_group_id(self, group_id: int) -> set[str]:
+        hashes = GroupHash.objects.filter(group_id=group_id)
+        return {hash.hash for hash in hashes}
+
     def get_formatted_results(
         self,
         similar_issues_data: Sequence[SeerSimilarIssueData],
         user: User | AnonymousUser,
+        group_id: int,
     ) -> Sequence[tuple[Mapping[str, Any], Mapping[str, Any]] | None]:
         """
         Format the responses to be used by the frontend by changing the field names and
         changing the cosine distances into cosine similarities.
         """
+        hashes = self.get_group_hashes_for_group_id(group_id)
         group_data = {}
         for similar_issue_data in similar_issues_data:
-            formatted_response: FormattedSimilarIssuesEmbeddingsData = {
-                "exception": round(1 - similar_issue_data.stacktrace_distance, 4),
-                "shouldBeGrouped": "Yes" if similar_issue_data.should_group else "No",
-            }
-            group_data[similar_issue_data.parent_group_id] = formatted_response
+            if similar_issue_data.parent_hash not in hashes:
+                formatted_response: FormattedSimilarIssuesEmbeddingsData = {
+                    "exception": round(1 - similar_issue_data.stacktrace_distance, 4),
+                    "shouldBeGrouped": "Yes" if similar_issue_data.should_group else "No",
+                }
+                group_data[similar_issue_data.parent_group_id] = formatted_response
 
         serialized_groups = {
             int(g["id"]): g
@@ -122,6 +130,6 @@ class GroupSimilarIssuesEmbeddingsEndpoint(GroupEndpoint):
 
         if not results:
             return Response([])
-        formatted_results = self.get_formatted_results(results, request.user)
+        formatted_results = self.get_formatted_results(results, request.user, group.id)
 
         return Response(formatted_results)

+ 30 - 1
tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py

@@ -200,7 +200,9 @@ class GroupSimilarIssuesEmbeddingsTest(APITestCase):
         )
         group_similar_endpoint = GroupSimilarIssuesEmbeddingsEndpoint()
         formatted_results = group_similar_endpoint.get_formatted_results(
-            similar_issues_data=[similar_issue_data_1, similar_issue_data_2], user=self.user
+            similar_issues_data=[similar_issue_data_1, similar_issue_data_2],
+            user=self.user,
+            group_id=self.group.id,
         )
         assert formatted_results == self.get_expected_response(
             [
@@ -344,6 +346,33 @@ class GroupSimilarIssuesEmbeddingsTest(APITestCase):
             user_id=self.user.id,
         )
 
+    @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")
+    def test_parent_hash_in_group_hashes(self, mock_seer_request):
+        """
+        Test that the request group's hashes are filtered out of the returned similar parent hashes
+        """
+        seer_return_value: Any = {
+            "responses": [
+                # Make the group's own hash the returned parent hash
+                {
+                    "parent_hash": self.event.get_primary_hash(),
+                    "should_group": True,
+                    "stacktrace_distance": 0.01,
+                },
+                {
+                    "parent_hash": self.similar_event.get_primary_hash(),
+                    "should_group": True,
+                    "stacktrace_distance": 0.01,
+                },
+            ]
+        }
+        mock_seer_request.return_value = HTTPResponse(orjson.dumps(seer_return_value), status=200)
+        response = self.client.get(self.path)
+
+        assert response.data == self.get_expected_response(
+            [NonNone(self.similar_event.group_id)], [0.99], ["Yes"]
+        )
+
     @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr")
     @mock.patch("sentry.seer.similarity.similar_issues.logger")
     @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen")