Browse Source

ref(grouping): Small seer similar issues fixes (#70005)

This is a handful of small fixes pulled out of a few upcoming PRs, in order to make them easier to review.

Changes:

- Switch from using inheritance and `total=False` to using `NotRequired` for optional properties of `SimilarIssuesEmbeddingsRequest`, so everything can be in one class.
- Add exception types thrown by deserialization to the `try-except` block surrounding the parsing of seer response data, and add a log to track if/when things go wrong. Also simplify returning an empty result-set from the `except` block.
- Simplify a few places where we were using `update` to add a single key-value pair to a dictionary.
- Remove an unnecessary check for the existence of entries in the seer result-set. This should have been part of https://github.com/getsentry/sentry/pull/69993, but it got missed.
- Rename `response`/`responses` in  `get_formatted_results` to `similar_issue_data`/`similar_issues_data` in preparation for changing some seer data datatypes.
- Add a TODO to track the fact that we need to at some point let seer know when it's recommending groups which no longer exist.
- Fix the data in a `get_similar_issues_embeddings` test to match the `SimilarIssuesEmbeddingsData` type. (It was given in terms of message and stacktrace similarity rather than distance.)
- Add types to similar issues endpoint helpers.
- Prepend `get_value_if_exists` with an underscore since it's an internal helper.
Katie Byers 10 months ago
parent
commit
838b3fda5f

+ 24 - 15
src/sentry/api/endpoints/group_similar_issues_embeddings.py

@@ -26,11 +26,11 @@ logger = logging.getLogger(__name__)
 MAX_FRAME_COUNT = 50
 
 
-def get_value_if_exists(exception_value):
+def _get_value_if_exists(exception_value: dict[str, Any]) -> str:
     return exception_value["values"][0] if exception_value.get("values") else ""
 
 
-def get_stacktrace_string(data):
+def get_stacktrace_string(data: dict[str, Any]) -> str:
     """Format a stacktrace string from the grouping information."""
     if not (
         get_path(data, "app", "hash") and get_path(data, "app", "component", "values")
@@ -59,9 +59,9 @@ def get_stacktrace_string(data):
         exc_type, exc_value, frame_str = "", "", ""
         for exception_value in exception.get("values", []):
             if exception_value.get("id") == "type":
-                exc_type = get_value_if_exists(exception_value)
+                exc_type = _get_value_if_exists(exception_value)
             elif exception_value.get("id") == "value":
-                exc_value = get_value_if_exists(exception_value)
+                exc_value = _get_value_if_exists(exception_value)
             elif exception_value.get("id") == "stacktrace" and frame_count < MAX_FRAME_COUNT:
                 contributing_frames = [
                     frame
@@ -79,7 +79,7 @@ def get_stacktrace_string(data):
                     frame_dict = {"filename": "", "function": "", "context-line": ""}
                     for frame_values in frame.get("values", []):
                         if frame_values.get("id") in frame_dict:
-                            frame_dict[frame_values["id"]] = get_value_if_exists(frame_values)
+                            frame_dict[frame_values["id"]] = _get_value_if_exists(frame_values)
 
                     frame_str += f'  File "{frame_dict["filename"]}", function {frame_dict["function"]}\n    {frame_dict["context-line"]}\n'
 
@@ -107,20 +107,22 @@ class GroupSimilarIssuesEmbeddingsEndpoint(GroupEndpoint):
     }
 
     def get_formatted_results(
-        self, responses: Sequence[SimilarIssuesEmbeddingsData], user: User | AnonymousUser
+        self,
+        similar_issues_data: Sequence[SimilarIssuesEmbeddingsData],
+        user: User | AnonymousUser,
     ) -> Sequence[tuple[Mapping[str, Any], Mapping[str, Any]] | None]:
         """
         Format the responses using to be used by the frontend by changing the  field names and
         changing the cosine distances into cosine similarities.
         """
         group_data = {}
-        for response in responses:
+        for similar_issue_data in similar_issues_data:
             formatted_response: FormattedSimilarIssuesEmbeddingsData = {
-                "message": 1 - response["message_distance"],
-                "exception": 1 - response["stacktrace_distance"],
-                "shouldBeGrouped": "Yes" if response["should_group"] else "No",
+                "message": 1 - similar_issue_data["message_distance"],
+                "exception": 1 - similar_issue_data["stacktrace_distance"],
+                "shouldBeGrouped": "Yes" if similar_issue_data["should_group"] else "No",
             }
-            group_data.update({response["parent_group_id"]: formatted_response})
+            group_data[similar_issue_data["parent_group_id"]] = formatted_response
 
         serialized_groups = {
             int(g["id"]): g
@@ -134,7 +136,14 @@ class GroupSimilarIssuesEmbeddingsEndpoint(GroupEndpoint):
             try:
                 result.append((serialized_groups[group_id], group_data[group_id]))
             except KeyError:
-                # KeyErrors may occur if seer API returns a deleted/merged group
+                # KeyErrors may occur if seer API returns a deleted/merged group, which means it
+                # will be missing from `serialized_groups`
+                #
+                # TODO: This shouldn't be an issue for merged groups once we only use hashes (since
+                # merging leaves the hashes intact), but it will still be an error for deleted
+                # groups/hashes.
+                #
+                # TODO: Report back to seer that the hash has been deleted.
                 continue
         return result
 
@@ -159,9 +168,9 @@ class GroupSimilarIssuesEmbeddingsEndpoint(GroupEndpoint):
         }
         # Add optional parameters
         if request.GET.get("k"):
-            similar_issues_params.update({"k": int(request.GET["k"])})
+            similar_issues_params["k"] = int(request.GET["k"])
         if request.GET.get("threshold"):
-            similar_issues_params.update({"threshold": float(request.GET["threshold"])})
+            similar_issues_params["threshold"] = float(request.GET["threshold"])
 
         extra: dict[str, Any] = dict(similar_issues_params.copy())
         extra["group_message"] = extra.pop("message")
@@ -178,7 +187,7 @@ class GroupSimilarIssuesEmbeddingsEndpoint(GroupEndpoint):
                 [
                     result["stacktrace_distance"]
                     for result in (results.get("responses") or [])
-                    if result and result["stacktrace_distance"] <= 0.01
+                    if result["stacktrace_distance"] <= 0.01
                 ]
             ),
             user_id=request.user.id,

+ 21 - 10
src/sentry/seer/utils.py

@@ -1,4 +1,5 @@
-from typing import TypedDict
+import logging
+from typing import NotRequired, TypedDict
 
 import sentry_sdk
 from django.conf import settings
@@ -6,6 +7,9 @@ from urllib3 import Retry
 
 from sentry.net.http import connection_from_url
 from sentry.utils import json
+from sentry.utils.json import JSONDecodeError
+
+logger = logging.getLogger(__name__)
 
 
 class SeerException(Exception):
@@ -80,16 +84,13 @@ def detect_breakpoints(breakpoint_request) -> BreakpointResponse:
     return {"data": []}
 
 
-class SimilarIssuesEmbeddingsRequestNotRequired(TypedDict, total=False):
-    k: int
-    threshold: float
-
-
-class SimilarIssuesEmbeddingsRequest(SimilarIssuesEmbeddingsRequestNotRequired):
+class SimilarIssuesEmbeddingsRequest(TypedDict):
     group_id: int
     project_id: int
     stacktrace: str
     message: str
+    k: NotRequired[int]  # how many neighbors to find
+    threshold: NotRequired[float]
 
 
 class SimilarIssuesEmbeddingsData(TypedDict):
@@ -116,6 +117,16 @@ def get_similar_issues_embeddings(
 
     try:
         return json.loads(response.data.decode("utf-8"))
-    except AttributeError:
-        empty_response: SimilarIssuesEmbeddingsResponse = {"responses": []}
-        return empty_response
+    except (
+        AttributeError,  # caused by a response with no data and therefore no `.decode` method
+        UnicodeError,
+        JSONDecodeError,
+    ):
+        logger.exception(
+            "Failed to parse seer similar issues response",
+            extra={
+                "request_params": similar_issues_request,
+                "response_data": response.data,
+            },
+        )
+        return {"responses": []}

+ 1 - 1
tests/sentry/api/endpoints/test_group_similar_issues_embeddings.py

@@ -666,7 +666,7 @@ class GroupSimilarIssuesEmbeddingsTest(APITestCase):
         }
         group_similar_endpoint = GroupSimilarIssuesEmbeddingsEndpoint()
         formatted_results = group_similar_endpoint.get_formatted_results(
-            responses=[response_1, response_2], user=self.user
+            similar_issues_data=[response_1, response_2], user=self.user
         )
         assert formatted_results == self.get_expected_response(
             [self.similar_group.id, new_group.id], [0.95, 0.51], [0.99, 0.77], ["Yes", "No"]

+ 2 - 2
tests/sentry/seer/test_utils.py

@@ -58,10 +58,10 @@ def test_simple_similar_issues_embeddings(mock_seer_request):
     expected_return_value = {
         "responses": [
             {
-                "message_similarity": 0.95,
+                "message_distance": 0.05,
                 "parent_group_id": 6,
                 "should_group": True,
-                "stacktrace_similarity": 0.99,
+                "stacktrace_distance": 0.01,
             }
         ]
     }