Browse Source

fix(ingest): Repair broken title data (#69398)

Last week, we ran into a bug wherein we were mistakenly classifying error events as default (message) events. Because the two types of events have slightly different schema, that meant that we were looking in the wrong place for title data, and as a result ended up with a whole mess of events all titled `<unlabled event>`. Because groups update their title with each new event that comes in, that meant we also had a bunch of groups called `<unlabeled event>`, with the result that the issue stream became an unusable sea of indistinguishable issues.

We quickly realized the problem and fixed the bug, but while that meant events started coming in with correct titles again, the groups weren't fully updating themselves. Why? Because it turns out that default events add title information to `group.data["metadata"]` as well as to `group.data` itself, where as error events only add it to `group.data`. This meant that as new events came in only half of the bad title data was getting overwritten by good title data, while the bad data in `group.data["metadata"]` was untouched. 

This fixes that by allowing `None` to overwrite existing title data, provided the existing title data is `<unlabeled event>` or one of its cousins (`<unknown>`, `<untitled>`, etc.) It also aims to prevent a similar thing from happening in the future, by not allowing `<unlabeled event>` (et al) to overwrite either a real title or a `None`. The same fix is applied to `group.message`, which comes in part from the title.

Note: To determine if a title is worthwhile, I reused the collection of titles we use when checking if an event's message is worth sending to seer or not. To make it a little clearer what they're about, I renamed the constant to `PLACEHOLDER_TITLES`, and to make things ever-so-slightly more efficient, I changed it from a list to a set.
Katie Byers 10 months ago
parent
commit
f928193dea

+ 61 - 12
src/sentry/event_manager.py

@@ -147,7 +147,7 @@ SECURITY_REPORT_INTERFACES = ("csp", "hpkp", "expectct", "expectstaple", "nel")
 # Timeout for cached group crash report counts
 CRASH_REPORT_TIMEOUT = 24 * 3600  # one day
 
-NON_TITLE_EVENT_TITLES = ["<untitled>", "<unknown>", "<unlabeled event>", "Error"]
+PLACEHOLDER_EVENT_TITLES = frozenset(["<untitled>", "<unknown>", "<unlabeled event>", "Error"])
 
 HIGH_SEVERITY_THRESHOLD = 0.1
 
@@ -2126,6 +2126,48 @@ def _handle_regression(group: Group, event: BaseEvent, release: Release | None)
     return is_regression
 
 
+def _is_placeholder_title(title):
+    return title in PLACEHOLDER_EVENT_TITLES
+
+
+def _is_real_title(title):
+    return bool(title) and title not in PLACEHOLDER_EVENT_TITLES
+
+
+def _get_updated_group_title(existing_container, incoming_container):
+    """
+    Given either `group.data` or `group.data["metadata"]`, in both existing and incoming forms, pick
+    the correct title to use when updating the group. Uses the incoming title (or `None` if there
+    isn't one) except in  the case where a placeholder title (`<unlabeled event>`, `<untitled>`,
+    etc) would be replacing a non-placeholder title (either `None` or a real title).
+
+    This stems from an incident during which we were interpreting error events as default-type
+    events and thereby overwriting good titles with placeholder ones and inserting placeholder
+    titles where there shouldn't have been a title at all. (The second case matters because
+    default-type and error-type events differ in where they include a `title` attribute, and we
+    count on the lack of a `title` attribute in certain cases as well as the presence of one.) This
+    prevents that from happening in the future and will delete errant placeholder titles by
+    overwriting them with `None`.
+    """
+
+    existing_title = existing_container.get("title")
+    incoming_title = incoming_container.get("title")
+
+    return (
+        incoming_title
+        if (
+            # Real titles beat both placeholder and non-existent titles
+            _is_real_title(incoming_title)
+            or
+            # Conversely, placeholder titles lose to both real titles and lack of a title (the
+            # latter in order to fix the regression caused by error events being interpreted as
+            # default-type events)
+            _is_placeholder_title(existing_title)
+        )
+        else existing_title
+    )
+
+
 def _process_existing_aggregate(
     group: Group,
     event: BaseEvent,
@@ -2141,6 +2183,7 @@ def _process_existing_aggregate(
     if (
         event.search_message
         and event.search_message != group.message
+        and not _is_placeholder_title(event.search_message)
         and event.get_event_type() != TransactionEvent.key
     ):
         updated_group_values["message"] = event.search_message
@@ -2157,19 +2200,25 @@ def _process_existing_aggregate(
 
     is_regression = _handle_regression(group, event, release)
 
-    # Merge new data with existing data
-    incoming_data = incoming_group_values["data"]
-    incoming_metadata = incoming_group_values["data"].get("metadata", {})
-
     existing_data = group.data
-    # Grab a reference to this before it gets clobbered when we update `existing_data`
     existing_metadata = group.data.get("metadata", {})
 
-    existing_data.update(incoming_data)
-    existing_metadata.update(incoming_metadata)
+    incoming_data = incoming_group_values["data"]
+    incoming_metadata = incoming_group_values["data"].get("metadata", {})
 
-    updated_group_values["data"] = existing_data
-    updated_group_values["data"]["metadata"] = existing_metadata
+    # Merge old and new data/metadata, keeping the existing title if the incoming title is a
+    # placeholder (`<unlabeled event`, `<untitled>`, etc.) and the existing one isn't. See
+    # `_get_updated_group_title` docstring.
+    updated_group_values["data"] = {
+        **existing_data,
+        **incoming_data,
+        "title": _get_updated_group_title(existing_data, incoming_data),
+    }
+    updated_group_values["data"]["metadata"] = {
+        **existing_metadata,
+        **incoming_metadata,
+        "title": _get_updated_group_title(existing_metadata, incoming_metadata),
+    }
 
     update_kwargs = {"times_seen": 1}
 
@@ -2364,11 +2413,11 @@ def _get_severity_score(event: Event) -> tuple[float, str]:
         title = event.title
 
     # If the event hasn't yet been given a helpful title, attempt to calculate one
-    if title in NON_TITLE_EVENT_TITLES:
+    if title in PLACEHOLDER_EVENT_TITLES:
         title = event_type.get_title(metadata)
 
     # If there's still nothing helpful to be had, bail
-    if title in NON_TITLE_EVENT_TITLES:
+    if title in PLACEHOLDER_EVENT_TITLES:
         logger_data.update(
             {"event_type": event_type.key, "event_title": event.title, "computed_title": title}
         )

+ 201 - 0
tests/sentry/event_manager/test_event_manager_grouping.py

@@ -5,10 +5,15 @@ from typing import Any
 from unittest import mock
 from unittest.mock import MagicMock
 
+import pytest
+
+from sentry.event_manager import _get_updated_group_title
+from sentry.eventtypes.base import DefaultEvent
 from sentry.grouping.result import CalculatedHashes
 from sentry.models.group import Group
 from sentry.testutils.cases import TestCase
 from sentry.testutils.helpers.eventprocessing import save_new_event
+from sentry.testutils.pytest.fixtures import django_db_all
 from sentry.testutils.skips import requires_snuba
 
 pytestmark = [requires_snuba]
@@ -113,6 +118,202 @@ class EventManagerGroupingTest(TestCase):
         assert group.message == event2.message
 
 
+class PlaceholderTitleTest(TestCase):
+    """
+    Tests for a bug where error events were interpreted as default-type events and therefore all
+    came out with a placeholder title.
+    """
+
+    def test_fixes_broken_title_data(self):
+        # An event before the bug was introduced
+        event1 = save_new_event(
+            {
+                "exception": {
+                    "values": [{"type": "DogsAreNeverAnError", "value": "Dogs are great!"}],
+                },
+                # Use a fingerprint to guarantee all events end up in the same group
+                "fingerprint": ["adopt don't shop"],
+            },
+            self.project,
+        )
+
+        group = Group.objects.get(id=event1.group_id)
+
+        assert group.title == event1.title == "DogsAreNeverAnError: Dogs are great!"
+        assert group.data["title"] == event1.data["title"] == "DogsAreNeverAnError: Dogs are great!"
+        assert group.data["metadata"].get("title") is event1.data["metadata"].get("title") is None
+        assert group.message == "Dogs are great! DogsAreNeverAnError"
+
+        # Simulate the bug
+        with mock.patch(
+            "sentry.event_manager.get_event_type",
+            return_value=DefaultEvent(),
+        ):
+            # Neutralize the data fixes by making them unable to recognize a bad title and by
+            # unconditionally using the incoming title
+            with (
+                mock.patch(
+                    "sentry.event_manager._is_placeholder_title",
+                    return_value=False,
+                ),
+                mock.patch(
+                    "sentry.event_manager._get_updated_group_title",
+                    new=lambda existing_container, incoming_container: incoming_container.get(
+                        "title"
+                    ),
+                ),
+            ):
+                event2 = save_new_event(
+                    {
+                        "exception": {
+                            "values": [{"type": "DogsAreNeverAnError", "value": "Maisey is silly"}],
+                        },
+                        "fingerprint": ["adopt don't shop"],
+                    },
+                    self.project,
+                )
+
+        assert event2.group_id == event1.group_id
+
+        # Pull the group again to get updated data
+        group = Group.objects.get(id=event2.group_id)
+
+        # As expected, without the fixes, the bug screws up both the event and group data. (Compare
+        # this to the next test, where the fixes are left in place, and the group remains untouched.)
+        assert group.title == event2.title == "<unlabeled event>"
+        assert group.data["title"] == event2.data["title"] == "<unlabeled event>"
+        assert (
+            group.data["metadata"].get("title")
+            == event2.data["metadata"].get("title")
+            == "<unlabeled event>"
+        )
+        assert group.message == "<unlabeled event>"
+
+        # Now that we have a group with bad data, return to the current world - where the bug has
+        # been fixed and the data fix is also in place - and we can see that the group's data
+        # returns to what it should be
+        event3 = save_new_event(
+            {
+                "exception": {
+                    "values": [{"type": "DogsAreNeverAnError", "value": "Charlie is goofy"}],
+                },
+                "fingerprint": ["adopt don't shop"],
+            },
+            self.project,
+        )
+
+        assert event3.group_id == event2.group_id == event1.group_id
+
+        # Pull the group again to get updated data
+        group = Group.objects.get(id=event3.group_id)
+
+        # Title data is updated with values from newest event, and is back to the structure it was
+        # before the bug
+        assert group.title == event3.title == "DogsAreNeverAnError: Charlie is goofy"
+        assert (
+            group.data["title"] == event3.data["title"] == "DogsAreNeverAnError: Charlie is goofy"
+        )
+        assert group.data["metadata"].get("title") is event3.data["metadata"].get("title") is None
+        assert group.message == "Charlie is goofy DogsAreNeverAnError"
+
+    # This is the same as the data-fixing test above, except that the fix is left in place when
+    # the bug happens, and so the bad titles never get saved on the group
+    def test_bug_regression_no_longer_breaks_titles(self):
+        # An event before the bug was introduced
+        event1 = save_new_event(
+            {
+                "exception": {
+                    "values": [{"type": "DogsAreNeverAnError", "value": "Dogs are great!"}],
+                },
+                # Use a fingerprint to guarantee all events end up in the same group
+                "fingerprint": ["adopt don't shop"],
+            },
+            self.project,
+        )
+
+        group = Group.objects.get(id=event1.group_id)
+
+        assert group.title == event1.title == "DogsAreNeverAnError: Dogs are great!"
+        assert group.data["title"] == event1.data["title"] == "DogsAreNeverAnError: Dogs are great!"
+        assert group.data["metadata"].get("title") is event1.data["metadata"].get("title") is None
+        assert group.message == "Dogs are great! DogsAreNeverAnError"
+
+        # Simulate the bug, but with the fix in place
+        with mock.patch(
+            "sentry.event_manager.get_event_type",
+            return_value=DefaultEvent(),
+        ):
+            event2 = save_new_event(
+                {
+                    "exception": {
+                        "values": [{"type": "DogsAreNeverAnError", "value": "Maisey is silly"}],
+                    },
+                    "fingerprint": ["adopt don't shop"],
+                },
+                self.project,
+            )
+
+        assert event2.group_id == event1.group_id
+
+        # Pull the group again to get updated data
+        group = Group.objects.get(id=event2.group_id)
+
+        # The event may be messed up, but it didn't mess up the group
+        assert event2.title == "<unlabeled event>"
+        assert group.title == "DogsAreNeverAnError: Dogs are great!"
+        assert event2.data["title"] == "<unlabeled event>"
+        assert group.data["title"] == "DogsAreNeverAnError: Dogs are great!"
+        assert group.data["metadata"].get("title") is None
+        assert event2.data["metadata"].get("title") == "<unlabeled event>"
+        assert group.message == "Dogs are great! DogsAreNeverAnError"
+
+        # An event after the bug was fixed
+        event3 = save_new_event(
+            {
+                "exception": {
+                    "values": [{"type": "DogsAreNeverAnError", "value": "Charlie is goofy"}],
+                },
+                "fingerprint": ["adopt don't shop"],
+            },
+            self.project,
+        )
+
+        assert event3.group_id == event2.group_id == event1.group_id
+
+        # Pull the group again to get updated data
+        group = Group.objects.get(id=event3.group_id)
+
+        # Title data is updated with values from newest event
+        assert group.title == event3.title == "DogsAreNeverAnError: Charlie is goofy"
+        assert (
+            group.data["title"] == event3.data["title"] == "DogsAreNeverAnError: Charlie is goofy"
+        )
+        assert group.data["metadata"].get("title") is event3.data["metadata"].get("title") is None
+        assert group.message == "Charlie is goofy DogsAreNeverAnError"
+
+
+@django_db_all
+@pytest.mark.parametrize(
+    ["existing_title", "incoming_title", "expected_title"],
+    [
+        ("Dogs are great!", "Adopt don't shop", "Adopt don't shop"),
+        ("Dogs are great!", "<untitled>", "Dogs are great!"),
+        ("Dogs are great!", None, "Dogs are great!"),
+        ("<unlabeled event>", "Adopt don't shop", "Adopt don't shop"),
+        ("<unlabeled event>", "<untitled>", "<untitled>"),
+        ("<unlabeled event>", None, None),
+        (None, "Adopt don't shop", "Adopt don't shop"),
+        (None, "<untitled>", None),
+        (None, None, None),
+    ],
+)
+def test_get_updated_group_title(existing_title, incoming_title, expected_title):
+    existing_data = {"title": existing_title} if existing_title is not None else {}
+    incoming_data = {"title": incoming_title} if incoming_title is not None else {}
+
+    assert _get_updated_group_title(existing_data, incoming_data) == expected_title
+
+
 class EventManagerGroupingMetricsTest(TestCase):
     @mock.patch("sentry.event_manager.metrics.incr")
     def test_records_avg_calculations_per_event_metrics(self, mock_metrics_incr: MagicMock):

+ 2 - 2
tests/sentry/event_manager/test_severity.py

@@ -11,7 +11,7 @@ from urllib3.exceptions import MaxRetryError
 
 from sentry import options
 from sentry.event_manager import (
-    NON_TITLE_EVENT_TITLES,
+    PLACEHOLDER_EVENT_TITLES,
     SEER_ERROR_COUNT_KEY,
     EventManager,
     _get_severity_score,
@@ -207,7 +207,7 @@ class TestGetEventSeverity(TestCase):
         mock_logger_warning: MagicMock,
         mock_urlopen: MagicMock,
     ) -> None:
-        for title in NON_TITLE_EVENT_TITLES:
+        for title in PLACEHOLDER_EVENT_TITLES:
             manager = EventManager(make_event(exception={"values": []}, platform="python"))
             event = manager.save(self.project.id)
             # `title` is a property with no setter, but it pulls from `metadata`, so it's equivalent