Browse Source

ref(grouping): Clean up message normalization (#53479)

This is a refactor intended to make the code handling exception messages in the grouping algorithm a little easier to work with, by adding/clarifying comments and changing names to better reflect the purposes of the things they name. The only behavior change is that the hint for message parameterization (shown in the grouping info section of the issue details page) has been changed from `stripped common values` to `stripped event-specific values`, since the whole point of parameterization is to remove values which different events _don't_ have in common. (All of the snapshot changes are related to this switch.)

The rest of the changes are just renamings, the main theme being to change our language from talking about "trimming" to talking about "normalization," since we both shorten and parameterize the event's message before using it for grouping.
Katie Byers 1 year ago
parent
commit
764039b04f

+ 7 - 7
src/sentry/grouping/strategies/configurations.py

@@ -33,9 +33,9 @@ BASE_STRATEGY = create_strategy_configuration(
         # strategy to disable itself.  Recursion is detected by the outer
         # strategy.
         "is_recursion": False,
-        # This turns on the automatic message trimming by the message
-        # strategy.
-        "trim_message": False,
+        # This turns on the automatic message trimming and parameter substitution
+        # by the message strategy.
+        "normalize_message": False,
         # newstyle: enables the legacy function logic.  This is only used
         # by the newstyle:2019-04-05 strategy.  Once this is no longer used
         # this can go away entirely.
@@ -66,7 +66,7 @@ BASE_STRATEGY = create_strategy_configuration(
         "use_package_fallback": False,
         # Remove platform differences in native frames
         "native_fuzzing": False,
-        # Ignore exception types for native if they are platform specific error
+        # Ignore exception types for native if they are platform-specific error
         # codes. Normally SDKs are supposed to disable error-type grouping with
         # the `synthetic` flag in the event, but a lot of error types we can
         # also detect on the backend.
@@ -105,7 +105,7 @@ register_strategy_config(
         * Some known weaknesses with regards to grouping of native frames
     """,
     initial_context={
-        "trim_message": False,
+        "normalize_message": False,
     },
     enhancements_base="legacy:2019-03-12",
 )
@@ -135,7 +135,7 @@ register_strategy_config(
         "javascript_fuzzing": True,
         "contextline_platforms": ("javascript", "node", "python", "php", "ruby"),
         "with_context_line_file_origin_bug": True,
-        "trim_message": True,
+        "normalize_message": True,
         "with_exception_value_fallback": True,
     },
     enhancements_base="common:2019-03-23",
@@ -230,7 +230,7 @@ register_strategy_config(
     hidden=True,
     initial_context={
         "legacy_function_logic": False,
-        "trim_message": True,
+        "normalize_message": True,
         "with_exception_value_fallback": True,
     },
 )

+ 31 - 17
src/sentry/grouping/strategies/message.py

@@ -13,7 +13,9 @@ from sentry.grouping.strategies.base import (
 from sentry.interfaces.message import Message
 from sentry.utils import metrics
 
-_irrelevant_re = re.compile(
+_parameterization_regex = re.compile(
+    # The `(?x)` tells the regex compiler to ignore comments and unescaped whitespace,
+    # so we can use newlines and indentation for better legibility.
     r"""(?x)
     (?P<email>
         [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*
@@ -96,32 +98,44 @@ _irrelevant_re = re.compile(
         \b\d+\b
     ) |
     (?P<quoted_str>
+        # The `=` here guarantees we'll only match the value half of key-value pairs,
+        # rather than all quoted strings
         ='([\w\s]+)'
     )
 """
 )
 
 
-def trim_message_for_grouping(string: str) -> str:
-    """Replace values from a group's message to hide P.I.I. and improve grouping when no
-    stacktrace available.
+def normalize_message_for_grouping(message: str) -> str:
+    """Replace values from a group's message with placeholders (to hide P.I.I. and
+    improve grouping when no stacktrace is available) and trim to at most 2 lines.
     """
-    s = "\n".join(islice((x for x in string.splitlines() if x.strip()), 2)).strip()
-    if s != string:
-        s += "..."
+    trimmed = "\n".join(
+        # If there are multiple lines, grab the first two non-empty ones.
+        islice(
+            (x for x in message.splitlines() if x.strip()),
+            2,
+        )
+    )
+    if trimmed != message:
+        trimmed += "..."
 
     def _handle_match(match: Match[str]) -> str:
-        # e.g. hex, 0x40000015
+        # Find the first (should be only) non-None match entry, and sub in the placeholder. For
+        # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
+        # replacement for the original value in the string.
         for key, value in match.groupdict().items():
             if value is not None:
-                # key can be one of the keys from _irrelevant_re, thus, not a large cardinality
-                # tracking the key helps distinguish what kinds of replacements are happening
+                # `key` can only be one of the keys from `_parameterization_regex`, thus, not a large
+                # cardinality. Tracking the key helps distinguish what kinds of replacements are happening.
                 metrics.incr("grouping.value_trimmed_from_message", tags={"key": key})
-                # For quoted_str we want to preserver the = symbol
+                # For `quoted_str` we want to preserve the `=` symbol, which we include in
+                # the match in order not to replace random quoted strings in contexts other
+                # than key-value pairs
                 return f"=<{key}>" if key == "quoted_str" else f"<{key}>"
         return ""
 
-    return _irrelevant_re.sub(_handle_match, s)
+    return _parameterization_regex.sub(_handle_match, trimmed)
 
 
 @strategy(ids=["message:v1"], interface=Message, score=0)
@@ -129,14 +143,14 @@ def trim_message_for_grouping(string: str) -> str:
 def message_v1(
     interface: Message, event: Event, context: GroupingContext, **meta: Any
 ) -> ReturnedVariants:
-    if context["trim_message"]:
-        message_in = interface.message or interface.formatted or ""
-        message_trimmed = trim_message_for_grouping(message_in)
-        hint = "stripped common values" if message_in != message_trimmed else None
+    if context["normalize_message"]:
+        raw = interface.message or interface.formatted or ""
+        normalized = normalize_message_for_grouping(raw)
+        hint = "stripped event-specific values" if raw != normalized else None
         return {
             context["variant"]: GroupingComponent(
                 id="message",
-                values=[message_trimmed],
+                values=[normalized],
                 hint=hint,
             )
         }

+ 7 - 7
src/sentry/grouping/strategies/newstyle.py

@@ -15,7 +15,7 @@ from sentry.grouping.strategies.base import (
     strategy,
 )
 from sentry.grouping.strategies.hierarchical import get_stacktrace_hierarchy
-from sentry.grouping.strategies.message import trim_message_for_grouping
+from sentry.grouping.strategies.message import normalize_message_for_grouping
 from sentry.grouping.strategies.utils import has_url_origin, remove_non_stacktrace_variants
 from sentry.grouping.utils import hash_from_values
 from sentry.interfaces.exception import Exception as ChainedException
@@ -642,12 +642,12 @@ def single_exception(
                 id="value",
             )
 
-            value_in = interface.value
-            if value_in is not None:
-                value_trimmed = trim_message_for_grouping(value_in)
-                hint = "stripped common values" if value_in != value_trimmed else None
-                if value_trimmed:
-                    value_component.update(values=[value_trimmed], hint=hint)
+            raw = interface.value
+            if raw is not None:
+                normalized = normalize_message_for_grouping(raw)
+                hint = "stripped event-specific values" if raw != normalized else None
+                if normalized:
+                    value_component.update(values=[normalized], hint=hint)
 
             if stacktrace_component.contributes and value_component.contributes:
                 value_component.update(

+ 1 - 1
tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/exception_groups_bad_duplicate_id.pysnap

@@ -16,7 +16,7 @@ app:
         exception*
           type*
             "MyApp.Exception"
-          value* (stripped common values)
+          value* (stripped event-specific values)
             "Test <int>"
         exception*
           type*

+ 1 - 1
tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/exception_groups_bad_missing_parent.pysnap

@@ -10,5 +10,5 @@ app:
       exception*
         type*
           "MyApp.Exception"
-        value* (stripped common values)
+        value* (stripped event-specific values)
           "Test <int>"

+ 1 - 1
tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/exception_groups_bad_out_of_sequence.pysnap

@@ -11,7 +11,7 @@ app:
         exception*
           type*
             "MyApp.Exception"
-          value* (stripped common values)
+          value* (stripped event-specific values)
             "Test <int>"
         exception*
           type*

+ 1 - 1
tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/exception_groups_one_exception.pysnap

@@ -10,5 +10,5 @@ app:
       exception*
         type*
           "MyApp.Exception"
-        value* (stripped common values)
+        value* (stripped event-specific values)
           "Test <int>"

+ 1 - 1
tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/exception_groups_one_type_under_nested_groups.pysnap

@@ -10,5 +10,5 @@ app:
       exception*
         type*
           "MyApp.Exception"
-        value* (stripped common values)
+        value* (stripped event-specific values)
           "Test <int>"

+ 1 - 1
tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/exception_groups_one_type_with_different_values.pysnap

@@ -11,7 +11,7 @@ app:
         exception*
           type*
             "MyApp.Exception"
-          value* (stripped common values)
+          value* (stripped event-specific values)
             "Test <int>"
         exception*
           type*

+ 1 - 1
tests/sentry/grouping/snapshots/test_variants/test_event_hash_variant/mobile@2021_02_12/exception_groups_one_type_with_similar_values.pysnap

@@ -10,5 +10,5 @@ app:
       exception*
         type*
           "MyApp.Exception"
-        value* (stripped common values)
+        value* (stripped event-specific values)
           "Test <int>"

Some files were not shown because too many files changed in this diff