
ref(grouping): refactoring parametrization (#71078)

Pulling parametrization logic out of grouping.strategies.message and
making it more reusable:
- regex split into individual composable components;
- using a regex lookbehind instead of the add-and-strip `=` hack;
- refactored experiments.

---------

Co-authored-by: Armen Zambrano G. <44410+armenzg@users.noreply.github.com>
Bartek Ogryczak · 9 months ago · commit 805151bd9b
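
Below is a minimal sketch of the lookbehind change described above (hypothetical, simplified patterns, not the code from this commit):

    import re

    # Old approach: consume the `=` as part of the match, then re-add it
    # in the replacement so the key-value shape survives.
    old = re.compile(r"=(?P<bool>true|false)")
    assert old.sub("=<bool>", "flag=true") == "flag=<bool>"

    # New approach: a positive lookbehind asserts the `=` without consuming
    # it, so the replacement no longer needs to restore it.
    new = re.compile(r"(?<==)(?P<bool>true|false)")
    assert new.sub("<bool>", "flag=true") == "flag=<bool>"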

+ 1 - 0
pyproject.toml

@@ -549,6 +549,7 @@ module = [
     "sentry.buffer.*",
     "sentry.build.*",
     "sentry.eventstore.reprocessing.redis",
+    "sentry.grouping.parameterization",
     "sentry.hybridcloud",
     "sentry.hybridcloud.migrations.*",
     "sentry.hybridcloud.options",

+ 368 - 0
src/sentry/grouping/parameterization.py

@@ -0,0 +1,368 @@
+import dataclasses
+import re
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import lru_cache
+
+import tiktoken
+
+__all__ = [
+    "ParameterizationCallable",
+    "ParameterizationCallableExperiment",
+    "ParameterizationExperiment",
+    "ParameterizationRegex",
+    "ParameterizationRegexExperiment",
+    "Parameterizer",
+    "UniqueIdExperiment",
+]
+
+
+@dataclasses.dataclass
+class ParameterizationRegex:
+
+    name: str  # name of the pattern, also used as the group name in the combined regex
+    raw_pattern: str  # regex pattern without the named matching group
+    lookbehind: str | None = None  # positive lookbehind prefix if needed
+    lookahead: str | None = None  # positive lookahead postfix if needed
+    counter: int = 0
+
+    # These need to be used with `(?x)`, which tells the regex compiler to ignore comments
+    # and unescaped whitespace, so we can use newlines and indentation for better legibility.
+
+    @property
+    def pattern(self) -> str:
+        """
+        Returns the regex pattern wrapped in a named matching group, with the lookbehind/lookahead affixes if needed.
+        """
+        prefix = rf"(?<={self.lookbehind})" if self.lookbehind else ""
+        postfix = rf"(?={self.lookahead})" if self.lookahead else ""
+        return rf"{prefix}(?P<{self.name}>{self.raw_pattern}){postfix}"
+
+    @property
+    def compiled_pattern(self) -> re.Pattern[str]:
+        """
+        Returns the compiled form of `pattern`, compiled once and cached on first access.
+        """
+        if not hasattr(self, "_compiled_pattern"):
+            self._compiled_pattern = re.compile(rf"(?x){self.pattern}")
+        return self._compiled_pattern
+
+
+DEFAULT_PARAMETERIZATION_REGEXES = [
+    ParameterizationRegex(
+        name="email",
+        raw_pattern=r"""[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*""",
+    ),
+    ParameterizationRegex(name="url", raw_pattern=r"""\b(wss?|https?|ftp)://[^\s/$.?#].[^\s]*"""),
+    ParameterizationRegex(
+        name="hostname",
+        raw_pattern=r"""
+            # Top 100 TLDs. The complete list is 1000s long.
+            \b
+            ([a-zA-Z0-9\-]{1,63}\.)+?
+            (
+                (COM|NET|ORG|JP|DE|UK|FR|BR|IT|RU|ES|ME|GOV|PL|CA|AU|CN|CO|IN|NL|EDU|INFO|EU|CH|ID|AT|KR|CZ|MX|BE|TV|SE|TR|TW|AL|UA|IR|VN|CL|SK|LY|CC|TO|NO|FI|US|PT|DK|AR|HU|TK|GR|IL|NEWS|RO|MY|BIZ|IE|ZA|NZ|SG|EE|TH|IO|XYZ|PE|BG|HK|RS|LT|LINK|PH|CLUB|SI|SITE|MOBI|BY|CAT|WIKI|LA|GA|XXX|CF|HR|NG|JOBS|ONLINE|KZ|UG|GQ|AE|IS|LV|PRO|FM|TIPS|MS|SA|APP)|
+                (com|net|org|jp|de|uk|fr|br|it|ru|es|me|gov|pl|ca|au|cn|co|in|nl|edu|info|eu|ch|id|at|kr|cz|mx|be|tv|se|tr|tw|al|ua|ir|vn|cl|sk|ly|cc|to|no|fi|us|pt|dk|ar|hu|tk|gr|il|news|ro|my|biz|ie|za|nz|sg|ee|th|io|xyz|pe|bg|hk|rs|lt|link|ph|club|si|site|mobi|by|cat|wiki|la|ga|xxx|cf|hr|ng|jobs|online|kz|ug|gq|ae|is|lv|pro|fm|tips|ms|sa|app)
+            )
+            \b
+        """,
+    ),
+    ParameterizationRegex(
+        name="ip",
+        raw_pattern=r"""
+            (
+                ([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|
+                ([0-9a-fA-F]{1,4}:){1,7}:|
+                ([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|
+                ([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|
+                ([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|
+                ([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|
+                ([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|
+                [0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|
+                :((:[0-9a-fA-F]{1,4}){1,7}|:)|
+                fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|
+                ::(ffff(:0{1,4}){0,1}:){0,1}
+                ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
+                (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|
+                ([0-9a-fA-F]{1,4}:){1,4}:
+                ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
+                (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
+            ) |
+            (
+                \b((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
+                (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
+            )
+        """,
+    ),
+    ParameterizationRegex(
+        name="uuid",
+        raw_pattern=r"""\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b""",
+    ),
+    ParameterizationRegex(name="sha1", raw_pattern=r"""\b[0-9a-fA-F]{40}\b"""),
+    ParameterizationRegex(name="md5", raw_pattern=r"""\b[0-9a-fA-F]{32}\b"""),
+    ParameterizationRegex(
+        name="date",
+        raw_pattern=r"""
+            # No word boundaries required around dates. Should there be?
+            # RFC822, RFC1123, RFC1123Z
+            ((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2,4}\s\d{1,2}:\d{1,2}(:\d{1,2})?\s([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z])))
+            |
+            # Similar to RFC822, but "Mon Jan 02, 1999", "Jan 02, 1999"
+            (((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s[0-3]\d,\s\d{2,4})
+            |
+            # RFC850
+            ((?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday),\s\d{2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}\s\d{2}:\d{2}:\d{2}\s(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
+            |
+            # RFC3339, RFC3339Nano
+            (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z?([+-]?\d{2}:\d{2})?)
+            |
+            # Datetime:
+            (\d{4}-?[01]\d-?[0-3]\d\s[0-2]\d:[0-5]\d:[0-5]\d)(\.\d+)?
+            |
+            # Kitchen
+            (\d{1,2}:\d{2}(:\d{2})?(?: [aApP][Mm])?)
+            |
+            # Date
+            (\d{4}-[01]\d-[0-3]\d)
+            |
+            # Time
+            ([0-2]\d:[0-5]\d:[0-5]\d)
+            |
+            # Old Date Formats, TODO: possibly safe to remove?
+            (
+                (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|
+                (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|
+                (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))
+            ) |
+            (
+                \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+)?
+                (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
+                ([\d]{1,2})\s+
+                ([\d]{2}:[\d]{2}:[\d]{2})\s+
+                [\d]{4}
+            ) |
+            (
+                \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s+)?
+                (0[1-9]|[1-2]?[\d]|3[01])\s+
+                (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
+                (19[\d]{2}|[2-9][\d]{3})\s+
+                (2[0-3]|[0-1][\d]):([0-5][\d])
+                (?::(60|[0-5][\d]))?\s+
+                ([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
+            ) |
+            (datetime.datetime\(.*?\))
+        """,
+    ),
+    ParameterizationRegex(name="duration", raw_pattern=r"""\b(\d+ms) | (\d(\.\d+)?s)\b"""),
+    ParameterizationRegex(name="hex", raw_pattern=r"""\b0[xX][0-9a-fA-F]+\b"""),
+    ParameterizationRegex(name="float", raw_pattern=r"""-\d+\.\d+\b | \b\d+\.\d+\b"""),
+    ParameterizationRegex(name="int", raw_pattern=r"""-\d+\b | \b\d+\b"""),
+    ParameterizationRegex(
+        name="quoted_str",
+        raw_pattern=r"""# Using `=`lookbehind which guarantees we'll only match the value half of key-value pairs,
+            # rather than all quoted strings
+            '([^']+)' | "([^"]+)"
+        """,
+        lookbehind="=",
+    ),
+    ParameterizationRegex(
+        name="bool",
+        raw_pattern=r"""# Using `=`lookbehind which guarantees we'll only match the value half of key-value pairs,
+            # rather than all instances of the words 'true' and 'false'.
+            True |
+            true |
+            False |
+            false
+        """,
+        lookbehind="=",
+    ),
+]
+
+
+DEFAULT_PARAMETERIZATION_REGEXES_MAP = {r.name: r.pattern for r in DEFAULT_PARAMETERIZATION_REGEXES}
+
+
+@dataclasses.dataclass
+class ParameterizationCallable:
+    """
+    Represents a callable that can be used to modify a string, which can give
+    us more flexibility than just using regex.
+    """
+
+    name: str  # name of the pattern also used as group name in combined regex
+    apply: Callable[[str], tuple[str, int]]  # function that modifies the input string
+    counter: int = 0
+
+
+@dataclasses.dataclass
+class ParameterizationCallableExperiment(ParameterizationCallable):
+    def run(self, content: str, callback: Callable[[str, int], None]) -> str:
+        content, count = self.apply(content)
+        if count:
+            callback(self.name, count)
+        return content
+
+
+class ParameterizationRegexExperiment(ParameterizationRegex):
+    def run(
+        self,
+        content: str,
+        callback: Callable[[re.Match[str]], str],
+    ) -> str:
+        return self.compiled_pattern.sub(callback, content)
+
+
+class _UniqueId:
+    # just a namespace for the uniq_id logic, no need to instantiate
+
+    NAME = "uniq_id"
+
+    @staticmethod
+    @lru_cache(maxsize=1)
+    def tiktoken_encoding() -> tiktoken.Encoding:
+        return tiktoken.get_encoding("cl100k_base")
+
+    @staticmethod
+    def num_tokens_from_string(token_str: str) -> int:
+        """Returns the number of tokens in a text string."""
+        num_tokens = len(_UniqueId.tiktoken_encoding().encode(token_str))
+        return num_tokens
+
+    # These are all somewhat arbitrary based on examples.
+    TOKEN_LENGTH_MINIMUM = (
+        4  # Tokens smaller than this are unlikely to be unique ids regardless of other attributes
+    )
+    TOKEN_LENGTH_RATIO_DEFAULT = 0.5
+    TOKEN_LENGTH_LONG = 10
+    TOKEN_LENGTH_RATIO_LONG = 0.4
+
+    @staticmethod
+    def is_probably_uniq_id(token_str: str) -> bool:
+        token_str = token_str.strip("\"'[]{}():;")
+        if len(token_str) < _UniqueId.TOKEN_LENGTH_MINIMUM:
+            return False
+        if (
+            token_str[0] == "<" and token_str[-1] == ">"
+        ):  # Don't replace already-parameterized tokens
+            return False
+        token_length_ratio = _UniqueId.num_tokens_from_string(token_str) / len(token_str)
+        if (
+            len(token_str) > _UniqueId.TOKEN_LENGTH_LONG
+            and token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_LONG
+        ):
+            return True
+        return token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_DEFAULT
+
+    @staticmethod
+    def replace_uniq_ids_in_str(string: str) -> tuple[str, int]:
+        """
+        Return result and count of replacements
+        """
+        strings = string.split(" ")
+        count = 0
+        for i, s in enumerate(strings):
+            if _UniqueId.is_probably_uniq_id(s):
+                strings[i] = "<uniq_id>"
+                count += 1
+        return (" ".join(strings), count)
+
+
+UniqueIdExperiment = ParameterizationCallableExperiment(
+    name=_UniqueId.NAME, apply=_UniqueId.replace_uniq_ids_in_str
+)
+
+
+ParameterizationExperiment = ParameterizationCallableExperiment | ParameterizationRegexExperiment
+
+
+class Parameterizer:
+    def __init__(
+        self,
+        regex_pattern_keys: Sequence[str],
+        experiments: Sequence[ParameterizationExperiment] = (),
+    ):
+        self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys)
+        self._experiments = experiments
+
+        self.matches_counter: defaultdict[str, int] = defaultdict(int)
+
+    @staticmethod
+    def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]:
+        """
+        Takes a list of pattern keys and returns a compiled regex pattern that matches any of them.
+
+        @param pattern_keys: A list of keys into the DEFAULT_PARAMETERIZATION_REGEXES_MAP dict.
+        @returns: A compiled regex pattern that matches any of the given keys' patterns.
+        @raises: KeyError if a pattern key is not in DEFAULT_PARAMETERIZATION_REGEXES_MAP.
+
+        The `(?x)` tells the regex compiler to ignore comments and unescaped whitespace,
+        so we can use newlines and indentation for better legibility in patterns above.
+        """
+
+        return re.compile(
+            rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
+        )
+
+    def parametrize_w_regex(self, content: str) -> str:
+        """
+        Replace all matches of the combined regex in the content with placeholder strings.
+
+        @param content: The string to replace matches in.
+        @returns: The content with all matches replaced with placeholders.
+        """
+
+        def _handle_regex_match(match: re.Match[str]) -> str:
+            # Find the first (should be only) non-None match entry, and sub in the placeholder. For
+            # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
+            # replacement for the original value in the string.
+            for key, value in match.groupdict().items():
+                if value is not None:
+                    self.matches_counter[key] += 1
+                    return f"<{key}>"
+            return ""
+
+        return self._parameterization_regex.sub(_handle_regex_match, content)
+
+    def parametrize_w_experiments(
+        self, content: str, should_run: Callable[[str], bool] = lambda _: True
+    ) -> str:
+        """
+        Apply all experiments to the content.
+
+        @param content: The string to apply experiments to.
+        @param should_run: A predicate deciding, by experiment name, whether to run it.
+        @returns: The content with all experiments applied.
+        """
+
+        def _incr_counter(key: str, count: int) -> None:
+            self.matches_counter[key] += count
+
+        def _handle_regex_match(match: re.Match[str]) -> str:
+            # Find the first (should be only) non-None match entry, and sub in the placeholder. For
+            # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
+            # replacement for the original value in the string.
+            for key, value in match.groupdict().items():
+                if value is not None:
+                    self.matches_counter[key] += 1
+                    return f"<{key}>"
+            return ""
+
+        for experiment in self._experiments:
+            if not should_run(experiment.name):
+                continue
+            if isinstance(experiment, ParameterizationCallableExperiment):
+                content = experiment.run(content, _incr_counter)
+            else:
+                content = experiment.run(content, _handle_regex_match)
+
+        return content
+
+    def get_successful_experiments(self) -> Sequence[ParameterizationExperiment]:
+        return [e for e in self._experiments if self.matches_counter[e.name] > 0]
+
+    def parameterize_all(
+        self, content: str, should_run: Callable[[str], bool] = lambda _: True
+    ) -> str:
+        return self.parametrize_w_experiments(self.parametrize_w_regex(content), should_run)
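
For orientation, here is a quick usage sketch of the new module (mirroring the tests added below; the outputs in comments are illustrative):

    from sentry.grouping.parameterization import Parameterizer, UniqueIdExperiment

    parameterizer = Parameterizer(
        regex_pattern_keys=("email", "int"),
        experiments=(UniqueIdExperiment,),
    )
    normalized = parameterizer.parameterize_all("user test@email.com hit error 42")
    # normalized == "user <email> hit error <int>"
    # dict(parameterizer.matches_counter) == {"email": 1, "int": 1}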

+ 58 - 292
src/sentry/grouping/strategies/message.py

@@ -1,18 +1,11 @@
-import dataclasses
-import re
-from collections import defaultdict
-from collections.abc import Callable
-from functools import lru_cache
 from itertools import islice
-from re import Match
 from typing import Any
 
-import tiktoken
-
 from sentry import analytics
 from sentry.eventstore.models import Event
 from sentry.features.rollout import in_rollout_group
 from sentry.grouping.component import GroupingComponent
+from sentry.grouping.parameterization import Parameterizer, UniqueIdExperiment
 from sentry.grouping.strategies.base import (
     GroupingContext,
     ReturnedVariants,
@@ -22,232 +15,6 @@ from sentry.grouping.strategies.base import (
 from sentry.interfaces.message import Message
 from sentry.utils import metrics
 
-# The `(?x)` tells the regex compiler to ignore comments and unescaped whitespace,
-# so we can use newlines and indentation for better legibility.
-_parameterization_regex_str = r"""(?x)
-    (?P<email>
-        [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*
-    ) |
-    (?P<url>
-        \b(wss?|https?|ftp)://[^\s/$.?#].[^\s]*
-    ) |
-    (?P<hostname> # Top 100 TLDs. The complete list is 1000s long.
-        \b
-        ([a-zA-Z0-9\-]{1,63}\.)+?
-        (
-            (COM|NET|ORG|JP|DE|UK|FR|BR|IT|RU|ES|ME|GOV|PL|CA|AU|CN|CO|IN|NL|EDU|INFO|EU|CH|ID|AT|KR|CZ|MX|BE|TV|SE|TR|TW|AL|UA|IR|VN|CL|SK|LY|CC|TO|NO|FI|US|PT|DK|AR|HU|TK|GR|IL|NEWS|RO|MY|BIZ|IE|ZA|NZ|SG|EE|TH|IO|XYZ|PE|BG|HK|RS|LT|LINK|PH|CLUB|SI|SITE|MOBI|BY|CAT|WIKI|LA|GA|XXX|CF|HR|NG|JOBS|ONLINE|KZ|UG|GQ|AE|IS|LV|PRO|FM|TIPS|MS|SA|APP)|
-            (com|net|org|jp|de|uk|fr|br|it|ru|es|me|gov|pl|ca|au|cn|co|in|nl|edu|info|eu|ch|id|at|kr|cz|mx|be|tv|se|tr|tw|al|ua|ir|vn|cl|sk|ly|cc|to|no|fi|us|pt|dk|ar|hu|tk|gr|il|news|ro|my|biz|ie|za|nz|sg|ee|th|io|xyz|pe|bg|hk|rs|lt|link|ph|club|si|site|mobi|by|cat|wiki|la|ga|xxx|cf|hr|ng|jobs|online|kz|ug|gq|ae|is|lv|pro|fm|tips|ms|sa|app)
-        )
-        \b
-    ) |
-    (?P<ip>
-        (
-            ([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|
-            ([0-9a-fA-F]{1,4}:){1,7}:|
-            ([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|
-            ([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|
-            ([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|
-            ([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|
-            ([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|
-            [0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|
-            :((:[0-9a-fA-F]{1,4}){1,7}|:)|
-            fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|
-            ::(ffff(:0{1,4}){0,1}:){0,1}
-            ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
-            (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|
-            ([0-9a-fA-F]{1,4}:){1,4}:
-            ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
-            (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
-        ) |
-        (
-            \b((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
-            (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
-        )
-    ) |
-    (?P<uuid>
-        \b
-            [0-9a-fA-F]{8}-
-            [0-9a-fA-F]{4}-
-            [0-9a-fA-F]{4}-
-            [0-9a-fA-F]{4}-
-            [0-9a-fA-F]{12}
-        \b
-    ) |
-    (?P<sha1>
-        \b[0-9a-fA-F]{40}\b
-    ) |
-    (?P<md5>
-        \b[0-9a-fA-F]{32}\b
-    ) |
-    (?P<date>
-        # No word boundaries required around dates. Should there be?
-        # RFC822, RFC1123, RFC1123Z
-        ((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2,4}\s\d{1,2}:\d{1,2}(:\d{1,2})?\s([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z])))
-        |
-        # Similar to RFC822, but "Mon Jan 02, 1999", "Jan 02, 1999"
-        (((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s[0-3]\d,\s\d{2,4})
-        |
-        # RFC850
-        ((?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday),\s\d{2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}\s\d{2}:\d{2}:\d{2}\s(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
-        |
-        # RFC3339, RFC3339Nano
-        (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z?([+-]?\d{2}:\d{2})?)
-        |
-        # Datetime:
-        (\d{4}-?[01]\d-?[0-3]\d\s[0-2]\d:[0-5]\d:[0-5]\d)(\.\d+)?
-        |
-        # Kitchen
-        (\d{1,2}:\d{2}(:\d{2})?(?: [aApP][Mm])?)
-        |
-        # Date
-        (\d{4}-[01]\d-[0-3]\d)
-        |
-        # Time
-        ([0-2]\d:[0-5]\d:[0-5]\d)
-        |
-        # Old Date Formats, TODO: possibly safe to remove?
-        (
-            (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|
-            (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|
-            (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))
-        ) |
-        (
-            \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+)?
-            (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
-            ([\d]{1,2})\s+
-            ([\d]{2}:[\d]{2}:[\d]{2})\s+
-            [\d]{4}
-        ) |
-        (
-            \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s+)?
-            (0[1-9]|[1-2]?[\d]|3[01])\s+
-            (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
-            (19[\d]{2}|[2-9][\d]{3})\s+
-            (2[0-3]|[0-1][\d]):([0-5][\d])
-            (?::(60|[0-5][\d]))?\s+
-            ([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
-        ) |
-        (datetime.datetime\(.*?\))
-    ) |
-    (?P<duration>
-        \b
-        (\d+ms) |
-        (\d(\.\d+)?s)
-        \b
-    ) |
-    (?P<hex>
-        \b0[xX][0-9a-fA-F]+\b
-    ) |
-    (?P<float>
-        -\d+\.\d+\b |
-        \b\d+\.\d+\b
-    ) |
-    (?P<int>
-        -\d+\b |
-        \b\d+\b
-    ) |
-    (?P<quoted_str>
-        # The `=` here guarantees we'll only match the value half of key-value pairs,
-        # rather than all quoted strings
-        ='([^']+)' |
-        ="([^"]+)"
-    ) |
-    (?P<bool>
-        # The `=` here guarantees we'll only match the value half of key-value pairs,
-        # rather than all instances of the words 'true' and 'false'.
-        =True |
-        =true |
-        =False |
-        =false
-    )
-"""
-
-_parameterization_regex = re.compile(_parameterization_regex_str)
-
-
-# UniqID logic
-@lru_cache(maxsize=1)
-def tiktoken_encoding():
-    return tiktoken.get_encoding("cl100k_base")
-
-
-def num_tokens_from_string(token_str: str) -> int:
-    """Returns the number of tokens in a text string."""
-    num_tokens = len(tiktoken_encoding().encode(token_str))
-    return num_tokens
-
-
-# These are all somewhat arbitrary based on examples.
-UNIQ_ID_TOKEN_LENGTH_MINIMUM = (
-    4  # Tokens smaller than this are unlikely to be unique ids regardless of other attributes
-)
-UNIQ_ID_TOKEN_LENGTH_RATIO_DEFAULT = 0.5
-UNIQ_ID_TOKEN_LENGTH_LONG = 10
-UNIQ_ID_TOKEN_LENGTH_RATIO_LONG = 0.4
-
-
-def is_probably_uniq_id(token_str: str) -> bool:
-    token_str = token_str.strip("\"'[]{}():;")
-    if len(token_str) < UNIQ_ID_TOKEN_LENGTH_MINIMUM:
-        return False
-    if token_str[0] == "<" and token_str[-1] == ">":  # Don't replace already-parameterized tokens
-        return False
-    token_length_ratio = num_tokens_from_string(token_str) / len(token_str)
-    if (
-        len(token_str) > UNIQ_ID_TOKEN_LENGTH_LONG
-        and token_length_ratio > UNIQ_ID_TOKEN_LENGTH_RATIO_LONG
-    ):
-        return True
-    return token_length_ratio > UNIQ_ID_TOKEN_LENGTH_RATIO_DEFAULT
-
-
-def replace_uniq_ids_in_str(string: str) -> tuple[str, int]:
-    """
-    Return result and count of replacements
-    """
-    strings = string.split(" ")
-    count = 0
-    for i, s in enumerate(strings):
-        if is_probably_uniq_id(s):
-            strings[i] = "<uniq_id>"
-            count += 1
-    return (" ".join(strings), count)
-
-
-def parameterization_experiment_default_run(
-    self: "ParameterizationExperiment", _handle_regex_match: Callable[[Match[str]], str], input: str
-) -> tuple[str, int]:
-    return (self.regex.sub(_handle_regex_match, input), 0)
-
-
-def parameterization_experiment_uniq_id(
-    self: "ParameterizationExperiment", _: Callable[[Match[str]], str], input: str
-) -> tuple[str, int]:
-    return replace_uniq_ids_in_str(input)
-
-
-@dataclasses.dataclass()
-class ParameterizationExperiment:
-    name: str
-    regex: Any
-    """A function that takes as arguments:
-            * This experiment
-            * A handle match function (may not be used), e.g. _handle_regex_match (note that this modifies trimmed_value_counter)
-            * A string input
-        And returns: a tuple of [output string, count of replacements(which overlaps with any added by _handle_regex_match, if used)]
-    """
-    run: Callable[
-        ["ParameterizationExperiment", Callable[[Match[str]], str], str], tuple[str, int]
-    ] = parameterization_experiment_default_run
-    counter: int = 0
-
-
-# Note that experiments are run AFTER the initial replacements. Which means they MUST not catch replacements made
-# in the primary parameterization regex.
-_parameterization_regex_experiments = [
-    ParameterizationExperiment(name="uniq_id", regex=None, run=parameterization_experiment_uniq_id),
-]
-
 
 @metrics.wraps("grouping.normalize_message_for_grouping")
 def normalize_message_for_grouping(message: str, event: Event, share_analytics: bool = True) -> str:
@@ -264,68 +31,67 @@ def normalize_message_for_grouping(message: str, event: Event, share_analytics:
     if trimmed != message:
         trimmed += "..."
 
-    trimmed_value_counter: defaultdict[str, int] = defaultdict(int)
-
-    def _handle_regex_match(match: Match[str]) -> str:
-        # Find the first (should be only) non-None match entry, and sub in the placeholder. For
-        # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
-        # replacement for the original value in the string.
-        for key, value in match.groupdict().items():
-            if value is not None:
-                trimmed_value_counter[key] += 1
-                # For `quoted_str` and `bool` we want to preserve the `=` symbol, which we include in
-                # the match in order not to replace random quoted strings and the words 'true' and 'false'
-                # in contexts other than key-value pairs
-                if key in ["quoted_str", "bool"]:
-                    return f"=<{key}>"
-                else:
-                    return f"<{key}>"
-        return ""
+    parameterizer = Parameterizer(
+        regex_pattern_keys=(
+            "email",
+            "url",
+            "hostname",
+            "ip",
+            "uuid",
+            "sha1",
+            "md5",
+            "date",
+            "duration",
+            "hex",
+            "float",
+            "int",
+            "quoted_str",
+            "bool",
+        ),
+        experiments=(UniqueIdExperiment,),
+    )
 
-    normalized = _parameterization_regex.sub(_handle_regex_match, trimmed)
-    for experiment in _parameterization_regex_experiments:
-        if event.project_id and (
-            in_rollout_group(
-                f"grouping.experiments.parameterization.{experiment.name}", event.project_id
+    def _should_run_experiment(experiment_name: str) -> bool:
+        return bool(
+            event.project_id
+            and (
+                in_rollout_group(
+                    f"grouping.experiments.parameterization.{experiment_name}", event.project_id
+                )
+                or event.project_id
+                in [  # Active internal Sentry projects
+                    155735,
+                    4503972821204992,
+                    1267915,
+                    221969,
+                    11276,
+                    1269704,
+                    4505469596663808,
+                    1,
+                    54785,
+                    1492057,
+                    162676,
+                    6690737,
+                    300688,
+                    4506400311934976,
+                    6424467,
+                ]
             )
-            or event.project_id
-            in [  # Active internal Sentry projects
-                155735,
-                4503972821204992,
-                1267915,
-                221969,
-                11276,
-                1269704,
-                4505469596663808,
-                1,
-                54785,
-                1492057,
-                162676,
-                6690737,
-                300688,
-                4506400311934976,
-                6424467,
-            ]
-        ):
-            experiment_output, metric_inc = experiment.run(
-                experiment, _handle_regex_match, normalized
+        )
+
+    normalized = parameterizer.parameterize_all(trimmed, _should_run_experiment)
+
+    for experiment in parameterizer.get_successful_experiments():
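+        # Register 100 (arbitrary, bounded number) analytics events per experiment per
+        # instance restart. This generates samples for review consistently but creates
+        # a hard cap on analytics event volume.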
+        if share_analytics and experiment.counter < 100:
+            experiment.counter += 1
+            analytics.record(
+                "grouping.experiments.parameterization",
+                experiment_name=experiment.name,
+                project_id=event.project_id,
+                event_id=event.event_id,
             )
-            if experiment_output != normalized:
-                trimmed_value_counter[experiment.name] += metric_inc
-                # Register 100 (arbitrary, bounded number) analytics events per experiment per instance restart
-                # This generates samples for review consistently but creates a hard cap on
-                # analytics event volume
-                if share_analytics and experiment.counter < 100:
-                    experiment.counter += 1
-                    analytics.record(
-                        "grouping.experiments.parameterization",
-                        experiment_name=experiment.name,
-                        project_id=event.project_id,
-                        event_id=event.event_id,
-                    )
-                normalized = experiment_output
 
-    for key, value in trimmed_value_counter.items():
+    for key, value in parameterizer.matches_counter.items():
         # `key` can only be one of the keys from `_parameterization_regex`, thus, not a large
         # cardinality. Tracking the key helps distinguish what kinds of replacements are happening.
         metrics.incr("grouping.value_trimmed_from_message", amount=value, tags={"key": key})

+ 290 - 0
tests/sentry/grouping/test_parameterization.py

@@ -0,0 +1,290 @@
+from unittest import mock
+
+import pytest
+
+from sentry.grouping.parameterization import (
+    ParameterizationRegexExperiment,
+    Parameterizer,
+    UniqueIdExperiment,
+)
+
+
+@pytest.fixture
+def parameterizer():
+    return Parameterizer(
+        regex_pattern_keys=(
+            "email",
+            "url",
+            "hostname",
+            "ip",
+            "uuid",
+            "sha1",
+            "md5",
+            "date",
+            "duration",
+            "hex",
+            "float",
+            "int",
+            "quoted_str",
+            "bool",
+        ),
+        experiments=(UniqueIdExperiment,),
+    )
+
+
+@pytest.mark.parametrize(
+    ("name", "input", "expected"),
+    [
+        ("email", """blah test@email.com had a problem""", """blah <email> had a problem"""),
+        ("url", """blah http://some.email.com had a problem""", """blah <url> had a problem"""),
+        (
+            "url - existing behavior",
+            """blah tcp://user:pass@email.com:10 had a problem""",
+            """blah tcp://user:<email>:<int> had a problem""",
+        ),
+        ("ip", """blah 0.0.0.0 had a problem""", """blah <ip> had a problem"""),
+        (
+            "UUID",
+            """blah 7c1811ed-e98f-4c9c-a9f9-58c757ff494f had a problem""",
+            """blah <uuid> had a problem""",
+        ),
+        (
+            "UUID",
+            """blah bea691f2-2e25-4bec-6838-e0c44b03d60a/7c1811ed-e98f-4c9c-a9f9-58c757ff494f had a problem""",
+            """blah <uuid>/<uuid> had a problem""",
+        ),
+        (
+            "SHA1",
+            """blah 5fc35719b9cf96ec602dbc748ff31c587a46961d had a problem""",
+            """blah <sha1> had a problem""",
+        ),
+        (
+            "MD5",
+            """blah 0751007cd28df267e8e051b51f918c60 had a problem""",
+            """blah <md5> had a problem""",
+        ),
+        (
+            "Date",
+            """blah 2024-02-20T22:16:36 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Date RFC822",
+            """blah Mon, 02 Jan 06 15:04 MST had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Date RFC822Z",
+            """blah Mon, 02 Jan 06 15:04 -0700 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Date RFC850",
+            """blah Monday, 02-Jan-06 15:04:05 MST had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Date RFC1123",
+            """blah Mon, 02 Jan 2006 15:04:05 MST had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Date RFC1123Z",
+            """blah Mon, 02 Jan 2006 15:04:05 -0700 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Date RFC3339",
+            """blah 2006-01-02T15:04:05Z07:00 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Date RFC3339Nano",
+            """blah 2006-01-02T15:04:05.999999999Z07:00 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        ("Date - plain", """blah 2006-01-02 had a problem""", """blah <date> had a problem"""),
+        ("Date - long", """blah Jan 18, 2019 had a problem""", """blah <date> had a problem"""),
+        (
+            "Date - Datetime",
+            """blah 2006-01-02 15:04:05 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        ("Date - Kitchen", """blah 3:04PM had a problem""", """blah <date> had a problem"""),
+        ("Date - Time", """blah 15:04:05 had a problem""", """blah <date> had a problem"""),
+        (
+            "Date - basic",
+            """blah Mon Jan 02, 1999 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Datetime - compressed",
+            """blah 20240220 11:55:33.546593 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        (
+            "Datetime - datestamp",
+            """blah 2024-02-23 02:13:53.418 had a problem""",
+            """blah <date> had a problem""",
+        ),
+        ("hex", """blah 0x9af8c3b had a problem""", """blah <hex> had a problem"""),
+        ("float", """blah 0.23 had a problem""", """blah <float> had a problem"""),
+        ("int", """blah 23 had a problem""", """blah <int> had a problem"""),
+        ("quoted str", """blah b="1" had a problem""", """blah b=<quoted_str> had a problem"""),
+        ("bool", """blah a=true had a problem""", """blah a=<bool> had a problem"""),
+        (
+            "Duration - ms",
+            """blah connection failed after 12345ms 1.899s 3s""",
+            """blah connection failed after <duration> <duration> <duration>""",
+        ),
+        (
+            "Hostname - 2 levels",
+            """Blocked 'connect' from 'gggggggdasdwefwewqqqfefwef.com'""",
+            """Blocked 'connect' from '<hostname>'""",
+        ),
+        (
+            "Hostname - 3 levels",
+            """Blocked 'font' from 'www.time.co'""",
+            """Blocked 'font' from '<hostname>'""",
+        ),
+        (
+            "Nothing to replace",
+            """A quick brown fox jumped over the lazy dog""",
+            """A quick brown fox jumped over the lazy dog""",
+        ),
+    ],
+)
+def test_parameterize_standard(name, input, expected, parameterizer):
+    assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
+
+
+@pytest.mark.parametrize(
+    ("name", "input", "expected"),
+    [
+        (
+            "Uniq ID - sql savepoint",
+            '''SQL: RELEASE SAVEPOINT "s140177518376768_x2"''',
+            """SQL: RELEASE SAVEPOINT <uniq_id>""",
+        ),
+        (
+            "Uniq ID - api gateway",
+            """API gateway VdLchF7iDo8sVkg= blah""",
+            """API gateway <uniq_id> blah""",
+        ),
+        (
+            "Uniq ID - fb trace",
+            """fbtrace_id Aba64NMEPMmBwi_cPLaGeeK AugPfq0jxGbto4u3kxn8u6p blah""",
+            """fbtrace_id <uniq_id> <uniq_id> blah""",
+        ),
+        (
+            "Uniq ID - word with numerical pre/suffix",
+            """1password python3 abc123 123abc""",
+            """1password python3 abc123 123abc""",
+        ),
+        (
+            "Uniq ID - cloudflare trace",
+            """cloudflare trace 230b030023ae2822-SJC 819cc532aex26akb-SNP blah""",
+            """cloudflare trace <uniq_id> <uniq_id> blah""",
+        ),
+        (
+            "Uniq ID - JWT",
+            """blah eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c""",
+            """blah <uniq_id>""",
+        ),
+        (
+            "Uniq ID - Nothing to replace",
+            """I am the test words 1password python3 abc123 123abc""",
+            """I am the test words 1password python3 abc123 123abc""",
+        ),
+        (
+            "Uniq ID - react element",
+            """Permission denied to access property "__reactFiber$b6c78e70asw" """,
+            """Permission denied to access property <uniq_id> """,
+        ),
+        (
+            "Uniq ID - no change variable name",
+            """TypeError: Cannot read property 'startRTM' of undefined""",
+            """TypeError: Cannot read property 'startRTM' of undefined""",
+        ),
+        (
+            "Uniq ID - json ignored properly",
+            """[401,""]""",
+            """[<int>,""]""",
+        ),
+        (
+            "Uniq ID - no change",
+            """Blocked 'script' from 'wasm-eval:'""",
+            """Blocked 'script' from 'wasm-eval:'""",
+        ),
+    ],
+)
+def test_parameterize_experiment(name, input, expected, parameterizer):
+    assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
+    if "<uniq_id>" in expected:
+        experiments = parameterizer.get_successful_experiments()
+        assert len(experiments) == 1
+        assert experiments[0] == UniqueIdExperiment
+
+
+def test_parameterize_regex_experiment():
+    """
+    We don't have any of these yet, but we need to test that they work
+    """
+    FooExperiment = ParameterizationRegexExperiment(name="foo", raw_pattern=r"f[oO]{2}")
+
+    parameterizer = Parameterizer(
+        regex_pattern_keys=(),
+        experiments=(FooExperiment,),
+    )
+    input = "blah foobarbaz fooooo"
+    normalized = parameterizer.parameterize_all(input)
+    assert normalized == "blah <foo>barbaz <foo>ooo"
+    assert len(parameterizer.get_successful_experiments()) == 1
+    assert parameterizer.get_successful_experiments()[0] == FooExperiment
+
+
+def test_parameterize_regex_experiment_cached_compiled():
+
+    with mock.patch.object(
+        ParameterizationRegexExperiment,
+        "pattern",
+        new_callable=mock.PropertyMock,
+        return_value=r"(?P<foo>f[oO]{2})",
+    ) as mocked_pattern:
+        FooExperiment = ParameterizationRegexExperiment(name="foo", raw_pattern=r"f[oO]{2}")
+        parameterizer = Parameterizer(
+            regex_pattern_keys=(),
+            experiments=(FooExperiment,),
+        )
+        input = "blah foobarbaz fooooo"
+        _ = parameterizer.parameterize_all(input)
+        _ = parameterizer.parameterize_all(input)
+
+    mocked_pattern.assert_called_once()
+
+
+# These are test cases that we should fix
+@pytest.mark.xfail()
+@pytest.mark.parametrize(
+    ("name", "input", "expected"),
+    [
+        (
+            "URL - non-http protocol user/pass/port",
+            """blah tcp://user:pass@email.com:10 had a problem""",
+            """blah <url> had a problem""",
+        ),
+        ("URL - IP w/ port", """blah 0.0.0.0:10 had a problem""", """blah <ip> had a problem"""),
+        (
+            "Int - parens",
+            """Tb.Worker {"msg" => "(#239323) Received ...""",
+            """Tb.Worker {"msg" => "(#<int>) Received ...""",
+        ),
+        (
+            "Uniq ID - Snuba query",
+            """Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776026)) AS `_snuba_tags_raw[9223372036854776026]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), 9223372036854775936)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[9223372036854776026]`, 'tolerable') AND equals(_snuba_metric_id, 9223372036854775936)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, 9223372036854775936))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), 1383997) AND in((project_id AS _snuba_project_id), [6726638]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('2024-03-18T23:22:00', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776069)) AS `_snuba_tags_raw[9223372036854776069]`), '2d896d92') AND in(_s...}""",
+            """Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), <int>)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[<int>]`, 'tolerable') AND equals(_snuba_metric_id, <int>)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, <int>))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), <int>) AND in((project_id AS _snuba_project_id), [<int>]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('<date>', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), '<uniq_id>') AND in(_s...}""",
+        ),
+    ],
+)
+def test_fail_parameterize(name, input, expected, parameterizer):
+    assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
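
As a final hypothetical sketch, the experiment mechanism composes with the new lookbehind support; the `id_val` name and pattern here are made up for illustration:

    from sentry.grouping.parameterization import ParameterizationRegexExperiment, Parameterizer

    IdExperiment = ParameterizationRegexExperiment(name="id_val", raw_pattern=r"\w+", lookbehind="id=")
    p = Parameterizer(regex_pattern_keys=(), experiments=(IdExperiment,))
    # The lookbehind asserts "id=" without consuming it, so the prefix survives.
    assert p.parameterize_all("lookup failed for id=abc123") == "lookup failed for id=<id_val>"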