9 months ago · 805151bd9b
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -549,6 +549,7 @@ module = [
 
				     "sentry.buffer.*",
			
 
				     "sentry.build.*",
			
 
				     "sentry.eventstore.reprocessing.redis",
			
 
				+    "sentry.grouping.parameterization",
			
 
				     "sentry.hybridcloud",
			
 
				     "sentry.hybridcloud.migrations.*",
			
 
				     "sentry.hybridcloud.options",
			
--- a/src/sentry/grouping/parameterization.py
+++ b/src/sentry/grouping/parameterization.py
@@ -0,0 +1,368 @@
 
				+import dataclasses
			
 
				+import re
			
 
				+from collections import defaultdict
			
 
				+from collections.abc import Callable, Sequence
			
 
				+from functools import lru_cache
			
 
				+
			
 
				+import tiktoken
			
 
				+
			
 
				+__all__ = [
			
 
				+    "ParameterizationCallable",
			
 
				+    "ParameterizationCallableExperiment",
			
 
				+    "ParameterizationExperiment",
			
 
				+    "ParameterizationRegex",
			
 
				+    "ParameterizationRegexExperiment",
			
 
				+    "Parameterizer",
			
 
				+    "UniqueIdExperiment",
			
 
				+]
			
 
				+
			
 
				+
			
 
				+@dataclasses.dataclass
			
 
				+class ParameterizationRegex:
			
 
				+
			
 
				+    name: str  # name of the pattern also used as group name in combined regex
			
 
				+    raw_pattern: str  # regex pattern w/o matching group name
			
 
				+    lookbehind: str | None = None  # positive lookbehind prefix if needed
			
 
				+    lookahead: str | None = None  # positive lookahead postfix if needed
			
 
				+    counter: int = 0
			
 
				+
			
 
				+    # These need to be used with `(?x)` tells the regex compiler to ignore comments
			
 
				+    # and unescaped whitespace, so we can use newlines and indentation for better legibility.
			
 
				+
			
 
				+    @property
			
 
				+    def pattern(self) -> str:
			
 
				+        """
			
 
				+        Returns the regex pattern for with as a named matching group and lookbehind/lookahead if needed.
			
 
				+        """
			
 
				+        prefix = rf"(?<={self.lookbehind})" if self.lookbehind else ""
			
 
				+        postfix = rf"(?={self.lookahead})" if self.lookahead else ""
			
 
				+        return rf"{prefix}(?P<{self.name}>{self.raw_pattern}){postfix}"
			
 
				+
			
 
				+    @property
			
 
				+    def compiled_pattern(self) -> re.Pattern[str]:
			
 
				+        """
			
 
				+        Returns the compiled regex pattern for with as a named matching group and lookbehind/lookahead if needed.
			
 
				+        """
			
 
				+        if not hasattr(self, "_compiled_pattern"):
			
 
				+            self._compiled_pattern = re.compile(rf"(?x){self.pattern}")
			
 
				+        return self._compiled_pattern
			
 
				+
			
 
				+
			
 
				+DEFAULT_PARAMETERIZATION_REGEXES = [
			
 
				+    ParameterizationRegex(
			
 
				+        name="email",
			
 
				+        raw_pattern=r"""[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*""",
			
 
				+    ),
			
 
				+    ParameterizationRegex(name="url", raw_pattern=r"""\b(wss?|https?|ftp)://[^\s/$.?#].[^\s]*"""),
			
 
				+    ParameterizationRegex(
			
 
				+        name="hostname",
			
 
				+        raw_pattern=r"""
			
 
				+            # Top 100 TLDs. The complete list is 1000s long.
			
 
				+            \b
			
 
				+            ([a-zA-Z0-9\-]{1,63}\.)+?
			
 
				+            (
			
 
				+                (COM|NET|ORG|JP|DE|UK|FR|BR|IT|RU|ES|ME|GOV|PL|CA|AU|CN|CO|IN|NL|EDU|INFO|EU|CH|ID|AT|KR|CZ|MX|BE|TV|SE|TR|TW|AL|UA|IR|VN|CL|SK|LY|CC|TO|NO|FI|US|PT|DK|AR|HU|TK|GR|IL|NEWS|RO|MY|BIZ|IE|ZA|NZ|SG|EE|TH|IO|XYZ|PE|BG|HK|RS|LT|LINK|PH|CLUB|SI|SITE|MOBI|BY|CAT|WIKI|LA|GA|XXX|CF|HR|NG|JOBS|ONLINE|KZ|UG|GQ|AE|IS|LV|PRO|FM|TIPS|MS|SA|APP)|
			
 
				+                (com|net|org|jp|de|uk|fr|br|it|ru|es|me|gov|pl|ca|au|cn|co|in|nl|edu|info|eu|ch|id|at|kr|cz|mx|be|tv|se|tr|tw|al|ua|ir|vn|cl|sk|ly|cc|to|no|fi|us|pt|dk|ar|hu|tk|gr|il|news|ro|my|biz|ie|za|nz|sg|ee|th|io|xyz|pe|bg|hk|rs|lt|link|ph|club|si|site|mobi|by|cat|wiki|la|ga|xxx|cf|hr|ng|jobs|online|kz|ug|gq|ae|is|lv|pro|fm|tips|ms|sa|app)
			
 
				+            )
			
 
				+            \b
			
 
				+        """,
			
 
				+    ),
			
 
				+    ParameterizationRegex(
			
 
				+        name="ip",
			
 
				+        raw_pattern=r"""
			
 
				+            (
			
 
				+                ([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|
			
 
				+                ([0-9a-fA-F]{1,4}:){1,7}:|
			
 
				+                ([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|
			
 
				+                ([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|
			
 
				+                ([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|
			
 
				+                ([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|
			
 
				+                ([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|
			
 
				+                [0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|
			
 
				+                :((:[0-9a-fA-F]{1,4}){1,7}|:)|
			
 
				+                fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|
			
 
				+                ::(ffff(:0{1,4}){0,1}:){0,1}
			
 
				+                ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
			
 
				+                (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|
			
 
				+                ([0-9a-fA-F]{1,4}:){1,4}:
			
 
				+                ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
			
 
				+                (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
			
 
				+            ) |
			
 
				+            (
			
 
				+                \b((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
			
 
				+                (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
			
 
				+            )
			
 
				+        """,
			
 
				+    ),
			
 
				+    ParameterizationRegex(
			
 
				+        name="uuid",
			
 
				+        raw_pattern=r"""\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b""",
			
 
				+    ),
			
 
				+    ParameterizationRegex(name="sha1", raw_pattern=r"""\b[0-9a-fA-F]{40}\b"""),
			
 
				+    ParameterizationRegex(name="md5", raw_pattern=r"""\b[0-9a-fA-F]{32}\b"""),
			
 
				+    ParameterizationRegex(
			
 
				+        name="date",
			
 
				+        raw_pattern=r"""
			
 
				+            # No word boundaries required around dates. Should there be?
			
 
				+            # RFC822, RFC1123, RFC1123Z
			
 
				+            ((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2,4}\s\d{1,2}:\d{1,2}(:\d{1,2})?\s([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z])))
			
 
				+            |
			
 
				+            # Similar to RFC822, but "Mon Jan 02, 1999", "Jan 02, 1999"
			
 
				+            (((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s[0-3]\d,\s\d{2,4})
			
 
				+            |
			
 
				+            # RFC850
			
 
				+            ((?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday),\s\d{2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}\s\d{2}:\d{2}:\d{2}\s(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
			
 
				+            |
			
 
				+            # RFC3339, RFC3339Nano
			
 
				+            (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z?([+-]?\d{2}:\d{2})?)
			
 
				+            |
			
 
				+            # Datetime:
			
 
				+            (\d{4}-?[01]\d-?[0-3]\d\s[0-2]\d:[0-5]\d:[0-5]\d)(\.\d+)?
			
 
				+            |
			
 
				+            # Kitchen
			
 
				+            (\d{1,2}:\d{2}(:\d{2})?(?: [aApP][Mm])?)
			
 
				+            |
			
 
				+            # Date
			
 
				+            (\d{4}-[01]\d-[0-3]\d)
			
 
				+            |
			
 
				+            # Time
			
 
				+            ([0-2]\d:[0-5]\d:[0-5]\d)
			
 
				+            |
			
 
				+            # Old Date Formats, TODO: possibly safe to remove?
			
 
				+            (
			
 
				+                (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|
			
 
				+                (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|
			
 
				+                (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))
			
 
				+            ) |
			
 
				+            (
			
 
				+                \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+)?
			
 
				+                (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
			
 
				+                ([\d]{1,2})\s+
			
 
				+                ([\d]{2}:[\d]{2}:[\d]{2})\s+
			
 
				+                [\d]{4}
			
 
				+            ) |
			
 
				+            (
			
 
				+                \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s+)?
			
 
				+                (0[1-9]|[1-2]?[\d]|3[01])\s+
			
 
				+                (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
			
 
				+                (19[\d]{2}|[2-9][\d]{3})\s+
			
 
				+                (2[0-3]|[0-1][\d]):([0-5][\d])
			
 
				+                (?::(60|[0-5][\d]))?\s+
			
 
				+                ([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
			
 
				+            ) |
			
 
				+            (datetime.datetime\(.*?\))
			
 
				+        """,
			
 
				+    ),
			
 
				+    ParameterizationRegex(name="duration", raw_pattern=r"""\b(\d+ms) | (\d(\.\d+)?s)\b"""),
			
 
				+    ParameterizationRegex(name="hex", raw_pattern=r"""\b0[xX][0-9a-fA-F]+\b"""),
			
 
				+    ParameterizationRegex(name="float", raw_pattern=r"""-\d+\.\d+\b | \b\d+\.\d+\b"""),
			
 
				+    ParameterizationRegex(name="int", raw_pattern=r"""-\d+\b | \b\d+\b"""),
			
 
				+    ParameterizationRegex(
			
 
				+        name="quoted_str",
			
 
				+        raw_pattern=r"""# Using `=`lookbehind which guarantees we'll only match the value half of key-value pairs,
			
 
				+            # rather than all quoted strings
			
 
				+            '([^']+)' | "([^"]+)"
			
 
				+        """,
			
 
				+        lookbehind="=",
			
 
				+    ),
			
 
				+    ParameterizationRegex(
			
 
				+        name="bool",
			
 
				+        raw_pattern=r"""# Using `=`lookbehind which guarantees we'll only match the value half of key-value pairs,
			
 
				+            # rather than all instances of the words 'true' and 'false'.
			
 
				+            True |
			
 
				+            true |
			
 
				+            False |
			
 
				+            false
			
 
				+        """,
			
 
				+        lookbehind="=",
			
 
				+    ),
			
 
				+]
			
 
				+
			
 
				+
			
 
# Maps each default pattern's name to its full `pattern` string (named group plus any
# lookbehind/lookahead), so patterns can be looked up by key when building a combined regex.
DEFAULT_PARAMETERIZATION_REGEXES_MAP = {r.name: r.pattern for r in DEFAULT_PARAMETERIZATION_REGEXES}
			
 
				+
			
 
				+
			
 
				+@dataclasses.dataclass
			
 
				+class ParameterizationCallable:
			
 
				+    """
			
 
				+    Represents a callable that can be used to modify a string, which can give
			
 
				+    us more flexibility than just using regex.
			
 
				+    """
			
 
				+
			
 
				+    name: str  # name of the pattern also used as group name in combined regex
			
 
				+    apply: Callable[[str], tuple[str, int]]  # function to modifying the input string
			
 
				+    counter: int = 0
			
 
				+
			
 
				+
			
 
				+@dataclasses.dataclass
			
 
				+class ParameterizationCallableExperiment(ParameterizationCallable):
			
 
				+    def run(self, content: str, callback: Callable[[str, int], None]) -> str:
			
 
				+        content, count = self.apply(content)
			
 
				+        if count:
			
 
				+            callback(self.name, count)
			
 
				+        return content
			
 
				+
			
 
				+
			
 
				+class ParameterizationRegexExperiment(ParameterizationRegex):
			
 
				+    def run(
			
 
				+        self,
			
 
				+        content: str,
			
 
				+        callback: Callable[[re.Match[str]], str],
			
 
				+    ) -> str:
			
 
				+        return self.compiled_pattern.sub(callback, content)
			
 
				+
			
 
				+
			
 
				+class _UniqueId:
			
 
				+    # just a namespace for the uniq_id logic, no need to instantiate
			
 
				+
			
 
				+    NAME = "uniq_id"
			
 
				+
			
 
				+    @staticmethod
			
 
				+    @lru_cache(maxsize=1)
			
 
				+    def tiktoken_encoding() -> tiktoken.Encoding:
			
 
				+        return tiktoken.get_encoding("cl100k_base")
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def num_tokens_from_string(token_str: str) -> int:
			
 
				+        """Returns the number of tokens in a text string."""
			
 
				+        num_tokens = len(_UniqueId.tiktoken_encoding().encode(token_str))
			
 
				+        return num_tokens
			
 
				+
			
 
				+    # These are all somewhat arbitrary based on examples.
			
 
				+    TOKEN_LENGTH_MINIMUM = (
			
 
				+        4  # Tokens smaller than this are unlikely to be unique ids regardless of other attributes
			
 
				+    )
			
 
				+    TOKEN_LENGTH_RATIO_DEFAULT = 0.5
			
 
				+    TOKEN_LENGTH_LONG = 10
			
 
				+    TOKEN_LENGTH_RATIO_LONG = 0.4
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def is_probably_uniq_id(token_str: str) -> bool:
			
 
				+        token_str = token_str.strip("\"'[]{}():;")
			
 
				+        if len(token_str) < _UniqueId.TOKEN_LENGTH_MINIMUM:
			
 
				+            return False
			
 
				+        if (
			
 
				+            token_str[0] == "<" and token_str[-1] == ">"
			
 
				+        ):  # Don't replace already-parameterized tokens
			
 
				+            return False
			
 
				+        token_length_ratio = _UniqueId.num_tokens_from_string(token_str) / len(token_str)
			
 
				+        if (
			
 
				+            len(token_str) > _UniqueId.TOKEN_LENGTH_LONG
			
 
				+            and token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_LONG
			
 
				+        ):
			
 
				+            return True
			
 
				+        return token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_DEFAULT
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def replace_uniq_ids_in_str(string: str) -> tuple[str, int]:
			
 
				+        """
			
 
				+        Return result and count of replacements
			
 
				+        """
			
 
				+        strings = string.split(" ")
			
 
				+        count = 0
			
 
				+        for i, s in enumerate(strings):
			
 
				+            if _UniqueId.is_probably_uniq_id(s):
			
 
				+                strings[i] = "<uniq_id>"
			
 
				+                count += 1
			
 
				+        return (" ".join(strings), count)
			
 
				+
			
 
				+
			
 
# Ready-made experiment that runs the `uniq_id` token heuristic over a string.
UniqueIdExperiment = ParameterizationCallableExperiment(
    name=_UniqueId.NAME, apply=_UniqueId.replace_uniq_ids_in_str
)


# Either flavor of experiment. Both expose `name` and a `run(content, callback)` method, but
# the callback signatures differ: `(name, count)` for callables, a match object for regexes.
ParameterizationExperiment = ParameterizationCallableExperiment | ParameterizationRegexExperiment
			
 
				+
			
 
				+
			
 
				+class Parameterizer:
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        regex_pattern_keys: Sequence[str],
			
 
				+        experiments: Sequence[ParameterizationExperiment] = (),
			
 
				+    ):
			
 
				+        self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys)
			
 
				+        self._experiments = experiments
			
 
				+
			
 
				+        self.matches_counter: defaultdict[str, int] = defaultdict(int)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]:
			
 
				+        """
			
 
				+        Takes list of pattern keys and returns a compiled regex pattern that matches any of them.
			
 
				+
			
 
				+        @param pattern_keys: A list of keys to match in the _parameterization_regex_components dict.
			
 
				+        @returns: A compiled regex pattern that matches any of the given keys.
			
 
				+        @raises: KeyError on pattern key not in the _parameterization_regex_components dict
			
 
				+
			
 
				+        The `(?x)` tells the regex compiler to ignore comments and unescaped whitespace,
			
 
				+        so we can use newlines and indentation for better legibility in patterns above.
			
 
				+        """
			
 
				+
			
 
				+        return re.compile(
			
 
				+            rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
			
 
				+        )
			
 
				+
			
 
				+    def parametrize_w_regex(self, content: str) -> str:
			
 
				+        """
			
 
				+        Replace all matches of the given regex in the content with a placeholder string.
			
 
				+
			
 
				+        @param content: The string to replace matches in.
			
 
				+        @param parameterization_regex: The compiled regex pattern to match.
			
 
				+        @param match_callback: An optional callback function to call with the key of the matched pattern.
			
 
				+
			
 
				+        @returns: The content with all matches replaced with placeholders.
			
 
				+        """
			
 
				+
			
 
				+        def _handle_regex_match(match: re.Match[str]) -> str:
			
 
				+            # Find the first (should be only) non-None match entry, and sub in the placeholder. For
			
 
				+            # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
			
 
				+            # replacement for the original value in the string.
			
 
				+            for key, value in match.groupdict().items():
			
 
				+                if value is not None:
			
 
				+                    self.matches_counter[key] += 1
			
 
				+                    return f"<{key}>"
			
 
				+            return ""
			
 
				+
			
 
				+        return self._parameterization_regex.sub(_handle_regex_match, content)
			
 
				+
			
 
				+    def parametrize_w_experiments(
			
 
				+        self, content: str, should_run: Callable[[str], bool] = lambda _: True
			
 
				+    ) -> str:
			
 
				+        """
			
 
				+        Apply all experiments to the content.
			
 
				+
			
 
				+        @param content: The string to apply experiments to.
			
 
				+        @returns: The content with all experiments applied.
			
 
				+        """
			
 
				+
			
 
				+        def _incr_counter(key: str, count: int) -> None:
			
 
				+            self.matches_counter[key] += count
			
 
				+
			
 
				+        def _handle_regex_match(match: re.Match[str]) -> str:
			
 
				+            # Find the first (should be only) non-None match entry, and sub in the placeholder. For
			
 
				+            # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
			
 
				+            # replacement for the original value in the string.
			
 
				+            for key, value in match.groupdict().items():
			
 
				+                if value is not None:
			
 
				+                    self.matches_counter[key] += 1
			
 
				+                    return f"<{key}>"
			
 
				+            return ""
			
 
				+
			
 
				+        for experiment in self._experiments:
			
 
				+            if not should_run(experiment.name):
			
 
				+                continue
			
 
				+            if isinstance(experiment, ParameterizationCallableExperiment):
			
 
				+                content = experiment.run(content, _incr_counter)
			
 
				+            else:
			
 
				+                content = experiment.run(content, _handle_regex_match)
			
 
				+
			
 
				+        return content
			
 
				+
			
 
				+    def get_successful_experiments(self) -> Sequence[ParameterizationExperiment]:
			
 
				+        return [e for e in self._experiments if self.matches_counter[e.name] > 0]
			
 
				+
			
 
				+    def parameterize_all(
			
 
				+        self, content: str, should_run: Callable[[str], bool] = lambda _: True
			
 
				+    ) -> str:
			
 
				+        return self.parametrize_w_experiments(self.parametrize_w_regex(content), should_run)
			
--- a/src/sentry/grouping/strategies/message.py
+++ b/src/sentry/grouping/strategies/message.py
@@ -1,18 +1,11 @@
 
				-import dataclasses
			
 
				-import re
			
 
				-from collections import defaultdict
			
 
				-from collections.abc import Callable
			
 
				-from functools import lru_cache
			
 
				 from itertools import islice
			
 
				-from re import Match
			
 
				 from typing import Any
			
 
				 
			
 
				-import tiktoken
			
 
				-
			
 
				 from sentry import analytics
			
 
				 from sentry.eventstore.models import Event
			
 
				 from sentry.features.rollout import in_rollout_group
			
 
				 from sentry.grouping.component import GroupingComponent
			
 
				+from sentry.grouping.parameterization import Parameterizer, UniqueIdExperiment
			
 
				 from sentry.grouping.strategies.base import (
			
 
				     GroupingContext,
			
 
				     ReturnedVariants,
			
@@ -22,232 +15,6 @@ from sentry.grouping.strategies.base import (
 
				 from sentry.interfaces.message import Message
			
 
				 from sentry.utils import metrics
			
 
				 
			
 
				-# The `(?x)` tells the regex compiler to ignore comments and unescaped whitespace,
			
 
				-# so we can use newlines and indentation for better legibility.
			
 
				-_parameterization_regex_str = r"""(?x)
			
 
				-    (?P<email>
			
 
				-        [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*
			
 
				-    ) |
			
 
				-    (?P<url>
			
 
				-        \b(wss?|https?|ftp)://[^\s/$.?#].[^\s]*
			
 
				-    ) |
			
 
				-    (?P<hostname> # Top 100 TLDs. The complete list is 1000s long.
			
 
				-        \b
			
 
				-        ([a-zA-Z0-9\-]{1,63}\.)+?
			
 
				-        (
			
 
				-            (COM|NET|ORG|JP|DE|UK|FR|BR|IT|RU|ES|ME|GOV|PL|CA|AU|CN|CO|IN|NL|EDU|INFO|EU|CH|ID|AT|KR|CZ|MX|BE|TV|SE|TR|TW|AL|UA|IR|VN|CL|SK|LY|CC|TO|NO|FI|US|PT|DK|AR|HU|TK|GR|IL|NEWS|RO|MY|BIZ|IE|ZA|NZ|SG|EE|TH|IO|XYZ|PE|BG|HK|RS|LT|LINK|PH|CLUB|SI|SITE|MOBI|BY|CAT|WIKI|LA|GA|XXX|CF|HR|NG|JOBS|ONLINE|KZ|UG|GQ|AE|IS|LV|PRO|FM|TIPS|MS|SA|APP)|
			
 
				-            (com|net|org|jp|de|uk|fr|br|it|ru|es|me|gov|pl|ca|au|cn|co|in|nl|edu|info|eu|ch|id|at|kr|cz|mx|be|tv|se|tr|tw|al|ua|ir|vn|cl|sk|ly|cc|to|no|fi|us|pt|dk|ar|hu|tk|gr|il|news|ro|my|biz|ie|za|nz|sg|ee|th|io|xyz|pe|bg|hk|rs|lt|link|ph|club|si|site|mobi|by|cat|wiki|la|ga|xxx|cf|hr|ng|jobs|online|kz|ug|gq|ae|is|lv|pro|fm|tips|ms|sa|app)
			
 
				-        )
			
 
				-        \b
			
 
				-    ) |
			
 
				-    (?P<ip>
			
 
				-        (
			
 
				-            ([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|
			
 
				-            ([0-9a-fA-F]{1,4}:){1,7}:|
			
 
				-            ([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|
			
 
				-            ([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|
			
 
				-            ([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|
			
 
				-            ([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|
			
 
				-            ([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|
			
 
				-            [0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|
			
 
				-            :((:[0-9a-fA-F]{1,4}){1,7}|:)|
			
 
				-            fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|
			
 
				-            ::(ffff(:0{1,4}){0,1}:){0,1}
			
 
				-            ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
			
 
				-            (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|
			
 
				-            ([0-9a-fA-F]{1,4}:){1,4}:
			
 
				-            ((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
			
 
				-            (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
			
 
				-        ) |
			
 
				-        (
			
 
				-            \b((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}
			
 
				-            (25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\b
			
 
				-        )
			
 
				-    ) |
			
 
				-    (?P<uuid>
			
 
				-        \b
			
 
				-            [0-9a-fA-F]{8}-
			
 
				-            [0-9a-fA-F]{4}-
			
 
				-            [0-9a-fA-F]{4}-
			
 
				-            [0-9a-fA-F]{4}-
			
 
				-            [0-9a-fA-F]{12}
			
 
				-        \b
			
 
				-    ) |
			
 
				-    (?P<sha1>
			
 
				-        \b[0-9a-fA-F]{40}\b
			
 
				-    ) |
			
 
				-    (?P<md5>
			
 
				-        \b[0-9a-fA-F]{32}\b
			
 
				-    ) |
			
 
				-    (?P<date>
			
 
				-        # No word boundaries required around dates. Should there be?
			
 
				-        # RFC822, RFC1123, RFC1123Z
			
 
				-        ((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2,4}\s\d{1,2}:\d{1,2}(:\d{1,2})?\s([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z])))
			
 
				-        |
			
 
				-        # Similar to RFC822, but "Mon Jan 02, 1999", "Jan 02, 1999"
			
 
				-        (((?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s[0-3]\d,\s\d{2,4})
			
 
				-        |
			
 
				-        # RFC850
			
 
				-        ((?:Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday),\s\d{2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2}\s\d{2}:\d{2}:\d{2}\s(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
			
 
				-        |
			
 
				-        # RFC3339, RFC3339Nano
			
 
				-        (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z?([+-]?\d{2}:\d{2})?)
			
 
				-        |
			
 
				-        # Datetime:
			
 
				-        (\d{4}-?[01]\d-?[0-3]\d\s[0-2]\d:[0-5]\d:[0-5]\d)(\.\d+)?
			
 
				-        |
			
 
				-        # Kitchen
			
 
				-        (\d{1,2}:\d{2}(:\d{2})?(?: [aApP][Mm])?)
			
 
				-        |
			
 
				-        # Date
			
 
				-        (\d{4}-[01]\d-[0-3]\d)
			
 
				-        |
			
 
				-        # Time
			
 
				-        ([0-2]\d:[0-5]\d:[0-5]\d)
			
 
				-        |
			
 
				-        # Old Date Formats, TODO: possibly safe to remove?
			
 
				-        (
			
 
				-            (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|
			
 
				-            (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|
			
 
				-            (\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))
			
 
				-        ) |
			
 
				-        (
			
 
				-            \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+)?
			
 
				-            (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
			
 
				-            ([\d]{1,2})\s+
			
 
				-            ([\d]{2}:[\d]{2}:[\d]{2})\s+
			
 
				-            [\d]{4}
			
 
				-        ) |
			
 
				-        (
			
 
				-            \b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s+)?
			
 
				-            (0[1-9]|[1-2]?[\d]|3[01])\s+
			
 
				-            (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
			
 
				-            (19[\d]{2}|[2-9][\d]{3})\s+
			
 
				-            (2[0-3]|[0-1][\d]):([0-5][\d])
			
 
				-            (?::(60|[0-5][\d]))?\s+
			
 
				-            ([-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
			
 
				-        ) |
			
 
				-        (datetime.datetime\(.*?\))
			
 
				-    ) |
			
 
				-    (?P<duration>
			
 
				-        \b
			
 
				-        (\d+ms) |
			
 
				-        (\d(\.\d+)?s)
			
 
				-        \b
			
 
				-    ) |
			
 
				-    (?P<hex>
			
 
				-        \b0[xX][0-9a-fA-F]+\b
			
 
				-    ) |
			
 
				-    (?P<float>
			
 
				-        -\d+\.\d+\b |
			
 
				-        \b\d+\.\d+\b
			
 
				-    ) |
			
 
				-    (?P<int>
			
 
				-        -\d+\b |
			
 
				-        \b\d+\b
			
 
				-    ) |
			
 
				-    (?P<quoted_str>
			
 
				-        # The `=` here guarantees we'll only match the value half of key-value pairs,
			
 
				-        # rather than all quoted strings
			
 
				-        ='([^']+)' |
			
 
				-        ="([^"]+)"
			
 
				-    ) |
			
 
				-    (?P<bool>
			
 
				-        # The `=` here guarantees we'll only match the value half of key-value pairs,
			
 
				-        # rather than all instances of the words 'true' and 'false'.
			
 
				-        =True |
			
 
				-        =true |
			
 
				-        =False |
			
 
				-        =false
			
 
				-    )
			
 
				-"""
			
 
				-
			
 
				-_parameterization_regex = re.compile(_parameterization_regex_str)
			
 
				-
			
 
				-
			
 
				-# UniqID logic
			
 
				-@lru_cache(maxsize=1)
			
 
				-def tiktoken_encoding():
			
 
				-    return tiktoken.get_encoding("cl100k_base")
			
 
				-
			
 
				-
			
 
				-def num_tokens_from_string(token_str: str) -> int:
			
 
				-    """Returns the number of tokens in a text string."""
			
 
				-    num_tokens = len(tiktoken_encoding().encode(token_str))
			
 
				-    return num_tokens
			
 
				-
			
 
				-
			
 
				-# These are all somewhat arbitrary based on examples.
			
 
				-UNIQ_ID_TOKEN_LENGTH_MINIMUM = (
			
 
				-    4  # Tokens smaller than this are unlikely to be unique ids regardless of other attributes
			
 
				-)
			
 
				-UNIQ_ID_TOKEN_LENGTH_RATIO_DEFAULT = 0.5
			
 
				-UNIQ_ID_TOKEN_LENGTH_LONG = 10
			
 
				-UNIQ_ID_TOKEN_LENGTH_RATIO_LONG = 0.4
			
 
				-
			
 
				-
			
 
				-def is_probably_uniq_id(token_str: str) -> bool:
			
 
				-    token_str = token_str.strip("\"'[]{}():;")
			
 
				-    if len(token_str) < UNIQ_ID_TOKEN_LENGTH_MINIMUM:
			
 
				-        return False
			
 
				-    if token_str[0] == "<" and token_str[-1] == ">":  # Don't replace already-parameterized tokens
			
 
				-        return False
			
 
				-    token_length_ratio = num_tokens_from_string(token_str) / len(token_str)
			
 
				-    if (
			
 
				-        len(token_str) > UNIQ_ID_TOKEN_LENGTH_LONG
			
 
				-        and token_length_ratio > UNIQ_ID_TOKEN_LENGTH_RATIO_LONG
			
 
				-    ):
			
 
				-        return True
			
 
				-    return token_length_ratio > UNIQ_ID_TOKEN_LENGTH_RATIO_DEFAULT
			
 
				-
			
 
				-
			
 
				-def replace_uniq_ids_in_str(string: str) -> tuple[str, int]:
			
 
				-    """
			
 
				-    Return result and count of replacements
			
 
				-    """
			
 
				-    strings = string.split(" ")
			
 
				-    count = 0
			
 
				-    for i, s in enumerate(strings):
			
 
				-        if is_probably_uniq_id(s):
			
 
				-            strings[i] = "<uniq_id>"
			
 
				-            count += 1
			
 
				-    return (" ".join(strings), count)
			
 
				-
			
 
				-
			
 
				-def parameterization_experiment_default_run(
			
 
				-    self: "ParameterizationExperiment", _handle_regex_match: Callable[[Match[str]], str], input: str
			
 
				-) -> tuple[str, int]:
			
 
				-    return (self.regex.sub(_handle_regex_match, input), 0)
			
 
				-
			
 
				-
			
 
				-def parameterization_experiment_uniq_id(
			
 
				-    self: "ParameterizationExperiment", _: Callable[[Match[str]], str], input: str
			
 
				-) -> tuple[str, int]:
			
 
				-    return replace_uniq_ids_in_str(input)
			
 
				-
			
 
				-
			
 
				-@dataclasses.dataclass()
			
 
				-class ParameterizationExperiment:
			
 
				-    name: str
			
 
				-    regex: Any
			
 
				-    """A function that takes as arguments:
			
 
				-            * This experiment
			
 
				-            * A handle match function (may not be used), e.g. _handle_regex_match (note that this modifies trimmed_value_counter)
			
 
				-            * A string input
			
 
				-        And returns: a tuple of [output string, count of replacements(which overlaps with any added by _handle_regex_match, if used)]
			
 
				-    """
			
 
				-    run: Callable[
			
 
				-        ["ParameterizationExperiment", Callable[[Match[str]], str], str], tuple[str, int]
			
 
				-    ] = parameterization_experiment_default_run
			
 
				-    counter: int = 0
			
 
				-
			
 
				-
			
 
				-# Note that experiments are run AFTER the initial replacements. Which means they MUST not catch replacements made
			
 
				-# in the primary parameterization regex.
			
 
				-_parameterization_regex_experiments = [
			
 
				-    ParameterizationExperiment(name="uniq_id", regex=None, run=parameterization_experiment_uniq_id),
			
 
				-]
			
 
				-
			
 
				 
			
 
				 @metrics.wraps("grouping.normalize_message_for_grouping")
			
 
				 def normalize_message_for_grouping(message: str, event: Event, share_analytics: bool = True) -> str:
			
@@ -264,68 +31,67 @@ def normalize_message_for_grouping(message: str, event: Event, share_analytics:
 
				     if trimmed != message:
			
 
				         trimmed += "..."
			
 
				 
			
 
				-    trimmed_value_counter: defaultdict[str, int] = defaultdict(int)
			
 
				-
			
 
				-    def _handle_regex_match(match: Match[str]) -> str:
			
 
				-        # Find the first (should be only) non-None match entry, and sub in the placeholder. For
			
 
				-        # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
			
 
				-        # replacement for the original value in the string.
			
 
				-        for key, value in match.groupdict().items():
			
 
				-            if value is not None:
			
 
				-                trimmed_value_counter[key] += 1
			
 
				-                # For `quoted_str` and `bool` we want to preserve the `=` symbol, which we include in
			
 
				-                # the match in order not to replace random quoted strings and the words 'true' and 'false'
			
 
				-                # in contexts other than key-value pairs
			
 
				-                if key in ["quoted_str", "bool"]:
			
 
				-                    return f"=<{key}>"
			
 
				-                else:
			
 
				-                    return f"<{key}>"
			
 
				-        return ""
			
 
				+    parameterizer = Parameterizer(
			
 
				+        regex_pattern_keys=(
			
 
				+            "email",
			
 
				+            "url",
			
 
				+            "hostname",
			
 
				+            "ip",
			
 
				+            "uuid",
			
 
				+            "sha1",
			
 
				+            "md5",
			
 
				+            "date",
			
 
				+            "duration",
			
 
				+            "hex",
			
 
				+            "float",
			
 
				+            "int",
			
 
				+            "quoted_str",
			
 
				+            "bool",
			
 
				+        ),
			
 
				+        experiments=(UniqueIdExperiment,),
			
 
				+    )
			
 
				 
			
 
				-    normalized = _parameterization_regex.sub(_handle_regex_match, trimmed)
			
 
				-    for experiment in _parameterization_regex_experiments:
			
 
				-        if event.project_id and (
			
 
				-            in_rollout_group(
			
 
				-                f"grouping.experiments.parameterization.{experiment.name}", event.project_id
			
 
				+    def _shoudl_run_experiment(experiment_name: str) -> bool:
			
 
				+        return bool(
			
 
				+            event.project_id
			
 
				+            and (
			
 
				+                in_rollout_group(
			
 
				+                    f"grouping.experiments.parameterization.{experiment_name}", event.project_id
			
 
				+                )
			
 
				+                or event.project_id
			
 
				+                in [  # Active internal Sentry projects
			
 
				+                    155735,
			
 
				+                    4503972821204992,
			
 
				+                    1267915,
			
 
				+                    221969,
			
 
				+                    11276,
			
 
				+                    1269704,
			
 
				+                    4505469596663808,
			
 
				+                    1,
			
 
				+                    54785,
			
 
				+                    1492057,
			
 
				+                    162676,
			
 
				+                    6690737,
			
 
				+                    300688,
			
 
				+                    4506400311934976,
			
 
				+                    6424467,
			
 
				+                ]
			
 
				             )
			
 
				-            or event.project_id
			
 
				-            in [  # Active internal Sentry projects
			
 
				-                155735,
			
 
				-                4503972821204992,
			
 
				-                1267915,
			
 
				-                221969,
			
 
				-                11276,
			
 
				-                1269704,
			
 
				-                4505469596663808,
			
 
				-                1,
			
 
				-                54785,
			
 
				-                1492057,
			
 
				-                162676,
			
 
				-                6690737,
			
 
				-                300688,
			
 
				-                4506400311934976,
			
 
				-                6424467,
			
 
				-            ]
			
 
				-        ):
			
 
				-            experiment_output, metric_inc = experiment.run(
			
 
				-                experiment, _handle_regex_match, normalized
			
 
				+        )
			
 
				+
			
 
				+    normalized = parameterizer.parameterize_all(trimmed, _shoudl_run_experiment)
			
 
				+
			
 
				+    for experiment in parameterizer.get_successful_experiments():
			
 
				+        if share_analytics and experiment.counter < 100:
			
 
				+            experiment.counter += 1
			
 
				+            analytics.record(
			
 
				+                "grouping.experiments.parameterization",
			
 
				+                experiment_name=experiment.name,
			
 
				+                project_id=event.project_id,
			
 
				+                event_id=event.event_id,
			
 
				             )
			
 
				-            if experiment_output != normalized:
			
 
				-                trimmed_value_counter[experiment.name] += metric_inc
			
 
				-                # Register 100 (arbitrary, bounded number) analytics events per experiment per instance restart
			
 
				-                # This generates samples for review consistently but creates a hard cap on
			
 
				-                # analytics event volume
			
 
				-                if share_analytics and experiment.counter < 100:
			
 
				-                    experiment.counter += 1
			
 
				-                    analytics.record(
			
 
				-                        "grouping.experiments.parameterization",
			
 
				-                        experiment_name=experiment.name,
			
 
				-                        project_id=event.project_id,
			
 
				-                        event_id=event.event_id,
			
 
				-                    )
			
 
				-                normalized = experiment_output
			
 
				 
			
 
				-    for key, value in trimmed_value_counter.items():
			
 
				+    for key, value in parameterizer.matches_counter.items():
			
 
				         # `key` can only be one of the keys from `_parameterization_regex`, thus, not a large
			
 
				         # cardinality. Tracking the key helps distinguish what kinds of replacements are happening.
			
 
				         metrics.incr("grouping.value_trimmed_from_message", amount=value, tags={"key": key})
			
--- a/tests/sentry/grouping/test_parameterization.py
+++ b/tests/sentry/grouping/test_parameterization.py
@@ -0,0 +1,290 @@
 
				+from unittest import mock
			
 
				+
			
 
				+import pytest
			
 
				+
			
 
				+from sentry.grouping.parameterization import (
			
 
				+    ParameterizationRegexExperiment,
			
 
				+    Parameterizer,
			
 
				+    UniqueIdExperiment,
			
 
				+)
			
 
				+
			
 
				+
			
 
				+@pytest.fixture
			
 
				+def parameterizer():
			
 
				+    return Parameterizer(
			
 
				+        regex_pattern_keys=(
			
 
				+            "email",
			
 
				+            "url",
			
 
				+            "hostname",
			
 
				+            "ip",
			
 
				+            "uuid",
			
 
				+            "sha1",
			
 
				+            "md5",
			
 
				+            "date",
			
 
				+            "duration",
			
 
				+            "hex",
			
 
				+            "float",
			
 
				+            "int",
			
 
				+            "quoted_str",
			
 
				+            "bool",
			
 
				+        ),
			
 
				+        experiments=(UniqueIdExperiment,),
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+@pytest.mark.parametrize(
			
 
				+    ("name", "input", "expected"),
			
 
				+    [
			
 
				+        ("email", """blah test@email.com had a problem""", """blah <email> had a problem"""),
			
 
				+        ("url", """blah http://some.email.com had a problem""", """blah <url> had a problem"""),
			
 
				+        (
			
 
				+            "url - existing behavior",
			
 
				+            """blah tcp://user:pass@email.com:10 had a problem""",
			
 
				+            """blah tcp://user:<email>:<int> had a problem""",
			
 
				+        ),
			
 
				+        ("ip", """blah 0.0.0.0 had a problem""", """blah <ip> had a problem"""),
			
 
				+        (
			
 
				+            "UUID",
			
 
				+            """blah 7c1811ed-e98f-4c9c-a9f9-58c757ff494f had a problem""",
			
 
				+            """blah <uuid> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "UUID",
			
 
				+            """blah bea691f2-2e25-4bec-6838-e0c44b03d60a/7c1811ed-e98f-4c9c-a9f9-58c757ff494f had a problem""",
			
 
				+            """blah <uuid>/<uuid> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "SHA1",
			
 
				+            """blah 5fc35719b9cf96ec602dbc748ff31c587a46961d had a problem""",
			
 
				+            """blah <sha1> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "MD5",
			
 
				+            """blah 0751007cd28df267e8e051b51f918c60 had a problem""",
			
 
				+            """blah <md5> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date",
			
 
				+            """blah 2024-02-20T22:16:36 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date RFC822",
			
 
				+            """blah Mon, 02 Jan 06 15:04 MST had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date RFC822Z",
			
 
				+            """blah Mon, 02 Jan 06 15:04 -0700 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date RFC850",
			
 
				+            """blah Monday, 02-Jan-06 15:04:05 MST had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date RFC1123",
			
 
				+            """blah Mon, 02 Jan 2006 15:04:05 MST had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date RFC1123Z",
			
 
				+            """blah Mon, 02 Jan 2006 15:04:05 -0700 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date RFC3339",
			
 
				+            """blah 2006-01-02T15:04:05Z07:00 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Date RFC3339Nano",
			
 
				+            """blah 2006-01-02T15:04:05.999999999Z07:00 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        ("Date - plain", """blah 2006-01-02 had a problem""", """blah <date> had a problem"""),
			
 
				+        ("Date - long", """blah Jan 18, 2019 had a problem""", """blah <date> had a problem"""),
			
 
				+        (
			
 
				+            "Date - Datetime",
			
 
				+            """blah 2006-01-02 15:04:05 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        ("Date - Kitchen", """blah 3:04PM had a problem""", """blah <date> had a problem"""),
			
 
				+        ("Date - Time", """blah 15:04:05 had a problem""", """blah <date> had a problem"""),
			
 
				+        (
			
 
				+            "Date - basic",
			
 
				+            """blah Mon Jan 02, 1999 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Datetime - compressed",
			
 
				+            """blah 20240220 11:55:33.546593 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Datetime - datestamp",
			
 
				+            """blah 2024-02-23 02:13:53.418 had a problem""",
			
 
				+            """blah <date> had a problem""",
			
 
				+        ),
			
 
				+        ("hex", """blah 0x9af8c3b had a problem""", """blah <hex> had a problem"""),
			
 
				+        ("float", """blah 0.23 had a problem""", """blah <float> had a problem"""),
			
 
				+        ("int", """blah 23 had a problem""", """blah <int> had a problem"""),
			
 
				+        ("quoted str", """blah b="1" had a problem""", """blah b=<quoted_str> had a problem"""),
			
 
				+        ("bool", """blah a=true had a problem""", """blah a=<bool> had a problem"""),
			
 
				+        (
			
 
				+            "Duration - ms",
			
 
				+            """blah connection failed after 12345ms 1.899s 3s""",
			
 
				+            """blah connection failed after <duration> <duration> <duration>""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Hostname - 2 levels",
			
 
				+            """Blocked 'connect' from 'gggggggdasdwefwewqqqfefwef.com'""",
			
 
				+            """Blocked 'connect' from '<hostname>'""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Hostname - 3 levels",
			
 
				+            """Blocked 'font' from 'www.time.co'""",
			
 
				+            """Blocked 'font' from '<hostname>'""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Nothing to replace",
			
 
				+            """A quick brown fox jumped over the lazy dog""",
			
 
				+            """A quick brown fox jumped over the lazy dog""",
			
 
				+        ),
			
 
				+    ],
			
 
				+)
			
 
				+def test_parameterize_standard(name, input, expected, parameterizer):
			
 
				+    assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
			
 
				+
			
 
				+
			
 
				+@pytest.mark.parametrize(
			
 
				+    ("name", "input", "expected"),
			
 
				+    [
			
 
				+        (
			
 
				+            "Uniq ID - sql savepoint",
			
 
				+            '''SQL: RELEASE SAVEPOINT "s140177518376768_x2"''',
			
 
				+            """SQL: RELEASE SAVEPOINT <uniq_id>""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - api gateway",
			
 
				+            """API gateway VdLchF7iDo8sVkg= blah""",
			
 
				+            """API gateway <uniq_id> blah""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - fb trace",
			
 
				+            """fbtrace_id Aba64NMEPMmBwi_cPLaGeeK AugPfq0jxGbto4u3kxn8u6p blah""",
			
 
				+            """fbtrace_id <uniq_id> <uniq_id> blah""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - word with numerical pre/suffix",
			
 
				+            """1password python3 abc123 123abc""",
			
 
				+            """1password python3 abc123 123abc""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - cloudflare trace",
			
 
				+            """cloudflare trace 230b030023ae2822-SJC 819cc532aex26akb-SNP blah""",
			
 
				+            """cloudflare trace <uniq_id> <uniq_id> blah""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - JWT",
			
 
				+            """blah eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c""",
			
 
				+            """blah <uniq_id>""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - Nothing to replace",
			
 
				+            """I am the test words 1password python3 abc123 123abc""",
			
 
				+            """I am the test words 1password python3 abc123 123abc""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - react element",
			
 
				+            """Permission denied to access property "__reactFiber$b6c78e70asw" """,
			
 
				+            """Permission denied to access property <uniq_id> """,
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - no change variable name",
			
 
				+            """TypeError: Cannot read property 'startRTM' of undefined""",
			
 
				+            """TypeError: Cannot read property 'startRTM' of undefined""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - json ignored properly",
			
 
				+            """[401,""]""",
			
 
				+            """[<int>,""]""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - no change",
			
 
				+            """Blocked 'script' from 'wasm-eval:'""",
			
 
				+            """Blocked 'script' from 'wasm-eval:'""",
			
 
				+        ),
			
 
				+    ],
			
 
				+)
			
 
				+def test_parameterize_experiment(name, input, expected, parameterizer):
			
 
				+    assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
			
 
				+    if "<uniq_id>" in expected:
			
 
				+        experiments = parameterizer.get_successful_experiments()
			
 
				+        assert len(experiments) == 1
			
 
				+        assert experiments[0] == UniqueIdExperiment
			
 
				+
			
 
				+
			
 
				+def test_parameterize_regex_experiment():
			
 
				+    """
			
 
				+    We don't have any of these yet, but we need to test that they work
			
 
				+    """
			
 
				+    FooExperiment = ParameterizationRegexExperiment(name="foo", raw_pattern=r"f[oO]{2}")
			
 
				+
			
 
				+    parameterizer = Parameterizer(
			
 
				+        regex_pattern_keys=(),
			
 
				+        experiments=(FooExperiment,),
			
 
				+    )
			
 
				+    input = "blah foobarbaz fooooo"
			
 
				+    normalized = parameterizer.parameterize_all(input)
			
 
				+    assert normalized == "blah <foo>barbaz <foo>ooo"
			
 
				+    assert len(parameterizer.get_successful_experiments()) == 1
			
 
				+    assert parameterizer.get_successful_experiments()[0] == FooExperiment
			
 
				+
			
 
				+
			
 
				+def test_parameterize_regex_experiment_cached_compiled():
			
 
				+
			
 
				+    with mock.patch.object(
			
 
				+        ParameterizationRegexExperiment,
			
 
				+        "pattern",
			
 
				+        new_callable=mock.PropertyMock,
			
 
				+        return_value=r"(?P<foo>f[oO]{2})",
			
 
				+    ) as mocked_pattern:
			
 
				+        FooExperiment = ParameterizationRegexExperiment(name="foo", raw_pattern=r"f[oO]{2}")
			
 
				+        parameterizer = Parameterizer(
			
 
				+            regex_pattern_keys=(),
			
 
				+            experiments=(FooExperiment,),
			
 
				+        )
			
 
				+        input = "blah foobarbaz fooooo"
			
 
				+        _ = parameterizer.parameterize_all(input)
			
 
				+        _ = parameterizer.parameterize_all(input)
			
 
				+
			
 
				+    mocked_pattern.assert_called_once()
			
 
				+
			
 
				+
			
 
				+# These are test cases that we should fix
			
 
				+@pytest.mark.xfail()
			
 
				+@pytest.mark.parametrize(
			
 
				+    ("name", "input", "expected"),
			
 
				+    [
			
 
				+        (
			
 
				+            "URL - non-http protocol user/pass/port",
			
 
				+            """blah tcp://user:pass@email.com:10 had a problem""",
			
 
				+            """blah <url> had a problem""",
			
 
				+        ),
			
 
				+        ("URL - IP w/ port", """blah 0.0.0.0:10 had a problem""", """blah <ip> had a problem"""),
			
 
				+        (
			
 
				+            "Int - parens",
			
 
				+            """Tb.Worker {"msg" => "(#239323) Received ...""",
			
 
				+            """Tb.Worker {"msg" => "(#<int>) Received ...""",
			
 
				+        ),
			
 
				+        (
			
 
				+            "Uniq ID - Snuba query",
			
 
				+            """Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776026)) AS `_snuba_tags_raw[9223372036854776026]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), 9223372036854775936)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[9223372036854776026]`, 'tolerable') AND equals(_snuba_metric_id, 9223372036854775936)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, 9223372036854775936))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), 1383997) AND in((project_id AS _snuba_project_id), [6726638]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('2024-03-18T23:22:00', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776069)) AS `_snuba_tags_raw[9223372036854776069]`), '2d896d92') AND in(_s...}""",
			
 
				+            """Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), <int>)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[<int>]`, 'tolerable') AND equals(_snuba_metric_id, <int>)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, <int>))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), <int>) AND in((project_id AS _snuba_project_id), [<int>]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('<date>', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), '<uniq_id>') AND in(_s...}""",
			
 
				+        ),
			
 
				+    ],
			
 
				+)
			
 
				+def test_fail_parameterize(name, input, expected, parameterizer):
			
 
				+    assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"