Browse Source

perf(query-builder): Use startsWith/endsWith for some glob conditions (#73460)

Additional optimization here for common glob patterns for starts and
ends with.
Tony Xiao 8 months ago
parent
commit
88ac464d01

+ 54 - 0
src/sentry/api/event_search.py

@@ -387,6 +387,60 @@ class SearchValue(NamedTuple):
             return False
         return bool(WILDCARD_CHARS.search(self.raw_value))
 
+    def classify_wildcard(self) -> Literal["prefix", "infix", "suffix", "other"]:
+        if not self.is_wildcard():
+            return "other"
+
+        ret = WILDCARD_CHARS.finditer(self.raw_value)
+
+        leading_wildcard = False
+        trailing_wildcard = False
+        middle_wildcard = False
+
+        for x in ret:
+            start, end = x.span()
+            if start == 0 and end == 1:
+                # It must span exactly [0, 1) because if it spans further,
+                # the pattern also matched on some leading slashes.
+                leading_wildcard = True
+            elif end == len(self.raw_value):
+                # It only needs to match on end because if it matches on
+                # some slashes before the *, that's okay.
+                trailing_wildcard = True
+            else:
+                # The wildcard happens somewhere in the middle of the value.
+                # We care about this because when this happens, it's not
+                # trivial to optimize the query, so let it fall back to
+                # the existing regex approach.
+                middle_wildcard = True
+
+        if not middle_wildcard:
+            if leading_wildcard and trailing_wildcard:
+                return "infix"
+            elif leading_wildcard:
+                return "suffix"
+            elif trailing_wildcard:
+                return "prefix"
+
+        return "other"
+
+    def format_wildcard(self, kind: Literal["prefix", "infix", "suffix", "other"]) -> str:
+        if kind == "prefix":
+            # If it's a prefix wildcard, we strip off the last character
+            # which is always a `*` and match on the rest.
+            return translate_escape_sequences(self.raw_value[:-1])
+        elif kind == "infix":
+            # If it's an infix wildcard, we strip off the first and last character
+            # which is always a `*` and match on the rest.
+            return translate_escape_sequences(self.raw_value[1:-1])
+        elif kind == "suffix":
+            # If it's a suffix wildcard, we strip off the first character
+            # which is always a `*` and match on the rest.
+            return translate_escape_sequences(self.raw_value[1:])
+
+        # Fall back to the usual formatting that includes formatting escape values.
+        return self.value
+
     def is_event_id(self) -> bool:
         """Return whether the current value is a valid event id
 

+ 33 - 11
src/sentry/search/events/builder/base.py

@@ -1287,18 +1287,40 @@ class BaseQueryBuilder:
             is_null_condition = Condition(Function("isNull", [lhs]), Op.EQ, 1)
 
         if search_filter.value.is_wildcard():
-            raw_value = str(search_filter.value.raw_value)
-            new_value = event_search.SearchValue(raw_value[1:-1])
-            if (
-                raw_value.startswith("*")
-                and raw_value.endswith("*")
-                and not new_value.is_wildcard()
-            ):
-                # This is an optimization to avoid using regular expressions
-                # for wild card searches if it can be avoided.
-                # Here, we're just interested if the substring exists.
+            kind = (
+                search_filter.value.classify_wildcard()
+                if self.config.optimize_wildcard_searches
+                else "other"
+            )
+            if kind == "prefix":
+                condition = Condition(
+                    Function(
+                        "startsWith",
+                        [
+                            Function("lower", [lhs]),
+                            search_filter.value.format_wildcard(kind).lower(),
+                        ],
+                    ),
+                    Op.EQ if search_filter.operator in constants.EQUALITY_OPERATORS else Op.NEQ,
+                    1,
+                )
+            elif kind == "suffix":
+                condition = Condition(
+                    Function(
+                        "endsWith",
+                        [
+                            Function("lower", [lhs]),
+                            search_filter.value.format_wildcard(kind).lower(),
+                        ],
+                    ),
+                    Op.EQ if search_filter.operator in constants.EQUALITY_OPERATORS else Op.NEQ,
+                    1,
+                )
+            elif kind == "infix":
                 condition = Condition(
-                    Function("positionCaseInsensitive", [lhs, new_value.value]),
+                    Function(
+                        "positionCaseInsensitive", [lhs, search_filter.value.format_wildcard(kind)]
+                    ),
                     Op.NEQ if search_filter.operator in constants.EQUALITY_OPERATORS else Op.EQ,
                     0,
                 )

+ 1 - 0
src/sentry/search/events/datasets/base.py

@@ -19,6 +19,7 @@ class DatasetConfig(abc.ABC):
     custom_threshold_columns: set[str] = set()
     non_nullable_keys: set[str] = set()
     missing_function_error: ClassVar[type[Exception]] = InvalidSearchQuery
+    optimize_wildcard_searches = False
 
     def __init__(self, builder: BaseQueryBuilder):
         pass

+ 2 - 0
src/sentry/search/events/datasets/spans_indexed.py

@@ -26,6 +26,8 @@ from sentry.search.utils import DEVICE_CLASS
 
 
 class SpansIndexedDatasetConfig(DatasetConfig):
+    optimize_wildcard_searches = True
+
     def __init__(self, builder: builder.QueryBuilder):
         self.builder = builder
         self.total_count: int | None = None

+ 29 - 0
tests/sentry/api/test_event_search.py

@@ -710,3 +710,32 @@ def test_search_value_to_query_string(value, expected_query_string):
     actual = search_value.to_query_string()
 
     assert actual == expected_query_string
+
+
+@pytest.mark.parametrize(
+    ["value", "expected_kind", "expected_value"],
+    [
+        (1, "other", 1),
+        ("1", "other", "1"),
+        ("*", "suffix", ""),  # consider special casing this
+        ("*foo", "suffix", "foo"),
+        ("foo*", "prefix", "foo"),
+        ("*foo*", "infix", "foo"),
+        (r"\*foo", "other", r"*foo"),
+        (r"\\*foo", "other", r"^\\.*foo$"),
+        (r"foo\*", "other", r"foo*"),
+        (r"foo\\*", "prefix", r"foo\\"),
+        ("*f*o*o*", "other", "^.*f.*o.*o.*$"),
+        (r"*foo\*", "suffix", r"foo*"),
+        (r"*foo\\*", "infix", r"foo\\"),
+    ],
+)
+def test_search_value_classify_and_format_wildcard(value, expected_kind, expected_value):
+    """
+    Test classifying the wildcard type into one of prefix/suffix/infix/other
+    and formatting the value according to the classification results.
+    """
+    search_value = SearchValue(value)
+    kind = search_value.classify_wildcard()
+    assert kind == expected_kind
+    assert search_value.format_wildcard(kind) == expected_value

+ 54 - 0
tests/sentry/search/events/builder/test_spans_indexed.py

@@ -72,6 +72,10 @@ def test_field_alias(params, field, expected):
     assert expected in builder.columns
 
 
+def tags(key, column="tags"):
+    return Function("ifNull", [Column(f"{column}[{key}]"), ""])
+
+
 @pytest.mark.parametrize(
     ["condition", "expected"],
     [
@@ -116,6 +120,56 @@ def test_field_alias(params, field, expected):
             Condition(Column("span_status"), Op.IN, [3, 5]),
             id="span.status:[invalid_argument,not_found]",
         ),
+        pytest.param(
+            "foo:*bar*",
+            Condition(Function("positionCaseInsensitive", [tags("foo"), "bar"]), Op.NEQ, 0),
+            id="foo:*bar*",
+        ),
+        pytest.param(
+            "!foo:*bar*",
+            Condition(Function("positionCaseInsensitive", [tags("foo"), "bar"]), Op.EQ, 0),
+            id="!foo:*bar*",
+        ),
+        pytest.param(
+            r"foo:Bar*",
+            Condition(Function("startsWith", [Function("lower", [tags("foo")]), "bar"]), Op.EQ, 1),
+            id=r"foo:Bar*",
+        ),
+        pytest.param(
+            r"!foo:Bar*",
+            Condition(Function("startsWith", [Function("lower", [tags("foo")]), "bar"]), Op.NEQ, 1),
+            id=r"!foo:Bar*",
+        ),
+        pytest.param(
+            r"foo:*Bar",
+            Condition(Function("endsWith", [Function("lower", [tags("foo")]), "bar"]), Op.EQ, 1),
+            id=r"foo:*Bar",
+        ),
+        pytest.param(
+            r"!foo:*Bar",
+            Condition(Function("endsWith", [Function("lower", [tags("foo")]), "bar"]), Op.NEQ, 1),
+            id=r"!foo:*Bar",
+        ),
+        pytest.param(
+            r"foo:*Bar\*",
+            Condition(Function("endsWith", [Function("lower", [tags("foo")]), "bar*"]), Op.EQ, 1),
+            id=r"foo:*Bar\*",
+        ),
+        pytest.param(
+            r"!foo:*Bar\*",
+            Condition(Function("endsWith", [Function("lower", [tags("foo")]), "bar*"]), Op.NEQ, 1),
+            id=r"!foo:*Bar\*",
+        ),
+        pytest.param(
+            r"foo:*b*a*r*",
+            Condition(Function("match", [tags("foo"), "(?i)^.*b.*a.*r.*$"]), Op.EQ, 1),
+            id=r"foo:*b*a*r*",
+        ),
+        pytest.param(
+            r"!foo:*b*a*r*",
+            Condition(Function("match", [tags("foo"), "(?i)^.*b.*a.*r.*$"]), Op.NEQ, 1),
+            id=r"!foo:*b*a*r*",
+        ),
     ],
 )
 @django_db_all

+ 31 - 0
tests/snuba/api/endpoints/test_organization_events_span_indexed.py

@@ -376,3 +376,34 @@ class OrganizationEventsSpanIndexedEndpointTest(OrganizationEventsEndpointTestBa
         assert data[0]["measurements.messaging.message.body.size"] == 1024
         assert data[0]["measurements.messaging.message.retry.count"] == 2
         assert meta["dataset"] == "spansIndexed"
+
+    def test_tag_wildcards(self):
+        self.store_spans(
+            [
+                self.create_span(
+                    {"description": "foo", "tags": {"foo": "BaR"}},
+                    start_ts=self.ten_mins_ago,
+                ),
+                self.create_span(
+                    {"description": "qux", "tags": {"foo": "QuX"}},
+                    start_ts=self.ten_mins_ago,
+                ),
+            ]
+        )
+
+        for query in [
+            "foo:b*",
+            "foo:*r",
+            "foo:*a*",
+            "foo:b*r",
+        ]:
+            response = self.do_request(
+                {
+                    "field": ["foo", "count()"],
+                    "query": query,
+                    "project": self.project.id,
+                    "dataset": "spansIndexed",
+                }
+            )
+            assert response.status_code == 200, response.content
+            assert response.data["data"] == [{"foo": "BaR", "count()": 1}]