Browse Source

feat(replays): Add improved tags object construction and add filtering (#38514)

Colton Allen 2 years ago
parent
commit
3d175c04d9

+ 100 - 9
src/sentry/replays/lib/query.py

@@ -2,7 +2,7 @@
 from typing import Any, List, Optional, Tuple, Union
 
 from rest_framework.exceptions import ParseError
-from snuba_sdk import Column, Condition, Op
+from snuba_sdk import Column, Condition, Function, Identifier, Lambda, Op
 from snuba_sdk.conditions import And, Or
 from snuba_sdk.expressions import Expression
 from snuba_sdk.orderby import Direction, OrderBy
@@ -20,12 +20,11 @@ OPERATOR_MAP = {
     "<": Op.LT,
     "<=": Op.LTE,
     "IN": Op.IN,
+    "NOT IN": Op.NOT_IN,
 }
 
 
 class Field:
-    attribute_name: Optional[str] = None
-
     def __init__(
         self,
         name: Optional[str] = None,
@@ -36,6 +35,7 @@ class Field:
         operators: Optional[list] = None,
         validators: Optional[list] = None,
     ) -> None:
+        self.attribute_name = None
         self.field_alias = field_alias or name
         self.query_alias = query_alias or name
         self.is_filterable = is_filterable
@@ -52,7 +52,7 @@ class Field:
         else:
             return op, []
 
-    def deserialize_values(self, values: List[str]) -> Tuple[Any, List[str]]:
+    def deserialize_values(self, values: List[str]) -> Tuple[List[Any], List[str]]:
         parsed_values = []
         for value in values:
             parsed_value, errors = self.deserialize_value(value)
@@ -79,17 +79,57 @@ class Field:
 
         return typed_value, []
 
+    def as_condition(
+        self,
+        field_alias: str,
+        operator: Op,
+        value: Union[List[str], str],
+    ) -> Condition:
+        return Condition(Column(self.query_alias or self.attribute_name), operator, value)
+
 
 class String(Field):
-    _operators = [Op.EQ, Op.NEQ, Op.IN]
+    _operators = [Op.EQ, Op.NEQ, Op.IN, Op.NOT_IN]
     _python_type = str
 
 
 class Number(Field):
-    _operators = [Op.EQ, Op.NEQ, Op.GT, Op.GTE, Op.LT, Op.LTE, Op.IN]
+    _operators = [Op.EQ, Op.NEQ, Op.GT, Op.GTE, Op.LT, Op.LTE, Op.IN, Op.NOT_IN]
     _python_type = int
 
 
+class Tag(Field):
+    _operators = [Op.EQ, Op.NEQ, Op.IN, Op.NOT_IN]
+    _negation_map = [False, True, False, True]
+    _python_type = str
+
+    def __init__(self, **kwargs):
+        kwargs.pop("operators", None)
+        return super().__init__(**kwargs)
+
+    def deserialize_operator(self, operator: str) -> Tuple[Op, List[str]]:
+        op = OPERATOR_MAP.get(operator)
+        if op is None:
+            return None, ["Operator not found."]
+        elif op not in self._operators:
+            return None, ["Operator not permitted."]
+        else:
+            return op, []
+
+    def as_condition(
+        self,
+        field_alias: str,
+        operator: Op,
+        value: Union[List[str], str],
+    ) -> Condition:
+        negated = operator not in (Op.EQ, Op.IN)
+        return filter_tag_by_value(
+            key=field_alias,
+            values=value,
+            negated=negated,
+        )
+
+
 class QueryConfig:
     def __init__(self, only: Optional[Tuple[str]] = None) -> None:
         self.fields = {}
@@ -154,10 +194,10 @@ def filter_to_condition(search_filter: SearchFilter, query_config: QueryConfig)
     """Coerce SearchFilter syntax to snuba Condition syntax."""
     # Validate field exists and is filterable.
     field_alias = search_filter.key.name
-    field = query_config.get(field_alias)
+    field = query_config.get(field_alias) or query_config.get("*")
     if field is None:
         raise ParseError(f"Invalid field specified: {field_alias}.")
-    elif not field.is_filterable:
+    if not field.is_filterable:
         raise ParseError(f'"{field_alias}" is not filterable.')
 
     # Validate strategy is correct.
@@ -172,7 +212,7 @@ def filter_to_condition(search_filter: SearchFilter, query_config: QueryConfig)
     if errors:
         raise ParseError(f"Invalid value specified: {field_alias}.")
 
-    return Condition(Column(field.query_alias or field.attribute_name), operator, value)
+    return field.as_condition(field_alias, operator, value)
 
 
 def attempt_compressed_condition(
@@ -213,3 +253,54 @@ def get_valid_sort_commands(
         raise ParseError(f"Invalid field specified: {field_name}.")
     else:
         return [OrderBy(Column(field.query_alias or field.attribute_name), strategy)]
+
+
+# Tag filtering behavior.
+
+
+def filter_tag_by_value(
+    key: str,
+    values: Union[List[str], str],
+    negated: bool = False,
+) -> Condition:
+    """Helper function that allows filtering a tag by multiple values."""
+    function = "hasAny" if isinstance(values, list) else "has"
+    expected = 0 if negated else 1
+    return Condition(
+        Function(function, parameters=[_all_values_for_tag_key(key), values]),
+        Op.EQ,
+        expected,
+    )
+
+
+def _all_values_for_tag_key(key: str) -> Function:
+    return Function(
+        "arrayFilter",
+        parameters=[
+            Lambda(
+                ["key", "mask"],
+                Function("equals", parameters=[Identifier("mask"), 1]),
+            ),
+            Column("tv"),
+            _bitmask_on_tag_key(key),
+        ],
+    )
+
+
+def _bitmask_on_tag_key(key: str) -> Function:
+    """Create a bit mask.
+
+    Returns an array where the integer 1 represents a match.
+        e.g.: [0, 0, 1, 0, 1, 0]
+    """
+    return Function(
+        "arrayMap",
+        parameters=[
+            Lambda(
+                ["index", "key"],
+                Function("equals", parameters=[Identifier("key"), key]),
+            ),
+            Function("arrayEnumerate", parameters=[Column("tk")]),
+            Column("tk"),
+        ],
+    )

+ 19 - 2
src/sentry/replays/post_process.py

@@ -1,4 +1,5 @@
-from typing import Any, Dict, Generator, List, Optional, Tuple
+import collections
+from typing import Any, Dict, Generator, List, Optional, Sequence, Tuple
 
 
 def process_raw_response(response: List[Dict[str, Any]], fields: List[str]) -> List[Dict[str, Any]]:
@@ -26,7 +27,7 @@ def generate_normalized_output(
         item["id"] = item.pop("replay_id")
         item["longestTransaction"] = 0
         item["environment"] = item.pop("agg_environment")
-        item["tags"] = dict(zip(item.pop("tags.key") or [], item.pop("tags.value") or []))
+        item["tags"] = dict_unique_list(zip(item.pop("tk") or [], item.pop("tv") or []))
         item["user"] = {
             "id": item.pop("user_id"),
             "name": item.pop("user_name"),
@@ -59,3 +60,19 @@ def generate_sorted_urls(url_groups: List[Tuple[int, List[str]]]) -> Generator[N
     """Return a flat list of ordered urls."""
     for _, url_group in sorted(url_groups, key=lambda item: item[0]):
         yield from url_group
+
+
+def dict_unique_list(items: Sequence[Tuple[str, str]]) -> Dict[str, List[str]]:
+    """Map each tag key to the list of unique values recorded for it.
+
+    There is a potential for duplicate keys to exist in the result set.  The Clickhouse tag
+    filter matches against every value stored for a key (hasAny over the key's value array),
+    so we retain all unique values for the key rather than only the first one seen.
+    This function ensures what is displayed matches what was filtered.
+    """
+    result = collections.defaultdict(set)
+    for key, value in items:
+        result[key].add(value)
+
+    for key, value in result.items():
+        result[key] = list(value)
+    return result

+ 14 - 26
src/sentry/replays/query.py

@@ -24,6 +24,7 @@ from sentry.replays.lib.query import (
     Number,
     QueryConfig,
     String,
+    Tag,
     generate_valid_conditions,
     get_valid_sort_commands,
 )
@@ -177,9 +178,17 @@ def make_select_statement() -> List[Union[Column, Function]]:
         _grouped_unique_scalar_value(column_name="device_model"),
         _grouped_unique_scalar_value(column_name="sdk_name"),
         _grouped_unique_scalar_value(column_name="sdk_version"),
-        _grouped_unique_scalar_value(column_name="tags.key"),
-        _grouped_unique_scalar_value(column_name="tags.value"),
         # Flatten array of arrays.
+        Function(
+            "groupArrayArray",
+            parameters=[Column("tags.key")],
+            alias="tk",
+        ),
+        Function(
+            "groupArrayArray",
+            parameters=[Column("tags.value")],
+            alias="tv",
+        ),
         Function(
             "arrayMap",
             parameters=[
@@ -259,30 +268,6 @@ def _grouped_unique_scalar_value(
 
 replay_url_parser_config = SearchConfig(
     numeric_keys={"duration", "countErrors", "countSegments"},
-    allowed_keys={
-        "id",
-        "projectId",
-        "platform",
-        "release",
-        "dist",
-        "duration",
-        "countErrors",
-        "countSegments",
-        "user.id",
-        "user.email",
-        "user.name",
-        "user.ipAddress",
-        "sdk.name",
-        "sdk.version",
-        "os.name",
-        "os.version",
-        "browser.name",
-        "browser.version",
-        "device.name",
-        "device.brand",
-        "device.model",
-        "device.family",
-    },
 )
 
 
@@ -313,6 +298,9 @@ class ReplayQueryConfig(QueryConfig):
     sdk_name = String(field_alias="sdk.name")
     sdk_version = String(field_alias="sdk.version")
 
+    # Tag
+    tags = Tag(field_alias="*")
+
     # Sort keys
     started_at = String(name="startedAt", is_filterable=False)
     finished_at = String(name="finishedAt", is_filterable=False)

+ 13 - 6
src/sentry/replays/testutils.py

@@ -13,7 +13,14 @@ def assert_expected_response(
         assert key in response, key
         response_value = response.pop(key)
 
-        if isinstance(response_value, list):
+        if isinstance(response_value, dict):
+            assert isinstance(value, dict)
+            for k, v in value.items():
+                if isinstance(v, list):
+                    assert sorted(response_value[k]) == sorted(v)
+                else:
+                    assert response_value[k] == v
+        elif isinstance(response_value, list):
             assert len(response_value) == len(value), f'"{response_value}" "{value}"'
             for item in response_value:
                 assert item in value, f"{key}, {item}"
@@ -76,7 +83,7 @@ def mock_expected_response(
             "name": kwargs.pop("user_name", "username"),
             "ip_address": kwargs.pop("user_ip_address", "127.0.0.1"),
         },
-        "tags": {"customtag": "is_set"},
+        "tags": kwargs.pop("tags", {}),
     }
 
 
@@ -86,6 +93,9 @@ def mock_replay(
     replay_id: str,
     **kwargs: typing.Dict[str, typing.Any],
 ) -> typing.Dict[str, typing.Any]:
+    tags = kwargs.pop("tags", {})
+    tags.update({"transaction": kwargs.pop("title", "Title")})
+
     return {
         "type": "replay_event",
         "start_time": int(timestamp.timestamp()),
@@ -99,10 +109,7 @@ def mock_replay(
                         "type": "replay_event",
                         "replay_id": replay_id,
                         "segment_id": kwargs.pop("segment_id", 0),
-                        "tags": {
-                            "customtag": "is_set",
-                            "transaction": kwargs.pop("title", "Title"),
-                        },
+                        "tags": tags,
                         "urls": kwargs.pop("urls", []),
                         "error_ids": kwargs.pop(
                             "error_ids", ["a3a62ef6-ac86-415b-83c2-416fc2f76db1"]

+ 20 - 7
tests/sentry/replays/test_organization_replay_index.py

@@ -53,6 +53,7 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                     "http://localhost:3000/",
                     "http://localhost:3000/login",
                 ],  # duplicate urls are okay,
+                tags={"test": "hello", "other": "hello"},
             )
         )
         self.store_replays(
@@ -62,6 +63,7 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 replay1_id,
                 # error_ids=[uuid.uuid4().hex, replay1_id],  # duplicate error-id
                 urls=["http://localhost:3000/"],  # duplicate urls are okay
+                tags={"test": "world", "other": "hello"},
             )
         )
 
@@ -87,6 +89,7 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 count_segments=2,
                 # count_errors=3,
                 count_errors=1,
+                tags={"test": ["hello", "world"], "other": ["hello"]},
             )
             assert_expected_response(response_data["data"][0], expected_response)
 
@@ -336,6 +339,7 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 device_brand="Apple",
                 device_family="Macintosh",
                 device_model="10",
+                tags={"a": "m", "b": "q"},
             )
         )
         self.store_replays(
@@ -351,6 +355,7 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 device_brand=None,
                 device_family=None,
                 device_model=None,
+                tags={"a": "n", "b": "o"},
             )
         )
 
@@ -376,6 +381,7 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 "device.model:10",
                 # Contains operator.
                 f"id:[{replay1_id},{uuid.uuid4().hex},{uuid.uuid4().hex}]",
+                f"!id:[{uuid.uuid4().hex}]",
                 # Or expression.
                 f"id:{replay1_id} OR id:{uuid.uuid4().hex} OR id:{uuid.uuid4().hex}",
                 # Paren wrapped expression.
@@ -384,6 +390,9 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 f"(id:{replay1_id} OR id:b) AND (duration:>15 OR id:d)",
                 # Implicit And.
                 f"(id:{replay1_id} OR id:b) OR (duration:>15 platform:javascript)",
+                # Tag filters.
+                "a:m",
+                "a:[n,o]",
             ]
 
             for query in queries:
@@ -405,6 +414,8 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 f"id:{replay1_id} AND id:b",
                 f"id:{replay1_id} AND duration:>1000",
                 "id:b OR duration:>1000",
+                "a:o",
+                "a:[o,p]",
             ]
             for query in null_queries:
                 response = self.client.get(self.url + f"?query={query}")
@@ -557,14 +568,16 @@ class OrganizationReplayIndexTest(APITestCase, ReplaysSnubaTestCase):
                 assert r["data"][0]["id"] == replay1_id, key
                 assert r["data"][1]["id"] == replay2_id, key
 
-    def test_get_replays_filter_bad_field(self):
-        """Test replays conform to the interchange format."""
-        self.create_project(teams=[self.team])
+    # No such thing as a bad field with the tag filtering behavior.
+    #
+    # def test_get_replays_filter_bad_field(self):
+    #     """Test replays conform to the interchange format."""
+    #     self.create_project(teams=[self.team])
 
-        with self.feature(REPLAYS_FEATURES):
-            response = self.client.get(self.url + "?query=xyz:a")
-            assert response.status_code == 400
-            assert b"xyz" in response.content
+    #     with self.feature(REPLAYS_FEATURES):
+    #         response = self.client.get(self.url + "?query=xyz:a")
+    #         assert response.status_code == 400
+    #         assert b"xyz" in response.content
 
     def test_get_replays_filter_bad_value(self):
         """Test replays conform to the interchange format."""