perf(replays): query directly on replay_id uuid instead of stripped s… (#45715)

Benchmarking with our base query shows that removing the string conversion /
dash stripping on the replay_id column improves memory usage by ~26% and
speed by ~21%. The queries and their traces are pasted at the bottom.

This PR adds a new `Field` subclass, `UUIDField`, since we have to do
validation / conversion before sending our query to ClickHouse: our minimum
supported ClickHouse version lacks helpful functions such as `isUUIDOrZero`,
so we work around that in Python.

We also now have to strip dashes from replay_ids when post-processing query
results. We could do the same for error_ids and trace_ids in future PRs if
we want that optimization on those fields as well.
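
For illustration, here are the two transformations in plain Python (the UUID below is a made-up example; the real code is in the diffs that follow):

```
import uuid

# On the way in: validate and normalize whatever the client sent. Any
# accepted spelling parses to the same canonical dashed, lowercase form;
# invalid input raises ValueError.
canonical = str(uuid.UUID("A7F1BCD2E3A44F5B8C6D7E8F9A0B1C2D"))
# -> 'a7f1bcd2-e3a4-4f5b-8c6d-7e8f9a0b1c2d'

# On the way out: ClickHouse now returns the dashed form, so post-processing
# strips the dashes to keep the API's existing dashless id format.
dashless = canonical.replace("-", "")
# -> 'a7f1bcd2e3a44f5b8c6d7e8f9a0b1c2d'
```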

```
SET send_logs_level = 'trace'
```
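
(`send_logs_level = 'trace'` tells clickhouse-client to stream server-side logs, which is where the `MemoryTracker` peak-memory lines in the traces below come from.)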

```
SELECT
    project_id AS _snuba_project_id,
    replaceAll(toString(replay_id), '-', '') AS _snuba_replay_id,
    max(timestamp AS _snuba_timestamp) AS _snuba_finished_at,
    sum(length(error_ids AS _snuba_error_ids)) AS _snuba_count_errors,
    groupArray(1)(environment AS _snuba_environment)[1] AS _snuba_agg_environment,
    _snuba_replay_id,
    notEmpty(groupArray(is_archived AS _snuba_is_archived)) AS _snuba_isArchived
FROM replays_local
WHERE (_snuba_project_id IN [11276])
  AND (_snuba_timestamp < toDateTime('2023-03-11T00:30:34', 'Universal'))
  AND (_snuba_timestamp >= toDateTime('2022-03-10T00:30:34', 'Universal'))
GROUP BY
    _snuba_project_id,
    _snuba_replay_id
HAVING (min(segment_id AS _snuba_segment_id) = 0)
   AND (_snuba_finished_at < toDateTime('2023-03-11T00:30:34', 'Universal'))
   AND (_snuba_isArchived = 0)
ORDER BY _snuba_count_errors ASC
LIMIT 0, 10

Peak memory usage (for query): 612.65 MiB.
10 rows in set. Elapsed: 2.912 sec. Processed 7.94 million rows, 341.42 MB (2.73 million rows/s., 117.25 MB/s.)
```

```
SELECT
    project_id AS _snuba_project_id,
    replay_id,
    max(timestamp AS _snuba_timestamp) AS _snuba_finished_at,
    sum(length(error_ids AS _snuba_error_ids)) AS _snuba_count_errors,
    groupArray(1)(environment AS _snuba_environment)[1] AS _snuba_agg_environment,
    notEmpty(groupArray(is_archived AS _snuba_is_archived)) AS _snuba_isArchived
FROM replays_local
WHERE (_snuba_project_id IN [11276])
  AND (_snuba_timestamp < toDateTime('2023-03-11T00:30:34', 'Universal'))
  AND (_snuba_timestamp >= toDateTime('2022-03-10T00:30:34', 'Universal'))
GROUP BY
    _snuba_project_id,
    replay_id
HAVING (min(segment_id AS _snuba_segment_id) = 0)
   AND (_snuba_finished_at < toDateTime('2023-03-11T00:30:34', 'Universal'))
   AND (_snuba_isArchived = 0)
ORDER BY _snuba_count_errors ASC
LIMIT 0, 10

MemoryTracker: Peak memory usage (for query): 448.65 MiB
10 rows in set. Elapsed: 1.711 sec. Processed 7.94 million rows, 341.42 MB (4.64 million rows/s., 199.60 MB/s.)
```
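
Side by side, these runs show peak memory dropping from 612.65 MiB to 448.65 MiB and elapsed time from 2.912 s to 1.711 s over the same 7.94 million rows, with throughput rising from 2.73 to 4.64 million rows/s.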
Josh Ferge · 2 years ago · commit c90915bdfe

+ 1 - 1
src/sentry/replays/endpoints/project_replay_details.py

@@ -36,7 +36,7 @@ class ProjectReplayDetailsEndpoint(ProjectEndpoint):
         filter_params = self.get_filter_params(request, project)
 
         try:
-            uuid.UUID(replay_id)
+            replay_id = str(uuid.UUID(replay_id))
         except ValueError:
             return Response(status=404)
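
The change above is subtle but load-bearing: `uuid.UUID(replay_id)` was previously called only to validate the path parameter, discarding the result; assigning it back means the rest of the endpoint works with the canonical dashed, lowercase form no matter how the client spelled the id.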
 

+ 27 - 0
src/sentry/replays/lib/query.py

@@ -92,6 +92,33 @@ class Field:
         return Condition(Column(self.query_alias or self.attribute_name), operator, value)
 
 
+class UUIDField(Field):
+    """
+    As our minimum supported ClickHouse version is 20.3, we don't have the
+    isUUIDOrZero function, so we must validate the UUID before supplying it to ClickHouse.
+    """
+
+    _operators = [Op.EQ, Op.NEQ, Op.IN, Op.NOT_IN]
+    _python_type = str
+
+    def as_condition(
+        self, field_alias: str, operator: Op, value: Union[List[str], str], is_wildcard: bool
+    ) -> Condition:
+        if isinstance(value, list):
+            uuids = _transform_uuids(value)
+            if uuids is None:
+                return Condition(Function("identity", parameters=[1]), Op.EQ, 2)
+            value = uuids
+        else:
+            uuids = _transform_uuids([value])
+            if uuids is None:
+                return Condition(Function("identity", parameters=[1]), Op.EQ, 2)
+
+            value = uuids[0]
+
+        return super().as_condition(field_alias, operator, value)
+
+
 class String(Field):
     _operators = [Op.EQ, Op.NEQ, Op.IN, Op.NOT_IN]
     _python_type = str
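
Two details worth noting in the hunk above. First, `Condition(Function("identity", parameters=[1]), Op.EQ, 2)` compiles to `identity(1) = 2`, which is always false, so a filter on an unparseable UUID matches no rows instead of raising a ClickHouse error. Second, the `_transform_uuids` helper it relies on is not shown in this diff; a minimal sketch of its contract, assuming it normalizes every value and returns `None` if any value fails to parse:

```
from typing import List, Optional
import uuid


def _transform_uuids(values: List[str]) -> Optional[List[str]]:
    # Normalize each value to the canonical dashed form; one bad value
    # invalidates the whole list so the caller can emit a match-nothing
    # condition rather than sending garbage to ClickHouse.
    try:
        return [str(uuid.UUID(value)) for value in values]
    except ValueError:
        return None
```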

+ 7 - 1
src/sentry/replays/post_process.py

@@ -19,12 +19,18 @@ def generate_restricted_fieldset(
         yield from response
 
 
+def _strip_dashes(field):
+    if field:
+        return field.replace("-", "")
+    return field
+
+
 def generate_normalized_output(
     response: List[Dict[str, Any]]
 ) -> Generator[None, None, Dict[str, Any]]:
     """For each payload in the response strip "agg_" prefixes."""
     for item in response:
-        item["id"] = item.pop("replay_id", None)
+        item["id"] = _strip_dashes(item.pop("replay_id", None))
         item["project_id"] = str(item["project_id"])
         item["trace_ids"] = item.pop("traceIds", [])
         item["error_ids"] = item.pop("errorIds", [])

+ 3 - 2
src/sentry/replays/query.py

@@ -29,6 +29,7 @@ from sentry.replays.lib.query import (
     QueryConfig,
     String,
     Tag,
+    UUIDField,
     all_values_for_tag_key,
     generate_valid_conditions,
     get_valid_sort_commands,
@@ -393,7 +394,7 @@ class ReplayQueryConfig(QueryConfig):
     activity = Number()
 
     # String filters.
-    replay_id = String(field_alias="id")
+    replay_id = UUIDField(field_alias="id")
     replay_type = String(query_alias="replay_type")
     platform = String()
     releases = ListField()
@@ -585,7 +586,7 @@ FIELD_QUERY_ALIAS_MAP: Dict[str, List[str]] = {
 # match the column's query alias.
 
 QUERY_ALIAS_COLUMN_MAP = {
-    "replay_id": _strip_uuid_dashes("replay_id", Column("replay_id")),
+    "replay_id": Column("replay_id"),
     "project_id": Column("project_id"),
     "trace_ids": Function(
         "arrayMap",