Browse Source

fix(discover): Fix user.display in aggregates (#20329)

Include the full column function in the group by clause so that we don't
get errors about ungrouped fields.

Mostly remove the string formatting for ArgValue substitutions. Using
ArgValue lets us expand function parameters into other expressions.

Refs SENTRY-GWT

Co-authored-by: evanh <evanh@users.noreply.github.com>
Co-authored-by: Tony <Zylphrex@users.noreply.github.com>
Mark Story 4 years ago
parent
commit
3b3e6a9892

+ 40 - 22
src/sentry/api/event_search.py

@@ -782,8 +782,7 @@ def convert_search_filter_to_snuba_query(search_filter, key=None):
         # allow snuba's prewhere optimizer to find this condition.
         return [name, search_filter.operator, value]
     elif name == USER_DISPLAY_ALIAS:
-        # Slice off the function without an alias.
-        user_display_expr = FIELD_ALIASES[USER_DISPLAY_ALIAS]["fields"][0][0:2]
+        user_display_expr = FIELD_ALIASES[USER_DISPLAY_ALIAS]["expression"]
 
         # Handle 'has' condition
         if search_filter.value.raw_value == "":
@@ -1179,12 +1178,8 @@ def get_filter(query=None, params=None):
 FIELD_ALIASES = {
     "project": {"fields": ["project.id"], "column_alias": "project.id"},
     "issue": {"fields": ["issue.id"], "column_alias": "issue.id"},
-    # TODO(mark) This alias doesn't work inside count_unique().
-    # The column resolution code is really convoluted and expanding
-    # it to support functions that need columns resolved inside of
-    # aggregations is complicated. Until that gets sorted user.display
-    # should not be added to the public field list/docs.
     "user.display": {
+        "expression": ["coalesce", ["user.email", "user.username", "user.ip"]],
         "fields": [["coalesce", ["user.email", "user.username", "user.ip"], "user.display"]],
         "column_alias": "user.display",
     },
@@ -1231,19 +1226,39 @@ class FunctionArg(object):
         return False
 
 
+class NullColumn(FunctionArg):
+    """
+    Convert the provided column to null so that we
+    can drop it. Used to make count() not have a
+    required argument that we ignore.
+    """
+
+    def has_default(self, params):
+        return None
+
+    def normalize(self, value):
+        return None
+
+
 class CountColumn(FunctionArg):
     def has_default(self, params):
         return None
 
     def normalize(self, value):
         if value is None:
+            raise InvalidFunctionArgument("a column is required")
+
+        if value not in FIELD_ALIASES:
             return value
 
-        # If we use an alias inside an aggregate, resolve it here
-        if value in FIELD_ALIASES:
-            value = FIELD_ALIASES[value].get("column_alias", value)
+        alias = FIELD_ALIASES[value]
 
-        return value
+        # If the alias has an expression prefer that over the column alias
+        # This enables user.display to work in aggregates
+        if "expression" in alias:
+            return alias["expression"]
+
+        return alias.get("column_alias", value)
 
 
 class NumericColumn(FunctionArg):
@@ -1410,7 +1425,7 @@ FUNCTIONS = {
         "column": [
             "multiply",
             [
-                ["floor", [["divide", [u"{column}", ArgValue("bucket_size")]]]],
+                ["floor", [["divide", [ArgValue("column"), ArgValue("bucket_size")]]]],
                 ArgValue("bucket_size"),
             ],
             None,
@@ -1420,38 +1435,35 @@ FUNCTIONS = {
     "count_unique": {
         "name": "count_unique",
         "args": [CountColumn("column")],
-        "aggregate": ["uniq", u"{column}", None],
+        "aggregate": ["uniq", ArgValue("column"), None],
         "result_type": "integer",
     },
-    # TODO(evanh) Count doesn't accept parameters in the frontend, but we support it here
-    # for backwards compatibility. Once we've migrated existing queries this should get
-    # changed to accept no parameters.
     "count": {
         "name": "count",
-        "args": [CountColumn("column")],
+        "args": [NullColumn("column")],
         "aggregate": ["count", None, None],
         "result_type": "integer",
     },
     "min": {
         "name": "min",
         "args": [NumericColumnNoLookup("column")],
-        "aggregate": ["min", u"{column}", None],
+        "aggregate": ["min", ArgValue("column"), None],
     },
     "max": {
         "name": "max",
         "args": [NumericColumnNoLookup("column")],
-        "aggregate": ["max", u"{column}", None],
+        "aggregate": ["max", ArgValue("column"), None],
     },
     "avg": {
         "name": "avg",
         "args": [DurationColumnNoLookup("column")],
-        "aggregate": ["avg", u"{column}", None],
+        "aggregate": ["avg", ArgValue("column"), None],
         "result_type": "duration",
     },
     "sum": {
         "name": "sum",
         "args": [DurationColumnNoLookup("column")],
-        "aggregate": ["sum", u"{column}", None],
+        "aggregate": ["sum", ArgValue("column"), None],
         "result_type": "duration",
     },
     # Currently only being used by the baseline PoC
@@ -1560,7 +1572,9 @@ def resolve_function(field, match=None, params=None):
         aggregate[0] = aggregate[0].format(**arguments)
         if isinstance(aggregate[1], six.string_types):
             aggregate[1] = aggregate[1].format(**arguments)
-
+        elif isinstance(aggregate[1], ArgValue):
+            arg = aggregate[1].arg
+            aggregate[1] = arguments[arg]
         if aggregate[2] is None:
             aggregate[2] = get_function_alias_with_columns(
                 function["name"], columns if not used_default else []
@@ -1744,6 +1758,10 @@ def resolve_field_list(fields, snuba_filter, auto_fields=True):
                 if column[0] == "transform":
                     # When there's a project transform, we already group by project_id
                     continue
+                if column[2] == USER_DISPLAY_ALIAS:
+                    # user.display needs to be grouped by its coalesce function
+                    groupby.append(column)
+                    continue
                 groupby.append(column[2])
             else:
                 groupby.append(column)

+ 2 - 2
src/sentry/static/sentry/app/utils/discover/fields.tsx

@@ -330,12 +330,11 @@ enum FieldKey {
   TRANSACTION_DURATION = 'transaction.duration',
   TRANSACTION_OP = 'transaction.op',
   TRANSACTION_STATUS = 'transaction.status',
-  // user.display is intentionally not here as
-  // it isn't stable and ready for customers to use just yet.
   USER_EMAIL = 'user.email',
   USER_ID = 'user.id',
   USER_IP = 'user.ip',
   USER_USERNAME = 'user.username',
+  USER_DISPLAY = 'user.display',
 }
 
 /**
@@ -410,6 +409,7 @@ export const FIELDS: Readonly<Record<FieldKey, ColumnType>> = {
   // Field alises defined in src/sentry/api/event_search.py
   [FieldKey.PROJECT]: 'string',
   [FieldKey.ISSUE]: 'string',
+  [FieldKey.USER_DISPLAY]: 'string',
 };
 
 export type FieldTag = {

+ 16 - 2
src/sentry/utils/snuba.py

@@ -752,7 +752,11 @@ def nest_groups(data, groups, aggregate_cols):
 
 def resolve_column(dataset):
     def _resolve_column(col):
-        if col is None or col.startswith("tags[") or QUOTED_LITERAL_RE.match(col):
+        if col is None:
+            return col
+        if isinstance(col, six.string_types) and (
+            col.startswith("tags[") or QUOTED_LITERAL_RE.match(col)
+        ):
             return col
 
         # Some dataset specific logic:
@@ -973,7 +977,17 @@ def resolve_snuba_aliases(snuba_filter, resolve_func, function_translations=None
         if isinstance(aggregation[1], six.string_types):
             aggregation[1] = resolve_func(aggregation[1])
         elif isinstance(aggregation[1], (set, tuple, list)):
-            aggregation[1] = [resolve_func(col) for col in aggregation[1]]
+            # The aggregation has another function call as its parameter
+            func_index = get_function_index(aggregation[1])
+            if func_index is not None:
+                # Resolve the columns on the nested function, and add a wrapping
+                # list to become a valid query expression.
+                aggregation[1] = [
+                    [aggregation[1][0], [resolve_func(col) for col in aggregation[1][1]]]
+                ]
+            else:
+                # Parameter is a list of fields.
+                aggregation[1] = [resolve_func(col) for col in aggregation[1]]
     resolved.aggregations = aggregations
 
     conditions = resolved.conditions

+ 32 - 3
tests/sentry/api/test_event_search.py

@@ -1816,11 +1816,24 @@ class ResolveFieldListTest(unittest.TestCase):
         assert result["groupby"] == [
             "title",
             "issue.id",
-            "user.display",
+            ["coalesce", ["user.email", "user.username", "user.ip"], "user.display"],
             "message",
             "project.id",
         ]
 
+    def test_field_alias_with_aggregates(self):
+        fields = ["event.type", "user.display", "count_unique(title)"]
+        result = resolve_field_list(fields, eventstore.Filter())
+        assert result["selected_columns"] == [
+            "event.type",
+            ["coalesce", ["user.email", "user.username", "user.ip"], "user.display"],
+        ]
+        assert result["aggregations"] == [["uniq", "title", "count_unique_title"]]
+        assert result["groupby"] == [
+            "event.type",
+            ["coalesce", ["user.email", "user.username", "user.ip"], "user.display"],
+        ]
+
     def test_aggregate_function_expansion(self):
         fields = ["count_unique(user)", "count(id)", "min(timestamp)"]
         result = resolve_field_list(fields, eventstore.Filter())
@@ -1833,13 +1846,16 @@ class ResolveFieldListTest(unittest.TestCase):
         ]
         assert result["groupby"] == []
 
-    @pytest.mark.xfail(reason="functions need to handle non string column replacements")
     def test_aggregate_function_complex_field_expansion(self):
         fields = ["count_unique(user.display)"]
         result = resolve_field_list(fields, eventstore.Filter())
         assert result["selected_columns"] == []
         assert result["aggregations"] == [
-            ["uniq", ["coalesce", ["user.email", "user.username", "user.ip"]], "count_unique_user"],
+            [
+                "uniq",
+                ["coalesce", ["user.email", "user.username", "user.ip"]],
+                "count_unique_user_display",
+            ],
         ]
         assert result["groupby"] == []
 
@@ -1883,6 +1899,19 @@ class ResolveFieldListTest(unittest.TestCase):
             in six.text_type(err)
         )
 
+    def test_aggregate_function_missing_parameter(self):
+        with pytest.raises(InvalidSearchQuery) as err:
+            fields = ["count_unique()"]
+            resolve_field_list(fields, eventstore.Filter())
+        assert (
+            "InvalidSearchQuery: count_unique(): column argument invalid: a column is required"
+            in six.text_type(err)
+        )
+
+        with pytest.raises(InvalidSearchQuery) as err:
+            fields = ["count_unique(  )"]
+            resolve_field_list(fields, eventstore.Filter())
+
     def test_percentile_function(self):
         fields = ["percentile(transaction.duration, 0.75)"]
         result = resolve_field_list(fields, eventstore.Filter())

+ 47 - 0
tests/snuba/api/endpoints/test_organization_events_v2.py

@@ -1505,6 +1505,53 @@ class OrganizationEventsV2EndpointTest(APITestCase, SnubaTestCase):
         result = set([r["user.display"] for r in data])
         assert result == set(["catherine", "cathy@example.com"])
 
+    def test_user_display_with_aggregates(self):
+        self.login_as(user=self.user)
+
+        project1 = self.create_project()
+        self.store_event(
+            data={
+                "event_id": "a" * 32,
+                "transaction": "/example",
+                "message": "how to make fast",
+                "timestamp": self.two_min_ago,
+                "user": {"email": "cathy@example.com"},
+            },
+            project_id=project1.id,
+        )
+
+        with self.feature(
+            {"organizations:discover-basic": True, "organizations:global-views": True}
+        ):
+            response = self.client.get(
+                self.url,
+                format="json",
+                data={
+                    "field": ["event.type", "user.display", "count_unique(title)"],
+                    "statsPeriod": "24h",
+                },
+            )
+
+            assert response.status_code == 200, response.content
+            data = response.data["data"]
+            assert len(data) == 1
+            result = set([r["user.display"] for r in data])
+            assert result == set(["cathy@example.com"])
+
+        with self.feature(
+            {"organizations:discover-basic": True, "organizations:global-views": True}
+        ):
+            response = self.client.get(
+                self.url,
+                format="json",
+                data={"field": ["event.type", "count_unique(user.display)"], "statsPeriod": "24h"},
+            )
+
+            assert response.status_code == 200, response.content
+            data = response.data["data"]
+            assert len(data) == 1
+            assert data[0]["count_unique_user_display"] == 1
+
     def test_has_transaction_status(self):
         project = self.create_project()
         data = load_data("transaction", timestamp=before_now(minutes=1))