@@ -1,19 +1,15 @@
import re
-from collections import defaultdict, namedtuple
-from copy import deepcopy
+from collections import namedtuple
from datetime import datetime
from django.utils.functional import cached_property
-from parsimonious.exceptions import IncompleteParseError, ParseError
+from parsimonious.exceptions import IncompleteParseError
from parsimonious.expressions import Optional
from parsimonious.grammar import Grammar, NodeVisitor
from parsimonious.nodes import Node, RegexNode
-from sentry_relay.consts import SPAN_STATUS_NAME_TO_CODE
-from sentry import eventstore
-from sentry.discover.models import KeyTransaction
-from sentry.models import Project
-from sentry.models.group import Group
+from sentry.search.events.constants import KEY_TRANSACTION_ALIAS, SEARCH_MAP, TAG_KEY_RE
+from sentry.search.events.fields import FIELD_ALIASES, FUNCTIONS, InvalidSearchQuery, resolve_field
from sentry.search.utils import (
@@ -22,22 +18,9 @@ from sentry.search.utils import (
- parse_release,
-from sentry.snuba.dataset import Dataset
-from sentry.utils.compat import filter, map, zip
-from sentry.utils.dates import to_timestamp
-from sentry.utils.snuba import (
- get_json_type,
- is_duration_measurement,
- is_measurement,
- is_span_op_breakdown,
+from sentry.utils.compat import filter, map
+from sentry.utils.snuba import is_duration_measurement, is_measurement, is_span_op_breakdown
WILDCARD_CHARS = re.compile(r"[\*]")
@@ -48,10 +31,6 @@ NEGATION_MAP = {
">=": "<",
"IN": "NOT IN",
-equality_operators = frozenset(["=", "IN"])
-inequality_operators = frozenset(["!=", "NOT IN"])
-RESULT_TYPES = {"duration", "string", "number", "integer", "percentage", "date"}
def translate(pat):
@@ -196,49 +175,6 @@ spaces = ~r"\ *"
-# Create the known set of fields from the issue properties
-# and the transactions and events dataset mapping definitions.
- "start": "start",
- "end": "end",
- "project_id": "project_id",
- "first_seen": "first_seen",
- "last_seen": "last_seen",
- "times_seen": "times_seen",
-no_conversion = {"start", "end"}
-PROJECT_NAME_ALIAS = "project.name"
-PROJECT_ALIAS = "project"
-ISSUE_ALIAS = "issue"
-ISSUE_ID_ALIAS = "issue.id"
-RELEASE_ALIAS = "release"
-USER_DISPLAY_ALIAS = "user.display"
-ERROR_UNHANDLED_ALIAS = "error.unhandled"
-KEY_TRANSACTION_ALIAS = "key_transaction"
- "error.mechanism",
- "error.type",
- "error.value",
- "stack.abs_path",
- "stack.colno",
- "stack.filename",
- "stack.function",
- "stack.in_app",
- "stack.lineno",
- "stack.module",
- "stack.package",
- "stack.stack_level",
-class InvalidSearchQuery(Exception):
- pass
class SearchBoolean(namedtuple("SearchBoolean", "left_term operator right_term")):
@@ -841,2171 +777,3 @@ def parse_search_query(query, allow_boolean=True, params=None):
return SearchVisitor(allow_boolean, params=params).visit(tree)
-def convert_aggregate_filter_to_snuba_query(aggregate_filter, params):
- name = aggregate_filter.key.name
- value = aggregate_filter.value.value
- if params is not None and name in params.get("aliases", {}):
- return params["aliases"][name].converter(aggregate_filter)
- value = (
- int(to_timestamp(value)) if isinstance(value, datetime) and name != "timestamp" else value
- )
- if aggregate_filter.operator in ("=", "!=") and aggregate_filter.value.value == "":
- return [["isNull", [name]], aggregate_filter.operator, 1]
- function = resolve_field(name, params, functions_acl=FUNCTIONS.keys())
- if function.aggregate is not None:
- name = function.aggregate[-1]
- condition = [name, aggregate_filter.operator, value]
- return condition
-def translate_transaction_status(val):
- if val not in SPAN_STATUS_NAME_TO_CODE:
- raise InvalidSearchQuery(
- f"Invalid value {val} for transaction.status condition. Accepted "
- f"values are {', '.join(SPAN_STATUS_NAME_TO_CODE.keys())}"
- )
-def convert_search_filter_to_snuba_query(search_filter, key=None, params=None):
- name = search_filter.key.name if key is None else key
- value = search_filter.value.value
- # We want to use group_id elsewhere so shouldn't be removed from the dataset
- # but if a user has a tag with the same name we want to make sure that works
- if name in {"group_id"}:
- name = f"tags[{name}]"
- if name in no_conversion:
- return
- elif name == "id" and search_filter.value.is_wildcard():
- raise InvalidSearchQuery("Wildcard conditions are not permitted on `id` field.")
- elif name == "environment":
- # conditions added to env_conditions are OR'd
- env_conditions = []
- values = set(value if isinstance(value, (list, tuple)) else [value])
- # the "no environment" environment is null in snuba
- if "" in values:
- values.remove("")
- operator = "IS NULL" if search_filter.operator == "=" else "IS NOT NULL"
- env_conditions.append(["environment", operator, None])
- if len(values) == 1:
- operator = "=" if search_filter.operator in equality_operators else "!="
- env_conditions.append(["environment", operator, values.pop()])
- elif values:
- operator = "IN" if search_filter.operator in equality_operators else "NOT IN"
- env_conditions.append(["environment", operator, values])
- return env_conditions
- elif name == "message":
- if search_filter.value.is_wildcard():
- # XXX: We don't want the '^$' values at the beginning and end of
- # the regex since we want to find the pattern anywhere in the
- # message. Strip off here
- value = search_filter.value.value[1:-1]
- return [["match", ["message", f"'(?i){value}'"]], search_filter.operator, 1]
- elif value == "":
- operator = "=" if search_filter.operator == "=" else "!="
- return [["equals", ["message", f"{value}"]], operator, 1]
- else:
- # https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/#position-haystack-needle
- # positionCaseInsensitive returns 0 if not found and an index of 1 or more if found
- # so we should flip the operator here
- operator = "!=" if search_filter.operator in equality_operators else "="
- if search_filter.is_in_filter:
- # XXX: This `toString` usage is unnecessary, but we need it in place to
- # trick the legacy Snuba language into not treating `message` as a
- # function. Once we switch over to snql it can be removed.
- return [
- [
- "multiSearchFirstPositionCaseInsensitive",
- [["toString", ["message"]], ["array", [f"'{v}'" for v in value]]],
- ],
- operator,
- 0,
- ]
- # make message search case insensitive
- return [["positionCaseInsensitive", ["message", f"'{value}'"]], operator, 0]
- elif name in ARRAY_FIELDS and search_filter.value.is_wildcard():
- # Escape and convert meta characters for LIKE expressions.
- raw_value = search_filter.value.raw_value
- like_value = raw_value.replace("%", "\\%").replace("_", "\\_").replace("*", "%")
- operator = "LIKE" if search_filter.operator == "=" else "NOT LIKE"
- return [name, operator, like_value]
- elif name in ARRAY_FIELDS and search_filter.is_in_filter:
- operator = "=" if search_filter.operator == "IN" else "!="
- # XXX: This `arrayConcat` usage is unnecessary, but we need it in place to
- # trick the legacy Snuba language into not treating `name` as a
- # function. Once we switch over to snql it can be removed.
- return [
- ["hasAny", [["arrayConcat", [name]], ["array", [f"'{v}'" for v in value]]]],
- operator,
- 1,
- ]
- elif name == "transaction.status":
- # Handle "has" queries
- if search_filter.value.raw_value == "":
- return [["isNull", [name]], search_filter.operator, 1]
- if search_filter.is_in_filter:
- internal_value = [
- translate_transaction_status(val) for val in search_filter.value.raw_value
- ]
- else:
- internal_value = translate_transaction_status(search_filter.value.raw_value)
- return [name, search_filter.operator, internal_value]
- elif name == "issue.id":
- # Handle "has" queries
- if (
- search_filter.value.raw_value == ""
- or search_filter.is_in_filter
- and [v for v in value if not v]
- ):
- # The state of having no issues is represented differently on transactions vs
- # other events. On the transactions table, it is represented by 0 whereas it is
- # represented by NULL everywhere else. We use coalesce here so we can treat this
- # consistently
- name = ["coalesce", [name, 0]]
- if search_filter.is_in_filter:
- value = [v if v else 0 for v in value]
- else:
- value = 0
- # Skip isNull check on group_id value as we want to
- # allow snuba's prewhere optimizer to find this condition.
- return [name, search_filter.operator, value]
- elif name == USER_DISPLAY_ALIAS:
- user_display_expr = FIELD_ALIASES[USER_DISPLAY_ALIAS].get_expression(params)
- # Handle 'has' condition
- if search_filter.value.raw_value == "":
- return [["isNull", [user_display_expr]], search_filter.operator, 1]
- if search_filter.value.is_wildcard():
- return [
- ["match", [user_display_expr, f"'(?i){value}'"]],
- search_filter.operator,
- 1,
- ]
- return [user_display_expr, search_filter.operator, value]
- # This field is the inversion of error.handled, otherwise the logic is the same.
- if search_filter.value.raw_value == "":
- output = 0 if search_filter.operator == "!=" else 1
- return [["isHandled", []], "=", output]
- if value in ("1", 1):
- return [["notHandled", []], "=", 1]
- if value in ("0", 0):
- return [["isHandled", []], "=", 1]
- raise InvalidSearchQuery(
- "Invalid value for error.unhandled condition. Accepted values are 1, 0"
- )
- elif name == "error.handled":
- # Treat has filter as equivalent to handled
- if search_filter.value.raw_value == "":
- output = 1 if search_filter.operator == "!=" else 0
- return [["isHandled", []], "=", output]
- # Null values and 1 are the same, and both indicate a handled error.
- if value in ("1", 1):
- return [["isHandled", []], "=", 1]
- if value in (
- "0",
- 0,
- ):
- return [["notHandled", []], "=", 1]
- raise InvalidSearchQuery(
- "Invalid value for error.handled condition. Accepted values are 1, 0"
- )
- key_transaction_expr = FIELD_ALIASES[KEY_TRANSACTION_ALIAS].get_expression(params)
- if search_filter.value.raw_value == "":
- operator = "!=" if search_filter.operator == "!=" else "="
- return [key_transaction_expr, operator, 0]
- if value in ("1", 1):
- return [key_transaction_expr, "=", 1]
- if value in ("0", 0):
- return [key_transaction_expr, "=", 0]
- raise InvalidSearchQuery(
- "Invalid value for key_transaction condition. Accepted values are 1, 0"
- )
- elif name in ARRAY_FIELDS and search_filter.value.raw_value == "":
- return [["notEmpty", [name]], "=", 1 if search_filter.operator == "!=" else 0]
- else:
- # timestamp{,.to_{hour,day}} need a datetime string
- # last_seen needs an integer
- if isinstance(value, datetime) and name not in {
- "timestamp",
- "timestamp.to_hour",
- "timestamp.to_day",
- }:
- value = int(to_timestamp(value)) * 1000
- # most field aliases are handled above but timestamp.to_{hour,day} are
- # handled here
- if name in FIELD_ALIASES:
- name = FIELD_ALIASES[name].get_expression(params)
- # Tags are never null, but promoted tags are columns and so can be null.
- # To handle both cases, use `ifNull` to convert to an empty string and
- # compare so we need to check for empty values.
- if search_filter.key.is_tag:
- name = ["ifNull", [name, "''"]]
- # Handle checks for existence
- if search_filter.operator in ("=", "!=") and search_filter.value.value == "":
- if search_filter.key.is_tag:
- return [name, search_filter.operator, value]
- else:
- # If not a tag, we can just check that the column is null.
- return [["isNull", [name]], search_filter.operator, 1]
- is_null_condition = None
- # TODO(wmak): Skip this for all non-nullable keys not just event.type
- if (
- search_filter.operator in ("!=", "NOT IN")
- and not search_filter.key.is_tag
- and name != "event.type"
- ):
- # Handle null columns on inequality comparisons. Any comparison
- # between a value and a null will result to null, so we need to
- # explicitly check for whether the condition is null, and OR it
- # together with the inequality check.
- # We don't need to apply this for tags, since if they don't exist
- # they'll always be an empty string.
- is_null_condition = [["isNull", [name]], "=", 1]
- if search_filter.value.is_wildcard():
- condition = [["match", [name, f"'(?i){value}'"]], search_filter.operator, 1]
- else:
- condition = [name, search_filter.operator, value]
- # We only want to return as a list if we have the check for null
- # present. Returning as a list causes these conditions to be ORed
- # together. Otherwise just return the raw condition, so that it can be
- # used correctly in aggregates.
- if is_null_condition:
- return [is_null_condition, condition]
- else:
- return condition
-def to_list(value):
- if isinstance(value, list):
- return value
- return [value]
-def format_search_filter(term, params):
- projects_to_filter = [] # Used to avoid doing multiple conditions on project ID
- conditions = []
- group_ids = None
- name = term.key.name
- value = term.value.value
- if term.operator == "=" and value == "":
- raise InvalidSearchQuery("Invalid query for 'has' search: 'project' cannot be empty.")
- slugs = to_list(value)
- projects = {
- p.slug: p.id
- for p in Project.objects.filter(id__in=params.get("project_id", []), slug__in=slugs)
- }
- missing = [slug for slug in slugs if slug not in projects]
- if missing and term.operator in equality_operators:
- raise InvalidSearchQuery(
- f"Invalid query. Project(s) {', '.join(missing)} do not exist or are not actively selected."
- )
- project_ids = list(sorted(projects.values()))
- if project_ids:
- # Create a new search filter with the correct values
- term = SearchFilter(
- SearchKey("project_id"),
- term.operator,
- SearchValue(project_ids if term.is_in_filter else project_ids[0]),
- )
- converted_filter = convert_search_filter_to_snuba_query(term)
- if converted_filter:
- if term.operator in equality_operators:
- projects_to_filter = project_ids
- conditions.append(converted_filter)
- elif name == ISSUE_ID_ALIAS and value != "":
- # A blank term value means that this is a has filter
- group_ids = to_list(value)
- elif name == ISSUE_ALIAS:
- operator = term.operator
- value = to_list(value)
- # `unknown` is a special value for when there is no issue associated with the event
- group_short_ids = [v for v in value if v and v != "unknown"]
- filter_values = ["" for v in value if not v or v == "unknown"]
- if group_short_ids and params and "organization_id" in params:
- try:
- groups = Group.objects.by_qualified_short_id_bulk(
- params["organization_id"],
- group_short_ids,
- )
- except Exception:
- raise InvalidSearchQuery(f"Invalid value '{group_short_ids}' for 'issue:' filter")
- else:
- filter_values.extend([g.id for g in groups])
- term = SearchFilter(
- SearchKey("issue.id"),
- operator,
- SearchValue(filter_values if term.is_in_filter else filter_values[0]),
- )
- converted_filter = convert_search_filter_to_snuba_query(term)
- conditions.append(converted_filter)
- elif (
- and params
- and (value == "latest" or term.is_in_filter and any(v == "latest" for v in value))
- ):
- value = [
- parse_release(
- v,
- params["project_id"],
- params.get("environment_objects"),
- params.get("organization_id"),
- )
- for v in to_list(value)
- ]
- converted_filter = convert_search_filter_to_snuba_query(
- SearchFilter(
- term.key,
- term.operator,
- SearchValue(value if term.is_in_filter else value[0]),
- )
- )
- if converted_filter:
- conditions.append(converted_filter)
- else:
- converted_filter = convert_search_filter_to_snuba_query(term, params=params)
- if converted_filter:
- conditions.append(converted_filter)
- return conditions, projects_to_filter, group_ids
-def convert_condition_to_function(cond):
- function = OPERATOR_TO_FUNCTION.get(cond[1])
- if not function:
- # It's hard to make this error more specific without exposing internals to the end user
- raise InvalidSearchQuery(f"Operator {cond[1]} is not a valid condition operator.")
- return [function, [cond[0], cond[2]]]
-def convert_function_to_condition(func):
- operator = FUNCTION_TO_OPERATOR.get(func[0])
- if not operator:
- return [func, "=", 1]
- return [func[1][0], operator, func[1][1]]
-def convert_array_to_tree(operator, terms):
- """
- Convert an array of conditions into a binary tree joined by the operator.
- """
- if len(terms) == 1:
- return terms[0]
- elif len(terms) == 2:
- return [operator, terms]
- return [operator, [terms[0], convert_array_to_tree(operator, terms[1:])]]
-def flatten_condition_tree(tree, condition_function):
- """
- Take a binary tree of conditions, and flatten all of the terms using the condition function.
- E.g. f( and(and(b, c), and(d, e)), and ) -> [b, c, d, e]
- """
- stack = [tree]
- flattened = []
- while len(stack) > 0:
- item = stack.pop(0)
- if item[0] == condition_function:
- stack.extend(item[1])
- else:
- flattened.append(item)
- return flattened
-def is_condition(term):
- return isinstance(term, (tuple, list)) and len(term) == 3 and term[1] in OPERATOR_TO_FUNCTION
-def convert_snuba_condition_to_function(term, params=None):
- if isinstance(term, ParenExpression):
- return convert_search_boolean_to_snuba_query(term.children, params)
- group_ids = []
- projects_to_filter = []
- if isinstance(term, SearchFilter):
- conditions, projects_to_filter, group_ids = format_search_filter(term, params)
- group_ids = group_ids if group_ids else []
- if conditions:
- conditions_to_and = []
- for cond in conditions:
- if is_condition(cond):
- conditions_to_and.append(convert_condition_to_function(cond))
- else:
- conditions_to_and.append(
- convert_array_to_tree(
- SNUBA_OR, [convert_condition_to_function(c) for c in cond]
- )
- )
- condition_tree = None
- if len(conditions_to_and) == 1:
- condition_tree = conditions_to_and[0]
- elif len(conditions_to_and) > 1:
- condition_tree = convert_array_to_tree(SNUBA_AND, conditions_to_and)
- return condition_tree, None, projects_to_filter, group_ids
- elif isinstance(term, AggregateFilter):
- converted_filter = convert_aggregate_filter_to_snuba_query(term, params)
- return None, convert_condition_to_function(converted_filter), projects_to_filter, group_ids
- return None, None, projects_to_filter, group_ids
-def convert_search_boolean_to_snuba_query(terms, params=None):
- if len(terms) == 1:
- return convert_snuba_condition_to_function(terms[0], params)
- # Filter out any ANDs since we can assume anything without an OR is an AND. Also do some
- # basic sanitization of the query: can't have two operators next to each other, and can't
- # start or end a query with an operator.
- prev = None
- new_terms = []
- for term in terms:
- if prev:
- if SearchBoolean.is_operator(prev) and SearchBoolean.is_operator(term):
- raise InvalidSearchQuery(
- f"Missing condition in between two condition operators: '{prev} {term}'"
- )
- else:
- if SearchBoolean.is_operator(term):
- raise InvalidSearchQuery(
- f"Condition is missing on the left side of '{term}' operator"
- )
- if term != SearchBoolean.BOOLEAN_AND:
- new_terms.append(term)
- prev = term
- if SearchBoolean.is_operator(term):
- raise InvalidSearchQuery(f"Condition is missing on the right side of '{term}' operator")
- terms = new_terms
- # We put precedence on AND, which sort of counter-intuitevely means we have to split the query
- # on ORs first, so the ANDs are grouped together. Search through the query for ORs and split the
- # query on each OR.
- # We want to maintain a binary tree, so split the terms on the first OR we can find and recurse on
- # the two sides. If there is no OR, split the first element out to AND
- index = None
- lhs, rhs = None, None
- operator = None
- try:
- index = terms.index(SearchBoolean.BOOLEAN_OR)
- lhs, rhs = terms[:index], terms[index + 1 :]
- operator = SNUBA_OR
- except Exception:
- lhs, rhs = terms[:1], terms[1:]
- operator = SNUBA_AND
- (
- lhs_condition,
- lhs_having,
- projects_to_filter,
- group_ids,
- ) = convert_search_boolean_to_snuba_query(lhs, params)
- (
- rhs_condition,
- rhs_having,
- rhs_projects_to_filter,
- rhs_group_ids,
- ) = convert_search_boolean_to_snuba_query(rhs, params)
- projects_to_filter.extend(rhs_projects_to_filter)
- group_ids.extend(rhs_group_ids)
- if operator == SNUBA_OR and (lhs_condition or rhs_condition) and (lhs_having or rhs_having):
- raise InvalidSearchQuery(
- "Having an OR between aggregate filters and normal filters is invalid."
- )
- condition, having = None, None
- if lhs_condition or rhs_condition:
- args = filter(None, [lhs_condition, rhs_condition])
- if not args:
- condition = None
- elif len(args) == 1:
- condition = args[0]
- else:
- condition = [operator, args]
- if lhs_having or rhs_having:
- args = filter(None, [lhs_having, rhs_having])
- if not args:
- having = None
- elif len(args) == 1:
- having = args[0]
- else:
- having = [operator, args]
- return condition, having, projects_to_filter, group_ids
-def get_filter(query=None, params=None):
- """
- Returns an eventstore filter given the search text provided by the user and
- URL params
- """
- # NOTE: this function assumes project permissions check already happened
- parsed_terms = []
- if query is not None:
- try:
- parsed_terms = parse_search_query(query, allow_boolean=True, params=params)
- except ParseError as e:
- raise InvalidSearchQuery(f"Parse error: {e.expr.name} (column {e.column():d})")
- kwargs = {
- "start": None,
- "end": None,
- "conditions": [],
- "having": [],
- "user_id": None,
- "organization_id": None,
- "project_ids": [],
- "group_ids": [],
- "condition_aggregates": [],
- "aliases": params.get("aliases", {}) if params is not None else {},
- }
- projects_to_filter = []
- if any(
- isinstance(term, ParenExpression) or SearchBoolean.is_operator(term)
- for term in parsed_terms
- ):
- (
- condition,
- having,
- found_projects_to_filter,
- group_ids,
- ) = convert_search_boolean_to_snuba_query(parsed_terms, params)
- if condition:
- and_conditions = flatten_condition_tree(condition, SNUBA_AND)
- for func in and_conditions:
- kwargs["conditions"].append(convert_function_to_condition(func))
- if having:
- kwargs["condition_aggregates"] = [
- term.key.name for term in parsed_terms if isinstance(term, AggregateFilter)
- ]
- and_having = flatten_condition_tree(having, SNUBA_AND)
- for func in and_having:
- kwargs["having"].append(convert_function_to_condition(func))
- if found_projects_to_filter:
- projects_to_filter = list(set(found_projects_to_filter))
- if group_ids is not None:
- kwargs["group_ids"].extend(list(set(group_ids)))
- else:
- projects_to_filter = set()
- for term in parsed_terms:
- if isinstance(term, SearchFilter):
- conditions, found_projects_to_filter, group_ids = format_search_filter(term, params)
- if len(conditions) > 0:
- kwargs["conditions"].extend(conditions)
- if found_projects_to_filter:
- projects_to_filter.update(found_projects_to_filter)
- if group_ids is not None:
- kwargs["group_ids"].extend(group_ids)
- elif isinstance(term, AggregateFilter):
- converted_filter = convert_aggregate_filter_to_snuba_query(term, params)
- kwargs["condition_aggregates"].append(term.key.name)
- if converted_filter:
- kwargs["having"].append(converted_filter)
- projects_to_filter = list(projects_to_filter)
- # Keys included as url params take precedent if same key is included in search
- # They are also considered safe and to have had access rules applied unlike conditions
- # from the query string.
- if params:
- for key in ("start", "end"):
- kwargs[key] = params.get(key, None)
- # OrganizationEndpoint.get_filter() uses project_id, but eventstore.Filter uses project_ids
- if "user_id" in params:
- kwargs["user_id"] = params["user_id"]
- if "organization_id" in params:
- kwargs["organization_id"] = params["organization_id"]
- if "project_id" in params:
- if projects_to_filter:
- kwargs["project_ids"] = projects_to_filter
- else:
- kwargs["project_ids"] = params["project_id"]
- if "environment" in params:
- term = SearchFilter(SearchKey("environment"), "=", SearchValue(params["environment"]))
- kwargs["conditions"].append(convert_search_filter_to_snuba_query(term))
- if "group_ids" in params:
- kwargs["group_ids"] = to_list(params["group_ids"])
- # Deprecated alias, use `group_ids` instead
- if ISSUE_ID_ALIAS in params:
- kwargs["group_ids"] = to_list(params["issue.id"])
- return eventstore.Filter(**kwargs)
-class PseudoField:
- def __init__(self, name, alias, expression=None, expression_fn=None, result_type=None):
- self.name = name
- self.alias = alias
- self.expression = expression
- self.expression_fn = expression_fn
- self.result_type = result_type
- self.validate()
- def get_expression(self, params):
- if isinstance(self.expression, (list, tuple)):
- return deepcopy(self.expression)
- elif self.expression_fn is not None:
- return self.expression_fn(params)
- return None
- def get_field(self, params=None):
- expression = self.get_expression(params)
- if expression is not None:
- expression.append(self.alias)
- return expression
- return self.alias
- def validate(self):
- assert self.alias is not None, f"{self.name}: alias is required"
- assert (
- self.expression is None or self.expression_fn is None
- ), f"{self.name}: only one of expression, expression_fn is allowed"
-def key_transaction_expression(user_id, organization_id, project_ids):
- """
- This function may be called multiple times, making for repeated data bases queries.
- Lifting the query higher to earlier in the call stack will require a lot more changes
- as there are numerous entry points. So we will leave the duplicate query alone for now.
- """
- if user_id is None or organization_id is None or project_ids is None:
- raise InvalidSearchQuery("Missing necessary meta for key transaction field.")
- key_transactions = (
- KeyTransaction.objects.filter(
- owner_id=user_id,
- organization_id=organization_id,
- project_id__in=project_ids,
- )
- .order_by("transaction", "project_id")
- .values("project_id", "transaction")
- )
- # if there are no key transactions, the value should always be 0
- if not len(key_transactions):
- return ["toInt64", [0]]
- return [
- "has",
- [
- [
- "array",
- [
- [
- "tuple",
- [
- ["toUInt64", [transaction["project_id"]]],
- "'{}'".format(transaction["transaction"]),
- ],
- ]
- for transaction in key_transactions
- ],
- ],
- ["tuple", ["project_id", "transaction"]],
- ],
- ]
-# When updating this list, also check if the following need to be updated:
-# - convert_search_filter_to_snuba_query (otherwise aliased field will be treated as tag)
-# - static/app/utils/discover/fields.tsx FIELDS (for discover column list and search box autocomplete)
- field.name: field
- for field in [
- PseudoField("project", "project.id"),
- PseudoField("issue", "issue.id"),
- PseudoField(
- "timestamp.to_hour", "timestamp.to_hour", expression=["toStartOfHour", ["timestamp"]]
- ),
- PseudoField(
- "timestamp.to_day", "timestamp.to_day", expression=["toStartOfDay", ["timestamp"]]
- ),
- PseudoField(ERROR_UNHANDLED_ALIAS, ERROR_UNHANDLED_ALIAS, expression=["notHandled", []]),
- PseudoField(
- expression=["coalesce", ["user.email", "user.username", "user.ip"]],
- ),
- # the key transaction field is intentially not added to the discover/fields list yet
- # because there needs to be some work on the front end to integrate this into discover
- PseudoField(
- expression_fn=lambda params: key_transaction_expression(
- params.get("user_id"),
- params.get("organization_id"),
- params.get("project_id"),
- ),
- result_type="boolean",
- ),
- ]
-def get_json_meta_type(field_alias, snuba_type, function=None):
- alias_definition = FIELD_ALIASES.get(field_alias)
- if alias_definition and alias_definition.result_type is not None:
- return alias_definition.result_type
- snuba_json = get_json_type(snuba_type)
- if snuba_json != "string":
- if function is not None:
- result_type = function.instance.get_result_type(function.field, function.arguments)
- if result_type is not None:
- return result_type
- function_match = FUNCTION_ALIAS_PATTERN.match(field_alias)
- if function_match:
- function_definition = FUNCTIONS.get(function_match.group(1))
- if function_definition:
- result_type = function_definition.get_result_type()
- if result_type is not None:
- return result_type
- if (
- "duration" in field_alias
- or is_duration_measurement(field_alias)
- or is_span_op_breakdown(field_alias)
- ):
- return "duration"
- if is_measurement(field_alias):
- return "number"
- if field_alias == "transaction.status":
- return "string"
- return snuba_json
-# Based on general/src/protocol/tags.rs in relay
-VALID_FIELD_PATTERN = re.compile(r"^[a-zA-Z0-9_.:-]*$")
-# The regex for alias here is to match any word, but exclude anything that is only digits
-# eg. 123 doesn't match, but test_123 will match
-ALIAS_REGEX = r"(\w+)?(?!\d+)\w+"
-ALIAS_PATTERN = re.compile(fr"{ALIAS_REGEX}$")
-FUNCTION_PATTERN = re.compile(
- fr"^(?P<function>[^\(]+)\((?P<columns>.*)\)( (as|AS) (?P<alias>{ALIAS_REGEX}))?$"
-class InvalidFunctionArgument(Exception):
- pass
-class ArgValue:
- def __init__(self, arg):
- self.arg = arg
-class FunctionArg:
- def __init__(self, name):
- self.name = name
- self.has_default = False
- def get_default(self, params):
- raise InvalidFunctionArgument(f"{self.name} has no defaults")
- def normalize(self, value, params):
- return value
- def get_type(self, value):
- raise InvalidFunctionArgument(f"{self.name} has no type defined")
-class FunctionAliasArg(FunctionArg):
- def normalize(self, value, params):
- if not ALIAS_PATTERN.match(value):
- raise InvalidFunctionArgument(f"{value} is not a valid function alias")
- return value
-class NullColumn(FunctionArg):
- """
- Convert the provided column to null so that we
- can drop it. Used to make count() not have a
- required argument that we ignore.
- """
- def __init__(self, name):
- super().__init__(name)
- self.has_default = True
- def get_default(self, params):
- return None
- def normalize(self, value, params):
- return None
-class CountColumn(FunctionArg):
- def __init__(self, name):
- super().__init__(name)
- self.has_default = True
- def get_default(self, params):
- return None
- def normalize(self, value, params):
- if value is None:
- raise InvalidFunctionArgument("a column is required")
- if value not in FIELD_ALIASES:
- return value
- field = FIELD_ALIASES[value]
- # If the alias has an expression prefer that over the column alias
- # This enables user.display to work in aggregates
- expression = field.get_expression(params)
- if expression is not None:
- return expression
- elif field.alias is not None:
- return field.alias
- return value
-class FieldColumn(CountColumn):
- """ Allow any field column, of any type """
- def get_type(self, value):
- if is_duration_measurement(value) or is_span_op_breakdown(value):
- return "duration"
- elif value == "transaction.duration":
- return "duration"
- elif value == "timestamp":
- return "date"
- return "string"
-class StringArg(FunctionArg):
- def __init__(self, name, unquote=False, unescape_quotes=False):
- super().__init__(name)
- self.unquote = unquote
- self.unescape_quotes = unescape_quotes
- def normalize(self, value, params):
- if self.unquote:
- if len(value) < 2 or value[0] != '"' or value[-1] != '"':
- raise InvalidFunctionArgument("string should be quoted")
- value = value[1:-1]
- if self.unescape_quotes:
- value = re.sub(r'\\"', '"', value)
- return f"'{value}'"
-class DateArg(FunctionArg):
- date_format = "%Y-%m-%dT%H:%M:%S"
- def normalize(self, value, params):
- try:
- datetime.strptime(value, self.date_format)
- except ValueError:
- raise InvalidFunctionArgument(
- f"{value} is in the wrong format, expected a date like 2020-03-14T15:14:15"
- )
- return f"'{value}'"
-class ConditionArg(FunctionArg):
- # List and not a set so the error message is consistent
- "equals",
- "notEquals",
- "lessOrEquals",
- "greaterOrEquals",
- "less",
- "greater",
- ]
- def normalize(self, value, params):
- if value not in self.VALID_CONDITIONS:
- raise InvalidFunctionArgument(
- "{} is not a valid condition, the only supported conditions are: {}".format(
- value,
- ",".join(self.VALID_CONDITIONS),
- )
- )
- return value
class Column(FunctionArg):
    """Column argument that is translated to its snuba name via SEARCH_MAP."""

    def __init__(self, name, allowed_columns=None):
        """
        :param str name: The argument name.
        :param allowed_columns: Optional iterable of column names this
            argument is restricted to. ``None`` (the default) means no
            restriction.
        """
        super().__init__(name)
        if allowed_columns is None:
            # Iterating over the default would raise a TypeError; keep None so
            # normalize() can tell "no restriction" apart from an empty list.
            self.allowed_columns = None
        else:
            # make sure to map the allowed columns to their snuba names
            self.allowed_columns = [SEARCH_MAP.get(col) for col in allowed_columns]

    def normalize(self, value, params):
        """Return the snuba column for ``value``.

        Raises InvalidFunctionArgument when the column is outside the allowed
        set, or, if there is no allowed set, when SEARCH_MAP has no entry for it.
        """
        snuba_column = SEARCH_MAP.get(value)
        if self.allowed_columns is not None:
            if value in self.allowed_columns or snuba_column in self.allowed_columns:
                return snuba_column
            raise InvalidFunctionArgument(f"{value} is not an allowed column")
        if not snuba_column:
            raise InvalidFunctionArgument(f"{value} is not a valid column")
        return snuba_column
class ColumnNoLookup(Column):
    """Validate exactly like ``Column`` but return the value as it was given.

    Construction is inherited from ``Column`` unchanged.
    """

    def normalize(self, value, params):
        """Run the parent validation, then return the untranslated ``value``."""
        # The parent's return value (the snuba name) is intentionally discarded.
        super().normalize(value, params)
        return value
class NumericColumn(FunctionArg):
    """Column argument restricted to numeric columns, measurements and span op breakdowns."""

    def _normalize(self, value):
        """Validate ``value`` and return the column name to use.

        Kept separate from ``normalize`` so ``get_type`` can always rely on
        it, even in subclasses that override ``normalize``.
        """
        mapped = SEARCH_MAP.get(value)
        if not mapped:
            # Measurements and span op breakdowns are accepted without a mapping.
            if is_measurement(value) or is_span_op_breakdown(value):
                return value
            raise InvalidFunctionArgument(f"{value} is not a valid column")
        if mapped not in ("time", "timestamp", "duration"):
            raise InvalidFunctionArgument(f"{value} is not a numeric column")
        return mapped

    def normalize(self, value, params):
        """Return the validated column name for ``value``."""
        return self._normalize(value)

    def get_type(self, value):
        """Return "duration", "date" or "number" depending on the resolved column."""
        resolved = self._normalize(value)
        if (
            is_duration_measurement(resolved)
            or is_span_op_breakdown(resolved)
            or resolved == "duration"
        ):
            return "duration"
        return "date" if resolved == "timestamp" else "number"
class NumericColumnNoLookup(NumericColumn):
    """Numeric column that is validated but returned under the name it was given."""

    def __init__(self, name, allow_array_value=False):
        """
        :param str name: The argument name.
        :param bool allow_array_value: Accept the array-valued columns
            ``measurements_value`` and ``span_op_breakdowns_value``.
        """
        super().__init__(name)
        self.allow_array_value = allow_array_value

    def normalize(self, value, params):
        """Return ``value`` after validation, or an ``arrayJoin`` expression for array columns."""
        # `measurements_value` and `span_op_breakdowns_value` are actually
        # arrays of Float64s. In this context they are always expanded with
        # `arrayJoin`, which yields a numeric Float64 column.
        array_columns = {"measurements_value", "span_op_breakdowns_value"}
        if self.allow_array_value and value in array_columns:
            return ["arrayJoin", [value]]

        # Validation only; the translated name is discarded.
        super().normalize(value, params)
        return value
class DurationColumn(FunctionArg):
    """Column argument restricted to duration columns."""

    def normalize(self, value, params):
        """Return the column to use for ``value``.

        Duration measurements and span op breakdowns without a SEARCH_MAP
        entry are returned as given; otherwise the mapped column must be
        ``duration``. Raises InvalidFunctionArgument in every other case.
        """
        mapped = SEARCH_MAP.get(value)
        if not mapped:
            if is_duration_measurement(value) or is_span_op_breakdown(value):
                return value
            raise InvalidFunctionArgument(f"{value} is not a valid column")
        if mapped != "duration":
            raise InvalidFunctionArgument(f"{value} is not a duration column")
        return mapped
class DurationColumnNoLookup(DurationColumn):
    """Duration column that is validated but returned under the name it was given."""

    def normalize(self, value, params):
        """Run the parent validation, then return the untranslated ``value``."""
        # Called only for its validation side of things; the result is dropped.
        super().normalize(value, params)
        return value
class StringArrayColumn(FunctionArg):
    """Argument restricted to the known string array columns."""

    def normalize(self, value, params):
        """Return ``value`` if it is a string array column, otherwise raise."""
        string_array_columns = [
            "tags.key",
            "tags.value",
            "measurements_key",
            "span_op_breakdowns_key",
        ]
        if value not in string_array_columns:
            raise InvalidFunctionArgument(f"{value} is not a valid string array column")
        return value
class NumberRange(FunctionArg):
    """Numeric argument constrained to the half-open range ``[start, end)``.

    Either bound may be ``None`` to leave that side of the range open.
    """

    def __init__(self, name, start, end):
        """
        :param str name: The argument name.
        :param start: Inclusive lower bound, or ``None`` for no lower bound.
        :param end: Exclusive upper bound, or ``None`` for no upper bound.
        """
        super().__init__(name)
        self.start = start
        self.end = end

    def normalize(self, value, params):
        """Convert ``value`` to a float and check it against the bounds.

        Raises InvalidFunctionArgument when the value is not a number or
        falls outside the range.
        """
        try:
            value = float(value)
        except (TypeError, ValueError):
            # TypeError covers a missing (None) value, which is just as much
            # "not a number" as an unparsable string.
            raise InvalidFunctionArgument(f"{value} is not a number")

        # Compare against None explicitly: a bound of 0 is falsy, so a
        # truthiness check would silently disable it and let negatives through.
        if self.start is not None and value < self.start:
            raise InvalidFunctionArgument(
                f"{value:g} must be greater than or equal to {self.start:g}"
            )
        if self.end is not None and value >= self.end:
            raise InvalidFunctionArgument(f"{value:g} must be less than {self.end:g}")
        return value
class IntervalDefault(NumberRange):
    """Number range whose default is the length of the queried time window in seconds."""

    def __init__(self, name, start, end):
        super().__init__(name, start, end)
        self.has_default = True

    def get_default(self, params):
        """Return the whole number of seconds between ``params["start"]`` and ``params["end"]``.

        Raises InvalidFunctionArgument when either bound is missing or is not
        a ``datetime``.
        """
        window_start = params.get("start") if params else None
        window_end = params.get("end") if params else None

        if not window_start or not window_end:
            raise InvalidFunctionArgument("function called without default")
        if not (isinstance(window_start, datetime) and isinstance(window_end, datetime)):
            raise InvalidFunctionArgument("function called with invalid default")

        return int((window_end - window_start).total_seconds())
def with_default(default, argument):
    """Mark ``argument`` as having a fixed default and return the same object.

    The argument is mutated in place: ``has_default`` becomes True and
    ``get_default`` is replaced with a callable that ignores whatever it is
    called with and returns ``default``.
    """

    def _fixed_default(*_):
        return default

    argument.has_default = True
    argument.get_default = _fixed_default
    return argument
class Function:
    """Definition of a search function: its arguments and how it is rendered for snuba."""

    def __init__(
        self,
        name,
        required_args=None,
        optional_args=None,
        calculated_args=None,
        column=None,
        aggregate=None,
        transform=None,
        result_type_fn=None,
        default_result_type=None,
        redundant_grouping=False,
        private=False,
    ):
        """
        Specifies a function interface that must be followed when defining new functions

        :param str name: The name of the function, this refers to the name to invoke.
        :param list[FunctionArg] required_args: The list of required arguments to the function.
            If any of these arguments are not specified, an error will be raised.
        :param list[FunctionArg] optional_args: The list of optional arguments to the function.
            If any of these arguments are not specified, they will be filled using their default value.
        :param list[obj] calculated_args: The list of calculated arguments to the function.
            These arguments will be computed based on the list of specified arguments.
        :param [str, [any], str or None] column: The column to be passed to snuba once formatted.
            The arguments will be filled into the column where needed. This must not be an aggregate.
        :param [str, [any], str or None] aggregate: The aggregate to be passed to snuba once formatted.
            The arguments will be filled into the aggregate where needed. This must be an aggregate.
        :param str transform: NOTE: Use aggregate over transform whenever possible.
            An aggregate string to be passed to snuba once formatted. The arguments
            will be filled into the string using `.format(...)`.
        :param result_type_fn: A function to call in order to determine the result type.
            It is passed the list of argument classes and the argument values. This should
            be tried first as the source of truth if available.
        :param str default_result_type: The default resulting type of this function. Must be a type
            defined by RESULT_TYPES.
        :param bool redundant_grouping: This function will result in redundant grouping if its column
            is included as a field as well.
        :param bool private: Whether or not this function should be disabled for general use.
        """
        self.name = name
        self.required_args = required_args if required_args is not None else []
        self.optional_args = optional_args if optional_args is not None else []
        self.calculated_args = calculated_args if calculated_args is not None else []
        self.column = column
        self.aggregate = aggregate
        self.transform = transform
        self.result_type_fn = result_type_fn
        self.default_result_type = default_result_type
        self.redundant_grouping = redundant_grouping
        self.private = private

        # Fail fast on an inconsistent definition.
        self.validate()

    @property
    def required_args_count(self):
        """Number of required arguments."""
        return len(self.required_args)

    @property
    def optional_args_count(self):
        """Number of optional arguments."""
        return len(self.optional_args)

    @property
    def total_args_count(self):
        """Number of required plus optional arguments."""
        return self.required_args_count + self.optional_args_count

    @property
    def args(self):
        """Required arguments followed by optional arguments, as a new list."""
        return self.required_args + self.optional_args

    def alias_as(self, name):
        """ Create a copy of this function to be used as an alias """
        clone = deepcopy(self)
        clone.name = name
        return clone

    def add_default_arguments(self, field, columns, params):
        """Return a copy of ``columns`` padded with defaults for the arguments not supplied.

        Raises InvalidSearchQuery when the argument count is wrong or a
        default cannot be computed.
        """
        # Validate the count first so the slice below lines up with self.args.
        self.validate_argument_count(field, columns)

        filled = list(columns)
        missing = self.args[len(filled) :]
        for argument in missing:
            try:
                default = argument.get_default(params)
            except InvalidFunctionArgument as e:
                raise InvalidSearchQuery(f"{field}: invalid arguments: {e}")

            # Hacky, but we expect column arguments to be strings so easiest to convert it back
            filled.append(str(default) if default else default)

        return filled

    def format_as_arguments(self, field, columns, params):
        """Return a dict of argument name to normalized value, including calculated arguments.

        Raises InvalidSearchQuery when an argument fails to normalize.
        """
        values = self.add_default_arguments(field, columns, params)

        arguments = {}
        for argument, raw_value in zip(self.args, values):
            try:
                arguments[argument.name] = argument.normalize(raw_value, params)
            except InvalidFunctionArgument as e:
                raise InvalidSearchQuery(f"{field}: {argument.name} argument invalid: {e}")

        # Calculated arguments are derived from the normalized ones, in order.
        for calculation in self.calculated_args:
            arguments[calculation["name"]] = calculation["fn"](arguments)

        return arguments

    def get_result_type(self, field=None, arguments=None):
        """Return the result type, preferring ``result_type_fn`` over the default when usable."""
        if field is None or arguments is None or self.result_type_fn is None:
            return self.default_result_type

        computed = self.result_type_fn(self.args, arguments)
        if computed is None:
            return self.default_result_type

        self.validate_result_type(computed)
        return computed

    def validate(self):
        """Assert that this definition is internally consistent."""
        # every optional argument needs a default to fall back on
        for i, arg in enumerate(self.optional_args):
            assert (
                arg.has_default
            ), f"{self.name}: optional argument at index {i} does not have default"

        # exactly one of `column`, `aggregate`, or `transform` must be given
        renderers = (self.column, self.aggregate, self.transform)
        assert (
            len([renderer for renderer in renderers if renderer is not None]) == 1
        ), f"{self.name}: only one of column, aggregate, or transform is allowed"

        # argument names, calculated ones included, must be unique
        seen = set()
        for arg in self.args:
            assert (
                arg.name not in seen
            ), f"{self.name}: argument {arg.name} specified more than once"
            seen.add(arg.name)

        for calculation in self.calculated_args:
            calculated_name = calculation["name"]
            assert (
                calculated_name not in seen
            ), "{}: argument {} specified more than once".format(self.name, calculated_name)
            seen.add(calculated_name)

        self.validate_result_type(self.default_result_type)

    def validate_argument_count(self, field, arguments):
        """
        Validate the number of provided arguments against the number this
        function defines. Raise InvalidSearchQuery on a mismatch; return nothing.

        The provided count is acceptable when it lies between the required
        count and the total (required + optional) count, inclusive. When the
        function has no optional arguments the error states the exact count
        expected; otherwise it states the violated lower or upper limit.
        """
        provided = len(arguments)
        total = self.total_args_count
        if provided == total:
            return

        required = self.required_args_count
        if required == total:
            raise InvalidSearchQuery(f"{field}: expected {total:g} argument(s)")
        if provided < required:
            raise InvalidSearchQuery(f"{field}: expected at least {required:g} argument(s)")
        if provided > total:
            raise InvalidSearchQuery(f"{field}: expected at most {total:g} argument(s)")

    def validate_result_type(self, result_type):
        """Assert that ``result_type`` is None or one of RESULT_TYPES."""
        assert (
            result_type is None or result_type in RESULT_TYPES
        ), f"{self.name}: result type {result_type} not one of {list(RESULT_TYPES)}"

    def is_accessible(self, acl=None):
        """Return True for public functions, or when ``acl`` contains this function's name."""
        if not self.private:
            return True
        if not acl:
            return False
        return self.name in acl
def reflective_result_type(index=0):
    """Build a result-type function that mirrors the type of one argument.

    The returned callable takes the list of argument definitions and the dict
    of argument values, picks the definition at ``index`` and returns whatever
    its ``get_type`` reports for that argument's value.
    """

    def result_type_fn(function_arguments, parameter_values):
        chosen = function_arguments[index]
        return chosen.get_type(parameter_values[chosen.name])

    return result_type_fn
-# When updating this list, also check if the following need to be updated:
-# - convert_search_filter_to_snuba_query
-# - static/app/utils/discover/fields.tsx FIELDS (for discover column list and search box autocomplete)
- function.name: function
- for function in [
- Function(
- "percentile",
- required_args=[NumericColumnNoLookup("column"), NumberRange("percentile", 0, 1)],
- aggregate=["quantile({percentile:g})", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "p50",
- optional_args=[with_default("transaction.duration", NumericColumnNoLookup("column"))],
- aggregate=["quantile(0.5)", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "p75",
- optional_args=[with_default("transaction.duration", NumericColumnNoLookup("column"))],
- aggregate=["quantile(0.75)", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "p95",
- optional_args=[with_default("transaction.duration", NumericColumnNoLookup("column"))],
- aggregate=["quantile(0.95)", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "p99",
- optional_args=[with_default("transaction.duration", NumericColumnNoLookup("column"))],
- aggregate=["quantile(0.99)", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "p100",
- optional_args=[with_default("transaction.duration", NumericColumnNoLookup("column"))],
- aggregate=["max", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "eps",
- optional_args=[IntervalDefault("interval", 1, None)],
- transform="divide(count(), {interval:g})",
- default_result_type="number",
- ),
- Function(
- "epm",
- optional_args=[IntervalDefault("interval", 1, None)],
- transform="divide(count(), divide({interval:g}, 60))",
- default_result_type="number",
- ),
- Function(
- "last_seen",
- aggregate=["max", "timestamp", "last_seen"],
- default_result_type="date",
- redundant_grouping=True,
- ),
- Function(
- "latest_event",
- aggregate=["argMax", ["id", "timestamp"], "latest_event"],
- default_result_type="string",
- ),
- Function(
- "apdex",
- required_args=[NumberRange("satisfaction", 0, None)],
- transform="apdex(duration, {satisfaction:g})",
- default_result_type="number",
- ),
- Function(
- "count_miserable",
- required_args=[CountColumn("column"), NumberRange("satisfaction", 0, None)],
- calculated_args=[{"name": "tolerated", "fn": lambda args: args["satisfaction"] * 4.0}],
- aggregate=[
- "uniqIf",
- [ArgValue("column"), ["greater", ["transaction.duration", ArgValue("tolerated")]]],
- None,
- ],
- default_result_type="number",
- ),
- Function(
- "user_misery",
- required_args=[NumberRange("satisfaction", 0, None)],
- # To correct for sensitivity to low counts, User Misery is modeled as a Beta Distribution Function.
- # With prior expectations, we have picked the expected mean user misery to be 0.05 and variance
- # to be 0.0004. This allows us to calculate the alpha (5.8875) and beta (111.8625) parameters,
- # with the user misery being adjusted for each fast/slow unique transaction. See:
- # https://stats.stackexchange.com/questions/47771/what-is-the-intuition-behind-beta-distribution
- # for an intuitive explanation of the Beta Distribution Function.
- optional_args=[
- with_default(5.8875, NumberRange("alpha", 0, None)),
- with_default(111.8625, NumberRange("beta", 0, None)),
- ],
- calculated_args=[
- {"name": "tolerated", "fn": lambda args: args["satisfaction"] * 4.0},
- {"name": "parameter_sum", "fn": lambda args: args["alpha"] + args["beta"]},
- ],
- transform="ifNull(divide(plus(uniqIf(user, greater(duration, {tolerated:g})), {alpha}), plus(uniq(user), {parameter_sum})), 0)",
- default_result_type="number",
- ),
- Function("failure_rate", transform="failure_rate()", default_result_type="percentage"),
- Function(
- "failure_count",
- aggregate=[
- "countIf",
- [
- [
- "not",
- [
- [
- "has",
- [
- [
- "array",
- [
- for name in ["ok", "cancelled", "unknown"]
- ],
- ],
- "transaction_status",
- ],
- ],
- ],
- ],
- ],
- None,
- ],
- default_result_type="integer",
- ),
- Function(
- "array_join",
- required_args=[StringArrayColumn("column")],
- column=["arrayJoin", [ArgValue("column")], None],
- default_result_type="string",
- private=True,
- ),
- Function(
- "histogram",
- required_args=[
- NumericColumnNoLookup("column", allow_array_value=True),
- # the bucket_size and start_offset should already be adjusted
- # using the multiplier before it is passed here
- NumberRange("bucket_size", 0, None),
- NumberRange("start_offset", 0, None),
- NumberRange("multiplier", 1, None),
- ],
- # floor((x * multiplier - start_offset) / bucket_size) * bucket_size + start_offset
- column=[
- "plus",
- [
- [
- "multiply",
- [
- [
- "floor",
- [
- [
- "divide",
- [
- [
- "minus",
- [
- [
- "multiply",
- [
- ArgValue("column"),
- ArgValue("multiplier"),
- ],
- ],
- ArgValue("start_offset"),
- ],
- ],
- ArgValue("bucket_size"),
- ],
- ],
- ],
- ],
- ArgValue("bucket_size"),
- ],
- ],
- ArgValue("start_offset"),
- ],
- None,
- ],
- default_result_type="number",
- private=True,
- ),
- Function(
- "count_unique",
- optional_args=[CountColumn("column")],
- aggregate=["uniq", ArgValue("column"), None],
- default_result_type="integer",
- ),
- Function(
- "count",
- optional_args=[NullColumn("column")],
- aggregate=["count", None, None],
- default_result_type="integer",
- ),
- Function(
- "count_at_least",
- required_args=[NumericColumnNoLookup("column"), NumberRange("threshold", 0, None)],
- aggregate=[
- "countIf",
- [["greaterOrEquals", [ArgValue("column"), ArgValue("threshold")]]],
- None,
- ],
- default_result_type="integer",
- ),
- Function(
- "min",
- required_args=[NumericColumnNoLookup("column")],
- aggregate=["min", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "max",
- required_args=[NumericColumnNoLookup("column")],
- aggregate=["max", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "avg",
- required_args=[NumericColumnNoLookup("column")],
- aggregate=["avg", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- redundant_grouping=True,
- ),
- Function(
- "var",
- required_args=[NumericColumnNoLookup("column")],
- aggregate=["varSamp", ArgValue("column"), None],
- default_result_type="number",
- redundant_grouping=True,
- ),
- Function(
- "stddev",
- required_args=[NumericColumnNoLookup("column")],
- aggregate=["stddevSamp", ArgValue("column"), None],
- default_result_type="number",
- redundant_grouping=True,
- ),
- Function(
- "sum",
- required_args=[NumericColumnNoLookup("column")],
- aggregate=["sum", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- default_result_type="duration",
- ),
- Function(
- "any",
- required_args=[FieldColumn("column")],
- aggregate=["min", ArgValue("column"), None],
- result_type_fn=reflective_result_type(),
- redundant_grouping=True,
- ),
- # Currently only being used by the baseline PoC
- Function(
- "absolute_delta",
- required_args=[DurationColumnNoLookup("column"), NumberRange("target", 0, None)],
- column=["abs", [["minus", [ArgValue("column"), ArgValue("target")]]], None],
- default_result_type="duration",
- ),
- # These range functions for performance trends, these aren't If functions
- # to avoid allowing arbitrary if statements
- # Not yet supported in Discover, and shouldn't be added to fields.tsx
- Function(
- "percentile_range",
- required_args=[
- NumericColumnNoLookup("column"),
- NumberRange("percentile", 0, 1),
- ConditionArg("condition"),
- DateArg("middle"),
- ],
- aggregate=[
- "quantileIf({percentile:.2f})",
- [
- ArgValue("column"),
- # NOTE: This condition is written in this seemingly backwards way
- # because of how snuba special cases the following syntax
- # ["a", ["b", ["c", ["d"]]]
- #
- # This array can be interpreted 2 ways
- # 1. a(b(c(d))) the way snuba interprets it
- # - snuba special cases it when it detects an array where the first
- # element is a literal, and the second element is an array and
- # treats it as a function call rather than 2 separate arguments
- # 2. a(b, c(d)) the way we want it to be interpreted
- #
- # Because of how snuba interprets this expression, it makes it impossible
- # to specify a function with 2 arguments whose first argument is a literal
- # and the second argument is an expression.
- #
- # Working with this limitation, we have to invert the conditions in
- # order to express a function whose first argument is an expression while
- # the second argument is a literal.
- [ArgValue("condition"), [["toDateTime", [ArgValue("middle")]], "timestamp"]],
- ],
- None,
- ],
- default_result_type="duration",
- ),
- Function(
- "avg_range",
- required_args=[
- NumericColumnNoLookup("column"),
- ConditionArg("condition"),
- DateArg("middle"),
- ],
- aggregate=[
- "avgIf",
- [
- ArgValue("column"),
- # see `percentile_range` for why this condition feels backwards
- [ArgValue("condition"), [["toDateTime", [ArgValue("middle")]], "timestamp"]],
- ],
- None,
- ],
- default_result_type="duration",
- ),
- Function(
- "variance_range",
- required_args=[
- NumericColumnNoLookup("column"),
- ConditionArg("condition"),
- DateArg("middle"),
- ],
- aggregate=[
- "varSampIf",
- [
- ArgValue("column"),
- # see `percentile_range` for why this condition feels backwards
- [ArgValue("condition"), [["toDateTime", [ArgValue("middle")]], "timestamp"]],
- ],
- None,
- ],
- default_result_type="duration",
- ),
- Function(
- "count_range",
- required_args=[ConditionArg("condition"), DateArg("middle")],
- aggregate=[
- "countIf",
- # see `percentile_range` for why this condition feels backwards
- [[ArgValue("condition"), [["toDateTime", [ArgValue("middle")]], "timestamp"]]],
- None,
- ],
- default_result_type="integer",
- ),
- Function(
- "percentage",
- required_args=[FunctionArg("numerator"), FunctionArg("denominator")],
- # Since percentage is only used on aggregates, it needs to be an aggregate and not a column
- # This is because as a column it will be added to the `WHERE` clause instead of the `HAVING` clause
- aggregate=[
- "if(greater({denominator},0),divide({numerator},{denominator}),null)",
- None,
- None,
- ],
- default_result_type="percentage",
- ),
- # Calculate the Welch's t-test value, this is used to help identify which of our trends are significant or not
- Function(
- "t_test",
- required_args=[
- FunctionAliasArg("avg_1"),
- FunctionAliasArg("avg_2"),
- FunctionAliasArg("variance_1"),
- FunctionAliasArg("variance_2"),
- FunctionAliasArg("count_1"),
- FunctionAliasArg("count_2"),
- ],
- aggregate=[
- "divide(minus({avg_1},{avg_2}),sqrt(plus(divide({variance_1},{count_1}),divide({variance_2},{count_2}))))",
- None,
- "t_test",
- ],
- default_result_type="number",
- ),
- Function(
- "minus",
- required_args=[FunctionArg("minuend"), FunctionArg("subtrahend")],
- aggregate=["minus", [ArgValue("minuend"), ArgValue("subtrahend")], None],
- default_result_type="duration",
- ),
- Function(
- "absolute_correlation",
- aggregate=[
- "abs",
- [["corr", [["toUnixTimestamp", ["timestamp"]], "transaction.duration"]]],
- None,
- ],
- default_result_type="number",
- ),
- # Currently only used by trace meta so we can count event types which is why this only accepts strings
- Function(
- "count_if",
- required_args=[
- ColumnNoLookup("column", allowed_columns=["event.type", "http.status_code"]),
- ConditionArg("condition"),
- StringArg("value"),
- ],
- aggregate=[
- "countIf",
- [
- [
- ArgValue("condition"),
- [
- ArgValue("column"),
- ArgValue("value"),
- ],
- ]
- ],
- None,
- ],
- default_result_type="integer",
- ),
- Function(
- "compare_numeric_aggregate",
- required_args=[
- FunctionAliasArg("aggregate_alias"),
- ConditionArg("condition"),
- NumberRange("value", 0, None),
- ],
- aggregate=[
- # snuba json syntax isn't compatible with this query here
- # this function can't be a column, since we want to use this with aggregates
- "{condition}({aggregate_alias},{value})",
- None,
- None,
- ],
- default_result_type="number",
- ),
- Function(
- "to_other",
- required_args=[
- ColumnNoLookup("column", allowed_columns=["release", "trace.parent_span"]),
- StringArg("value", unquote=True, unescape_quotes=True),
- ],
- optional_args=[
- with_default("that", StringArg("that")),
- with_default("this", StringArg("this")),
- ],
- column=[
- "if",
- [
- ["equals", [ArgValue("column"), ArgValue("value")]],
- ArgValue("this"),
- ArgValue("that"),
- ],
- ],
- ),
- ]
-# In Performance TPM is used as an alias to EPM
- "tpm": "epm",
- "tps": "eps",
# Register every alias as a renamed copy of the function it points to.
for alias, name in FUNCTION_ALIASES.items():
    FUNCTIONS[alias] = FUNCTIONS[name].alias_as(alias)

# Matches any string that starts with a registered function name, aliases included.
FUNCTION_ALIAS_PATTERN = re.compile("^({}).*".format("|".join(FUNCTIONS)))
def is_function(field):
    """Return the FUNCTION_PATTERN match for ``field``, or None when it does not match."""
    found = FUNCTION_PATTERN.search(field)
    return found if found else None
def get_function_alias(field):
    """Return the alias under which ``field`` appears in results.

    Anything that does not match FUNCTION_PATTERN is returned untouched. An
    explicit alias captured by the pattern wins; otherwise the alias is
    derived from the function name and its parsed arguments.
    """
    match = FUNCTION_PATTERN.search(field)
    if match is None:
        return field

    explicit_alias = match.group("alias")
    if explicit_alias is not None:
        return explicit_alias

    function_name = match.group("function")
    arguments = parse_arguments(function_name, match.group("columns"))
    return get_function_alias_with_columns(function_name, arguments)
def get_function_alias_with_columns(function_name, columns):
    """Build an alias from a function name and its argument strings.

    The arguments are joined with underscores, every non-word character is
    replaced by an underscore, and trailing underscores are stripped from the
    combined result.
    """
    suffix = re.sub(r"\W", "_", "_".join(columns))
    return "{}_{}".format(function_name, suffix).rstrip("_")
def format_column_arguments(column_args, arguments):
    """Fill argument values into a nested column expression, in place.

    For each entry of ``column_args``:
    - a list/tuple is treated as ``[function, [args...]]``: an ``ArgValue``
      in the function slot is replaced by its value and the args are
      processed recursively;
    - a string is run through ``.format(**arguments)``;
    - an ``ArgValue`` is replaced by the value it names.

    Nothing is returned; ``column_args`` itself is modified.
    """
    for position, entry in enumerate(column_args):
        if isinstance(entry, (list, tuple)):
            head = entry[0]
            if isinstance(head, ArgValue):
                entry[0] = arguments[head.arg]
            format_column_arguments(entry[1], arguments)
        elif isinstance(entry, str):
            column_args[position] = entry.format(**arguments)
        elif isinstance(entry, ArgValue):
            column_args[position] = arguments[entry.arg]
def parse_arguments(function, columns):
    """
    Split the raw argument string of a function call into a list of arguments.

    Every function except ``to_other`` is split on commas. The to_other
    function takes a quoted string for one of its arguments that may contain
    commas, so it requires special handling: commas inside a double-quoted
    argument are kept, and a backslash inside the quotes escapes the next
    character.

    Arguments are stripped of surrounding whitespace and empty arguments are
    dropped.

    :param str function: Name of the function being called.
    :param str columns: The text between the function's parentheses.
    :return: list of argument strings (quoted arguments keep their quotes).
    """
    if function != "to_other":
        return [c.strip() for c in columns.split(",") if len(c.strip()) > 0]

    args = []

    quoted = False
    escaped = False

    # i marks the start of the current argument, j is the scan position
    i, j = 0, 0

    while j < len(columns):
        if i == j and columns[j] == '"':
            # when we see a quote at the beginning of
            # an argument, then this is a quoted string
            quoted = True
        elif i == j and columns[j] == " ":
            # the argument has leading spaces, move its start past them so
            # that a quote following `, ` is still seen as opening a
            # quoted string
            i += 1
        elif quoted and not escaped and columns[j] == "\\":
            # when we see a slash inside a quoted string,
            # the next character is an escape character
            escaped = True
        elif quoted and not escaped and columns[j] == '"':
            # when we see a non-escaped quote while inside
            # of a quoted string, we should end it
            quoted = False
        elif quoted and escaped:
            # when we are inside a quoted string and have
            # begun an escape character, we should end it
            escaped = False
        elif quoted and columns[j] == ",":
            # when we are inside a quoted string and see
            # a comma, it should not be considered an
            # argument separator
            pass
        elif columns[j] == ",":
            # when we see a comma outside of a quoted string
            # it is an argument separator
            args.append(columns[i:j].strip())
            i = j + 1
        j += 1

    if i != j:
        # add in the last argument if any
        args.append(columns[i:].strip())

    return [arg for arg in args if arg]
def parse_function(field, match=None, err_msg=None):
    """Split a function expression into ``(name, arguments, alias)``.

    :param str field: The function expression.
    :param match: An existing function-pattern match for ``field``; computed
        with ``is_function`` when falsy.
    :param str err_msg: Message for the error raised on failure; a default
        naming ``field`` is used when None.
    :raises InvalidSearchQuery: when ``field`` is not a function expression
        or names a function that is not in FUNCTIONS.
    """
    match = match or is_function(field)

    if not match or match.group("function") not in FUNCTIONS:
        message = err_msg if err_msg is not None else f"{field} is not a valid function"
        raise InvalidSearchQuery(message)

    name = match.group("function")
    arguments = parse_arguments(name, match.group("columns"))
    return (name, arguments, match.group("alias"))
# The field string a function came from, its Function definition, and its arguments.
FunctionDetails = namedtuple("FunctionDetails", ["field", "instance", "arguments"])

# Result of resolving a field: the details plus either a column or an aggregate expression.
ResolvedFunction = namedtuple("ResolvedFunction", ["details", "column", "aggregate"])
def resolve_function(field, match=None, params=None, functions_acl=False):
    """Resolve a function expression into a ``ResolvedFunction``.

    :param str field: The function expression to resolve.
    :param match: Optional existing function-pattern match for ``field``.
    :param dict params: Query parameters; an ``aliases`` entry for ``field``
        short-circuits resolution and its ``aggregate`` is returned as is.
    :param functions_acl: Names of private functions the caller may use.
    :raises InvalidSearchQuery: when the function is private and not in
        ``functions_acl`` (parsing and argument formatting raise it as well).
    :return: ``ResolvedFunction`` with either ``column`` or ``aggregate`` set.
    """
    if params is not None and field in params.get("aliases", {}):
        preset = params["aliases"][field]
        preset_details = FunctionDetails(field, FUNCTIONS["percentage"], [])
        return ResolvedFunction(preset_details, None, preset.aggregate)

    function_name, columns, alias = parse_function(field, match)
    function = FUNCTIONS[function_name]
    if not function.is_accessible(functions_acl):
        raise InvalidSearchQuery(f"{function.name}: no access to private function")

    arguments = function.format_as_arguments(field, columns, params)
    details = FunctionDetails(field, function, arguments)

    if function.transform is not None:
        snuba_string = function.transform.format(**arguments)
        if alias is None:
            alias = get_function_alias_with_columns(function.name, columns)
        return ResolvedFunction(details, None, [snuba_string, None, alias])

    if function.aggregate is not None:
        # Work on a copy so the shared definition is never mutated.
        aggregate = deepcopy(function.aggregate)
        aggregate[0] = aggregate[0].format(**arguments)

        operand = aggregate[1]
        if isinstance(operand, (list, tuple)):
            format_column_arguments(operand, arguments)
        elif isinstance(operand, ArgValue):
            resolved = arguments[operand.arg]
            # The aggregate function has only a single argument, but when
            # that argument is an expression it has to be nested so snuba
            # doesn't treat it as a list of arguments.
            if isinstance(resolved, (list, tuple)):
                aggregate[1] = [resolved]
            else:
                aggregate[1] = resolved

        if alias is not None:
            aggregate[2] = alias
        elif aggregate[2] is None:
            aggregate[2] = get_function_alias_with_columns(function.name, columns)

        return ResolvedFunction(details, None, aggregate)

    if function.column is not None:
        # These can be very nested functions, so every layer is visited.
        addition = deepcopy(function.column)
        addition[0] = addition[0].format(**arguments)
        if isinstance(addition[1], (list, tuple)):
            format_column_arguments(addition[1], arguments)

        if len(addition) < 3:
            # No alias slot in the definition: append one.
            if alias is None:
                addition.append(get_function_alias_with_columns(function.name, columns))
            else:
                addition.append(alias)
        elif len(addition) == 3:
            if alias is not None:
                addition[2] = alias
            elif addition[2] is None:
                addition[2] = get_function_alias_with_columns(function.name, columns)
            else:
                addition[2] = addition[2].format(**arguments)

        return ResolvedFunction(details, addition, None)
def resolve_orderby(orderby, fields, aggregations):
    """
    We accept column names, aggregate functions, and aliases as order by
    values. Aggregates and field aliases need to be resolved/validated.

    Returns the validated order by values. Raises InvalidSearchQuery unless
    every requested value could be matched to a selected field, aggregation
    or field alias.

    TODO(mark) Once we're no longer using the dataset selection function
    should allow all non-tag fields to be used as sort clauses, instead of only
    those that are currently selected.
    """
    if not isinstance(orderby, (list, tuple)):
        orderby = [orderby]

    validated = []
    for column in orderby:
        bare_column = column.lstrip("-")

        # Selected fields are passed through exactly as requested.
        if bare_column in fields:
            validated.append(column)
            continue

        direction = "-" if column.startswith("-") else ""

        if is_function(bare_column):
            bare_column = get_function_alias(bare_column)

        matching_aggregates = [agg[2] for agg in aggregations if agg[2] == bare_column]
        if matching_aggregates:
            validated.append(direction + bare_column)
            continue

        if (
            bare_column in FIELD_ALIASES
            and FIELD_ALIASES[bare_column].alias
            and bare_column != PROJECT_ALIAS
        ):
            validated.append(direction + FIELD_ALIASES[bare_column].alias)
            continue

        # Finally, look for a selected column expression carrying this alias.
        matching_fields = [
            col[2]
            for col in fields
            if isinstance(col, (list, tuple)) and col[2].strip("`") == bare_column
        ]
        if matching_fields:
            validated.append(direction + bare_column)

    if len(validated) != len(orderby):
        raise InvalidSearchQuery("Cannot order by a field that is not selected.")
    return validated
def get_aggregate_alias(match):
    """
    Build the result alias for an aggregate from its regex match.

    :param match: A regex match object exposing the named groups
        ``function`` and ``column``.
    :returns: ``<function>_<column>`` with dots in the column replaced by
        underscores and trailing underscores stripped, so an aggregate
        without a column yields just the function name.
    """
    function = match.group("function")
    # A named group that did not participate in the match is reported as
    # None; treat it the same as an empty column instead of failing.
    column = (match.group("column") or "").replace(".", "_")
    return f"{function}_{column}".rstrip("_")
def resolve_field(field, params=None, functions_acl=None):
    """
    Resolve a single requested field into a ResolvedFunction.

    Function expressions are delegated to resolve_function, known field
    aliases are expanded through their ``get_field`` hook, and anything else
    is treated as a plain column or tag name that only needs validating.

    :param field: The requested field name; must be a string.
    :param params: Passed through to function and alias resolution.
    :param functions_acl: Passed through to resolve_function.
    :raises InvalidSearchQuery: If ``field`` is not a string or contains
        characters that are not allowed in a field name.
    """
    if not isinstance(field, str):
        raise InvalidSearchQuery("Field names must be strings")

    function_match = is_function(field)
    if function_match:
        return resolve_function(field, function_match, params, functions_acl)

    if field in FIELD_ALIASES:
        return ResolvedFunction(None, FIELD_ALIASES[field].get_field(params), None)

    # For the explicit tags[...] syntax only the tag key itself is validated,
    # but the field is still returned in the form it was requested.
    explicit_tag = TAG_KEY_RE.search(field)
    candidate = field if explicit_tag is None else explicit_tag.group("tag")
    if not VALID_FIELD_PATTERN.match(candidate):
        raise InvalidSearchQuery(f"Invalid characters in field {field}")
    return ResolvedFunction(None, field, None)
def resolve_field_list(
    fields, snuba_filter, auto_fields=True, auto_aggregations=False, functions_acl=None
):
    """
    Expand a list of fields based on aliases and aggregate functions.

    Returns a dict of selected_columns, aggregations, groupby, orderby and
    functions that can be merged into the result of get_snuba_query_args()
    to build a more complete snuba query based on event search conventions.

    Auto aggregates are aggregates that will be automatically added to the
    list of aggregations when they're used in a condition. This is so that
    they can be used in a condition without having to manually add the
    aggregate to a field.

    NOTE: ``fields`` is modified in place. The ``project`` and project name
    aliases are removed from it and ``project.id`` is appended when either
    of them was requested.

    :param fields: List of requested field names.
    :param snuba_filter: The filter being built. Its params, having,
        condition_aggregates, aliases, rollup, conditions, filter_keys and
        orderby attributes are read.
    :param auto_fields: When true and no rollup is set, add the ``id`` and
        ``project.id`` columns (and the project name column) as needed.
    :param auto_aggregations: When true, add aggregates that are only used
        in conditions to the aggregations.
    :param functions_acl: Passed through to resolve_field.
    :raises InvalidSearchQuery: If a rollup is used with columns but without
        an aggregate, if a field is selected both on its own and inside a
        function flagged with ``redundant_grouping``, or if the order by
        cannot be resolved against the selection.
    """
    aggregations = []
    # Maps the argument of an aggregate to the requested expressions that
    # aggregate it, for functions flagged with `redundant_grouping`.
    aggregate_fields = defaultdict(set)
    columns = []
    groupby = []
    project_key = ""
    functions = {}

    # If project is requested, we need to map ids to their names since snuba only has ids
    if "project" in fields:
        fields.remove("project")
        project_key = "project"
    # since project.name is more specific, if both are included use project.name instead of project
    if PROJECT_NAME_ALIAS in fields:
        fields.remove(PROJECT_NAME_ALIAS)
        project_key = PROJECT_NAME_ALIAS
    if project_key and "project.id" not in fields:
        fields.append("project.id")

    for field in fields:
        if isinstance(field, str) and field.strip() == "":
            continue
        function = resolve_field(field, snuba_filter.params, functions_acl)
        if function.column is not None and function.column not in columns:
            columns.append(function.column)
            if function.details is not None and isinstance(function.column, (list, tuple)):
                functions[function.column[-1]] = function.details
        elif function.aggregate is not None:
            aggregations.append(function.aggregate)
            if function.details is not None and isinstance(function.aggregate, (list, tuple)):
                functions[function.aggregate[-1]] = function.details
                if function.details.instance.redundant_grouping:
                    aggregate_fields[function.aggregate[1]].add(field)

    # Only auto aggregate when there's one other so the group by is not unexpectedly changed
    if auto_aggregations and snuba_filter.having and aggregations:
        for agg in snuba_filter.condition_aggregates:
            if agg in snuba_filter.aliases:
                continue
            function = resolve_field(agg, snuba_filter.params, functions_acl)
            if function.aggregate is None or function.aggregate in aggregations:
                continue
            aggregations.append(function.aggregate)
            if function.details is not None and isinstance(function.aggregate, (list, tuple)):
                functions[function.aggregate[-1]] = function.details
                if function.details.instance.redundant_grouping:
                    # Record the condition aggregate itself, not the loop
                    # variable left over from the field loop above.
                    aggregate_fields[function.aggregate[1]].add(agg)

    rollup = snuba_filter.rollup
    if not rollup and auto_fields:
        # Ensure fields we require to build a functioning interface
        # are present. We don't add fields when using a rollup as the additional fields
        # would be aggregated away.
        if not aggregations and "id" not in columns:
            columns.append("id")
        if "id" in columns and "project.id" not in columns:
            columns.append("project.id")
            project_key = PROJECT_NAME_ALIAS

    if project_key:
        # Check to see if there's a condition on project ID already, to avoid unnecessary lookups
        filtered_project_ids = None
        if snuba_filter.conditions:
            for cond in snuba_filter.conditions:
                if cond[0] == "project_id":
                    filtered_project_ids = [cond[2]] if cond[1] == "=" else cond[2]

        project_ids = filtered_project_ids or snuba_filter.filter_keys.get("project_id", [])
        projects = Project.objects.filter(id__in=project_ids).values("slug", "id")
        # Clickhouse gets confused when the column contains a period
        # This is specifically for project.name and should be removed once we can stop supporting it
        if "." in project_key:
            project_key = f"`{project_key}`"
        columns.append(
            [
                "transform",
                [
                    # This is a workaround since having the column by itself currently is being treated as a function
                    ["toString", ["project_id"]],
                    ["array", ["'{}'".format(project["id"]) for project in projects]],
                    ["array", ["'{}'".format(project["slug"]) for project in projects]],
                    # Default case, what to do if a project id without a slug is found
                    "''",
                ],
                project_key,
            ]
        )

    if rollup and columns and not aggregations:
        raise InvalidSearchQuery("You cannot use rollup without an aggregate field.")

    orderby = snuba_filter.orderby
    # Only sort if there are columns. When there are only aggregates there's no need to sort
    if orderby and columns:
        orderby = resolve_orderby(orderby, columns, aggregations)
    else:
        orderby = None

    # If aggregations are present all columns
    # need to be added to the group by so that the query is valid.
    if aggregations:
        for column in columns:
            if isinstance(column, (list, tuple)):
                if column[0] == "transform":
                    # When there's a project transform, we already group by project_id
                    continue
                if column[2] == USER_DISPLAY_ALIAS:
                    # user.display needs to be grouped by its coalesce function
                    groupby.append(column)
                    continue
                groupby.append(column[2])
            else:
                if column in aggregate_fields:
                    conflicting_functions = list(aggregate_fields[column])
                    raise InvalidSearchQuery(
                        "A single field cannot be used both inside and outside a function in the same query. To use {field} you must first remove the function(s): {function_msg}".format(
                            field=column,
                            function_msg=", ".join(conflicting_functions[:2])
                            + (
                                f" and {len(conflicting_functions) - 2} more."
                                if len(conflicting_functions) > 2
                                else ""
                            ),
                        )
                    )
                groupby.append(column)

    return {
        "selected_columns": columns,
        "aggregations": aggregations,
        "groupby": groupby,
        "orderby": orderby,
        "functions": functions,
    }
-TAG_KEY_RE = re.compile(r"^tags\[(?P<tag>.*)\]$")