Browse Source

fix(ddm): Fix units normalization for coefficients (#66579)

Riccardo Busetti 1 year ago
parent
commit
6838e9fb0d

+ 7 - 0
src/sentry/sentry_metrics/querying/common.py

@@ -1,3 +1,5 @@
+from snuba_sdk import ArithmeticOperator
+
 # Snuba can return at most 10.000 rows.
 SNUBA_QUERY_LIMIT = 10000
 # Intervals in seconds which are used by the product to query data.
@@ -11,3 +13,8 @@ DEFAULT_QUERY_INTERVALS = [
     60 * 5,  # 5 min
     60,  # 1 min
 ]
+# Operators in formulas that use coefficients.
+COEFFICIENT_OPERATORS = {
+    ArithmeticOperator.DIVIDE.value,
+    ArithmeticOperator.MULTIPLY.value,
+}

+ 35 - 18
src/sentry/sentry_metrics/querying/visitors/query_expression.py

@@ -1,17 +1,10 @@
 from collections.abc import Sequence
 
-from snuba_sdk import (
-    AliasedExpression,
-    ArithmeticOperator,
-    Column,
-    Condition,
-    Formula,
-    Op,
-    Timeseries,
-)
+from snuba_sdk import AliasedExpression, Column, Condition, Formula, Op, Timeseries
 from snuba_sdk.conditions import ConditionGroup
 
 from sentry.models.environment import Environment
+from sentry.sentry_metrics.querying.common import COEFFICIENT_OPERATORS
 from sentry.sentry_metrics.querying.errors import InvalidMetricsQueryError
 from sentry.sentry_metrics.querying.types import QueryExpression
 from sentry.sentry_metrics.querying.units import (
@@ -282,10 +275,6 @@ class UnitsNormalizationV2Visitor(QueryExpressionVisitor[tuple[UnitMetadata, Que
     Visitor that recursively transforms the `QueryExpression` components to have the same unit.
     """
 
-    UNITLESS_FORMULA_FUNCTIONS = {
-        ArithmeticOperator.DIVIDE.value,
-        ArithmeticOperator.MULTIPLY.value,
-    }
     UNITLESS_AGGREGATES = {"count", "count_unique"}
 
     def __init__(self):
@@ -329,15 +318,21 @@ class UnitsNormalizationV2Visitor(QueryExpressionVisitor[tuple[UnitMetadata, Que
         if last_metadata is None:
             return WithNoUnit(), formula
 
+        has_coefficient_operators = formula.function_name in COEFFICIENT_OPERATORS
+
         # If we have all timeseries as parameters of a formula and the function belongs to `*` or `/` we will
         # not perform any normalization.
-        if formula.function_name in self.UNITLESS_FORMULA_FUNCTIONS and has_all_timeseries_params:
+        if has_coefficient_operators and has_all_timeseries_params:
             return WithNoUnit(), formula
 
         # We convert all scalars in the formula using the last seen scaling factor. Since we are always working with
         # two operands, this means that if we found at least one numeric scalar, the scaling factor will belong to the
         # other operand.
-        if future_units and last_metadata.unit is not None:
+        # It's important to note that we are not doing any scalar normalization if we have a coefficient operator, since
+        # we don't want to scale both operands.
+        # Example:
+        #  a * 2 with a scaling factor of 1000 must become a * 1000 * 2 and not a * 1000 * 2 * 1000
+        if not has_coefficient_operators and future_units and last_metadata.unit is not None:
             for index, future_unit in future_units:
                 parameters[index] = self._normalize_future_units(last_metadata.unit, future_unit)
 
@@ -377,6 +372,9 @@ class UnitsNormalizationV2Visitor(QueryExpressionVisitor[tuple[UnitMetadata, Que
         return WithNoUnit(), string
 
     def _extract_unit(self, timeseries: Timeseries) -> str | None:
+        """
+        Extracts the unit from the timeseries, by parsing its MRI.
+        """
         if timeseries.aggregate in self.UNITLESS_AGGREGATES:
             return None
 
@@ -386,9 +384,6 @@ class UnitsNormalizationV2Visitor(QueryExpressionVisitor[tuple[UnitMetadata, Que
 
         return None
 
-    def _is_numeric_scalar(self, value: QueryExpression) -> bool:
-        return isinstance(value, int) or isinstance(value, float)
-
     def _normalize_future_units(self, unit: Unit, value: QueryExpression) -> QueryExpression:
         """
         Normalizes all future units, which in our case are just numeric scalars, using a common unit. This assumes
@@ -406,8 +401,30 @@ class NumericScalarsNormalizationVisitor(QueryExpressionVisitor[QueryExpression]
     def __init__(self, unit: Unit):
         self._unit = unit
 
+    def _visit_formula(self, formula: Formula) -> QueryExpression:
+        has_coefficient_operators = formula.function_name in COEFFICIENT_OPERATORS
+
+        # In case the formula has a coefficient operator with all scalars, we want to scale the entire formula by
+        # wrapping it in another formula. For all the other cases, we just want to apply the scaling to each component
+        # of the formula, to make the formula less deep.
+        # Example:
+        #  scaling (a * b) by 1000 = (a * b) * 1000
+        #  scaling (a + b) by 1000 = (a * 1000 + b * 1000) in this case the multiplication is performed in-memory
+        if has_coefficient_operators:
+            has_all_scalars = True
+            for parameter in formula.parameters:
+                if not self._is_numeric_scalar(parameter):
+                    has_all_scalars = False
+
+            return self._unit.apply_on_query_expression(formula) if has_all_scalars else formula
+
+        return super()._visit_formula(formula)
+
     def _visit_int(self, int_number: float) -> QueryExpression:
         return self._unit.apply_on_query_expression(int_number)
 
     def _visit_float(self, float_number: float) -> QueryExpression:
         return self._unit.apply_on_query_expression(float_number)
+
+    def _is_numeric_scalar(self, value: QueryExpression) -> bool:
+        return isinstance(value, int) or isinstance(value, float)

+ 6 - 1
tests/sentry/sentry_metrics/querying/data_v2/test_api.py

@@ -1349,6 +1349,8 @@ class MetricsAPITestCase(TestCase, BaseMetricsTestCase):
         for formula, expected_result, expected_unit_family in (
             # (($query_2 * 1000) + 10000.0)
             ("($query_2 + 10)", 30000.0, UnitFamily.DURATION.value),
+            # (($query_2 * 1000) + (10 * 2) * 1000)
+            ("($query_2 + (10 * 2))", 40000.0, UnitFamily.DURATION.value),
             # (10000.0 + ($query_2 * 1000))
             ("(10 + $query_2)", 30000.0, UnitFamily.DURATION.value),
             # (($query_2 + 1000) + (10000.0 + 20000.0))
@@ -1531,7 +1533,7 @@ class MetricsAPITestCase(TestCase, BaseMetricsTestCase):
         assert meta[0][1]["scaling_factor"] is None
 
     @with_feature("organizations:ddm-metrics-api-unit-normalization")
-    def test_query_with_basic_formula_and_unitless_formula_functions(self):
+    def test_query_with_basic_formula_and_coefficient_operators(self):
         mri_1 = "d:custom/page_load@nanosecond"
         mri_2 = "d:custom/load_time@microsecond"
         for mri, value in ((mri_1, 20), (mri_2, 15)):
@@ -1551,7 +1553,10 @@ class MetricsAPITestCase(TestCase, BaseMetricsTestCase):
             ("$query_1 * $query_2 + 25", 325.0, None, None),
             ("$query_1 * $query_2 / 1", 300.0, None, None),
             ("$query_1 * 2", 40.0, UnitFamily.DURATION.value, "nanosecond"),
+            ("$query_2 * 2", 30000.0, UnitFamily.DURATION.value, "nanosecond"),
             ("$query_1 / 2", 10.0, UnitFamily.DURATION.value, "nanosecond"),
+            ("$query_2 / 2", 7500.0, UnitFamily.DURATION.value, "nanosecond"),
+            ("$query_2 * (2 + 1)", 45000.0, UnitFamily.DURATION.value, "nanosecond"),
         ):
             query_1 = self.mql("avg", mri_1)
             query_2 = self.mql("sum", mri_2)