Browse Source

feat(perf): process data filters for span histograms (#33320)

* feat(perf): Exclude outliers in the histogram exclusive time

* add tests

* make upper and lower fence names more descriptive
Dameli Ushbayeva 2 years ago
parent
commit
f172f7e6f5

+ 42 - 16
src/sentry/snuba/discover.py

@@ -1342,7 +1342,7 @@ def spans_histogram_query(
     group_by=None,
     order_by=None,
     limit_by=None,
-    extra_snql_condition=None,
+    extra_condition=None,
     normalize_results=True,
 ):
     """
@@ -1361,7 +1361,7 @@ def spans_histogram_query(
     :param [str] group_by: Allows additional grouping to serve multifacet histograms.
     :param [str] order_by: Allows additional ordering within each alias to serve multifacet histograms.
     :param [str] limit_by: Allows limiting within a group when serving multifacet histograms.
-    :param [Condition] extra_snql_condition: Adds any additional conditions to the histogram query
+    :param [Condition] extra_condition: Adds any additional conditions to the histogram query
     :param bool normalize_results: Indicate whether to normalize the results by column into bins.
     """
     multiplier = int(10 ** precision)
@@ -1397,8 +1397,8 @@ def spans_histogram_query(
         orderby=order_by,
         limitby=limit_by,
     )
-    if extra_snql_condition is not None:
-        builder.add_conditions(extra_snql_condition)
+    if extra_condition is not None:
+        builder.add_conditions(extra_condition)
     results = builder.run_query(referrer)
 
     if not normalize_results:
@@ -1657,31 +1657,36 @@ def find_span_histogram_min_max(span, min_value, max_value, user_query, params,
     :param {str: str} params: Filtering parameters with start, end, project_id, environment
     :param str data_filter: Indicate the filter strategy to be applied to the data.
     """
-
     if min_value is not None and max_value is not None:
         return min_value, max_value
 
     selected_columns = []
     min_column = ""
     max_column = ""
+    outlier_lower_fence = ""
+    outlier_upper_fence = ""
     if min_value is None:
         min_column = f'fn_span_exclusive_time("{span.op}", {span.group}, min)'
         selected_columns.append(min_column)
     if max_value is None:
         max_column = f'fn_span_exclusive_time("{span.op}", {span.group}, max)'
         selected_columns.append(max_column)
+    if data_filter == "exclude_outliers":
+        outlier_lower_fence = f'fn_span_exclusive_time("{span.op}", {span.group}, quantile(0.25))'
+        outlier_upper_fence = f'fn_span_exclusive_time("{span.op}", {span.group}, quantile(0.75))'
+        selected_columns.append(outlier_lower_fence)
+        selected_columns.append(outlier_upper_fence)
 
-    referrer = "api.organization-spans-histogram-min-max"
-    builder = QueryBuilder(
-        dataset=Dataset.Discover,
-        params=params,
+    results = query(
         selected_columns=selected_columns,
         query=user_query,
+        params=params,
         limit=1,
+        referrer="api.organization-spans-histogram-min-max",
         functions_acl=["fn_span_exclusive_time"],
+        use_snql=True,
     )
 
-    results = builder.run_query(referrer)
     data = results.get("data")
 
     # there should be exactly 1 row in the results, but if something went wrong here,
@@ -1706,12 +1711,33 @@ def find_span_histogram_min_max(span, min_value, max_value, user_query, params,
         calculated_max_value = row[get_function_alias(max_column)]
         max_value = calculated_max_value if calculated_max_value else None
 
-    if max_value is not None and min_value is not None:
-        # min_value may be either queried or provided by the user. max_value was queried.
-        # If min_value > max_value, then max_value should be adjusted with respect to
-        # min_value, since min_value is a lower bound, and any and all data below
-        # min_value should be ignored.
-        max_value = max([max_value, min_value])
+        max_fence_value = None
+        if data_filter == "exclude_outliers":
+            outlier_lower_fence_alias = get_function_alias(outlier_lower_fence)
+            outlier_upper_fence_alias = get_function_alias(outlier_upper_fence)
+
+            first_quartile = row[outlier_lower_fence_alias]
+            third_quartile = row[outlier_upper_fence_alias]
+
+            if (
+                first_quartile is not None
+                or third_quartile is not None
+                or not math.isnan(first_quartile)
+                or not math.isnan(third_quartile)
+            ):
+                interquartile_range = abs(third_quartile - first_quartile)
+                upper_outer_fence = third_quartile + 3 * interquartile_range
+                max_fence_value = upper_outer_fence
+
+        candidates = [max_fence_value, max_value]
+        candidates = list(filter(lambda v: v is not None, candidates))
+        max_value = min(candidates) if candidates else None
+        if max_value is not None and min_value is not None:
+            # min_value may be either queried or provided by the user. max_value was queried.
+            # If min_value > max_value, then max_value should be adjusted with respect to
+            # min_value, since min_value is a lower bound, and any and all data below
+            # min_value should be ignored.
+            max_value = max([max_value, min_value])
 
     return min_value, max_value
 

+ 74 - 1
tests/snuba/api/endpoints/test_organization_events_spans_histogram.py

@@ -24,7 +24,6 @@ class OrganizationEventsSpansHistogramEndpointTest(APITestCase, SnubaTestCase):
         )
 
         self.min_ago = before_now(minutes=1).replace(microsecond=0)
-        self.day_ago = before_now(days=1).replace(hour=10, minute=0, second=0, microsecond=0)
 
     def create_event(self, **kwargs):
         if "spans" not in kwargs:
@@ -220,6 +219,20 @@ class OrganizationEventsSpansHistogramEndpointTest(APITestCase, SnubaTestCase):
         assert response.status_code == 400, "failing for max"
         assert response.data == {"max": ["A valid number is required."]}, "failing for max"
 
+    def test_bad_params_invalid_data_filter(self):
+        query = {
+            "project": [self.project.id],
+            "span": self.format_span("django.middleware", "cd" * 8),
+            "numBuckets": 50,
+            "dataFilter": "invalid",
+        }
+
+        response = self.do_request(query)
+        assert response.status_code == 400, "failing for dataFilter"
+        assert response.data == {
+            "dataFilter": ['"invalid" is not a valid choice.']
+        }, "failing for dataFilter"
+
     def test_histogram_empty(self):
         num_buckets = 5
         query = {
@@ -314,3 +327,63 @@ class OrganizationEventsSpansHistogramEndpointTest(APITestCase, SnubaTestCase):
         for bucket in response.data:
             assert bucket["count"] == 0
         assert response.data[-1] == {"bin": max - 1, "count": 0}
+
+    def test_histogram_all_data_filter(self):
+        # populate with default spans
+        self.create_event()
+
+        spans = [
+            {
+                "same_process_as_parent": True,
+                "parent_span_id": "a" * 16,
+                "span_id": "e" * 16,
+                "start_timestamp": iso_format(self.min_ago + timedelta(seconds=1)),
+                "timestamp": iso_format(self.min_ago + timedelta(seconds=4)),
+                "op": "django.middleware",
+                "description": "middleware span",
+                "hash": "cd" * 8,
+                "exclusive_time": 60.0,
+            }
+        ]
+
+        # populate with an outlier span
+        self.create_event(spans=spans)
+        query = {
+            "project": [self.project.id],
+            "span": self.format_span("django.middleware", "cd" * 8),
+            "numBuckets": 10,
+            "dataFilter": "all",
+        }
+        response = self.do_request(query)
+        assert response.status_code == 200
+        assert response.data[-1] == {"bin": 60, "count": 1}
+
+    def test_histogram_exclude_outliers_data_filter(self):
+        # populate with default spans
+        self.create_event()
+
+        spans = [
+            {
+                "same_process_as_parent": True,
+                "parent_span_id": "a" * 16,
+                "span_id": "e" * 16,
+                "start_timestamp": iso_format(self.min_ago + timedelta(seconds=1)),
+                "timestamp": iso_format(self.min_ago + timedelta(seconds=4)),
+                "op": "django.middleware",
+                "description": "middleware span",
+                "hash": "cd" * 8,
+                "exclusive_time": 60.0,
+            }
+        ]
+
+        # populate with an outlier span
+        self.create_event(spans=spans)
+        query = {
+            "project": [self.project.id],
+            "span": self.format_span("django.middleware", "cd" * 8),
+            "numBuckets": 10,
+            "dataFilter": "exclude_outliers",
+        }
+        response = self.do_request(query)
+        assert response.status_code == 200
+        assert response.data[-1]["bin"] != 60