
feat(starfish): Return correlation coeff for facets (#48706)

Returns Pearson's correlation coefficient for the top
5 facets. This will allow us to show whether a correlation
exists between a change in facet count and a change
in the p75 of transaction duration.

We're running the correlation function in Sentry because
ClickHouse does not accept aggregates inside corr() and
we currently can't do subqueries through Snuba.
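
As a cross-check, the same coefficient can be computed with numpy (a
hypothetical sketch, not part of this commit; the sample values are
invented):

    import numpy as np

    # Bucketed count() for one facet and the overall
    # p75(transaction.duration), aligned on the same rollup intervals.
    facet_counts = [5, 8, 12, 7]
    overall_p75 = [120.0, 180.0, 260.0, 150.0]

    # np.corrcoef returns the 2x2 correlation matrix; [0, 1] is Pearson's r.
    r = np.corrcoef(facet_counts, overall_p75)[0, 1]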
Shruthi · 1 year ago · commit ce23c271f2

+ 54 - 30
src/sentry/api/endpoints/organization_events_facets_stats_performance.py

@@ -12,21 +12,11 @@ from sentry.api.endpoints.organization_events_facets_performance import (
     query_facet_performance,
     query_tag_data,
 )
-from sentry.snuba.discover import top_events_timeseries
+from sentry.snuba import discover
 from sentry.snuba.referrer import Referrer
 
-ALLOWED_AGGREGATE_COLUMNS = {
-    "transaction.duration",
-    "measurements.lcp",
-    "spans.browser",
-    "spans.http",
-    "spans.db",
-    "spans.resource",
-}
-
 TAG_ALIASES = {"release": "sentry:release", "dist": "sentry:dist", "user": "sentry:user"}
-DEFAULT_TAG_KEY_LIMIT = 5
-ONE_DAY = int(timedelta(hours=6).total_seconds())
+SIX_HOURS = int(timedelta(hours=6).total_seconds())
 
 
 @region_silo_endpoint
@@ -76,7 +66,7 @@ class OrganizationEventsFacetsStatsPerformanceEndpoint(
             if not top_facets:
                 return {"data": []}
 
-            def get_event_stats(
+            def top_events_stats(
                 query_columns: Sequence[str],
                 query: str,
                 params: Dict[str, str],
@@ -84,7 +74,7 @@ class OrganizationEventsFacetsStatsPerformanceEndpoint(
                 zerofill_results: bool,
                 comparison_delta: Optional[datetime],
             ):
-                return top_events_timeseries(
+                return discover.top_events_timeseries(
                     timeseries_columns=query_columns,
                     selected_columns=["tags_key", "tags_value"],
                     top_events=top_facets,
@@ -94,29 +84,63 @@ class OrganizationEventsFacetsStatsPerformanceEndpoint(
                     # TODO: Better selection of granularity,
                     # but we generally only need pretty low granularity
                     # for this since it's only being used for sparklines
-                    rollup=ONE_DAY,
+                    rollup=SIX_HOURS,
                     limit=10000,
                     organization=None,
                     referrer=referrer,
                 )
 
-        results = self.get_event_stats_data(
-            request,
-            organization,
-            get_event_stats,
-            top_events=5,
-            query=filter_query,
-            query_column="count()",
-            additional_query_column="p75(transaction.duration)",
-        )
+            results = self.get_event_stats_data(
+                request,
+                organization,
+                top_events_stats,
+                top_events=5,
+                query=filter_query,
+                query_column="count()",
+                additional_query_column="p75(transaction.duration)",
+            )
 
-        totals = {}
-        for facet in top_facets["data"]:
-            key = facet.pop("tags_key")
-            value = facet.pop("tags_value")
-            totals[f"{key},{value}"] = facet
+            def get_event_stats(
+                query_columns: Sequence[str],
+                query: str,
+                params: Dict[str, str],
+                rollup: int,
+                zerofill_results: bool,
+                comparison_delta: Optional[datetime] = None,
+            ):
+                return discover.timeseries_query(
+                    selected_columns=query_columns,
+                    query=query,
+                    params=params,
+                    # TODO: Better selection of granularity,
+                    # but we generally only need pretty low granularity
+                    # for this since it's only being used for sparklines
+                    rollup=SIX_HOURS,
+                )
+
+            events_stats = self.get_event_stats_data(
+                request,
+                organization,
+                get_event_stats,
+                top_events=5,
+                query=filter_query,
+                query_column="p75(transaction.duration)",
+            )
+
+        with sentry_sdk.start_span(op="discover.endpoint", description="find_correlation"):
+            totals = {}
+            for facet in top_facets["data"]:
+                key = facet.pop("tags_key")
+                value = facet.pop("tags_value")
+                new_key = f"{key},{value}"
+
+                sum_correlation = discover.corr_snuba_timeseries(
+                    results[new_key]["count()"]["data"], events_stats["data"]
+                )
+                facet["sum_correlation"] = sum_correlation
+                totals[new_key] = facet
 
-        results["totals"] = totals
+            results["totals"] = totals
 
         return Response(
             results,

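For context, corr_snuba_timeseries (added below in discover.py) receives
serialized event-stats series in which each bucket is a
(timestamp, [{"count": value}]) pair; the serializer keys each bucket's
value as "count" even for p75, which is why the helper reads
datum[1][0]["count"] for both series. A hypothetical sketch of the two
shapes being correlated (values invented):

    # results[new_key]["count()"]["data"] -- per-facet event counts per bucket
    facet_series = [
        (1683000000, [{"count": 5}]),
        (1683021600, [{"count": 8}]),
    ]

    # events_stats["data"] -- overall p75(transaction.duration) per bucket
    p75_series = [
        (1683000000, [{"count": 120.0}]),
        (1683021600, [{"count": 180.0}]),
    ]
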
+ 37 - 1
src/sentry/snuba/discover.py

@@ -4,7 +4,7 @@ import random
 from collections import namedtuple
 from copy import deepcopy
 from datetime import datetime, timedelta
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import sentry_sdk
 from sentry_relay.consts import SPAN_STATUS_CODE_TO_NAME
@@ -1333,3 +1333,39 @@ def check_multihistogram_fields(fields):
         elif histogram_type == "span_op_breakdowns" and not is_span_op_breakdown(field):
             return False
     return histogram_type
+
+
+def corr_snuba_timeseries(
+    x: Sequence[Tuple[int, Sequence[Dict[str, float]]]],
+    y: Sequence[Tuple[int, Sequence[Dict[str, float]]]],
+):
+    """
+    Returns the Pearson correlation coefficient of two Snuba timeseries.
+    """
+    if len(x) != len(y):
+        return
+
+    n = len(x)
+    sum_x, sum_y, sum_xy, sum_x_squared, sum_y_squared = 0, 0, 0, 0, 0
+    for i in range(n):
+        x_datum = x[i]
+        y_datum = y[i]
+
+        x_ = x_datum[1][0]["count"]
+        y_ = y_datum[1][0]["count"]
+
+        sum_x += x_
+        sum_y += y_
+        sum_xy += x_ * y_
+        sum_x_squared += x_ * x_
+        sum_y_squared += y_ * y_
+
+    denominator = math.sqrt(
+        (n * sum_x_squared - sum_x * sum_x) * (n * sum_y_squared - sum_y * sum_y)
+    )
+    if denominator == 0:
+        return
+
+    pearsons_corr_coeff = ((n * sum_xy) - (sum_x * sum_y)) / denominator
+
+    return pearsons_corr_coeff

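A minimal usage sketch of the new helper (invented data; note that it
returns None when the series lengths differ or either series has zero
variance):

    from sentry.snuba import discover

    x = [(0, [{"count": 1.0}]), (1, [{"count": 2.0}]), (2, [{"count": 3.0}])]
    y = [(0, [{"count": 2.0}]), (1, [{"count": 4.0}]), (2, [{"count": 6.0}])]

    # Two perfectly linear series correlate at exactly 1.0.
    assert discover.corr_snuba_timeseries(x, y) == 1.0
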
+ 1 - 0
tests/snuba/api/endpoints/test_organization_events_facets_stats_performance.py

@@ -106,6 +106,7 @@ class OrganizationEventsFacetsPerformanceEndpointTest(
             "count_delta": -1.0,
             "count_range_1": 5,
             "count_range_total": 5,
+            "sum_correlation": 0.9718819143525331,
         }
 
         assert data["color,blue"]