
feat(transactions): Endpoint to evaluate transaction clustering (#41735)

We want to automatically detect high-cardinality URL patterns so that we
can erase identifiers from transaction names. This PR implements a
tree-based algorithm that derives the corresponding replacement rules, and
exposes an internal endpoint that lets us experiment with different
clustering parameters (see the sketch after the list below).

Not in this PR:
* Cron job to run the clustering
* Persistence of detected rules or communication with Relay
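
A minimal sketch of the intended behavior (not part of this commit; the
transaction names and the threshold are invented for illustration):

    from sentry.ingest.transaction_clusterer.tree import TreeClusterer

    # With a threshold of 2, a path segment with two or more distinct
    # siblings is collapsed into "*".
    clusterer = TreeClusterer(merge_threshold=2)
    clusterer.add_input(["/users/alice/settings", "/users/bob/settings"])

    # "*" matches and replaces a segment, "**" matches the remainder.
    assert clusterer.get_rules() == ["/users/*/**"]
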
Joris Bayer 2 years ago
Parent
Commit
75e387c9db

+ 1 - 0
.github/CODEOWNERS

@@ -21,6 +21,7 @@
 # Event Ingestion
 /src/sentry/attachments/           @getsentry/owners-ingest
 /src/sentry/api/endpoints/relay/   @getsentry/owners-ingest
+/src/sentry/api/endpoints/project_transaction_names.py @getsentry/owners-ingest
 /src/sentry/coreapi.py             @getsentry/owners-ingest
 /src/sentry/ingest/                @getsentry/owners-ingest
 /src/sentry/interfaces/            @getsentry/owners-ingest

+ 1 - 0
mypy.ini

@@ -75,6 +75,7 @@ files = fixtures/mypy-stubs,
         src/sentry/grouping/strategies/utils.py,
         src/sentry/incidents/charts.py,
         src/sentry/ingest/billing_metrics_consumer.py,
+        src/sentry/ingest/transaction_clusterer/,
         src/sentry/integrations/base.py,
         src/sentry/integrations/github/,
         src/sentry/integrations/slack/,

+ 52 - 0
src/sentry/api/endpoints/project_transaction_names.py

@@ -0,0 +1,52 @@
+from rest_framework.exceptions import ParseError
+from rest_framework.request import Request
+from rest_framework.response import Response
+
+from sentry.api.base import region_silo_endpoint
+from sentry.api.bases.project import ProjectEndpoint
+from sentry.api.utils import get_date_range_from_stats_period
+from sentry.ingest.transaction_clusterer.datasource import fetch_unique_transaction_names
+from sentry.ingest.transaction_clusterer.tree import TreeClusterer
+
+
+@region_silo_endpoint
+class ProjectTransactionNamesCluster(ProjectEndpoint):
+    private = True
+
+    def get(self, request: Request, project) -> Response:
+        """Run the transaction name clusterer and return its output.
+
+        This endpoint is intended for internal evaluation of the clustering
+        algorithm, not for public usage.
+        """
+
+        params = request.GET
+        start, end = get_date_range_from_stats_period(params)
+        if start is None or end is None:
+            raise ParseError(detail="Invalid date range")
+
+        snuba_limit = int(params.get("limit", 1000))
+        merge_threshold = int(params.get("threshold", 100))
+        return_all_names = params.get("returnAllNames")
+
+        transaction_names = list(
+            fetch_unique_transaction_names(
+                project,
+                (start, end),
+                snuba_limit,
+            )
+        )
+
+        clusterer = TreeClusterer(merge_threshold=merge_threshold)
+        clusterer.add_input(transaction_names)
+
+        return Response(
+            {
+                "rules": clusterer.get_rules(),
+                "meta": {
+                    "unique_transaction_names": transaction_names
+                    if return_all_names
+                    else len(transaction_names)
+                },
+            }
+        )
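
For manual experimentation, the endpoint can be called like any other
project endpoint. A rough sketch using the requests library; the host,
slugs, auth token, and the /api/0/ URL prefix are placeholders, not part
of this change:

    import requests

    # Host, org/project slugs, token, and URL prefix are placeholders.
    response = requests.get(
        "https://sentry.example.com/api/0/my-org/my-project/cluster-transaction-names/",
        params={"statsPeriod": "1h", "limit": 1000, "threshold": 100, "returnAllNames": "1"},
        headers={"Authorization": "Bearer <auth-token>"},
    )
    response.raise_for_status()
    print(response.json()["rules"])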

+ 6 - 0
src/sentry/api/urls.py

@@ -15,6 +15,7 @@ from sentry.api.endpoints.organization_sentry_function_details import (
     OrganizationSentryFunctionDetailsEndpoint,
 )
 from sentry.api.endpoints.project_grouping_configs import ProjectGroupingConfigsEndpoint
+from sentry.api.endpoints.project_transaction_names import ProjectTransactionNamesCluster
 from sentry.api.endpoints.project_transaction_threshold_override import (
     ProjectTransactionThresholdOverrideEndpoint,
 )
@@ -2204,6 +2205,11 @@ urlpatterns = [
                     ProjectPluginDetailsEndpoint.as_view(),
                     name="sentry-api-0-project-plugin-details",
                 ),
+                url(
+                    r"^(?P<organization_slug>[^\/]+)/(?P<project_slug>[^\/]+)/cluster-transaction-names/$",
+                    ProjectTransactionNamesCluster.as_view(),
+                    name="sentry-api-0-organization-project-cluster-transaction-names",
+                ),
                 url(
                     r"^(?P<organization_slug>[^\/]+)/(?P<project_slug>[^\/]+)/plugins?/",
                     include("sentry.plugins.base.project_api_urls"),

+ 1 - 0
src/sentry/ingest/transaction_clusterer/__init__.py

@@ -0,0 +1 @@
+""" Strategies for clustering high-cardinality transaction names """

+ 24 - 0
src/sentry/ingest/transaction_clusterer/base.py

@@ -0,0 +1,24 @@
+from abc import abstractmethod
+from typing import Iterable, List, NewType
+
+#: Rule to replace high-cardinality patterns in a transaction name.
+#: For now, format these rules as simple strings
+ReplacementRule = NewType("ReplacementRule", str)
+
+
+class Clusterer:
+    """Strategy for clustering transaction names
+
+    Derives replacement rules from a given set of transaction names.
+
+    """
+
+    @abstractmethod
+    def add_input(self, transaction_name: Iterable[str]) -> None:
+        """Add a batch of transaction names to the clusterer's state"""
+        ...
+
+    @abstractmethod
+    def get_rules(self) -> List[ReplacementRule]:
+        """Compute and retrieve rules"""
+        ...
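
To illustrate the interface, a hypothetical strategy that echoes every
observed name back as a verbatim rule could look like this (purely
illustrative, not part of the change):

    from typing import Iterable, List

    from sentry.ingest.transaction_clusterer.base import Clusterer, ReplacementRule


    class IdentityClusterer(Clusterer):
        """Hypothetical no-op strategy: every input name becomes a verbatim rule."""

        def __init__(self) -> None:
            self._names: List[str] = []

        def add_input(self, transaction_names: Iterable[str]) -> None:
            self._names.extend(transaction_names)

        def get_rules(self) -> List[ReplacementRule]:
            return [ReplacementRule(name) for name in self._names]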

+ 36 - 0
src/sentry/ingest/transaction_clusterer/datasource.py

@@ -0,0 +1,36 @@
+from datetime import datetime
+from typing import Iterable, Tuple
+
+from snuba_sdk import Column, Condition, Entity, Limit, Op, Query, Request
+
+from sentry.models import Project
+from sentry.utils.snuba import raw_snql_query
+
+TRANSACTION_SOURCE = "url"
+
+
+def fetch_unique_transaction_names(
+    project: Project, time_range: Tuple[datetime, datetime], limit: int
+) -> Iterable[str]:
+    then, now = time_range
+    snuba_request = Request(
+        "transactions",
+        app_id="transactions",
+        query=Query(
+            match=Entity("transactions"),
+            select=[Column("transaction")],
+            where=[
+                Condition(Column("project_id"), Op.EQ, project.id),
+                Condition(Column("finish_ts"), Op.GTE, then),
+                Condition(Column("finish_ts"), Op.LT, now),
+                Condition(Column("transaction_source"), Op.EQ, TRANSACTION_SOURCE),
+            ],
+            groupby=[Column("transaction")],
+            limit=Limit(limit),
+        ),
+    )
+    snuba_response = raw_snql_query(
+        snuba_request, referrer="src.sentry.ingest.transaction_clusterer"
+    )
+
+    return (row["transaction"] for row in snuba_response["data"])

+ 142 - 0
src/sentry/ingest/transaction_clusterer/tree.py

@@ -0,0 +1,142 @@
+""" Build a directory tree from URL patterns and merge nodes with many siblings.
+
+For example, a blog project might contain transaction names along the lines of:
+
+/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something
+
+We build a tree that looks like this:
+
+/users
+  /my-user-name
+    /posts
+      /2022-01-01-that-one-time-i-did-something
+  /my-user-name2
+    /posts
+      /2022-12-31-i-did-something-too
+  /my-user-name3
+    /posts
+      /2022-07-15-but-what-about-me
+    /settings
+
+As soon as the node /users has reached the threshold of X children, we merge them:
+
+/users
+  /*
+    /posts
+      /2022-01-01-that-one-time-i-did-something
+      /2022-12-31-i-did-something-too
+      /2022-07-15-but-what-about-me
+    /settings
+
+If /posts now also has reached the threshold of X children, those are merged as well:
+
+/users
+  /*
+    /posts
+      *
+    /settings
+
+Resulting in the following replacement rules:
+
+/users/*/posts/*/**  # To replace both identifiers
+/users/*/**          # For URLs with only one identifier, e.g. /users/*/settings
+
+The replacement rules are interpreted by Relay to match and replace `*`, and to match but ignore `**`.
+
+"""
+
+import logging
+from collections import UserDict, defaultdict
+from typing import Iterable, List, Optional, Union
+
+import sentry_sdk
+from typing_extensions import TypeAlias
+
+from .base import Clusterer, ReplacementRule
+
+__all__ = ["TreeClusterer"]
+
+
+class Merged:
+    pass
+
+
+#: Symbol representing a merged node
+MERGED = Merged()
+
+#: Separator by which we build the tree
+SEP = "/"
+
+
+logger = logging.getLogger(__name__)
+
+
+class TreeClusterer(Clusterer):
+    def __init__(self, *, merge_threshold: int) -> None:
+        self._merge_threshold = merge_threshold
+        self._tree = Node()
+
+    def add_input(self, transaction_names: Iterable[str]) -> None:
+        for tx_name in transaction_names:
+            parts = tx_name.split(SEP)
+            node = self._tree
+            for part in parts:
+                node = node.setdefault(part, Node())
+
+    def get_rules(self) -> List[ReplacementRule]:
+        """Merge high-cardinality nodes in the graph and extract rules"""
+        with sentry_sdk.start_span(op="txcluster_merge"):
+            self._tree.merge(self._merge_threshold)
+
+        # Generate exactly 1 rule for every merge
+        rule_paths = [path for path in self._tree.paths() if path[-1] is MERGED]
+
+        # Sort by path length, descending (most specific rule first)
+        rule_paths.sort(key=len, reverse=True)
+
+        return [self._build_rule(path) for path in rule_paths]
+
+    @staticmethod
+    def _build_rule(path: List["Edge"]) -> ReplacementRule:
+        path_str = SEP.join(["*" if isinstance(key, Merged) else key for key in path])
+        path_str += "/**"
+        return ReplacementRule(path_str)
+
+
+#: Represents the edges between graph nodes. These edges serve as keys in the
+#: node dictionary.
+Edge: TypeAlias = Union[str, Merged]
+
+
+class Node(UserDict):  # type: ignore
+    """Keys in this dict are names of the children"""
+
+    def paths(self, ancestors: Optional[List[Edge]] = None) -> Iterable[List[Edge]]:
+        """Collect all paths and subpaths through the graph"""
+        if ancestors is None:
+            ancestors = []
+        for name, child in self.items():
+            path = ancestors + [name]
+            yield path
+            yield from child.paths(ancestors=path)
+
+    def merge(self, merge_threshold: int) -> None:
+        """Recursively merge children of high-cardinality nodes"""
+        if len(self) >= merge_threshold:
+            merged_children = self._merge_nodes(self.values())
+            self.clear()
+            self[MERGED] = merged_children
+
+        for child in self.values():
+            child.merge(merge_threshold)
+
+    @classmethod
+    def _merge_nodes(cls, nodes: Iterable["Node"]) -> "Node":
+        children_by_name = defaultdict(list)
+        for node in nodes:
+            for name, child in node.items():
+                children_by_name[name].append(child)
+
+        return Node(
+            {name: cls._merge_nodes(children) for name, children in children_by_name.items()}
+        )
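
A quick check that the module docstring's /users example holds
(illustrative only): with merge_threshold=3, both the user segment and the
pooled post slugs get merged, and the more specific rule sorts first:

    from sentry.ingest.transaction_clusterer.tree import TreeClusterer

    clusterer = TreeClusterer(merge_threshold=3)
    clusterer.add_input(
        [
            "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something",
            "/users/my-user-name2/posts/2022-12-31-i-did-something-too",
            "/users/my-user-name3/posts/2022-07-15-but-what-about-me",
            "/users/my-user-name3/settings",
        ]
    )
    assert clusterer.get_rules() == ["/users/*/posts/*/**", "/users/*/**"]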

+ 55 - 0
tests/sentry/api/endpoints/test_project_transaction_names.py

@@ -0,0 +1,55 @@
+from django.urls import reverse
+
+from sentry.testutils import APITestCase
+from sentry.testutils.helpers.datetime import before_now
+from sentry.testutils.silo import region_silo_test
+from sentry.utils.samples import load_data
+
+
+@region_silo_test
+class ProjectTransactionNamesClusterTest(APITestCase):
+    def setUp(self) -> None:
+        super().setUp()
+
+        self.login_as(user=self.user)
+
+        self.org = self.create_organization(owner=self.user)
+        self.project = self.create_project(organization=self.org)
+
+        self.url = reverse(
+            "sentry-api-0-organization-project-cluster-transaction-names",
+            args=[self.org.slug, self.project.slug],
+        )
+
+        for transaction in ["/a/b/c/", "/a/foo", "/a/whathever/c/d/", "/not_a/"]:
+            event = load_data(
+                "transaction",
+                timestamp=before_now(minutes=1),
+                start_timestamp=before_now(minutes=1, milliseconds=500),
+            )
+            event["transaction"] = transaction
+            event["transaction_info"] = {"source": "url"}
+            self.store_event(event, project_id=self.project.id)
+
+    def test_get(self):
+        response = self.client.get(
+            self.url,
+            data={
+                "project": [self.project.id],
+                "statsPeriod": "1h",
+                "limit": 5,
+                "threshold": 3,
+                "returnAllNames": True,
+            },
+            format="json",
+        )
+
+        assert response.status_code == 200, response.content
+        data = response.data
+        data["meta"]["unique_transaction_names"].sort()
+        assert data == {
+            "rules": ["/a/*/**"],
+            "meta": {
+                "unique_transaction_names": ["/a/b/c/", "/a/foo", "/a/whathever/c/d/", "/not_a/"]
+            },
+        }

+ 29 - 0
tests/sentry/ingest/test_transaction_clusterer.py

@@ -0,0 +1,29 @@
+from sentry.ingest.transaction_clusterer.tree import TreeClusterer
+
+
+def test_multi_fanout():
+    clusterer = TreeClusterer(merge_threshold=3)
+    transaction_names = [
+        "/a/b0/c/d0/e",
+        "/a/b0/c/d1/e",
+        "/a/b0/c/d2/e",
+        "/a/b1/c/d0/e",
+        "/a/b1/c/d1/e/",
+        "/a/b1/c/d2/e",
+        "/a/b2/c/d0/e",
+        "/a/b2/c/d1/e/",
+        "/a/b2/c/d2/e",
+        "/a/b2/c1/d2/e",
+    ]
+    clusterer.add_input(transaction_names)
+    assert clusterer.get_rules() == ["/a/*/c/*/**", "/a/*/**"]
+
+
+def test_single_leaf():
+    clusterer = TreeClusterer(merge_threshold=2)
+    transaction_names = [
+        "/a/b1/c/",
+        "/a/b2/c/",
+    ]
+    clusterer.add_input(transaction_names)
+    assert clusterer.get_rules() == ["/a/*/**"]