
feat(transactions): Endpoint to evaluate transaction clustering (#41735)

We want to automatically detect high-cardinality URL patterns so that we
can erase identifiers from transaction names. This PR implements a
tree-based algorithm that derives the corresponding replacement rules, and
exposes an internal endpoint that lets us experiment with different
clustering parameters (see the sketch after the list below).

Not in this PR:
* Cron job to run the clustering
* Persistence of detected rules or communication with Relay
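
A minimal sketch of the intended behavior (not part of this commit; the
transaction names and the threshold are invented for illustration):

    from sentry.ingest.transaction_clusterer.tree import TreeClusterer

    # With a threshold of 2, a path segment with two or more distinct
    # siblings is collapsed into "*".
    clusterer = TreeClusterer(merge_threshold=2)
    clusterer.add_input(["/users/alice/settings", "/users/bob/settings"])

    # "*" matches and replaces a segment, "**" matches the remainder.
    assert clusterer.get_rules() == ["/users/*/**"]
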
Joris Bayer 2 years ago
Parent
Commit
75e387c9db

+ 1 - 0
.github/CODEOWNERS

@@ -21,6 +21,7 @@
 # Event Ingestion
 /src/sentry/attachments/           @getsentry/owners-ingest
 /src/sentry/api/endpoints/relay/   @getsentry/owners-ingest
+/src/sentry/api/endpoints/project_transaction_names.py @getsentry/owners-ingest
 /src/sentry/coreapi.py             @getsentry/owners-ingest
 /src/sentry/ingest/                @getsentry/owners-ingest
 /src/sentry/interfaces/            @getsentry/owners-ingest

+ 1 - 0
mypy.ini

@@ -75,6 +75,7 @@ files = fixtures/mypy-stubs,
         src/sentry/grouping/strategies/utils.py,
         src/sentry/incidents/charts.py,
         src/sentry/ingest/billing_metrics_consumer.py,
+        src/sentry/ingest/transaction_clusterer/,
         src/sentry/integrations/base.py,
         src/sentry/integrations/github/,
         src/sentry/integrations/slack/,

+ 52 - 0
src/sentry/api/endpoints/project_transaction_names.py

@@ -0,0 +1,52 @@
+from rest_framework.exceptions import ParseError
+from rest_framework.request import Request
+from rest_framework.response import Response
+
+from sentry.api.base import region_silo_endpoint
+from sentry.api.bases.project import ProjectEndpoint
+from sentry.api.utils import get_date_range_from_stats_period
+from sentry.ingest.transaction_clusterer.datasource import fetch_unique_transaction_names
+from sentry.ingest.transaction_clusterer.tree import TreeClusterer
+
+
+@region_silo_endpoint
+class ProjectTransactionNamesCluster(ProjectEndpoint):
+    private = True
+
+    def get(self, request: Request, project) -> Response:
+        """Run the transaction name clusterer and return its output.
+
+        This endpoint is intended for internal evaluation of the clustering
+        algorithm, not for public usage.
+        """
+
+        params = request.GET
+        start, end = get_date_range_from_stats_period(params)
+        if start is None or end is None:
+            raise ParseError(detail="Invalid date range")
+
+        snuba_limit = int(params.get("limit", 1000))
+        merge_threshold = int(params.get("threshold", 100))
+        return_all_names = params.get("returnAllNames")
+
+        transaction_names = list(
+            fetch_unique_transaction_names(
+                project,
+                (start, end),
+                snuba_limit,
+            )
+        )
+
+        clusterer = TreeClusterer(merge_threshold=merge_threshold)
+        clusterer.add_input(transaction_names)
+
+        return Response(
+            {
+                "rules": clusterer.get_rules(),
+                "meta": {
+                    "unique_transaction_names": transaction_names
+                    if return_all_names
+                    else len(transaction_names)
+                },
+            }
+        )
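
For manual experimentation, the endpoint can be called like any other
project endpoint. A rough sketch using the requests library; the host,
slugs, auth token, and the /api/0/ URL prefix are placeholders, not part
of this change:

    import requests

    # Host, org/project slugs, token, and URL prefix are placeholders.
    response = requests.get(
        "https://sentry.example.com/api/0/my-org/my-project/cluster-transaction-names/",
        params={"statsPeriod": "1h", "limit": 1000, "threshold": 100, "returnAllNames": "1"},
        headers={"Authorization": "Bearer <auth-token>"},
    )
    response.raise_for_status()
    print(response.json()["rules"])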

+ 6 - 0
src/sentry/api/urls.py

@@ -15,6 +15,7 @@ from sentry.api.endpoints.organization_sentry_function_details import (
     OrganizationSentryFunctionDetailsEndpoint,
 )
 from sentry.api.endpoints.project_grouping_configs import ProjectGroupingConfigsEndpoint
+from sentry.api.endpoints.project_transaction_names import ProjectTransactionNamesCluster
 from sentry.api.endpoints.project_transaction_threshold_override import (
     ProjectTransactionThresholdOverrideEndpoint,
 )
@@ -2204,6 +2205,11 @@ urlpatterns = [
                     ProjectPluginDetailsEndpoint.as_view(),
                     name="sentry-api-0-project-plugin-details",
                 ),
+                url(
+                    r"^(?P<organization_slug>[^\/]+)/(?P<project_slug>[^\/]+)/cluster-transaction-names/$",
+                    ProjectTransactionNamesCluster.as_view(),
+                    name="sentry-api-0-organization-project-cluster-transaction-names",
+                ),
                 url(
                     r"^(?P<organization_slug>[^\/]+)/(?P<project_slug>[^\/]+)/plugins?/",
                     include("sentry.plugins.base.project_api_urls"),

+ 1 - 0
src/sentry/ingest/transaction_clusterer/__init__.py

@@ -0,0 +1 @@
+""" Strategies for clustering high-cardinality transaction names """

+ 24 - 0
src/sentry/ingest/transaction_clusterer/base.py

@@ -0,0 +1,24 @@
+from abc import abstractmethod
+from typing import Iterable, List, NewType
+
+#: Rule to replace high-cardinality patterns in a transaction name.
+#: For now, format these rules as simple strings
+ReplacementRule = NewType("ReplacementRule", str)
+
+
+class Clusterer:
+    """Strategy for clustering transaction names
+
+    Derives replacement rules from a given set of transaction names.
+
+    """
+
+    @abstractmethod
+    def add_input(self, transaction_name: Iterable[str]) -> None:
+        """Add a batch of transaction names to the clusterer's state"""
+        ...
+
+    @abstractmethod
+    def get_rules(self) -> List[ReplacementRule]:
+        """Compute and retrieve rules"""
+        ...
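
To illustrate the interface, a hypothetical strategy that echoes every
observed name back as a verbatim rule could look like this (purely
illustrative, not part of the change):

    from typing import Iterable, List

    from sentry.ingest.transaction_clusterer.base import Clusterer, ReplacementRule


    class IdentityClusterer(Clusterer):
        """Hypothetical no-op strategy: every input name becomes a verbatim rule."""

        def __init__(self) -> None:
            self._names: List[str] = []

        def add_input(self, transaction_names: Iterable[str]) -> None:
            self._names.extend(transaction_names)

        def get_rules(self) -> List[ReplacementRule]:
            return [ReplacementRule(name) for name in self._names]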

+ 36 - 0
src/sentry/ingest/transaction_clusterer/datasource.py

@@ -0,0 +1,36 @@
+from datetime import datetime
+from typing import Iterable, Tuple
+
+from snuba_sdk import Column, Condition, Entity, Limit, Op, Query, Request
+
+from sentry.models import Project
+from sentry.utils.snuba import raw_snql_query
+
+TRANSACTION_SOURCE = "url"
+
+
+def fetch_unique_transaction_names(
+    project: Project, time_range: Tuple[datetime, datetime], limit: int
+) -> Iterable[str]:
+    then, now = time_range
+    snuba_request = Request(
+        "transactions",
+        app_id="transactions",
+        query=Query(
+            match=Entity("transactions"),
+            select=[Column("transaction")],
+            where=[
+                Condition(Column("project_id"), Op.EQ, project.id),
+                Condition(Column("finish_ts"), Op.GTE, then),
+                Condition(Column("finish_ts"), Op.LT, now),
+                Condition(Column("transaction_source"), Op.EQ, TRANSACTION_SOURCE),
+            ],
+            groupby=[Column("transaction")],
+            limit=Limit(limit),
+        ),
+    )
+    snuba_response = raw_snql_query(
+        snuba_request, referrer="src.sentry.ingest.transaction_clusterer"
+    )
+
+    return (row["transaction"] for row in snuba_response["data"])

+ 142 - 0
src/sentry/ingest/transaction_clusterer/tree.py

@@ -0,0 +1,142 @@
+""" Build a directory tree from URL patterns and merge nodes with many siblings.
+
+For example, a blog project might contain transaction names along the lines of:
+
+/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something
+
+We build a tree that looks like this:
+
+/users
+  /my-user-name
+    /posts
+      /2022-01-01-that-one-time-i-did-something
+  /my-user-name2
+    /posts
+      /2022-12-31-i-did-something-too
+  /my-user-name3
+    /posts
+      /2022-07-15-but-what-about-me
+    /settings
+
+As soon as the node /users has reached the threshold of X children, we merge them:
+
+/users
+  /*
+    /posts
+      /2022-01-01-that-one-time-i-did-something
+      /2022-12-31-i-did-something-too
+      /2022-07-15-but-what-about-me
+    /settings
+
+If /posts now also has reached the threshold of X children, those are merged as well:
+
+/users
+  /*
+    /posts
+      *
+    /settings
+
+Resulting in the following replacement rules:
+
+/users/*/posts/*/**  # To replace both identifiers
+/users/*/**          # For URLs with only one identifier, e.g. /users/*/settings
+
+The replacement rules are interpreted by Relay to match and replace `*`, and to match but ignore `**`.
+
+"""
+
+import logging
+from collections import UserDict, defaultdict
+from typing import Iterable, List, Optional, Union
+
+import sentry_sdk
+from typing_extensions import TypeAlias
+
+from .base import Clusterer, ReplacementRule
+
+__all__ = ["TreeClusterer"]
+
+
+class Merged:
+    pass
+
+
+#: Symbol representing a merged node
+MERGED = Merged()
+
+#: Separator by which we build the tree
+SEP = "/"
+
+
+logger = logging.getLogger(__name__)
+
+
+class TreeClusterer(Clusterer):
+    def __init__(self, *, merge_threshold: int) -> None:
+        self._merge_threshold = merge_threshold
+        self._tree = Node()
+
+    def add_input(self, transaction_names: Iterable[str]) -> None:
+        for tx_name in transaction_names:
+            parts = tx_name.split(SEP)
+            node = self._tree
+            for part in parts:
+                node = node.setdefault(part, Node())
+
+    def get_rules(self) -> List[ReplacementRule]:
+        """Merge high-cardinality nodes in the graph and extract rules"""
+        with sentry_sdk.start_span(op="txcluster_merge"):
+            self._tree.merge(self._merge_threshold)
+
+        # Generate exactly 1 rule for every merge
+        rule_paths = [path for path in self._tree.paths() if path[-1] is MERGED]
+
+        # Sort by path length, descending (most specific rule first)
+        rule_paths.sort(key=len, reverse=True)
+
+        return [self._build_rule(path) for path in rule_paths]
+
+    @staticmethod
+    def _build_rule(path: List["Edge"]) -> ReplacementRule:
+        path_str = SEP.join(["*" if isinstance(key, Merged) else key for key in path])
+        path_str += "/**"
+        return ReplacementRule(path_str)
+
+
+#: Represents the edges between graph nodes. These edges serve as keys in the
+#: node dictionary.
+Edge: TypeAlias = Union[str, Merged]
+
+
+class Node(UserDict):  # type: ignore
+    """Keys in this dict are names of the children"""
+
+    def paths(self, ancestors: Optional[List[Edge]] = None) -> Iterable[List[Edge]]:
+        """Collect all paths and subpaths through the graph"""
+        if ancestors is None:
+            ancestors = []
+        for name, child in self.items():
+            path = ancestors + [name]
+            yield path
+            yield from child.paths(ancestors=path)
+
+    def merge(self, merge_threshold: int) -> None:
+        """Recursively merge children of high-cardinality nodes"""
+        if len(self) >= merge_threshold:
+            merged_children = self._merge_nodes(self.values())
+            self.clear()
+            self[MERGED] = merged_children
+
+        for child in self.values():
+            child.merge(merge_threshold)
+
+    @classmethod
+    def _merge_nodes(cls, nodes: Iterable["Node"]) -> "Node":
+        children_by_name = defaultdict(list)
+        for node in nodes:
+            for name, child in node.items():
+                children_by_name[name].append(child)
+
+        return Node(
+            {name: cls._merge_nodes(children) for name, children in children_by_name.items()}
+        )
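
A quick check that the module docstring's /users example holds
(illustrative only): with merge_threshold=3, both the user segment and the
pooled post slugs get merged, and the more specific rule sorts first:

    from sentry.ingest.transaction_clusterer.tree import TreeClusterer

    clusterer = TreeClusterer(merge_threshold=3)
    clusterer.add_input(
        [
            "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something",
            "/users/my-user-name2/posts/2022-12-31-i-did-something-too",
            "/users/my-user-name3/posts/2022-07-15-but-what-about-me",
            "/users/my-user-name3/settings",
        ]
    )
    assert clusterer.get_rules() == ["/users/*/posts/*/**", "/users/*/**"]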

+ 55 - 0
tests/sentry/api/endpoints/test_project_transaction_names.py

@@ -0,0 +1,55 @@
+from django.urls import reverse
+
+from sentry.testutils import APITestCase
+from sentry.testutils.helpers.datetime import before_now
+from sentry.testutils.silo import region_silo_test
+from sentry.utils.samples import load_data
+
+
+@region_silo_test
+class ProjectTransactionNamesClusterTest(APITestCase):
+    def setUp(self) -> None:
+        super().setUp()
+
+        self.login_as(user=self.user)
+
+        self.org = self.create_organization(owner=self.user)
+        self.project = self.create_project(organization=self.org)
+
+        self.url = reverse(
+            "sentry-api-0-organization-project-cluster-transaction-names",
+            args=[self.org.slug, self.project.slug],
+        )
+
+        for transaction in ["/a/b/c/", "/a/foo", "/a/whathever/c/d/", "/not_a/"]:
+            event = load_data(
+                "transaction",
+                timestamp=before_now(minutes=1),
+                start_timestamp=before_now(minutes=1, milliseconds=500),
+            )
+            event["transaction"] = transaction
+            event["transaction_info"] = {"source": "url"}
+            self.store_event(event, project_id=self.project.id)
+
+    def test_get(self):
+        response = self.client.get(
+            self.url,
+            data={
+                "project": [self.project.id],
+                "statsPeriod": "1h",
+                "limit": 5,
+                "threshold": 3,
+                "returnAllNames": True,
+            },
+            format="json",
+        )
+
+        assert response.status_code == 200, response.content
+        data = response.data
+        data["meta"]["unique_transaction_names"].sort()
+        assert data == {
+            "rules": ["/a/*/**"],
+            "meta": {
+                "unique_transaction_names": ["/a/b/c/", "/a/foo", "/a/whathever/c/d/", "/not_a/"]
+            },
+        }

+ 29 - 0
tests/sentry/ingest/test_transaction_clusterer.py

@@ -0,0 +1,29 @@
+from sentry.ingest.transaction_clusterer.tree import TreeClusterer
+
+
+def test_multi_fanout():
+    clusterer = TreeClusterer(merge_threshold=3)
+    transaction_names = [
+        "/a/b0/c/d0/e",
+        "/a/b0/c/d1/e",
+        "/a/b0/c/d2/e",
+        "/a/b1/c/d0/e",
+        "/a/b1/c/d1/e/",
+        "/a/b1/c/d2/e",
+        "/a/b2/c/d0/e",
+        "/a/b2/c/d1/e/",
+        "/a/b2/c/d2/e",
+        "/a/b2/c1/d2/e",
+    ]
+    clusterer.add_input(transaction_names)
+    assert clusterer.get_rules() == ["/a/*/c/*/**", "/a/*/**"]
+
+
+def test_single_leaf():
+    clusterer = TreeClusterer(merge_threshold=2)
+    transaction_names = [
+        "/a/b1/c/",
+        "/a/b2/c/",
+    ]
+    clusterer.add_input(transaction_names)
+    assert clusterer.get_rules() == ["/a/*/**"]