Просмотр исходного кода

fix(tx-clusterer): Exclude 404 transactions from clustering (#44286)

404 transactions are currently provided as input for the transaction
clusterer and degrade the quality of produced rules, creating rules that
are too aggressive due to the high cardinality on every level of the
tree. This commit excludes those transactions before providing them to the
clusterer. The way to determine whether a transaction is coming from a
404 is looking at the tag `http.status_code` -- a transaction is
considered a 404 if that tag exists and its value is `404`.
Iker Barriocanal 2 лет назад
Родитель
Сommit
860acdbed9

+ 1 - 0
src/sentry/ingest/transaction_clusterer/datasource/__init__.py

@@ -1,2 +1,3 @@
 TRANSACTION_SOURCE_URL = "url"
 TRANSACTION_SOURCE_SANITIZED = "sanitized"
+HTTP_404_TAG = ["http.status_code", "404"]

+ 32 - 17
src/sentry/ingest/transaction_clusterer/datasource/redis.py

@@ -7,6 +7,7 @@ from django.conf import settings
 
 from sentry import features
 from sentry.ingest.transaction_clusterer.datasource import (
+    HTTP_404_TAG,
     TRANSACTION_SOURCE_SANITIZED,
     TRANSACTION_SOURCE_URL,
 )
@@ -82,27 +83,41 @@ def clear_transaction_names(project: Project) -> None:
 
 
 def record_transaction_name(project: Project, event_data: Mapping[str, Any], **kwargs: Any) -> None:
-    transaction_info = event_data.get("transaction_info") or {}
-
     transaction_name = event_data.get("transaction")
-    source = transaction_info.get("source")
-    if transaction_name and features.has(
-        "organizations:transaction-name-clusterer", project.organization
+
+    if (
+        transaction_name
+        and features.has("organizations:transaction-name-clusterer", project.organization)
+        and _should_store_transaction_name(event_data)
     ):
-        # For now, we also feed back transactions into the clustering algorithm
-        # that have already been sanitized, so we have a chance to discover
-        # more high cardinality segments after partial sanitation.
-        # For example, we may have sanitized `/orgs/*/projects/foo`,
-        # But the clusterer has yet to discover `/orgs/*/projects/*`.
-        #
-        # Disadvantage: the load on redis does not decrease over time.
-        #
-        if source in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED):
-            safe_execute(
-                _store_transaction_name, project, transaction_name, _with_transaction=False
-            )
+        safe_execute(_store_transaction_name, project, transaction_name, _with_transaction=False)
+
         # TODO: For every transaction that had a rule applied to it, we should
         # bump the rule's lifetime here such that it stays alive while it is
         # being used.
         # For that purpose, we need to add the applied rule to the transaction
         # payload so we can check it here.
+
+
+def _should_store_transaction_name(event_data: Mapping[str, Any]) -> bool:
+    """Returns whether the given event must be stored as input for the
+    transaction clusterer."""
+    tags = event_data.get("tags")
+    transaction_info = event_data.get("transaction_info") or {}
+    source = transaction_info.get("source")
+
+    # For now, we also feed back transactions into the clustering algorithm
+    # that have already been sanitized, so we have a chance to discover
+    # more high cardinality segments after partial sanitation.
+    # For example, we may have sanitized `/orgs/*/projects/foo`,
+    # But the clusterer has yet to discover `/orgs/*/projects/*`.
+    #
+    # Disadvantage: the load on redis does not decrease over time.
+    #
+    if source not in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED):
+        return False
+
+    if tags and HTTP_404_TAG in tags:
+        return False
+
+    return True

+ 10 - 8
tests/sentry/ingest/test_transaction_clusterer.py

@@ -103,24 +103,26 @@ def test_distribution():
 @mock.patch("sentry.ingest.transaction_clusterer.datasource.redis._store_transaction_name")
 @pytest.mark.django_db
 @pytest.mark.parametrize(
-    "source,txname,feature_enabled,expected",
+    "source, txname, tags, feature_enabled, expected",
     [
-        ("url", "/a/b/c", True, 1),
-        ("route", "/", True, 0),
-        ("url", None, True, 0),
-        ("url", "/", False, 0),
-        ("route", None, False, 0),
+        ("url", "/a/b/c", [["transaction", "/a/b/c"]], True, 1),
+        ("url", "/a/b/c", [["http.status_code", "200"]], True, 1),
+        ("route", "/", [["transaction", "/"]], True, 0),
+        ("url", None, [], True, 0),
+        ("url", "/a/b/c", [["http.status_code", "404"]], True, 0),
+        ("url", "/", [["transaction", "/"]], False, 0),
+        ("route", None, [], False, 0),
     ],
 )
 def test_record_transactions(
-    mocked_record, default_organization, source, txname, feature_enabled, expected
+    mocked_record, default_organization, source, txname, tags, feature_enabled, expected
 ):
     with Feature({"organizations:transaction-name-clusterer": feature_enabled}):
         project = Project(id=111, name="project", organization_id=default_organization.id)
         record_transaction_name(
             project,
             {
-                "tags": [["transaction", txname]],
+                "tags": tags,
                 "transaction": txname,
                 "transaction_info": {"source": source},
             },