Просмотр исходного кода

feat(txnames): Increase threshold (#46303)

We currently merge a node in the URL tree if it has > 100 children,
assuming they are high-cardinality identifiers. But there are legitimate
cases where websites have > 100 static pages, which we should not treat
as such.
 
This PR sets the threshold to 200.

Increasing threshold requires also increasing the sample size, to keep
the probability of discovering a new rule more or less the same.

https://github.com/getsentry/sentry/pull/45573 already introduced this
parameter change, but now it does not depend on a feature flag anymore.

Fixes https://github.com/getsentry/team-ingest/issues/92
Joris Bayer 1 год назад
Родитель
Сommit
c7421166b5

+ 0 - 2
src/sentry/conf/server.py

@@ -1141,8 +1141,6 @@ SENTRY_FEATURES = {
     "organizations:transaction-name-mark-scrubbed-as-sanitized": False,
     # Try to derive normalization rules by clustering transaction names.
     "organizations:transaction-name-clusterer": False,
-    # Use a larger sample size & merge threshold for transaction clustering.
-    "organizations:transaction-name-clusterer-2x": False,
     # Sanitize transaction names in the ingestion pipeline.
     "organizations:transaction-name-sanitization": False,  # DEPRECATED
     # Extraction metrics for transactions during ingestion.

+ 0 - 1
src/sentry/features/__init__.py

@@ -180,7 +180,6 @@ default_manager.add("organizations:team-roles", OrganizationFeature, FeatureHand
 default_manager.add("organizations:transaction-name-normalize", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
 default_manager.add("organizations:transaction-name-mark-scrubbed-as-sanitized", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
 default_manager.add("organizations:transaction-name-clusterer", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
-default_manager.add("organizations:transaction-name-clusterer-2x", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
 default_manager.add("organizations:transaction-name-sanitization", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
 default_manager.add("organizations:transaction-metrics-extraction", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
 default_manager.add("organizations:use-metrics-layer", OrganizationFeature, FeatureHandlerStrategy.REMOTE)

+ 2 - 5
src/sentry/ingest/transaction_clusterer/datasource/redis.py

@@ -17,7 +17,7 @@ from sentry.utils.safe import safe_execute
 
 #: Maximum number of transaction names per project that we want
 #: to store in redis.
-MAX_SET_SIZE = 1000
+MAX_SET_SIZE = 2000
 
 #: Retention of a set.
 #: Remove the set if it has not received any updates for 24 hours.
@@ -64,10 +64,7 @@ def _store_transaction_name(project: Project, transaction_name: str) -> None:
     with sentry_sdk.start_span(op="txcluster.store_transaction_name"):
         client = get_redis_client()
         redis_key = _get_redis_key(project)
-        max_set_size = MAX_SET_SIZE
-        if features.has("organizations:transaction-name-clusterer-2x", project.organization):
-            max_set_size = 2 * MAX_SET_SIZE
-        add_to_set(client, [redis_key], [transaction_name, max_set_size, SET_TTL])
+        add_to_set(client, [redis_key], [transaction_name, MAX_SET_SIZE, SET_TTL])
 
 
 def get_transaction_names(project: Project) -> Iterator[str]:

+ 2 - 7
src/sentry/ingest/transaction_clusterer/tasks.py

@@ -19,7 +19,7 @@ from .tree import TreeClusterer
 #: Minimum number of children in the URL tree which triggers a merge.
 #: See TreeClusterer for more information.
 #: NOTE: We could make this configurable through django settings or even per-project in the future.
-MERGE_THRESHOLD = 100
+MERGE_THRESHOLD = 200
 
 #: Number of projects to process in one celery task
 #: The number 100 was chosen at random and might still need tweaking.
@@ -62,12 +62,7 @@ def cluster_projects(projects: Sequence[Project]) -> None:
         if features.has("organizations:transaction-name-clusterer", project.organization):
             with sentry_sdk.start_span(op="txcluster_project") as span:
                 span.set_data("project_id", project.id)
-                merge_threshold = MERGE_THRESHOLD
-                if features.has(
-                    "organizations:transaction-name-clusterer-2x", project.organization
-                ):
-                    merge_threshold = 2 * MERGE_THRESHOLD
-                clusterer = TreeClusterer(merge_threshold=merge_threshold)
+                clusterer = TreeClusterer(merge_threshold=MERGE_THRESHOLD)
                 clusterer.add_input(redis.get_transaction_names(project))
                 new_rules = clusterer.get_rules()
                 rules.update_rules(project, new_rules)

+ 0 - 35
tests/sentry/ingest/test_transaction_clusterer.py

@@ -238,41 +238,6 @@ def test_run_clusterer_task(cluster_projects_delay, default_organization):
         }
 
 
-@mock.patch("django.conf.settings.SENTRY_TRANSACTION_CLUSTERER_RUN", True)
-@mock.patch("sentry.ingest.transaction_clusterer.tasks.MERGE_THRESHOLD", 2)
-@mock.patch("sentry.ingest.transaction_clusterer.datasource.redis.MAX_SET_SIZE", 2)
-@pytest.mark.django_db
-@pytest.mark.parametrize("use_larger", (False, True))
-def test_larger_threshold_and_sample_size(default_organization, use_larger):
-    with Feature(
-        {
-            "organizations:transaction-name-clusterer": True,
-            "organizations:transaction-name-clusterer-2x": use_larger,
-        }
-    ):
-        project = Project(id=123, name="project1", organization_id=default_organization.id)
-        project.save()
-        _store_transaction_name(project, "/foo/foo")
-        _store_transaction_name(project, "/foo/bar")
-        _store_transaction_name(project, "/foo/baz")
-
-        cluster_projects([project])
-
-        rules = set(_get_rules(project).keys())
-        if use_larger:
-            assert rules == set()
-        else:
-            assert rules == {"/foo/*/**"}
-
-        # Add another one, now the rule should be there:
-        _store_transaction_name(project, "/foo/foo")
-        _store_transaction_name(project, "/foo/bar")
-        _store_transaction_name(project, "/foo/baz")
-        _store_transaction_name(project, "/foo/zap")
-        cluster_projects([project])
-        assert set(_get_rules(project).keys()) == {"/foo/*/**"}
-
-
 @pytest.mark.django_db
 def test_get_deleted_project():
     deleted_project = Project(pk=666, organization=Organization(pk=666))