Browse Source

fix(clusterer): Exclude schema+stars rules from clusterer (#62076)

This PR excludes rules composed of a scheme and all-stars, like
`http://*/*/**`. The clusterer currently drops rules composed by
all-stars, like `*/*/**`.
Iker Barriocanal 1 year ago
parent
commit
e41291f5b4

+ 42 - 2
src/sentry/ingest/transaction_clusterer/rule_validator.py

@@ -1,4 +1,5 @@
 from typing import Optional, Set
+from urllib.parse import urlparse
 
 from .base import ReplacementRule
 
@@ -9,7 +10,46 @@ class RuleValidator:
         self._char_domain: Set[str] = set(char_domain) if char_domain else set("*/")
 
     def is_valid(self) -> bool:
-        return not self._is_all_stars()
+        if self._is_all_stars() or self._is_schema_and_all_stars():
+            return False
+        return True
 
     def _is_all_stars(self) -> bool:
-        return set(self._rule) <= self._char_domain
+        return self._is_string_all_stars(self._rule)
+
+    def _is_string_all_stars(self, string) -> bool:
+        return set(string) <= self._char_domain
+
+    def _is_schema_and_all_stars(self) -> bool:
+        """
+        Return true if the rule looks like a URL scheme and stars.
+
+        ## Examples
+        `http://*/*/**` -> `True`
+        `http://domain.com/*/**` -> `False`
+
+        Rules composed by the scheme and all stars provide the same value as all
+        stars: none. These rules are low quality and provide little renaming
+        value.
+
+        ## Assumptions
+        - Rules are URL-parsable, otherwise `False` is returned.
+        - URLs don't have queries or fragments, and thus they are ignored.
+        """
+        # urlparse doesn't validate the input and raises an exception if the
+        # string isn't a valid URL
+        try:
+            url = urlparse(self._rule)
+        except ValueError:
+            return False
+
+        # urlparse sets empty strings in `scheme` and `netloc` when these can't
+        # be parsed.
+        if url.scheme == "":
+            return False
+        # urlparse may extract the first `*` from the rule as the netloc. This
+        # still means no netloc in the rule.
+        if url.netloc not in ("", "*"):
+            return False
+
+        return self._is_string_all_stars(url.path)

+ 6 - 1
src/sentry/ingest/transaction_clusterer/rules.py

@@ -5,6 +5,7 @@ import sentry_sdk
 
 from sentry.ingest.transaction_clusterer import ClustererNamespace
 from sentry.ingest.transaction_clusterer.datasource.redis import get_redis_client
+from sentry.ingest.transaction_clusterer.rule_validator import RuleValidator
 from sentry.models.project import Project
 from sentry.utils import metrics
 
@@ -132,9 +133,13 @@ class CompositeRuleStore:
     def merge(self, project: Project) -> None:
         """Read rules from all stores, merge and write them back so they all are up-to-date."""
         merged_rules = self.read(project)
+        merged_rules = self._clean_rules(merged_rules)
         trimmed_rules = self._trim_rules(merged_rules)
         self.write(project, trimmed_rules)
 
+    def _clean_rules(self, rules: RuleSet) -> RuleSet:
+        return {rule: seen for rule, seen in rules.items() if RuleValidator(rule).is_valid()}
+
     def _trim_rules(self, rules: RuleSet) -> RuleSet:
         sorted_rules = sorted(rules.items(), key=lambda p: p[1], reverse=True)
         last_seen_deadline = _now() - TRANSACTION_NAME_RULE_TTL_SECS
@@ -200,7 +205,7 @@ def get_sorted_rules(
 def update_rules(
     namespace: ClustererNamespace, project: Project, new_rules: Sequence[ReplacementRule]
 ) -> int:
-    """Write newly discovered rules to projection option and redis, and update last_used.
+    """Write newly discovered rules to project option and redis, and update last_used.
 
     Return the number of _new_ rules (that were not already present in project option).
     """

+ 31 - 0
tests/sentry/ingest/test_transaction_clusterer.py

@@ -74,6 +74,18 @@ def test_deep_tree():
     clusterer.get_rules()
 
 
+def test_clusterer_doesnt_generate_invalid_rules():
+    clusterer = TreeClusterer(merge_threshold=1)
+    all_stars = ["/a/b", "/b/c", "/c/d"]
+    clusterer.add_input(all_stars)
+    assert clusterer.get_rules() == []
+
+    clusterer = TreeClusterer(merge_threshold=2)
+    scheme_stars = ["http://a", "http://b", "http://c"]
+    clusterer.add_input(scheme_stars)
+    assert clusterer.get_rules() == []
+
+
 @mock.patch("sentry.ingest.transaction_clusterer.datasource.redis.MAX_SET_SIZE", 5)
 def test_collection():
     org = Organization(pk=666)
@@ -526,3 +538,22 @@ def test_bump_last_used():
         "foo": 1,
         "bar": 946688400,
     }
+
+
+@django_db_all
+def test_stored_invalid_rules_dropped_on_update(default_project):
+    """
+    Validate that invalid rules are removed from storage even if they already
+    exist there. Updates before and after the removal don't impact the outcome.
+    """
+    rule = ReplacementRule("http://*/*/**")
+
+    assert len(get_sorted_rules(ClustererNamespace.TRANSACTIONS, default_project)) == 0
+    RedisRuleStore(namespace=ClustererNamespace.TRANSACTIONS).write(default_project, {rule: 1})
+    assert get_redis_rules(ClustererNamespace.TRANSACTIONS, default_project) == {rule: 1}
+    with freeze_time("2000-01-01 01:00:00"):
+        update_rules(ClustererNamespace.TRANSACTIONS, default_project, [rule])
+    assert get_sorted_rules(ClustererNamespace.TRANSACTIONS, default_project) == []
+    with freeze_time("2000-01-01 01:00:00"):
+        update_rules(ClustererNamespace.TRANSACTIONS, default_project, [rule])
+    assert get_sorted_rules(ClustererNamespace.TRANSACTIONS, default_project) == []

+ 14 - 0
tests/sentry/ingest/test_transaction_rule_validator.py

@@ -22,3 +22,17 @@ def test_non_all_star_rules_valid():
 
     assert RuleValidator(ReplacementRule("a/*/b/*/c/**")).is_valid()
     assert RuleValidator(ReplacementRule("/a/*/b/*/c/**")).is_valid()
+
+
+def test_schema_all_stars_invalid():
+    assert not RuleValidator(ReplacementRule("http://*/*/*/**")).is_valid()
+    assert not RuleValidator(ReplacementRule("https://*/*/*/**")).is_valid()
+    assert RuleValidator(ReplacementRule("http://a/*/*/**")).is_valid()
+    assert RuleValidator(ReplacementRule("http://example.com/*/*/**")).is_valid()
+    assert RuleValidator(
+        ReplacementRule("http://user:password@www.example.com:80/*/*/*/**")
+    ).is_valid()
+    assert RuleValidator(ReplacementRule("ftp:///ftp.example.com/rfcs/*/**")).is_valid()
+    assert not RuleValidator(ReplacementRule("ftp://*/*/*/**")).is_valid()
+    assert RuleValidator(ReplacementRule("file:///example.txt")).is_valid()
+    assert not RuleValidator(ReplacementRule("file:///*/*/**")).is_valid()