1 year ago · e41291f5b4
--- a/src/sentry/ingest/transaction_clusterer/rule_validator.py
+++ b/src/sentry/ingest/transaction_clusterer/rule_validator.py
@@ -1,4 +1,5 @@
 
				 from typing import Optional, Set
			
 
				+from urllib.parse import urlparse
			
 
				 
			
 
				 from .base import ReplacementRule
			
 
				 
			
@@ -9,7 +10,46 @@ class RuleValidator:
 
				         self._char_domain: Set[str] = set(char_domain) if char_domain else set("*/")
			
 
				 
			
 
				     def is_valid(self) -> bool:
			
 
				-        return not self._is_all_stars()
			
 
				+        if self._is_all_stars() or self._is_schema_and_all_stars():
			
 
				+            return False
			
 
				+        return True
			
 
				 
			
 
				     def _is_all_stars(self) -> bool:
			
 
				-        return set(self._rule) <= self._char_domain
			
 
				+        return self._is_string_all_stars(self._rule)
			
 
				+
			
 
				+    def _is_string_all_stars(self, string) -> bool:
			
 
				+        return set(string) <= self._char_domain
			
 
				+
			
 
				+    def _is_schema_and_all_stars(self) -> bool:
			
 
				+        """
			
 
				+        Return true if the rule looks like a URL scheme and stars.
			
 
				+
			
 
				+        ## Examples
			
 
				+        `http://*/*/**` -> `True`
			
 
				+        `http://domain.com/*/**` -> `False`
			
 
				+
			
 
				+        Rules composed by the scheme and all stars provide the same value as all
			
 
				+        stars: none. These rules are low quality and provide little renaming
			
 
				+        value.
			
 
				+
			
 
				+        ## Assumptions
			
 
				+        - Rules are URL-parsable, otherwise `False` is returned.
			
 
				+        - URLs don't have queries or fragments, and thus they are ignored.
			
 
				+        """
			
 
				+        # urlparse doesn't validate the input and raises an exception if the
			
 
				+        # string isn't a valid URL
			
 
				+        try:
			
 
				+            url = urlparse(self._rule)
			
 
				+        except ValueError:
			
 
				+            return False
			
 
				+
			
 
				+        # urlparse sets empty strings in `scheme` and `netloc` when these can't
			
 
				+        # be parsed.
			
 
				+        if url.scheme == "":
			
 
				+            return False
			
 
				+        # urlparse may extract the first `*` from the rule as the netloc. This
			
 
				+        # still means no netloc in the rule.
			
 
				+        if url.netloc not in ("", "*"):
			
 
				+            return False
			
 
				+
			
 
				+        return self._is_string_all_stars(url.path)
			
--- a/src/sentry/ingest/transaction_clusterer/rules.py
+++ b/src/sentry/ingest/transaction_clusterer/rules.py
@@ -5,6 +5,7 @@ import sentry_sdk
 
				 
			
 
				 from sentry.ingest.transaction_clusterer import ClustererNamespace
			
 
				 from sentry.ingest.transaction_clusterer.datasource.redis import get_redis_client
			
 
				+from sentry.ingest.transaction_clusterer.rule_validator import RuleValidator
			
 
				 from sentry.models.project import Project
			
 
				 from sentry.utils import metrics
			
 
				 
			
@@ -132,9 +133,13 @@ class CompositeRuleStore:
 
				     def merge(self, project: Project) -> None:
			
 
				         """Read rules from all stores, merge and write them back so they all are up-to-date."""
			
 
				         merged_rules = self.read(project)
			
 
				+        merged_rules = self._clean_rules(merged_rules)
			
 
				         trimmed_rules = self._trim_rules(merged_rules)
			
 
				         self.write(project, trimmed_rules)
			
 
				 
			
 
				+    def _clean_rules(self, rules: RuleSet) -> RuleSet:
			
 
				+        return {rule: seen for rule, seen in rules.items() if RuleValidator(rule).is_valid()}
			
 
				+
			
 
				     def _trim_rules(self, rules: RuleSet) -> RuleSet:
			
 
				         sorted_rules = sorted(rules.items(), key=lambda p: p[1], reverse=True)
			
 
				         last_seen_deadline = _now() - TRANSACTION_NAME_RULE_TTL_SECS
			
@@ -200,7 +205,7 @@ def get_sorted_rules(
 
				 def update_rules(
			
 
				     namespace: ClustererNamespace, project: Project, new_rules: Sequence[ReplacementRule]
			
 
				 ) -> int:
			
 
				-    """Write newly discovered rules to projection option and redis, and update last_used.
			
 
				+    """Write newly discovered rules to project option and redis, and update last_used.
			
 
				 
			
 
				     Return the number of _new_ rules (that were not already present in project option).
			
 
				     """
			
--- a/tests/sentry/ingest/test_transaction_clusterer.py
+++ b/tests/sentry/ingest/test_transaction_clusterer.py
@@ -74,6 +74,18 @@ def test_deep_tree():
 
				     clusterer.get_rules()
			
 
				 
			
 
				 
			
 
				+def test_clusterer_doesnt_generate_invalid_rules():
			
 
				+    clusterer = TreeClusterer(merge_threshold=1)
			
 
				+    all_stars = ["/a/b", "/b/c", "/c/d"]
			
 
				+    clusterer.add_input(all_stars)
			
 
				+    assert clusterer.get_rules() == []
			
 
				+
			
 
				+    clusterer = TreeClusterer(merge_threshold=2)
			
 
				+    scheme_stars = ["http://a", "http://b", "http://c"]
			
 
				+    clusterer.add_input(scheme_stars)
			
 
				+    assert clusterer.get_rules() == []
			
 
				+
			
 
				+
			
 
				 @mock.patch("sentry.ingest.transaction_clusterer.datasource.redis.MAX_SET_SIZE", 5)
			
 
				 def test_collection():
			
 
				     org = Organization(pk=666)
			
@@ -526,3 +538,22 @@ def test_bump_last_used():
 
				         "foo": 1,
			
 
				         "bar": 946688400,
			
 
				     }
			
 
				+
			
 
				+
			
 
				+@django_db_all
			
 
				+def test_stored_invalid_rules_dropped_on_update(default_project):
			
 
				+    """
			
 
				+    Validate that invalid rules are removed from storage even if they already
			
 
				+    exist there. Updates before and after the removal don't impact the outcome.
			
 
				+    """
			
 
				+    rule = ReplacementRule("http://*/*/**")
			
 
				+
			
 
				+    assert len(get_sorted_rules(ClustererNamespace.TRANSACTIONS, default_project)) == 0
			
 
				+    RedisRuleStore(namespace=ClustererNamespace.TRANSACTIONS).write(default_project, {rule: 1})
			
 
				+    assert get_redis_rules(ClustererNamespace.TRANSACTIONS, default_project) == {rule: 1}
			
 
				+    with freeze_time("2000-01-01 01:00:00"):
			
 
				+        update_rules(ClustererNamespace.TRANSACTIONS, default_project, [rule])
			
 
				+    assert get_sorted_rules(ClustererNamespace.TRANSACTIONS, default_project) == []
			
 
				+    with freeze_time("2000-01-01 01:00:00"):
			
 
				+        update_rules(ClustererNamespace.TRANSACTIONS, default_project, [rule])
			
 
				+    assert get_sorted_rules(ClustererNamespace.TRANSACTIONS, default_project) == []
			
--- a/tests/sentry/ingest/test_transaction_rule_validator.py
+++ b/tests/sentry/ingest/test_transaction_rule_validator.py
@@ -22,3 +22,17 @@ def test_non_all_star_rules_valid():
 
				 
			
 
				     assert RuleValidator(ReplacementRule("a/*/b/*/c/**")).is_valid()
			
 
				     assert RuleValidator(ReplacementRule("/a/*/b/*/c/**")).is_valid()
			
 
				+
			
 
				+
			
 
				+def test_schema_all_stars_invalid():
			
 
				+    assert not RuleValidator(ReplacementRule("http://*/*/*/**")).is_valid()
			
 
				+    assert not RuleValidator(ReplacementRule("https://*/*/*/**")).is_valid()
			
 
				+    assert RuleValidator(ReplacementRule("http://a/*/*/**")).is_valid()
			
 
				+    assert RuleValidator(ReplacementRule("http://example.com/*/*/**")).is_valid()
			
 
				+    assert RuleValidator(
			
 
				+        ReplacementRule("http://user:password@www.example.com:80/*/*/*/**")
			
 
				+    ).is_valid()
			
 
				+    assert RuleValidator(ReplacementRule("ftp:///ftp.example.com/rfcs/*/**")).is_valid()
			
 
				+    assert not RuleValidator(ReplacementRule("ftp://*/*/*/**")).is_valid()
			
 
				+    assert RuleValidator(ReplacementRule("file:///example.txt")).is_valid()
			
 
				+    assert not RuleValidator(ReplacementRule("file:///*/*/**")).is_valid()