3 years ago · 91a8e52e3e
--- a/mypy.ini
+++ b/mypy.ini
@@ -50,6 +50,7 @@ files = src/sentry/api/bases/external_actor.py,
 
				         src/sentry/shared_integrations/constants.py,
			
 
				         src/sentry/snuba/outcomes.py,
			
 
				         src/sentry/snuba/query_subscription_consumer.py,
			
 
				+        src/sentry/spans/**/*.py,
			
 
				         src/sentry/tasks/app_store_connect.py,
			
 
				         src/sentry/tasks/update_user_reports.py,
			
 
				         src/sentry/unmerge.py,
			
--- a/src/sentry/event_manager.py
+++ b/src/sentry/event_manager.py
@@ -1697,6 +1697,27 @@ def _calculate_event_grouping(project, event, grouping_config) -> CalculatedHash
 
				     return hashes
			
 
				 
			
 
				 
			
 
				+def _calculate_span_grouping(jobs, projects):
			
 
				+    for job in jobs:
			
 
				+        # Make sure this snippet doesn't crash ingestion
			
 
				+        # as the feature is under development.
			
 
				+        try:
			
 
				+            event = job["event"]
			
 
				+            project = projects[job["project_id"]]
			
 
				+
			
 
				+            if not features.has(
			
 
				+                "organizations:performance-suspect-spans-ingestion",
			
 
				+                project.organization,
			
 
				+                actor=None,
			
 
				+            ):
			
 
				+                continue
			
 
				+
			
 
				+            groupings = event.get_span_groupings()
			
 
				+            groupings.write_to_event(event.data)
			
 
				+        except Exception:
			
 
				+            sentry_sdk.capture_exception()
			
 
				+
			
 
				+
			
 
				 @metrics.wraps("event_manager.save_transaction_events")
			
 
				 def save_transaction_events(jobs, projects):
			
 
				     with metrics.timer("event_manager.save_transactions.collect_organization_ids"):
			
@@ -1730,6 +1751,7 @@ def save_transaction_events(jobs, projects):
 
				     _get_event_user_many(jobs, projects)
			
 
				     _derive_plugin_tags_many(jobs, projects)
			
 
				     _derive_interface_tags_many(jobs)
			
 
				+    _calculate_span_grouping(jobs, projects)
			
 
				     _materialize_metadata_many(jobs)
			
 
				     _get_or_create_environment_many(jobs, projects)
			
 
				     _get_or_create_release_associated_models(jobs, projects)
			
--- a/src/sentry/eventstore/models.py
+++ b/src/sentry/eventstore/models.py
@@ -16,6 +16,7 @@ from sentry.grouping.result import CalculatedHashes
 
				 from sentry.interfaces.base import get_interfaces
			
 
				 from sentry.models import EventDict
			
 
				 from sentry.snuba.events import Columns
			
 
				+from sentry.spans.grouping.api import load_span_grouping_config
			
 
				 from sentry.utils import json
			
 
				 from sentry.utils.cache import memoize
			
 
				 from sentry.utils.canonical import CanonicalKeyView
			
@@ -471,6 +472,10 @@ class Event:
 
				 
			
 
				         return None
			
 
				 
			
 
    def get_span_groupings(self, force_config=None):
        """Return the span grouping results for this event's data.

        ``force_config`` is handed to ``load_span_grouping_config`` unchanged;
        the configuration it returns is then executed against ``self.data``.
        """
        return load_span_grouping_config(force_config).execute_strategy(self.data)
			
 
				+
			
 
				     @property
			
 
				     def organization(self):
			
 
				         return self.project.organization
			
--- a/src/sentry/spans/__init__.py
+++ b/src/sentry/spans/__init__.py
--- a/src/sentry/spans/grouping/__init__.py
+++ b/src/sentry/spans/grouping/__init__.py
--- a/src/sentry/spans/grouping/api.py
+++ b/src/sentry/spans/grouping/api.py
@@ -0,0 +1,26 @@
 
				+from typing import Any, Optional
			
 
				+
			
 
				+from sentry.spans.grouping.strategy.config import (
			
 
				+    CONFIGURATIONS,
			
 
				+    DEFAULT_CONFIG_ID,
			
 
				+    SpanGroupingConfig,
			
 
				+)
			
 
				+
			
 
				+
			
 
				+class SpanGroupingConfigNotFound(LookupError):
			
 
				+    pass
			
 
				+
			
 
				+
			
 
				+def load_span_grouping_config(config: Optional[Any] = None) -> SpanGroupingConfig:
			
 
				+    if config is None:
			
 
				+        config_id = DEFAULT_CONFIG_ID
			
 
				+
			
 
				+    else:
			
 
				+        if "id" not in config:
			
 
				+            raise ValueError("Malformed configuration: missing 'id'")
			
 
				+        config_id = config["id"]
			
 
				+
			
 
				+    if config_id not in CONFIGURATIONS:
			
 
				+        raise SpanGroupingConfigNotFound(config_id)
			
 
				+
			
 
				+    return CONFIGURATIONS[config_id]
			
--- a/src/sentry/spans/grouping/result.py
+++ b/src/sentry/spans/grouping/result.py
@@ -0,0 +1,55 @@
 
				+from dataclasses import dataclass
			
 
				+from typing import Any, Dict, Optional
			
 
				+
			
 
				+
			
 
				+@dataclass(frozen=True)
			
 
				+class SpanGroupingResults:
			
 
				+    id: str
			
 
				+    results: Dict[str, str]
			
 
				+
			
 
				+    @classmethod
			
 
				+    def from_event(cls, event_data: Any) -> Optional["SpanGroupingResults"]:
			
 
				+        grouping_config = event_data.get("span_grouping_config")
			
 
				+        if grouping_config is None or grouping_config.get("id") is None:
			
 
				+            return None
			
 
				+
			
 
				+        results: Dict[str, str] = {}
			
 
				+
			
 
				+        # check the spans in the transaction
			
 
				+        for span in event_data.get("spans", []):
			
 
				+            span_id = span.get("span_id")
			
 
				+            span_hash = span.get("hash")
			
 
				+            if span_id is None or span_hash is None:
			
 
				+                # Every span should have a span id and hash.
			
 
				+                # If not, return None to indicate that the grouping
			
 
				+                # results could not be constructed from the event.
			
 
				+                return None
			
 
				+            results[span_id] = span_hash
			
 
				+
			
 
				+        # check the transaction root span
			
 
				+        trace_context = event_data["contexts"]["trace"]
			
 
				+        span_id = trace_context.get("span_id")
			
 
				+        span_hash = trace_context.get("hash")
			
 
				+        if span_id is None or span_hash is None:
			
 
				+            # Every span should have a span id and hash.
			
 
				+            # If not, return None to indicate that the grouping
			
 
				+            # results could not be constructed from the event.
			
 
				+            return None
			
 
				+        results[span_id] = span_hash
			
 
				+
			
 
				+        return cls(grouping_config["id"], results)
			
 
				+
			
 
				+    def write_to_event(self, event_data: Any) -> None:
			
 
				+        # write the hashes of the spans in the transaction
			
 
				+        for span in event_data.get("spans", []):
			
 
				+            span_hash = self.results.get(span["span_id"])
			
 
				+            if span_hash is not None:
			
 
				+                span["hash"] = span_hash
			
 
				+
			
 
				+        # write the hash of the transaction root spans
			
 
				+        trace_context = event_data["contexts"]["trace"]
			
 
				+        span_hash = self.results.get(trace_context["span_id"])
			
 
				+        if span_hash is not None:
			
 
				+            trace_context["hash"] = span_hash
			
 
				+
			
 
				+        event_data["span_grouping_config"] = {"id": self.id}
			
--- a/src/sentry/spans/grouping/strategy/__init__.py
+++ b/src/sentry/spans/grouping/strategy/__init__.py
--- a/src/sentry/spans/grouping/strategy/base.py
+++ b/src/sentry/spans/grouping/strategy/base.py
@@ -0,0 +1,194 @@
 
				+import re
			
 
				+from dataclasses import dataclass
			
 
				+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence
			
 
				+from urllib.parse import urlparse
			
 
				+
			
 
				+from sentry.spans.grouping.utils import Hash, parse_fingerprint_var
			
 
				+
			
 
# TODO(3.8): This is a hack so we can get TypedDicts before 3.8
if TYPE_CHECKING:
    from mypy_extensions import TypedDict
else:

    # Runtime stand-in: accepts any arguments and returns None, so `Span`
    # below is None at runtime and only has meaning for the type checker.
    def TypedDict(*args, **kwargs):
        pass


# The keys of a span dict as the grouping strategies expect to read them.
Span = TypedDict(
    "Span",
    {
        "trace_id": str,
        "parent_span_id": str,
        "span_id": str,
        "start_timestamp": float,
        "timestamp": float,
        "same_process_as_parent": bool,
        "op": str,
        "description": Optional[str],
        "fingerprint": Optional[Sequence[str]],
        "tags": Optional[Any],
        "data": Optional[Any],
    },
)


# A callable strategy is a callable that, when given a span, tries to
# return a fingerprint. If the strategy does not apply to the span, it
# should return `None` to indicate that the strategy should not be used
# and that a different strategy should be tried. If the strategy does
# apply, it should return a list of strings that will serve as the span
# fingerprint.
CallableStrategy = Callable[[Span], Optional[Sequence[str]]]
			
 
				+
			
 
				+
			
 
				+@dataclass(frozen=True)
			
 
				+class SpanGroupingStrategy:
			
 
				+    name: str
			
 
				+    # The strategies to use with the default fingerprint
			
 
				+    strategies: Sequence[CallableStrategy]
			
 
				+
			
 
				+    def execute(self, event_data: Any) -> Dict[str, str]:
			
 
				+        spans = event_data.get("spans", [])
			
 
				+        span_groups = {span["span_id"]: self.get_span_group(span) for span in spans}
			
 
				+
			
 
				+        # make sure to get the group id for the transaction root span
			
 
				+        span_id = event_data["contexts"]["trace"]["span_id"]
			
 
				+        span_groups[span_id] = self.get_transaction_span_group(event_data)
			
 
				+
			
 
				+        return span_groups
			
 
				+
			
 
				+    def get_transaction_span_group(self, event_data: Any) -> str:
			
 
				+        result = Hash()
			
 
				+        result.update(event_data["transaction"])
			
 
				+        return result.hexdigest()
			
 
				+
			
 
				+    def get_span_group(self, span: Span) -> str:
			
 
				+        fingerprints = span.get("fingerprint") or ["{{ default }}"]
			
 
				+
			
 
				+        result = Hash()
			
 
				+
			
 
				+        for fingerprint in fingerprints:
			
 
				+            values: Sequence[str] = [fingerprint]
			
 
				+
			
 
				+            var = parse_fingerprint_var(fingerprint)
			
 
				+            if var == "default":
			
 
				+                values = self.handle_default_fingerprint(span)
			
 
				+
			
 
				+            result.update(values)
			
 
				+
			
 
				+        return result.hexdigest()
			
 
				+
			
 
				+    def handle_default_fingerprint(self, span: Span) -> Sequence[str]:
			
 
				+        span_group = None
			
 
				+
			
 
				+        # Try using all of the strategies in order to generate
			
 
				+        # the appropriate span group. The first strategy that
			
 
				+        # successfully generates a span group will be chosen.
			
 
				+        for strategy in self.strategies:
			
 
				+            span_group = strategy(span)
			
 
				+            if span_group is not None:
			
 
				+                break
			
 
				+
			
 
				+        # If no strategies generated a valid span group,
			
 
				+        # fall back to using the raw description strategy
			
 
				+        if span_group is None:
			
 
				+            span_group = raw_description_strategy(span)
			
 
				+
			
 
				+        return span_group
			
 
				+
			
 
				+
			
 
				+def span_op(op_name: str) -> Callable[[CallableStrategy], CallableStrategy]:
			
 
				+    def wrapped(fn: CallableStrategy) -> CallableStrategy:
			
 
				+        return lambda span: fn(span) if span.get("op") == op_name else None
			
 
				+
			
 
				+    return wrapped
			
 
				+
			
 
				+
			
 
				+def raw_description_strategy(span: Span) -> Sequence[str]:
			
 
				+    """The catch-all strategy to use if all other strategies fail. This
			
 
				+    strategy is only effective if the span description is a fixed string.
			
 
				+    Otherwise, this strategy will produce a large number of span groups.
			
 
				+    """
			
 
				+    return [span.get("description") or ""]
			
 
				+
			
 
				+
			
 
				+IN_CONDITION_PATTERN = re.compile(r" IN \(%s(\s*,\s*%s)*\)")
			
 
				+
			
 
				+
			
 
				+@span_op("db")
			
 
				+def normalized_db_span_in_condition_strategy(span: Span) -> Optional[Sequence[str]]:
			
 
				+    """For a `db` span, the `IN` condition contains the same same number of elements
			
 
				+    on the right hand side as the raw query. This results in identical queries that
			
 
				+    have different number of elements on the right hand side to be seen as different
			
 
				+    spans. We want these spans to be seen as similar spans, so we normalize the right
			
 
				+    hand side of `IN` conditions to `(%s) to use in the fingerprint.
			
 
				+    """
			
 
				+    description = span.get("description") or ""
			
 
				+    cleaned, count = IN_CONDITION_PATTERN.subn(" IN (%s)", description)
			
 
				+    if count == 0:
			
 
				+        return None
			
 
				+    return [cleaned]
			
 
				+
			
 
				+
			
 
				+HTTP_METHODS = {
			
 
				+    "GET",
			
 
				+    "HEAD",
			
 
				+    "POST",
			
 
				+    "PUT",
			
 
				+    "DELETE",
			
 
				+    "CONNECT",
			
 
				+    "OPTIONS",
			
 
				+    "TRACE",
			
 
				+    "PATCH",
			
 
				+}
			
 
				+
			
 
				+
			
 
				+@span_op("http.client")
			
 
				+def remove_http_client_query_string_strategy(span: Span) -> Optional[Sequence[str]]:
			
 
				+    """For a `http.client` span, the fingerprint to use is
			
 
				+
			
 
				+    - The http method
			
 
				+    - The url scheme
			
 
				+    - The url domain
			
 
				+    - The url path
			
 
				+
			
 
				+    This strategy means that different url path parameters are seen as different
			
 
				+    spans but different url query parameters are seen as same spans.
			
 
				+
			
 
				+    For example,
			
 
				+
			
 
				+    `GET https://sentry.io/organizations/this-org/issues/` and
			
 
				+    `GET https://sentry.io/organizations/that-org/issues/` differ in the url path.
			
 
				+    Therefore, these are different spans.
			
 
				+
			
 
				+    `GET https://sentry.io/organizations/this-org/issues/?id=1` and
			
 
				+    `GET https://sentry.io/organizations/this-org/issues/?id=2` differ in the query
			
 
				+    string. Therefore, these are similar spans.
			
 
				+    """
			
 
				+
			
 
				+    # Check the description is of the form `<HTTP METHOD> <URL>`
			
 
				+    description = span.get("description") or ""
			
 
				+    parts = description.split(" ", 1)
			
 
				+    if len(parts) != 2:
			
 
				+        return None
			
 
				+
			
 
				+    # Ensure that this is a valid http method
			
 
				+    method, url_str = parts
			
 
				+    method = method.upper()
			
 
				+    if method not in HTTP_METHODS:
			
 
				+        return None
			
 
				+
			
 
				+    url = urlparse(url_str)
			
 
				+    return [method, url.scheme, url.netloc, url.path]
			
 
				+
			
 
				+
			
 
				+@span_op("redis")
			
 
				+def remove_redis_command_arguments_strategy(span: Span) -> Optional[Sequence[str]]:
			
 
				+    """For a `redis` span, the fingerprint to use is simply the redis command name.
			
 
				+    The arguments to the redis command is highly variable and therefore not used as
			
 
				+    a part of the fingerprint.
			
 
				+    """
			
 
				+    description = span.get("description") or ""
			
 
				+    parts = description.split(" ", 1)
			
 
				+
			
 
				+    # the redis command name is the first word in the description
			
 
				+    return [parts[0]]
			
--- a/src/sentry/spans/grouping/strategy/config.py
+++ b/src/sentry/spans/grouping/strategy/config.py
@@ -0,0 +1,52 @@
 
				+from dataclasses import dataclass
			
 
				+from typing import Any, Dict, Sequence
			
 
				+
			
 
				+from sentry.spans.grouping.result import SpanGroupingResults
			
 
				+from sentry.spans.grouping.strategy.base import (
			
 
				+    CallableStrategy,
			
 
				+    SpanGroupingStrategy,
			
 
				+    normalized_db_span_in_condition_strategy,
			
 
				+    remove_http_client_query_string_strategy,
			
 
				+    remove_redis_command_arguments_strategy,
			
 
				+)
			
 
				+
			
 
				+
			
 
				+@dataclass(frozen=True)
			
 
				+class SpanGroupingConfig:
			
 
				+    id: str
			
 
				+    strategy: SpanGroupingStrategy
			
 
				+
			
 
				+    def execute_strategy(self, event_data: Any) -> SpanGroupingResults:
			
 
				+        # If there are hashes using the same grouping config stored
			
 
				+        # in the data, they should be reused. Otherwise, fall back to
			
 
				+        # generating new hashes using the data.
			
 
				+        grouping_results = SpanGroupingResults.from_event(event_data)
			
 
				+        if grouping_results is not None and grouping_results.id == self.id:
			
 
				+            return grouping_results
			
 
				+
			
 
				+        results = self.strategy.execute(event_data)
			
 
				+        return SpanGroupingResults(self.id, results)
			
 
				+
			
 
				+
			
 
				+CONFIGURATIONS: Dict[str, SpanGroupingConfig] = {}
			
 
				+
			
 
				+
			
 
				+def register_configuration(config_id: str, strategies: Sequence[CallableStrategy]) -> None:
			
 
				+    if config_id in CONFIGURATIONS:
			
 
				+        raise ValueError(f"Duplicate configuration id: {config_id}")
			
 
				+
			
 
				+    strategy = SpanGroupingStrategy(config_id, [] if strategies is None else strategies)
			
 
				+    CONFIGURATIONS[config_id] = SpanGroupingConfig(config_id, strategy)
			
 
				+
			
 
				+
			
 
# The id of the configuration that `load_span_grouping_config` selects
# when it is called without a config.
DEFAULT_CONFIG_ID = "default:2021-08-25"


# The strategies are tried in the order listed; the first one that returns
# something other than `None` for a span provides its default fingerprint.
register_configuration(
    "default:2021-08-25",
    strategies=[
        normalized_db_span_in_condition_strategy,
        remove_http_client_query_string_strategy,
        remove_redis_command_arguments_strategy,
    ],
)