2 years ago · f7f86b5856
--- a/src/sentry/utils/performance_issues/detectors/__init__.py
+++ b/src/sentry/utils/performance_issues/detectors/__init__.py
@@ -1,2 +1,3 @@
 
				+from .consecutive_db_detector import ConsecutiveDBSpanDetector  # NOQA
			
 
				 from .n_plus_one_api_calls_detector import NPlusOneAPICallsDetector  # NOQA
			
 
				 from .uncompressed_asset_detector import UncompressedAssetSpanDetector  # NOQA
			
--- a/src/sentry/utils/performance_issues/detectors/consecutive_db_detector.py
+++ b/src/sentry/utils/performance_issues/detectors/consecutive_db_detector.py
@@ -0,0 +1,225 @@
 
				+from __future__ import annotations
			
 
				+
			
 
				+import random
			
 
				+import re
			
 
				+from datetime import timedelta
			
 
				+from typing import Optional, Sequence
			
 
				+
			
 
				+from sentry import features
			
 
				+from sentry.issues.grouptype import PerformanceConsecutiveDBQueriesGroupType
			
 
				+from sentry.models import Organization, Project
			
 
				+from sentry.utils.event_frames import get_sdk_name
			
 
				+
			
 
				+from ..base import DetectorType, PerformanceDetector, fingerprint_spans, get_span_duration
			
 
				+from ..performance_problem import PerformanceProblem
			
 
				+from ..types import Span
			
 
				+
			
 
				+
			
 
				+def join_regexes(regexes: Sequence[str]) -> str:
			
 
				+    return r"(?:" + r")|(?:".join(regexes) + r")"
			
 
				+
			
 
				+
			
 
				+CONTAINS_PARAMETER_REGEX = re.compile(
			
 
				+    join_regexes(
			
 
				+        [
			
 
				+            r"'(?:[^']|'')*?(?:\\'.*|'(?!'))",  # single-quoted strings
			
 
				+            r"\b(?:true|false)\b",  # booleans
			
 
				+            r"-?\b(?:[0-9]+\.)?[0-9]+(?:[eE][+-]?[0-9]+)?\b",  # numbers
			
 
				+            r"\?|\$1|%s",  # existing parameters
			
 
				+        ]
			
 
				+    )
			
 
				+)
			
 
				+
			
 
				+
			
 
				+class ConsecutiveDBSpanDetector(PerformanceDetector):
			
 
				+    """
			
 
				+    Let X and Y be the consecutive db span count threshold and the span duration threshold respectively,
			
 
				+    each defined in the threshold settings.
			
 
				+
			
 
				+    The detector first looks for X number of consecutive db query spans,
			
 
				+    Once these set of spans are found, the detector will compare each db span in the consecutive list
			
 
				+    to determine if they are dependant on one another.
			
 
				+    If the sum of the durations of the independent spans exceeds Y, then a performance issue is found.
			
 
				+
			
 
				+    This detector assuming spans are ordered chronologically
			
 
				+    """
			
 
				+
			
 
				+    __slots__ = "stored_problems"
			
 
				+
			
 
				+    type: DetectorType = DetectorType.CONSECUTIVE_DB_OP
			
 
				+    settings_key = DetectorType.CONSECUTIVE_DB_OP
			
 
				+
			
 
				+    def init(self):
			
 
				+        self.stored_problems: dict[str, PerformanceProblem] = {}
			
 
				+        self.consecutive_db_spans: list[Span] = []
			
 
				+        self.independent_db_spans: list[Span] = []
			
 
				+
			
 
				+    def visit_span(self, span: Span) -> None:
			
 
				+        span_id = span.get("span_id", None)
			
 
				+
			
 
				+        if not span_id or not self._is_db_query(span) or self._overlaps_last_span(span):
			
 
				+            self._validate_and_store_performance_problem()
			
 
				+            self._reset_variables()
			
 
				+            return
			
 
				+
			
 
				+        self._add_problem_span(span)
			
 
				+
			
 
				+    def _add_problem_span(self, span: Span) -> None:
			
 
				+        self.consecutive_db_spans.append(span)
			
 
				+
			
 
				+    def _validate_and_store_performance_problem(self):
			
 
				+        self._set_independent_spans(self.consecutive_db_spans)
			
 
				+        if not len(self.independent_db_spans):
			
 
				+            return
			
 
				+
			
 
				+        exceeds_count_threshold = len(self.consecutive_db_spans) >= self.settings.get(
			
 
				+            "consecutive_count_threshold"
			
 
				+        )
			
 
				+        exceeds_span_duration_threshold = all(
			
 
				+            get_span_duration(span).total_seconds() * 1000
			
 
				+            > self.settings.get("span_duration_threshold")
			
 
				+            for span in self.independent_db_spans
			
 
				+        )
			
 
				+
			
 
				+        time_saved = self._calculate_time_saved(self.independent_db_spans)
			
 
				+        total_time = self._sum_span_duration(self.consecutive_db_spans)
			
 
				+
			
 
				+        exceeds_time_saved_threshold = time_saved >= self.settings.get("min_time_saved")
			
 
				+
			
 
				+        exceeds_time_saved_threshold_ratio = False
			
 
				+        if total_time > 0:
			
 
				+            exceeds_time_saved_threshold_ratio = time_saved / total_time >= self.settings.get(
			
 
				+                "min_time_saved_ratio"
			
 
				+            )
			
 
				+
			
 
				+        if (
			
 
				+            exceeds_count_threshold
			
 
				+            and exceeds_span_duration_threshold
			
 
				+            and exceeds_time_saved_threshold
			
 
				+            and exceeds_time_saved_threshold_ratio
			
 
				+        ):
			
 
				+            self._store_performance_problem()
			
 
				+
			
 
				+    def _store_performance_problem(self) -> None:
			
 
				+        fingerprint = self._fingerprint()
			
 
				+        offender_span_ids = [span.get("span_id", None) for span in self.independent_db_spans]
			
 
				+        cause_span_ids = [span.get("span_id", None) for span in self.consecutive_db_spans]
			
 
				+        query: str = self.independent_db_spans[0].get("description", None)
			
 
				+
			
 
				+        self.stored_problems[fingerprint] = PerformanceProblem(
			
 
				+            fingerprint,
			
 
				+            "db",
			
 
				+            desc=query,  # TODO - figure out which query to use for description
			
 
				+            type=PerformanceConsecutiveDBQueriesGroupType,
			
 
				+            cause_span_ids=cause_span_ids,
			
 
				+            parent_span_ids=None,
			
 
				+            offender_span_ids=offender_span_ids,
			
 
				+        )
			
 
				+
			
 
				+        self._reset_variables()
			
 
				+
			
 
				+    def _sum_span_duration(self, spans: list[Span]) -> int:
			
 
				+        "Given a list of spans, find the sum of the span durations in milliseconds"
			
 
				+        sum = 0
			
 
				+        for span in spans:
			
 
				+            sum += get_span_duration(span).total_seconds() * 1000
			
 
				+        return sum
			
 
				+
			
 
				+    def _set_independent_spans(self, spans: list[Span]):
			
 
				+        """
			
 
				+        Given a list of spans, checks if there is at least a single span that is independent of the rest.
			
 
				+        To start, we are just checking for a span in a list of consecutive span without a WHERE clause
			
 
				+        """
			
 
				+        independent_spans = []
			
 
				+        for span in spans[1:]:
			
 
				+            query: str = span.get("description", None)
			
 
				+            if (
			
 
				+                query
			
 
				+                and contains_complete_query(span)
			
 
				+                and "WHERE" not in query.upper()
			
 
				+                and not CONTAINS_PARAMETER_REGEX.search(query)
			
 
				+            ):
			
 
				+                independent_spans.append(span)
			
 
				+        self.independent_db_spans = independent_spans
			
 
				+
			
 
				+    def _calculate_time_saved(self, independent_spans: list[Span]) -> float:
			
 
				+        """
			
 
				+        Calculates the cost saved by running spans in parallel,
			
 
				+        this is the maximum time saved of running all independent queries in parallel
			
 
				+        note, maximum means it does not account for db connection times and overhead associated with parallelization,
			
 
				+        this is where thresholds come in
			
 
				+        """
			
 
				+        consecutive_spans = self.consecutive_db_spans
			
 
				+        total_duration = self._sum_span_duration(consecutive_spans)
			
 
				+
			
 
				+        max_independent_span_duration = max(
			
 
				+            [get_span_duration(span).total_seconds() * 1000 for span in independent_spans]
			
 
				+        )
			
 
				+
			
 
				+        sum_of_dependent_span_durations = 0
			
 
				+        for span in consecutive_spans:
			
 
				+            if span not in independent_spans:
			
 
				+                sum_of_dependent_span_durations += get_span_duration(span).total_seconds() * 1000
			
 
				+
			
 
				+        return total_duration - max(max_independent_span_duration, sum_of_dependent_span_durations)
			
 
				+
			
 
				+    def _overlaps_last_span(self, span: Span) -> bool:
			
 
				+        if len(self.consecutive_db_spans) == 0:
			
 
				+            return False
			
 
				+
			
 
				+        last_span = self.consecutive_db_spans[-1]
			
 
				+
			
 
				+        last_span_ends = timedelta(seconds=last_span.get("timestamp", 0))
			
 
				+        current_span_begins = timedelta(seconds=span.get("start_timestamp", 0))
			
 
				+        return last_span_ends > current_span_begins
			
 
				+
			
 
				+    def _reset_variables(self) -> None:
			
 
				+        self.consecutive_db_spans = []
			
 
				+        self.independent_db_spans = []
			
 
				+
			
 
				+    def _is_db_query(self, span: Span) -> bool:
			
 
				+        op: str = span.get("op", "") or ""
			
 
				+        description: str = span.get("description", "") or ""
			
 
				+        is_db_op = op == "db" or op.startswith("db.sql")
			
 
				+        is_query = "SELECT" in description.upper()  # TODO - make this more elegant
			
 
				+        return is_db_op and is_query
			
 
				+
			
 
				+    def _fingerprint(self) -> str:
			
 
				+        prior_span_index = self.consecutive_db_spans.index(self.independent_db_spans[0]) - 1
			
 
				+        hashed_spans = fingerprint_spans(
			
 
				+            [self.consecutive_db_spans[prior_span_index]] + self.independent_db_spans
			
 
				+        )
			
 
				+        return f"1-{PerformanceConsecutiveDBQueriesGroupType.type_id}-{hashed_spans}"
			
 
				+
			
 
				+    def on_complete(self) -> None:
			
 
				+        self._validate_and_store_performance_problem()
			
 
				+
			
 
				+    def is_creation_allowed_for_organization(self, organization: Organization) -> bool:
			
 
				+        return features.has(
			
 
				+            "organizations:performance-consecutive-db-issue", organization, actor=None
			
 
				+        )
			
 
				+
			
 
				+    def is_creation_allowed_for_project(self, project: Project) -> bool:
			
 
				+        return self.settings["detection_rate"] > random.random()
			
 
				+
			
 
				+    @classmethod
			
 
				+    def is_event_eligible(cls, event, project: Project = None) -> bool:
			
 
				+        request = event.get("request", None) or None
			
 
				+        sdk_name = get_sdk_name(event) or ""
			
 
				+
			
 
				+        if request:
			
 
				+            url = request.get("url", "") or ""
			
 
				+            method = request.get("method", "") or ""
			
 
				+            if url.endswith("/graphql") and method.lower() in ["post", "get"]:
			
 
				+                return False
			
 
				+
			
 
				+        return "php" not in sdk_name.lower()
			
 
				+
			
 
				+
			
 
				+def contains_complete_query(span: Span, is_source: Optional[bool] = False) -> bool:
			
 
				+    # Remove the truncation check from the n_plus_one db detector.
			
 
				+    query = span.get("description", None)
			
 
				+    if is_source and query:
			
 
				+        return True
			
 
				+    else:
			
 
				+        return query and not query.endswith("...")
			
--- a/src/sentry/utils/performance_issues/performance_detection.py
+++ b/src/sentry/utils/performance_issues/performance_detection.py
@@ -15,7 +15,6 @@ from symbolic import ProguardMapper  # type: ignore
 
				 from sentry import features, nodestore, options, projectoptions
			
 
				 from sentry.eventstore.models import Event
			
 
				 from sentry.issues.grouptype import (
			
 
				-    PerformanceConsecutiveDBQueriesGroupType,
			
 
				     PerformanceFileIOMainThreadGroupType,
			
 
				     PerformanceMNPlusOneDBQueriesGroupType,
			
 
				     PerformanceNPlusOneGroupType,
			
@@ -34,18 +33,16 @@ from .base import (
 
				     PerformanceDetector,
			
 
				     fingerprint_resource_span,
			
 
				     fingerprint_span,
			
 
				-    fingerprint_spans,
			
 
				     get_span_duration,
			
 
				 )
			
 
				-from .detectors import NPlusOneAPICallsDetector, UncompressedAssetSpanDetector
			
 
				+from .detectors import (
			
 
				+    ConsecutiveDBSpanDetector,
			
 
				+    NPlusOneAPICallsDetector,
			
 
				+    UncompressedAssetSpanDetector,
			
 
				+)
			
 
				 from .performance_problem import PerformanceProblem
			
 
				 from .types import Span
			
 
				 
			
 
				-
			
 
				-def join_regexes(regexes: Sequence[str]) -> str:
			
 
				-    return r"(?:" + r")|(?:".join(regexes) + r")"
			
 
				-
			
 
				-
			
 
				 PERFORMANCE_GROUP_COUNT_LIMIT = 10
			
 
				 INTEGRATIONS_OF_INTEREST = [
			
 
				     "django",
			
@@ -56,16 +53,6 @@ INTEGRATIONS_OF_INTEREST = [
 
				 ]
			
 
				 
			
 
				 PARAMETERIZED_SQL_QUERY_REGEX = re.compile(r"\?|\$1|%s")
			
 
				-CONTAINS_PARAMETER_REGEX = re.compile(
			
 
				-    join_regexes(
			
 
				-        [
			
 
				-            r"'(?:[^']|'')*?(?:\\'.*|'(?!'))",  # single-quoted strings
			
 
				-            r"\b(?:true|false)\b",  # booleans
			
 
				-            r"-?\b(?:[0-9]+\.)?[0-9]+(?:[eE][+-]?[0-9]+)?\b",  # numbers
			
 
				-            r"\?|\$1|%s",  # existing parameters
			
 
				-        ]
			
 
				-    )
			
 
				-)
			
 
				 
			
 
				 
			
 
				 class EventPerformanceProblem:
			
@@ -518,191 +505,6 @@ class RenderBlockingAssetSpanDetector(PerformanceDetector):
 
				         return f"1-{PerformanceRenderBlockingAssetSpanGroupType.type_id}-{resource_url_hash}"
			
 
				 
			
 
				 
			
 
				-class ConsecutiveDBSpanDetector(PerformanceDetector):
			
 
				-    """
			
 
				-    Let X and Y be the consecutive db span count threshold and the span duration threshold respectively,
			
 
				-    each defined in the threshold settings.
			
 
				-
			
 
				-    The detector first looks for X number of consecutive db query spans,
			
 
				-    Once these set of spans are found, the detector will compare each db span in the consecutive list
			
 
				-    to determine if they are dependant on one another.
			
 
				-    If the sum of the durations of the independent spans exceeds Y, then a performance issue is found.
			
 
				-
			
 
				-    This detector assuming spans are ordered chronologically
			
 
				-    """
			
 
				-
			
 
				-    __slots__ = "stored_problems"
			
 
				-
			
 
				-    type: DetectorType = DetectorType.CONSECUTIVE_DB_OP
			
 
				-    settings_key = DetectorType.CONSECUTIVE_DB_OP
			
 
				-
			
 
				-    def init(self):
			
 
				-        self.stored_problems: dict[str, PerformanceProblem] = {}
			
 
				-        self.consecutive_db_spans: list[Span] = []
			
 
				-        self.independent_db_spans: list[Span] = []
			
 
				-
			
 
				-    def visit_span(self, span: Span) -> None:
			
 
				-        span_id = span.get("span_id", None)
			
 
				-
			
 
				-        if not span_id or not self._is_db_query(span) or self._overlaps_last_span(span):
			
 
				-            self._validate_and_store_performance_problem()
			
 
				-            self._reset_variables()
			
 
				-            return
			
 
				-
			
 
				-        self._add_problem_span(span)
			
 
				-
			
 
				-    def _add_problem_span(self, span: Span) -> None:
			
 
				-        self.consecutive_db_spans.append(span)
			
 
				-
			
 
				-    def _validate_and_store_performance_problem(self):
			
 
				-        self._set_independent_spans(self.consecutive_db_spans)
			
 
				-        if not len(self.independent_db_spans):
			
 
				-            return
			
 
				-
			
 
				-        exceeds_count_threshold = len(self.consecutive_db_spans) >= self.settings.get(
			
 
				-            "consecutive_count_threshold"
			
 
				-        )
			
 
				-        exceeds_span_duration_threshold = all(
			
 
				-            get_span_duration(span).total_seconds() * 1000
			
 
				-            > self.settings.get("span_duration_threshold")
			
 
				-            for span in self.independent_db_spans
			
 
				-        )
			
 
				-
			
 
				-        time_saved = self._calculate_time_saved(self.independent_db_spans)
			
 
				-        total_time = self._sum_span_duration(self.consecutive_db_spans)
			
 
				-
			
 
				-        exceeds_time_saved_threshold = time_saved >= self.settings.get("min_time_saved")
			
 
				-
			
 
				-        exceeds_time_saved_threshold_ratio = False
			
 
				-        if total_time > 0:
			
 
				-            exceeds_time_saved_threshold_ratio = time_saved / total_time >= self.settings.get(
			
 
				-                "min_time_saved_ratio"
			
 
				-            )
			
 
				-
			
 
				-        if (
			
 
				-            exceeds_count_threshold
			
 
				-            and exceeds_span_duration_threshold
			
 
				-            and exceeds_time_saved_threshold
			
 
				-            and exceeds_time_saved_threshold_ratio
			
 
				-        ):
			
 
				-            self._store_performance_problem()
			
 
				-
			
 
				-    def _store_performance_problem(self) -> None:
			
 
				-        fingerprint = self._fingerprint()
			
 
				-        offender_span_ids = [span.get("span_id", None) for span in self.independent_db_spans]
			
 
				-        cause_span_ids = [span.get("span_id", None) for span in self.consecutive_db_spans]
			
 
				-        query: str = self.independent_db_spans[0].get("description", None)
			
 
				-
			
 
				-        self.stored_problems[fingerprint] = PerformanceProblem(
			
 
				-            fingerprint,
			
 
				-            "db",
			
 
				-            desc=query,  # TODO - figure out which query to use for description
			
 
				-            type=PerformanceConsecutiveDBQueriesGroupType,
			
 
				-            cause_span_ids=cause_span_ids,
			
 
				-            parent_span_ids=None,
			
 
				-            offender_span_ids=offender_span_ids,
			
 
				-        )
			
 
				-
			
 
				-        self._reset_variables()
			
 
				-
			
 
				-    def _sum_span_duration(self, spans: list[Span]) -> int:
			
 
				-        "Given a list of spans, find the sum of the span durations in milliseconds"
			
 
				-        sum = 0
			
 
				-        for span in spans:
			
 
				-            sum += get_span_duration(span).total_seconds() * 1000
			
 
				-        return sum
			
 
				-
			
 
				-    def _set_independent_spans(self, spans: list[Span]):
			
 
				-        """
			
 
				-        Given a list of spans, checks if there is at least a single span that is independent of the rest.
			
 
				-        To start, we are just checking for a span in a list of consecutive span without a WHERE clause
			
 
				-        """
			
 
				-        independent_spans = []
			
 
				-        for span in spans[1:]:
			
 
				-            query: str = span.get("description", None)
			
 
				-            if (
			
 
				-                query
			
 
				-                and contains_complete_query(span)
			
 
				-                and "WHERE" not in query.upper()
			
 
				-                and not CONTAINS_PARAMETER_REGEX.search(query)
			
 
				-            ):
			
 
				-                independent_spans.append(span)
			
 
				-        self.independent_db_spans = independent_spans
			
 
				-
			
 
				-    def _calculate_time_saved(self, independent_spans: list[Span]) -> float:
			
 
				-        """
			
 
				-        Calculates the cost saved by running spans in parallel,
			
 
				-        this is the maximum time saved of running all independent queries in parallel
			
 
				-        note, maximum means it does not account for db connection times and overhead associated with parallelization,
			
 
				-        this is where thresholds come in
			
 
				-        """
			
 
				-        consecutive_spans = self.consecutive_db_spans
			
 
				-        total_duration = self._sum_span_duration(consecutive_spans)
			
 
				-
			
 
				-        max_independent_span_duration = max(
			
 
				-            [get_span_duration(span).total_seconds() * 1000 for span in independent_spans]
			
 
				-        )
			
 
				-
			
 
				-        sum_of_dependent_span_durations = 0
			
 
				-        for span in consecutive_spans:
			
 
				-            if span not in independent_spans:
			
 
				-                sum_of_dependent_span_durations += get_span_duration(span).total_seconds() * 1000
			
 
				-
			
 
				-        return total_duration - max(max_independent_span_duration, sum_of_dependent_span_durations)
			
 
				-
			
 
				-    def _overlaps_last_span(self, span: Span) -> bool:
			
 
				-        if len(self.consecutive_db_spans) == 0:
			
 
				-            return False
			
 
				-
			
 
				-        last_span = self.consecutive_db_spans[-1]
			
 
				-
			
 
				-        last_span_ends = timedelta(seconds=last_span.get("timestamp", 0))
			
 
				-        current_span_begins = timedelta(seconds=span.get("start_timestamp", 0))
			
 
				-        return last_span_ends > current_span_begins
			
 
				-
			
 
				-    def _reset_variables(self) -> None:
			
 
				-        self.consecutive_db_spans = []
			
 
				-        self.independent_db_spans = []
			
 
				-
			
 
				-    def _is_db_query(self, span: Span) -> bool:
			
 
				-        op: str = span.get("op", "") or ""
			
 
				-        description: str = span.get("description", "") or ""
			
 
				-        is_db_op = op == "db" or op.startswith("db.sql")
			
 
				-        is_query = "SELECT" in description.upper()  # TODO - make this more elegant
			
 
				-        return is_db_op and is_query
			
 
				-
			
 
				-    def _fingerprint(self) -> str:
			
 
				-        prior_span_index = self.consecutive_db_spans.index(self.independent_db_spans[0]) - 1
			
 
				-        hashed_spans = fingerprint_spans(
			
 
				-            [self.consecutive_db_spans[prior_span_index]] + self.independent_db_spans
			
 
				-        )
			
 
				-        return f"1-{PerformanceConsecutiveDBQueriesGroupType.type_id}-{hashed_spans}"
			
 
				-
			
 
				-    def on_complete(self) -> None:
			
 
				-        self._validate_and_store_performance_problem()
			
 
				-
			
 
				-    def is_creation_allowed_for_organization(self, organization: Organization) -> bool:
			
 
				-        return features.has(
			
 
				-            "organizations:performance-consecutive-db-issue", organization, actor=None
			
 
				-        )
			
 
				-
			
 
				-    def is_creation_allowed_for_project(self, project: Project) -> bool:
			
 
				-        return self.settings["detection_rate"] > random.random()
			
 
				-
			
 
				-    @classmethod
			
 
				-    def is_event_eligible(cls, event, project: Project = None) -> bool:
			
 
				-        request = event.get("request", None) or None
			
 
				-        sdk_name = get_sdk_name(event) or ""
			
 
				-
			
 
				-        if request:
			
 
				-            url = request.get("url", "") or ""
			
 
				-            method = request.get("method", "") or ""
			
 
				-            if url.endswith("/graphql") and method.lower() in ["post", "get"]:
			
 
				-                return False
			
 
				-
			
 
				-        return "php" not in sdk_name.lower()
			
 
				-
			
 
				-
			
 
				 class NPlusOneDBSpanDetector(PerformanceDetector):
			
 
				     """
			
 
				     Detector goals:
			
--- a/tests/sentry/utils/performance_issues/test_consecutive_db_detector.py
+++ b/tests/sentry/utils/performance_issues/test_consecutive_db_detector.py
@@ -13,8 +13,8 @@ from sentry.testutils.performance_issues.event_generators import (
 
				     modify_span_start,
			
 
				 )
			
 
				 from sentry.testutils.silo import region_silo_test
			
 
				+from sentry.utils.performance_issues.detectors import ConsecutiveDBSpanDetector
			
 
				 from sentry.utils.performance_issues.performance_detection import (
			
 
				-    ConsecutiveDBSpanDetector,
			
 
				     PerformanceProblem,
			
 
				     get_detection_settings,
			
 
				     run_detector_on_data,