
feat(perf-issues) Create performance issues (#38319)

This PR enables performance issue group creation and updates. For each
fingerprint coming out of job["performance_problems"], an existing group
is updated or a new group is created. The group type is set from the
output of performance detection. Performance issue creation is rate
limited to ensure that we don't create too many groups at once.

Co-authored-by: Dameli Ushbayeva <dameli@cohere.ai>
Co-authored-by: Dan Fuller <dfuller@sentry.io>
Dameli Ushbayeva 2 years ago
commit aab0054487
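
For context, a condensed sketch of the creation path this commit adds. Names mirror the diff below; the Quota argument order (window seconds, granularity seconds, limit) and the partial-grant behavior are assumptions inferred from how the diff uses them:

    from hashlib import md5

    from django.conf import settings

    from sentry.ratelimits.sliding_windows import (
        Quota,
        RedisSlidingWindowRateLimiter,
        RequestedQuota,
    )

    # As in the diff: limiter options come from Django settings, and the quota
    # is assumed to mean 60 new groups per rolling hour at 60s granularity.
    issue_rate_limiter = RedisSlidingWindowRateLimiter(
        **settings.SENTRY_PERFORMANCE_ISSUES_RATE_LIMITER_OPTIONS
    )
    PERFORMANCE_ISSUE_QUOTA = Quota(3600, 60, 60)

    def grantable_hashes(project_id, raw_fingerprints):
        # Detector fingerprints are md5-hashed before any GroupHash lookup.
        hashes = [md5(f.encode("utf-8")).hexdigest() for f in raw_fingerprints]
        granted_quota = issue_rate_limiter.check_and_use_quotas(
            [
                RequestedQuota(
                    f"performance-issues:{project_id}",
                    len(hashes),
                    [PERFORMANCE_ISSUE_QUOTA],
                )
            ]
        )[0]
        # Only the granted portion becomes new groups; the remainder is
        # dropped and counted under performance.performance_issue.dropped.
        return hashes[: granted_quota.granted]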

+ 135 - 22
src/sentry/event_manager.py

@@ -6,8 +6,9 @@ import re
 import time
 from dataclasses import dataclass
 from datetime import datetime, timedelta
+from hashlib import md5
 from io import BytesIO
-from typing import Optional
+from typing import Optional, Sequence, TypedDict
 
 import sentry_sdk
 from django.conf import settings
@@ -83,7 +84,7 @@ from sentry.models.grouphistory import GroupHistoryStatus, record_group_history
 from sentry.models.integrations.repository_project_path_config import RepositoryProjectPathConfig
 from sentry.plugins.base import plugins
 from sentry.projectoptions.defaults import BETA_GROUPING_CONFIG, DEFAULT_GROUPING_CONFIG
-from sentry.ratelimits.sliding_windows import RedisSlidingWindowRateLimiter
+from sentry.ratelimits.sliding_windows import Quota, RedisSlidingWindowRateLimiter, RequestedQuota
 from sentry.reprocessing2 import is_reprocessed_event, save_unprocessed_event
 from sentry.shared_integrations.exceptions import ApiError
 from sentry.signals import first_event_received, first_transaction_received, issue_unresolved
@@ -96,7 +97,10 @@ from sentry.utils.cache import cache_key_for_event
 from sentry.utils.canonical import CanonicalKeyDict
 from sentry.utils.dates import to_datetime, to_timestamp
 from sentry.utils.outcomes import Outcome, track_outcome
-from sentry.utils.performance_issues.performance_detection import detect_performance_problems
+from sentry.utils.performance_issues.performance_detection import (
+    PerformanceProblem,
+    detect_performance_problems,
+)
 from sentry.utils.safe import get_path, safe_execute, setdefault_path, trim
 
 logger = logging.getLogger("sentry.events")
@@ -106,6 +110,11 @@ SECURITY_REPORT_INTERFACES = ("csp", "hpkp", "expectct", "expectstaple")
 # Timeout for cached group crash report counts
 CRASH_REPORT_TIMEOUT = 24 * 3600  # one day
 
+issue_rate_limiter = RedisSlidingWindowRateLimiter(
+    **settings.SENTRY_PERFORMANCE_ISSUES_RATE_LIMITER_OPTIONS
+)
+PERFORMANCE_ISSUE_QUOTA = Quota(3600, 60, 60)
+
 
 @dataclass
 class GroupInfo:
@@ -943,7 +952,6 @@ def _tsdb_record_all_metrics(jobs):
         incrs = []
         frequencies = []
         records = []
-
         incrs.append((tsdb.models.project, job["project_id"]))
         event = job["event"]
         release = job["release"]
@@ -995,8 +1003,9 @@ def _nodestore_save_many(jobs):
         # Write the event to Nodestore
         subkeys = {}
 
-        if job["groups"]:
-            event = job["event"]
+        event = job["event"]
+        # We only care about `unprocessed` for error events
+        if event.get_event_type() != "transaction" and job["groups"]:
             unprocessed = event_processing_store.get(
                 cache_key_for_event({"project": event.project_id, "event_id": event.event_id}),
                 unprocessed=True,
@@ -1935,6 +1944,123 @@ def _detect_performance_problems(jobs, projects):
         job["performance_problems"] = detect_performance_problems(job["data"])
 
 
+class Performance_Job(TypedDict, total=False):
+    performance_problems: Sequence[PerformanceProblem]
+
+
+@metrics.wraps("save_event.save_aggregate_performance")
+def _save_aggregate_performance(jobs: Sequence[Performance_Job], projects):
+
+    MAX_GROUPS = (
+        10  # safety check in case we are passed too many. constant will live somewhere else tbd
+    )
+    for job in jobs:
+        job["groups"] = []
+        event = job["event"]
+        project = event.project
+
+        # General system-wide option
+        rate = options.get("performance.issues.all.problem-creation") or 0
+
+        # More granular, per-project option
+        per_project_rate = project.get_option("sentry:performance_issue_creation_rate", 0)
+        if rate > random.random() and per_project_rate > random.random():
+
+            kwargs = _create_kwargs(job)
+            kwargs["culprit"] = job["culprit"]
+            kwargs["data"] = materialize_metadata(
+                event.data,
+                get_event_type(event.data),
+                dict(job["event_metadata"]),
+            )
+            kwargs["data"]["last_received"] = job["received_timestamp"]
+
+            performance_problems = job["performance_problems"]
+            for problem in performance_problems:
+                problem.fingerprint = md5(problem.fingerprint.encode("utf-8")).hexdigest()
+
+            performance_problems_by_fingerprint = {p.fingerprint: p for p in performance_problems}
+            all_group_hashes = [problem.fingerprint for problem in performance_problems]
+            group_hashes = all_group_hashes[:MAX_GROUPS]
+
+            existing_grouphashes = GroupHash.objects.filter(
+                project=project, hash__in=group_hashes
+            ).select_related("group")
+
+            new_grouphashes = set(group_hashes) - {hash.hash for hash in existing_grouphashes}
+            new_grouphashes_count = len(new_grouphashes)
+
+            if new_grouphashes:
+                granted_quota = issue_rate_limiter.check_and_use_quotas(
+                    [
+                        RequestedQuota(
+                            f"performance-issues:{project.id}",
+                            new_grouphashes_count,
+                            [PERFORMANCE_ISSUE_QUOTA],
+                        )
+                    ]
+                )[0]
+
+                # Log how many groups didn't get created because of rate limiting
+                _dropped_group_hash_count = new_grouphashes_count - granted_quota.granted
+                metrics.incr("performance.performance_issue.dropped", _dropped_group_hash_count)
+
+                for new_grouphash in list(new_grouphashes)[: granted_quota.granted]:
+
+                    # GROUP DOES NOT EXIST
+                    with sentry_sdk.start_span(
+                        op="event_manager.create_group_transaction"
+                    ) as span, metrics.timer(
+                        "event_manager.create_group_transaction"
+                    ) as metric_tags, transaction.atomic():
+                        span.set_tag("create_group_transaction.outcome", "no_group")
+                        metric_tags["create_group_transaction.outcome"] = "no_group"
+
+                        problem = performance_problems_by_fingerprint[new_grouphash]
+                        kwargs["type"] = problem.type.value
+                        kwargs["data"]["metadata"]["title"] = f"N+1 Query:{problem.desc}"
+
+                        group = _create_group(project, event, **kwargs)
+                        GroupHash.objects.create(project=project, hash=new_grouphash, group=group)
+
+                        is_new = True
+                        is_regression = False
+
+                        span.set_tag("create_group_transaction.outcome", "new_group")
+                        metric_tags["create_group_transaction.outcome"] = "new_group"
+
+                        metrics.incr(
+                            "group.created",
+                            skip_internal=True,
+                            tags={"platform": job["platform"] or "unknown"},
+                        )
+
+                        job["groups"].append(
+                            GroupInfo(group=group, is_new=is_new, is_regression=is_regression)
+                        )
+
+            if existing_grouphashes:
+
+                # GROUP EXISTS
+                for existing_grouphash in existing_grouphashes:
+                    group = existing_grouphash.group
+
+                    is_new = False
+
+                    description = performance_problems_by_fingerprint[existing_grouphash.hash].desc
+                    kwargs["data"]["metadata"]["title"] = f"N+1 Query:{description}"
+
+                    is_regression = _process_existing_aggregate(
+                        group=group, event=job["event"], data=kwargs, release=job["release"]
+                    )
+
+                    job["groups"].append(
+                        GroupInfo(group=group, is_new=is_new, is_regression=is_regression)
+                    )
+
+            job["event"].groups = [group_info.group for group_info in job["groups"]]
+
+
 @metrics.wraps("event_manager.save_transaction_events")
 def save_transaction_events(jobs, projects):
     with metrics.timer("event_manager.save_transactions.collect_organization_ids"):
@@ -1954,17 +2080,6 @@ def save_transaction_events(jobs, projects):
             except KeyError:
                 continue
 
-    with metrics.timer("event_manager.save_transactions.prepare_jobs"):
-        for job in jobs:
-            job["project_id"] = job["data"]["project"]
-            job["raw"] = False
-            job["group"] = None
-            # XXX: Temporary hack so that `groups` is always present
-            job["groups"] = []
-            job["is_new"] = False
-            job["is_regression"] = False
-            job["is_new_group_environment"] = False
-
     _pull_out_data(jobs, projects)
     _get_or_create_release_many(jobs, projects)
     _get_event_user_many(jobs, projects)
@@ -1973,16 +2088,14 @@ def save_transaction_events(jobs, projects):
     _calculate_span_grouping(jobs, projects)
     _detect_performance_problems(jobs, projects)
     _materialize_metadata_many(jobs)
+    _save_aggregate_performance(jobs, projects)
     _get_or_create_environment_many(jobs, projects)
+    _get_or_create_group_environment_many(jobs, projects)
     _get_or_create_release_associated_models(jobs, projects)
+    _get_or_create_group_release_many(jobs, projects)
     _tsdb_record_all_metrics(jobs)
     _materialize_event_metrics(jobs)
     _nodestore_save_many(jobs)
     _eventstream_insert_many(jobs)
     _track_outcome_accepted_many(jobs)
     return jobs
-
-
-issue_rate_limiter = RedisSlidingWindowRateLimiter(
-    **settings.SENTRY_PERFORMANCE_ISSUES_RATE_LIMITER_OPTIONS
-)

+ 1 - 0
src/sentry/models/group.py

@@ -414,6 +414,7 @@ class Group(Model):
                 _("Render Blocking Asset Span"),
             ),
             (GroupType.PERFORMANCE_DUPLICATE_SPANS.value, _("Duplicate Spans")),
+            # TODO add more group types when detection starts outputting them
         ),
     )
 

+ 4 - 3
src/sentry/utils/performance_issues/performance_detection.py

@@ -1,4 +1,5 @@
 import hashlib
+import logging
 import random
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -71,9 +72,9 @@ def detect_performance_problems(data: Event) -> List[PerformanceProblem]:
                 op="py.detect_performance_issue", description="none"
             ) as sdk_span:
                 return _detect_performance_problems(data, sdk_span)
-    except Exception as e:
-        sentry_sdk.capture_exception(e)
-        return []
+    except Exception:
+        logging.exception("Failed to detect performance problems")
+    return []
 
 
 # Gets some of the thresholds to perform performance detection. Can be made configurable later.

+ 112 - 0
tests/sentry/performance_issues/test_performance_issues.py

@@ -0,0 +1,112 @@
+import logging
+import uuid
+from unittest import mock
+
+from sentry.event_manager import EventManager
+from sentry.spans.grouping.utils import hash_values
+from sentry.testutils import TestCase
+from sentry.testutils.helpers import override_options
+from sentry.types.issues import GroupCategory, GroupType
+from tests.sentry.utils.performance_issues.test_performance_detection import EVENTS
+
+
+def make_event(**kwargs):
+    result = {
+        "event_id": uuid.uuid1().hex,
+        "level": logging.ERROR,
+        "logger": "default",
+        "tags": [],
+    }
+    result.update(kwargs)
+    return result
+
+
+class EventManagerTestMixin:
+    def make_release_event(self, release_name, project_id):
+        manager = EventManager(make_event(release=release_name))
+        manager.normalize()
+        event = manager.save(project_id)
+        return event
+
+
+class EventManagerTest(TestCase, EventManagerTestMixin):
+
+    # GROUPS TESTS
+    @override_options({"store.use-ingest-performance-detection-only": 1.0})
+    @override_options({"performance.issues.all.problem-creation": 1.0})
+    @override_options({"performance.issues.all.problem-detection": 1.0})
+    def test_perf_issue_creation(self):
+        self.project.update_option("sentry:performance_issue_creation_rate", 1.0)
+
+        with mock.patch("sentry_sdk.tracing.Span.containing_transaction"), self.feature(
+            "projects:performance-suspect-spans-ingestion"
+        ):
+            manager = EventManager(make_event(**EVENTS["n-plus-one-in-django-index-view"]))
+            manager.normalize()
+            event = manager.save(self.project.id)
+            data = event.data
+            assert data["type"] == "transaction"
+            assert data["span_grouping_config"]["id"] == "default:2021-08-25"
+            spans = [{"hash": span["hash"]} for span in data["spans"]]
+            # the basic strategy is to simply use the description
+            assert spans == [{"hash": hash_values([span["description"]])} for span in data["spans"]]
+            assert len(event.groups) == 1
+            group = event.groups[0]
+            assert (
+                group.title
+                == "N+1 Query:SELECT `books_book`.`id`, `books_book`.`title`, `books_book`.`author_id` FROM `books_book` LIMIT 10"
+            )
+            assert group.message == "/books/"
+            assert group.culprit == "/books/"
+            assert group.get_event_type() == "transaction"
+            assert group.get_event_metadata() == {
+                "location": "/books/",
+                "title": "N+1 Query:SELECT `books_book`.`id`, `books_book`.`title`, `books_book`.`author_id` FROM `books_book` LIMIT 10",
+            }
+            assert group.location() == "/books/"
+            assert group.level == 40
+            assert group.issue_category == GroupCategory.PERFORMANCE
+            assert group.issue_type == GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES
+
+    @override_options({"store.use-ingest-performance-detection-only": 1.0})
+    @override_options({"performance.issues.all.problem-creation": 1.0})
+    @override_options({"performance.issues.all.problem-detection": 1.0})
+    def test_perf_issue_update(self):
+        self.project.update_option("sentry:performance_issue_creation_rate", 1.0)
+
+        with mock.patch("sentry_sdk.tracing.Span.containing_transaction"), self.feature(
+            "projects:performance-suspect-spans-ingestion"
+        ):
+            manager = EventManager(make_event(**EVENTS["n-plus-one-in-django-index-view"]))
+            manager.normalize()
+            event = manager.save(self.project.id)
+            group = event.groups[0]
+            assert group.issue_category == GroupCategory.PERFORMANCE
+            assert group.issue_type == GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES
+            group.data["metadata"] = {
+                "location": "hi",
+                "title": "lol",
+            }
+            group.culprit = "wat"
+            group.message = "nope"
+            group.save()
+            assert group.location() == "hi"
+            assert group.title == "lol"
+
+            manager = EventManager(make_event(**EVENTS["n-plus-one-in-django-index-view"]))
+            manager.normalize()
+            with self.tasks():
+                manager.save(self.project.id)
+            # Make sure the original group is updated via buffers
+            group.refresh_from_db()
+            assert (
+                group.title
+                == "N+1 Query:SELECT `books_book`.`id`, `books_book`.`title`, `books_book`.`author_id` FROM `books_book` LIMIT 10"
+            )
+            assert group.get_event_metadata() == {
+                "location": "/books/",
+                "title": "N+1 Query:SELECT `books_book`.`id`, `books_book`.`title`, `books_book`.`author_id` FROM `books_book` LIMIT 10",
+            }
+            assert group.location() == "/books/"
+            assert group.message == "/books/"
+            assert group.culprit == "/books/"

+ 10 - 0
tests/sentry/utils/performance_issues/events/n-plus-one-in-django-index-view.json

@@ -4,6 +4,16 @@
   "culprit": "/books/",
   "environment": "production",
   "location": "/books/",
+  "contexts": {
+    "trace": {
+      "trace_id": "10d0b72df0fe4392a6788bce71ec2028",
+      "span_id": "1756e116945a4360",
+      "parent_span_id": "d71f841b69164c33",
+      "op": "http.server",
+      "status": "ok",
+      "type": "trace"
+    }
+  },
   "spans": [
     {
       "timestamp": 1661957873.995433,