Browse Source

feat(crons): Resolve monitor issues (#58967)

Resolves issues with thresholds as appropriate
Richard Ortenberg 1 year ago
parent
commit
3d6f9fa4f2

+ 2 - 3
src/sentry/monitors/logic/mark_failed.py

@@ -14,7 +14,6 @@ from sentry.issues.grouptype import (
     MonitorCheckInMissed,
     MonitorCheckInTimeout,
 )
-from sentry.issues.producer import PayloadType
 from sentry.models.organization import Organization
 from sentry.monitors.constants import SUBTITLE_DATETIME_FORMAT, TIMEOUT
 from sentry.monitors.models import (
@@ -249,7 +248,7 @@ def create_issue_platform_occurrence(
     fingerprint=None,
 ):
     from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
-    from sentry.issues.producer import produce_occurrence_to_kafka
+    from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka
 
     monitor_env = failed_checkin.monitor_environment
     current_timestamp = datetime.utcnow().replace(tzinfo=timezone.utc)
@@ -301,7 +300,7 @@ def create_issue_platform_occurrence(
         "contexts": {"monitor": get_monitor_environment_context(monitor_env)},
         "environment": monitor_env.environment.name,
         "event_id": occurrence.event_id,
-        "fingerprint": fingerprint
+        "fingerprint": [fingerprint]
         if fingerprint
         else [
             "monitor",

+ 42 - 12
src/sentry/monitors/logic/mark_ok.py

@@ -43,22 +43,52 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
                 previous_checkin["status"] == CheckInStatus.OK
                 for previous_checkin in previous_checkins
             )
+        else:
+            # Mark any open incidents as recovering by default
+            incident_recovering = True
 
-            # Resolve the incident
-            if incident_recovering:
-                MonitorIncident.objects.filter(
-                    monitor_environment=monitor_env,
-                    grouphash=monitor_env.incident_grouphash,
-                ).update(
-                    resolving_checkin=checkin,
-                    resolving_timestamp=checkin.date_added,
-                )
+        # Resolve any open incidents
+        if incident_recovering:
+            # TODO(rjo100): Check for multiple open incidents where we only
+            # resolved if recovery_threshold was set and not faiure_issue_threshold
+            active_incidents = MonitorIncident.objects.filter(
+                monitor_environment=monitor_env,
+                resolving_checkin__isnull=True,
+            )
 
+            # Only send an occurrence if we have an active incident
+            for grouphash in active_incidents.values_list("grouphash", flat=True):
+                resolve_incident_group(grouphash, checkin.monitor.project_id)
+            if active_incidents.update(
+                resolving_checkin=checkin,
+                resolving_timestamp=checkin.date_added,
+            ):
                 params["last_state_change"] = ts
-            else:
-                # Don't update status if incident isn't recovered
-                params.pop("status", None)
+        else:
+            # Don't update status if incident isn't recovered
+            params.pop("status", None)
 
     MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
         **params
     )
+
+
+def resolve_incident_group(
+    fingerprint: str,
+    project_id: int,
+):
+    from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka
+    from sentry.issues.status_change_message import StatusChangeMessage
+    from sentry.models.group import GroupStatus
+
+    status_change = StatusChangeMessage(
+        fingerprint=[fingerprint],
+        project_id=project_id,
+        new_status=GroupStatus.RESOLVED,
+        new_substatus=None,
+    )
+
+    produce_occurrence_to_kafka(
+        payload_type=PayloadType.STATUS_CHANGE,
+        status_change=status_change,
+    )

+ 27 - 1
tests/sentry/monitors/logic/test_mark_ok.py

@@ -1,7 +1,10 @@
 from datetime import timedelta
+from unittest.mock import patch
 
 from django.utils import timezone
 
+from sentry.issues.producer import PayloadType
+from sentry.models.group import GroupStatus
 from sentry.monitors.logic.mark_ok import mark_ok
 from sentry.monitors.models import (
     CheckInStatus,
@@ -62,7 +65,8 @@ class MarkOkTestCase(TestCase):
         assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
         assert monitor_environment.last_checkin == now
 
-    def test_mark_ok_recovery_threshold(self):
+    @patch("sentry.issues.producer.produce_occurrence_to_kafka")
+    def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
         now = timezone.now().replace(second=0, microsecond=0)
 
         recovery_threshold = 8
@@ -126,13 +130,17 @@ class MarkOkTestCase(TestCase):
         # Incident has not resolved
         assert incident.resolving_checkin is None
         assert incident.resolving_timestamp is None
+        # no status change is sent to kafka occurrence consumer
+        assert len(mock_produce_occurrence_to_kafka.mock_calls) == 0
 
         # create another failed check-in to break the chain
+        now = now + timedelta(minutes=1)
         last_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
+            date_added=now,
         )
 
         # Still not resolved
@@ -166,3 +174,21 @@ class MarkOkTestCase(TestCase):
         # Incident resolved
         assert incident.resolving_checkin == last_checkin
         assert incident.resolving_timestamp == last_checkin.date_added
+
+        # assert status change is sent to kafka occurrence consumer
+        assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
+
+        kwargs = mock_produce_occurrence_to_kafka.call_args.kwargs
+        payload_type, status_change = kwargs["payload_type"], kwargs["status_change"]
+        status_change = status_change.to_dict()
+
+        assert payload_type == PayloadType.STATUS_CHANGE
+        assert dict(
+            status_change,
+            **{
+                "fingerprint": [incident.grouphash],
+                "project_id": monitor.project_id,
+                "new_status": GroupStatus.RESOLVED,
+                "new_substatus": None,
+            },
+        ) == dict(status_change)