Browse Source

feat(crons): Add helpful subtitle to crons issue platform + update evidence (#51877)

Adds helpful human-readable subtitle to issue platform crons issues +
updates `last check-in` evidence to `last successful check-in`
Richard Ortenberg 1 year ago
parent
commit
b466fbcefb
3 changed files with 84 additions and 21 deletions
  1. 45 8
      src/sentry/monitors/models.py
  2. 8 2
      src/sentry/monitors/tasks.py
  3. 31 11
      tests/sentry/monitors/test_models.py

+ 45 - 8
src/sentry/monitors/models.py

@@ -480,7 +480,16 @@ class MonitorEnvironment(Model):
     def get_audit_log_data(self):
         return {"name": self.environment.name, "status": self.status, "monitor": self.monitor.name}
 
-    def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
+    def get_last_successful_checkin(self):
+        return (
+            MonitorCheckIn.objects.filter(monitor_environment=self, status=CheckInStatus.OK)
+            .order_by("-date_added")
+            .first()
+        )
+
+    def mark_failed(
+        self, last_checkin=None, reason=MonitorFailure.UNKNOWN, occurrence_context=None
+    ):
         from sentry.signals import monitor_environment_failed
 
         if last_checkin is None:
@@ -529,7 +538,16 @@ class MonitorEnvironment(Model):
             from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
             from sentry.issues.producer import produce_occurrence_to_kafka
 
-            occurrence_data = get_occurrence_data(reason)
+            if not occurrence_context:
+                occurrence_context = {}
+
+            occurrence_data = get_occurrence_data(reason, **occurrence_context)
+
+            # Get last successful check-in to show in evidence display
+            last_successful_checkin_timestamp = "None"
+            last_successful_checkin = self.get_last_successful_checkin()
+            if last_successful_checkin:
+                last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat()
 
             occurrence = IssueOccurrence(
                 id=uuid.uuid4().hex,
@@ -541,14 +559,16 @@ class MonitorEnvironment(Model):
                 ],
                 type=occurrence_data["group_type"],
                 issue_title=f"Monitor failure: {self.monitor.name}",
-                subtitle="",
+                subtitle=occurrence_data["subtitle"],
                 evidence_display=[
                     IssueEvidence(
                         name="Failure reason", value=occurrence_data["reason"], important=True
                     ),
                     IssueEvidence(name="Environment", value=self.environment.name, important=False),
                     IssueEvidence(
-                        name="Last check-in", value=last_checkin.isoformat(), important=False
+                        name="Last successful check-in",
+                        value=last_successful_checkin_timestamp,
+                        important=False,
                     ),
                 ],
                 evidence_data={},
@@ -613,13 +633,30 @@ class MonitorEnvironment(Model):
         MonitorEnvironment.objects.filter(id=self.id).exclude(last_checkin__gt=ts).update(**params)
 
 
-def get_occurrence_data(reason: str):
+def get_occurrence_data(reason: str, **kwargs):
     if reason == MonitorFailure.MISSED_CHECKIN:
-        return {"group_type": MonitorCheckInMissed, "level": "warning", "reason": "missed_checkin"}
+        expected_time = kwargs.get("expected_time", "the expected time")
+        return {
+            "group_type": MonitorCheckInMissed,
+            "level": "warning",
+            "reason": "missed_checkin",
+            "subtitle": f"No check-in reported at {expected_time}.",
+        }
     elif reason == MonitorFailure.DURATION:
-        return {"group_type": MonitorCheckInTimeout, "level": "error", "reason": "duration"}
+        timeout = kwargs.get("timeout", 30)
+        return {
+            "group_type": MonitorCheckInTimeout,
+            "level": "error",
+            "reason": "duration",
+            "subtitle": f"Check-in exceeded maximum duration of {timeout} minutes.",
+        }
 
-    return {"group_type": MonitorCheckInFailure, "level": "error", "reason": "error"}
+    return {
+        "group_type": MonitorCheckInFailure,
+        "level": "error",
+        "reason": "error",
+        "subtitle": "An error occurred during the last check-in.",
+    }
 
 
 @receiver(pre_save, sender=MonitorEnvironment)

+ 8 - 2
src/sentry/monitors/tasks.py

@@ -88,7 +88,10 @@ def check_monitors(current_datetime=None):
                 expected_time=expected_time,
                 monitor_config=monitor.get_validated_config(),
             )
-            monitor_environment.mark_failed(reason=MonitorFailure.MISSED_CHECKIN)
+            monitor_environment.mark_failed(
+                reason=MonitorFailure.MISSED_CHECKIN,
+                occurrence_context={"expected_time": expected_time},
+            )
         except Exception:
             logger.exception("Exception in check_monitors - mark missed")
 
@@ -124,6 +127,9 @@ def check_monitors(current_datetime=None):
                 status__in=[CheckInStatus.OK, CheckInStatus.ERROR],
             ).exists()
             if not has_newer_result:
-                monitor_environment.mark_failed(reason=MonitorFailure.DURATION)
+                monitor_environment.mark_failed(
+                    reason=MonitorFailure.DURATION,
+                    occurrence_context={"timeout": int(timeout.total_seconds() // 60)},
+                )
         except Exception:
             logger.exception("Exception in check_monitors - mark timeout")

+ 31 - 11
tests/sentry/monitors/test_models.py

@@ -13,7 +13,9 @@ from sentry.issues.grouptype import (
     MonitorCheckInTimeout,
 )
 from sentry.monitors.models import (
+    CheckInStatus,
     Monitor,
+    MonitorCheckIn,
     MonitorEnvironment,
     MonitorEnvironmentLimitsExceeded,
     MonitorFailure,
@@ -314,6 +316,13 @@ class MonitorEnvironmentTestCase(TestCase):
             status=monitor.status,
         )
 
+        successful_check_in = MonitorCheckIn.objects.create(
+            monitor=monitor,
+            monitor_environment=monitor_environment,
+            project_id=self.project.id,
+            status=CheckInStatus.OK,
+        )
+
         last_checkin = timezone.now()
         assert monitor_environment.mark_failed(last_checkin=last_checkin)
 
@@ -328,7 +337,7 @@ class MonitorEnvironmentTestCase(TestCase):
                 "project_id": self.project.id,
                 "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "error"])],
                 "issue_title": f"Monitor failure: {monitor.name}",
-                "subtitle": "",
+                "subtitle": "An error occurred during the last check-in.",
                 "resource_id": None,
                 "evidence_data": {},
                 "evidence_display": [
@@ -339,8 +348,8 @@ class MonitorEnvironmentTestCase(TestCase):
                         "important": False,
                     },
                     {
-                        "name": "Last check-in",
-                        "value": last_checkin.isoformat(),
+                        "name": "Last successful check-in",
+                        "value": successful_check_in.date_added.isoformat(),
                         "important": False,
                     },
                 ],
@@ -391,9 +400,17 @@ class MonitorEnvironmentTestCase(TestCase):
             environment=self.environment,
             status=monitor.status,
         )
+        successful_check_in = MonitorCheckIn.objects.create(
+            monitor=monitor,
+            monitor_environment=monitor_environment,
+            project_id=self.project.id,
+            status=CheckInStatus.OK,
+        )
         last_checkin = timezone.now()
         assert monitor_environment.mark_failed(
-            last_checkin=last_checkin, reason=MonitorFailure.DURATION
+            last_checkin=last_checkin,
+            reason=MonitorFailure.DURATION,
+            occurrence_context={"timeout": 30},
         )
 
         assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
@@ -407,7 +424,7 @@ class MonitorEnvironmentTestCase(TestCase):
                 "project_id": self.project.id,
                 "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])],
                 "issue_title": f"Monitor failure: {monitor.name}",
-                "subtitle": "",
+                "subtitle": "Check-in exceeded maximum duration of 30 minutes.",
                 "resource_id": None,
                 "evidence_data": {},
                 "evidence_display": [
@@ -418,8 +435,8 @@ class MonitorEnvironmentTestCase(TestCase):
                         "important": False,
                     },
                     {
-                        "name": "Last check-in",
-                        "value": last_checkin.isoformat(),
+                        "name": "Last successful check-in",
+                        "value": successful_check_in.date_added.isoformat(),
                         "important": False,
                     },
                 ],
@@ -471,8 +488,11 @@ class MonitorEnvironmentTestCase(TestCase):
             status=monitor.status,
         )
         last_checkin = timezone.now()
+        expected_time = monitor.get_next_scheduled_checkin_without_margin(last_checkin)
         assert monitor_environment.mark_failed(
-            last_checkin=last_checkin, reason=MonitorFailure.MISSED_CHECKIN
+            last_checkin=last_checkin,
+            reason=MonitorFailure.MISSED_CHECKIN,
+            occurrence_context={"expected_time": expected_time},
         )
 
         monitor.refresh_from_db()
@@ -490,7 +510,7 @@ class MonitorEnvironmentTestCase(TestCase):
                 "project_id": self.project.id,
                 "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])],
                 "issue_title": f"Monitor failure: {monitor.name}",
-                "subtitle": "",
+                "subtitle": f"No check-in reported at {expected_time}.",
                 "resource_id": None,
                 "evidence_data": {},
                 "evidence_display": [
@@ -501,8 +521,8 @@ class MonitorEnvironmentTestCase(TestCase):
                         "important": False,
                     },
                     {
-                        "name": "Last check-in",
-                        "value": last_checkin.isoformat(),
+                        "name": "Last successful check-in",
+                        "value": "None",
                         "important": False,
                     },
                 ],