Browse Source

feat(crons): Fully configure timeout_at for check-ins (#51318)

Fully updates & clears `timeout_at` for `MonitorCheckIns`
Richard Ortenberg 1 year ago
parent
commit
c0ac9be760

+ 29 - 13
src/sentry/monitors/consumers/monitor_consumer.py

@@ -1,6 +1,6 @@
-import datetime
 import logging
 import uuid
+from datetime import datetime, timedelta
 from typing import Dict, Mapping, Optional
 
 import msgpack
@@ -27,8 +27,13 @@ from sentry.monitors.models import (
     MonitorLimitsExceeded,
     MonitorType,
 )
-from sentry.monitors.tasks import TIMEOUT
-from sentry.monitors.utils import signal_first_checkin, signal_first_monitor_created, valid_duration
+from sentry.monitors.utils import (
+    get_new_timeout_at,
+    get_timeout_at,
+    signal_first_checkin,
+    signal_first_monitor_created,
+    valid_duration,
+)
 from sentry.monitors.validators import ConfigValidator, MonitorCheckInValidator
 from sentry.utils import json, metrics
 from sentry.utils.dates import to_datetime
@@ -151,7 +156,10 @@ def _process_message(wrapper: Dict) -> None:
         return
 
     def update_existing_check_in(
-        existing_check_in: MonitorCheckIn, updated_status: CheckInStatus, updated_duration: float
+        existing_check_in: MonitorCheckIn,
+        updated_status: CheckInStatus,
+        updated_duration: float,
+        new_date_updated: datetime,
     ):
         if (
             existing_check_in.project_id != project_id
@@ -195,7 +203,19 @@ def _process_message(wrapper: Dict) -> None:
             logger.debug("check-in implicit duration is invalid: %s", project.organization_id)
             return
 
-        existing_check_in.update(status=updated_status, duration=updated_duration)
+        # update date_added for heartbeat
+        date_updated = existing_check_in.date_updated
+        if updated_status == CheckInStatus.IN_PROGRESS:
+            date_updated = new_date_updated
+
+        updated_timeout_at = get_new_timeout_at(existing_check_in, updated_status, new_date_updated)
+
+        existing_check_in.update(
+            status=updated_status,
+            duration=updated_duration,
+            date_updated=date_updated,
+            timeout_at=updated_timeout_at,
+        )
 
         return
 
@@ -286,7 +306,7 @@ def _process_message(wrapper: Dict) -> None:
                         guid=check_in_id,
                     )
 
-                update_existing_check_in(check_in, status, validated_params["duration"])
+                update_existing_check_in(check_in, status, validated_params["duration"], start_time)
 
             except MonitorCheckIn.DoesNotExist:
                 # Infer the original start time of the check-in from the duration.
@@ -294,7 +314,7 @@ def _process_message(wrapper: Dict) -> None:
                 date_added = start_time
                 duration = validated_params["duration"]
                 if duration is not None:
-                    date_added -= datetime.timedelta(milliseconds=duration)
+                    date_added -= timedelta(milliseconds=duration)
 
                 expected_time = None
                 if monitor_environment.last_checkin:
@@ -303,11 +323,7 @@ def _process_message(wrapper: Dict) -> None:
                     )
 
                 monitor_config = monitor.get_validated_config()
-                timeout_at = None
-                if status == CheckInStatus.IN_PROGRESS:
-                    timeout_at = date_added.replace(second=0, microsecond=0) + datetime.timedelta(
-                        minutes=(monitor_config or {}).get("max_runtime") or TIMEOUT
-                    )
+                timeout_at = get_timeout_at(monitor_config, status, date_added)
 
                 trace_id = validated_params.get("contexts", {}).get("trace", {}).get("trace_id")
 
@@ -337,7 +353,7 @@ def _process_message(wrapper: Dict) -> None:
                             guid=guid,
                         )
                         if not created:
-                            update_existing_check_in(check_in, status, duration)
+                            update_existing_check_in(check_in, status, duration, start_time)
                         else:
                             signal_first_checkin(project, monitor)
 

+ 5 - 1
src/sentry/monitors/endpoints/monitor_ingest_checkin_details.py

@@ -20,7 +20,7 @@ from sentry.apidocs.utils import inline_sentry_response_serializer
 from sentry.models import Project
 from sentry.monitors.models import CheckInStatus, Monitor, MonitorCheckIn, MonitorEnvironment
 from sentry.monitors.serializers import MonitorCheckInSerializerResponse
-from sentry.monitors.utils import valid_duration
+from sentry.monitors.utils import get_new_timeout_at, valid_duration
 from sentry.monitors.validators import MonitorCheckInValidator
 
 from .base import MonitorIngestEndpoint
@@ -102,6 +102,10 @@ class MonitorIngestCheckInDetailsEndpoint(MonitorIngestEndpoint):
 
             params["duration"] = duration
 
+        params["timeout_at"] = get_new_timeout_at(
+            checkin, params.get("status", checkin.status), params["date_updated"]
+        )
+
         # TODO(rjo100): will need to remove this when environment is ensured
         monitor_environment = checkin.monitor_environment
         if not monitor_environment:

+ 2 - 8
src/sentry/monitors/endpoints/monitor_ingest_checkin_index.py

@@ -31,8 +31,7 @@ from sentry.monitors.models import (
     MonitorLimitsExceeded,
 )
 from sentry.monitors.serializers import MonitorCheckInSerializerResponse
-from sentry.monitors.tasks import TIMEOUT
-from sentry.monitors.utils import signal_first_checkin, signal_first_monitor_created
+from sentry.monitors.utils import get_timeout_at, signal_first_checkin, signal_first_monitor_created
 from sentry.monitors.validators import MonitorCheckInValidator
 from sentry.ratelimits.config import RateLimitConfig
 from sentry.types.ratelimit import RateLimit, RateLimitCategory
@@ -201,12 +200,7 @@ class MonitorIngestCheckInIndexEndpoint(MonitorIngestEndpoint):
 
             status = getattr(CheckInStatus, result["status"].upper())
             monitor_config = monitor.get_validated_config()
-
-            timeout_at = None
-            if status == CheckInStatus.IN_PROGRESS:
-                timeout_at = date_added.replace(second=0, microsecond=0) + timedelta(
-                    minutes=(monitor_config or {}).get("max_runtime") or TIMEOUT
-                )
+            timeout_at = get_timeout_at(monitor_config, status, date_added)
 
             checkin = MonitorCheckIn.objects.create(
                 project_id=project.id,

+ 22 - 1
src/sentry/monitors/utils.py

@@ -1,3 +1,4 @@
+from datetime import datetime, timedelta
 from typing import Optional
 
 from django.db import transaction
@@ -11,7 +12,8 @@ from sentry.models import Rule, RuleActivity, RuleActivityType, RuleSource, User
 from sentry.models.project import Project
 from sentry.signals import first_cron_checkin_received, first_cron_monitor_created
 
-from .models import Monitor
+from .models import CheckInStatus, Monitor, MonitorCheckIn
+from .tasks import TIMEOUT
 
 
 def signal_first_checkin(project: Project, monitor: Monitor):
@@ -32,6 +34,25 @@ def signal_first_monitor_created(project: Project, user, from_upsert: bool):
         )
 
 
+# Generates a timeout_at value for new check-ins
+def get_timeout_at(
+    monitor_config: dict, status: CheckInStatus, date_added: Optional[datetime]
+) -> Optional[datetime]:
+    if status == CheckInStatus.IN_PROGRESS:
+        return date_added.replace(second=0, microsecond=0) + timedelta(
+            minutes=(monitor_config or {}).get("max_runtime") or TIMEOUT
+        )
+
+    return None
+
+
+# Generates a timeout_at value for existing check-ins that are being updated
+def get_new_timeout_at(
+    checkin: MonitorCheckIn, new_status: CheckInStatus, date_updated: datetime
+) -> Optional[datetime]:
+    return get_timeout_at(checkin.monitor.get_validated_config(), new_status, date_updated)
+
+
 # Used to check valid implicit durations for closing check-ins without a duration specified
 # as payload is already validated. Max value is > 24 days.
 def valid_duration(duration: Optional[int]) -> bool:

+ 6 - 0
tests/sentry/monitors/endpoints/test_monitor_ingest_checkin_details.py

@@ -13,6 +13,7 @@ from sentry.monitors.models import (
     MonitorStatus,
     MonitorType,
 )
+from sentry.monitors.tasks import TIMEOUT
 from sentry.testutils import MonitorIngestTestCase
 from sentry.testutils.silo import region_silo_test
 
@@ -80,6 +81,10 @@ class UpdateMonitorIngestCheckinTest(MonitorIngestTestCase):
             checkin = MonitorCheckIn.objects.get(id=checkin.id)
             assert checkin.status == CheckInStatus.IN_PROGRESS
             assert checkin.date_updated > checkin.date_added
+            timeout_at = checkin.date_updated.replace(second=0, microsecond=0) + timedelta(
+                minutes=TIMEOUT
+            )
+            assert checkin.timeout_at == timeout_at
 
     def test_passing(self):
         monitor = self._create_monitor()
@@ -107,6 +112,7 @@ class UpdateMonitorIngestCheckinTest(MonitorIngestTestCase):
             assert (
                 checkin.monitor_environment.environment.name == monitor_environment.environment.name
             )
+            assert checkin.timeout_at is None
 
             monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
             assert monitor_environment.next_checkin > checkin.date_added

+ 5 - 0
tests/sentry/monitors/test_monitor_consumer.py

@@ -185,6 +185,11 @@ class MonitorConsumerTest(TestCase):
         )
         assert checkin.timeout_at == timeout_at
 
+        self.send_message(monitor.slug, guid=self.guid)
+        checkin = MonitorCheckIn.objects.get(guid=self.guid)
+        assert checkin.status == CheckInStatus.OK
+        assert checkin.timeout_at is None
+
         new_guid = uuid.uuid4().hex
         self.send_message(
             "my-other-monitor",