
ref(crons): Cleanup missed / timeout status (#66785)

This migration removes all monitor environment MISSED_CHECKIN and
TIMEOUT statuses from the database.

These statuses are no longer required since an incident may be composed
of check-ins that are failing in more than one way.
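
For context, the data mapping is small: the two retired per-environment
statuses collapse into ERROR. A minimal illustrative sketch, using the
integer values noted in the migration and test comments (5 = ERROR,
6 = MISSED_CHECKIN, 7 = TIMEOUT); the helper name is not part of the
change:

    # Illustrative only: mirrors what the data migration below does per row.
    ERROR = 5
    MISSED_CHECKIN = 6
    TIMEOUT = 7

    def collapse_status(status: int) -> int:
        """Map the retired statuses onto ERROR; leave everything else alone."""
        return ERROR if status in (MISSED_CHECKIN, TIMEOUT) else status
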
Evan Purkhiser, 1 year ago
commit 7d313b262f

+ 1 - 1
migrations_lockfile.txt

@@ -9,5 +9,5 @@ feedback: 0004_index_together
 hybridcloud: 0014_apitokenreplica_add_hashed_token
 nodestore: 0002_nodestore_no_dictfield
 replays: 0004_index_together
-sentry: 0673_add_env_muted_to_broken_detection
+sentry: 0674_monitor_clear_missed_timeout_as_error
 social_auth: 0002_default_auto_field

+ 47 - 0
src/sentry/migrations/0674_monitor_clear_missed_timeout_as_error.py

@@ -0,0 +1,47 @@
+# Generated by Django 5.0.2 on 2024-03-12 14:45
+
+from django.db import migrations
+
+from sentry.new_migrations.migrations import CheckedMigration
+from sentry.utils.query import RangeQuerySetWrapperWithProgressBar
+
+
+def clear_missed_timeout_as_error(apps, schema_editor):
+    MonitorEnvironment = apps.get_model("sentry", "MonitorEnvironment")
+
+    # Status from MonitorStatus:
+    #
+    # 6: MonitorStatus.MISSED_CHECKIN
+    # 7: MonitorStatus.TIMEOUT
+    monitors_missed_timeout = MonitorEnvironment.objects.filter(status__in=(6, 7))
+
+    for monitor_env in RangeQuerySetWrapperWithProgressBar(monitors_missed_timeout):
+        # Set to MonitorStatus.ERROR
+        monitor_env.status = 5
+        monitor_env.save(update_fields=["status"])
+
+
+class Migration(CheckedMigration):
+    # This flag is used to mark that a migration shouldn't be automatically run in production. For
+    # the most part, this should only be used for operations where it's safe to run the migration
+    # after your code has deployed. So this should not be used for most operations that alter the
+    # schema of a table.
+    # Here are some things that make sense to mark as dangerous:
+    # - Large data migrations. Typically we want these to be run manually by ops so that they can
+    #   be monitored and not block the deploy for a long period of time while they run.
+    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
+    #   have ops run this and not block the deploy. Note that while adding an index is a schema
+    #   change, it's completely safe to run the operation after the code has deployed.
+    is_dangerous = True
+
+    dependencies = [
+        ("sentry", "0673_add_env_muted_to_broken_detection"),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            clear_missed_timeout_as_error,
+            migrations.RunPython.noop,
+            hints={"tables": ["sentry_monitorenvironment"]},
+        ),
+    ]
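
RangeQuerySetWrapperWithProgressBar pages through the matching rows in
primary-key ranges and reports progress while each row is saved
individually. For comparison, a minimal sketch of the one-shot bulk
alternative this migration does not use, assuming the same historical
model state from apps.get_model; the function name is hypothetical:

    def clear_missed_timeout_as_error_bulk(apps, schema_editor):
        # Hypothetical alternative, not the committed code: a single UPDATE
        # instead of per-row saves. Simpler, but gives no progress reporting
        # over what may be a large sentry_monitorenvironment table.
        MonitorEnvironment = apps.get_model("sentry", "MonitorEnvironment")
        MonitorEnvironment.objects.filter(status__in=(6, 7)).update(status=5)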

+ 76 - 0
tests/sentry/migrations/test_0674_monitor_clear_missed_timeout_as_error.py

@@ -0,0 +1,76 @@
+from sentry.monitors.models import MonitorType, ScheduleType
+from sentry.testutils.cases import TestMigrations
+
+
+class TestMonitorClearMissedTimeoutAsError(TestMigrations):
+    migrate_from = "0673_add_env_muted_to_broken_detection"
+    migrate_to = "0674_monitor_clear_missed_timeout_as_error"
+
+    def setup_before_migration(self, apps):
+        Monitor = apps.get_model("sentry", "Monitor")
+        MonitorEnvironment = apps.get_model("sentry", "MonitorEnvironment")
+
+        self.monitor = Monitor.objects.create(
+            guid="9aa14d45-1232-4a4b-9d90-c954c0377970",
+            organization_id=self.organization.id,
+            project_id=self.project.id,
+            type=MonitorType.CRON_JOB,
+            config={
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
+                "checkin_margin": None,
+                "max_runtime": None,
+            },
+        )
+        self.monitor_env1 = MonitorEnvironment.objects.create(
+            monitor=self.monitor,
+            environment_id=self.create_environment(
+                organization=self.organization, project=self.project, name="prod1"
+            ).id,
+            status=4,  # OK
+        )
+        self.monitor_env2 = MonitorEnvironment.objects.create(
+            monitor=self.monitor,
+            environment_id=self.create_environment(
+                organization=self.organization, project=self.project, name="prod2"
+            ).id,
+            status=5,  # ERROR
+        )
+        self.monitor_env3 = MonitorEnvironment.objects.create(
+            monitor=self.monitor,
+            environment_id=self.create_environment(
+                organization=self.organization, project=self.project, name="prod3"
+            ).id,
+            status=6,  # MISSED_CHECKIN
+        )
+        self.monitor_env4 = MonitorEnvironment.objects.create(
+            monitor=self.monitor,
+            environment_id=self.create_environment(
+                organization=self.organization, project=self.project, name="prod4"
+            ).id,
+            status=7,  # TIMEOUT
+        )
+        self.monitor_env5 = MonitorEnvironment.objects.create(
+            monitor=self.monitor,
+            environment_id=self.create_environment(
+                organization=self.organization, project=self.project, name="prod5"
+            ).id,
+            status=7,  # TIMEOUT
+        )
+
+    def test(self):
+        """
+        Validate that environments with MISSED_CHECKIN or TIMEOUT status are
+        set to the ERROR status.
+        """
+        self.monitor_env1.refresh_from_db()
+        self.monitor_env2.refresh_from_db()
+        self.monitor_env3.refresh_from_db()
+        self.monitor_env4.refresh_from_db()
+        self.monitor_env5.refresh_from_db()
+
+        assert self.monitor_env1.status == 4
+        assert self.monitor_env2.status == 5
+        assert self.monitor_env3.status == 5
+        assert self.monitor_env4.status == 5
+        assert self.monitor_env5.status == 5