Browse Source

fix(reprocessing): Add indexes on datetime cols for reprocessing models to fix `clear_expired_raw_events` (#67947)

The `clear-expired-raw-events` task has been failing for a while:
https://sentry.sentry.io/crons/sentry/clear-expired-raw-events/?environment=prod&statsPeriod=24h

This is because it performs batch deletes on these tables, filtering on
`datetime`, but there is no index on that column.

Also limit the maximum number of batches the query can run per model (1000),
and increase the batch size from 200 to 1000.
Dan Fuller 11 months ago
parent
commit
bd4c091570

+ 1 - 1
migrations_lockfile.txt

@@ -9,5 +9,5 @@ feedback: 0004_index_together
 hybridcloud: 0015_apitokenreplica_hashed_token_index
 nodestore: 0002_nodestore_no_dictfield
 replays: 0004_index_together
-sentry: 0682_monitors_constrain_to_project_id_slug
+sentry: 0683_reprocessing_datetime_indexes
 social_auth: 0002_default_auto_field

+ 42 - 0
src/sentry/migrations/0683_reprocessing_datetime_indexes.py

@@ -0,0 +1,42 @@
+# Generated by Django 5.0.3 on 2024-03-29 17:00
+
+import django.utils.timezone
+from django.db import migrations, models
+
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+    # This flag is used to mark that a migration shouldn't be automatically run in production. For
+    # the most part, this should only be used for operations where it's safe to run the migration
+    # after your code has deployed. So this should not be used for most operations that alter the
+    # schema of a table.
+    # Here are some things that make sense to mark as dangerous:
+    # - Large data migrations. Typically we want these to be run manually by ops so that they can
+    #   be monitored and not block the deploy for a long period of time while they run.
+    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
+    #   have ops run this and not block the deploy. Note that while adding an index is a schema
+    #   change, it's completely safe to run the operation after the code has deployed.
+    is_dangerous = True  # This migration only adds indexes (see operations below).
+
+    dependencies = [
+        ("sentry", "0682_monitors_constrain_to_project_id_slug"),
+    ]
+
+    operations = [  # Each AlterField only adds db_index=True to the model's `datetime` column.
+        migrations.AlterField(
+            model_name="processingissue",
+            name="datetime",
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name="rawevent",
+            name="datetime",
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name="reprocessingreport",
+            name="datetime",
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+    ]

+ 1 - 1
src/sentry/models/processingissue.py

@@ -133,7 +133,7 @@ class ProcessingIssue(Model):
     checksum = models.CharField(max_length=40, db_index=True)
     type = models.CharField(max_length=30)
     data = GzippedDictField()
-    datetime = models.DateTimeField(default=timezone.now)
+    datetime = models.DateTimeField(default=timezone.now, db_index=True)
 
     objects: ClassVar[ProcessingIssueManager] = ProcessingIssueManager()
 

+ 1 - 1
src/sentry/models/rawevent.py

@@ -16,7 +16,7 @@ class RawEvent(Model):
 
     project = FlexibleForeignKey("sentry.Project")
     event_id = models.CharField(max_length=32, null=True)
-    datetime = models.DateTimeField(default=timezone.now)
+    datetime = models.DateTimeField(default=timezone.now, db_index=True)
     data = NodeField(
         blank=True, null=True, ref_func=ref_func, ref_version=1, wrapper=CanonicalKeyView
     )

+ 1 - 1
src/sentry/models/reprocessingreport.py

@@ -11,7 +11,7 @@ class ReprocessingReport(Model):
 
     project = FlexibleForeignKey("sentry.Project")
     event_id = models.CharField(max_length=32, null=True)
-    datetime = models.DateTimeField(default=timezone.now)
+    datetime = models.DateTimeField(default=timezone.now, db_index=True)
 
     class Meta:
         app_label = "sentry"

+ 9 - 2
src/sentry/tasks/reprocessing.py

@@ -61,6 +61,11 @@ def clear_expired_raw_events():
     from sentry.models.rawevent import RawEvent
     from sentry.models.reprocessingreport import ReprocessingReport
 
+    # Max number of times to attempt to query each model
+    MAX_BATCHES_PER_MODEL = 1000
+    # Number of rows to fetch/delete for each query
+    LIMIT_PER_QUERY = 1000
+
     def batched_delete(model_cls, **filter):
         # Django 1.6's `Queryset.delete` runs in this order:
         #
@@ -72,10 +77,12 @@ def clear_expired_raw_events():
         # out at that point and never do the delete.
         #
         # Better to delete a few rows than none.
-        while True:
+        for _ in range(MAX_BATCHES_PER_MODEL):
             # Django already loads this into memory, might as well do it
             # explicitly. Makes check for result emptiness cheaper.
-            result = set(model_cls.objects.filter(**filter)[:200].values_list("pk", flat=True))
+            result = set(
+                model_cls.objects.filter(**filter)[:LIMIT_PER_QUERY].values_list("pk", flat=True)
+            )
             if not result:
                 break