Remove ProcessingIssue models, step 1 (#73939)

This removes the last use of `RawEvent`, `ProcessingIssue`,
`EventProcessingIssue` and `ReprocessingReport`. It also removes the DB
foreign key constraints on those models, which is the first step toward
removing the tables for good.
Arpad Borsos committed 8 months ago
commit 3e5f726eaf
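
What `db_constraint=False` does, sketched in stock Django for context (Sentry's `FlexibleForeignKey` is a wrapper around Django's `ForeignKey`, so treat this as an approximation rather than the exact code): the relation survives at the ORM level, while the database stops enforcing it, removing the cross-table dependency before the tables themselves go away.

# Editor's sketch in stock Django, not Sentry's FlexibleForeignKey.
from django.db import models

class RawEvent(models.Model):
    # Before this commit: a real FOREIGN KEY ties each row to the
    # project table. After: the column and its index remain, and the
    # ORM still joins and cascades in Python, but no constraint is
    # enforced at the SQL level.
    project = models.ForeignKey(
        "sentry.Project",
        on_delete=models.CASCADE,  # handled by Django, not the database
        db_constraint=False,
    )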

+ 1 - 1
migrations_lockfile.txt

@@ -10,6 +10,6 @@ hybridcloud: 0016_add_control_cacheversion
 nodestore: 0002_nodestore_no_dictfield
 remote_subscriptions: 0002_remove_separate_remote_subscription
 replays: 0004_index_together
-sentry: 0733_relocation_provenance
+sentry: 0734_rm_reprocessing_step1
 social_auth: 0002_default_auto_field
 uptime: 0002_remove_separate_remote_subscription

+ 0 - 3
src/sentry/conf/server.py

@@ -2474,9 +2474,6 @@ SENTRY_MAX_SERIALIZED_FILE_SIZE = 5000000
 # Max file size for avatar photo uploads
 SENTRY_MAX_AVATAR_SIZE = 5000000
 
-# The maximum age of raw events before they are deleted
-SENTRY_RAW_EVENT_MAX_AGE_DAYS = 10
-
 # statuspage.io support
 STATUS_PAGE_ID: str | None = None
 STATUS_PAGE_API_HOST = "statuspage.io"

+ 75 - 0
src/sentry/migrations/0734_rm_reprocessing_step1.py

@@ -0,0 +1,75 @@
+# Generated by Django 5.0.6 on 2024-07-08 13:42
+
+import django.db.models.deletion
+from django.db import migrations
+
+import sentry.db.models.fields.foreignkey
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+    # This flag is used to mark that a migration shouldn't be automatically run in production.
+    # This should only be used for operations where it's safe to run the migration after your
+    # code has deployed. So this should not be used for most operations that alter the schema
+    # of a table.
+    # Here are some things that make sense to mark as post deployment:
+    # - Large data migrations. Typically we want these to be run manually so that they can be
+    #   monitored and not block the deploy for a long period of time while they run.
+    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
+    #   run this outside deployments so that we don't block them. Note that while adding an index
+    #   is a schema change, it's completely safe to run the operation after the code has deployed.
+    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment
+
+    is_post_deployment = False
+
+    dependencies = [
+        ("sentry", "0733_relocation_provenance"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="eventprocessingissue",
+            name="processing_issue",
+            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
+                db_constraint=False,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="sentry.processingissue",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="eventprocessingissue",
+            name="raw_event",
+            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
+                db_constraint=False,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="sentry.rawevent",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="processingissue",
+            name="project",
+            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
+                db_constraint=False,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="sentry.project",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="rawevent",
+            name="project",
+            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
+                db_constraint=False,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="sentry.project",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="reprocessingreport",
+            name="project",
+            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
+                db_constraint=False,
+                on_delete=django.db.models.deletion.CASCADE,
+                to="sentry.project",
+            ),
+        ),
+    ]
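
The "step 2" that this migration sets up is not part of the commit. As a hypothetical sketch in plain Django (the migration name and the bare `DeleteModel` operations are this editor's assumptions; Sentry's actual follow-up may use its own safe-deletion helpers), it would drop the model state and the tables:

# Hypothetical follow-up migration, not from this commit.
from django.db import migrations

class Migration(migrations.Migration):
    dependencies = [
        ("sentry", "0734_rm_reprocessing_step1"),
    ]

    operations = [
        # The join table goes first, then the models it referenced.
        migrations.DeleteModel(name="EventProcessingIssue"),
        migrations.DeleteModel(name="ProcessingIssue"),
        migrations.DeleteModel(name="RawEvent"),
        migrations.DeleteModel(name="ReprocessingReport"),
    ]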

+ 3 - 3
src/sentry/models/processingissue.py

@@ -17,7 +17,7 @@ from sentry.db.models import (
 class ProcessingIssue(Model):
     __relocation_scope__ = RelocationScope.Excluded
 
-    project = FlexibleForeignKey("sentry.Project", db_index=True)
+    project = FlexibleForeignKey("sentry.Project", db_index=True, db_constraint=False)
     checksum = models.CharField(max_length=40, db_index=True)
     type = models.CharField(max_length=30)
     data = GzippedDictField()
@@ -43,8 +43,8 @@ class ProcessingIssue(Model):
 class EventProcessingIssue(Model):
     __relocation_scope__ = RelocationScope.Excluded
 
-    raw_event = FlexibleForeignKey("sentry.RawEvent")
-    processing_issue = FlexibleForeignKey("sentry.ProcessingIssue")
+    raw_event = FlexibleForeignKey("sentry.RawEvent", db_constraint=False)
+    processing_issue = FlexibleForeignKey("sentry.ProcessingIssue", db_constraint=False)
 
     class Meta:
         app_label = "sentry"

+ 1 - 1
src/sentry/models/rawevent.py

@@ -19,7 +19,7 @@ def ref_func(x):
 class RawEvent(Model):
     __relocation_scope__ = RelocationScope.Excluded
 
-    project = FlexibleForeignKey("sentry.Project")
+    project = FlexibleForeignKey("sentry.Project", db_constraint=False)
     event_id = models.CharField(max_length=32, null=True)
     datetime = models.DateTimeField(default=timezone.now, db_index=True)
     data: models.Field[Mapping[str, Any], NodeData] = NodeField(

+ 1 - 1
src/sentry/models/reprocessingreport.py

@@ -9,7 +9,7 @@ from sentry.db.models import FlexibleForeignKey, Model, region_silo_model, sane_
 class ReprocessingReport(Model):
     __relocation_scope__ = RelocationScope.Excluded
 
-    project = FlexibleForeignKey("sentry.Project")
+    project = FlexibleForeignKey("sentry.Project", db_constraint=False)
     event_id = models.CharField(max_length=32, null=True)
     datetime = models.DateTimeField(default=timezone.now, db_index=True)
 

+ 2 - 48
src/sentry/tasks/reprocessing.py

@@ -1,14 +1,8 @@
-import logging
-from datetime import timedelta
-
-from django.conf import settings
-from django.utils import timezone
+# TODO: delete this whole file once confirming no more of these tasks are floating around anywhere
 
 from sentry.silo.base import SiloMode
 from sentry.tasks.base import instrumented_task
 
-logger = logging.getLogger(__name__)
-
 
 @instrumented_task(
     name="sentry.tasks.reprocess_events",
@@ -26,44 +20,4 @@ def reprocess_events(project_id, **kwargs):
     silo_mode=SiloMode.REGION,
 )
 def clear_expired_raw_events():
-    from sentry.models.processingissue import ProcessingIssue
-    from sentry.models.rawevent import RawEvent
-    from sentry.models.reprocessingreport import ReprocessingReport
-
-    # Max number of times to attempt to query each model
-    MAX_BATCHES_PER_MODEL = 10000
-    # Number of rows to fetch/delete for each query
-    LIMIT_PER_QUERY = 100
-
-    def batched_delete(model_cls, **filter):
-        # Django 1.6's `Queryset.delete` runs in this order:
-        #
-        # 1. Fetch all models
-        # 2. Call all `on_delete`s
-        # 3. Delete from DB (batched `DELETE WHERE id in (...)`)
-        #
-        # Since we attempt to unpickle `NodeField`s in Step 2, we might time
-        # out at that point and never do the delete.
-        #
-        # Better to delete a few rows than none.
-        for _ in range(MAX_BATCHES_PER_MODEL):
-            # Django already loads this into memory, might as well do it
-            # explicitly. Makes check for result emptiness cheaper.
-            result = set(
-                model_cls.objects.filter(**filter)[:LIMIT_PER_QUERY].values_list("pk", flat=True)
-            )
-            if not result:
-                break
-
-            # Django ORM can't do delete with limit
-            model_cls.objects.filter(pk__in=result).delete()
-
-    cutoff = timezone.now() - timedelta(days=settings.SENTRY_RAW_EVENT_MAX_AGE_DAYS)
-
-    # Clear old raw events and reprocessing reports
-    batched_delete(RawEvent, datetime__lt=cutoff)
-    batched_delete(ReprocessingReport, datetime__lt=cutoff)
-
-    # Processing issues get a bit of extra time before we delete them
-    cutoff = timezone.now() - timedelta(days=int(settings.SENTRY_RAW_EVENT_MAX_AGE_DAYS * 1.3))
-    batched_delete(ProcessingIssue, datetime__lt=cutoff)
+    pass
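
The deleted helper embodies a pattern worth keeping in mind beyond this file. A generic sketch (the function below is illustrative, not part of Sentry's API): each `.delete()` still loads its rows and runs `on_delete` handlers, but capping the batch size means a mid-run timeout costs at most one batch of progress instead of all of it.

# Generic batched-delete sketch; model_cls is any Django model class.
def batched_delete(model_cls, limit_per_query=100, max_batches=10_000, **filters):
    for _ in range(max_batches):
        # Fetch only primary keys so the emptiness check stays cheap.
        pks = list(
            model_cls.objects.filter(**filters)
            .values_list("pk", flat=True)[:limit_per_query]
        )
        if not pks:
            break
        # The ORM cannot express DELETE ... LIMIT, hence delete by pk.
        model_cls.objects.filter(pk__in=pks).delete()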