Browse Source

Move metadata from `File` to `EventAttachment` (#59608)

Usage of the `File/Blob/Index` abstraction for `EventAttachment`s is
problematic for various reasons.

The `FileBlob` deduplication is a bottleneck in attachment ingestion and
has caused INC-513 among other things. The `File` abstraction also
covers *every* kind of file we store, making it hard to distinguish
different usecases like attachments and debug files. The `cleanup` of
`EventAttachment` can barely keep up with writes, primarily due to the
inefficiency of deleting the backing `File`.

This PR makes the first preparatory changes to be able to move away
from `File` to an alternative backing store. It duplicates some of the
metadata that currently lives in `File` directly to `EventAttachment`.
(Note that things like `name` and `type` were already duplicated.) It
also adds a new `blob_path` field and makes `file_id` nullable to be
forward compatible with a future switch here.

As a driveby change, it also cleans up some of the indexes on the
`EventAttachment` table that do not make sense.
Arpad Borsos 1 year ago
parent
commit
b152c238f5

+ 2 - 8
fixtures/backup/model_dependencies/detailed.json

@@ -1705,7 +1705,7 @@
       "file_id": {
         "kind": "ImplicitForeignKey",
         "model": "sentry.file",
-        "nullable": false
+        "nullable": true
       },
       "group_id": {
         "kind": "ImplicitForeignKey",
@@ -1725,13 +1725,7 @@
       "Region"
     ],
     "table_name": "sentry_eventattachment",
-    "uniques": [
-      [
-        "event_id",
-        "file_id",
-        "project_id"
-      ]
-    ]
+    "uniques": []
   },
   "sentry.eventprocessingissue": {
     "dangling": false,

+ 1 - 1
migrations_lockfile.txt

@@ -9,5 +9,5 @@ feedback: 0003_feedback_add_env
 hybridcloud: 0008_add_externalactorreplica
 nodestore: 0002_nodestore_no_dictfield
 replays: 0003_add_size_to_recording_segment
-sentry: 0592_delete_relocation_hybrid_cloud_foreign_keys
+sentry: 0593_eventattachment_metadata
 social_auth: 0002_default_auto_field

+ 11 - 5
src/sentry/api/endpoints/event_attachment_details.py

@@ -56,15 +56,21 @@ class EventAttachmentDetailsEndpoint(ProjectEndpoint):
 
     def download(self, attachment):
         file = File.objects.get(id=attachment.file_id)
+
+        content_type = attachment.content_type or file.headers.get(
+            "content-type", "application/octet-stream"
+        )
+        size = attachment.size or file.size
+
         fp = file.getfile()
         response = StreamingHttpResponse(
             iter(lambda: fp.read(4096), b""),
-            content_type=file.headers.get("content-type", "application/octet-stream"),
-        )
-        response["Content-Length"] = file.size
-        response["Content-Disposition"] = 'attachment; filename="%s"' % posixpath.basename(
-            " ".join(attachment.name.split())
+            content_type=content_type,
         )
+
+        response["Content-Length"] = size
+        name = posixpath.basename(" ".join(attachment.name.split()))
+        response["Content-Disposition"] = f'attachment; filename="{name}"'
         return response
 
     def get(self, request: Request, project, event_id, attachment_id) -> Response:

+ 14 - 7
src/sentry/api/serializers/models/eventattachment.py

@@ -8,24 +8,31 @@ from sentry.models.files.file import File
 @register(EventAttachment)
 class EventAttachmentSerializer(Serializer):
     def get_attrs(self, item_list, user, **kwargs):
-        files = {f.id: f for f in File.objects.filter(id__in=[ea.file_id for ea in item_list])}
-        return {ea: {"file": files[ea.file_id]} for ea in item_list}
+        files = {
+            f.id: f
+            for f in File.objects.filter(id__in=[ea.file_id for ea in item_list if ea.file_id])
+        }
+        return {ea: {"file": files[ea.file_id]} for ea in item_list if ea.file_id}
 
     def serialize(self, obj, attrs, user):
-        file = attrs["file"]
+        file = attrs.get("file")
+        content_type = obj.content_type or get_mimetype(file)
+        size = obj.size or file.size
+        sha1 = obj.sha1 or file.checksum
+        headers = {"Content-Type": content_type}
 
         return {
             "id": str(obj.id),
             "event_id": obj.event_id,
             "type": obj.type,
             "name": obj.name,
-            "mimetype": get_mimetype(file),
+            "mimetype": content_type,
             "dateCreated": obj.date_added,
-            "size": file.size,
+            "size": size,
             # TODO: It would be nice to deprecate these two fields.
             # If not, we can at least define `headers` as `Content-Type: $mimetype`.
-            "headers": file.headers,
-            "sha1": file.checksum,
+            "headers": headers,
+            "sha1": sha1,
         }
 
 

+ 23 - 4
src/sentry/event_manager.py

@@ -3,6 +3,7 @@ from __future__ import annotations
 import copy
 import ipaddress
 import logging
+import mimetypes
 import random
 import re
 import time
@@ -2419,20 +2420,32 @@ def save_attachment(
         logger.exception("Missing chunks for cache_key=%s", cache_key)
         return
 
+    content_type = normalize_content_type(attachment.content_type, attachment.name)
+
     file = File.objects.create(
         name=attachment.name,
         type=attachment.type,
-        headers={"Content-Type": attachment.content_type},
+        headers={"Content-Type": content_type},
     )
     file.putfile(BytesIO(data), blob_size=settings.SENTRY_ATTACHMENT_BLOB_SIZE)
 
+    size = file.size
+    sha1 = file.checksum
+    file_id = file.id
+
     EventAttachment.objects.create(
-        event_id=event_id,
+        # lookup:
         project_id=project.id,
         group_id=group_id,
-        name=attachment.name,
-        file_id=file.id,
+        event_id=event_id,
+        # metadata:
         type=attachment.type,
+        name=attachment.name,
+        content_type=content_type,
+        size=size,
+        sha1=sha1,
+        # storage:
+        file_id=file_id,
     )
 
     track_outcome(
@@ -2448,6 +2461,12 @@ def save_attachment(
     )
 
 
+def normalize_content_type(content_type: str | None, name: str) -> str:
+    if content_type:
+        return content_type.split(";")[0].strip()
+    return mimetypes.guess_type(name)[0] or "application/octet-stream"
+
+
 def save_attachments(cache_key: Optional[str], attachments: list[Attachment], job: Job) -> None:
     """
     Persists cached event attachments into the file store.

+ 60 - 0
src/sentry/migrations/0593_eventattachment_metadata.py

@@ -0,0 +1,60 @@
+# Generated by Django 3.2.23 on 2023-11-08 11:30
+
+from django.db import migrations, models
+
+import sentry.db.models.fields.bounded
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+    # This flag is used to mark that a migration shouldn't be automatically run in production. For
+    # the most part, this should only be used for operations where it's safe to run the migration
+    # after your code has deployed. So this should not be used for most operations that alter the
+    # schema of a table.
+    # Here are some things that make sense to mark as dangerous:
+    # - Large data migrations. Typically we want these to be run manually by ops so that they can
+    #   be monitored and not block the deploy for a long period of time while they run.
+    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
+    #   have ops run this and not block the deploy. Note that while adding an index is a schema
+    #   change, it's completely safe to run the operation after the code has deployed.
+    is_dangerous = False
+
+    dependencies = [
+        ("sentry", "0592_delete_relocation_hybrid_cloud_foreign_keys"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="eventattachment",
+            name="blob_path",
+            field=models.TextField(null=True),
+        ),
+        migrations.AddField(
+            model_name="eventattachment",
+            name="content_type",
+            field=models.TextField(null=True),
+        ),
+        migrations.AddField(
+            model_name="eventattachment",
+            name="sha1",
+            field=models.CharField(max_length=40, null=True),
+        ),
+        migrations.AddField(
+            model_name="eventattachment",
+            name="size",
+            field=sentry.db.models.fields.bounded.BoundedPositiveIntegerField(null=True),
+        ),
+        migrations.AlterField(
+            model_name="eventattachment",
+            name="file_id",
+            field=sentry.db.models.fields.bounded.BoundedBigIntegerField(db_index=True, null=True),
+        ),
+        migrations.AlterUniqueTogether(
+            name="eventattachment",
+            unique_together=set(),
+        ),
+        migrations.AlterIndexTogether(
+            name="eventattachment",
+            index_together={("project_id", "date_added")},
+        ),
+    ]

+ 15 - 4
src/sentry/models/eventattachment.py

@@ -5,6 +5,7 @@ from django.utils import timezone
 
 from sentry.backup.scopes import RelocationScope
 from sentry.db.models import BoundedBigIntegerField, Model, region_silo_only_model, sane_repr
+from sentry.db.models.fields.bounded import BoundedPositiveIntegerField
 
 # Attachment file types that are considered a crash report (PII relevant)
 CRASH_REPORT_TYPES = ("event.minidump", "event.applecrashreport")
@@ -32,21 +33,31 @@ def event_attachment_screenshot_filter(queryset):
 class EventAttachment(Model):
     __relocation_scope__ = RelocationScope.Excluded
 
+    # the things we want to look up attachments by:
     project_id = BoundedBigIntegerField()
     group_id = BoundedBigIntegerField(null=True, db_index=True)
     event_id = models.CharField(max_length=32, db_index=True)
-    file_id = BoundedBigIntegerField(db_index=True)
+
+    # attachment and file metadata
     type = models.CharField(max_length=64, db_index=True)
     name = models.TextField()
+    content_type = models.TextField(null=True)
+    size = BoundedPositiveIntegerField(null=True)
+    sha1 = models.CharField(max_length=40, null=True)
+
     date_added = models.DateTimeField(default=timezone.now, db_index=True)
 
+    # the backing blob, either going through the `File` model,
+    # or directly to a backing blob store
+    file_id = BoundedBigIntegerField(null=True, db_index=True)
+    blob_path = models.TextField(null=True)
+
     class Meta:
         app_label = "sentry"
         db_table = "sentry_eventattachment"
-        index_together = (("project_id", "date_added"), ("project_id", "date_added", "file_id"))
-        unique_together = (("project_id", "event_id", "file_id"),)
+        index_together = ("project_id", "date_added")
 
-    __repr__ = sane_repr("event_id", "name", "file_id")
+    __repr__ = sane_repr("event_id", "name")
 
     def delete(self, *args, **kwargs):
         from sentry.models.files.file import File

+ 8 - 7
src/sentry/reprocessing2.py

@@ -222,7 +222,6 @@ def pull_event_data(project_id, event_id) -> ReprocessableEvent:
 
 
 def reprocess_event(project_id, event_id, start_time):
-
     from sentry.ingest.consumer.processors import CACHE_TIMEOUT
     from sentry.tasks.store import preprocess_event_from_reprocessing
 
@@ -245,7 +244,10 @@ def reprocess_event(project_id, event_id, start_time):
     # (we simply update group_id on the EventAttachment models in post_process)
     attachment_objects = []
 
-    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in attachments])}
+    files = {
+        f.id: f
+        for f in models.File.objects.filter(id__in=[ea.file_id for ea in attachments if ea.file_id])
+    }
 
     for attachment_id, attachment in enumerate(attachments):
         with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
@@ -254,7 +256,7 @@ def reprocess_event(project_id, event_id, start_time):
                 _copy_attachment_into_cache(
                     attachment_id=attachment_id,
                     attachment=attachment,
-                    file=files[attachment.file_id],
+                    file=files[attachment.file_id] if attachment.file_id else None,
                     cache_key=cache_key,
                     cache_timeout=CACHE_TIMEOUT,
                 )
@@ -312,7 +314,6 @@ def _send_delete_old_primary_hash_messages(
         # Racing might be happening between two different tasks. Give up on the
         # task that's lagging behind by prematurely terminating flushing.
         if len(event_ids) == 0:
-
             logger.error("reprocessing2.buffered_delete_old_primary_hash.empty_batch")
             return
 
@@ -420,7 +421,6 @@ def buffered_delete_old_primary_hash(
     with sentry_sdk.start_span(
         op="sentry.reprocessing2.buffered_delete_old_primary_hash.flush_events"
     ):
-
         _send_delete_old_primary_hash_messages(
             client, project_id, group_id, old_primary_hashes, force_flush_batch
         )
@@ -446,7 +446,8 @@ def _copy_attachment_into_cache(attachment_id, attachment, file, cache_key, cach
         )
         chunk_index += 1
 
-    assert size == file.size
+    expected_size = attachment.size or file.size
+    assert size == expected_size
 
     return CachedAttachment(
         key=cache_key,
@@ -455,7 +456,7 @@ def _copy_attachment_into_cache(attachment_id, attachment, file, cache_key, cach
         # XXX: Not part of eventattachment model, but not strictly
         # necessary for processing
         content_type=None,
-        type=file.type,
+        type=attachment.type,
         chunks=chunk_index,
         size=size,
     )

+ 38 - 9
tests/sentry/api/endpoints/test_event_attachments.py

@@ -1,3 +1,5 @@
+from io import BytesIO
+
 from sentry.models.eventattachment import EventAttachment
 from sentry.models.files.file import File
 from sentry.testutils.cases import APITestCase
@@ -21,19 +23,25 @@ class EventAttachmentsTest(APITestCase):
             data={"fingerprint": ["group1"], "timestamp": min_ago}, project_id=self.project.id
         )
 
+        file1 = File.objects.create(name="hello.png", type="event.attachment")
+        file1.putfile(BytesIO(b"File contents here"))
         attachment1 = EventAttachment.objects.create(
-            event_id=event1.event_id,
             project_id=event1.project_id,
-            file_id=File.objects.create(name="hello.png", type="image/png").id,
-            name="hello.png",
+            event_id=event1.event_id,
+            type="event.attachment",
+            name=file1.name,
+            file_id=file1.id,
         )
-        file = File.objects.create(name="hello.png", type="image/png")
-        EventAttachment.objects.create(
-            event_id=event2.event_id,
+
+        attachment2 = EventAttachment.objects.create(
             project_id=event2.project_id,
-            file_id=file.id,
-            type=file.type,
+            event_id=event2.event_id,
+            type="event.attachment",
             name="hello.png",
+            content_type="image/png",
+            size=1234,
+            sha1="1234",
+            # NOTE: we are not actually attaching the `file_id` here
         )
 
         path = f"/api/0/projects/{event1.project.organization.slug}/{event1.project.slug}/events/{event1.event_id}/attachments/"
@@ -44,8 +52,29 @@ class EventAttachmentsTest(APITestCase):
         assert response.status_code == 200, response.content
         assert len(response.data) == 1
         assert response.data[0]["id"] == str(attachment1.id)
-        assert response.data[0]["mimetype"] == "image/png"
         assert response.data[0]["event_id"] == attachment1.event_id
+        assert response.data[0]["type"] == "event.attachment"
+        assert response.data[0]["name"] == "hello.png"
+        assert response.data[0]["mimetype"] == "image/png"
+        assert response.data[0]["size"] == 18
+        assert response.data[0]["sha1"] == "d3f299af02d6abbe92dd8368bab781824a9702ed"
+        assert response.data[0]["headers"] == {"Content-Type": "image/png"}
+
+        path = f"/api/0/projects/{event2.project.organization.slug}/{event2.project.slug}/events/{event2.event_id}/attachments/"
+
+        with self.feature("organizations:event-attachments"):
+            response = self.client.get(path)
+
+        assert response.status_code == 200, response.content
+        assert len(response.data) == 1
+        assert response.data[0]["id"] == str(attachment2.id)
+        assert response.data[0]["event_id"] == attachment2.event_id
+        assert response.data[0]["type"] == "event.attachment"
+        assert response.data[0]["name"] == "hello.png"
+        assert response.data[0]["mimetype"] == "image/png"
+        assert response.data[0]["size"] == 1234
+        assert response.data[0]["sha1"] == "1234"
+        assert response.data[0]["headers"] == {"Content-Type": "image/png"}
 
     def test_is_screenshot(self):
         self.login_as(user=self.user)