Browse Source

Save `EventAttachment` blobs directly (#62318)

This introduces a new option to save the `EventAttachment` blobs
directly in the configured filestore storage backend, vs going through
the `File/Blob` abstraction.
Arpad Borsos 1 year ago
parent
commit
a3cc6d8355

+ 9 - 15
src/sentry/api/endpoints/event_attachment_details.py

@@ -1,6 +1,6 @@
 import posixpath
 
-from django.http import StreamingHttpResponse
+from django.http import FileResponse
 from rest_framework.request import Request
 from rest_framework.response import Response
 
@@ -14,7 +14,6 @@ from sentry.auth.superuser import is_active_superuser
 from sentry.auth.system import is_system_auth
 from sentry.constants import ATTACHMENTS_ROLE_DEFAULT
 from sentry.models.eventattachment import EventAttachment
-from sentry.models.files.file import File
 from sentry.models.organizationmember import OrganizationMember
 
 
@@ -57,22 +56,17 @@ class EventAttachmentDetailsEndpoint(ProjectEndpoint):
     permission_classes = (EventAttachmentDetailsPermission,)
 
     def download(self, attachment):
-        file = File.objects.get(id=attachment.file_id)
-
-        content_type = attachment.content_type or file.headers.get(
-            "content-type", "application/octet-stream"
-        )
-        size = attachment.size or file.size
+        fp = attachment.getfile()
+        name = posixpath.basename(" ".join(attachment.name.split()))
 
-        fp = file.getfile()
-        response = StreamingHttpResponse(
-            iter(lambda: fp.read(4096), b""),
-            content_type=content_type,
+        response = FileResponse(
+            fp,
+            as_attachment=True,
+            filename=name,
+            content_type=attachment.content_type,
         )
+        response["Content-Length"] = attachment.size
 
-        response["Content-Length"] = size
-        name = posixpath.basename(" ".join(attachment.name.split()))
-        response["Content-Disposition"] = f'attachment; filename="{name}"'
         return response
 
     def get(self, request: Request, project, event_id, attachment_id) -> Response:

+ 7 - 26
src/sentry/event_manager.py

@@ -3,14 +3,12 @@ from __future__ import annotations
 import copy
 import ipaddress
 import logging
-import mimetypes
 import random
 import re
 import time
 import uuid
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
-from io import BytesIO
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -85,7 +83,6 @@ from sentry.models.activity import Activity
 from sentry.models.environment import Environment
 from sentry.models.event import EventDict
 from sentry.models.eventattachment import CRASH_REPORT_TYPES, EventAttachment, get_crashreport_key
-from sentry.models.files.file import File
 from sentry.models.group import Group, GroupStatus
 from sentry.models.groupenvironment import GroupEnvironment
 from sentry.models.grouphash import GroupHash
@@ -2408,7 +2405,7 @@ def save_attachment(
         timestamp = datetime.utcnow().replace(tzinfo=timezone.utc)
 
     try:
-        data = attachment.data
+        attachment.data
     except MissingAttachmentChunks:
         track_outcome(
             org_id=project.organization_id,
@@ -2424,18 +2421,7 @@ def save_attachment(
         logger.exception("Missing chunks for cache_key=%s", cache_key)
         return
 
-    content_type = normalize_content_type(attachment.content_type, attachment.name)
-
-    file = File.objects.create(
-        name=attachment.name,
-        type=attachment.type,
-        headers={"Content-Type": content_type},
-    )
-    file.putfile(BytesIO(data), blob_size=settings.SENTRY_ATTACHMENT_BLOB_SIZE)
-
-    size = file.size
-    sha1 = file.checksum
-    file_id = file.id
+    file = EventAttachment.putfile(project.id, attachment)
 
     EventAttachment.objects.create(
         # lookup:
@@ -2445,11 +2431,12 @@ def save_attachment(
         # metadata:
         type=attachment.type,
         name=attachment.name,
-        content_type=content_type,
-        size=size,
-        sha1=sha1,
+        content_type=file.content_type,
+        size=file.size,
+        sha1=file.sha1,
         # storage:
-        file_id=file_id,
+        file_id=file.file_id,
+        blob_path=file.blob_path,
     )
 
     track_outcome(
@@ -2465,12 +2452,6 @@ def save_attachment(
     )
 
 
-def normalize_content_type(content_type: str | None, name: str) -> str:
-    if content_type:
-        return content_type.split(";")[0].strip()
-    return mimetypes.guess_type(name)[0] or "application/octet-stream"
-
-
 def save_attachments(cache_key: Optional[str], attachments: list[Attachment], job: Job) -> None:
     """
     Persists cached event attachments into the file store.

+ 98 - 2
src/sentry/models/eventattachment.py

@@ -1,11 +1,22 @@
+import mimetypes
+import random
+from dataclasses import dataclass
+from io import BytesIO
+from typing import IO, Optional
+
+import zstandard
+from django.conf import settings
 from django.core.cache import cache
 from django.core.exceptions import ObjectDoesNotExist
 from django.db import models
 from django.utils import timezone
 
+from sentry import options
+from sentry.attachments.base import CachedAttachment
 from sentry.backup.scopes import RelocationScope
 from sentry.db.models import BoundedBigIntegerField, Model, region_silo_only_model, sane_repr
 from sentry.db.models.fields.bounded import BoundedIntegerField
+from sentry.models.files.utils import get_size_and_checksum, get_storage
 
 # Attachment file types that are considered a crash report (PII relevant)
 CRASH_REPORT_TYPES = ("event.minidump", "event.applecrashreport")
@@ -29,8 +40,28 @@ def event_attachment_screenshot_filter(queryset):
     )
 
 
+@dataclass(frozen=True)
+class PutfileResult:
+    content_type: str
+    size: int
+    sha1: str
+    file_id: Optional[int] = None
+    blob_path: Optional[str] = None
+
+
 @region_silo_only_model
 class EventAttachment(Model):
+    """Attachment Metadata and Storage
+
+    The actual attachment data can be saved in different backing stores:
+    - Using the :class:`File` model using the `file_id` field.
+      This stores attachments chunked and deduplicated.
+    - When the `blob_path` field has a `eventattachments/v1/` prefix:
+      In this case, the default :func:`get_storage` is used as the backing store.
+      The attachment data is not chunked or deduplicated in this case.
+      However, it is `zstd` compressed.
+    """
+
     __relocation_scope__ = RelocationScope.Excluded
 
     # the things we want to look up attachments by:
@@ -60,8 +91,6 @@ class EventAttachment(Model):
     __repr__ = sane_repr("event_id", "name")
 
     def delete(self, *args, **kwargs):
-        from sentry.models.files.file import File
-
         rv = super().delete(*args, **kwargs)
 
         if self.group_id and self.type in CRASH_REPORT_TYPES:
@@ -70,7 +99,18 @@ class EventAttachment(Model):
             # repopulated with the next incoming crash report.
             cache.delete(get_crashreport_key(self.group_id))
 
+        if self.blob_path:
+            if self.blob_path.startswith("eventattachments/v1/"):
+                storage = get_storage()
+            else:
+                raise NotImplementedError()
+
+            storage.delete(self.blob_path)
+            return rv
+
         try:
+            from sentry.models.files.file import File
+
             file = File.objects.get(id=self.file_id)
         except ObjectDoesNotExist:
             # It's possible that the File itself was deleted
@@ -82,3 +122,59 @@ class EventAttachment(Model):
             file.delete()
 
         return rv
+
+    def getfile(self) -> IO:
+        if self.blob_path:
+            if self.blob_path.startswith("eventattachments/v1/"):
+                storage = get_storage()
+                compressed_blob = storage.open(self.blob_path)
+                dctx = zstandard.ZstdDecompressor()
+                return dctx.stream_reader(compressed_blob, read_across_frames=True)
+            else:
+                raise NotImplementedError()
+
+        from sentry.models.files.file import File
+
+        file = File.objects.get(id=self.file_id)
+        return file.getfile()
+
+    @classmethod
+    def putfile(cls, project_id: int, attachment: CachedAttachment) -> PutfileResult:
+        from sentry.models.files import File, FileBlob
+
+        blob = BytesIO(attachment.data)
+        content_type = normalize_content_type(attachment.content_type, attachment.name)
+
+        store_blobs = project_id in options.get("eventattachments.store-blobs.projects") or (
+            random.random() < options.get("eventattachments.store-blobs.sample-rate")
+        )
+
+        if store_blobs:
+            size, checksum = get_size_and_checksum(blob)
+            blob_path = "eventattachments/v1/" + FileBlob.generate_unique_path()
+
+            storage = get_storage()
+            cctx = zstandard.ZstdCompressor()
+            compressed_blob = cctx.stream_reader(blob)
+            storage.save(blob_path, compressed_blob)
+
+            return PutfileResult(
+                content_type=content_type, size=size, sha1=checksum, blob_path=blob_path
+            )
+
+        file = File.objects.create(
+            name=attachment.name,
+            type=attachment.type,
+            headers={"Content-Type": content_type},
+        )
+        file.putfile(blob, blob_size=settings.SENTRY_ATTACHMENT_BLOB_SIZE)
+
+        return PutfileResult(
+            content_type=content_type, size=file.size, sha1=file.checksum, file_id=file.id
+        )
+
+
+def normalize_content_type(content_type: str | None, name: str) -> str:
+    if content_type:
+        return content_type.split(";")[0].strip()
+    return mimetypes.guess_type(name)[0] or "application/octet-stream"

+ 1 - 0
src/sentry/models/files/utils.py

@@ -47,6 +47,7 @@ def get_size_and_checksum(fileobj, logger=nooplogger):
             break
         size += len(chunk)
         checksum.update(chunk)
+    fileobj.seek(0)
 
     logger.debug("get_size_and_checksum.end")
     return size, checksum.hexdigest()

+ 5 - 0
src/sentry/options/defaults.py

@@ -842,6 +842,11 @@ register(
 # Whether to use `zstd` instead of `zlib` for the attachment cache.
 register("attachment-cache.use-zstd", default=False, flags=FLAG_AUTOMATOR_MODIFIABLE)
 
+# Set of projects that will always store `EventAttachment` blobs directly.
+register("eventattachments.store-blobs.projects", default={}, flags=FLAG_AUTOMATOR_MODIFIABLE)
+# Percentage sample rate for `EventAttachment`s that should use direct blob storage.
+register("eventattachments.store-blobs.sample-rate", default=0.0, flags=FLAG_AUTOMATOR_MODIFIABLE)
+
 # All Relay options (statically authenticated Relays can be registered here)
 register("relay.static_auth", default={}, flags=FLAG_NOSTORE)
 

+ 28 - 0
tests/relay_integration/test_integration.py

@@ -125,6 +125,34 @@ class SentryRemoteTest(RelayStoreHelper, TransactionTestCase):
         attachment = EventAttachment.objects.get(project_id=self.project.id, event_id=event_id)
         assert attachment.group_id == event.group_id
 
+    def test_blob_only_attachment(self):
+        event_id1 = uuid4().hex
+        event_id2 = uuid4().hex
+
+        files = {"some_file": ("hello.txt", BytesIO(b"Hello World! default"))}
+        self.post_and_retrieve_attachment(event_id1, files)
+
+        # Again, but using direct blob storage
+        files = {"some_file": ("hello.txt", BytesIO(b"Hello World! direct"))}
+        with self.options(
+            {
+                "eventattachments.store-blobs.sample-rate": 1,
+            }
+        ):
+            self.post_and_retrieve_attachment(event_id2, files)
+
+        # Finally, fetch the updated attachment and compare the group id
+        attachments = EventAttachment.objects.filter(project_id=self.project.id)
+        assert len(attachments) == 2
+
+        with attachments[0].getfile() as blob:
+            assert blob.read() == b"Hello World! default"
+        assert attachments[0].file_id is not None
+
+        with attachments[1].getfile() as blob:
+            assert blob.read() == b"Hello World! direct"
+        assert attachments[1].blob_path is not None
+
     def test_transaction(self):
         event_data = {
             "event_id": "d2132d31b39445f1938d7e21b6bf0ec4",

+ 39 - 12
tests/sentry/api/endpoints/test_event_attachment_details.py

@@ -1,7 +1,5 @@
-from io import BytesIO
-
+from sentry.attachments.base import CachedAttachment
 from sentry.models.eventattachment import EventAttachment
-from sentry.models.files.file import File
 from sentry.testutils.cases import APITestCase, PermissionTestCase
 from sentry.testutils.helpers.datetime import before_now, iso_format
 from sentry.testutils.helpers.response import close_streaming_response
@@ -25,15 +23,25 @@ class CreateAttachmentMixin:
             project_id=self.project.id,
         )
 
-        self.file = File.objects.create(name="hello.png", type="image/png; foo=bar")
-        self.file.putfile(BytesIO(b"File contents here"))
+        attachment = CachedAttachment(
+            name="hello.png", content_type="image/png; foo=bar", data=b"File contents here"
+        )
+        file = EventAttachment.putfile(
+            self.project.id,
+            attachment,
+        )
 
         self.attachment = EventAttachment.objects.create(
             event_id=self.event.event_id,
             project_id=self.event.project_id,
-            file_id=self.file.id,
-            type=self.file.type,
-            name="hello.png",
+            type=attachment.type,
+            name=attachment.name,
+            content_type=file.content_type,
+            size=file.size,
+            sha1=file.sha1,
+            # storage:
+            file_id=file.file_id,
+            blob_path=file.blob_path,
         )
 
         return self.attachment
@@ -59,15 +67,34 @@ class EventAttachmentDetailsTest(APITestCase, CreateAttachmentMixin):
         self.login_as(user=self.user)
 
         self.create_attachment()
-        path = f"/api/0/projects/{self.organization.slug}/{self.project.slug}/events/{self.event.event_id}/attachments/{self.attachment.id}/?download"
+        path1 = f"/api/0/projects/{self.organization.slug}/{self.project.slug}/events/{self.event.event_id}/attachments/{self.attachment.id}/?download"
 
         with self.feature("organizations:event-attachments"):
-            response = self.client.get(path)
+            response = self.client.get(path1)
+
+        assert response.status_code == 200, response.content
+        assert response.get("Content-Disposition") == 'attachment; filename="hello.png"'
+        assert response.get("Content-Length") == str(self.attachment.size)
+        assert response.get("Content-Type") == "image/png"
+        assert b"File contents here" == close_streaming_response(response)
+
+        with self.options(
+            {
+                "eventattachments.store-blobs.sample-rate": 1,
+            }
+        ):
+            self.create_attachment()
+
+        path2 = f"/api/0/projects/{self.organization.slug}/{self.project.slug}/events/{self.event.event_id}/attachments/{self.attachment.id}/?download"
+        assert path1 is not path2
+
+        with self.feature("organizations:event-attachments"):
+            response = self.client.get(path2)
 
         assert response.status_code == 200, response.content
         assert response.get("Content-Disposition") == 'attachment; filename="hello.png"'
-        assert response.get("Content-Length") == str(self.file.size)
-        assert response.get("Content-Type") == "application/octet-stream"
+        assert response.get("Content-Length") == str(self.attachment.size)
+        assert response.get("Content-Type") == "image/png"
         assert b"File contents here" == close_streaming_response(response)
 
     def test_delete(self):