Remove foreign key to FileBlob from ExportedDataBlob (#27209)

* Remove foreign key to FileBlob from ExportedDataBlob

* Remove file fk from ExportedData

* Consolidate into single migration

* Fix ExportedData file use in serializer and tests

* Fix more tests
Michal Kuffa, 3 years ago
parent commit bf994341df

+ 1 - 1
migrations_lockfile.txt

@@ -7,5 +7,5 @@ will then be regenerated, and you should be able to merge without conflicts.
 
 jira_ac: 0001_initial
 nodestore: 0002_nodestore_no_dictfield
-sentry: 0218_releasefile_remove_fks
+sentry: 0219_exporteddatablob_remove_blob_fk
 social_auth: 0001_initial

+ 4 - 3
src/sentry/api/serializers/models/exporteddata.py

@@ -23,12 +23,13 @@ class ExportedDataSerializer(Serializer):
         return attrs
 
     def serialize(self, obj, attrs, user, **kwargs):
-        if obj.file is None:
+        file = obj._get_file()
+        if file is None:
             checksum = None
             file_name = None
         else:
-            checksum = obj.file.checksum
-            file_name = obj.file.name
+            checksum = file.checksum
+            file_name = file.name
 
         return {
             "id": obj.id,

+ 2 - 2
src/sentry/data_export/endpoints/data_export_details.py

@@ -35,7 +35,7 @@ class DataExportDetailsEndpoint(OrganizationEndpoint):
                         detail="You don't have access to some of the data this export contains."
                     )
             # Ignore the download parameter unless we have a file to stream
-            if request.GET.get("download") is not None and data_export.file is not None:
+            if request.GET.get("download") is not None and data_export._get_file() is not None:
                 return self.download(data_export)
             return Response(serialize(data_export, request.user))
         except ExportedData.DoesNotExist:
@@ -43,7 +43,7 @@ class DataExportDetailsEndpoint(OrganizationEndpoint):
 
     def download(self, data_export):
         metrics.incr("dataexport.download", sample_rate=1.0)
-        file = data_export.file
+        file = data_export._get_file()
         raw_file = file.getfile()
         response = StreamingHttpResponse(
             iter(lambda: raw_file.read(4096), b""), content_type="text/csv"
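
Worth noting: a download request now hits the database twice for the same File row, once in the guard above and once at the top of download(). A hypothetical single-lookup variant (not part of this commit) would resolve the file once and pass it along:

# Hypothetical refactor sketch, not in the commit: resolve the File
# once and hand it to download(), which would then accept it as a
# parameter instead of calling data_export._get_file() again.
file = data_export._get_file()
if request.GET.get("download") is not None and file is not None:
    return self.download(data_export, file)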

+ 18 - 9
src/sentry/data_export/models.py

@@ -31,9 +31,7 @@ class ExportedData(Model):
 
     organization = FlexibleForeignKey("sentry.Organization")
     user = FlexibleForeignKey(settings.AUTH_USER_MODEL, null=True, on_delete=models.SET_NULL)
-    file = FlexibleForeignKey(
-        "sentry.File", null=True, db_constraint=False, on_delete=models.SET_NULL
-    )
+    file_id = BoundedBigIntegerField(null=True)
     date_added = models.DateTimeField(default=timezone.now)
     date_finished = models.DateTimeField(null=True)
     date_expired = models.DateTimeField(null=True, db_index=True)
@@ -68,8 +66,9 @@ class ExportedData(Model):
         return None if date is None else force_text(date.strftime("%-I:%M %p on %B %d, %Y (%Z)"))
 
     def delete_file(self):
-        if self.file:
-            self.file.delete()
+        file = self._get_file()
+        if file:
+            file.delete()
 
     def delete(self, *args, **kwargs):
         self.delete_file()
@@ -79,14 +78,14 @@ class ExportedData(Model):
         self.delete_file()  # If a file is present, remove it
         current_time = timezone.now()
         expire_time = current_time + expiration
-        self.update(file=file, date_finished=current_time, date_expired=expire_time)
+        self.update(file_id=file.id, date_finished=current_time, date_expired=expire_time)
         self.email_success()
 
     def email_success(self):
         from sentry.utils.email import MessageBuilder
 
         # The following condition should never be true, but it's a safeguard in case someone manually calls this method
-        if self.date_finished is None or self.date_expired is None or self.file is None:
+        if self.date_finished is None or self.date_expired is None or self._get_file() is None:
             logger.warning(
                 "Notification email attempted on incomplete dataset",
                 extra={"data_export_id": self.id, "organization_id": self.organization_id},
@@ -121,6 +120,16 @@ class ExportedData(Model):
         msg.send_async([self.user.email])
         self.delete()
 
+    def _get_file(self):
+        from sentry.models import File
+
+        if self.file_id:
+            try:
+                return File.objects.get(pk=self.file_id)
+            except File.DoesNotExist:
+                self.update(file_id=None)
+        return None
+
     class Meta:
         app_label = "sentry"
         db_table = "sentry_exporteddata"
@@ -132,10 +141,10 @@ class ExportedDataBlob(Model):
     __include_in_export__ = False
 
     data_export = FlexibleForeignKey("sentry.ExportedData")
-    blob = FlexibleForeignKey("sentry.FileBlob", db_constraint=False)
+    blob_id = BoundedBigIntegerField()
     offset = BoundedBigIntegerField()
 
     class Meta:
         app_label = "sentry"
         db_table = "sentry_exporteddatablob"
-        unique_together = (("data_export", "blob", "offset"),)
+        unique_together = (("data_export", "blob_id", "offset"),)
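
The heart of the commit is the _get_file helper above: file_id is now a plain integer with no relation at either the database or ORM level, the File row is resolved lazily, and a dangling id self-heals by nulling itself when the referenced row has been deleted. A self-contained sketch of that soft-foreign-key pattern, with a dict standing in for the File table (simplified; the real helper uses File.objects.get and catches File.DoesNotExist):

# Minimal, runnable sketch of the soft-foreign-key pattern.
FILES = {1: "export-2021.csv"}  # stands in for the sentry_file table


class ExportedDataSketch:
    def __init__(self, file_id=None):
        self.file_id = file_id  # raw id, no FK constraint anywhere

    def _get_file(self):
        if self.file_id:
            try:
                return FILES[self.file_id]
            except KeyError:
                # Self-heal: the referenced row is gone, so clear the
                # dangling id instead of failing on every future call.
                self.file_id = None
        return None


export = ExportedDataSketch(file_id=1)
assert export._get_file() == "export-2021.csv"

del FILES[1]                       # simulate deletion of the File row
assert export._get_file() is None  # dangling id is detected...
assert export.file_id is None      # ...and cleared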

+ 2 - 2
src/sentry/data_export/tasks.py

@@ -265,7 +265,7 @@ def store_export_chunk_as_blob(data_export, bytes_written, fileobj, blob_size=DE
         blob_fileobj = ContentFile(contents)
         blob = FileBlob.from_file(blob_fileobj, logger=logger)
         ExportedDataBlob.objects.get_or_create(
-            data_export=data_export, blob=blob, offset=bytes_written + bytes_offset
+            data_export=data_export, blob_id=blob.id, offset=bytes_written + bytes_offset
         )
 
         bytes_offset += blob.size
@@ -320,7 +320,7 @@ def merge_export_blobs(data_export_id, **kwargs):
                 for export_blob in ExportedDataBlob.objects.filter(
                     data_export=data_export
                 ).order_by("offset"):
-                    blob = export_blob.blob
+                    blob = FileBlob.objects.get(pk=export_blob.blob_id)
                     FileBlobIndex.objects.create(file=file, blob=blob, offset=size)
                     size += blob.size
                     blob_checksum = sha1(b"")
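
Dropping the FK means the merge loop above now issues one FileBlob.objects.get query per blob. If that ever becomes a hotspot, a hypothetical batched variant (not part of this commit) could fetch all blob rows in a single query with Django's in_bulk:

# Hypothetical batching sketch; variable names follow merge_export_blobs.
export_blobs = list(
    ExportedDataBlob.objects.filter(data_export=data_export).order_by("offset")
)
blobs_by_id = FileBlob.objects.in_bulk([eb.blob_id for eb in export_blobs])
for export_blob in export_blobs:
    blob = blobs_by_id[export_blob.blob_id]
    FileBlobIndex.objects.create(file=file, blob=blob, offset=size)
    size += blob.size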

+ 62 - 0
src/sentry/migrations/0219_exporteddatablob_remove_blob_fk.py

@@ -0,0 +1,62 @@
+# Generated by Django 1.11.29 on 2021-07-07 19:03
+
+from django.db import migrations
+
+import sentry.db.models.fields.bounded
+
+
+class Migration(migrations.Migration):
+    # This flag is used to mark that a migration shouldn't be automatically run in
+    # production. We set this to True for operations that we think are risky and want
+    # someone from ops to run manually and monitor.
+    # General advice is that if in doubt, mark your migration as `is_dangerous`.
+    # Some things you should always mark as dangerous:
+    # - Large data migrations. Typically we want these to be run manually by ops so that
+    #   they can be monitored. Since data migrations will now hold a transaction open
+    #   this is even more important.
+    # - Adding columns to highly active tables, even ones that are NULL.
+    is_dangerous = False
+
+    # This flag is used to decide whether to run this migration in a transaction or not.
+    # By default we prefer to run in a transaction, but for migrations where you want
+    # to `CREATE INDEX CONCURRENTLY` this needs to be set to False. Typically you'll
+    # want to create an index concurrently when adding one to an existing table.
+    # You'll also usually want to set this to `False` if you're writing a data
+    # migration, since we don't want the entire migration to run in one long-running
+    # transaction.
+    atomic = True
+
+    dependencies = [
+        ("sentry", "0218_releasefile_remove_fks"),
+    ]
+
+    operations = [
+        migrations.SeparateDatabaseAndState(
+            database_operations=[],
+            state_operations=[
+                migrations.RemoveField(
+                    model_name="exporteddata",
+                    name="file",
+                ),
+                migrations.AddField(
+                    model_name="exporteddata",
+                    name="file_id",
+                    field=sentry.db.models.fields.bounded.BoundedBigIntegerField(null=True),
+                ),
+                migrations.AddField(
+                    model_name="exporteddatablob",
+                    name="blob_id",
+                    field=sentry.db.models.fields.bounded.BoundedBigIntegerField(default=0),
+                    preserve_default=False,
+                ),
+                migrations.RemoveField(
+                    model_name="exporteddatablob",
+                    name="blob",
+                ),
+                migrations.AlterUniqueTogether(
+                    name="exporteddatablob",
+                    unique_together={("data_export", "blob_id", "offset")},
+                ),
+            ],
+        )
+    ]
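
The migration can leave database_operations empty because nothing changes on disk: a Django ForeignKey named file is already stored in a column named file_id, and the old fields were declared with db_constraint=False, so there is no constraint to drop either. Only Django's in-memory model state is rewritten. A standalone sketch (stock Django fields, hypothetical model names) that checks the column-name equivalence:

# Standalone check: a ForeignKey named "file" and a raw integer field
# named "file_id" are backed by the same database column. Model names
# here are illustrative, not from the Sentry codebase.
import django
from django.conf import settings

settings.configure(INSTALLED_APPS=[])
django.setup()

from django.db import models


class FileStub(models.Model):
    class Meta:
        app_label = "demo"


class WithForeignKey(models.Model):
    file = models.ForeignKey(
        FileStub, null=True, db_constraint=False, on_delete=models.SET_NULL
    )

    class Meta:
        app_label = "demo"


class WithRawId(models.Model):
    file_id = models.BigIntegerField(null=True)

    class Meta:
        app_label = "demo"


assert WithForeignKey._meta.get_field("file").column == "file_id"
assert WithRawId._meta.get_field("file_id").column == "file_id"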

+ 1 - 1
tests/sentry/data_export/endpoints/test_data_export_details.py

@@ -81,7 +81,7 @@ class DataExportDetailsTest(APITestCase):
             name="test.csv", type="export.csv", headers={"Content-Type": "text/csv"}
         )
         file.putfile(BytesIO(contents))
-        self.data_export.update(file=file)
+        self.data_export.update(file_id=file.id)
         with self.feature("organizations:discover-query"):
             response = self.get_valid_response(self.organization.slug, self.data_export.id)
         assert response.data["checksum"] == sha1(contents).hexdigest()

+ 8 - 7
tests/sentry/data_export/test_models.py

@@ -58,18 +58,19 @@ class ExportedDataTest(TestCase):
 
     def test_delete_file(self):
         # Empty call should have no effect
-        assert self.data_export.file is None
+        assert self.data_export.file_id is None
         self.data_export.delete_file()
-        assert self.data_export.file is None
+        assert self.data_export.file_id is None
         # Real call should delete the file
         assert File.objects.filter(id=self.file1.id).exists()
-        self.data_export.update(file=self.file1)
-        assert isinstance(self.data_export.file, File)
+        self.data_export.update(file_id=self.file1.id)
+        assert isinstance(self.data_export._get_file(), File)
         self.data_export.delete_file()
+        assert self.data_export._get_file() is None
         assert not File.objects.filter(id=self.file1.id).exists()
         # The ExportedData should be unaffected
         assert ExportedData.objects.filter(id=self.data_export.id).exists()
-        assert ExportedData.objects.get(id=self.data_export.id).file is None
+        assert ExportedData.objects.get(id=self.data_export.id)._get_file() is None
 
     def test_delete(self):
         self.data_export.finalize_upload(file=self.file1)
@@ -86,7 +87,7 @@ class ExportedDataTest(TestCase):
             tf.seek(0)
             self.file1.putfile(tf)
         self.data_export.finalize_upload(file=self.file1)
-        assert self.data_export.file.getfile().read() == self.TEST_STRING
+        assert self.data_export._get_file().getfile().read() == self.TEST_STRING
         assert self.data_export.date_finished is not None
         assert self.data_export.date_expired is not None
         assert self.data_export.date_expired == self.data_export.date_finished + DEFAULT_EXPIRATION
@@ -96,7 +97,7 @@ class ExportedDataTest(TestCase):
             tf.seek(0)
             self.file2.putfile(tf)
         self.data_export.finalize_upload(file=self.file2, expiration=timedelta(weeks=2))
-        assert self.data_export.file.getfile().read() == self.TEST_STRING + self.TEST_STRING
+        assert self.data_export._get_file().getfile().read() == self.TEST_STRING + self.TEST_STRING
         # Ensure the first file is deleted
         assert not File.objects.filter(id=self.file1.id).exists()
         assert self.data_export.date_expired == self.data_export.date_finished + timedelta(weeks=2)

+ 66 - 57
tests/sentry/data_export/test_tasks.py

@@ -76,13 +76,14 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
-        header, raw1, raw2 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2 = file.getfile().read().strip().split(b"\r\n")
         assert header == b"value,times_seen,last_seen,first_seen"
 
         raw1, raw2 = sorted([raw1, raw2])
@@ -107,13 +108,14 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
-        header, raw1, raw2 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2 = file.getfile().read().strip().split(b"\r\n")
         assert header == b"value,times_seen,last_seen,first_seen"
 
         raw1, raw2 = sorted([raw1, raw2])
@@ -182,13 +184,14 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
-        header = de.file.getfile().read().strip()
+        header = file.getfile().read().strip()
         assert header == b"value,times_seen,last_seen,first_seen"
 
     @patch("sentry.data_export.models.ExportedData.email_success")
@@ -204,13 +207,14 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
-        header, raw1, raw2, raw3 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2, raw3 = file.getfile().read().strip().split(b"\r\n")
         assert header == b"title"
 
         assert raw1.startswith(b"<unlabeled event>")
@@ -237,13 +241,14 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
-        header, raw1, raw2 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2 = file.getfile().read().strip().split(b"\r\n")
         assert header == b"title"
 
         assert raw1.startswith(b"<unlabeled event>")
@@ -269,13 +274,14 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
-        header, raw1, raw2, raw3 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2, raw3 = file.getfile().read().strip().split(b"\r\n")
         assert header == b"title"
 
         assert raw1.startswith(b"<unlabeled event>")
@@ -330,14 +336,15 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
         # capping MAX_FILE_SIZE forces the last batch to be dropped, leaving 2 rows
-        header, raw1, raw2 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2 = file.getfile().read().strip().split(b"\r\n")
         assert header == b"title"
 
         assert raw1.startswith(b"<unlabeled event>")
@@ -358,14 +365,15 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
         # Convert raw csv to list of line-strings
         # capping MAX_FILE_SIZE forces the last batch to be dropped, leaving 2 rows
-        header, raw1, raw2 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2 = file.getfile().read().strip().split(b"\r\n")
         assert header == b"title"
 
         assert raw1.startswith(b"<unlabeled event>")
@@ -443,12 +451,13 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
-        assert de.file.headers == {"Content-Type": "text/csv"}
-        assert de.file.size is not None
-        assert de.file.checksum is not None
-        header, row = de.file.getfile().read().strip().split(b"\r\n")
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
+        file = de._get_file()
+        assert file.headers == {"Content-Type": "text/csv"}
+        assert file.size is not None
+        assert file.checksum is not None
+        header, row = file.getfile().read().strip().split(b"\r\n")
 
     @patch("sentry.snuba.discover.raw_query")
     @patch("sentry.data_export.models.ExportedData.email_failure")
@@ -590,7 +599,7 @@ class AssembleDownloadTest(TestCase, SnubaTestCase):
             assemble_download(de.id, batch_size=1)
         de = ExportedData.objects.get(id=de.id)
         # Convert raw csv to list of line-strings
-        header, raw1, raw2, raw3 = de.file.getfile().read().strip().split(b"\r\n")
+        header, raw1, raw2, raw3 = de._get_file().getfile().read().strip().split(b"\r\n")
         assert header == b"environment"
 
         assert raw1.startswith(b"prod")
@@ -639,8 +648,8 @@ class AssembleDownloadLargeTest(TestCase, SnubaTestCase):
         de = ExportedData.objects.get(id=de.id)
         assert de.date_finished is not None
         assert de.date_expired is not None
-        assert de.file is not None
-        assert isinstance(de.file, File)
+        assert de.file_id is not None
+        assert isinstance(de._get_file(), File)
 
         assert emailer.called