models.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. import mmap
  2. import tempfile
  3. from concurrent.futures import ThreadPoolExecutor
  4. from hashlib import sha1
  5. from asgiref.sync import sync_to_async
  6. from django.core.files.base import File as FileObj
  7. from django.db import models, transaction
  8. from glitchtip.base_models import CreatedModel
  9. from .exceptions import AssembleChecksumMismatch
  def _get_size_and_checksum(fileobj):
      """
      Measure and hash the remaining contents of a readable file-like object.

      ``fileobj`` is consumed from its current position until ``read`` returns
      an empty result, 64 KiB at a time. The read position is left at the end,
      so callers that want to reuse the object must seek back themselves.

      Returns a ``(size, checksum)`` tuple: the total length of everything
      read and the SHA-1 hex digest of that data.
      """
      digest = sha1()
      total = 0
      while piece := fileobj.read(65536):
          total += len(piece)
          digest.update(piece)
      return total, digest.hexdigest()
  class FileBlob(CreatedModel):
      """
      Port of sentry.models.file.FileBlob with simplifications

      OSS Sentry stores files in file blob chunks, where one file gets saved
      as many blobs. GlitchTip uses a Django FileField and does not split
      files into chunks. The FileBlobs still provide file deduplication
      through the unique ``checksum`` column.
      """

      # Stored file content.
      blob = models.FileField(upload_to="uploads/file_blobs")
      # Content length; nullable, so readers must tolerate None.
      size = models.PositiveIntegerField(null=True)
      # SHA-1 hex digest (40 chars); unique, which is what deduplicates blobs.
      checksum = models.CharField(max_length=40, unique=True)

      @classmethod
      async def from_files(cls, files, organization=None, logger=None):
          """
          Create and save one FileBlob per entry of ``files``.

          Each entry is either a file object or a tuple whose first item is
          the file object and whose second item is its checksum. The file
          object must expose ``.name`` and ``.size``.

          ``organization`` is accepted for signature compatibility but is not
          used. ``logger`` is optional; when given, a debug message is logged
          before processing starts. Returns None.

          NOTE(review): entries passed without a checksum are saved with
          ``checksum=None`` although the column is declared non-null, and no
          lookup for an existing blob is done here - presumably callers
          always pass ``(file, checksum)`` tuples for new content; confirm.
          """
          # ``logger`` defaults to None, so it must not be called blindly.
          if logger is not None:
              logger.debug("FileBlob.from_files.start")

          files_with_checksums = [
              entry if isinstance(entry, tuple) else (entry, None)
              for entry in files
          ]

          for file_with_checksum in files_with_checksums:
              blob_file = file_with_checksum[0]
              blob = cls()
              blob.size = blob_file.size
              blob.checksum = file_with_checksum[1]
              # save=False: only write the file to storage here; the row is
              # persisted exactly once by asave() below instead of twice.
              await sync_to_async(blob.blob.save)(
                  blob_file.name, blob_file, save=False
              )
              await blob.asave()

      @classmethod
      def from_file(cls, fileobj):
          """
          Return the FileBlob matching the content of ``fileobj``.

          The SHA-1 of the file content is computed; an existing blob with
          that checksum is reused, otherwise a new one is created from
          ``fileobj`` with ``fileobj.size`` as its size.
          """
          checksum = sha1()
          with fileobj.open("rb") as f:
              for chunk in f.chunks():
                  checksum.update(chunk)
              # Significant deviation from OSS Sentry
              # Done while the file is still open so that a newly created
              # blob can read the content when it is written to storage.
              file_blob, _ = cls.objects.get_or_create(
                  checksum=checksum.hexdigest(),
                  defaults={"blob": fileobj, "size": fileobj.size},
              )
          return file_blob
  class File(CreatedModel):
      """
      Port of sentry.models.file.File

      A named file whose content lives in a FileBlob (``blob``) and/or in a
      sequence of FileBlobIndex rows built by
      ``assemble_from_file_blob_ids``.
      """

      name = models.TextField()
      type = models.CharField(max_length=64)
      headers = models.JSONField(blank=True, null=True)
      blob = models.ForeignKey(FileBlob, on_delete=models.CASCADE, null=True)
      size = models.PositiveIntegerField(default=0)
      # SHA-1 hex digest of the content (40 chars).
      checksum = models.CharField(max_length=40, null=True, db_index=True)

      def put_django_file(self, fileobj):
          """Save a Django File object as a File Blob and link it to this file."""
          self.size = fileobj.size
          file_blob = FileBlob.from_file(fileobj)
          self.checksum = file_blob.checksum
          # Link the blob, as putfile() does; otherwise the stored content
          # would not be reachable from this File.
          self.blob = file_blob
          self.save()

      def putfile(self, fileobj):
          """
          Save a file-like object as a File Blob and link it to this file.

          ``fileobj`` must support ``read`` and ``seek``. It is read once to
          compute size and SHA-1, then rewound so it can be stored.
          """
          size, checksum = _get_size_and_checksum(fileobj)
          fileobj.seek(0)
          # Look up by the unique checksum only. Keeping ``size`` out of the
          # lookup avoids a failed get followed by a unique-constraint error
          # when a blob with this checksum exists with a null/different size.
          file_blob, _ = FileBlob.objects.get_or_create(
              checksum=checksum,
              defaults={"blob": FileObj(fileobj, name=checksum), "size": size},
          )
          self.size = size
          self.checksum = checksum
          self.blob = file_blob
          self.save()

      def _get_chunked_blob(
          self, mode=None, prefetch=False, prefetch_to=None, delete=True
      ):
          """
          Build a ChunkedFileBlobIndexWrapper over this file's FileBlobIndex
          rows, ordered by offset. All arguments are forwarded unchanged.
          """
          return ChunkedFileBlobIndexWrapper(
              FileBlobIndex.objects.filter(file=self)
              .select_related("blob")
              .order_by("offset"),
              mode=mode,
              prefetch=prefetch,
              prefetch_to=prefetch_to,
              delete=delete,
          )

      def getfile(self):
          """Return a Django File named ``self.name`` wrapping the chunked blob reader."""
          impl = self._get_chunked_blob()
          return FileObj(impl, self.name)

      def assemble_from_file_blob_ids(self, file_blob_ids, checksum, commit=True):
          """
          This creates a file, from file blobs and returns a temp file with the
          contents.

          ``file_blob_ids`` lists FileBlob ids in content order (duplicates
          allowed). One FileBlobIndex row is created per entry, and
          ``self.size`` / ``self.checksum`` are set from the assembled data.
          When ``commit`` is true the File itself is saved afterwards.

          Returns the temporary file, flushed and rewound to position 0.

          Raises AssembleChecksumMismatch if the SHA-1 of the assembled
          content differs from ``checksum`` (the index rows created inside
          the transaction are rolled back), and KeyError if an id has no
          matching FileBlob. The temporary file is closed on any failure.
          """
          tf = tempfile.NamedTemporaryFile()
          try:
              with transaction.atomic():
                  # Ensure blobs are in the order and duplication as provided
                  blobs_by_id = {
                      blob.id: blob
                      for blob in FileBlob.objects.filter(id__in=file_blob_ids)
                  }
                  file_blobs = [blobs_by_id[blob_id] for blob_id in file_blob_ids]

                  new_checksum = sha1(b"")
                  offset = 0
                  for blob in file_blobs:
                      FileBlobIndex.objects.create(
                          file=self, blob=blob, offset=offset
                      )
                      for chunk in blob.blob.chunks():
                          new_checksum.update(chunk)
                          tf.write(chunk)
                      offset += blob.size

                  self.size = offset
                  self.checksum = new_checksum.hexdigest()

                  if checksum != self.checksum:
                      raise AssembleChecksumMismatch("Checksum mismatch")

              if commit:
                  self.save()
          except BaseException:
              # Do not leak the temporary file when assembly fails.
              tf.close()
              raise
          tf.flush()
          tf.seek(0)
          return tf
  class FileBlobIndex(models.Model):
      """
      Places a FileBlob at a given offset within a File.

      Carried over from OSS Sentry. Candidate for removal, since GlitchTip
      does not split file blobs into chunks.
      """

      file = models.ForeignKey(to=File, on_delete=models.CASCADE)
      blob = models.ForeignKey(to=FileBlob, on_delete=models.CASCADE)
      offset = models.PositiveIntegerField()

      class Meta:
          # A given blob may appear only once at a given offset of a file.
          unique_together = [("file", "blob", "offset")]
  class ChunkedFileBlobIndexWrapper(object):
      """
      Read-only file-like object over an ordered list of blob index rows.

      Each index must expose ``offset`` and ``blob``; each blob must expose
      ``size`` and a ``getfile()`` callable returning a readable, seekable,
      closable file object that also works as a context manager.

      In the default mode the blobs are opened lazily one after another. In
      prefetch mode all blobs are copied into one temporary file up front and
      every operation is delegated to that file.

      NOTE(review): the FileBlob model visible in this module defines no
      ``getfile`` method - confirm it is provided elsewhere (e.g. by a base
      class) before relying on this wrapper with FileBlob rows.
      """

      def __init__(
          self, indexes, mode=None, prefetch=False, prefetch_to=None, delete=True
      ):
          """
          ``indexes`` is any iterable of index rows, already ordered by offset.
          ``prefetch_to`` (directory) and ``delete`` are passed to the
          temporary file created in prefetch mode. ``mode`` is only stored.
          """
          # eager load from database in case it's a queryset
          self._indexes = list(indexes)
          self._curfile = None
          self._curidx = None
          self.prefetched = bool(prefetch)
          if self.prefetched:
              self._prefetch(prefetch_to, delete)
          self.mode = mode
          # Runs in both modes so that ``closed`` is always initialised.
          self.open()

      def __enter__(self):
          return self

      def __exit__(self, exc_type, exc_value, tb):
          self.close()

      def detach_tempfile(self):
          """
          Close the wrapper and hand the prefetched temporary file, rewound to
          position 0, over to the caller.

          Raises TypeError when the wrapper is not in prefetch mode.
          """
          if not self.prefetched:
              raise TypeError("Can only detach tempfiles in prefetch mode")
          rv = self._curfile
          # Unset first so close() does not close the file being returned.
          self._curfile = None
          self.close()
          rv.seek(0)
          return rv

      def _nextidx(self):
          """Advance to the next index and open its blob; close the previous one."""
          assert not self.prefetched, "this makes no sense"
          old_file = self._curfile
          try:
              try:
                  self._curidx = next(self._idxiter)
                  self._curfile = self._curidx.blob.getfile()
              except StopIteration:
                  # Past the last blob: end of file.
                  self._curidx = None
                  self._curfile = None
          finally:
              if old_file is not None:
                  old_file.close()

      @property
      def size(self):
          """Total size: the sum of ``blob.size`` over all indexes."""
          return sum(i.blob.size for i in self._indexes)

      def open(self):
          """Mark the wrapper as open and position it at the start."""
          self.closed = False
          self.seek(0)

      def _prefetch(self, prefetch_to=None, delete=True):
          """
          Copy every blob into one temporary file, each at its index offset,
          and make that file the current file.

          Blobs are fetched by up to four worker threads writing into a memory
          map of the file. An exception raised while fetching is re-raised
          here, and the temporary file is closed in that case.
          """
          size = self.size
          f = tempfile.NamedTemporaryFile(
              prefix="._prefetch-", dir=prefetch_to, delete=delete
          )
          if size == 0:
              # mmap cannot map an empty file; an empty temp file is enough.
              self._curfile = f
              return

          try:
              # Extend the file to its final length so it can be mapped.
              # The file is opened in binary mode, so a bytes literal is needed.
              f.seek(size - 1)
              f.write(b"\x00")
              f.flush()

              mem = mmap.mmap(f.fileno(), size)
              try:

                  def fetch_file(offset, getfile):
                      with getfile() as sf:
                          while True:
                              chunk = sf.read(65535)
                              if not chunk:
                                  break
                              mem[offset : offset + len(chunk)] = chunk
                              offset += len(chunk)

                  with ThreadPoolExecutor(max_workers=4) as exe:
                      futures = [
                          exe.submit(fetch_file, idx.offset, idx.blob.getfile)
                          for idx in self._indexes
                      ]
                  # Surface worker failures instead of silently returning a
                  # partially filled file.
                  for future in futures:
                      future.result()
                  mem.flush()
              finally:
                  mem.close()
          except BaseException:
              f.close()
              raise

          self._curfile = f

      def close(self):
          """Close the current underlying file, if any, and mark the wrapper closed."""
          if self._curfile is not None:
              self._curfile.close()
          self._curfile = None
          self._curidx = None
          self.closed = True

      def seek(self, pos):
          """
          Move to absolute position ``pos``.

          Raises ValueError if the wrapper is closed or no index covers
          ``pos``, and IOError if ``pos`` is negative (non-prefetch mode).
          """
          if self.closed:
              raise ValueError("I/O operation on closed file")

          if self.prefetched:
              return self._curfile.seek(pos)

          if pos < 0:
              raise IOError("Invalid argument")
          if pos == 0 and not self._indexes:
              # Empty file, there's nothing to seek to
              return None

          # Walk backwards to find the last index starting at or before pos.
          for n, idx in enumerate(self._indexes[::-1]):
              if idx.offset <= pos:
                  if idx != self._curidx:
                      self._idxiter = iter(self._indexes[-(n + 1) :])
                      self._nextidx()
                  break
          else:
              raise ValueError("Cannot seek to pos")
          self._curfile.seek(pos - self._curidx.offset)

      def tell(self):
          """Return the current absolute position; raises ValueError if closed."""
          if self.closed:
              raise ValueError("I/O operation on closed file")
          if self.prefetched:
              return self._curfile.tell()
          if self._curfile is None:
              # No current blob means everything has been consumed.
              return self.size
          return self._curidx.offset + self._curfile.tell()

      def read(self, n=-1):
          """
          Read up to ``n`` bytes, or everything up to the end when ``n`` is
          negative or None. Raises ValueError if the wrapper is closed.
          """
          if self.closed:
              raise ValueError("I/O operation on closed file")

          if self.prefetched:
              return self._curfile.read(n)

          result = bytearray()

          # Read to the end of the file
          if n is None or n < 0:
              while self._curfile is not None:
                  blob_result = self._curfile.read(32768)
                  if not blob_result:
                      self._nextidx()
                  else:
                      result.extend(blob_result)

          # Read until a certain number of bytes are read
          else:
              while n > 0 and self._curfile is not None:
                  blob_result = self._curfile.read(min(n, 32768))
                  if not blob_result:
                      self._nextidx()
                  else:
                      n -= len(blob_result)
                      result.extend(blob_result)

          return bytes(result)