models.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. import mmap
  2. import tempfile
  3. from concurrent.futures import ThreadPoolExecutor
  4. from hashlib import sha1
  5. from django.core.files.base import File as FileObj
  6. from django.db import models, transaction
  7. from glitchtip.base_models import CreatedModel
  8. from .exceptions import AssembleChecksumMismatch
  9. def _get_size_and_checksum(fileobj):
  10. size = 0
  11. checksum = sha1()
  12. while True:
  13. chunk = fileobj.read(65536)
  14. if not chunk:
  15. break
  16. size += len(chunk)
  17. checksum.update(chunk)
  18. return size, checksum.hexdigest()
  19. class FileBlob(CreatedModel):
  20. """
  21. Port of sentry.models.file.FileBlob with simplifications
  22. OSS Sentry stores files in file blob chunks. Where one file gets saved as many blobs.
  23. GlitchTip uses Django FileField and does split files into chunks.
  24. The FileBlob's still provide file deduplication.
  25. """
  26. blob = models.FileField(upload_to="uploads/file_blobs")
  27. size = models.PositiveIntegerField(null=True)
  28. checksum = models.CharField(max_length=40, unique=True)
  29. @classmethod
  30. def from_files(cls, files, organization=None, logger=None):
  31. logger.debug("FileBlob.from_files.start")
  32. files_with_checksums = []
  33. for fileobj in files:
  34. if isinstance(fileobj, tuple):
  35. files_with_checksums.append(fileobj)
  36. else:
  37. files_with_checksums.append((fileobj, None))
  38. for file_with_checksum in files_with_checksums:
  39. blob = cls()
  40. blob_file = file_with_checksum[0]
  41. blob.size = file_with_checksum[0].size
  42. blob.checksum = file_with_checksum[1]
  43. blob.blob.save(blob_file.name, blob_file)
  44. blob.save()
  45. @classmethod
  46. def from_file(cls, fileobj):
  47. """
  48. Retrieve a single FileBlob instances for the given file.
  49. """
  50. checksum = sha1()
  51. with fileobj.open("rb") as f:
  52. if f.multiple_chunks():
  53. for chunk in f.chunks():
  54. checksum.update(chunk)
  55. else:
  56. checksum.update(f.read())
  57. # Significant deviation from OSS Sentry
  58. file_blob, _ = cls.objects.get_or_create(
  59. checksum=checksum.hexdigest(),
  60. defaults={"blob": fileobj, "size": fileobj.size},
  61. )
  62. return file_blob
  63. class File(CreatedModel):
  64. """
  65. Port of sentry.models.file.File
  66. """
  67. name = models.TextField()
  68. type = models.CharField(max_length=64)
  69. headers = models.JSONField(blank=True, null=True)
  70. blob = models.ForeignKey(FileBlob, on_delete=models.CASCADE, null=True)
  71. size = models.PositiveIntegerField(null=True)
  72. checksum = models.CharField(max_length=40, null=True, db_index=True)
  73. def put_django_file(self, fileobj):
  74. """Save a Django File object as a File Blob"""
  75. self.size = fileobj.size
  76. file_blob = FileBlob.from_file(fileobj)
  77. self.checksum = file_blob.checksum
  78. self.save()
  79. def putfile(self, fileobj):
  80. """Save a file-like object as a File Blob"""
  81. size, checksum = _get_size_and_checksum(fileobj)
  82. fileobj.seek(0)
  83. file_blob, _ = FileBlob.objects.get_or_create(
  84. defaults={"blob": FileObj(fileobj, name=checksum)},
  85. size=size,
  86. checksum=checksum,
  87. )
  88. self.checksum = checksum
  89. self.blob = file_blob
  90. self.save()
  91. def _get_chunked_blob(
  92. self, mode=None, prefetch=False, prefetch_to=None, delete=True
  93. ):
  94. return ChunkedFileBlobIndexWrapper(
  95. FileBlobIndex.objects.filter(file=self)
  96. .select_related("blob")
  97. .order_by("offset"),
  98. mode=mode,
  99. prefetch=prefetch,
  100. prefetch_to=prefetch_to,
  101. delete=delete,
  102. )
  103. def getfile(self):
  104. impl = self._get_chunked_blob()
  105. return FileObj(impl, self.name)
  106. def assemble_from_file_blob_ids(self, file_blob_ids, checksum, commit=True):
  107. """
  108. This creates a file, from file blobs and returns a temp file with the
  109. contents.
  110. """
  111. tf = tempfile.NamedTemporaryFile()
  112. with transaction.atomic():
  113. file_blobs = FileBlob.objects.filter(id__in=file_blob_ids).all()
  114. # Ensure blobs are in the order and duplication as provided
  115. blobs_by_id = {blob.id: blob for blob in file_blobs}
  116. file_blobs = [blobs_by_id[blob_id] for blob_id in file_blob_ids]
  117. new_checksum = sha1(b"")
  118. offset = 0
  119. for blob in file_blobs:
  120. FileBlobIndex.objects.create(file=self, blob=blob, offset=offset)
  121. for chunk in blob.blob.chunks():
  122. new_checksum.update(chunk)
  123. tf.write(chunk)
  124. offset += blob.size
  125. self.size = offset
  126. self.checksum = new_checksum.hexdigest()
  127. if checksum != self.checksum:
  128. raise AssembleChecksumMismatch("Checksum mismatch")
  129. if commit:
  130. self.save()
  131. tf.flush()
  132. tf.seek(0)
  133. return tf
  134. class FileBlobIndex(models.Model):
  135. """
  136. Ported from OSS Sentry. Should be removed as GlitchTip does not
  137. split file blobs into chunks.
  138. """
  139. file = models.ForeignKey(File, on_delete=models.CASCADE)
  140. blob = models.ForeignKey(FileBlob, on_delete=models.CASCADE)
  141. offset = models.PositiveIntegerField()
  142. class Meta:
  143. unique_together = (("file", "blob", "offset"),)
  144. class ChunkedFileBlobIndexWrapper(object):
  145. def __init__(
  146. self, indexes, mode=None, prefetch=False, prefetch_to=None, delete=True
  147. ):
  148. # eager load from database incase its a queryset
  149. self._indexes = list(indexes)
  150. self._curfile = None
  151. self._curidx = None
  152. if prefetch:
  153. self.prefetched = True
  154. self._prefetch(prefetch_to, delete)
  155. else:
  156. self.prefetched = False
  157. self.mode = mode
  158. self.open()
  159. def __enter__(self):
  160. return self
  161. def __exit__(self, exc_type, exc_value, tb):
  162. self.close()
  163. def detach_tempfile(self):
  164. if not self.prefetched:
  165. raise TypeError("Can only detech tempfiles in prefetch mode")
  166. rv = self._curfile
  167. self._curfile = None
  168. self.close()
  169. rv.seek(0)
  170. return rv
  171. def _nextidx(self):
  172. assert not self.prefetched, "this makes no sense"
  173. old_file = self._curfile
  174. try:
  175. try:
  176. self._curidx = next(self._idxiter)
  177. self._curfile = self._curidx.blob.getfile()
  178. except StopIteration:
  179. self._curidx = None
  180. self._curfile = None
  181. finally:
  182. if old_file is not None:
  183. old_file.close()
  184. @property
  185. def size(self):
  186. return sum(i.blob.size for i in self._indexes)
  187. def open(self):
  188. self.closed = False
  189. self.seek(0)
  190. def _prefetch(self, prefetch_to=None, delete=True):
  191. size = self.size
  192. f = tempfile.NamedTemporaryFile(
  193. prefix="._prefetch-", dir=prefetch_to, delete=delete
  194. )
  195. if size == 0:
  196. self._curfile = f
  197. return
  198. # Zero out the file
  199. f.seek(size - 1)
  200. f.write("\x00")
  201. f.flush()
  202. mem = mmap.mmap(f.fileno(), size)
  203. def fetch_file(offset, getfile):
  204. with getfile() as sf:
  205. while True:
  206. chunk = sf.read(65535)
  207. if not chunk:
  208. break
  209. mem[offset : offset + len(chunk)] = chunk
  210. offset += len(chunk)
  211. with ThreadPoolExecutor(max_workers=4) as exe:
  212. for idx in self._indexes:
  213. exe.submit(fetch_file, idx.offset, idx.blob.getfile)
  214. mem.flush()
  215. self._curfile = f
  216. def close(self):
  217. if self._curfile:
  218. self._curfile.close()
  219. self._curfile = None
  220. self._curidx = None
  221. self.closed = True
  222. def seek(self, pos):
  223. if self.closed:
  224. raise ValueError("I/O operation on closed file")
  225. if self.prefetched:
  226. return self._curfile.seek(pos)
  227. if pos < 0:
  228. raise IOError("Invalid argument")
  229. for n, idx in enumerate(self._indexes[::-1]):
  230. if idx.offset <= pos:
  231. if idx != self._curidx:
  232. self._idxiter = iter(self._indexes[-(n + 1) :])
  233. self._nextidx()
  234. break
  235. else:
  236. raise ValueError("Cannot seek to pos")
  237. self._curfile.seek(pos - self._curidx.offset)
  238. def tell(self):
  239. if self.closed:
  240. raise ValueError("I/O operation on closed file")
  241. if self.prefetched:
  242. return self._curfile.tell()
  243. if self._curfile is None:
  244. return self.size
  245. return self._curidx.offset + self._curfile.tell()
  246. def read(self, n=-1):
  247. if self.closed:
  248. raise ValueError("I/O operation on closed file")
  249. if self.prefetched:
  250. return self._curfile.read(n)
  251. result = bytearray()
  252. # Read to the end of the file
  253. if n < 0:
  254. while self._curfile is not None:
  255. blob_result = self._curfile.read(32768)
  256. if not blob_result:
  257. self._nextidx()
  258. else:
  259. result.extend(blob_result)
  260. # Read until a certain number of bytes are read
  261. else:
  262. while n > 0 and self._curfile is not None:
  263. blob_result = self._curfile.read(min(n, 32768))
  264. if not blob_result:
  265. self._nextidx()
  266. else:
  267. n -= len(blob_result)
  268. result.extend(blob_result)
  269. return bytes(result)