assemble.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. """Partial port of sentry/tasks/assemble.py"""
  2. import hashlib
  3. import json
  4. import shutil
  5. import tempfile
  6. from enum import Enum
  7. from os import path
  8. from django.core.cache import cache
  9. from apps.organizations_ext.models import Organization
  10. from apps.releases.models import Release
  11. from apps.sourcecode.models import DebugSymbolBundle
  12. from sentry.utils.zip import safe_extract_zip
  13. from .exceptions import AssembleArtifactsError, AssembleChecksumMismatch
  14. from .models import File, FileBlob
# Hard cap on an assembled file's total size: FileBlob addresses content with a
# signed 32-bit offset, so nothing past 2**31 bytes (2 GB) can be represented.
MAX_FILE_SIZE = 2**31 # 2GB is the maximum offset supported by fileblob
  16. class ChunkFileState(Enum):
  17. OK = "ok" # File in database
  18. NOT_FOUND = "not_found" # File not found in database
  19. CREATED = "created" # File was created in the request and send to the worker for assembling
  20. ASSEMBLING = "assembling" # File still being processed by worker
  21. ERROR = "error" # Error happened during assembling
  22. class AssembleTask(Enum):
  23. DIF = "project.dsym" # Debug file upload
  24. ARTIFACTS = "organization.artifacts" # Release file upload
  25. def _get_cache_key(task, scope, checksum):
  26. """Computes the cache key for assemble status.
  27. ``task`` must be one of the ``AssembleTask`` values. The scope can be the
  28. identifier of any model, such as the organization or project that this task
  29. is performed under.
  30. ``checksum`` should be the SHA1 hash of the main file that is being
  31. assembled.
  32. """
  33. return (
  34. "assemble-status:%s"
  35. % hashlib.sha1(
  36. ("%s|%s|%s" % (scope, checksum.encode("ascii"), task)).encode()
  37. ).hexdigest()
  38. )
  39. def set_assemble_status(
  40. task: AssembleTask, scope, checksum, state: ChunkFileState, detail=None
  41. ):
  42. """
  43. Updates the status of an assembling task. It is cached for 10 minutes.
  44. """
  45. cache_key = _get_cache_key(task, scope, checksum)
  46. cache.set(cache_key, (state, detail), 600)
  47. def assemble_artifacts(
  48. organization: Organization, version: str | None, checksum: str, chunks: list[str]
  49. ):
  50. set_assemble_status(
  51. AssembleTask.ARTIFACTS, organization.pk, checksum, ChunkFileState.ASSEMBLING
  52. )
  53. # Assemble the chunks into a temporary file
  54. rv = assemble_file(
  55. AssembleTask.ARTIFACTS,
  56. organization,
  57. "release-artifacts.zip",
  58. checksum,
  59. chunks,
  60. file_type="release.bundle",
  61. )
  62. if rv is None:
  63. return
  64. bundle, temp_file = rv
  65. scratchpad = tempfile.mkdtemp()
  66. try:
  67. safe_extract_zip(temp_file, scratchpad, strip_toplevel=False)
  68. except BaseException as ex:
  69. raise AssembleArtifactsError("failed to extract bundle") from ex
  70. try:
  71. manifest_path = path.join(scratchpad, "manifest.json")
  72. with open(manifest_path, "rb") as manifest:
  73. manifest = json.loads(manifest.read())
  74. except BaseException as ex:
  75. raise AssembleArtifactsError("failed to open release manifest") from ex
  76. if organization.slug != manifest.get("org"):
  77. raise AssembleArtifactsError("organization does not match uploaded bundle")
  78. release_name = manifest.get("release")
  79. if release_name != version:
  80. raise AssembleArtifactsError("release does not match uploaded bundle")
  81. release: Release | None = None
  82. if release_name:
  83. release, _ = Release.objects.get_or_create(
  84. organization=organization, version=release_name
  85. )
  86. # Sentry OSS would add dist to release here
  87. artifacts = manifest.get("files", {})
  88. files = []
  89. for rel_path, artifact in artifacts.items():
  90. artifact_url = artifact.get("url", rel_path)
  91. artifact_basename = artifact_url.rsplit("/", 1)[-1]
  92. headers = artifact.get("headers", {})
  93. file = File.objects.create(
  94. name=artifact_basename,
  95. type=artifact["type"],
  96. headers=headers,
  97. )
  98. files.append(file)
  99. full_path = path.join(scratchpad, rel_path)
  100. with open(full_path, "rb") as fp:
  101. file.putfile(fp)
  102. bundles: list[DebugSymbolBundle] = []
  103. for file in files:
  104. if file.type == "minified_source":
  105. try:
  106. sourcemap_file = next(
  107. value
  108. for value in files
  109. if value.type == "source_map"
  110. and (
  111. file.headers.get("sourcemap", file.headers.get("Sourcemap"))
  112. == value.name
  113. or (
  114. value.headers.get("debug-id")
  115. and value.headers.get("debug-id")
  116. == file.headers.get("debug-id")
  117. )
  118. )
  119. )
  120. except StopIteration:
  121. sourcemap_file = None
  122. if sourcemap_file:
  123. bundles.append(
  124. DebugSymbolBundle(
  125. organization=organization,
  126. debug_id=file.headers.get("debug-id"),
  127. release=release,
  128. sourcemap_file=sourcemap_file,
  129. file=file,
  130. )
  131. )
  132. DebugSymbolBundle.objects.bulk_create(
  133. bundles,
  134. ignore_conflicts=True,
  135. # unique_fields=["organization", "debug_id", "release"],
  136. # update_fields=["file", "sourcemap_file"],
  137. )
  138. # May need to readd this logic but in bulk
  139. # if not created:
  140. # old_file = release_file.file
  141. # release_file.file = file
  142. # release_file.save(update_fields=["file"])
  143. # old_file.delete()
  144. set_assemble_status(
  145. AssembleTask.ARTIFACTS, organization.pk, checksum, ChunkFileState.OK
  146. )
  147. shutil.rmtree(scratchpad)
  148. bundle.delete()
  149. def assemble_file(
  150. task: AssembleTask,
  151. organization: Organization,
  152. name: str,
  153. checksum,
  154. chunks,
  155. file_type,
  156. ):
  157. """
  158. Verifies and assembles a file model from chunks.
  159. This downloads all chunks from blob store to verify their integrity and
  160. associates them with a created file model. Additionally, it assembles the
  161. full file in a temporary location and verifies the complete content hash.
  162. Returns a tuple ``(File, TempFile)`` on success, or ``None`` on error.
  163. """
  164. # Load all FileBlobs from db since we can be sure here we already own all
  165. # chunks need to build the file
  166. file_blobs = FileBlob.objects.filter(checksum__in=chunks).values_list(
  167. "id", "checksum", "size"
  168. )
  169. # Reject all files that exceed the maximum allowed size for this
  170. # organization. This value cannot be
  171. file_size = sum(x[2] for x in file_blobs)
  172. if file_size > MAX_FILE_SIZE:
  173. set_assemble_status(
  174. task,
  175. organization.id,
  176. checksum,
  177. ChunkFileState.ERROR,
  178. detail="File exceeds maximum size",
  179. )
  180. return
  181. # Sanity check. In case not all blobs exist at this point we have a
  182. # race condition.
  183. if set(x[1] for x in file_blobs) != set(chunks):
  184. set_assemble_status(
  185. task,
  186. organization.id,
  187. checksum,
  188. ChunkFileState.ERROR,
  189. detail="Not all chunks available for assembling",
  190. )
  191. return
  192. # Ensure blobs are in the order and duplication in which they were
  193. # transmitted. Otherwise, we would assemble the file in the wrong order.
  194. ids_by_checksum = {chks: id for id, chks, _ in file_blobs}
  195. file_blob_ids = [ids_by_checksum[c] for c in chunks]
  196. file = File.objects.create(name=name, checksum=checksum, type=file_type)
  197. try:
  198. temp_file = file.assemble_from_file_blob_ids(file_blob_ids, checksum)
  199. except AssembleChecksumMismatch:
  200. file.delete()
  201. set_assemble_status(
  202. task,
  203. organization.id,
  204. checksum,
  205. ChunkFileState.ERROR,
  206. detail="Reported checksum mismatch",
  207. )
  208. else:
  209. file.save()
  210. return file, temp_file