Browse Source

Merge pull request #5436 from getsentry/feature/pending-delete-metadata

[deletions] new task abstraction and scheduler
David Cramer 7 years ago
parent
commit
3bb2cba756

+ 1 - 0
CHANGES

@@ -12,6 +12,7 @@ Version 8.17 (Unreleased)
 - The threads interface now contributes to grouping if it contains a single thread.
 - Added per-key (DSN) rate limits (``project:rate-limits`` feature).
 - Added tsdb statistics for events per-key.
+- Added ``sentry.deletions`` abstraction to improve bulk deletions.
 - Added basic workspace for Visual Studio Code.
 - Added hovercards for Issue IDs in activity entries.
 

+ 7 - 0
src/sentry/conf/server.py

@@ -594,6 +594,13 @@ CELERYBEAT_SCHEDULE = {
             'expires': 60 * 25,
         },
     },
+    'schedule-deletions': {
+        'task': 'sentry.tasks.deletion.run_scheduled_deletions',
+        'schedule': timedelta(minutes=15),
+        'options': {
+            'expires': 60 * 25,
+        },
+    },
     'schedule-weekly-organization-reports': {
         'task': 'sentry.tasks.reports.prepare_reports',
         'schedule': crontab(

+ 81 - 0
src/sentry/deletions/__init__.py

@@ -0,0 +1,81 @@
+"""
+The deletions subsystem manages bulk deletes as well as cascades. It attempts
+to optimize around various patterns while using a standard approach to do so.
+
+For example, let's say you want to delete an organization:
+
+>>> from sentry import deletions
+>>> task = deletions.get(model=Organization)
+>>> work = True
+>>> while work:
+...     work = task.chunk()
+
+The system has a default task implementation to handle Organization which will
+efficiently cascade deletes. This behavior varies based on the input object,
+as the task can override the behavior for its children.
+
+For example, when you delete a Group, it will cascade in a more traditional
+manner. It will batch each child (such as Event). However, when you delete a
+project, it won't actually cascade to the registered Group task. It will instead
+take a more efficient approach of batch deleting its indirect descendants, such
+as Event, so it can more efficiently bulk delete rows.
+"""
+
+from __future__ import absolute_import
+
+from .base import BulkModelDeletionTask, ModelDeletionTask, ModelRelation  # NOQA
+from .manager import DeletionTaskManager
+
+default_manager = DeletionTaskManager(default_task=ModelDeletionTask)
+
+
+def load_defaults():
+    from sentry import models
+    from . import defaults
+
+    default_manager.register(models.Activity, BulkModelDeletionTask)
+    default_manager.register(models.ApiApplication, defaults.ApiApplicationDeletionTask)
+    default_manager.register(models.ApiKey, BulkModelDeletionTask)
+    default_manager.register(models.ApiGrant, BulkModelDeletionTask)
+    default_manager.register(models.ApiToken, BulkModelDeletionTask)
+    default_manager.register(models.CommitAuthor, BulkModelDeletionTask)
+    default_manager.register(models.CommitFileChange, BulkModelDeletionTask)
+    default_manager.register(models.EnvironmentProject, BulkModelDeletionTask)
+    default_manager.register(models.Event, defaults.EventDeletionTask)
+    default_manager.register(models.EventMapping, BulkModelDeletionTask)
+    default_manager.register(models.EventTag, BulkModelDeletionTask)
+    default_manager.register(models.EventUser, BulkModelDeletionTask)
+    default_manager.register(models.Group, defaults.GroupDeletionTask)
+    default_manager.register(models.GroupAssignee, BulkModelDeletionTask)
+    default_manager.register(models.GroupBookmark, BulkModelDeletionTask)
+    default_manager.register(models.GroupCommitResolution, BulkModelDeletionTask)
+    default_manager.register(models.GroupEmailThread, BulkModelDeletionTask)
+    default_manager.register(models.GroupHash, BulkModelDeletionTask)
+    default_manager.register(models.GroupMeta, BulkModelDeletionTask)
+    default_manager.register(models.GroupRedirect, BulkModelDeletionTask)
+    default_manager.register(models.GroupRelease, BulkModelDeletionTask)
+    default_manager.register(models.GroupResolution, BulkModelDeletionTask)
+    default_manager.register(models.GroupRuleStatus, BulkModelDeletionTask)
+    default_manager.register(models.GroupSeen, BulkModelDeletionTask)
+    default_manager.register(models.GroupSnooze, BulkModelDeletionTask)
+    default_manager.register(models.GroupSubscription, BulkModelDeletionTask)
+    default_manager.register(models.GroupTagKey, BulkModelDeletionTask)
+    default_manager.register(models.GroupTagValue, BulkModelDeletionTask)
+    default_manager.register(models.Organization, defaults.OrganizationDeletionTask)
+    default_manager.register(models.OrganizationMemberTeam, BulkModelDeletionTask)
+    default_manager.register(models.Project, defaults.ProjectDeletionTask)
+    default_manager.register(models.ProjectBookmark, BulkModelDeletionTask)
+    default_manager.register(models.ProjectKey, BulkModelDeletionTask)
+    default_manager.register(models.Repository, defaults.RepositoryDeletionTask)
+    default_manager.register(models.SavedSearch, BulkModelDeletionTask)
+    default_manager.register(models.SavedSearchUserDefault, BulkModelDeletionTask)
+    default_manager.register(models.TagKey, defaults.TagKeyDeletionTask)
+    default_manager.register(models.TagValue, BulkModelDeletionTask)
+    default_manager.register(models.Team, defaults.TeamDeletionTask)
+    default_manager.register(models.UserReport, BulkModelDeletionTask)
+
+
+load_defaults()
+
+get = default_manager.get
+register = default_manager.register

+ 218 - 0
src/sentry/deletions/base.py

@@ -0,0 +1,218 @@
+from __future__ import absolute_import, print_function
+
+import logging
+
+from sentry.constants import ObjectStatus
+from sentry.utils.query import bulk_delete_objects
+
+
+class BaseRelation(object):
+    def __init__(self, params, task):
+        self.task = task
+        self.params = params
+
+    def __repr__(self):
+        return '<%s: task=%s params=%s>' % (
+            type(self),
+            self.task,
+            self.params,
+        )
+
+
+class ModelRelation(BaseRelation):
+    def __init__(self, model, query, task=None):
+        params = {
+            'model': model,
+            'query': query,
+        }
+        super(ModelRelation, self).__init__(params=params, task=task)
+
+
+class BaseDeletionTask(object):
+    logger = logging.getLogger('sentry.deletions.async')
+
+    DEFAULT_CHUNK_SIZE = 100
+
+    def __init__(self, manager, transaction_id=None,
+                 actor_id=None, chunk_size=DEFAULT_CHUNK_SIZE):
+        self.manager = manager
+        self.transaction_id = transaction_id
+        self.actor_id = actor_id
+        self.chunk_size = chunk_size
+
+    def __repr__(self):
+        return '<%s: transaction_id=%s actor_id=%s>' % (
+            type(self),
+            self.transaction_id,
+            self.actor_id,
+        )
+
+    def chunk(self):
+        """
+        Deletes a chunk of this instance's data. Return ``True`` if there is
+        more work, or ``False`` if the entity has been removed.
+        """
+        raise NotImplementedError
+
+    def get_child_relations(self, instance):
+        # TODO(dcramer): it'd be nice if we collected the default relationships
+        return [
+            # ModelRelation(Model, {'parent_id': instance.id})
+        ]
+
+    def get_child_relations_bulk(self, instance_list):
+        return [
+            # ModelRelation(Model, {'parent_id__in': [i.id for i in instance_list]})
+        ]
+
+    def delete_bulk(self, instance_list):
+        """
+        Delete a batch of objects bound to this task.
+
+        This **should** not be called with arbitrary types, but rather should
+        be used for only the base type this task was instantiated against.
+        """
+        self.mark_deletion_in_progress(instance_list)
+
+        child_relations = self.get_child_relations_bulk(instance_list)
+        if child_relations:
+            has_more = self.delete_children(child_relations)
+            if has_more:
+                return has_more
+
+        for instance in instance_list:
+            child_relations = self.get_child_relations(instance)
+            if child_relations:
+                has_more = self.delete_children(child_relations)
+                if has_more:
+                    return has_more
+
+        return self.delete_instance_bulk(instance_list)
+
+    def delete_instance(self, instance):
+        raise NotImplementedError
+
+    def delete_instance_bulk(self, instance_list):
+        for instance in instance_list:
+            self.delete_instance(instance)
+
+    def delete_children(self, relations):
+        # Ideally this runs through the deletion manager
+        has_more = False
+        for relation in relations:
+            task = self.manager.get(
+                transaction_id=self.transaction_id,
+                actor_id=self.actor_id,
+                chunk_size=self.chunk_size,
+                task=relation.task,
+                **relation.params
+            )
+            has_more = task.chunk()
+            if has_more:
+                return has_more
+        return has_more
+
+    def mark_deletion_in_progress(self, instance_list):
+        pass
+
+
+class ModelDeletionTask(BaseDeletionTask):
+    DEFAULT_QUERY_LIMIT = None
+
+    def __init__(self, manager, model, query, query_limit=None, **kwargs):
+        super(ModelDeletionTask, self).__init__(manager, **kwargs)
+        self.model = model
+        self.query = query
+        self.query_limit = (
+            query_limit or
+            self.DEFAULT_QUERY_LIMIT or
+            self.chunk_size
+        )
+
+    def __repr__(self):
+        return '<%s: model=%s query=%s transaction_id=%s actor_id=%s>' % (
+            type(self),
+            self.model,
+            self.query,
+            self.transaction_id,
+            self.actor_id,
+        )
+
+    def chunk(self):
+        """
+        Deletes a chunk of this instance's data. Return ``True`` if there is
+        more work, or ``False`` if the entity has been removed.
+        """
+        query_limit = self.query_limit
+        remaining = self.chunk_size
+        while remaining > 0:
+            queryset = list(self.model.objects.filter(
+                **self.query
+            )[:query_limit])
+            if not queryset:
+                return False
+
+            self.delete_bulk(queryset)
+            remaining -= query_limit
+        return True
+
+    def delete_instance_bulk(self, instance_list):
+        # slow, but ensures Django cascades are handled
+        for instance in instance_list:
+            self.delete_instance(instance)
+
+    def delete_instance(self, instance):
+        instance_id = instance.id
+        try:
+            instance.delete()
+        finally:
+            self.logger.info('object.delete.executed', extra={
+                'object_id': instance_id,
+                'transaction_id': self.transaction_id,
+                'app_label': instance._meta.app_label,
+                'model': type(instance).__name__,
+            })
+
+    def get_actor(self):
+        from sentry.models import User
+
+        if self.actor_id:
+            try:
+                return User.objects.get_from_cache(id=self.actor_id)
+            except User.DoesNotExist:
+                pass
+        return None
+
+    def mark_deletion_in_progress(self, instance_list):
+        for instance in instance_list:
+            status = getattr(instance, 'status', None)
+            if status not in (ObjectStatus.DELETION_IN_PROGRESS, None):
+                instance.update(status=ObjectStatus.DELETION_IN_PROGRESS)
+
+
+class BulkModelDeletionTask(ModelDeletionTask):
+    """
+    An efficient mechanism for deleting larger volumes of rows in one pass,
+    but will hard fail if the relations have resident foreign relations.
+
+    Note: Does NOT support child relations.
+    """
+    DEFAULT_CHUNK_SIZE = 10000
+
+    def chunk(self):
+        return self.delete_instance_bulk()
+
+    def delete_instance_bulk(self):
+        try:
+            return bulk_delete_objects(
+                model=self.model,
+                limit=self.chunk_size,
+                transaction_id=self.transaction_id,
+                **self.query
+            )
+        finally:
+            self.logger.info('object.delete.bulk_executed', extra=dict({
+                'transaction_id': self.transaction_id,
+                'app_label': self.model._meta.app_label,
+                'model': self.model.__name__,
+            }, **self.query))

+ 5 - 0
src/sentry/deletions/defaults/__init__.py

@@ -0,0 +1,5 @@
+from __future__ import absolute_import
+
+from sentry.utils.imports import import_submodules
+
+import_submodules(globals(), __name__, __path__)

+ 23 - 0
src/sentry/deletions/defaults/apiapplication.py

@@ -0,0 +1,23 @@
+from __future__ import absolute_import, print_function
+
+from ..base import ModelDeletionTask, ModelRelation
+
+
+class ApiApplicationDeletionTask(ModelDeletionTask):
+    def get_child_relations(self, instance):
+        from sentry.models import ApiGrant, ApiToken
+
+        # in bulk
+        model_list = (
+            ApiToken, ApiGrant
+        )
+        return [
+            ModelRelation(m, {'application_id': instance.id}) for m in model_list
+        ]
+
+    def mark_deletion_in_progress(self, instance_list):
+        from sentry.models import ApiApplicationStatus
+
+        for instance in instance_list:
+            if instance.status != ApiApplicationStatus.deletion_in_progress:
+                instance.update(status=ApiApplicationStatus.deletion_in_progress)

+ 32 - 0
src/sentry/deletions/defaults/event.py

@@ -0,0 +1,32 @@
+from __future__ import absolute_import, print_function
+
+from sentry import nodestore
+
+from ..base import (
+    BaseDeletionTask, BaseRelation, ModelDeletionTask, ModelRelation
+)
+
+
+class NodeDeletionTask(BaseDeletionTask):
+    def __init__(self, manager, nodes, **kwargs):
+        self.nodes = nodes
+        super(NodeDeletionTask, self).__init__(manager, **kwargs)
+
+    def chunk(self):
+        nodestore.delete_multi(self.nodes)
+        return False
+
+
+class EventDeletionTask(ModelDeletionTask):
+    def get_child_relations_bulk(self, instance_list):
+        from sentry.models import EventTag
+
+        node_ids = [i.data.id for i in instance_list]
+        event_ids = [i.id for i in instance_list]
+
+        return [
+            BaseRelation({'nodes': node_ids}, NodeDeletionTask),
+            ModelRelation(EventTag, {
+                'event_id__in': event_ids,
+            }, ModelDeletionTask),
+        ]

+ 52 - 0
src/sentry/deletions/defaults/group.py

@@ -0,0 +1,52 @@
+from __future__ import absolute_import, print_function
+
+from ..base import ModelDeletionTask, ModelRelation
+
+
+class GroupDeletionTask(ModelDeletionTask):
+    def get_child_relations(self, instance):
+        from sentry import models
+
+        relations = []
+
+        model_list = (
+            # prioritize GroupHash
+            models.GroupHash,
+            models.EventTag,
+            models.EventMapping,
+            models.GroupAssignee,
+            models.GroupCommitResolution,
+            models.GroupBookmark,
+            models.GroupMeta,
+            models.GroupRelease,
+            models.GroupRedirect,
+            models.GroupResolution,
+            models.GroupRuleStatus,
+            models.GroupSnooze,
+            models.GroupTagValue,
+            models.GroupTagKey,
+            models.GroupEmailThread,
+            models.GroupSubscription,
+            models.UserReport,
+            # Event is last as it's the most time-consuming
+            models.Event,
+        )
+        relations.extend([
+            ModelRelation(m, {'group_id': instance.id}) for m in model_list
+        ])
+
+        return relations
+
+    def delete_instance(self, instance):
+        from sentry.similarity import features
+
+        features.delete(instance)
+
+        return super(GroupDeletionTask, self).delete_instance(instance)
+
+    def mark_deletion_in_progress(self, instance_list):
+        from sentry.models import GroupStatus
+
+        for instance in instance_list:
+            if instance.status != GroupStatus.DELETION_IN_PROGRESS:
+                instance.update(status=GroupStatus.DELETION_IN_PROGRESS)

+ 36 - 0
src/sentry/deletions/defaults/organization.py

@@ -0,0 +1,36 @@
+from __future__ import absolute_import, print_function
+
+from ..base import ModelDeletionTask, ModelRelation
+
+
+class OrganizationDeletionTask(ModelDeletionTask):
+    def get_child_relations(self, instance):
+        from sentry.models import (
+            OrganizationMember,
+            Commit, CommitAuthor, CommitFileChange, Environment, Release,
+            ReleaseCommit, ReleaseEnvironment, ReleaseFile, Distribution,
+            ReleaseHeadCommit, Repository, Team
+        )
+
+        # Team must come first
+        relations = [
+            ModelRelation(Team, {'organization_id': instance.id}),
+        ]
+
+        model_list = (
+            OrganizationMember, CommitFileChange, Commit, CommitAuthor,
+            Environment, Repository, Release, ReleaseCommit,
+            ReleaseEnvironment, ReleaseFile, Distribution, ReleaseHeadCommit
+        )
+        relations.extend([
+            ModelRelation(m, {'organization_id': instance.id}) for m in model_list
+        ])
+
+        return relations
+
+    def mark_deletion_in_progress(self, instance_list):
+        from sentry.models import OrganizationStatus
+
+        for instance in instance_list:
+            if instance.status != OrganizationStatus.DELETION_IN_PROGRESS:
+                instance.update(status=OrganizationStatus.DELETION_IN_PROGRESS)

+ 71 - 0
src/sentry/deletions/defaults/project.py

@@ -0,0 +1,71 @@
+from __future__ import absolute_import, print_function
+
+from ..base import (
+    BulkModelDeletionTask, ModelDeletionTask, ModelRelation
+)
+
+
+class ProjectDeletionTask(ModelDeletionTask):
+    def get_child_relations(self, instance):
+        from sentry import models
+
+        relations = [
+            # ProjectKey gets revoked immediately, in bulk
+            ModelRelation(models.ProjectKey, {'project_id': instance.id})
+        ]
+
+        # in bulk
+        model_list = (
+            models.Activity,
+            models.EnvironmentProject,
+            models.EventMapping,
+            models.EventUser,
+            models.EventTag,
+            models.GroupAssignee,
+            models.GroupBookmark,
+            models.GroupEmailThread,
+            models.GroupHash,
+            models.GroupRelease,
+            models.GroupRuleStatus,
+            models.GroupSeen,
+            models.GroupSubscription,
+            models.GroupTagKey,
+            models.GroupTagValue,
+            models.ProjectBookmark,
+            models.ProjectKey,
+            models.SavedSearchUserDefault,
+            models.SavedSearch,
+            models.TagKey,
+            models.TagValue,
+            models.UserReport,
+        )
+        relations.extend([
+            ModelRelation(m, {'project_id': instance.id}, BulkModelDeletionTask)
+            for m in model_list
+        ])
+
+        model_list = (
+            models.GroupMeta,
+            models.GroupResolution,
+            models.GroupSnooze,
+        )
+        relations.extend([
+            ModelRelation(m, {'group__project': instance.id}, ModelDeletionTask)
+            for m in model_list
+        ])
+
+        # special case event due to nodestore
+        relations.extend([
+            ModelRelation(models.Event, {'project_id': instance.id})
+        ])
+
+        # in bulk
+        # Release needs to handle deletes after Group is cleaned up as the foreign
+        # key is protected
+        model_list = (models.Group, models.ReleaseProject)
+        relations.extend([
+            ModelRelation(m, {'project_id': instance.id}, ModelDeletionTask)
+            for m in model_list
+        ])
+
+        return relations

Some files were not shown because too many files changed in this diff