
chore(hybrid-cloud): Break the SavedSearch.owner foreign key (#44086)

First step for the HybridCloud deletions work described here:

https://www.notion.so/sentry/Deletions-in-Hybrid-Cloud-eb0c167804a0415ca0a418ab2700cdff

Breaks the SavedSearch.owner foreign key, and provides the
infrastructure to process it, even in MONOLITH mode.

Requires some getsentry changes to handle the `owner` => `owner_id`
change; PR forthcoming.

@evanpurkhiser As requested, I reorganized the history of this PR into
logical chunks that hopefully explain the story of the work better.
Zach Collins committed 2 years ago
commit 1abd29e8d0

+ 1 - 1
migrations_lockfile.txt

@@ -6,5 +6,5 @@ To resolve this, rebase against latest master and regenerate your migration. Thi
 will then be regenerated, and you should be able to merge without conflicts.
 
 nodestore: 0002_nodestore_no_dictfield
-sentry: 0353_add_missing_uuid_unique_indexes
+sentry: 0354_break_saved_search_foreign_key
 social_auth: 0001_initial

+ 3 - 3
src/sentry/api/endpoints/organization_searches.py

@@ -40,7 +40,7 @@ class OrganizationSearchesEndpoint(OrganizationEndpoint):
             # the same organization. DOES include the requesting users pinned
             # search
             .exclude(
-                ~Q(owner=request.user),
+                ~Q(owner_id=request.user.id),
                 visibility__in=(Visibility.OWNER, Visibility.OWNER_PINNED),
             )
             .filter(
@@ -83,7 +83,7 @@ class OrganizationSearchesEndpoint(OrganizationEndpoint):
                 organization=organization,
                 type=SearchType.ISSUE.value,
                 visibility=Visibility.OWNER,
-                owner=request.user,
+                owner_id=request.user.id,
                 query=result["query"],
             ).exists():
                 return Response(
@@ -93,7 +93,7 @@ class OrganizationSearchesEndpoint(OrganizationEndpoint):
 
         saved_search = SavedSearch.objects.create(
             organization=organization,
-            owner=request.user,
+            owner_id=request.user.id,
             type=result["type"],
             name=result["name"],
             query=result["query"],

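Because `owner` is now a plain integer column rather than a Django relation, endpoint code filters on `owner_id` with the requesting user's id instead of passing a `User` instance, and it can no longer join through the old foreign key. A minimal sketch of the post-migration query pattern, assuming the import paths below (the helper function itself is illustrative, not part of this change):

    from sentry.models import SavedSearch
    from sentry.models.search_common import SearchType

    def issue_searches_owned_by(request, organization):
        # Filter on the raw id column; `owner=request.user` would no longer
        # resolve, since SavedSearch.owner_id is not a Django ForeignKey anymore.
        return SavedSearch.objects.filter(
            organization=organization,
            type=SearchType.ISSUE.value,
            owner_id=request.user.id,
        )
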
+ 2 - 2
src/sentry/auth/helper.py

@@ -236,7 +236,7 @@ class AuthIdentityHandler:
             log_service.record_audit_log(
                 event=AuditLogEvent(
                     organization_id=self.organization.id,
-                    time_of_creation=timezone.now(),
+                    date_added=timezone.now(),
                     event_id=audit_log.get_event_id("MEMBER_ADD"),
                     actor_user_id=user.id,
                     actor_label=user.username,
@@ -319,7 +319,7 @@ class AuthIdentityHandler:
             log_service.record_audit_log(
                 event=AuditLogEvent(
                     organization_id=self.organization.id,
-                    time_of_creation=timezone.now(),
+                    date_added=timezone.now(),
                     event_id=audit_log.get_event_id("SSO_IDENTITY_LINK"),
                     actor_user_id=self.user.id,
                     actor_label=self.user.username,

+ 7 - 0
src/sentry/conf/server.py

@@ -580,6 +580,9 @@ CELERY_IMPORTS = (
     "sentry.tasks.commits",
     "sentry.tasks.commit_context",
     "sentry.tasks.deletion",
+    "sentry.tasks.deletion.scheduled",
+    "sentry.tasks.deletion.groups",
+    "sentry.tasks.deletion.hybrid_cloud",
     "sentry.tasks.deliver_from_outbox",
     "sentry.tasks.digests",
     "sentry.tasks.email",
@@ -805,6 +808,10 @@ CELERYBEAT_SCHEDULE = {
         "schedule": crontab_with_minute_jitter(hour="*/6"),
         "options": {"expires": 60 * 25},
     },
+    "schedule-hybrid-cloud-foreign-key-jobs": {
+        "task": "sentry.tasks.deletion.hybrid_cloud.schedule_hybrid_cloud_foreign_key_jobs",
+        "schedule": timedelta(minutes=15),
+    },
     "monitor-release-adoption": {
         "task": "sentry.release_health.tasks.monitor_release_adoption",
         "schedule": crontab(minute=0),

+ 94 - 0
src/sentry/db/models/fields/hybrid_cloud_foreign_key.py

@@ -0,0 +1,94 @@
+"""
+A 'foreign key' which is not enforced in the local database, but triggers eventually consistent delete work in the
+presence of RegionTombstone or ControlTombstone model objects through the tasks/deletion/hybrid_cloud.py logic.
+
+Its main purpose is to support columns in, say, region silos that refer to User or Integration objects (and,
+conversely, columns in the control silo that point to, say, Organization) that do not actually exist in the local
+database, or even the local network.
+
+Functionally, this field is just a dumb BigIntegerField. While it does support indexes, it does not support constraints.
+This means, for instance, you should absolutely expect identifiers in this column whose referenced objects do not necessarily exist.
+Eventually, if the related object is deleted and processed correctly, this column may be set null or cleaned up,
+but at any given time an integer in this column makes no guarantees about the existence of a related object.
+
+Cascade behavior is provided to the application via tasks/deletion/hybrid_cloud.py jobs.  Again, to emphasize, this
+process is eventually consistent, and the timing of the completion of those processes is a black box of systems
+behavior, networking, and tuning.
+
+To add this field to a model, you need to do a few preparatory steps:
+1.  Ensure that the 'model' pointed to by this field is in an opposing silo mode.  Tests *should* fail for any usage
+of a HybridCloudForeignKey that points from and to models in the same silo mode.
+2.  Ensure that the foreign model being referenced produces outboxes to sync tombstones in an atomic transaction.
+For most common cross silo models, there should be a custom delete method already that implements this.
+If not, consult with the hybrid cloud team first to strategize on the outbox and
+deletion strategies.
+3.  Validate that either the default, or the registered bulk deletions in sentry/deletions/__init__.py make sense
+for your model.  This is especially true if your model previously had no cascade logic (a new model, for instance).
+4.  For converting an existing field to a HybridCloudForeignKey, django will produce a non-working migration by
+default.  There is unfortunately no way to configure the auto-generated django migrations.  You'll need to carefully
+build a migration by following this pattern:
+    a. register a database operation that alters the field to a ForeignKey with db_constraint=False, in order to
+    produce the custom SQL that actually drops the existing constraint in the database.
+    b. register state operations that further adjust the internal django field state as follows:
+        i. alters the original field to the new HybridCloudForeignKey (use the generated migration for this)
+        ii. renames that field to the `_id` form (e.g. user => user_id)
+        iii. removes, then re-adds, any other constraints (unique constraints, say) that depended on the original
+            field.  They still exist, but due to ii they need to be reconstructed in terms of the renamed field,
+            even if the column name is the same.
+4a. In short: don't change an existing field to a HybridCloudForeignKey yourself.  The hybrid cloud team probably needs to manage that carefully.
+
+Ideally, when applying this field, you should write a model test that validates that deleting your parent model
+produces the expected cascade behavior for your field.
+"""
+from __future__ import annotations
+
+from enum import IntEnum
+
+from django.db import models
+
+__all__ = ("HybridCloudForeignKey",)
+
+from typing import Any, Tuple
+
+from django.apps import apps
+
+
+class HybridCloudForeignKeyCascadeBehavior(IntEnum):
+    CASCADE = 1
+    SET_NULL = 2
+
+
+class HybridCloudForeignKey(models.BigIntegerField):  # type: ignore
+    on_delete: str
+    foreign_model_name: str
+
+    @property
+    def foreign_model(self) -> Any:
+        parts = self.foreign_model_name.split(".")
+        return apps.get_model(app_label=parts[0], model_name=parts[1])
+
+    @property
+    def foreign_table_name(self) -> str:
+        return self.foreign_model._meta.db_table
+
+    def __init__(
+        self, foreign_model: str, *, on_delete: HybridCloudForeignKeyCascadeBehavior | str, **kwds
+    ):
+        self.on_delete = (
+            on_delete
+            if isinstance(on_delete, HybridCloudForeignKeyCascadeBehavior)
+            else HybridCloudForeignKeyCascadeBehavior[on_delete.upper()]
+        ).name.upper()
+
+        parts = foreign_model.split(".")
+        assert (
+            len(parts) == 2
+        ), f"{self.__class__.__name__} model reference must be <app>.<ModelName>, got {foreign_model}"
+        self.foreign_model_name = foreign_model
+
+        kwds.setdefault("db_index", True)
+        super().__init__(**kwds)
+
+    def deconstruct(self) -> Tuple[Any, Any, Any, Any]:
+        (name, path, args, kwds) = super().deconstruct()
+        return name, path, [self.foreign_model_name], dict(on_delete=self.on_delete, **kwds)

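The model-side change to `SavedSearch` itself is in one of the files not shown in this truncated diff, but based on the field's constructor above and the migration below, the declaration would look roughly like the following sketch (the surrounding model body is an illustrative excerpt, not the full definition):

    from sentry.db.models import Model, region_silo_only_model
    from sentry.db.models.fields.hybrid_cloud_foreign_key import HybridCloudForeignKey


    @region_silo_only_model
    class SavedSearch(Model):  # illustrative excerpt, not the full model
        # Plain BigIntegerField under the hood: no database constraint against
        # sentry_user, so stale ids are expected until the hybrid cloud deletion
        # tasks process the matching tombstone and cascade the delete.
        owner_id = HybridCloudForeignKey("sentry.User", null=True, on_delete="CASCADE")
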
+ 28 - 0
src/sentry/db/postgres/roles.py

@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+import contextlib
+import os
+import sys
+
+from django.db.transaction import get_connection
+
+
+@contextlib.contextmanager
+def test_psql_role_override(role_name: str, using: str | None = None):
+    """
+    During test runs, the role of the current connection will be swapped to role_name, and then swapped
+    back to its original role afterwards.  Has no effect in production.
+    """
+
+    if "pytest" not in sys.modules or os.environ.get("DB", "postgres") != "postgres":
+        yield
+        return
+
+    with get_connection(using).cursor() as conn:
+        conn.execute("SELECT user")
+        (cur,) = conn.fetchone()
+        conn.execute("SET ROLE %s", [role_name])
+        try:
+            yield
+        finally:
+            conn.execute("SET ROLE %s", [cur])

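The intended call pattern is the one visible in the counter.py change just below: wrap privileged statements in the context manager so that, when running under pytest against postgres, they execute as a role that is allowed to perform them, and run unchanged otherwise. A minimal usage sketch (the function and its SQL are illustrative):

    from django.db import connections

    from sentry.db.postgres.roles import test_psql_role_override


    def run_privileged_sql(using: str) -> None:
        # Under pytest + postgres this switches to the "postgres" role for the
        # duration of the block and restores the previous role afterwards;
        # outside of tests the context manager simply yields.
        with test_psql_role_override("postgres", using), connections[using].cursor() as cursor:
            cursor.execute("SELECT 1")  # stand-in for privileged DDL
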
+ 67 - 0
src/sentry/migrations/0354_break_saved_search_foreign_key.py

@@ -0,0 +1,67 @@
+# Generated by Django 2.2.28 on 2023-01-31 22:25
+
+from django.db import migrations, models
+
+import sentry.db.models.fields.hybrid_cloud_foreign_key
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+    # This flag is used to mark that a migration shouldn't be automatically run in production. For
+    # the most part, this should only be used for operations where it's safe to run the migration
+    # after your code has deployed. So this should not be used for most operations that alter the
+    # schema of a table.
+    # Here are some things that make sense to mark as dangerous:
+    # - Large data migrations. Typically we want these to be run manually by ops so that they can
+    #   be monitored and not block the deploy for a long period of time while they run.
+    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
+    #   have ops run this and not block the deploy. Note that while adding an index is a schema
+    #   change, it's completely safe to run the operation after the code has deployed.
+    is_dangerous = False
+
+    dependencies = [
+        ("sentry", "0353_add_missing_uuid_unique_indexes"),
+    ]
+
+    database_operations = [
+        migrations.AlterField(
+            model_name="savedsearch",
+            name="owner",
+            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
+                to="sentry.User", db_constraint=False, db_index=True, null=True
+            ),
+        ),
+    ]
+
+    state_operations = [
+        migrations.AlterField(
+            model_name="savedsearch",
+            name="owner",
+            # field=models.IntegerField(db_index=True, null=False),
+            field=sentry.db.models.fields.hybrid_cloud_foreign_key.HybridCloudForeignKey(
+                "sentry.User",
+                db_index=True,
+                null=True,
+                on_delete="CASCADE",
+            ),
+        ),
+        migrations.RenameField(model_name="savedsearch", old_name="owner", new_name="owner_id"),
+        migrations.RemoveConstraint(
+            model_name="savedsearch",
+            name="sentry_savedsearch_pinning_constraint",
+        ),
+        migrations.AddConstraint(
+            model_name="savedsearch",
+            constraint=models.UniqueConstraint(
+                condition=models.Q(visibility="owner_pinned"),
+                fields=("organization", "owner_id", "type"),
+                name="sentry_savedsearch_pinning_constraint",
+            ),
+        ),
+    ]
+
+    operations = [
+        migrations.SeparateDatabaseAndState(
+            database_operations=database_operations, state_operations=state_operations
+        )
+    ]

+ 2 - 2
src/sentry/models/auditlogentry.py

@@ -93,7 +93,7 @@ class AuditLogEntry(Model):
             organization_id=int(
                 self.organization.id
             ),  # prefer raising NoneType here over actually passing through
-            time_of_creation=self.datetime or timezone.now(),
+            date_added=self.datetime or timezone.now(),
             actor_user_id=self.actor and self.actor.id,
             target_object_id=self.target_object,
             ip_address=self.ip_address and str(self.ip_address),
@@ -111,7 +111,7 @@ class AuditLogEntry(Model):
         """
         return AuditLogEntry(
             organization_id=event.organization_id,
-            datetime=event.time_of_creation,
+            datetime=event.date_added,
             actor_id=event.actor_user_id,
             target_object=event.target_object_id,
             ip_address=event.ip_address,

+ 2 - 4
src/sentry/models/counter.py

@@ -13,6 +13,7 @@ from sentry.db.models import (
     region_silo_only_model,
     sane_repr,
 )
+from sentry.db.postgres.roles import test_psql_role_override
 
 
 @region_silo_only_model
@@ -98,8 +99,7 @@ def create_counter_function(app_config, using, **kwargs):
     if not get_model_if_available(app_config, "Counter"):
         return
 
-    cursor = connections[using].cursor()
-    try:
+    with test_psql_role_override("postgres", using), connections[using].cursor() as cursor:
         cursor.execute(
             """
             create or replace function sentry_increment_project_counter(
@@ -126,8 +126,6 @@ def create_counter_function(app_config, using, **kwargs):
             $$ language plpgsql;
         """
         )
-    finally:
-        cursor.close()
 
 
 post_migrate.connect(create_counter_function, dispatch_uid="create_counter_function", weak=False)

+ 41 - 11
src/sentry/models/outbox.py

@@ -22,6 +22,7 @@ from sentry.db.models import (
     sane_repr,
 )
 from sentry.silo import SiloMode
+from sentry.utils import metrics
 
 THE_PAST = datetime.datetime(2016, 8, 1, 0, 0, 0, 0, tzinfo=timezone.utc)
 
@@ -154,6 +155,11 @@ class OutboxBase(Model):
     def next_schedule(self, now: datetime.datetime) -> datetime.datetime:
         return now + (self.last_delay() * 2)
 
+    def save(self, **kwds: Any):
+        tags = {"category": OutboxCategory(self.category).name}
+        metrics.incr("outbox.saved", 1, tags=tags)
+        super().save(**kwds)
+
     @contextlib.contextmanager
     def process_coalesced(self) -> Generator[OutboxBase | None, None, None]:
         # Do not use a select for update here -- it is tempting, but a major performance issue.
@@ -163,12 +169,24 @@ class OutboxBase(Model):
         coalesced: OutboxBase | None = self.select_coalesced_messages().last()
         yield coalesced
         if coalesced is not None:
-            self.select_coalesced_messages().filter(id__lte=coalesced.id).delete()
+            first_coalesced: OutboxBase = self.select_coalesced_messages().first() or coalesced
+            _, deleted = self.select_coalesced_messages().filter(id__lte=coalesced.id).delete()
+            tags = {"category": OutboxCategory(self.category).name}
+            metrics.incr("outbox.processed", deleted, tags=tags)
+            metrics.timing(
+                "outbox.processing_lag",
+                datetime.datetime.now().timestamp() - first_coalesced.scheduled_from.timestamp(),
+                tags=tags,
+            )
 
     def process(self) -> bool:
         with self.process_coalesced() as coalesced:
             if coalesced is not None:
-                coalesced.send_signal()
+                with metrics.timer(
+                    "outbox.send_signal.duration",
+                    tags={"category": OutboxCategory(coalesced.category).name},
+                ):
+                    coalesced.send_signal()
                 return True
         return False
 
@@ -191,14 +209,6 @@ class OutboxBase(Model):
                 f"Could not flush items from shard {self.key_from(self.sharding_columns)!r}"
             )
 
-    @classmethod
-    def for_shard(cls: Type[_T], shard_scope: OutboxScope, shard_identifier: int) -> _T:
-        """
-        Logically, this is just an alias for the constructor of cls, but explicitly named to call out the intended
-        semantic of creating and instance to invoke `drain_shard` on.
-        """
-        return cls(shard_scope=shard_scope, shard_identifier=shard_identifier)
-
 
 MONOLITH_REGION_NAME = "--monolith--"
 
@@ -234,6 +244,14 @@ class RegionOutbox(OutboxBase):
             ("shard_scope", "shard_identifier", "id"),
         )
 
+    @classmethod
+    def for_shard(cls: Type[_T], shard_scope: OutboxScope, shard_identifier: int) -> _T:
+        """
+        Logically, this is just an alias for the constructor of cls, but explicitly named to call out the intended
+        semantic of creating an instance to invoke `drain_shard` on.
+        """
+        return cls(shard_scope=shard_scope, shard_identifier=shard_identifier)
+
     __repr__ = sane_repr("shard_scope", "shard_identifier", "category", "object_identifier")
 
 
@@ -253,7 +271,7 @@ class ControlOutbox(OutboxBase):
 
     def send_signal(self):
         process_control_outbox.send(
-            sender=self.category,
+            sender=OutboxCategory(self.category),
             payload=self.payload,
             region_name=self.region_name,
             object_identifier=self.object_identifier,
@@ -301,6 +319,18 @@ class ControlOutbox(OutboxBase):
             result.payload = payload
             yield result
 
+    @classmethod
+    def for_shard(
+        cls: Type[_T], shard_scope: OutboxScope, shard_identifier: int, region_name: str
+    ) -> _T:
+        """
+        Logically, this is just an alias for the constructor of cls, but explicitly named to call out the intended
+        semantic of creating an instance to invoke `drain_shard` on.
+        """
+        return cls(
+            shard_scope=shard_scope, shard_identifier=shard_identifier, region_name=region_name
+        )
+
 
 def _find_orgs_for_user(user_id: int) -> Set[int]:
     # TODO: This must be changed to the org member mapping in the control silo eventually.

Some files were not shown because too many files changed in this diff