Browse Source

feat(backup): Allow custom ordinals during validation (#60156)

When comparing two export JSONs, we simply order the models in each by
pk, and assume that the relative position of models in the two JSON
blobs is sufficient to identify them ("compare the 3rd instance of
`sentry.foo` in the left blob with the 3rd instance in the right blob").
As long as all models are newly inserted, this makes sense: the newly
inserted models for the right-side blob will be inserted in `pk` order,
so this method of ordering holds.

But things break down when our export includes some existing models and
some newly inserted ones. Consider an import where we have the following
3 email models:

```json
{
  "model": "sentry.email",
  "pk": 1,
  "fields": {
    "email": "a@example.com",
    "date_added": "2023-06-22T22:59:55.531Z"
  }
},
{
  "model": "sentry.email",
  "pk": 2,
  "fields": {
    "email": "b@example.com",
    "date_added": "2023-06-22T22:59:55.531Z"
  }
},
{
  "model": "sentry.email",
  "pk": 3,
  "fields": {
    "email": "c@example.com",
    "date_added": "2023-06-22T22:59:55.531Z"
  }
},
```

If we import it with `--merge-users` set to true, and a `sentry.email`
like `b@example.com` already exists, the assumption that pk ordering is
valid falls apart: `b@example.com` would have had its `pk` assigned long
ago, meaning that it precedes `a@example.com` in `pk`-based ordering.
The resulting output might look like this:

```json
{
  "model": "sentry.email",
  "pk": 5,
  "fields": {
    "email": "b@example.com",
    "date_added": "2023-06-05:12:34.531Z"
  }
},
{
  "model": "sentry.email",
  "pk": 201,
  "fields": {
    "email": "a@example.com",
    "date_added": "2023-06-22T22:59:55.531Z"
  }
},
{
  "model": "sentry.email",
  "pk": 202,
  "fields": {
    "email": "c@example.com",
    "date_added": "2023-06-22T22:59:55.531Z"
  }
},
```

The ordering is no longer correct - `b@example.com` precedes the newly
inserted emails when ordered by `pk` alone!

For "merge-able" or "ovewrite-able" models like this, it's better to use
some globally unique identifier (or group thereof) to do the ordering,
if possible. Luckily, for most models that we allow to be
merged/overwritten, this is readily available.

This PR sets those custom ordinal definitions for these models, and
define the logic for using a custom, non-`pk`-based ordering method for
models that demand it.

Issue: getsentry/team-ospo#195
Alex Zaslavsky 1 year ago
parent
commit
c7d1cc6928

+ 14 - 0
fixtures/backup/single-integration.json

@@ -0,0 +1,14 @@
+ [
+    {
+        "model": "sentry.integration",
+        "pk": 1,
+        "fields": {
+            "date_added": "2023-06-22T00:00:00.000Z",
+            "date_updated": "2023-06-23T00:00:00.000Z",
+            "external_id": "slack:test-org",
+            "name": "Slack for test-org",
+            "provider": "slack",
+            "status": 0
+        }
+    }
+]

+ 3 - 0
src/sentry/backup/findings.py

@@ -37,6 +37,9 @@ class ComparatorFindingKind(FindingKind):
     # The instances of a particular model did not maintain total ordering of pks (that is, pks did not appear in ascending order, or appear multiple times).
     UnorderedInput = auto()
 
+    # Multiple instances of the same custom ordinal signature exist in the input.
+    DuplicateCustomOrdinal = auto()
+
     # The number of instances of a particular model on the left and right side of the input were not
     # equal.
     UnequalCounts = auto()

+ 12 - 10
src/sentry/backup/mixins.py

@@ -9,23 +9,25 @@ from sentry.backup.scopes import ImportScope, RelocationScope
 
 class OverwritableConfigMixin:
     """
-    Handles the `ImportFlags.overwrite_configs` setting when it's piped through to a `RelocationScope.Config` model with at least one `unique=True` field, thereby handling the collision in the manner the importer requested.
+    Handles the `ImportFlags.overwrite_configs` setting when it's piped through to a
+    `RelocationScope.Config` model with at least one `unique=True` field, thereby handling the
+    collision in the manner the importer requested.
     """
 
+    # TODO(getsentry/team-ospo#190): Clean up the type checking in this method.
     def write_relocation_import(
         self, scope: ImportScope, flags: ImportFlags
     ) -> Optional[Tuple[int, ImportKind]]:
-        # TODO(getsentry/team-ospo#190): Clean up the type checking here.
-        if self.get_relocation_scope() == RelocationScope.Config:  # type: ignore
-            # Get all unique sets that will potentially cause collisions.
-            uniq_sets = dependencies()[get_model_name(self)].get_uniques_without_foreign_keys()  # type: ignore
+        # Get all unique sets that will potentially cause collisions.
+        uniq_sets = dependencies()[get_model_name(self)].get_uniques_without_foreign_keys()  # type: ignore
+
+        # Don't use this mixin for models with multiple unique sets; write custom logic instead.
+        assert len(uniq_sets) <= 1
 
-            # Don't use this mixin for models with multiple unique sets; write custom logic instead.
-            if len(uniq_sets) > 1:
-                raise ValueError(
-                    "Cannot use `OverwritableConfigMixin` on model with multiple unique sets"
-                )
+        # Must set `__relocation_custom_ordinal__` on models that use this mixin.
+        assert getattr(self.__class__, "__relocation_custom_ordinal__", None) is not None
 
+        if self.get_relocation_scope() == RelocationScope.Config:  # type: ignore
             if len(uniq_sets) == 1:
                 uniq_set = uniq_sets[0]
                 query = dict()

+ 129 - 66
src/sentry/backup/validate.py

@@ -1,12 +1,13 @@
 from __future__ import annotations
 
+from collections import OrderedDict as ordereddict
 from collections import defaultdict
 from copy import deepcopy
 from difflib import unified_diff
-from typing import Dict, Tuple
+from typing import Dict, OrderedDict, Tuple
 
 from sentry.backup.comparators import ComparatorMap, ForeignKeyComparator, get_default_comparators
-from sentry.backup.dependencies import ImportKind, NormalizedModelName, PrimaryKeyMap
+from sentry.backup.dependencies import ImportKind, NormalizedModelName, PrimaryKeyMap, get_model
 from sentry.backup.findings import (
     ComparatorFinding,
     ComparatorFindingKind,
@@ -33,53 +34,111 @@ def validate(
     class OrdinalCounter:
         """Keeps track of the next ordinal to be assigned for a given model kind."""
 
-        max_seen_pk: int
+        # The `value` being tracked is either the custom ordering tuple for this model (see: `BaseModel::get_relocation_ordinal_fields()` method), or otherwise just the pk.
+        max_seen_ordinal_value: int | tuple | None
         next_ordinal: int
 
         def __init__(self):
-            self.max_seen_pk = -1
+            self.max_seen_ordinal_value = None
             self.next_ordinal = 1
 
-        def assign(self, obj: JSONData, side: Side) -> Tuple[int, list[ComparatorFinding]]:
+        def assign(
+            self, obj: JSONData, ordinal_value: int | tuple, side: Side
+        ) -> Tuple[InstanceID, list[ComparatorFinding]]:
             """Assigns the next available ordinal to the supplied `obj` model."""
 
             pk = obj["pk"]
             model_name = NormalizedModelName(obj["model"])
             findings = []
-            if pk > self.max_seen_pk:
-                self.max_seen_pk = pk
+            if (
+                self.max_seen_ordinal_value is None
+                or ordinal_value > self.max_seen_ordinal_value  # type: ignore
+            ):
+                self.max_seen_ordinal_value = ordinal_value
             else:
+                # Only `pk`-based collisions are reported here; collisions for custom ordinals are
+                # caught earlier.
+                assert not isinstance(self.max_seen_ordinal_value, tuple)
+
                 findings.append(
                     ComparatorFinding(
                         kind=ComparatorFindingKind.UnorderedInput,
                         on=InstanceID(str(model_name), self.next_ordinal),
                         left_pk=pk if side == Side.left else None,
                         right_pk=pk if side == Side.right else None,
-                        reason=f"""instances not listed in ascending `pk` order; `pk` {pk} is less than or equal to {self.max_seen_pk} which precedes it""",
+                        reason=f"""instances not listed in ascending `pk` order; `pk` {pk} is less than or equal to {self.max_seen_ordinal_value} which precedes it""",
                     )
                 )
 
             obj["ordinal"] = self.next_ordinal
             self.next_ordinal += 1
-            return (obj["ordinal"], findings if findings else [])
+
+            return (InstanceID(str(model_name), obj["ordinal"]), findings if findings else [])
 
     OrdinalCounters = Dict[NormalizedModelName, OrdinalCounter]
-    ModelMap = Dict[InstanceID, JSONData]
+    ModelMap = Dict[NormalizedModelName, OrderedDict[InstanceID, JSONData]]
 
     def build_model_map(
         models: JSONData, side: Side, findings: ComparatorFindings
     ) -> Tuple[ModelMap, OrdinalCounters]:
         """Does two things in tandem: builds a map of InstanceID -> JSON model, and simultaneously builds a map of model name -> number of ordinals assigned."""
 
-        model_map: ModelMap = {}
+        from sentry.db.models import BaseModel
+        from sentry.models.user import User
+
+        model_map: ModelMap = defaultdict(ordereddict)
         ordinal_counters: OrdinalCounters = defaultdict(OrdinalCounter)
+        need_ordering: dict[NormalizedModelName, Dict[tuple, JSONData]] = defaultdict(dict)
+        pks_to_usernames: dict[int, str] = dict()
+
         for model in models:
+            pk = model["pk"]
             model_name = NormalizedModelName(model["model"])
-            counter = ordinal_counters[model_name]
-            ordinal, found = counter.assign(model, side)
-            findings.extend(found)
-            id = InstanceID(str(model_name), ordinal)
-            model_map[id] = model
+            model_type = get_model(model_name)
+            if model_type is None or not issubclass(model_type, BaseModel):
+                raise RuntimeError("Unknown model class")
+
+            if model_type == User:
+                pks_to_usernames[pk] = model["fields"]["username"]
+
+            custom_ordinal_fields = model_type.get_relocation_ordinal_fields()
+            if custom_ordinal_fields is None:
+                id, found = ordinal_counters[model_name].assign(model, pk, side)
+                findings.extend(found)
+                model_map[model_name][id] = model
+                continue
+
+            custom_ordinal_parts = []
+            for field in custom_ordinal_fields:
+                # Special case: for `user` pks, look through the user to the `username` instead.
+                if field == "user" or field == "user_id":
+                    custom_ordinal_parts.append(pks_to_usernames[model["fields"][field]])
+                else:
+                    custom_ordinal_parts.append(model["fields"][field])
+
+            ordinal = tuple(custom_ordinal_parts)
+            if need_ordering[model_name].get(ordinal) is not None:
+                findings.append(
+                    ComparatorFinding(
+                        kind=ComparatorFindingKind.DuplicateCustomOrdinal,
+                        on=InstanceID(str(model_name), None),
+                        left_pk=pk if side == Side.left else None,
+                        right_pk=pk if side == Side.right else None,
+                        reason=f"""custom ordinal value `{ordinal}` appears multiple times""",
+                    )
+                )
+
+            need_ordering[model_name][ordinal] = model
+
+        for model_name, models in need_ordering.items():
+            # Sort the models by key, which is a tuple of ordered custom ordinal field values,
+            # specific to the model in question.
+            ordered_models = dict(sorted(models.items()))
+            for ordinal_value, model in ordered_models.items():
+                id, found = ordinal_counters[model_name].assign(model, ordinal_value, side)
+                findings.extend(found)
+                model_map[model_name][id] = model
+
         return (model_map, ordinal_counters)
 
     def json_lines(obj: JSONData) -> list[str]:
@@ -134,59 +193,63 @@ def validate(
 
     # Save the pk -> ordinal mapping on both sides, so that we can decode foreign keys into this
     # model that we encounter later.
-    for id, right in right_models.items():
-        if id.ordinal is None:
-            raise RuntimeError("all InstanceIDs used for comparisons must have their ordinal set")
-
-        left = left_models[id]
-        left_pk_map.insert(
-            NormalizedModelName(id.model), left_models[id]["pk"], id.ordinal, ImportKind.Inserted
-        )
-        right_pk_map.insert(
-            NormalizedModelName(id.model), right["pk"], id.ordinal, ImportKind.Inserted
-        )
+    for model_name, models in right_models.items():
+        for id, right in models.items():
+            assert id.ordinal is not None
+
+            left = left_models[model_name][id]
+            left_pk_map.insert(
+                NormalizedModelName(id.model),
+                left_models[model_name][id]["pk"],
+                id.ordinal,
+                ImportKind.Inserted,
+            )
+            right_pk_map.insert(
+                NormalizedModelName(id.model), right["pk"], id.ordinal, ImportKind.Inserted
+            )
 
     # We only perform custom comparisons and JSON diffs on non-duplicate entries that exist in both
     # outputs.
-    for id, right in right_models.items():
-        if id.ordinal is None:
-            raise RuntimeError("all InstanceIDs used for comparisons must have their ordinal set")
-
-        # Try comparators applicable for this specific model.
-        left = left_models[id]
-        if id.model in comparators:
-            # We take care to run ALL of the `compare()` methods on each comparator before calling
-            # any `scrub()` methods. This ensures that, in cases where a single model uses multiple
-            # comparators that touch the same fields, one comparator does not accidentally scrub the
-            # inputs for its follower. If `compare()` functions are well-behaved (that is, they
-            # don't mutate their inputs), this should be sufficient to ensure that the order in
-            # which comparators are applied does not change the final output.
-            for cmp in comparators[id.model]:
-                ex = cmp.existence(id, left, right)
-                if ex:
-                    findings.extend(ex)
-                    continue
-
-                if isinstance(cmp, ForeignKeyComparator):
-                    cmp.set_primary_key_maps(left_pk_map, right_pk_map)
-
-                res = cmp.compare(id, left, right)
-                if res:
-                    findings.extend(res)
-            for cmp in comparators[id.model]:
-                cmp.scrub(left, right)
-
-        # Finally, perform a diff on the remaining JSON.
-        diff = list(unified_diff(json_lines(left["fields"]), json_lines(right["fields"]), n=15))
-        if diff:
-            findings.append(
-                ComparatorFinding(
-                    kind=ComparatorFindingKind.UnequalJSON,
-                    on=id,
-                    left_pk=left["pk"],
-                    right_pk=right["pk"],
-                    reason="\n    " + "\n    ".join(diff),
+    for model_name, models in right_models.items():
+        for id, right in models.items():
+            assert id.ordinal is not None
+
+            # Try comparators applicable for this specific model.
+            left = left_models[model_name][id]
+            if id.model in comparators:
+                # We take care to run ALL of the `compare()` methods on each comparator before
+                # calling any `scrub()` methods. This ensures that, in cases where a single model
+                # uses multiple comparators that touch the same fields, one comparator does not
+                # accidentally scrub the inputs for its follower. If `compare()` functions are
+                # well-behaved (that is, they don't mutate their inputs), this should be sufficient
+                # to ensure that the order in which comparators are applied does not change the
+                # final output.
+                for cmp in comparators[id.model]:
+                    ex = cmp.existence(id, left, right)
+                    if ex:
+                        findings.extend(ex)
+                        continue
+
+                    if isinstance(cmp, ForeignKeyComparator):
+                        cmp.set_primary_key_maps(left_pk_map, right_pk_map)
+
+                    res = cmp.compare(id, left, right)
+                    if res:
+                        findings.extend(res)
+                for cmp in comparators[id.model]:
+                    cmp.scrub(left, right)
+
+            # Finally, perform a diff on the remaining JSON.
+            diff = list(unified_diff(json_lines(left["fields"]), json_lines(right["fields"]), n=15))
+            if diff:
+                findings.append(
+                    ComparatorFinding(
+                        kind=ComparatorFindingKind.UnequalJSON,
+                        on=id,
+                        left_pk=left["pk"],
+                        right_pk=right["pk"],
+                        reason="\n    " + "\n    ".join(diff),
+                    )
                 )
-            )
 
     return findings

+ 20 - 0
src/sentry/db/models/base.py

@@ -50,6 +50,11 @@ class BaseModel(models.Model):
     __relocation_scope__: RelocationScope | set[RelocationScope]
     __relocation_dependencies__: set[str]
 
+    # Some models have a globally unique identifier, like a UUID. This should be a set of one or
+    # more fields, none of which are foreign keys, that are `unique=True` or `unique_together` for
+    # an entire Sentry instance.
+    __relocation_custom_ordinal__: list[str] | None = None
+
     objects: ClassVar[BaseManager[Self]] = BaseManager()
 
     update = update
@@ -123,6 +128,21 @@ class BaseModel(models.Model):
 
         return self.__relocation_scope__
 
+    @classmethod
+    def get_relocation_ordinal_fields(self) -> None | list[str]:
+        """
+        Retrieves the custom ordinal fields for models that may be re-used at import time (that is,
+        the `write_relocation_import()` method may return an `ImportKind` besides
+        `ImportKind.Inserted`). In such cases, we want an ordering of models by a globally unique
+        value that is not the `pk`, to ensure that merged and inserted models are still ordered
+        correctly with respect to one another.
+        """
+
+        if self.__relocation_custom_ordinal__ is None:
+            return None
+
+        return self.__relocation_custom_ordinal__
+
     @classmethod
     def get_possible_relocation_scopes(cls) -> set[RelocationScope]:
         """

+ 1 - 0
src/sentry/models/email.py

@@ -22,6 +22,7 @@ class Email(Model):
 
     __relocation_scope__ = RelocationScope.User
     __relocation_dependencies__ = {"sentry.User"}
+    __relocation_custom_ordinal__ = ["email"]
 
     email = CIEmailField(_("email address"), unique=True, max_length=75)
     date_added = models.DateTimeField(default=timezone.now)

+ 1 - 0
src/sentry/models/options/option.py

@@ -32,6 +32,7 @@ class BaseOption(OverwritableConfigMixin, Model):
 
     # Subclasses should overwrite the relocation scope as appropriate.
     __relocation_scope__ = RelocationScope.Excluded
+    __relocation_custom_ordinal__ = ["key"]
 
     key = models.CharField(max_length=128, unique=True)
     last_updated = models.DateTimeField(default=timezone.now)

+ 2 - 0
src/sentry/models/relay.py

@@ -11,6 +11,7 @@ from sentry.db.models import Model, region_silo_only_model
 @region_silo_only_model
 class RelayUsage(OverwritableConfigMixin, Model):
     __relocation_scope__ = RelocationScope.Config
+    __relocation_custom_ordinal__ = ["relay_id", "version"]
 
     relay_id = models.CharField(max_length=64)
     version = models.CharField(max_length=32, default="0.0.1")
@@ -27,6 +28,7 @@ class RelayUsage(OverwritableConfigMixin, Model):
 @region_silo_only_model
 class Relay(OverwritableConfigMixin, Model):
     __relocation_scope__ = RelocationScope.Config
+    __relocation_custom_ordinal__ = ["relay_id"]
 
     relay_id = models.CharField(max_length=64, unique=True)
     public_key = models.CharField(max_length=200)

+ 2 - 0
src/sentry/models/user.py

@@ -86,6 +86,8 @@ class UserManager(BaseManager["User"], DjangoUserManager):
 @control_silo_only_model
 class User(BaseModel, AbstractBaseUser):
     __relocation_scope__ = RelocationScope.User
+    __relocation_custom_ordinal__ = ["username"]
+
     replication_version: int = 2
 
     id = BoundedBigAutoField(primary_key=True)

+ 2 - 1
src/sentry/models/useremail.py

@@ -46,6 +46,7 @@ class UserEmailManager(BaseManager["UserEmail"]):
 class UserEmail(ControlOutboxProducingModel):
     __relocation_scope__ = RelocationScope.User
     __relocation_dependencies__ = {"sentry.Email"}
+    __relocation_custom_ordinal__ = ["user", "email"]
 
     user = FlexibleForeignKey(settings.AUTH_USER_MODEL, related_name="emails")
     email = models.EmailField(_("email address"), max_length=75)
@@ -102,7 +103,7 @@ class UserEmail(ControlOutboxProducingModel):
         if old_pk is None:
             return None
 
-        # If we are merging users, ignore the imported email and use the merged user's email
+        # If we are merging users, ignore the imported email and use the existing user's email
         # instead.
         if pk_map.get_kind(get_model_name(User), old_user_id) == ImportKind.Existing:
             return None

Some files were not shown because too many files changed in this diff