Browse Source

feat(backup): Add ForeignKeyComparator (#54558)

Given two instances of a model that are being validated for equality, we
want the foreign keys to be correct relatively (ie, they point to the
same model in the respective models' JSON blobs), not absolutely (ie,
they are literally the same integer). By creating maps that store the
relations between pks and ordinals, we can easily check that the models
point to match their respective ordinals regardless of the actual pk
numbers.

Issue: getsentry/team-ospo#171
Alex Zaslavsky 1 year ago
parent
commit
60f2390092

+ 79 - 0
src/sentry/backup/comparators.py

@@ -7,6 +7,7 @@ from typing import Callable, Dict, List
 from dateutil import parser
 from django.db import models
 
+from sentry.backup.dependencies import PrimaryKeyMap, dependencies
 from sentry.backup.findings import ComparatorFinding, ComparatorFindingKind, InstanceID
 from sentry.backup.helpers import Side, get_exportable_final_derivations_of
 from sentry.db.models import BaseModel
@@ -200,6 +201,73 @@ class DatetimeEqualityComparator(JSONScrubbingComparator):
         return findings
 
 
+class ForeignKeyComparator(JSONScrubbingComparator):
+    """Ensures that foreign keys match in a relative (they refer to the same other model in their
+    respective JSON blobs) rather than absolute (they have literally the same integer value)
+    sense."""
+
+    left_pk_map: PrimaryKeyMap | None = None
+    right_pk_map: PrimaryKeyMap | None = None
+
+    def __init__(self, foreign_fields: dict[str, models.base.ModelBase]):
+        super().__init__(*(foreign_fields.keys()))
+        self.foreign_fields = foreign_fields
+
+    def set_primary_key_maps(self, left_pk_map: PrimaryKeyMap, right_pk_map: PrimaryKeyMap):
+        """Call this function before running the comparator, to ensure that it has access to the latest mapping information for both sides of the comparison."""
+
+        self.left_pk_map = left_pk_map
+        self.right_pk_map = right_pk_map
+
+    def compare(self, on: InstanceID, left: JSONData, right: JSONData) -> list[ComparatorFinding]:
+        findings = []
+        fields = sorted(self.fields)
+        for f in fields:
+            field_model_name = "sentry." + self.foreign_fields[f].__name__.lower()
+            if left["fields"].get(f) is None and right["fields"].get(f) is None:
+                continue
+
+            if self.left_pk_map is None or self.right_pk_map is None:
+                raise RuntimeError("must call `set_primary_key_maps` before comparing")
+
+            left_fk_as_ordinal = self.left_pk_map.get(field_model_name, left["fields"][f])
+            right_fk_as_ordinal = self.right_pk_map.get(field_model_name, right["fields"][f])
+            if left_fk_as_ordinal is None or right_fk_as_ordinal is None:
+                if left_fk_as_ordinal is None:
+                    findings.append(
+                        ComparatorFinding(
+                            kind=self.get_kind(),
+                            on=on,
+                            left_pk=left["pk"],
+                            right_pk=right["pk"],
+                            reason=f"""the left foreign key ordinal for `{f}` model with pk `{left["fields"][f]}` could not be found""",
+                        )
+                    )
+                if right_fk_as_ordinal is None:
+                    findings.append(
+                        ComparatorFinding(
+                            kind=self.get_kind(),
+                            on=on,
+                            left_pk=left["pk"],
+                            right_pk=right["pk"],
+                            reason=f"""the right foreign key ordinal for `{f}` model with pk `{right["fields"][f]}` could not be found""",
+                        )
+                    )
+                continue
+
+            if left_fk_as_ordinal != right_fk_as_ordinal:
+                findings.append(
+                    ComparatorFinding(
+                        kind=self.get_kind(),
+                        on=on,
+                        left_pk=left["pk"],
+                        right_pk=right["pk"],
+                        reason=f"""the left foreign key ordinal ({left_fk_as_ordinal}) for `{f}` was not equal to the right foreign key ordinal ({right_fk_as_ordinal})""",
+                    )
+                )
+        return findings
+
+
 class ObfuscatingComparator(JSONScrubbingComparator, ABC):
     """Comparator that compares private values, but then safely truncates them to ensure that they
     do not leak out in logs, stack traces, etc."""
@@ -338,6 +406,16 @@ def auto_assign_email_obfuscating_comparators(comps: ComparatorMap) -> None:
                 comps[name].append(EmailObfuscatingComparator(*assign))
 
 
+def auto_assign_foreign_key_comparators(comps: ComparatorMap) -> None:
+    """Automatically assigns the ForeignKeyComparator or to all appropriate model fields (see
+    dependencies.py for more on what "appropriate" means in this context)."""
+
+    for model_name, rels in dependencies().items():
+        comps[model_name.lower()].append(
+            ForeignKeyComparator({k: v.model for k, v in rels.foreign_keys.items()})
+        )
+
+
 ComparatorList = List[JSONScrubbingComparator]
 ComparatorMap = Dict[str, ComparatorList]
 
@@ -380,6 +458,7 @@ def build_default_comparators():
     # to the `DEFAULT_COMPARATORS` map.
     auto_assign_datetime_equality_comparators(comparators)
     auto_assign_email_obfuscating_comparators(comparators)
+    auto_assign_foreign_key_comparators(comparators)
 
     return comparators
 

+ 32 - 0
src/sentry/backup/dependencies.py

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections import defaultdict
 from enum import Enum, auto, unique
 from typing import NamedTuple
 
@@ -71,6 +72,37 @@ class DependenciesJSONEncoder(json.JSONEncoder):
         return super().default(obj)
 
 
+class PrimaryKeyMap:
+    """
+    A map between a primary key in one primary key sequence (like a database) and another (like the
+    ordinals in a backup JSON file). As models are moved between databases, their absolute contents
+    may stay the same, but their relative identifiers may change. This class allows us to track how
+    those relations have been transformed, thereby preserving the model references between one
+    another.
+
+    Note that the map assumes that the primary keys in question are integers. In particular, natural
+    keys are not supported!
+    """
+
+    mapping: dict[str, dict[int, int]]
+
+    def __init__(self):
+        self.mapping = defaultdict(dict)
+
+    def get(self, model: str, old: int) -> int | None:
+        """Get the new, post-mapping primary key from an old primary key."""
+
+        pk_map = self.mapping.get(model)
+        if pk_map is None:
+            return None
+        return pk_map.get(old)
+
+    def insert(self, model: str, old: int, new: int):
+        """Create a new OLD_PK -> NEW_PK mapping for the given model."""
+
+        self.mapping[model][old] = new
+
+
 def dependencies() -> dict[str, ModelRelations]:
     """Produce a dictionary mapping model type definitions to a `ModelDeps` describing their dependencies."""
 

+ 7 - 0
src/sentry/backup/findings.py

@@ -62,6 +62,13 @@ class ComparatorFindingKind(IntEnum):
     # `None`.
     HashObfuscatingComparatorExistenceCheck = auto()
 
+    # Foreign key field comparison failed.
+    ForeignKeyComparator = auto()
+
+    # Failed to compare foreign key fields because one of the fields being compared was not present
+    # or `None`.
+    ForeignKeyComparatorExistenceCheck = auto()
+
     # Failed to compare an ignored field.
     IgnoredComparator = auto()
 

+ 25 - 1
src/sentry/backup/validate.py

@@ -5,7 +5,8 @@ from copy import deepcopy
 from difflib import unified_diff
 from typing import Dict, Tuple
 
-from sentry.backup.comparators import DEFAULT_COMPARATORS, ComparatorMap
+from sentry.backup.comparators import DEFAULT_COMPARATORS, ComparatorMap, ForeignKeyComparator
+from sentry.backup.dependencies import PrimaryKeyMap
 from sentry.backup.findings import (
     ComparatorFinding,
     ComparatorFindingKind,
@@ -115,10 +116,30 @@ def validate(
     if not findings.empty():
         return findings
 
+    # As models are compared, we will add their pk mapping to separate `PrimaryKeyMaps`. Then, when
+    # a foreign keyed field into the specific model is encountered, we will be able to ensure that
+    # both sides reference the correct model.
+    #
+    # For instance, we encounter the first `sentry.User` model on both the left and right side, with
+    # the left side having a `pk` of 123, and the right having `456`. This means that we want to map
+    # `[sentry.User][123] = 1` on the left and `[sentry.User][456] = 1`. Later, when we encounter
+    # foreign keys to a user model with `pk` 123 on the left and 456 on the right, we'll be able to
+    # dereference the map to ensure that those both point to the same model on their respective
+    # sides.
+    left_pk_map = PrimaryKeyMap()
+    right_pk_map = PrimaryKeyMap()
+
     # We only perform custom comparisons and JSON diffs on non-duplicate entries that exist in both
     # outputs.
     for id, right in right_models.items():
+        if id.ordinal is None:
+            raise RuntimeError("all InstanceIDs used for comparisons must have their ordinal set")
+
+        # Save the pk -> ordinal mapping on both sides, so that we can decode foreign keys into this
+        # model that we encounter later.
         left = left_models[id]
+        left_pk_map.insert(id.model, left["pk"], id.ordinal)
+        right_pk_map.insert(id.model, right["pk"], id.ordinal)
 
         # Try comparators applicable for this specific model.
         if id.model in comparators:
@@ -134,6 +155,9 @@ def validate(
                     findings.extend(ex)
                     continue
 
+                if isinstance(cmp, ForeignKeyComparator):
+                    cmp.set_primary_key_maps(left_pk_map, right_pk_map)
+
                 res = cmp.compare(id, left, right)
                 if res:
                     findings.extend(res)

+ 209 - 0
tests/sentry/backup/test_comparators.py

@@ -1,13 +1,17 @@
 from copy import deepcopy
 
+import pytest
+
 from sentry.backup.comparators import (
     DatetimeEqualityComparator,
     DateUpdatedComparator,
     EmailObfuscatingComparator,
+    ForeignKeyComparator,
     HashObfuscatingComparator,
     IgnoredComparator,
     ScrubbedData,
 )
+from sentry.backup.dependencies import PrimaryKeyMap, dependencies
 from sentry.backup.findings import ComparatorFindingKind, InstanceID
 from sentry.utils.json import JSONData
 
@@ -396,6 +400,211 @@ def test_good_hash_obfuscating_comparator_scrubbed():
     ]
 
 
+DEPENDENCIES = dependencies()
+
+
+def test_good_foreign_key_comparator():
+    cmp = ForeignKeyComparator(
+        {k: v.model for k, v in DEPENDENCIES["sentry.UserEmail"].foreign_keys.items()}
+    )
+    id = InstanceID("sentry.useremail", 0)
+    left_pk_map = PrimaryKeyMap()
+    left_pk_map.insert("sentry.user", 12, 1)
+    right_pk_map = PrimaryKeyMap()
+    right_pk_map.insert("sentry.user", 34, 1)
+    left: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 12,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    right: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 34,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    cmp.set_primary_key_maps(left_pk_map, right_pk_map)
+
+    assert not cmp.compare(id, left, right)
+
+
+def test_good_foreign_key_comparator_scrubbed():
+    cmp = ForeignKeyComparator(
+        {k: v.model for k, v in DEPENDENCIES["sentry.UserEmail"].foreign_keys.items()}
+    )
+    left: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 12,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    right = deepcopy(left)
+    cmp.scrub(left, right)
+    assert left["scrubbed"]
+    assert left["scrubbed"]["ForeignKeyComparator::user"] is ScrubbedData()
+
+    assert right["scrubbed"]
+    assert right["scrubbed"]["ForeignKeyComparator::user"] is ScrubbedData()
+
+
+def test_bad_foreign_key_comparator_set_primary_key_maps_not_called():
+    cmp = ForeignKeyComparator(
+        {k: v.model for k, v in DEPENDENCIES["sentry.UserEmail"].foreign_keys.items()}
+    )
+    id = InstanceID("sentry.useremail", 0)
+    left_pk_map = PrimaryKeyMap()
+    left_pk_map.insert("sentry.user", 12, 1)
+    right_pk_map = PrimaryKeyMap()
+    right_pk_map.insert("sentry.user", 34, 1)
+    left: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 12,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    right: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 34,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+
+    with pytest.raises(RuntimeError):
+        cmp.compare(id, left, right)
+
+
+def test_bad_foreign_key_comparator_unequal_mapping():
+    cmp = ForeignKeyComparator(
+        {k: v.model for k, v in DEPENDENCIES["sentry.UserEmail"].foreign_keys.items()}
+    )
+    id = InstanceID("sentry.useremail", 0)
+    left_pk_map = PrimaryKeyMap()
+    left_pk_map.insert("sentry.user", 12, 1)
+    right_pk_map = PrimaryKeyMap()
+    right_pk_map.insert("sentry.user", 34, 2)
+    left: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 12,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    right: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 34,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    cmp.set_primary_key_maps(left_pk_map, right_pk_map)
+
+    res = cmp.compare(id, left, right)
+    assert res
+    assert res[0]
+    assert res[0].kind == ComparatorFindingKind.ForeignKeyComparator
+    assert res[0].on == id
+    assert res[0].left_pk == 1
+    assert res[0].right_pk == 1
+    assert "`user`" in res[0].reason
+    assert "left foreign key ordinal (1)" in res[0].reason
+    assert "right foreign key ordinal (2)" in res[0].reason
+
+
+def test_bad_foreign_key_comparator_missing_mapping():
+    cmp = ForeignKeyComparator(
+        {k: v.model for k, v in DEPENDENCIES["sentry.UserEmail"].foreign_keys.items()}
+    )
+    id = InstanceID("sentry.useremail", 0)
+    left_pk_map = PrimaryKeyMap()
+    right_pk_map = PrimaryKeyMap()
+    left: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 12,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    right: JSONData = {
+        "model": "test",
+        "ordinal": 1,
+        "pk": 1,
+        "fields": {
+            "user": 34,
+            "email": "testing@example.com",
+            "validation_hash": "ABC123",
+            "date_hash_added": "2023-06-23T00:00:00.000Z",
+            "is_verified": True,
+        },
+    }
+    cmp.set_primary_key_maps(left_pk_map, right_pk_map)
+
+    res = cmp.compare(id, left, right)
+    assert len(res) == 2
+    assert res[0]
+    assert res[0].kind == ComparatorFindingKind.ForeignKeyComparator
+    assert res[0].on == id
+    assert res[0].left_pk == 1
+    assert res[0].right_pk == 1
+    assert "`user`" in res[0].reason
+    assert "left foreign key ordinal" in res[0].reason
+    assert "pk `12`" in res[0].reason
+
+    assert res[1]
+    assert res[1].kind == ComparatorFindingKind.ForeignKeyComparator
+    assert res[1].on == id
+    assert res[1].left_pk == 1
+    assert res[1].right_pk == 1
+    assert "`user`" in res[1].reason
+    assert "right foreign key ordinal" in res[1].reason
+    assert "pk `34`" in res[1].reason
+
+
 def test_good_ignored_comparator():
     cmp = IgnoredComparator("ignored_field")
     id = InstanceID("test", 0)

+ 73 - 0
tests/sentry/backup/test_correctness.py

@@ -430,6 +430,79 @@ def test_auto_assign_date_updated_comparator(tmp_path):
     assert not findings
 
 
+def test_auto_assign_foreign_key_comparator(tmp_path):
+    left = [
+        json.loads(
+            """
+            {
+                "model": "sentry.user",
+                "pk": 12,
+                "fields": {
+                    "password": "abc123",
+                    "last_login": null,
+                    "username": "testing@example.com",
+                    "name": "",
+                    "email": "testing@example.com"
+                }
+            }
+        """
+        )
+    ]
+    right = [
+        json.loads(
+            """
+            {
+                "model": "sentry.user",
+                "pk": 34,
+                "fields": {
+                    "password": "abc123",
+                    "last_login": null,
+                    "username": "testing@example.com",
+                    "name": "",
+                    "email": "testing@example.com"
+                }
+            }
+        """
+        )
+    ]
+
+    userrole_left = json.loads(
+        """
+            {
+                "model": "sentry.useremail",
+                "pk": 56,
+                "fields": {
+                    "user": 12,
+                    "email": "testing@example.com",
+                    "validation_hash": "ABC123",
+                    "date_hash_added": "2023-06-23T00:00:00.000Z",
+                    "is_verified": true
+                }
+            }
+        """
+    )
+    userrole_right = json.loads(
+        """
+            {
+                "model": "sentry.useremail",
+                "pk": 78,
+                "fields": {
+                    "user": 34,
+                    "email": "testing@example.com",
+                    "validation_hash": "ABC123",
+                    "date_hash_added": "2023-06-23T00:00:00.000Z",
+                    "is_verified": true
+                }
+            }
+        """
+    )
+    left.append(userrole_left)
+    right.append(userrole_right)
+    out = validate(left, right)
+    findings = out.findings
+    assert not findings
+
+
 def test_auto_assign_ignored_comparator(tmp_path):
     left = [
         json.loads(