Browse Source

feat(backup): Add dangling reference info to `dependencies()` (#57109)

This change introduces the idea of a "dangling" model: a model that has
no transitive, non-nullable references to one of the "root" models that
define their respective relocation scopes. This is important, because
dangling models cannot be filtered using the usual method of "does the
foreign key exist", since they do not reliably have foreign key
references into filtered models.

Changes have also been made to the graphviz visualization binary at
`bin/model-dependency-graphviz`. One may view the new output by invoking
something like `bin/model-dependency-graphviz | pbcopy` from a terminal
`cd`'ed to the root of this repository, and pasting the output to a
graphviz viewer like https://dreampuf.github.io/GraphvizOnline.

Issue: getsentry/team-ospo#196
Issue: getsentry/team-ospo#201
Alex Zaslavsky 1 year ago
parent
commit
37e595baf5

+ 40 - 25
bin/model-dependency-graphviz

@@ -30,23 +30,36 @@ digraph Models {
         node [shape="plaintext",style="none"]
         key1 [label=<<table border="0" cellpadding="2" cellspacing="0" cellborder="0">
             <tr><td align="right" port="i1">HybridCloudForeignKey</td></tr>
-            <tr><td align="right" port="i2">Explicit ForeignKey</td></tr>
-            <tr><td align="right" port="i3">Implicit ForeignKey</td></tr>
-            <tr><td align="right" port="i4">Control Silo Model</td></tr>
-            <tr><td align="right" port="i5">Region Silo Model</td></tr>
-            <tr><td align="right" port="i6">Unexported Model</td></tr>
+            <tr><td align="right" port="i2">HybridCloudForeignKey (nullable)</td></tr>
+            <tr><td align="right" port="i3">Explicit ForeignKey</td></tr>
+            <tr><td align="right" port="i4">Explicit ForeignKey (nullable)</td></tr>
+            <tr><td align="right" port="i5">Implicit ForeignKey</td></tr>
+            <tr><td align="right" port="i6">Implicit ForeignKey (nullable)</td></tr>
+            <tr><td align="right" port="i7">Control Silo Model</td></tr>
+            <tr><td align="right" port="i8">Control Silo Model (dangling)</td></tr>
+            <tr><td align="right" port="i9">Region Silo Model</td></tr>
+            <tr><td align="right" port="i10">Region Silo Model (dangling)</td></tr>
+            <tr><td align="right" port="i11">Unexported Model</td></tr>
         </table>>]
         key2 [label=<<table border="0" cellpadding="2" cellspacing="0" cellborder="0">
             <tr><td port="i1">&nbsp;</td></tr>
             <tr><td port="i2">&nbsp;</td></tr>
             <tr><td port="i3">&nbsp;</td></tr>
-            <tr><td port="i4" bgcolor="lightcoral">&nbsp;</td></tr>
-            <tr><td port="i5" bgcolor="lightblue">&nbsp;</td></tr>
-            <tr><td port="i6" bgcolor="grey">&nbsp;</td></tr>
+            <tr><td port="i4">&nbsp;</td></tr>
+            <tr><td port="i5">&nbsp;</td></tr>
+            <tr><td port="i6">&nbsp;</td></tr>
+            <tr><td port="i7" bgcolor="#ffb6c1ff">&nbsp;</td></tr>
+            <tr><td port="i8" bgcolor="#ffb6c166">&nbsp;</td></tr>
+            <tr><td port="i9" bgcolor="#add8e6ff">&nbsp;</td></tr>
+            <tr><td port="i10" bgcolor="#add8e666">&nbsp;</td></tr>
+            <tr><td port="i11" bgcolor="#c0c0c0ff">&nbsp;</td></tr>
         </table>>]
-        key1:i1:e -> key2:i1:w [color=green]
-        key1:i2:e -> key2:i2:w [color=blue]
-        key1:i3:e -> key2:i3:w [color=red]
+        key1:i1:e -> key2:i1:w [color="#008b00ff",style=solid]
+        key1:i2:e -> key2:i2:w [color="#008b0066",style=dashed]
+        key1:i3:e -> key2:i3:w [color="#0000eeff",style=solid]
+        key1:i4:e -> key2:i4:w [color="#0000ee66",style=dashed]
+        key1:i5:e -> key2:i5:w [color="#cd0000ff",style=solid]
+        key1:i6:e -> key2:i6:w [color="#cd000066",style=dashed]
     }
 
     $clusters
@@ -63,7 +76,7 @@ cluster = Template(
         shape="rectangle"
         fillcolor="$fill"
         fontsize="40"
-        color="grey"
+        color="#c0c0c0"
 
         $nodes
     }
@@ -73,28 +86,30 @@ cluster = Template(
 
 @unique
 class ClusterColor(Enum):
-    Purple = "lavenderblush"
-    Yellow = "khaki"
-    Green = "honeydew"
-    Blue = "lightsteelblue1"
+    Purple = "#fff0f5"  # lavenderblush
+    Yellow = "#f0e68c"  # khaki
+    Green = "#f0fff0"  # honeydew
+    Blue = "#cae1ff"  # lightsteelblue1
 
 
 @unique
 class NodeColor(Enum):
-    Red = "lightpink"
-    Blue = "lightblue"
+    Red = "#ffb6c1"  # lightpink
+    Blue = "#add8e6"  # lightblue
 
 
 @unique
 class EdgeColor(Enum):
-    Hybrid = "color=green"
-    Explicit = "color=blue"
-    Implicit = "color=red"
+    Hybrid = "#008b00"  # green4
+    Explicit = "#0000ee"  # blue2
+    Implicit = "#cd0000"  # red3
 
 
-def print_model_node(model: models.base.ModelBase, silo: SiloMode) -> str:
+def print_model_node(mr: ModelRelations, silo: SiloMode) -> str:
+    id = mr.model.__name__
     color = NodeColor.Red if silo == SiloMode.CONTROL else NodeColor.Blue
-    return f""""{model.__name__}" [fillcolor="{color.value}"];"""
+    opacity = "66" if mr.dangling else "ff"
+    return f""""{id}" [fillcolor="{color.value}{opacity}",color="#000000{opacity}"];"""
 
 
 def print_rel_scope_subgraph(
@@ -104,7 +119,7 @@ def print_rel_scope_subgraph(
         num=num,
         name=name,
         fill=color.value,
-        nodes="\n        ".join([print_model_node(mr.model, mr.silos[0]) for mr in rels]),
+        nodes="\n        ".join([print_model_node(mr, mr.silos[0]) for mr in rels]),
     )
 
 
@@ -123,7 +138,7 @@ def print_edge(src: models.base.ModelBase, dest: models.base.ModelBase, field: F
     elif field.kind == ForeignFieldKind.ImplicitForeignKey:
         color = EdgeColor.Implicit
     style = "dashed" if field.nullable else "solid"
-    return f""""{src.__name__}":e -> "{dest.__name__}":w [{color.value},style={style}];"""
+    return f""""{src.__name__}":e -> "{dest.__name__}":w [color="{color.value}",style={style}];"""
 
 
 def get_most_permissive_relocation_scope(mr: ModelRelations) -> RelocationScope:

File diff suppressed because it is too large
+ 127 - 10
fixtures/backup/model_dependencies/detailed.json


+ 149 - 49
src/sentry/backup/dependencies.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from collections import defaultdict
+from dataclasses import dataclass
 from enum import Enum, auto, unique
 from functools import lru_cache
 from typing import Dict, FrozenSet, NamedTuple, Optional, Set, Tuple, Type
@@ -14,6 +15,71 @@ from sentry.silo import SiloMode
 from sentry.utils import json
 
 
+class NormalizedModelName:
+    """
+    A wrapper type that ensures that the contained model name has been properly normalized. A "normalized" model name is one that is identical to the name as it appears in an exported JSON backup, so a string of the form `{app_label.lower()}.{model_name.lower()}`.
+    """
+
+    __model_name: str
+
+    def __init__(self, model_name: str):
+        if "." not in model_name:
+            raise TypeError("cannot create NormalizedModelName from invalid input string")
+        self.__model_name = model_name.lower()
+
+    def __hash__(self):
+        return hash(self.__model_name)
+
+    def __eq__(self, other) -> bool:
+        if other is None:
+            return False
+        if not isinstance(other, self.__class__):
+            raise TypeError(
+                "NormalizedModelName can only be compared with other NormalizedModelName"
+            )
+        return self.__model_name == other.__model_name
+
+    def __lt__(self, other) -> bool:
+        if not isinstance(other, self.__class__):
+            raise TypeError(
+                "NormalizedModelName can only be compared with other NormalizedModelName"
+            )
+        return self.__model_name < other.__model_name
+
+    def __str__(self) -> str:
+        return self.__model_name
+
+
+# A "root" model is one that is the source of a particular relocation scope - ex, `User` is the root
+# of the `User` relocation scope, the model from which all other (non-dangling; see below) models in
+# that scope are referenced.
+#
+# TODO(getsentry/team-ospo#190): We should find a better way to store this information than a magic
+# list in this file. We should probably make a field (or method?) on `BaseModel` instead.
+@unique
+class RelocationRootModels(Enum):
+    """
+    Record the "root" models for a given `RelocationScope`.
+    """
+
+    Excluded: list[NormalizedModelName] = []
+    User = [NormalizedModelName("sentry.user")]
+    Organization = [NormalizedModelName("sentry.organization")]
+    Config = [
+        NormalizedModelName("sentry.controloption"),
+        NormalizedModelName("sentry.option"),
+        NormalizedModelName("sentry.relay"),
+        NormalizedModelName("sentry.relayusage"),
+        NormalizedModelName("sentry.userrole"),
+    ]
+    # TODO(getsentry/team-ospo#188): Split out extension scope root models from this list.
+    Global = [
+        NormalizedModelName("sentry.apiapplication"),
+        NormalizedModelName("sentry.integration"),
+        NormalizedModelName("sentry.sentryapp"),
+    ]
+
+
 @unique
 class ForeignFieldKind(Enum):
     """Kinds of foreign fields that we care about."""
@@ -47,9 +113,20 @@ class ForeignField(NamedTuple):
     nullable: bool
 
 
-class ModelRelations(NamedTuple):
+@dataclass
+class ModelRelations:
     """What other models does this model depend on, and how?"""
 
+    # A "dangling" model is one that does not transitively contain a non-nullable `ForeignField`
+    # reference to at least one of the `RelocationRootModels` listed above.
+    #
+    # TODO(getsentry/team-ospo#190): A model may or may not be "dangling" in different
+    # `ExportScope`s - for example, a model in `RelocationScope.Organization` may have a single,
+    # non-nullable `ForeignField` reference to a root model in `RelocationScope.Config`. This would
+    # cause it to be dangling when we do an `ExportScope.Organization` export, but non-dangling if
+    # we do an `ExportScope.Global` export. HOWEVER, as best as I can tell, this situation does not
+    # actually exist today, so we can ignore this subtlety for now and just us a boolean here.
+    dangling: Optional[bool]
     foreign_keys: dict[str, ForeignField]
     model: Type[models.base.Model]
     relocation_scope: RelocationScope | set[RelocationScope]
@@ -63,41 +140,6 @@ class ModelRelations(NamedTuple):
         return {ff.model for ff in self.foreign_keys.values()}
 
 
-class NormalizedModelName:
-    """
-    A wrapper type that ensures that the contained model name has been properly normalized. A "normalized" model name is one that is identical to the name as it appears in an exported JSON backup, so a string of the form `{app_label.lower()}.{model_name.lower()}`.
-    """
-
-    __model_name: str
-
-    def __init__(self, model_name: str):
-        if "." not in model_name:
-            raise TypeError("cannot create NormalizedModelName from invalid input string")
-        self.__model_name = model_name.lower()
-
-    def __hash__(self):
-        return hash(self.__model_name)
-
-    def __eq__(self, other) -> bool:
-        if other is None:
-            return False
-        if not isinstance(other, self.__class__):
-            raise TypeError(
-                "NormalizedModelName can only be compared with other NormalizedModelName"
-            )
-        return self.__model_name == other.__model_name
-
-    def __lt__(self, other) -> bool:
-        if not isinstance(other, self.__class__):
-            raise TypeError(
-                "NormalizedModelName can only be compared with other NormalizedModelName"
-            )
-        return self.__model_name < other.__model_name
-
-    def __str__(self) -> str:
-        return self.__model_name
-
-
 def get_model_name(model: type[models.Model] | models.Model) -> NormalizedModelName:
     return NormalizedModelName(f"{model._meta.app_label}.{model._meta.object_name}")
 
@@ -119,6 +161,8 @@ class DependenciesJSONEncoder(json.JSONEncoder):
     def default(self, obj):
         if meta := getattr(obj, "_meta", None):
             return f"{meta.app_label}.{meta.object_name}".lower()
+        if isinstance(obj, ModelRelations):
+            return obj.__dict__
         if isinstance(obj, ForeignFieldKind):
             return obj.name
         if isinstance(obj, RelocationScope):
@@ -226,8 +270,8 @@ def dependencies() -> dict[NormalizedModelName, ModelRelations]:
     from sentry.models.actor import Actor
     from sentry.models.team import Team
 
-    # Process the list of models, and get the list of dependencies
-    model_dependencies_list: Dict[NormalizedModelName, ModelRelations] = {}
+    # Process the list of models, and get the list of dependencies.
+    model_dependencies_dict: Dict[NormalizedModelName, ModelRelations] = {}
     app_configs = apps.get_app_configs()
     models_from_names = {
         get_model_name(model): model
@@ -285,6 +329,10 @@ def dependencies() -> dict[NormalizedModelName, ModelRelations]:
                 field for field in model._meta.get_fields() if isinstance(field, OneToOneField)
             ]
             for field in one_to_one_fields:
+                is_nullable = getattr(field, "null", False)
+                if getattr(field, "unique", False):
+                    uniques.add(frozenset({field.name}))
+
                 rel_model = getattr(field.remote_field, "model", None)
                 if rel_model is not None and rel_model != model:
                     if isinstance(field, OneToOneCascadeDeletes):
@@ -311,6 +359,10 @@ def dependencies() -> dict[NormalizedModelName, ModelRelations]:
                 or isinstance(field, BoundedPositiveIntegerField)
             ]
             for field in simple_integer_fields:
+                is_nullable = getattr(field, "null", False)
+                if getattr(field, "unique", False):
+                    uniques.add(frozenset({field.name}))
+
                 # "actor_id", when used as a simple integer field rather than a `ForeignKey` into an
                 # `Actor`, refers to a unified but loosely specified means by which to index into a
                 # either a `Team` or `User`, before this pattern was formalized by the official
@@ -322,12 +374,13 @@ def dependencies() -> dict[NormalizedModelName, ModelRelations]:
                         foreign_keys[field.name] = ForeignField(
                             model=models_from_names[candidate],
                             kind=ForeignFieldKind.ImplicitForeignKey,
-                            nullable=False,
+                            nullable=is_nullable,
                         )
 
-            model_dependencies_list[get_model_name(model)] = ModelRelations(
-                model=model,
+            model_dependencies_dict[get_model_name(model)] = ModelRelations(
+                dangling=None,
                 foreign_keys=foreign_keys,
+                model=model,
                 relocation_scope=getattr(model, "__relocation_scope__", RelocationScope.Excluded),
                 silos=list(
                     getattr(model._meta, "silo_limit", ModelSiloLimit(SiloMode.MONOLITH)).modes
@@ -336,7 +389,54 @@ def dependencies() -> dict[NormalizedModelName, ModelRelations]:
                 # Sort the constituent sets alphabetically, so that we get consistent JSON output.
                 uniques=sorted(list(uniques), key=lambda u: ":".join(sorted(list(u)))),
             )
-    return model_dependencies_list
+
+    # Get a flat list of "root" models, then mark all of them as non-dangling.
+    relocation_root_models: list[NormalizedModelName] = []
+    for root_models in RelocationRootModels:
+        relocation_root_models.extend(root_models.value)
+    for model_name in relocation_root_models:
+        model_dependencies_dict[model_name].dangling = False
+
+    # Now that all `ModelRelations` have been added to the `model_dependencies_dict`, we can circle
+    # back and figure out which ones are actually dangling. We do this by marking all of the root
+    # models non-dangling, then traversing from every other model to a (possible) root model
+    # recursively. At this point there should be no circular reference chains, so if we encounter
+    # them, fail immediately.
+    def resolve_dangling(seen: Set[NormalizedModelName], model_name: NormalizedModelName) -> bool:
+        model_relations = model_dependencies_dict[model_name]
+        model_name = get_model_name(model_relations.model)
+        if model_name in seen:
+            raise RuntimeError(
+                f"Circular dependency: {model_name} cannot transitively reference itself"
+            )
+        if model_relations.dangling is not None:
+            return model_relations.dangling
+
+        # TODO(getsentry/team-ospo#190): Maybe make it so that `Global` models are never "dangling",
+        # since we want to export 100% of models in `ExportScope.Global` anyway?
+
+        seen.add(model_name)
+
+        # If we are able to successfully over all of the foreign keys without encountering a
+        # dangling reference, we know that this model is dangling as well.
+        model_relations.dangling = True
+        for ff in model_relations.foreign_keys.values():
+            if not ff.nullable:
+                foreign_model_name = get_model_name(ff.model)
+                if not resolve_dangling(seen, foreign_model_name):
+                    # We only need one non-dangling reference to mark this model as non-dangling as
+                    # well.
+                    model_relations.dangling = False
+                    break
+
+        seen.remove(model_name)
+        return model_relations.dangling
+
+    for model_name in model_dependencies_dict.keys():
+        if model_name not in relocation_root_models:
+            resolve_dangling(set(), model_name)
+
+    return model_dependencies_dict
 
 
 # No arguments, so we lazily cache the result after the first calculation.
@@ -347,9 +447,9 @@ def sorted_dependencies() -> list[Type[models.base.Model]]:
     Similar to Django's algorithm except that we discard the importance of natural keys
     when sorting dependencies (ie, it works without them)."""
 
-    model_dependencies_list = list(dependencies().values())
-    model_dependencies_list.reverse()
-    model_set = {md.model for md in model_dependencies_list}
+    model_dependencies_dict = list(dependencies().values())
+    model_dependencies_dict.reverse()
+    model_set = {md.model for md in model_dependencies_dict}
 
     # Now sort the models to ensure that dependencies are met. This
     # is done by repeatedly iterating over the input list of models.
@@ -360,11 +460,11 @@ def sorted_dependencies() -> list[Type[models.base.Model]]:
     # If we do a full iteration without a promotion, that means there are
     # circular dependencies in the list.
     model_list = []
-    while model_dependencies_list:
+    while model_dependencies_dict:
         skipped = []
         changed = False
-        while model_dependencies_list:
-            model_deps = model_dependencies_list.pop()
+        while model_dependencies_dict:
+            model_deps = model_dependencies_dict.pop()
             deps = model_deps.flatten()
             model = model_deps.model
 
@@ -388,6 +488,6 @@ def sorted_dependencies() -> list[Type[models.base.Model]]:
                     for m in sorted(skipped, key=lambda mr: get_model_name(mr.model))
                 )
             )
-        model_dependencies_list = skipped
+        model_dependencies_dict = skipped
 
     return model_list

Some files were not shown because too many files changed in this diff