Browse Source

feat(backup): Enable sanitization for User and Config scope models (#69162)

All models with the `Config` or `User` `RelocationScope` should now be
properly sanitized, with the exception of `*Option` models, which will
need more complex logic, to be added later.
Alex Zaslavsky 10 months ago
parent
commit
8d933cbcf9

+ 187 - 4
src/sentry/backup/sanitize.py

@@ -1,8 +1,12 @@
+import ipaddress
 from collections.abc import Callable
 from copy import deepcopy
 from dataclasses import dataclass
 from datetime import UTC, datetime, timedelta, timezone
+from ipaddress import IPv4Address, IPv6Address, ip_address
 from random import choice, randint
+from urllib.parse import urlparse, urlunparse
+from uuid import UUID, uuid4
 
 import petname
 from dateutil.parser import parse as parse_datetime
@@ -35,6 +39,17 @@ UPPER_CASE_NON_HEX = {
 LOWER_CASE_HEX = {c.lower() for c in UPPER_CASE_HEX}
 LOWER_CASE_NON_HEX = {c.lower() for c in UPPER_CASE_NON_HEX}
 
+MAX_IPV4 = (2**ipaddress.IPV4LENGTH) - 1
+MAX_IPV6 = (2**ipaddress.IPV6LENGTH) - 1
+
+
+def random_ipv4():
+    return str(ipaddress.IPv4Address(randint(0, MAX_IPV4)))
+
+
+def random_ipv6():
+    return str(ipaddress.IPv6Address(randint(0, MAX_IPV6)))
+
 
 class SanitizationError(Exception):
     """
@@ -64,7 +79,8 @@ class UnrecognizedDatetimeError(SanitizationError):
 @dataclass
 class SanitizableField:
     """
-    A pairing a of a `NormalizedModelName` with a field in that model, specifying the target for a sanitization operation.
+    A pairing of a `NormalizedModelName` with a field in that model, specifying the target for a
+    sanitization operation.
     """
 
     from sentry.backup.dependencies import NormalizedModelName
@@ -100,7 +116,8 @@ def _set_field_value(json: JSONData, field: SanitizableField, value: JSONData) -
 
 def default_string_sanitizer(old: str) -> str:
     """
-    Default string randomizer. Looks at the characters present in the source string to create a new, random string from a roughly similar set of characters.
+    Default string randomizer. Looks at the characters present in the source string to create a new,
+    random string from a roughly similar set of characters.
     """
 
     has_upper_case_hex = False
@@ -136,7 +153,7 @@ def default_string_sanitizer(old: str) -> str:
     if has_digit:
         chars += "0123456789"
 
-    return "".join([choice(list(chars)) for _ in range(3)])
+    return "".join([choice(list(chars)) for _ in range(len(old))])
 
 
 class Sanitizer:
@@ -174,7 +191,7 @@ class Sanitizer:
 
     def __init__(self, export: JSONData, datetime_offset: timedelta | None = None):
         self.json = export
-        self.interned_strings = dict()
+        self.interned_strings = {"": ""}  # Always map empty string to itself.
         self.interned_datetimes = dict()
 
         # Walk the data once, extracting any dates into a set.
@@ -234,6 +251,34 @@ class Sanitizer:
 
         return "@".join([self.map_string(p) for p in old.split("@")])
 
+    def map_ip(self, old: str) -> str:
+        """
+        Maps an IP with some randomly generated alternative. If the `old` IP has already been seen,
+        the already-generated value for that existing key will be used instead. If it has not, we'll
+        generate a new one. This ensures that all identical existing IPs are swapped with identical
+        replacements everywhere they occur.
+
+        If the `old` string is not a valid IP, it will be treated as a regular string.
+
+        If you wish to update an actual JSON model in-place with this newly generated IP, `set_ip()`
+        is the preferred method for doing so.
+        """
+
+        interned = self.interned_strings.get(old)
+        if interned is not None:
+            return interned
+
+        try:
+            old_ip = ip_address(old)
+        except ValueError:
+            return self.map_string(old)
+        else:
+            if isinstance(old_ip, IPv4Address):
+                self.interned_strings[old] = random_ipv4()
+            elif isinstance(old_ip, IPv6Address):
+                self.interned_strings[old] = random_ipv6()
+            return self.interned_strings[old]
+
     def map_name(self, old: str) -> str:
         """
         Maps a proper noun name with some randomly generated "petname" value (ex: "Hairy Tortoise").
@@ -285,6 +330,54 @@ class Sanitizer:
         self.interned_strings[old] = new
         return new
 
+    def map_url(self, old: str) -> str:
+        """
+        Map a URL in a manner that retains domain relationships - ie, all sanitized URLs from
+        domain `.foo` will now be from `.bar`. Scheme, subdomains, paths, etc will be retained in
+        kind (ie: if we have a path, we will sanitize it with a similar, interned one). This ensures
+        that all identical existing URLs are swapped with identical replacements everywhere they
+        occur.
+
+        If the `old` string is not a valid URL, it will be treated as a regular string.
+
+        If you wish to update an actual JSON model in-place with this newly generated URL,
+        `set_url()` is the preferred method for doing so.
+        """
+
+        url = urlparse(old)
+        hostname = url.hostname
+        if not hostname:
+            return self.map_string(old)
+
+        new_path = (
+            "" if not url.path else ".".join([self.map_string(p) for p in url.path.split(".")])
+        )
+        new_query = (
+            "" if not url.query else ".".join([self.map_string(q) for q in url.query.split(".")])
+        )
+        return urlunparse(
+            url._replace(
+                scheme=self.map_string(url.scheme) if url.scheme else "",
+                netloc=".".join([self.map_string(d) for d in hostname.split(".")]),
+                path=new_path,
+                query=new_query,
+                fragment=self.map_string(url.fragment) if url.fragment else "",
+            )
+        )
+
+    def map_uuid(self, old: str) -> str:
+        """
+        Maps a UUID. If the `old` UUID has already been seen, the already-generated value for that
+        existing key will be used instead. If it has not, we'll generate a new one. This ensures
+        that all identical existing UUIDs are swapped with identical replacements everywhere they
+        occur.
+
+        If you wish to update an actual JSON model in-place with this newly generated UUID,
+        `set_uuid()` is the preferred method for doing so.
+        """
+
+        return self.map_string(old, lambda _: str(uuid4()))
+
     def set_datetime(self, json: JSONData, field: SanitizableField) -> datetime | None:
         """
         Replaces a datetime by replacing it with a different, but still correctly ordered,
@@ -334,6 +427,34 @@ class Sanitizer:
 
         return _set_field_value(json, field, self.map_email(old))
 
+    def set_ip(
+        self,
+        json: JSONData,
+        field: SanitizableField,
+    ) -> str | None:
+        """
+        Replaces an IP with a randomly generated value. If the existing value of the IP has
+        already been seen, the already-generated value for that existing key will be used instead.
+        If it has not, we'll generate a new one. This ensures that all identical existing IPs are
+        swapped with identical replacements everywhere they occur.
+
+        This method updates the JSON in-place if the specified field is a non-null value, then
+        returns the newly generated replacement. If the specified field could not be found in the
+        supplied JSON model, `None` is returned instead.
+
+        If you wish to merely generate a string without updating the JSON in-place, consider using
+        `map_ip()` instead.
+        """
+
+        field.validate_json_model(json)
+        old = _get_field_value(json, field)
+        if old is None:
+            return None
+        if not isinstance(old, str):
+            raise TypeError("Existing value must be a string")
+
+        return _set_field_value(json, field, self.map_ip(old))
+
     def set_name(
         self,
         json: JSONData,
@@ -432,6 +553,68 @@ class Sanitizer:
 
         return _set_field_value(json, field, self.map_string(old, generate))
 
+    def set_url(
+        self,
+        json: JSONData,
+        field: SanitizableField,
+    ) -> str | None:
+        """
+        Replace a URL in a manner that retains domain relationships - ie, all sanitized URLs from
+        domain `.foo` will now be from `.bar`. Scheme, subdomains, paths, etc will be retained in
+        kind (ie: if we have a path, we will sanitize it with a similar, interned one). This ensures
+        that all identical existing URLs are swapped with identical replacements everywhere they
+        occur.
+
+        If the `old` string is not a valid URL, it will be treated as a regular string.
+
+        This method updates the JSON in-place if the specified field is a non-null value, then
+        returns the newly generated replacement. If the specified field could not be found in the
+        supplied JSON model, `None` is returned instead.
+
+        If you wish to merely generate a string without updating the JSON in-place, consider using
+        `map_url()` instead.
+        """
+
+        field.validate_json_model(json)
+        old = _get_field_value(json, field)
+        if old is None:
+            return None
+        if not isinstance(old, str):
+            raise TypeError("Existing value must be a string")
+
+        return _set_field_value(json, field, self.map_url(old))
+
+    def set_uuid(
+        self,
+        json: JSONData,
+        field: SanitizableField,
+    ) -> str | None:
+        """
+        Replaces a UUID with a randomly generated value. If the existing value of the UUID has
+        already been seen, the already-generated value for that existing key will be used instead.
+        If it has not, we'll generate a new one. This ensures that all identical existing UUIDs are
+        swapped with identical replacements everywhere they occur.
+
+        This method updates the JSON in-place if the specified field is a non-null value, then
+        returns the newly generated replacement. If the specified field could not be found in the
+        supplied JSON model, `None` is returned instead.
+
+        If you wish to merely generate a string without updating the JSON in-place, consider using
+        `map_uuid()` instead.
+        """
+
+        field.validate_json_model(json)
+        old = _get_field_value(json, field)
+        if old is None:
+            return None
+        if not isinstance(old, str):
+            raise TypeError("Existing value must be a string")
+
+        # Will raise a `ValueError` if this is not a valid UUID.
+        UUID(old)
+
+        return _set_field_value(json, field, self.map_uuid(old))
+
 
 def sanitize(export: JSONData, datetime_offset: timedelta | None = None) -> JSONData:
     """

+ 27 - 0
src/sentry/db/models/base.py

@@ -18,6 +18,7 @@ from sentry.backup.dependencies import (
 from sentry.backup.helpers import ImportFlags
 from sentry.backup.sanitize import SanitizableField, Sanitizer
 from sentry.backup.scopes import ImportScope, RelocationScope
+from sentry.db.models.fields.uuid import UUIDField
 from sentry.silo.base import SiloLimit, SiloMode
 from sentry.utils.json import JSONData
 
@@ -211,6 +212,9 @@ class BaseModel(models.Model):
         fields = cls._meta.get_fields()
         field_names = [f.name for f in fields]
 
+        str_field_types = [models.CharField, models.TextField]
+        sensitive_words = ["password", "token", "secret"]
+
         # All `models.CharField` fields called "slug" and "name" can be auto-sanitized as strings.
         if "name" in field_names and "slug" in field_names:
             sanitizer.set_name_and_slug_pair(
@@ -228,6 +232,29 @@ class BaseModel(models.Model):
             if isinstance(f, models.EmailField):
                 sanitizer.set_email(json, SanitizableField(model_name, f.name))
 
+            # Auto-sanitize all IP Address fields.
+            if isinstance(f, models.IPAddressField) or isinstance(f, models.GenericIPAddressField):
+                sanitizer.set_ip(json, SanitizableField(model_name, f.name))
+
+            # Auto-sanitize all URL fields.
+            if isinstance(f, models.URLField) or f.name.endswith("url") or f.name.endswith("uri"):
+                sanitizer.set_url(json, SanitizableField(model_name, f.name))
+
+            # Auto-sanitize all UUID fields.
+            if (
+                isinstance(f, models.UUIDField)
+                or isinstance(f, UUIDField)
+                or f.name.endswith("guid")
+                or f.name.endswith("uuid")
+            ):
+                sanitizer.set_uuid(json, SanitizableField(model_name, f.name))
+
+            # Auto-sanitize all string fields that contain any sensitive words in their name.
+            is_str_field_type = next(filter(lambda t: isinstance(f, t), str_field_types), None)
+            contains_sensitive_word = next(filter(lambda w: w in f.name, sensitive_words), None)
+            if is_str_field_type and contains_sensitive_word:
+                sanitizer.set_string(json, SanitizableField(model_name, f.name))
+
         return None
 
     def normalize_before_relocation_import(

+ 34 - 1
src/sentry/models/apitoken.py

@@ -11,8 +11,9 @@ from django.utils import timezone
 from django.utils.encoding import force_str
 
 from sentry import options
-from sentry.backup.dependencies import ImportKind
+from sentry.backup.dependencies import ImportKind, NormalizedModelName, get_model_name
 from sentry.backup.helpers import ImportFlags
+from sentry.backup.sanitize import SanitizableField, Sanitizer
 from sentry.backup.scopes import ImportScope, RelocationScope
 from sentry.constants import SentryAppStatus
 from sentry.db.models import FlexibleForeignKey, control_silo_model, sane_repr
@@ -22,6 +23,7 @@ from sentry.models.apiscopes import HasApiScopes
 from sentry.models.outbox import OutboxCategory
 from sentry.types.region import find_all_region_names
 from sentry.types.token import AuthTokenType
+from sentry.utils.json import JSONData
 
 DEFAULT_EXPIRATION = timedelta(days=30)
 TOKEN_REDACTED = "***REDACTED***"
@@ -318,6 +320,37 @@ class ApiToken(ReplicatedControlModel, HasApiScopes):
 
         return super().write_relocation_import(scope, flags)
 
+    @classmethod
+    def sanitize_relocation_json(
+        cls, json: JSONData, sanitizer: Sanitizer, model_name: NormalizedModelName | None = None
+    ) -> None:
+        model_name = get_model_name(cls) if model_name is None else model_name
+        super().sanitize_relocation_json(json, sanitizer, model_name)
+
+        token = generate_token()
+        token_last_characters = token[-4:]
+        hashed_token = hashlib.sha256(token.encode()).hexdigest()
+        refresh_token = generate_token()
+        hashed_refresh_token = hashlib.sha256(refresh_token.encode()).hexdigest()
+
+        sanitizer.set_string(json, SanitizableField(model_name, "token"), lambda _: token)
+        sanitizer.set_string(
+            json,
+            SanitizableField(model_name, "token_last_characters"),
+            lambda _: token_last_characters,
+        )
+        sanitizer.set_string(
+            json, SanitizableField(model_name, "hashed_token"), lambda _: hashed_token
+        )
+        sanitizer.set_string(
+            json, SanitizableField(model_name, "refresh_token"), lambda _: refresh_token
+        )
+        sanitizer.set_string(
+            json,
+            SanitizableField(model_name, "hashed_refresh_token"),
+            lambda _: hashed_refresh_token,
+        )
+
     @property
     def organization_id(self) -> int | None:
         from sentry.models.integrations.sentry_app_installation import SentryAppInstallation

+ 23 - 0
src/sentry/models/relay.py

@@ -3,9 +3,12 @@ from django.utils import timezone
 from django.utils.functional import cached_property
 from sentry_relay.auth import PublicKey
 
+from sentry.backup.dependencies import NormalizedModelName, get_model_name
 from sentry.backup.mixins import OverwritableConfigMixin
+from sentry.backup.sanitize import SanitizableField, Sanitizer
 from sentry.backup.scopes import RelocationScope
 from sentry.db.models import Model, region_silo_model
+from sentry.utils.json import JSONData
 
 
 @region_silo_model
@@ -24,6 +27,16 @@ class RelayUsage(OverwritableConfigMixin, Model):
         app_label = "sentry"
         db_table = "sentry_relayusage"
 
+    @classmethod
+    def sanitize_relocation_json(
+        cls, json: JSONData, sanitizer: Sanitizer, model_name: NormalizedModelName | None = None
+    ) -> None:
+        model_name = get_model_name(cls) if model_name is None else model_name
+        super().sanitize_relocation_json(json, sanitizer, model_name)
+
+        sanitizer.set_uuid(json, SanitizableField(model_name, "relay_id"))
+        sanitizer.set_string(json, SanitizableField(model_name, "public_key"))
+
 
 @region_silo_model
 class Relay(OverwritableConfigMixin, Model):
@@ -66,3 +79,13 @@ class Relay(OverwritableConfigMixin, Model):
         Returns all the relays that are configured with one of the specified keys
         """
         return Relay.objects.filter(public_key__in=keys)
+
+    @classmethod
+    def sanitize_relocation_json(
+        cls, json: JSONData, sanitizer: Sanitizer, model_name: NormalizedModelName | None = None
+    ) -> None:
+        model_name = get_model_name(cls) if model_name is None else model_name
+        super().sanitize_relocation_json(json, sanitizer, model_name)
+
+        sanitizer.set_uuid(json, SanitizableField(model_name, "relay_id"))
+        sanitizer.set_string(json, SanitizableField(model_name, "public_key"))

+ 18 - 1
src/sentry/models/user.py

@@ -21,8 +21,14 @@ from django.utils.translation import gettext_lazy as _
 
 from bitfield import TypedClassBitField
 from sentry.auth.authenticators import available_authenticators
-from sentry.backup.dependencies import ImportKind, PrimaryKeyMap
+from sentry.backup.dependencies import (
+    ImportKind,
+    NormalizedModelName,
+    PrimaryKeyMap,
+    get_model_name,
+)
 from sentry.backup.helpers import ImportFlags
+from sentry.backup.sanitize import SanitizableField, Sanitizer
 from sentry.backup.scopes import ImportScope, RelocationScope
 from sentry.db.models import (
     BaseManager,
@@ -44,6 +50,7 @@ from sentry.services.hybrid_cloud.user import RpcUser
 from sentry.types.integrations import EXTERNAL_PROVIDERS, ExternalProviders
 from sentry.types.region import find_all_region_names, find_regions_for_user
 from sentry.utils.http import absolute_uri
+from sentry.utils.json import JSONData
 from sentry.utils.retries import TimedRetryPolicy
 
 audit_logger = logging.getLogger("sentry.audit.user")
@@ -504,6 +511,16 @@ class User(BaseModel, AbstractBaseUser):
             # Perform the remainder of the write while we're still holding the lock.
             return do_write()
 
+    @classmethod
+    def sanitize_relocation_json(
+        cls, json: JSONData, sanitizer: Sanitizer, model_name: NormalizedModelName | None = None
+    ) -> None:
+        model_name = get_model_name(cls) if model_name is None else model_name
+        super().sanitize_relocation_json(json, sanitizer, model_name)
+
+        sanitizer.set_string(json, SanitizableField(model_name, "username"))
+        sanitizer.set_string(json, SanitizableField(model_name, "session_nonce"))
+
     @classmethod
     def handle_async_deletion(
         cls,

+ 20 - 1
src/sentry/models/useremail.py

@@ -10,8 +10,14 @@ from django.db import models
 from django.utils import timezone
 from django.utils.translation import gettext_lazy as _
 
-from sentry.backup.dependencies import ImportKind, PrimaryKeyMap, get_model_name
+from sentry.backup.dependencies import (
+    ImportKind,
+    NormalizedModelName,
+    PrimaryKeyMap,
+    get_model_name,
+)
 from sentry.backup.helpers import ImportFlags
+from sentry.backup.sanitize import SanitizableField, Sanitizer
 from sentry.backup.scopes import ImportScope, RelocationScope
 from sentry.db.models import BaseManager, FlexibleForeignKey, control_silo_model, sane_repr
 from sentry.db.models.outboxes import ControlOutboxProducingModel
@@ -19,6 +25,7 @@ from sentry.models.outbox import ControlOutboxBase, OutboxCategory
 from sentry.services.hybrid_cloud.organization.model import RpcOrganization
 from sentry.services.hybrid_cloud.user.model import RpcUser
 from sentry.types.region import find_regions_for_user
+from sentry.utils.json import JSONData
 from sentry.utils.security import get_secure_token
 
 if TYPE_CHECKING:
@@ -142,3 +149,15 @@ class UserEmail(ControlOutboxProducingModel):
         # `--merge_users=true` case is handled in the `normalize_before_relocation_import()` method
         # above).
         return (useremail.pk, ImportKind.Inserted)
+
+    @classmethod
+    def sanitize_relocation_json(
+        cls, json: JSONData, sanitizer: Sanitizer, model_name: NormalizedModelName | None = None
+    ) -> None:
+        model_name = get_model_name(cls) if model_name is None else model_name
+        super().sanitize_relocation_json(json, sanitizer, model_name)
+
+        validation_hash = get_secure_token()
+        sanitizer.set_string(
+            json, SanitizableField(model_name, "validation_hash"), lambda _: validation_hash
+        )

+ 60 - 13
tests/sentry/backup/snapshots/SanitizationExhaustiveTests/test_clean_pks.pysnap

@@ -1,5 +1,5 @@
 ---
-created: '2024-04-25T18:22:28.906727+00:00'
+created: '2024-05-02T21:07:58.578052+00:00'
 creator: sentry
 source: tests/sentry/backup/test_sanitize.py
 ---
@@ -68,18 +68,22 @@ source: tests/sentry/backup/test_sanitize.py
   ordinal: 1
   sanitized_fields:
   - date_added
+  - secret_key
 - model_name: sentry.projectkey
   ordinal: 2
   sanitized_fields:
   - date_added
+  - secret_key
 - model_name: sentry.projectkey
   ordinal: 3
   sanitized_fields:
   - date_added
+  - secret_key
 - model_name: sentry.projectkey
   ordinal: 4
   sanitized_fields:
   - date_added
+  - secret_key
 - model_name: sentry.projectoption
   ordinal: 1
   sanitized_fields: []
@@ -127,17 +131,22 @@ source: tests/sentry/backup/test_sanitize.py
   - date_added
 - model_name: sentry.relay
   ordinal: 1
-  sanitized_fields: []
+  sanitized_fields:
+  - public_key
+  - relay_id
 - model_name: sentry.relayusage
   ordinal: 1
   sanitized_fields:
   - first_seen
   - last_seen
+  - public_key
+  - relay_id
 - model_name: sentry.repository
   ordinal: 1
   sanitized_fields:
   - date_added
   - name
+  - url
 - model_name: sentry.team
   ordinal: 1
   sanitized_fields:
@@ -151,7 +160,8 @@ source: tests/sentry/backup/test_sanitize.py
   - email
   - last_active
   - last_password_change
-  - name
+  - password
+  - username
 - model_name: sentry.user
   ordinal: 2
   sanitized_fields:
@@ -159,7 +169,8 @@ source: tests/sentry/backup/test_sanitize.py
   - email
   - last_active
   - last_password_change
-  - name
+  - password
+  - username
 - model_name: sentry.user
   ordinal: 3
   sanitized_fields:
@@ -167,7 +178,8 @@ source: tests/sentry/backup/test_sanitize.py
   - email
   - last_active
   - last_password_change
-  - name
+  - password
+  - username
 - model_name: sentry.user
   ordinal: 4
   sanitized_fields:
@@ -175,14 +187,14 @@ source: tests/sentry/backup/test_sanitize.py
   - email
   - last_active
   - last_password_change
-  - name
+  - password
+  - username
 - model_name: sentry.user
   ordinal: 5
   sanitized_fields:
   - date_joined
-  - email
   - last_active
-  - name
+  - username
 - model_name: sentry.user
   ordinal: 6
   sanitized_fields:
@@ -190,16 +202,19 @@ source: tests/sentry/backup/test_sanitize.py
   - email
   - last_active
   - last_password_change
-  - name
+  - password
+  - username
 - model_name: sentry.userip
   ordinal: 1
   sanitized_fields:
   - first_seen
+  - ip_address
   - last_seen
 - model_name: sentry.userip
   ordinal: 2
   sanitized_fields:
   - first_seen
+  - ip_address
   - last_seen
 - model_name: sentry.useroption
   ordinal: 1
@@ -246,6 +261,8 @@ source: tests/sentry/backup/test_sanitize.py
   sanitized_fields:
   - date_added
   - name
+  - token_hashed
+  - token_last_characters
 - model_name: sentry.organizationmember
   ordinal: 1
   sanitized_fields:
@@ -261,7 +278,7 @@ source: tests/sentry/backup/test_sanitize.py
   ordinal: 1
   sanitized_fields:
   - date_added
-  - name
+  - guid
   - slug
 - model_name: sentry.environment
   ordinal: 1
@@ -292,7 +309,6 @@ source: tests/sentry/backup/test_sanitize.py
   ordinal: 5
   sanitized_fields:
   - date_added
-  - email
 - model_name: sentry.email
   ordinal: 6
   sanitized_fields:
@@ -341,6 +357,7 @@ source: tests/sentry/backup/test_sanitize.py
 - model_name: sentry.apiapplication
   ordinal: 1
   sanitized_fields:
+  - client_secret
   - date_added
   - name
 - model_name: sentry.useremail
@@ -348,31 +365,36 @@ source: tests/sentry/backup/test_sanitize.py
   sanitized_fields:
   - date_hash_added
   - email
+  - validation_hash
 - model_name: sentry.useremail
   ordinal: 2
   sanitized_fields:
   - date_hash_added
   - email
+  - validation_hash
 - model_name: sentry.useremail
   ordinal: 3
   sanitized_fields:
   - date_hash_added
   - email
+  - validation_hash
 - model_name: sentry.useremail
   ordinal: 4
   sanitized_fields:
   - date_hash_added
   - email
+  - validation_hash
 - model_name: sentry.useremail
   ordinal: 5
   sanitized_fields:
   - date_hash_added
-  - email
+  - validation_hash
 - model_name: sentry.useremail
   ordinal: 6
   sanitized_fields:
   - date_hash_added
   - email
+  - validation_hash
 - model_name: sentry.snubaquery
   ordinal: 1
   sanitized_fields:
@@ -392,6 +414,8 @@ source: tests/sentry/backup/test_sanitize.py
   - date_updated
   - name
   - slug
+  - uuid
+  - webhook_url
 - model_name: sentry.rule
   ordinal: 1
   sanitized_fields:
@@ -451,20 +475,35 @@ source: tests/sentry/backup/test_sanitize.py
   sanitized_fields:
   - date_added
   - expires_at
+  - hashed_refresh_token
+  - hashed_token
+  - refresh_token
+  - token
+  - token_last_characters
 - model_name: sentry.apitoken
   ordinal: 2
   sanitized_fields:
   - date_added
+  - hashed_refresh_token
+  - hashed_token
   - name
+  - refresh_token
+  - token
+  - token_last_characters
 - model_name: sentry.apitoken
   ordinal: 3
   sanitized_fields:
   - date_added
+  - hashed_token
   - name
+  - token
+  - token_last_characters
+  - token_type
 - model_name: sentry.apigrant
   ordinal: 1
   sanitized_fields:
   - expires_at
+  - redirect_uri
 - model_name: sentry.apiauthorization
   ordinal: 1
   sanitized_fields:
@@ -505,9 +544,11 @@ source: tests/sentry/backup/test_sanitize.py
   sanitized_fields:
   - date_added
   - date_updated
+  - uuid
 - model_name: sentry.sentryappcomponent
   ordinal: 1
-  sanitized_fields: []
+  sanitized_fields:
+  - uuid
 - model_name: sentry.rulesnooze
   ordinal: 1
   sanitized_fields:
@@ -572,10 +613,16 @@ source: tests/sentry/backup/test_sanitize.py
   ordinal: 1
   sanitized_fields:
   - date_added
+  - guid
+  - secret
+  - url
 - model_name: sentry.servicehook
   ordinal: 2
   sanitized_fields:
   - date_added
+  - guid
+  - secret
+  - url
 - model_name: sentry.incident
   ordinal: 1
   sanitized_fields:

+ 14 - 4
tests/sentry/backup/snapshots/SanitizationIntegrationTests/test_fresh_install.pysnap

@@ -1,5 +1,5 @@
 ---
-created: '2024-04-17T17:41:21.647153+00:00'
+created: '2024-05-02T21:07:54.051381+00:00'
 creator: sentry
 source: tests/sentry/backup/test_sanitize.py
 ---
@@ -45,7 +45,8 @@ source: tests/sentry/backup/test_sanitize.py
   - email
   - last_active
   - last_password_change
-  - name
+  - password
+  - username
 - model_name: sentry.user
   ordinal: 2
   sanitized_fields:
@@ -53,15 +54,20 @@ source: tests/sentry/backup/test_sanitize.py
   - email
   - last_active
   - last_password_change
-  - name
+  - password
+  - username
 - model_name: sentry.relayusage
   ordinal: 1
   sanitized_fields:
   - first_seen
   - last_seen
+  - public_key
+  - relay_id
 - model_name: sentry.relay
   ordinal: 1
-  sanitized_fields: []
+  sanitized_fields:
+  - public_key
+  - relay_id
 - model_name: sentry.authenticator
   ordinal: 1
   sanitized_fields:
@@ -75,11 +81,13 @@ source: tests/sentry/backup/test_sanitize.py
   sanitized_fields:
   - date_hash_added
   - email
+  - validation_hash
 - model_name: sentry.useremail
   ordinal: 2
   sanitized_fields:
   - date_hash_added
   - email
+  - validation_hash
 - model_name: sentry.userrole
   ordinal: 1
   sanitized_fields:
@@ -115,6 +123,7 @@ source: tests/sentry/backup/test_sanitize.py
   ordinal: 1
   sanitized_fields:
   - date_added
+  - secret_key
 - model_name: sentry.rule
   ordinal: 1
   sanitized_fields:
@@ -143,5 +152,6 @@ source: tests/sentry/backup/test_sanitize.py
 - model_name: sentry.apiapplication
   ordinal: 1
   sanitized_fields:
+  - client_secret
   - date_added
   - name

+ 53 - 1
tests/sentry/backup/test_sanitize.py

@@ -24,6 +24,7 @@ from sentry.backup.sanitize import (
 from sentry.backup.scopes import RelocationScope
 from sentry.db.models.base import DefaultFieldsModel
 from sentry.db.models.fields.slug import SentrySlugField
+from sentry.db.models.fields.uuid import UUIDField
 from sentry.testutils.cases import TestCase
 from sentry.testutils.factories import get_fixture_path
 from sentry.testutils.helpers.backups import BackupTestCase
@@ -37,6 +38,10 @@ FAKE_NAME = "Fake Name"
 FAKE_NICKNAME = "Fake Nickname"
 FAKE_SLUG = "fake-slug"
 FAKE_TEXT = "This is some text."
+FAKE_IP_V4 = "8.8.8.8"
+FAKE_IP_V6 = "9c72:8448:90c4:4e5e:a946:c1f5:71a6:4cc2"
+FAKE_URL = "https://sub.domain.example.com/some/path?a=b&c=d#foo"
+FAKE_UUID = "6b79316f-cd5c-42fa-ad45-20ce0b1f0725"
 
 CURR_DATE = datetime.now()
 CURR_YEAR = CURR_DATE.year
@@ -55,6 +60,10 @@ class FakeSanitizableModel(DefaultFieldsModel):
     slug = SentrySlugField(null=True)
     nickname = models.CharField(null=True, max_length=32)
     text = SentrySlugField(null=True, max_length=128)
+    ip_v4 = models.GenericIPAddressField(null=True)
+    ip_v6 = models.GenericIPAddressField(null=True)
+    url = models.URLField(null=True)
+    uuid = UUIDField(null=True)
 
     class Meta:
         app_label = "test"
@@ -65,9 +74,10 @@ class FakeSanitizableModel(DefaultFieldsModel):
         cls, json: JSONData, sanitizer: Sanitizer, model_name: NormalizedModelName | None = None
     ) -> None:
         model_name = get_model_name(cls) if model_name is None else model_name
+        super().sanitize_relocation_json(json, sanitizer, model_name)
+
         sanitizer.set_name(json, SanitizableField(model_name, "nickname"))
         sanitizer.set_string(json, SanitizableField(model_name, "text"))
-        return super().sanitize_relocation_json(json, sanitizer, model_name)
 
 
 @patch("sentry.backup.dependencies.get_model", Mock(return_value=FakeSanitizableModel))
@@ -92,6 +102,10 @@ class SanitizationUnitTests(TestCase):
             slug=FAKE_SLUG,
             nickname=FAKE_NICKNAME,
             text=FAKE_TEXT,
+            ip_v4=FAKE_IP_V4,
+            ip_v6=FAKE_IP_V6,
+            url=FAKE_URL,
+            uuid=FAKE_UUID,
         )
         faked = self.serialize_to_json_data([model, model])
         sanitized = sanitize(faked, DELTA_YEAR)
@@ -108,6 +122,15 @@ class SanitizationUnitTests(TestCase):
         assert isinstance(s0["nickname"], str)
         assert isinstance(s0["text"], str)
 
+        assert isinstance(s0["ip_v4"], str)
+        assert s0["ip_v4"].count(".") == 3
+
+        assert isinstance(s0["ip_v6"], str)
+        assert s0["ip_v6"].count(":") == 7
+
+        assert isinstance(s0["url"], str)
+        assert isinstance(s0["uuid"], str)
+
         # Confirm sanitization.
         assert parse_datetime(f0["date_added"]) < s0["date_added"]
         assert parse_datetime(f0["date_updated"]) < s0["date_updated"]
@@ -116,6 +139,10 @@ class SanitizationUnitTests(TestCase):
         assert f0["slug"] != s0["slug"]
         assert f0["nickname"] != s0["nickname"]
         assert f0["text"] != s0["text"]
+        assert f0["ip_v4"] != s0["ip_v4"]
+        assert f0["ip_v6"] != s0["ip_v6"]
+        assert f0["url"] != s0["url"]
+        assert f0["uuid"] != s0["uuid"]
 
         # Identical source values remain equal after sanitization.
         assert s0["date_added"] == s1["date_added"]
@@ -125,6 +152,10 @@ class SanitizationUnitTests(TestCase):
         assert s0["slug"] == s1["slug"]
         assert s0["nickname"] == s1["nickname"]
         assert s0["text"] == s1["text"]
+        assert s0["ip_v4"] == s1["ip_v4"]
+        assert s0["ip_v6"] == s1["ip_v6"]
+        assert s0["url"] == s1["url"]
+        assert s0["uuid"] == s1["uuid"]
 
     def test_good_all_sanitizers_unset_fields(self):
         model = FakeSanitizableModel(
@@ -134,6 +165,10 @@ class SanitizationUnitTests(TestCase):
             nickname=None,
             slug=None,
             text=None,
+            ip_v4=None,
+            ip_v6=None,
+            url=None,
+            uuid=None,
         )
         faked = self.serialize_to_json_data([model])
         sanitized = sanitize(faked, DELTA_YEAR)
@@ -146,12 +181,21 @@ class SanitizationUnitTests(TestCase):
         assert s["slug"] is None
         assert s["nickname"] is None
         assert s["text"] is None
+        assert s["ip_v4"] is None
+        assert s["ip_v6"] is None
+        assert s["url"] is None
+        assert s["uuid"] is None
+
         assert s["date_updated"] == f["date_updated"]
         assert s["email"] == f["email"]
         assert s["name"] == f["name"]
         assert s["slug"] == f["slug"]
         assert s["nickname"] == f["nickname"]
         assert s["text"] == f["text"]
+        assert s["ip_v4"] == f["ip_v4"]
+        assert s["ip_v6"] == f["ip_v6"]
+        assert s["url"] == f["url"]
+        assert s["uuid"] == f["uuid"]
 
     def test_good_date_all_sanitizers_no_delta(self):
         faked = self.serialize_to_json_data(
@@ -164,6 +208,10 @@ class SanitizationUnitTests(TestCase):
                     slug=FAKE_SLUG,
                     nickname=FAKE_NICKNAME,
                     text=FAKE_TEXT,
+                    ip_v4=FAKE_IP_V4,
+                    ip_v6=FAKE_IP_V6,
+                    url=FAKE_URL,
+                    uuid=FAKE_UUID,
                 )
             ]
         )
@@ -177,6 +225,10 @@ class SanitizationUnitTests(TestCase):
         assert f["slug"] != s["slug"]
         assert f["nickname"] != s["nickname"]
         assert f["text"] != s["text"]
+        assert f["ip_v4"] != s["ip_v4"]
+        assert f["ip_v6"] != s["ip_v6"]
+        assert f["url"] != s["url"]
+        assert f["uuid"] != s["uuid"]
         assert s["date_added"] < s["date_updated"]
 
     def test_good_dates_preserve_ordering(self):