Browse Source

fix(hybridcloud) Begin replacing role switching with query audits (#52587)

Switching roles has been proving tempermental as we apply it to split
databases. Instead of relying on postgres permission errors we can audit
the query logs generated by each test case to check for unsafe query
patterns. I've implemented some simple substring based heuristics
presently as I was trying to keep complexity and overhead low.

I've not removed the role manipulation yet, as I wanted to get the query
auditing in place first. I'll follow up these changes with another set
of changes to remove the role manipulation.
Mark Story 1 year ago
parent
commit
5678fa1966

+ 10 - 0
src/sentry/db/postgres/roles.py

@@ -3,11 +3,15 @@ from __future__ import annotations
 import contextlib
 import os
 import sys
+from collections import defaultdict
+from typing import MutableMapping
 
 from django.db.transaction import get_connection
 
 from sentry.silo.patches.silo_aware_transaction_patch import determine_using_by_silo_mode
 
+_fencing_counters: MutableMapping[str, int] = defaultdict(int)
+
 
 @contextlib.contextmanager
 def in_test_psql_role_override(role_name: str, using: str | None = None):
@@ -22,7 +26,12 @@ def in_test_psql_role_override(role_name: str, using: str | None = None):
 
     using = determine_using_by_silo_mode(using)
 
+    # TODO(mark) Move this closer to other silo code.
+    _fencing_counters[using] += 1
+
     with get_connection(using).cursor() as conn:
+        fence_value = _fencing_counters[using]
+        conn.execute("SELECT %s", [f"start_role_override_{fence_value}"])
         conn.execute("SELECT user")
         (cur,) = conn.fetchone()
         conn.execute("SET ROLE %s", [role_name])
@@ -30,3 +39,4 @@ def in_test_psql_role_override(role_name: str, using: str | None = None):
             yield
         finally:
             conn.execute("SET ROLE %s", [cur])
+            conn.execute("SELECT %s", [f"end_role_override_{fence_value}"])

+ 2 - 1
src/sentry/silo/patches/silo_aware_transaction_patch.py

@@ -41,7 +41,7 @@ def siloed_on_commit(func: Callable[..., Any], using: Optional[str] = None) -> N
     return _default_on_commit(func, using)
 
 
-def determine_using_by_silo_mode(using):
+def determine_using_by_silo_mode(using: Optional[str]) -> str:
     from sentry.models import ControlOutbox, RegionOutbox
     from sentry.silo import SiloMode
 
@@ -51,6 +51,7 @@ def determine_using_by_silo_mode(using):
 
     if not using:
         using = region_db if current_silo_mode == SiloMode.REGION else control_db
+        assert using
 
     both_silos_route_to_same_db = control_db == region_db
 

+ 20 - 16
src/sentry/tasks/organization_mapping.py

@@ -2,9 +2,11 @@ import logging
 from datetime import datetime, timedelta
 
 from django.contrib.postgres.aggregates import ArrayAgg
+from django.db import router
 from django.db.models import Count
 from django.utils import timezone
 
+from sentry.db.postgres.roles import in_test_psql_role_override
 from sentry.models.organizationmapping import OrganizationMapping
 from sentry.services.hybrid_cloud.organization import organization_service
 from sentry.silo.base import SiloMode
@@ -40,12 +42,13 @@ def _verify_mappings(expiration_threshold_time: datetime) -> None:
         org = organization_service.get_organization_by_id(
             id=mapping.organization_id, slug=mapping.slug
         )
-        if org is None and mapping.date_created <= expiration_threshold_time:
-            mapping.delete()
-        elif org is not None:
-            mapping.verified = True
-            mapping.idempotency_key = ""
-            mapping.save()
+        with in_test_psql_role_override("postgres", using=router.db_for_write(OrganizationMapping)):
+            if org is None and mapping.date_created <= expiration_threshold_time:
+                mapping.delete()
+            elif org is not None:
+                mapping.verified = True
+                mapping.idempotency_key = ""
+                mapping.save()
 
 
 def _remove_duplicate_mappings(expiration_threshold_time: datetime) -> None:
@@ -62,14 +65,15 @@ def _remove_duplicate_mappings(expiration_threshold_time: datetime) -> None:
         )
         organization_id = dupe["organization_id"]
 
-        # Organization exists in the region silo
-        found_org = organization_service.get_organization_by_id(id=organization_id)
-        if found_org is None:
-            # Delete all mappings.
-            OrganizationMapping.objects.filter(organization_id=organization_id).delete()
-            return
+        with in_test_psql_role_override("postgres", using=router.db_for_write(OrganizationMapping)):
+            # Organization exists in the region silo
+            found_org = organization_service.get_organization_by_id(id=organization_id)
+            if found_org is None:
+                # Delete all mappings.
+                OrganizationMapping.objects.filter(organization_id=organization_id).delete()
+                return
 
-        # Delete all expired mappings that don't match this org slug
-        OrganizationMapping.objects.filter(
-            organization_id=organization_id, date_created__lte=expiration_threshold_time
-        ).exclude(slug=found_org.organization.slug).delete()
+            # Delete all expired mappings that don't match this org slug
+            OrganizationMapping.objects.filter(
+                organization_id=organization_id, date_created__lte=expiration_threshold_time
+            ).exclude(slug=found_org.organization.slug).delete()

+ 75 - 1
src/sentry/testutils/silo.py

@@ -2,9 +2,21 @@ from __future__ import annotations
 
 import functools
 import inspect
+import re
 import sys
 from contextlib import contextmanager
-from typing import Any, Callable, Generator, Iterable, MutableMapping, MutableSet, Set, Tuple, Type
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    MutableMapping,
+    MutableSet,
+    Set,
+    Tuple,
+    Type,
+)
 from unittest import TestCase
 
 import pytest
@@ -326,6 +338,68 @@ def restrict_role(role: str, model: Any, revocation_type: str, using: str = "def
         connection.execute(f"REVOKE {revocation_type} ON public.{model._meta.db_table} FROM {role}")
 
 
+def protected_table(table: str, operation: str) -> re.Pattern:
+    return re.compile(f'{operation}[^"]+"{table}"', re.IGNORECASE)
+
+
+protected_operations = (
+    protected_table("sentry_organizationmember", "insert"),
+    protected_table("sentry_organizationmember", "update"),
+    protected_table("sentry_organizationmember", "delete"),
+    protected_table("sentry_organization", "insert"),
+    protected_table("sentry_organization", "update"),
+    protected_table("sentry_organizationmapping", "insert"),
+    protected_table("sentry_organizationmapping", "update"),
+    protected_table("sentry_organizationmembermapping", "insert"),
+)
+
+fence_re = re.compile(r"select\s*\'(?P<operation>start|end)_role_override", re.IGNORECASE)
+
+
+def validate_protected_queries(queries: Iterable[Dict[str, str]]) -> None:
+    """
+    Validate a list of queries to ensure that protected queries
+    are wrapped in role_override fence values.
+
+    See sentry.db.postgres.roles for where fencing queries come from.
+    """
+    fence_depth = 0
+    for query in queries:
+        sql = query["sql"]
+        match = fence_re.match(sql)
+        if match:
+            operation = match.group("operation")
+            if operation == "start":
+                fence_depth += 1
+            elif operation == "end":
+                fence_depth = max(fence_depth - 1, 0)
+            else:
+                raise AssertionError("Invalid fencing operation encounted")
+
+        for protected in protected_operations:
+            if protected.match(sql):
+                if fence_depth == 0:
+                    msg = [
+                        "Found protected operation without explicit outbox escape!",
+                        "",
+                        sql,
+                        "",
+                        "Was not surrounded by role elevation queries, and could corrupt data if outboxes are not generated.",
+                        "If you are confident that outboxes are being generated, wrap the "
+                        "operation that generates this query with the `in_test_psql_role_override` ",
+                        "context manager to resolve this failure. For example:",
+                        "",
+                        "with in_test_psql_role_override():",
+                        "    membership.delete()",
+                        "",
+                        "Full query log:",
+                        "",
+                    ]
+                    msg.extend([q["sql"] for q in queries])
+
+                    raise AssertionError("\n".join(msg))
+
+
 def iter_models(app_name: str | None = None) -> Iterable[Type[Model]]:
     from django.apps import apps
 

+ 39 - 0
tests/conftest.py

@@ -1,4 +1,5 @@
 import os
+from typing import MutableMapping
 
 import pytest
 from django.db import connections
@@ -195,3 +196,41 @@ def protect_hybrid_cloud_writes_and_deletes(request):
         for connection in connections.all():
             with connection.cursor() as cursor:
                 cursor.execute("SET ROLE 'postgres'")
+
+
+@pytest.fixture(autouse=True)
+def audit_hybrid_cloud_writes_and_deletes(request):
+    """
+    Ensure that write operations on hybrid cloud foreign keys are recorded
+    alongside outboxes or use a context manager to indicate that the
+    caller has considered outbox and didn't accidentally forget.
+
+    Generally you can avoid assertion errors from these checks by:
+
+    1. Running deletion/write logic within an `outbox_context`.
+    2. Using Model.delete()/save methods that create outbox messages in the
+       same transaction as a delete operation.
+
+    Scenarios that are generally always unsafe are  using
+    `QuerySet.delete()`, `QuerySet.update()` or raw SQL to perform
+    writes.
+
+    The User.delete() method is a good example of how to safely
+    delete records and generate outbox messages.
+    """
+    from sentry.testutils.silo import validate_protected_queries
+
+    debug_cursor_state: MutableMapping[str, bool] = {}
+    for conn in connections.all():
+        debug_cursor_state[conn.alias] = conn.force_debug_cursor
+
+        conn.queries_log.clear()
+        conn.force_debug_cursor = True
+
+    try:
+        yield
+    finally:
+        for conn in connections.all():
+            conn.force_debug_cursor = debug_cursor_state[conn.alias]
+
+            validate_protected_queries(conn.queries)

+ 16 - 12
tests/sentry/tasks/test_organization_mapping.py

@@ -1,6 +1,7 @@
 from datetime import datetime
 
 import pytest
+from django.db import router
 
 from sentry.db.postgres.roles import in_test_psql_role_override
 from sentry.models.organization import Organization
@@ -20,14 +21,16 @@ class OrganizationMappingRepairTest(TestCase):
     def test_removes_expired_unverified(self):
         self.organization = Factories.create_organization()
         expired_time = datetime.now() - ORGANIZATION_MAPPING_EXPIRY
-        mapping = OrganizationMapping.objects.get(organization_id=self.organization.id)
-        mapping.verified = False
-        mapping.date_created = expired_time
-        mapping.save()
 
-        phantom_mapping = self.create_organization_mapping(
-            Organization(id=123, slug="fake-slug"), date_created=expired_time, verified=False
-        )
+        with in_test_psql_role_override("postgres", using=router.db_for_write(OrganizationMapping)):
+            mapping = OrganizationMapping.objects.get(organization_id=self.organization.id)
+            mapping.verified = False
+            mapping.date_created = expired_time
+            mapping.save()
+
+            phantom_mapping = self.create_organization_mapping(
+                Organization(id=123, slug="fake-slug"), date_created=expired_time, verified=False
+            )
 
         repair_mappings()
 
@@ -40,11 +43,12 @@ class OrganizationMappingRepairTest(TestCase):
         self.organization = Factories.create_organization()
         expired_time = datetime.now() - ORGANIZATION_MAPPING_EXPIRY
 
-        mapping = OrganizationMapping.objects.get(organization_id=self.organization.id)
-        mapping.verified = False
-        mapping.date_created = expired_time
-        mapping.idempotency_key = "1234"
-        mapping.save()
+        with in_test_psql_role_override("postgres", using=router.db_for_write(OrganizationMapping)):
+            mapping = OrganizationMapping.objects.get(organization_id=self.organization.id)
+            mapping.verified = False
+            mapping.date_created = expired_time
+            mapping.idempotency_key = "1234"
+            mapping.save()
 
         repair_mappings()
 

+ 101 - 0
tests/sentry/testutils/test_silo.py

@@ -0,0 +1,101 @@
+import pytest
+
+from sentry.testutils.silo import validate_protected_queries
+
+
+def test_validate_protected_queries__no_queries():
+    validate_protected_queries([])
+
+
+def test_validate_protected_queries__ok():
+    queries = [
+        {"sql": "SELECT * FROM sentry_organization"},
+        {"sql": "UPDATE sentry_team SET slug = 'best-team' WHERE id = 1"},
+    ]
+    validate_protected_queries(queries)
+
+
+def test_validate_protected_queries__missing_fences():
+    queries = [
+        {"sql": 'SAVEPOINT "s123abc"'},
+        {"sql": 'UPDATE "sentry_useremail" SET "is_verified" = true WHERE "id" = 1'},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'oops\' WHERE "id" = 1'},
+        {"sql": 'UPDATE "sentry_team" SET "slug" = \'frontend\' WHERE "id" = 3'},
+    ]
+    with pytest.raises(AssertionError):
+        validate_protected_queries(queries)
+
+
+def test_validate_protected_queries__with_single_fence():
+    queries = [
+        {"sql": 'SAVEPOINT "s123abc"'},
+        {"sql": 'UPDATE "sentry_useremail" SET "is_verified" = true WHERE "id" = 1'},
+        {"sql": "SELECT 'start_role_override_1'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'oops\' WHERE "id" = 1'},
+        {"sql": "SELECT 'end_role_override_1'"},
+        {"sql": 'UPDATE "sentry_team" SET "slug" = \'frontend\' WHERE "id" = 3'},
+    ]
+    validate_protected_queries(queries)
+
+
+def test_validate_protected_queries__multiple_fences():
+    queries = [
+        {"sql": 'SAVEPOINT "s123abc"'},
+        {"sql": 'UPDATE "sentry_useremail" SET "is_verified" = true WHERE "id" = 1'},
+        {"sql": "SELECT 'start_role_override_1'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'oops\' WHERE "id" = 1'},
+        {"sql": "SELECT 'end_role_override_1'"},
+        {"sql": 'UPDATE "sentry_team" SET "slug" = \'frontend\' WHERE "id" = 3'},
+        {"sql": "SELECT 'start_role_override_2'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'another-oops\' WHERE "id" = 1'},
+        {"sql": "SELECT 'end_role_override_2'"},
+    ]
+    validate_protected_queries(queries)
+
+
+def test_validate_protected_queries__nested_fences():
+    queries = [
+        {"sql": 'SAVEPOINT "s123abc"'},
+        {"sql": 'UPDATE "sentry_useremail" SET "is_verified" = true WHERE "id" = 1'},
+        {"sql": "SELECT 'start_role_override_1'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'safe\' WHERE "id" = 1'},
+        # Nested role overrides shouldn't happen but we need to handle them just in case.
+        {"sql": "SELECT 'start_role_override_2'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'also-safe\' WHERE "id" = 1'},
+        {"sql": "SELECT 'end_role_override_2'"},
+        {"sql": "SELECT 'end_role_override_1'"},
+        {"sql": 'UPDATE "sentry_team" SET "slug" = \'frontend\' WHERE "id" = 3'},
+        {"sql": 'UPDATE "sentry_organizationmemberteam" SET "role" = \'member\' WHERE "id" = 3'},
+    ]
+    validate_protected_queries(queries)
+
+    queries = [
+        {"sql": 'SAVEPOINT "s123abc"'},
+        {"sql": 'UPDATE "sentry_useremail" SET "is_verified" = true WHERE "id" = 1'},
+        {"sql": "SELECT 'start_role_override_1'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'safe\' WHERE "id" = 1'},
+        # Nested role overrides shouldn't happen but we need to handle them just in case.
+        {"sql": "SELECT 'start_role_override_2'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'also-safe\' WHERE "id" = 1'},
+        {"sql": "SELECT 'end_role_override_2'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'still-safe\' WHERE "id" = 1'},
+        {"sql": "SELECT 'end_role_override_1'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'not-safe\' WHERE "id" = 1'},
+    ]
+    with pytest.raises(AssertionError):
+        validate_protected_queries(queries)
+
+
+def test_validate_protected_queries__fenced_and_not():
+    queries = [
+        {"sql": 'SAVEPOINT "s123abc"'},
+        {"sql": 'UPDATE "sentry_useremail" SET "is_verified" = true WHERE "id" = 1'},
+        {"sql": "SELECT 'start_role_override_1'"},
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'oops\' WHERE "id" = 1'},
+        {"sql": "SELECT 'end_role_override_1'"},
+        {"sql": 'UPDATE "sentry_team" SET "slug" = \'frontend\' WHERE "id" = 3'},
+        # This query is lacking fences
+        {"sql": 'UPDATE "sentry_organization" SET "slug" = \'another-oops\' WHERE "id" = 1'},
+    ]
+    with pytest.raises(AssertionError):
+        validate_protected_queries(queries)