
fix(migrations): Add a feature to force a lock timeout to be set on all migration operations (#79052)

We had an incident where a migration couldn't acquire a lock to modify
`sentry_groupedmessage` and took down production. This shouldn't have
happened, since we rely on django-zero-downtime-migrations to set a lock
acquisition timeout on all operations.

Unfortunately, it doesn't set one on `RunSQL` or `RunPython` ops (the
`run_sql_app` fixture below is a concrete example). There are other
operations that it considers safe and therefore skips, but those aren't
an issue.

This PR adds `ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT_FORCE`, which, when
set to `True`, guarantees that we always set the lock timeout for all
operations. In cases where we would previously have set both a lock and
a statement timeout, the result is unchanged.
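
For illustration, enabling it looks like this in Django settings (the `5s`
values are placeholders borrowed from the tests below, not our production
configuration):

```python
# settings.py: illustrative values only.
ZERO_DOWNTIME_MIGRATIONS_RAISE_FOR_UNSAFE = True
ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT = "5s"
ZERO_DOWNTIME_MIGRATIONS_STATEMENT_TIMEOUT = "5s"
# New: every operation, including RunSQL/RunPython, now runs with
# lock_timeout set; statement_timeout is still applied only where
# dzdm already applied it.
ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT_FORCE = True
```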

We specifically don't want to set a statement timeout on all operations:
some, like adding an index, can legitimately take a long time, so we
don't want to force one in every case.
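
As a concrete (hypothetical) sketch, reusing the fixture app names from
this PR for illustration, an index build is exactly the kind of operation
that needs a lock timeout but must not be cut short by a statement
timeout:

```python
from django.contrib.postgres.operations import AddIndexConcurrently
from django.db import migrations, models


class Migration(migrations.Migration):
    # CREATE INDEX CONCURRENTLY cannot run inside a transaction.
    atomic = False

    dependencies = [("run_sql_app", "0001_initial")]

    operations = [
        # On a large table this can run for hours, so it should only be
        # bounded by lock_timeout (the lock is held briefly), never by a
        # forced statement_timeout.
        AddIndexConcurrently(
            model_name="testtable",
            index=models.Index(fields=["field"], name="testtable_field_idx"),
        ),
    ]
```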

This PR vendors a chunk of code from `dzdm`. Ideally we'd submit it
upstream, but I think we'd need to tidy things up a little to do so.

Dan Fuller · 4 months ago · commit 7d964ec858

+ 0 - 0
fixtures/safe_migrations_apps/run_sql_app/__init__.py


+ 30 - 0
fixtures/safe_migrations_apps/run_sql_app/migrations/0001_initial.py

@@ -0,0 +1,30 @@
+# Generated by Django 3.1 on 2019-09-22 21:47
+
+from django.db import migrations, models
+
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+
+    initial = True
+
+    dependencies = []
+
+    operations = [
+        migrations.CreateModel(
+            name="TestTable",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("field", models.IntegerField(null=True)),
+            ],
+        ),
+    ]

+ 23 - 0
fixtures/safe_migrations_apps/run_sql_app/migrations/0002_run_sql.py

@@ -0,0 +1,23 @@
+from django.db import migrations
+
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+
+    dependencies = [
+        ("run_sql_app", "0001_initial"),
+    ]
+
+    operations = [
+        migrations.SeparateDatabaseAndState(
+            database_operations=[
+                migrations.RunSQL(
+                    """ALTER TABLE "run_sql_app_testtable" DROP COLUMN "field";""",
+                    reverse_sql="""ALTER TABLE "run_sql_app_testtable" ADD COLUMN "field" int NULL;""",
+                    hints={"tables": ["run_sql_app_testtable"]},
+                )
+            ],
+            state_operations=[migrations.RemoveField("testtable", "field")],
+        )
+    ]

+ 14 - 0
fixtures/safe_migrations_apps/run_sql_app/migrations/0003_add_col.py

@@ -0,0 +1,14 @@
+from django.db import migrations, models
+
+from sentry.new_migrations.migrations import CheckedMigration
+
+
+class Migration(CheckedMigration):
+
+    dependencies = [
+        ("run_sql_app", "0002_run_sql"),
+    ]
+
+    operations = [
+        migrations.AddField("testtable", "field", models.IntegerField(null=True)),
+    ]

+ 0 - 0
fixtures/safe_migrations_apps/run_sql_app/migrations/__init__.py


+ 5 - 0
fixtures/safe_migrations_apps/run_sql_app/models.py

@@ -0,0 +1,5 @@
+from django.db import models
+
+
+class TestTable(models.Model):
+    field = models.IntegerField(default=0)

+ 1 - 0
src/sentry/conf/server.py

@@ -3150,6 +3150,7 @@ PG_VERSION: str = os.getenv("PG_VERSION") or "14"
 ZERO_DOWNTIME_MIGRATIONS_RAISE_FOR_UNSAFE = True
 ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT = None
 ZERO_DOWNTIME_MIGRATIONS_STATEMENT_TIMEOUT = None
+ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT_FORCE = False
 
 if int(PG_VERSION.split(".", maxsplit=1)[0]) < 12:
     # In v0.6 of django-pg-zero-downtime-migrations this setting is deprecated for PostgreSQL v12+

+ 78 - 0
src/sentry/db/postgres/schema.py

@@ -1,10 +1,17 @@
+from contextlib import contextmanager
+
+from django.conf import settings
+from django.db.backends.ddl_references import Statement
 from django.db.backends.postgresql.schema import (
     DatabaseSchemaEditor as PostgresDatabaseSchemaEditor,
 )
 from django.db.models import Field
 from django.db.models.base import ModelBase
 from django_zero_downtime_migrations.backends.postgres.schema import (
+    DUMMY_SQL,
     DatabaseSchemaEditorMixin,
+    MultiStatementSQL,
+    PGLock,
     Unsafe,
     UnsafeOperationException,
 )
@@ -69,6 +76,12 @@ class SafePostgresDatabaseSchemaEditor(DatabaseSchemaEditorMixin, PostgresDataba
         PostgresDatabaseSchemaEditor.alter_db_tablespace
     )
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.LOCK_TIMEOUT_FORCE = getattr(
+            settings, "ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT_FORCE", False
+        )
+
     def alter_db_table(self, model, old_db_table, new_db_table):
         """
         This didn't work correctly in django_zero_downtime_migrations, so implementing here. This
@@ -97,6 +110,71 @@ class SafePostgresDatabaseSchemaEditor(DatabaseSchemaEditorMixin, PostgresDataba
             "More info here: https://develop.sentry.dev/database-migrations/#deleting-columns"
         )
 
+    def execute(self, sql, params=()):
+        if sql is DUMMY_SQL:
+            return
+        statements = []
+        if isinstance(sql, MultiStatementSQL):
+            statements.extend(sql)
+        elif isinstance(sql, Statement) and isinstance(sql.template, MultiStatementSQL):
+            statements.extend(Statement(s, **sql.parts) for s in sql.template)
+        else:
+            statements.append(sql)
+        for statement in statements:
+            if isinstance(statement, PGLock):
+                use_timeouts = statement.use_timeouts
+                disable_statement_timeout = statement.disable_statement_timeout
+                statement = statement.sql
+            elif isinstance(statement, Statement) and isinstance(statement.template, PGLock):
+                use_timeouts = statement.template.use_timeouts
+                disable_statement_timeout = statement.template.disable_statement_timeout
+                statement = Statement(statement.template.sql, **statement.parts)
+            else:
+                use_timeouts = False
+                disable_statement_timeout = False
+
+            if use_timeouts:
+                with self._set_operation_timeout(self.STATEMENT_TIMEOUT, self.LOCK_TIMEOUT):
+                    PostgresDatabaseSchemaEditor.execute(self, statement, params)
+            elif self.LOCK_TIMEOUT_FORCE:
+                with self._set_operation_timeout(lock_timeout=self.LOCK_TIMEOUT):
+                    PostgresDatabaseSchemaEditor.execute(self, statement, params)
+            elif disable_statement_timeout and self.FLEXIBLE_STATEMENT_TIMEOUT:
+                with self._set_operation_timeout(self.ZERO_TIMEOUT):
+                    PostgresDatabaseSchemaEditor.execute(self, statement, params)
+            else:
+                PostgresDatabaseSchemaEditor.execute(self, statement, params)
+
+    @contextmanager
+    def _set_operation_timeout(self, statement_timeout=None, lock_timeout=None):
+        if self.collect_sql:
+            previous_statement_timeout = self.ZERO_TIMEOUT
+            previous_lock_timeout = self.ZERO_TIMEOUT
+        else:
+            with self.connection.cursor() as cursor:
+                cursor.execute(self.sql_get_statement_timeout)
+                (previous_statement_timeout,) = cursor.fetchone()
+                cursor.execute(self.sql_get_lock_timeout)
+                (previous_lock_timeout,) = cursor.fetchone()
+        if statement_timeout is not None:
+            PostgresDatabaseSchemaEditor.execute(
+                self, self.sql_set_statement_timeout % {"statement_timeout": statement_timeout}
+            )
+        if lock_timeout is not None:
+            PostgresDatabaseSchemaEditor.execute(
+                self, self.sql_set_lock_timeout % {"lock_timeout": lock_timeout}
+            )
+        yield
+        if statement_timeout is not None:
+            PostgresDatabaseSchemaEditor.execute(
+                self,
+                self.sql_set_statement_timeout % {"statement_timeout": previous_statement_timeout},
+            )
+        if lock_timeout is not None:
+            PostgresDatabaseSchemaEditor.execute(
+                self, self.sql_set_lock_timeout % {"lock_timeout": previous_lock_timeout}
+            )
+
 
 class DatabaseSchemaEditorProxy:
     """

+ 74 - 10
tests/sentry/db/postgres/schema/safe_migrations/integration/test_migrations.py

@@ -7,6 +7,25 @@ from django_zero_downtime_migrations.backends.postgres.schema import UnsafeOpera
 from sentry.testutils.cases import TestCase
 
 
+def one_line_sql(sql: str) -> str:
+    return (
+        sql.replace("    ", "")
+        .replace("\n", " ")
+        .replace("( ", "(")
+        .replace(" )", ")")
+        .replace("  ", " ")
+        .strip()
+    )
+
+
+def split_sql_queries(sql: str) -> list[str]:
+    return [
+        line
+        for line in [one_line_sql(line) for line in sql.splitlines()]
+        if line and not line.startswith("--")
+    ]
+
+
 class BaseSafeMigrationTest(TestCase):
     BASE_PATH = "fixtures.safe_migrations_apps"
     # abstract
@@ -15,22 +34,28 @@ class BaseSafeMigrationTest(TestCase):
     migrate_to: str
 
     def run_migration(self):
+        self._run_migration(self.app, self.migrate_from)
+        self._run_migration(self.app, self.migrate_to)
+
+    def _run_migration(self, app, migration):
         with override_settings(
             INSTALLED_APPS=(f"{self.BASE_PATH}.{self.app}",), MIGRATION_MODULES={}
         ):
-            migrate_from = [(self.app, self.migrate_from)]
-            migrate_to = [(self.app, self.migrate_to)]
             executor = MigrationExecutor(connection)
-            executor.loader.project_state(migrate_from).apps
-
-            # Reverse to the original migration
-            executor.migrate(migrate_from)
+            executor.loader.build_graph()
+            target = [(app, migration)]
+            executor.loader.project_state(target).apps
+            executor.migrate(target)
 
-            # Run the migration to test
+    def sql_migrate(self, app_label, migration_name):
+        with override_settings(
+            INSTALLED_APPS=(f"{self.BASE_PATH}.{self.app}",), MIGRATION_MODULES={}
+        ):
+            target = (app_label, migration_name)
             executor = MigrationExecutor(connection)
-            executor.loader.build_graph()  # reload.
-            executor.migrate(migrate_to)
-            executor.loader.project_state(migrate_to).apps
+            plan = [(executor.loader.graph.nodes[target], None)]
+            sql_statements = executor.loader.collect_sql(plan)  # type: ignore[attr-defined]
+            return "\n".join(sql_statements)
 
 
 class AddColWithDefaultTest(BaseSafeMigrationTest):
@@ -155,3 +180,42 @@ class DeleteModelCorrectTest(BaseSafeMigrationTest):
 
     def test(self):
         self.run_migration()
+
+
+class LockTimeoutTest(BaseSafeMigrationTest):
+    app = "run_sql_app"
+
+    def test(self):
+        with override_settings(
+            ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT="5s",
+            ZERO_DOWNTIME_MIGRATIONS_STATEMENT_TIMEOUT="5s",
+            ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT_FORCE=True,
+        ):
+            migration_sql = self.sql_migrate(self.app, "0001_initial")
+            # We'd never block while attempting to acquire a lock when creating a table, but since we set
+            # `ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT_FORCE` we should add a lock_timeout here anyway
+            assert split_sql_queries(migration_sql) == [
+                "SET lock_timeout TO '5s';",
+                'CREATE TABLE "run_sql_app_testtable" ("id" integer NOT NULL PRIMARY KEY '
+                'GENERATED BY DEFAULT AS IDENTITY, "field" integer NULL);',
+                "SET lock_timeout TO '0ms';",
+            ]
+            self._run_migration(self.app, "0001_initial")
+            migration_sql = self.sql_migrate(self.app, "0002_run_sql")
+            # The runsql operation should just have the lock timeout set, since it's relying on
+            # `ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT_FORCE`
+            assert split_sql_queries(migration_sql) == [
+                "SET lock_timeout TO '5s';",
+                'ALTER TABLE "run_sql_app_testtable" DROP COLUMN "field";',
+                "SET lock_timeout TO '0ms';",
+            ]
+            self._run_migration(self.app, "0002_run_sql")
+            migration_sql = self.sql_migrate(self.app, "0003_add_col")
+            # This should set the statement timeout since it's an operation that dzdm handles
+            assert split_sql_queries(migration_sql) == [
+                "SET statement_timeout TO '5s';",
+                "SET lock_timeout TO '5s';",
+                'ALTER TABLE "run_sql_app_testtable" ADD COLUMN "field" integer NULL;',
+                "SET statement_timeout TO '0ms';",
+                "SET lock_timeout TO '0ms';",
+            ]