Browse Source

feat(cdc) Backfill script for cdc tables (#25661)

This script backfills the clickhouse tables from a Postgres snapshot.
This will mostly be used in self-hosted and production environments, but it may be useful in dev as well, so it is being added to the sentry codebase.

It expects Postgres and ClickHouse to be up and running.
Then it runs the snapshot-taking process in a container created from the cdc image.
It saves the snapshot in a directory under /tmp.
Then, for each storage, it loads the snapshot into Snuba.
Filippo Pacifici 3 years ago
parent
commit
512e84928f
2 changed files with 97 additions and 0 deletions
  1. 40 0
      config/cdc/cdc-snapshot-config.yaml
  2. 57 0
      scripts/backfill_cdc.sh

+ 40 - 0
config/cdc/cdc-snapshot-config.yaml

@@ -0,0 +1,40 @@
+# Snapshot configuration handed to `cdc snapshot --snapshot-config` by
+# scripts/backfill_cdc.sh.
+version: 1
+
+product: snuba
+
+# Snapshots are written to this directory; scripts/backfill_cdc.sh bind-mounts
+# the same host path into the cdc container.
+destination:
+    type: directory
+    options:
+        location: '/tmp/cdc-snapshots/'
+
+# Tables included in the snapshot, with the columns listed for each.
+# Datetime columns carry a formatter with second precision.
+tables:
+    -   table: sentry_groupedmessage
+        zip: True
+        columns:
+            -   name: 'project_id'
+            -   name: 'id'
+            -   name: 'status'
+            -   name: 'last_seen'
+                formatter:
+                    type: 'datetime'
+                    precision: 'second'
+            -   name: 'first_seen'
+                formatter:
+                    type: 'datetime'
+                    precision: 'second'
+            -   name: 'active_at'
+                formatter:
+                    type: 'datetime'
+                    precision: 'second'
+            -   name: 'first_release_id'
+    # NOTE(review): "groupasignee" is spelled with a single "s"; presumably this
+    # matches the actual Postgres table name - confirm before "fixing" it.
+    -   table: sentry_groupasignee
+        zip: True
+        columns:
+            -   name: 'project_id'
+            -   name: 'group_id'
+            -   name: 'date_added'
+                formatter:
+                    type: 'datetime'
+                    precision: 'second'
+            -   name: 'user_id'
+            -   name: 'team_id'

+ 57 - 0
scripts/backfill_cdc.sh

@@ -0,0 +1,57 @@
+#!/bin/bash
+# Backfills all the CDC tables
+set -e
+
+declare -a STORAGES=("groupedmessages"  "groupassignees")
+
+# Prints a message to stdout in green.
+# Arguments: $1 - the message text
+log_message() {
+    # Function-local so the colour codes do not leak into the script's globals.
+    local -r green='\033[0;32m'
+    local -r reset='\033[0m'
+
+    # %b expands the escape sequences of the colour codes only; %s prints the
+    # message verbatim, so backslashes in it are not interpreted.
+    printf '%b%s%b\n' "$green" "$1" "$reset"
+}
+
+mkdir -p /tmp/cdc-snapshots/
+
+log_message "********* Taking the snapshot from Postgres *********"
+
+cd "$(dirname "$0")"
+
+docker run \
+-v "$(pwd)"/../config/cdc/configuration.yaml:/etc/cdc/configuration.yaml \
+-v "$(pwd)"/../config/cdc/cdc-snapshot-config.yaml:/etc/cdc/cdc-snapshot-config.yaml \
+-v /tmp/cdc-snapshots:/tmp/cdc-snapshots \
+--rm \
+--network sentry \
+getsentry/cdc:nightly \
+cdc -c /etc/cdc/configuration.yaml \
+snapshot --snapshot-config /etc/cdc/cdc-snapshot-config.yaml \
+2>&1 | tee /tmp/cdc-snapshots/snapshot.log
+
+SNAPSHOT_ID=$(awk '{ if($4=="Starting" && $5=="snapshot" && $6=="ID") print $7}' /tmp/cdc-snapshots/snapshot.log )
+SNAPSHOT_PATH="/tmp/cdc-snapshots/cdc_snapshot_snuba_$SNAPSHOT_ID"
+rm /tmp/cdc-snapshots/snapshot.log
+
+
+log_message "********* Loading the snapshot into Snuba *********"
+
+for i in "${!STORAGES[@]}";
+do
+    log_message "********* Loading ${STORAGES[$i]}"
+
+    docker run \
+    -v "$SNAPSHOT_PATH"/:/tmp/cdc-snapshot \
+    --rm \
+    --network sentry \
+    -e SNUBA_SETTINGS=docker \
+    -e CLICKHOUSE_HOST=sentry_clickhouse \
+    getsentry/snuba:nightly \
+    snuba bulk-load --storage="${STORAGES[$i]}" \
+    --source=/tmp/cdc-snapshot \
+    --ignore-existing-data \
+    --pre-processed \
+    --show-progress
+done
+
+log_message "********* Done *********"
+echo "You can now remove the snapshot from $SNAPSHOT_PATH"