113 changes: 113 additions & 0 deletions .github/workflows/workflow.yml
@@ -0,0 +1,113 @@
```yaml
name: Create and publish a Docker image

on:
  release:
    types: [published]
  push:
    branches: dev

env:
  REGISTRY: ghcr.io

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        include: [
          {platform: 'linux/amd64', runner: 'ubuntu-24.04'},
          {platform: 'linux/arm64', runner: 'ubuntu-24.04-arm'},
        ]
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Prepare
        run: |
          platform=${{ matrix.platform }}
          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
          echo "IMAGE_NAME=${GITHUB_REPOSITORY@L}" >> $GITHUB_ENV

      - name: Log in to the Container registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      - name: Build and push by digest
        id: build
        uses: docker/build-push-action@v6
        with:
          platforms: ${{ matrix.platform }}
          labels: ${{ steps.meta.outputs.labels }}
          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          outputs: type=image,push-by-digest=true,name-canonical=true,push=true

      - name: Export digest
        run: |
          mkdir -p ${{ runner.temp }}/digests
          digest="${{ steps.build.outputs.digest }}"
          touch "${{ runner.temp }}/digests/${digest#sha256:}"

      - name: Upload digest
        uses: actions/upload-artifact@v4
        with:
          name: digests-${{ env.PLATFORM_PAIR }}
          path: ${{ runner.temp }}/digests/*
          if-no-files-found: error
          retention-days: 1

  merge:
    runs-on: ubuntu-24.04
    needs:
      - build
    steps:
      - name: Prepare
        run: |
          echo "IMAGE_NAME=${GITHUB_REPOSITORY@L}" >> $GITHUB_ENV

      - name: Download digests
        uses: actions/download-artifact@v4
        with:
          path: ${{ runner.temp }}/digests
          pattern: digests-*
          merge-multiple: true

      - name: Log in to the Container registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}

      - name: Create manifest list and push
        working-directory: ${{ runner.temp }}/digests
        run: |
          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
            $(printf '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}@sha256:%s ' *)

      - name: Inspect image
        run: |
          docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.meta.outputs.version }}
```
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
```
.venv
.envrc

__pycache__/
hardcode.py
```
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,7 @@
```yaml
repos:
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: v0.11.0
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
```
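
Not part of this diff, but the standard way to use these hooks locally is the usual pre-commit workflow:

```bash
pip install pre-commit      # or: pipx install pre-commit
pre-commit install          # register the git hook in .git/hooks
pre-commit run --all-files  # run ruff and ruff-format across the whole repo once
```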
22 changes: 22 additions & 0 deletions Dockerfile
@@ -0,0 +1,22 @@
```dockerfile
FROM python:3.13.2-alpine

WORKDIR /app

COPY poetry.lock pyproject.toml /app/
COPY src/ /app/src

RUN apk update \
    && apk upgrade --no-cache \
    && apk add --no-cache --virtual build-dependencies build-base curl \
    && pip install --no-cache-dir --upgrade pip setuptools wheel \
    && curl -sSL https://install.python-poetry.org | POETRY_HOME=/etc/poetry python - \
    && ln -s /etc/poetry/bin/poetry /usr/local/bin/poetry \
    && poetry run pip install --upgrade pip setuptools wheel \
    && MAKEFLAGS="-j" poetry install \
    && poetry run python -m compileall -j 0 src \
    && rm -rf /root/.cache/pip \
    && rm -rf /root/.cache/pypoetry/artifacts /root/.cache/pypoetry/cache \
    && rm -rf /etc/poetry/lib/poetry/_vendor/py3.13 \
    && apk del --no-cache build-dependencies

ENTRYPOINT ["poetry", "run", "sanitizer"]
```
82 changes: 82 additions & 0 deletions README.md
@@ -1 +1,83 @@
# rds-snapshot-sanitizer

Create sanitized copies of RDS snapshots and share them with selected accounts.

It works by restoring an unsanitized snapshot to a temporary cluster and running sanitizing SQL queries against it, after which a sanitized snapshot is created and optionally shared with other accounts.
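
Conceptually the flow resembles this manual AWS CLI sequence (a sketch with placeholder identifiers; the tool itself drives the AWS SDK and is not limited to these exact calls):

```bash
# Restore the unsanitized snapshot into a throwaway cluster
aws rds restore-db-cluster-from-snapshot \
    --db-cluster-identifier tmp-sanitize \
    --snapshot-identifier my-cluster-snapshot \
    --engine aurora-postgresql
# (an instance must also be added before the cluster accepts connections)

# Run the sanitizing SQL against the temporary cluster
psql -h tmp-sanitize.cluster-xxxx.us-east-1.rds.amazonaws.com -f sanitize.sql

# Snapshot the sanitized data and share it with another account
aws rds create-db-cluster-snapshot \
    --db-cluster-identifier tmp-sanitize \
    --db-cluster-snapshot-identifier my-cluster-snapshot-sanitized
aws rds modify-db-cluster-snapshot-attribute \
    --db-cluster-snapshot-identifier my-cluster-snapshot-sanitized \
    --attribute-name restore --values-to-add 123456789012

# Tear down the temporary cluster
aws rds delete-db-cluster --db-cluster-identifier tmp-sanitize --skip-final-snapshot
```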
**Review comment:** I think it's better if you provide a guide on how to run this locally (either with Docker or with Poetry).

**Review comment:** Also, I think it's useful to describe what IAM permissions are required for this.

**FurqanHabibi (Collaborator, Author) — Mar 24, 2025:**

> Also, I think it's useful to describe what IAM permissions are required for this.

Yes, I'm thinking of creating a pod-identity Terraform module for it in the style of https://github.com/terraform-aws-modules/terraform-aws-eks-pod-identity

> I think it's better if you provide a guide on how to run this locally (either with Docker or with Poetry).

The thing is, the tool needs to be connected to the RDS subnet to run the SQL queries. I'll probably add a flag to set the Postgres host to localhost, with the assumption that the user can connect their localhost to the RDS (via bastion port-forwarding, for example).

**Review comment:** I see, if it's expected to run within AWS that's fine too, just add a note describing it as such.

# Environment variables
- `SANITIZER_RDS_CLUSTER_ID`: RDS cluster identifier whose snapshots will be sanitized.
- `SANITIZER_CONFIG`: rds-snapshot-sanitizer configuration in JSON. See [Configuration](#configuration).
- `SANITIZER_RDS_INSTANCE_ACU`: (Optional) ACU to be allocated for the temporary RDS instance. Defaults to 2 ACU.
- `SANITIZER_SQL_MAX_CONNECTIONS`: (Optional) Maximum number of connections to create for executing the SQL queries. Defaults to 20.
- `SANITIZER_SHARE_KMS_KEY_ID`: (Optional) KMS key identifier to be used for the sanitized snapshot.
- `SANITIZER_SHARE_ACCOUNT_IDS`: (Optional) List of AWS account IDs to share the sanitized snapshot with.
- `SANITIZER_AWS_REGION`: (Optional) AWS region where the RDS cluster is hosted. Defaults to the `AWS_REGION` or `AWS_DEFAULT_REGION` environment variable.
- `SANITIZER_DELETE_OLD_SNAPSHOTS`: (Optional) Whether to delete old snapshots. Defaults to False.
- `SANITIZER_OLD_SNAPSHOT_DAYS`: (Optional) Number of days for a snapshot to be considered old. Defaults to 30.
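
For illustration, a minimal invocation might look like this (all values are placeholders, and the exact list format for `SANITIZER_SHARE_ACCOUNT_IDS` is an assumption, not specified above):

```bash
export SANITIZER_RDS_CLUSTER_ID="my-aurora-cluster"             # placeholder cluster id
export SANITIZER_CONFIG="$(cat sanitizer-config.json)"          # JSON config, see Configuration
export SANITIZER_SHARE_ACCOUNT_IDS="123456789012,210987654321"  # list format assumed
export SANITIZER_DELETE_OLD_SNAPSHOTS=True
poetry run sanitizer
```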

# Configuration
**Review comment:** Could you add a sample of the configuration?

**FurqanHabibi (Collaborator, Author):** Will add 🙏

The configuration is a JSON document with the following schema:
- `"tables"`: list of table configurations
  - `"name"`: name of the table
  - `"columns"`: list of column configurations
    - `"name"`: name of the column
    - `"sanitizer"`: the sanitizer to be used. There are two types provided, static and random:
      - static: `"type"`: `"static"` with `"value"`: a static string to be used as the replacement value.
      - random: `"type"`: `"random"` with `"kind"`: one of `"name"`, `"first_name"`, `"last_name"`, `"user_name"`, `"email"`, `"phone_number"`, etc. See the full list of [random providers](https://faker.readthedocs.io/en/master/providers.html).
  - `"drop_constraints"`: list of table constraints to be dropped
- `"drop_indexes"`: list of indexes to be dropped

Example:
```json
{
  "drop_indexes": ["users_tenant_id_email_key"],
  "tables": [
    {
      "name": "users",
      "columns": [
        {
          "name": "family_name",
          "sanitizer": {"type": "random", "kind": "last_name"}
        },
        {
          "name": "given_name",
          "sanitizer": {"type": "random", "kind": "first_name"}
        },
        {"name": "email", "sanitizer": {"type": "random", "kind": "email"}}
      ]
    },
    {
      "name": "imported_users",
      "drop_constraints": ["imported_users_tenant_id_email_key"],
      "columns": [
        {
          "name": "family_name",
          "sanitizer": {"type": "static", "value": "doe"}
        },
        {
          "name": "given_name",
          "sanitizer": {"type": "static", "value": "john"}
        },
        {"name": "email", "sanitizer": {"type": "random", "kind": "email"}}
      ]
    }
  ]
}
```

# Running locally
The tool is meant to be run inside the same network as the RDS subnet that contains the target cluster.

To run the tool locally (for debugging, etc.), you need to:
- Ensure that the temporary RDS cluster is accessible from localhost. See [port-forwarding with Session Manager](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-sessions-start.html#sessions-remote-port-forwarding); a sketch follows below.
- Specify the `--local` flag to set the RDS host target to localhost.

```bash
poetry install
poetry run sanitizer --local
```
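
For the port-forwarding prerequisite, an SSM session like the following can work (the instance ID and cluster endpoint are placeholders):

```bash
# Forward local port 5432 to the temporary cluster through an SSM-managed bastion
aws ssm start-session \
    --target i-0123456789abcdef0 \
    --document-name AWS-StartPortForwardingSessionToRemoteHost \
    --parameters '{"host":["tmp-cluster.cluster-xxxx.us-east-1.rds.amazonaws.com"],"portNumber":["5432"],"localPortNumber":["5432"]}'
```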