Skip to content

Commit 2a1b78f

Browse files
authored
Merge pull request #1 from HENNGE/dev
Implement RDS snapshot sanitizer
2 parents c3074a0 + 97ecb38 commit 2a1b78f

File tree

12 files changed

+1797
-0
lines changed

12 files changed

+1797
-0
lines changed

.github/workflows/workflow.yml

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
name: Create and publish a Docker image
2+
3+
on:
4+
release:
5+
types: [published]
6+
push:
7+
branches: dev
8+
9+
env:
10+
REGISTRY: ghcr.io
11+
12+
jobs:
13+
build:
14+
strategy:
15+
fail-fast: false
16+
matrix:
17+
include: [
18+
{platform: 'linux/amd64', runner: 'ubuntu-24.04'},
19+
{platform: 'linux/arm64', runner: 'ubuntu-24.04-arm'},
20+
]
21+
runs-on: ${{ matrix.runner }}
22+
steps:
23+
- name: Prepare
24+
run: |
25+
platform=${{ matrix.platform }}
26+
echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
27+
echo "IMAGE_NAME=${GITHUB_REPOSITORY@L}" >> $GITHUB_ENV
28+
29+
- name: Log in to the Container registry
30+
uses: docker/login-action@v3
31+
with:
32+
registry: ${{ env.REGISTRY }}
33+
username: ${{ github.actor }}
34+
password: ${{ secrets.GITHUB_TOKEN }}
35+
36+
- name: Set up Docker Buildx
37+
uses: docker/setup-buildx-action@v3
38+
39+
- name: Docker meta
40+
id: meta
41+
uses: docker/metadata-action@v5
42+
with:
43+
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
44+
45+
- name: Build and push by digest
46+
id: build
47+
uses: docker/build-push-action@v6
48+
with:
49+
platforms: ${{ matrix.platform }}
50+
labels: ${{ steps.meta.outputs.labels }}
51+
tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
52+
outputs: type=image,push-by-digest=true,name-canonical=true,push=true
53+
54+
- name: Export digest
55+
run: |
56+
mkdir -p ${{ runner.temp }}/digests
57+
digest="${{ steps.build.outputs.digest }}"
58+
touch "${{ runner.temp }}/digests/${digest#sha256:}"
59+
60+
- name: Upload digest
61+
uses: actions/upload-artifact@v4
62+
with:
63+
name: digests-${{ env.PLATFORM_PAIR }}
64+
path: ${{ runner.temp }}/digests/*
65+
if-no-files-found: error
66+
retention-days: 1
67+
68+
merge:
69+
runs-on: ubuntu-24.04
70+
needs:
71+
- build
72+
steps:
73+
- name: Prepare
74+
run: |
75+
echo "IMAGE_NAME=${GITHUB_REPOSITORY@L}" >> $GITHUB_ENV
76+
77+
- name: Download digests
78+
uses: actions/download-artifact@v4
79+
with:
80+
path: ${{ runner.temp }}/digests
81+
pattern: digests-*
82+
merge-multiple: true
83+
84+
- name: Log in to the Container registry
85+
uses: docker/login-action@v3
86+
with:
87+
registry: ${{ env.REGISTRY }}
88+
username: ${{ github.actor }}
89+
password: ${{ secrets.GITHUB_TOKEN }}
90+
91+
- name: Set up Docker Buildx
92+
uses: docker/setup-buildx-action@v3
93+
94+
- name: Docker meta
95+
id: meta
96+
uses: docker/metadata-action@v5
97+
with:
98+
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
99+
tags: |
100+
type=ref,event=branch
101+
type=ref,event=pr
102+
type=semver,pattern={{version}}
103+
type=semver,pattern={{major}}.{{minor}}
104+
105+
- name: Create manifest list and push
106+
working-directory: ${{ runner.temp }}/digests
107+
run: |
108+
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
109+
$(printf '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}@sha256:%s ' *)
110+
111+
- name: Inspect image
112+
run: |
113+
docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.meta.outputs.version }}

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.venv
2+
.envrc
3+
4+
__pycache__/
5+
hardcode.py

.pre-commit-config.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
repos:
2+
- repo: https://github.com/charliermarsh/ruff-pre-commit
3+
rev: v0.11.0
4+
hooks:
5+
- id: ruff
6+
args: [--fix]
7+
- id: ruff-format

Dockerfile

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
FROM python:3.13.2-alpine
2+
3+
WORKDIR /app
4+
5+
COPY poetry.lock pyproject.toml /app/
6+
COPY src/ /app/src
7+
8+
RUN apk update \
9+
&& apk upgrade --no-cache \
10+
&& apk add --no-cache --virtual build-dependencies build-base curl \
11+
&& pip install --no-cache-dir --upgrade pip setuptools wheel \
12+
&& curl -sSL https://install.python-poetry.org | POETRY_HOME=/etc/poetry python - \
13+
&& ln -s /etc/poetry/bin/poetry /usr/local/bin/poetry \
14+
&& poetry run pip install --upgrade pip setuptools wheel \
15+
&& MAKEFLAGS="-j" poetry install \
16+
&& poetry run python -m compileall -j 0 src \
17+
&& rm -rf /root/.cache/pip \
18+
&& rm -rf /root/.cache/pypoetry/artifacts /root/.cache/pypoetry/cache \
19+
&& rm -rf /etc/poetry/lib/poetry/_vendor/py3.13 \
20+
&& apk del --no-cache build-dependencies
21+
22+
ENTRYPOINT ["poetry", "run", "sanitizer"]

README.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,83 @@
11
# rds-snapshot-sanitizer
2+
3+
Create sanitized copies of RDS snapshots and share them with selected accounts.
4+
5+
It works by restoring an unsanitized snapshot to a temporary cluster and executing sanitizing SQL queries against it, after which a sanitized snapshot is created and optionally shared with other accounts.
6+
7+
# Environment variables
8+
- `SANITIZER_RDS_CLUSTER_ID`: RDS cluster identifier whose snapshots will be sanitized.
9+
- `SANITIZER_CONFIG`: rds-snapshot-sanitizer configuration in JSON. See [Configuration](#configuration).
10+
- `SANITIZER_RDS_INSTANCE_ACU`: (Optional) ACU to be allocated for the temporary RDS instance. Defaults to 2 ACU.
11+
- `SANITIZER_SQL_MAX_CONNECTIONS`: (Optional) Maximum number of connections to be created for executing the SQL queries. Defaults to 20.
12+
- `SANITIZER_SHARE_KMS_KEY_ID`: (Optional) KMS key identifier to be used for the sanitized snapshot.
13+
- `SANITIZER_SHARE_ACCOUNT_IDS`: (Optional) List of AWS account ids to share the sanitized snapshot with.
14+
- `SANITIZER_AWS_REGION`: (Optional) AWS region where the RDS cluster is hosted. Defaults to `AWS_REGION` or `AWS_DEFAULT_REGION` environment variable.
15+
- `SANITIZER_DELETE_OLD_SNAPSHOTS`: (Optional) Whether to delete old snapshots. Defaults to False.
16+
- `SANITIZER_OLD_SNAPSHOT_DAYS`: (Optional) Number of days for a snapshot to be considered old. Defaults to 30.
17+
18+
# Configuration
19+
The configuration is a JSON file with the following schema:
20+
- `"tables"`: list of table configuration
21+
- `"name"`: name of the table
22+
- `"columns"`: list of column configuration
23+
- `"name"`: name of the column
24+
- `"sanitizer"`: type of sanitizer to be used. There are two types provided, static and random.
25+
- `"type"`: `"static"`
26+
- `"value"`: a static string value to be used for replacement.
27+
28+
OR
29+
30+
- `"type"`: `"random"`
31+
- `"kind"`: `"name"`, `"first_name"`, `"last_name"`, `"user_name"`, `"email"`, `"phone_number"`, etc. See the full list of [random providers](https://faker.readthedocs.io/en/master/providers.html).
32+
- `"drop_constraints"`: list of table constraints to be dropped
33+
- `"drop_indexes"`: list of indexes to be dropped
34+
35+
Example:
36+
```json
37+
{
38+
"drop_indexes": ["users_tenant_id_email_key"],
39+
"tables": [
40+
{
41+
"name": "users",
42+
"columns": [
43+
{
44+
"name": "family_name",
45+
"sanitizer": {"type": "random", "kind": "last_name"}
46+
},
47+
{
48+
"name": "given_name",
49+
"sanitizer": {"type": "random", "kind": "first_name"}
50+
},
51+
{"name": "email", "sanitizer": {"type": "random", "kind": "email"}}
52+
]
53+
},
54+
{
55+
"name": "imported_users",
56+
"drop_constraints": ["imported_users_tenant_id_email_key"],
57+
"columns": [
58+
{
59+
"name": "family_name",
60+
"sanitizer": {"type": "static", "value": "doe"}
61+
},
62+
{
63+
"name": "given_name",
64+
"sanitizer": {"type": "static", "value": "john"}
65+
},
66+
{"name": "email", "sanitizer": {"type": "random", "kind": "email"}}
67+
]
68+
}
69+
]
70+
}
71+
```
72+
73+
# Running locally
74+
The tool is meant to be run inside the same network as the RDS subnet that contains the target cluster.
75+
76+
To run the tool locally (for debugging, etc), you need to:
77+
- Ensure that the temporary RDS cluster is accessible from localhost. See [port-forwarding with session manager](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-sessions-start.html#sessions-remote-port-forwarding).
78+
- Specify `--local` flag to set the RDS host target to localhost.
79+
80+
```bash
81+
poetry install
82+
poetry run sanitizer --local
83+
```

0 commit comments

Comments
 (0)