
Commit 4dcad26

Add Delta Lake example (#17)
1 parent 14955dd commit 4dcad26

File tree

10 files changed: +643 -0 lines changed

README.md

Lines changed: 1 addition & 0 deletions

@@ -36,6 +36,7 @@ This repository contains examples of use cases that utilize Decodable streaming
 |[Array Aggregation](array-agg)| Using the `array_agg()` UDF for denormalizing data in a pipeline from MySQL to OpenSearch |
 |[Kafka with ngrok](kafka-ngrok)| Docker Compose for running Apache Kafka locally, accessible from the internet using ngrok|
 |[PyFlink on Decodable](pyflink-decodable)| Running a PyFlink job as a Custom Pipeline on Decodable|
+|[Delta Lake / Flink](flink-delta-lake)| Writing to Delta Lake with Apache Flink |


 ## License

flink-delta-lake/delta-flink.sql

Lines changed: 9 additions & 0 deletions

CREATE CATALOG c_delta WITH ( 'type' = 'delta-catalog', 'catalog-type' = 'in-memory');

CREATE DATABASE c_delta.db_new;

CREATE TABLE c_delta.db_new.t_foo (c1 varchar, c2 int) WITH ( 'connector' = 'delta', 'table-path' = 's3a://warehouse/t_foo');

INSERT INTO c_delta.db_new.t_foo
SELECT name, 42
FROM (VALUES ('Never'), ('Gonna'), ('Give'), ('You'), ('Up')) AS NameTable(name);
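
Once the INSERT job has finished, the rows can be read back through the same catalog to confirm the write. A minimal sketch, run in the same Flink SQL Client session (it assumes the catalog and table created above still exist):

-- Read the Delta table back via the catalog defined above (sketch)
SELECT c1, c2 FROM c_delta.db_new.t_foo;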

flink-delta-lake/docker-compose.yml

Lines changed: 130 additions & 0 deletions

services:
  jobmanager:
    build: ./flink
    hostname: jobmanager
    container_name: jobmanager
    ports:
      - "8081:8081"
    command: jobmanager
    volumes:
      - .:/data/
    environment:
      - |
        FLINK_PROPERTIES=
        flink.hadoop.fs.s3a.access.key: admin
        flink.hadoop.fs.s3a.secret.key: password
        flink.hadoop.fs.s3a.endpoint: http://minio:9000
        flink.hadoop.fs.s3a.path.style.access: true
        fs.s3a.access.key: admin
        fs.s3a.secret.key: password
        fs.s3a.endpoint: http://minio:9000
        fs.s3a.path.style.access: true
        jobmanager.rpc.address: jobmanager
        rest.flamegraph.enabled: true

  taskmanager:
    build: ./flink
    hostname: taskmanager
    depends_on:
      - jobmanager
    command: taskmanager
    deploy:
      replicas: 2
    environment:
      - |
        FLINK_PROPERTIES=
        flink.hadoop.fs.s3a.access.key: admin
        flink.hadoop.fs.s3a.secret.key: password
        flink.hadoop.fs.s3a.endpoint: http://minio:9000
        flink.hadoop.fs.s3a.path.style.access: true
        fs.s3a.access.key: admin
        fs.s3a.secret.key: password
        fs.s3a.endpoint: http://minio:9000
        fs.s3a.path.style.access: true
        jobmanager.rpc.address: jobmanager
        taskmanager.numberOfTaskSlots: 4

  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.1
    container_name: zookeeper
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000

  kafka:
    image: confluentinc/cp-kafka:7.5.1
    container_name: broker
    depends_on:
      - zookeeper
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_LISTENERS: DOCKER://broker:29092, LOCALHOST://localhost:9092
      KAFKA_ADVERTISED_LISTENERS: DOCKER://broker:29092, LOCALHOST://localhost:9092
      KAFKA_INTER_BROKER_LISTENER_NAME: DOCKER
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: DOCKER:PLAINTEXT,LOCALHOST:PLAINTEXT
      KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
    ports:
      - 9092:9092

  minio:
    image: minio/minio
    container_name: minio
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=password
    ports:
      - 9001:9001
      - 9000:9000
    command: ["server", "/data", "--console-address", ":9001"]

  mc:
    depends_on:
      - minio
    image: minio/mc
    container_name: mc
    entrypoint: >
      /bin/sh -c "
      until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
      /usr/bin/mc rm -r --force minio/warehouse;
      /usr/bin/mc mb minio/warehouse;
      tail -f /dev/null
      "

  hive-metastore:
    container_name: hms
    build: ./hms-standalone-s3
    ports:
      - "9083:9083"
    environment:
      - HMS_LOGLEVEL=INFO

  duckdb:
    image: davidgasquez/duckdb
    container_name: duckdb
    restart: no
    entrypoint: tail -f /dev/null

  kcat:
    image: edenhill/kcat:1.7.1
    container_name: kcat
    restart: no
    entrypoint: tail -f /dev/null

  shadowtraffic:
    # watch 'docker exec shadowtraffic curl -s localhost:9400/metrics |grep events_sent'
    image: shadowtraffic/shadowtraffic:0.6.0
    container_name: shadowtraffic
    # profiles: ["shadowtraffic"]
    env_file:
      - shadowtraffic/license.env
    volumes:
      - ./shadowtraffic:/data
    command: --config /data/kafka-retail.json

# Without a network explicitly defined, you hit this Hive/Thrift error
# java.net.URISyntaxException Illegal character in hostname
# https://github.com/TrivadisPF/platys-modern-data-platform/issues/231
networks:
  default:
    name: zaphod
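
To run the example end to end, bring the stack up with Docker Compose and submit the SQL script from the job manager container. A minimal sketch, assuming the service names above and the .:/data/ bind mount (the Flink image built from ./flink is shown below):

# Build the custom Flink and Hive Metastore images and start the stack (sketch)
docker compose up --build -d

# Open the Flink SQL Client on the job manager and run the example script,
# which is visible inside the container through the .:/data/ volume
docker compose exec jobmanager ./bin/sql-client.sh -f /data/delta-flink.sql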

flink-delta-lake/flink/Dockerfile

Lines changed: 100 additions & 0 deletions

FROM apache/flink:1.18.1-scala_2.12-java11
SHELL ["/bin/bash", "-c"]

# Install some useful tools
RUN apt-get update && \
    apt-get install -y neovim tree lnav unzip && \
    apt-get purge -y --auto-remove && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-amd64.zip \
    && unzip duckdb_cli-linux-amd64.zip -d /usr/local/bin \
    && rm duckdb_cli-linux-amd64.zip

USER flink
WORKDIR /opt/flink

COPY --chown=flink conf/hive-site.xml ./conf/hive-site.xml
# COPY --chown=flink conf/log4j.properties ./conf/log4j-console.properties

# Enable SQL Client to find the job manager when running it from this image
RUN sed -i "s/jobmanager.rpc.address: localhost/jobmanager.rpc.address: flink-jobmanager/g" ./conf/flink-conf.yaml

# # Enable this for debug logging
# RUN cat >> ./conf/log4j.properties <<EOF
# logger.fs.name = org.apache.hadoop.fs
# logger.fs.level = TRACE
# logger.fs2.name = org.apache.flink.fs
# logger.fs2.level = TRACE
# logger.aws.name = com.amazonaws
# logger.aws.level = TRACE
# logger.delta.name = io.delta
# logger.delta.level =TRACE
# EOF

# RUN cat >> ./conf/log4j-cli.properties <<EOF
# logger.fs.name = org.apache.hadoop.fs
# logger.fs.level = TRACE
# logger.fs2.name = org.apache.flink.fs
# logger.fs2.level = TRACE
# logger.aws.name = com.amazonaws
# logger.aws.level = TRACE
# logger.delta.name = io.delta
# logger.delta.level =TRACE
# EOF

# Install JARs
# Create necessary directories
RUN mkdir -p ./lib/delta ./lib/kafka ./lib/hive ./lib/hadoop

RUN echo "Add Flink S3 Plugin" && \
    mkdir ./plugins/s3-fs-hadoop && \
    cp ./opt/flink-s3-fs-hadoop-1.18.1.jar ./plugins/s3-fs-hadoop/

# Download and Install JARs

RUN echo "-> Install JARs: Flink's Kafka connector" && \
    mkdir -p ./lib/kafka && pushd $_ && \
    curl https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.1.0-1.18/flink-sql-connector-kafka-3.1.0-1.18.jar -O && \
    popd

RUN echo "-> Install JARs: Flink's Hive connector" && \
    mkdir -p ./lib/hive && pushd $_ && \
    curl https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.3_2.12/1.18.1/flink-sql-connector-hive-3.1.3_2.12-1.18.1.jar -O && \
    popd

RUN echo "-> Install JARs: Dependencies for Delta Lake" && \
    mkdir -p ./lib/delta && pushd $_ && \
    curl https://repo1.maven.org/maven2/io/delta/delta-flink/3.2.0/delta-flink-3.2.0.jar -O && \
    curl https://repo1.maven.org/maven2/io/delta/delta-standalone_2.12/3.2.0/delta-standalone_2.12-3.2.0.jar -O && \
    curl https://repo1.maven.org/maven2/io/delta/delta-storage/3.2.0/delta-storage-3.2.0.jar -O && \
    curl https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-parquet/1.18.1/flink-sql-parquet-1.18.1.jar -O && \
    curl https://repo1.maven.org/maven2/com/chuusai/shapeless_2.12/2.3.4/shapeless_2.12-2.3.4.jar -O && \
    popd

RUN echo "-> Install JARs: AWS / Hadoop S3" && \
    mkdir -p ./lib/aws && pushd $_ && \
    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -O && \
    curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.648/aws-java-sdk-bundle-1.12.648.jar -O && \
    popd

RUN echo "-> Install JARs: Hadoop" && \
    mkdir -p ./lib/hadoop && pushd $_ && \
    curl https://repo1.maven.org/maven2/com/google/guava/guava/27.0-jre/guava-27.0-jre.jar -O && \
    curl https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.12.7/jackson-databind-2.12.7.jar -O && \
    curl https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.17.1/jackson-core-2.17.1.jar -O && \
    curl https://repo1.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.12.7/jackson-annotations-2.12.7.jar -O && \
    curl https://repo1.maven.org/maven2/org/apache/commons/commons-configuration2/2.1.1/commons-configuration2-2.1.1.jar -O && \
    curl https://repo1.maven.org/maven2/commons-logging/commons-logging/1.1.3/commons-logging-1.1.3.jar -O && \
    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-auth/3.3.4/hadoop-auth-3.3.4.jar -O && \
    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/3.3.4/hadoop-common-3.3.4.jar -O && \
    curl https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-guava/1.1.1/hadoop-shaded-guava-1.1.1.jar -O && \
    curl https://repo1.maven.org/maven2/org/codehaus/woodstox/stax2-api/4.2.1/stax2-api-4.2.1.jar -O && \
    curl https://repo1.maven.org/maven2/com/fasterxml/woodstox/woodstox-core/5.3.0/woodstox-core-5.3.0.jar -O && \
    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-hdfs-client/3.3.4/hadoop-hdfs-client-3.3.4.jar -O && \
    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-core/3.3.4/hadoop-mapreduce-client-core-3.3.4.jar -O && \
    popd

# Set the launch command
CMD ./bin/start-cluster.sh && sleep infinity
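
A quick way to confirm that the Delta, Hive, and S3 dependencies ended up where the SQL Client will load them from is to list the library and plugin directories inside a running container. A minimal sketch, assuming the Compose stack above is running (paths are relative to the image's /opt/flink working directory):

# List the connector JARs baked into the image (sketch)
docker compose exec jobmanager ls lib/delta lib/hive lib/kafka lib/aws lib/hadoop plugins/s3-fs-hadoop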
flink-delta-lake/flink/conf/hive-site.xml

Lines changed: 34 additions & 0 deletions
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>hive.metastore.local</name>
        <value>false</value>
    </property>

    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hms:9083</value>
    </property>

    <!-- <property>
        <name>fs.s3a.access.key</name>
        <value>admin</value>
    </property>

    <property>
        <name>fs.s3a.secret.key</name>
        <value>password</value>
    </property>

    <property>
        <name>fs.s3a.endpoint</name>
        <value>http://minio:9000</value>
    </property>

    <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
    </property> -->

</configuration>
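
This file sits on the Flink classpath (the Dockerfile above copies it into ./conf) and points Flink at the standalone Hive Metastore. With it in place, the Delta catalog can also keep its metadata in the metastore rather than in memory, so table definitions survive across SQL Client sessions. A hedged sketch of that alternative catalog definition (the example script above uses 'in-memory'):

-- Alternative: back the Delta catalog with the Hive Metastore (sketch; assumes
-- the hms container from the Compose file is reachable at thrift://hms:9083)
CREATE CATALOG c_delta_hms WITH ( 'type' = 'delta-catalog', 'catalog-type' = 'hive');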
flink-delta-lake/hms-standalone-s3/Dockerfile

Lines changed: 14 additions & 0 deletions
FROM hms-standalone

RUN apt-get update && apt-get install -y curl rlwrap vim

RUN cd /opt/hive-metastore/lib && \
    curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -O && \
    curl https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.20.18/bundle-2.20.18.jar -O && \
    curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.648/aws-java-sdk-bundle-1.12.648.jar -O

COPY conf/hive-site.xml /opt/hive-metastore/conf/hive-site.xml

RUN cd ~ && \
    curl https://archive.apache.org/dist/db/derby/db-derby-10.14.2.0/db-derby-10.14.2.0-bin.tar.gz -o db-derby-10.14.2.0-bin.tar.gz && \
    tar xf db-derby-10.14.2.0-bin.tar.gz
flink-delta-lake/hms-standalone-s3/conf/hive-site.xml

Lines changed: 65 additions & 0 deletions
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<!-- These are default values meant to allow easy smoke testing of the metastore. You will
     likely need to add a number of new values. -->
<configuration>
    <property>
        <name>metastore.thrift.uris</name>
        <value>thrift://hms:9083</value>
    </property>
    <!-- Add Materialization stuff for standalone metastore -->
    <property>
        <name>metastore.task.threads.always</name>
        <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
    </property>
    <property>
        <name>metastore.expression.proxy</name>
        <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
    </property>

    <!-- Derby embedded DB-->
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:derby:;databaseName=/tmp/metastore_db;create=true</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>org.apache.derby.jdbc.EmbeddedDriver</value>
    </property>

    <property>
        <name>fs.s3a.access.key</name>
        <value>admin</value>
    </property>

    <property>
        <name>fs.s3a.secret.key</name>
        <value>password</value>
    </property>

    <property>
        <name>fs.s3a.endpoint</name>
        <value>http://minio:9000</value>
    </property>

    <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
    </property>

</configuration>
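
After the INSERT job has run, the files that Flink wrote can be inspected directly in MinIO from the mc container, whose minio alias is configured by the entrypoint in the Compose file. A minimal sketch:

# List the Delta table's Parquet data files and _delta_log transaction log (sketch)
docker compose exec mc mc ls -r minio/warehouse/t_foo/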
