
Commit 48a9711

add launch script
1 parent 4d979c1 commit 48a9711

File tree

1 file changed: +228 -0 lines changed

rpc/launch_tpch_queries.py

Lines changed: 228 additions & 0 deletions
@@ -0,0 +1,228 @@
import argparse
import os
import random
import subprocess
import sys
import time
import numpy as np

from pathlib import Path

from workload import JobGraph
from utils import EventTime
from data.tpch_loader import make_release_policy


def map_dataset_to_deadline(dataset_size):
    # 50GB => 2 min, 100GB => 6 min, 250GB => 12 min, 500GB => 24 min
    mapping = {"50": 120, "100": 360, "250": 720, "500": 1440}
    return mapping.get(dataset_size, 120)  # Default to 120s if the dataset size is unrecognized


def launch_query(query_number, args):
    deadline = map_dataset_to_deadline(args.dataset_size)

    cmd = [
        f"{args.spark_mirror_path.resolve()}/bin/spark-submit",
        *("--deploy-mode", "cluster"),
        *("--master", "spark://130.207.125.81:7077"),
        *("--conf", "'spark.port.maxRetries=132'"),
        *("--conf", "'spark.eventLog.enabled=true'"),
        *("--conf", f"'spark.eventLog.dir={args.spark_eventlog_dir.resolve()}'"),
        *("--conf", "'spark.sql.adaptive.enabled=false'"),
        *("--conf", "'spark.sql.adaptive.coalescePartitions.enabled=false'"),
        *("--conf", "'spark.sql.autoBroadcastJoinThreshold=-1'"),
        *("--conf", "'spark.sql.shuffle.partitions=1'"),
        *("--conf", "'spark.sql.files.minPartitionNum=1'"),
        *("--conf", "'spark.sql.files.maxPartitionNum=1'"),
        *("--conf", f"'spark.app.deadline={deadline}'"),
        *("--class", "'main.scala.TpchQuery'"),
        f"{args.tpch_spark_path.resolve()}/target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar",
        f"{query_number}",
        f"{args.dataset_size}",
        f"{args.max_cores}",
    ]

    # print(
    #     f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Launching Query: {query_number}, "
    #     f"dataset: {args.dataset_size}GB, deadline: {deadline}s, maxCores: {args.max_cores}"
    # )

    try:
        # Join into a single string: the command is run through the shell, which
        # strips the single quotes wrapped around each --conf value.
        cmd = " ".join(cmd)
        print("Launching:", cmd)
        subprocess.Popen(
            cmd,
            shell=True,
        )
        print("Query launched successfully.")
    except Exception as e:
        print(f"Error launching query: {e}")


def generate_release_times(rng, args):
    if args.distribution == "periodic":
        release_policy_args = {
            "period": EventTime(args.period, EventTime.Unit.US),
        }
    elif args.distribution == "fixed":
        release_policy_args = {
            "period": EventTime(args.period, EventTime.Unit.US),
            "num_invocations": args.num_queries,
        }
    elif args.distribution == "poisson":
        release_policy_args = {
            "rate": args.variable_arrival_rate,
            "num_invocations": args.num_queries,
        }
    elif args.distribution == "gamma":
        release_policy_args = {
            "rate": args.variable_arrival_rate,
            "num_invocations": args.num_queries,
            "coefficient": args.coefficient,
        }
    elif args.distribution == "fixed_gamma":
        release_policy_args = {
            "variable_arrival_rate": args.variable_arrival_rate,
            "base_arrival_rate": args.base_arrival_rate,
            "num_invocations": args.num_queries,
            "coefficient": args.coefficient,
        }
    else:
        raise NotImplementedError(
            f"Release policy {args.distribution} not implemented."
        )

    release_policy = make_release_policy(
        args.distribution,
        release_policy_args,
        rng,
        args.rng_seed,
        (args.randomize_start_time_min, args.randomize_start_time_max),
    )

    release_times = release_policy.get_release_times(
        completion_time=EventTime(sys.maxsize, EventTime.Unit.US)
    )

    return release_times


def main():
    parser = argparse.ArgumentParser(
        description="Generate a workload of queries based on distribution type."
    )
    parser.add_argument(
        "--spark-mirror-path",
        type=Path,
        required=True,
        help="Path to the spark-mirror repository",
    )
    parser.add_argument(
        "--tpch-spark-path",
        type=Path,
        required=True,
        help="Path to the TPC-H Spark repository",
    )
    parser.add_argument(
        "--spark-eventlog-dir",
        default=Path(os.getcwd()) / "spark-eventlog",
        type=Path,
        help="Path to the directory in which Spark event logs will be dumped",
    )
    parser.add_argument(
        "--distribution",
        choices=["periodic", "fixed", "poisson", "gamma", "closed_loop", "fixed_gamma"],
        default="gamma",
        help="Type of distribution for query inter-arrival times (default: gamma)",
    )
    parser.add_argument(
        "--num_queries",
        type=int,
        default=50,
        help="Number of queries to generate (default: 50)",
    )
    parser.add_argument(
        "--dataset_size",
        choices=["50", "100", "250", "500"],
        default="50",
        help="Dataset size per query in GB (default: 50)",
    )
    parser.add_argument(
        "--max_cores",
        type=int,
        choices=[50, 75, 100, 200],
        default=50,
        help="Maximum executor cores (default: 50)",
    )
    parser.add_argument(
        "--period",
        type=int,
        default=25,
        help="Releases a DAG after the period has elapsed",
    )
    parser.add_argument(
        "--variable_arrival_rate",
        type=float,
        default=1.0,
        help="Variable arrival rate for the poisson and gamma distributions",
    )
    parser.add_argument(
        "--coefficient",
        type=float,
        default=1.0,
        help="Coefficient for the poisson and gamma distributions",
    )
    parser.add_argument(
        "--base_arrival_rate",
        type=float,
        default=1.0,
        help="Base arrival rate for the fixed_gamma distribution",
    )
    parser.add_argument("--randomize_start_time_min", type=int, default=0)
    parser.add_argument("--randomize_start_time_max", type=int, default=0)
    parser.add_argument(
        "--rng_seed",
        type=int,
        default=1234,
        help="RNG seed for generating inter-arrival periods and picking DAGs (default: 1234)",
    )
    parser.add_argument("--queries", type=int, nargs="+", help="Launch specific queries")

    args = parser.parse_args()

    if not args.spark_eventlog_dir.exists():
        args.spark_eventlog_dir.mkdir(parents=True)

    os.environ["TPCH_INPUT_DATA_DIR"] = str(args.tpch_spark_path.resolve() / "dbgen")

    if args.queries:
        assert len(args.queries) == args.num_queries

    rng = random.Random(args.rng_seed)

    # Generate release times.
    release_times = generate_release_times(rng, args)
    print("Release times:", release_times)

    # Convert absolute release times into inter-arrival gaps, then launch the
    # queries one by one, sleeping for each gap.
    inter_arrival_times = [release_times[0].time]
    for i in range(len(release_times) - 1):
        inter_arrival_times.append(release_times[i + 1].time - release_times[i].time)
    for i, inter_arrival_time in enumerate(inter_arrival_times):
        time.sleep(inter_arrival_time)
        if args.queries:
            query_number = args.queries[i]
        else:
            query_number = rng.randint(1, 22)
        launch_query(query_number, args)
        print(
            "Current time: ",
            time.strftime("%Y-%m-%d %H:%M:%S"),
            " launching query: ",
            query_number,
        )


if __name__ == "__main__":
    main()
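
As a usage sketch (the repository paths below are illustrative, not part of the commit; all flags come from the script's argparse definitions), a gamma-distributed run of 50 queries against the 50GB dataset could be launched along these lines:

    python rpc/launch_tpch_queries.py \
        --spark-mirror-path ~/src/spark-mirror \
        --tpch-spark-path ~/src/tpch-spark \
        --distribution gamma \
        --variable_arrival_rate 1.0 \
        --coefficient 1.0 \
        --num_queries 50 \
        --dataset_size 50 \
        --max_cores 50

Note that the Spark master URL and the per-query spark-submit configuration are hardcoded in launch_query, so every submission targets spark://130.207.125.81:7077 regardless of these flags.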
