Skip to content

Commit bb2676f

Browse files
committed
updated get_running_jobs and build_job_submit for slurm
1 parent 32373ab commit bb2676f

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

hpc_helper/_hpc_helper.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ def get_running_jobs_torque(job_pattern: str, target_system: Optional[TARGET_SYS
116116
list of job names that are currently running
117117
118118
"""
119-
# job_pattern = (VP_\w+)
120119
qstat = _check_command_for_target_system("qstat", target_system)
121120
out = subprocess.check_output(qstat).decode("utf-8")
122121
return re.findall(rf"\S* {job_pattern}\s*\w+\s*\S*\s*R", out)
@@ -136,10 +135,8 @@ def get_running_jobs_slurm(job_pattern: str):
136135
list of job names that are currently running
137136
138137
"""
139-
# job_pattern = (VP_\w+)
140-
# out = subprocess.check_output("squeue").decode("utf-8")
141-
# return re.findall(rf"\S* {job_pattern}\s*\w+\s*\S*\s*R", out)
142-
raise NotImplementedError("Not implemented yet!")
138+
out = subprocess.check_output("squeue").decode("utf-8")
139+
return re.findall(rf"\d+\s*\w+\s*{job_pattern}\s*\w+\s*R\S*", out)
143140

144141

145142
def build_job_submit_torque(
@@ -200,13 +197,19 @@ def build_job_submit_torque(
200197
return qsub_command
201198

202199

200+
def _check_partition_slurm(partition: str, gres: str):
201+
if partition in ("v100", "a100"):
202+
assert partition in gres
203+
204+
203205
def build_job_submit_slurm(
204206
job_name: str,
205207
script_name: str,
206208
target_system: Optional[TARGET_SYSTEM] = "woody",
207209
nodes: Optional[int] = 1,
208210
tasks_per_node: Optional[int] = 4,
209211
gres: Optional[str] = "gpu:1",
212+
partition: Optional[str] = None,
210213
walltime: Optional[str] = "24:00:00",
211214
mail_type: Optional[Literal["BEGIN", "END", "FAIL", "ALL"]] = "ALL",
212215
args: Optional[Sequence[str]] = None,
@@ -231,6 +234,8 @@ def build_job_submit_slurm(
231234
gres : str, optional
232235
configuration of requested GPUs (for tinygpu)
233236
Default: "gpu:1" (for tinygpu)
237+
partition : str, optional
238+
partition for tinygpu when specific nodes (e.g., A100 or V100) are requested.
234239
walltime : str, optional
235240
required wall clock time (runtime) in the format ``HH:MM:SS``.
236241
Default: "24:00:00" (24 hours)
@@ -251,8 +256,13 @@ def build_job_submit_slurm(
251256
252257
"""
253258
sbatch = _check_command_for_target_system("sbatch", target_system=target_system)
254-
sbatch_command = f"{sbatch} --job-name {job_name} --nodes={nodes} --ntasks-per-node={tasks_per_node} "
259+
sbatch_command = f"{sbatch} --job-name {job_name} "
260+
if target_system != "tinygpu":
261+
sbatch_command += f"--nodes={nodes} --ntasks-per-node={tasks_per_node} "
255262
if target_system == "tinygpu":
263+
if partition is not None:
264+
_check_partition_slurm(partition, gres)
265+
sbatch_command += f"--partition={partition} "
256266
sbatch_command += f"--gres={gres} "
257267
sbatch_command += f"--time={walltime} --mail-type={mail_type} {script_name} "
258268

@@ -300,16 +310,22 @@ def _add_arguments_torque(command_str: str, args: Optional[Sequence[str]] = None
300310

301311
def _add_arguments_slurm(command_str: str, args: Optional[Sequence[str]] = None, **kwargs) -> str:
302312
if len(kwargs) != 0 or args is not None:
313+
command_str += "--export="
303314
if args is not None:
304315
command_str += 'PARAMS="'
305316
for arg in args:
306317
command_str += f"{arg} "
307-
command_str = command_str.strip()
308-
command_str += '" '
318+
command_str = command_str.strip() + '"'
319+
if len(kwargs) != 0:
320+
command_str += ","
309321
if len(kwargs) != 0:
310322
for key, value in kwargs.items():
311-
command_str += f"{key}={value},"
312-
# remove the last comma
313-
command_str = command_str[:-1]
323+
command_str += f'{key}="{value}",'
324+
# remove the last comma and add the quote again
325+
command_str = command_str[:-2] + '"'
314326

315327
return command_str.strip()
328+
329+
330+
if __name__ == "__main__":
331+
print(build_job_submit_slurm("VP_01", "jobscript.sh", "tinygpu", BASE_PATH="hello", SUBJECT_ID="VP_01"))

tests/test_hpc_helper.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -164,29 +164,28 @@ def test_build_job_submit_slurm_raises(self, target_system, expected):
164164
"tinygpu",
165165
None,
166166
{},
167-
"sbatch.tinygpu --job-name Test_Job --nodes=1 --ntasks-per-node=4 --gres=gpu:1 "
168-
f"--time=24:00:00 --mail-type=ALL jobscript.sh",
167+
"sbatch.tinygpu --job-name Test_Job --gres=gpu:1 --time=24:00:00 --mail-type=ALL jobscript.sh",
169168
),
170169
(
171170
"tinygpu",
172171
["path1", "path2"],
173172
{},
174-
"sbatch.tinygpu --job-name Test_Job --nodes=1 --ntasks-per-node=4 --gres=gpu:1 "
175-
f'--time=24:00:00 --mail-type=ALL jobscript.sh PARAMS="path1 path2"',
173+
"sbatch.tinygpu --job-name Test_Job --gres=gpu:1 "
174+
f'--time=24:00:00 --mail-type=ALL jobscript.sh --export=PARAMS="path1 path2"',
176175
),
177176
(
178177
"tinygpu",
179178
["path1", ""],
180179
{"SUBJECT_DIR": "path3"},
181-
"sbatch.tinygpu --job-name Test_Job --nodes=1 --ntasks-per-node=4 --gres=gpu:1 "
182-
f'--time=24:00:00 --mail-type=ALL jobscript.sh PARAMS="path1" SUBJECT_DIR=path3',
180+
"sbatch.tinygpu --job-name Test_Job --gres=gpu:1 "
181+
f'--time=24:00:00 --mail-type=ALL jobscript.sh --export=PARAMS="path1",SUBJECT_DIR="path3"',
183182
),
184183
(
185184
"tinygpu",
186185
None,
187186
{"SUBJECT_DIR": "path3"},
188-
"sbatch.tinygpu --job-name Test_Job --nodes=1 --ntasks-per-node=4 --gres=gpu:1 "
189-
f"--time=24:00:00 --mail-type=ALL jobscript.sh SUBJECT_DIR=path3",
187+
"sbatch.tinygpu --job-name Test_Job --gres=gpu:1 "
188+
f'--time=24:00:00 --mail-type=ALL jobscript.sh --export=SUBJECT_DIR="path3"',
190189
),
191190
],
192191
)

0 commit comments

Comments
 (0)