Skip to content

Commit ada1a31

Browse files
authored
Support ParallelCluster 3.11.1 (#270)
Require at least 4 GB of memory, or else the instance doesn't have enough memory to run reliably. Update Lambdas to Python 3.12 from 3.9. Fix bug in Xio configuration. Resolves #268. Fix bug in the ansible task that updates slurm.conf, which didn't correctly detect changes and restart slurmctld. Resolves #267.
1 parent 1ebf9a9 commit ada1a31

File tree

7 files changed

+49
-24
lines changed

7 files changed

+49
-24
lines changed

source/cdk/cdk_slurm_stack.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -903,9 +903,10 @@ def update_config_for_exostellar(self):
903903
if self.slurm_compute_node_sg_id:
904904
if self.slurm_compute_node_sg_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']:
905905
self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.slurm_compute_node_sg_id)
906-
if self.res_dcv_security_group_id:
907-
if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']:
908-
self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.res_dcv_security_group_id)
906+
if 'RESStackName' in self.config:
907+
if self.res_dcv_security_group_id:
908+
if self.res_dcv_security_group_id not in self.config['slurm']['Xio']['WorkerSecurityGroupIds']:
909+
self.config['slurm']['Xio']['WorkerSecurityGroupIds'].append(self.res_dcv_security_group_id)
909910

910911
# Get values from stack outputs
911912
ems_ip_address = None
@@ -1376,8 +1377,9 @@ def check_regions_config(self):
13761377
self.instance_types = sorted(self.instance_types)
13771378

13781379
# Filter the instance types by architecture due to PC limitation to 1 architecture
1379-
# Also require at least 2 GB of memory.
1380+
# Also require at least 4 GB of memory.
13801381
# Also filter by the CPU vendor from the config
1382+
MIN_COMPUTE_NODE_GB = 4
13811383
cluster_architecture = self.config['slurm']['ParallelClusterConfig']['Architecture']
13821384
logger.info(f"ParallelCluster Architecture: {cluster_architecture}")
13831385
filtered_instance_types = []
@@ -1387,7 +1389,7 @@ def check_regions_config(self):
13871389
logger.warning(f"Excluding {instance_type} because architecture ({instance_architecture}) != {cluster_architecture}")
13881390
continue
13891391
mem_gb = int(self.plugin.get_MemoryInMiB(self.cluster_region, instance_type) / 1024)
1390-
if mem_gb < 2:
1392+
if mem_gb < MIN_COMPUTE_NODE_GB:
13911393
logger.warning(f"Excluding {instance_type} because it has less than {MIN_COMPUTE_NODE_GB} GiB of memory.")
13921394
continue
13931395
cpu_vendor = self.plugin.get_cpu_vendor(self.cluster_region, instance_type)
@@ -1425,9 +1427,7 @@ def create_parallel_cluster_lambdas(self):
14251427
aws_lambda.Architecture.X86_64,
14261428
],
14271429
compatible_runtimes = [
1428-
aws_lambda.Runtime.PYTHON_3_9,
1429-
# aws_lambda.Runtime.PYTHON_3_10, # Doesn't work: No module named 'rpds.rpds'
1430-
# aws_lambda.Runtime.PYTHON_3_11, # Doesn't work: No module named 'rpds.rpds'
1430+
aws_lambda.Runtime.PYTHON_3_12,
14311431
],
14321432
)
14331433

@@ -1437,7 +1437,7 @@ def create_parallel_cluster_lambdas(self):
14371437
function_name=f"{self.stack_name}-CreateBuildFiles",
14381438
description="Create ParallelCluster build configuration files",
14391439
memory_size=2048,
1440-
runtime=aws_lambda.Runtime.PYTHON_3_9,
1440+
runtime=aws_lambda.Runtime.PYTHON_3_12,
14411441
architecture=aws_lambda.Architecture.X86_64,
14421442
timeout=Duration.minutes(2),
14431443
log_retention=logs.RetentionDays.INFINITE,
@@ -1499,7 +1499,7 @@ def create_parallel_cluster_lambdas(self):
14991499
function_name=f"{self.stack_name}-CreateParallelClusterConfig",
15001500
description="Create ParallelCluster config",
15011501
memory_size=2048,
1502-
runtime=aws_lambda.Runtime.PYTHON_3_9,
1502+
runtime=aws_lambda.Runtime.PYTHON_3_12,
15031503
architecture=aws_lambda.Architecture.X86_64,
15041504
timeout=Duration.minutes(15),
15051505
log_retention=logs.RetentionDays.INFINITE,
@@ -1547,7 +1547,7 @@ def create_parallel_cluster_lambdas(self):
15471547
function_name=f"{self.stack_name}-CreateParallelCluster",
15481548
description="Create ParallelCluster",
15491549
memory_size=2048,
1550-
runtime=aws_lambda.Runtime.PYTHON_3_9,
1550+
runtime=aws_lambda.Runtime.PYTHON_3_12,
15511551
architecture=aws_lambda.Architecture.X86_64,
15521552
timeout=Duration.minutes(15),
15531553
log_retention=logs.RetentionDays.INFINITE,
@@ -1846,7 +1846,7 @@ def create_parallel_cluster_lambdas(self):
18461846
function_name=f"{self.stack_name}-CreateHeadNodeARecord",
18471847
description="Create head node A record",
18481848
memory_size=2048,
1849-
runtime=aws_lambda.Runtime.PYTHON_3_9,
1849+
runtime=aws_lambda.Runtime.PYTHON_3_12,
18501850
architecture=aws_lambda.Architecture.X86_64,
18511851
timeout=Duration.minutes(15),
18521852
log_retention=logs.RetentionDays.INFINITE,
@@ -1893,7 +1893,7 @@ def create_parallel_cluster_lambdas(self):
18931893
function_name=f"{self.stack_name}-UpdateHeadNode",
18941894
description="Update head node",
18951895
memory_size=2048,
1896-
runtime=aws_lambda.Runtime.PYTHON_3_9,
1896+
runtime=aws_lambda.Runtime.PYTHON_3_12,
18971897
architecture=aws_lambda.Architecture.X86_64,
18981898
timeout=Duration.minutes(15),
18991899
log_retention=logs.RetentionDays.INFINITE,
@@ -1935,7 +1935,7 @@ def create_parallel_cluster_lambdas(self):
19351935
function_name=f"{self.stack_name}-ConfigUsersGroupsJson",
19361936
description="Configure users and groups json file",
19371937
memory_size=2048,
1938-
runtime=aws_lambda.Runtime.PYTHON_3_9,
1938+
runtime=aws_lambda.Runtime.PYTHON_3_12,
19391939
architecture=aws_lambda.Architecture.X86_64,
19401940
timeout=Duration.minutes(15),
19411941
log_retention=logs.RetentionDays.INFINITE,
@@ -1983,7 +1983,7 @@ def create_parallel_cluster_lambdas(self):
19831983
function_name=f"{self.stack_name}-ConfigExternalLoginNodes",
19841984
description="Configure external login nodes",
19851985
memory_size=2048,
1986-
runtime=aws_lambda.Runtime.PYTHON_3_9,
1986+
runtime=aws_lambda.Runtime.PYTHON_3_12,
19871987
architecture=aws_lambda.Architecture.X86_64,
19881988
timeout=Duration.minutes(15),
19891989
log_retention=logs.RetentionDays.INFINITE,
@@ -2030,7 +2030,7 @@ def create_parallel_cluster_lambdas(self):
20302030
function_name=f"{self.stack_name}-DeconfigUsersGroupsJson",
20312031
description="Deconfigure RES users and groups json file",
20322032
memory_size=2048,
2033-
runtime=aws_lambda.Runtime.PYTHON_3_9,
2033+
runtime=aws_lambda.Runtime.PYTHON_3_12,
20342034
architecture=aws_lambda.Architecture.X86_64,
20352035
timeout=Duration.minutes(15),
20362036
log_retention=logs.RetentionDays.INFINITE,
@@ -2072,7 +2072,7 @@ def create_parallel_cluster_lambdas(self):
20722072
function_name=f"{self.stack_name}-DeconfigExternalLoginNodes",
20732073
description="Deconfigure external login nodes",
20742074
memory_size=2048,
2075-
runtime=aws_lambda.Runtime.PYTHON_3_9,
2075+
runtime=aws_lambda.Runtime.PYTHON_3_12,
20762076
architecture=aws_lambda.Architecture.X86_64,
20772077
timeout=Duration.minutes(15),
20782078
log_retention=logs.RetentionDays.INFINITE,
@@ -2114,7 +2114,7 @@ def create_callSlurmRestApiLambda(self):
21142114
function_name=f"{self.stack_name}-CallSlurmRestApiLambda",
21152115
description="Example showing how to call Slurm REST API",
21162116
memory_size=128,
2117-
runtime=aws_lambda.Runtime.PYTHON_3_9,
2117+
runtime=aws_lambda.Runtime.PYTHON_3_12,
21182118
architecture=aws_lambda.Architecture.ARM_64,
21192119
timeout=Duration.minutes(1),
21202120
log_retention=logs.RetentionDays.INFINITE,

source/cdk/config_schema.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@
9090
# 3.11.0:
9191
# * Add support for ap-southeast-3
9292
# * login node enhancements
93+
# 3.11.1:
94+
# * Disable Pyxis Spack plugin by default
95+
# * Upgrade Python runtime to 3.12
96+
# * Upgrade libjwt to version 1.17.0.
9397
MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0')
9498
# Update source/resources/default_config.yml with latest version when this is updated.
9599
PARALLEL_CLUSTER_VERSIONS = [
@@ -106,14 +110,17 @@
106110
'3.10.0',
107111
'3.10.1',
108112
'3.11.0',
113+
'3.11.1',
109114
]
110115
PARALLEL_CLUSTER_ENROOT_VERSIONS = {
111116
# This can be found on the head node by running 'yum info enroot'
112117
'3.11.0': '3.4.1', # confirmed
118+
'3.11.1': '3.4.1', # confirmed
113119
}
114120
PARALLEL_CLUSTER_PYXIS_VERSIONS = {
115121
# This can be found on the head node at /opt/parallelcluster/sources
116122
'3.11.0': '0.20.0', # confirmed
123+
'3.11.1': '0.20.0', # confirmed
117124
}
118125
PARALLEL_CLUSTER_MUNGE_VERSIONS = {
119126
# This can be found on the head node at /opt/parallelcluster/sources
@@ -131,6 +138,7 @@
131138
'3.10.0': '0.5.16', # confirmed
132139
'3.10.1': '0.5.16', # confirmed
133140
'3.11.0': '0.5.16', # confirmed
141+
'3.11.1': '0.5.16', # confirmed
134142
}
135143
PARALLEL_CLUSTER_PYTHON_VERSIONS = {
136144
# This can be found on the head node at /opt/parallelcluster/pyenv/versions
@@ -147,6 +155,7 @@
147155
'3.10.0': '3.9.19', # confirmed
148156
'3.10.1': '3.9.19', # confirmed
149157
'3.11.0': '3.9.20', # confirmed
158+
'3.11.1': '3.9.20', # confirmed
150159
}
151160
PARALLEL_CLUSTER_SLURM_VERSIONS = {
152161
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
@@ -163,6 +172,7 @@
163172
'3.10.0': '23.11.7', # confirmed
164173
'3.10.1': '23.11.7', # confirmed
165174
'3.11.0': '23.11.10', # confirmed
175+
'3.11.1': '23.11.10', # confirmed
166176
}
167177
PARALLEL_CLUSTER_PC_SLURM_VERSIONS = {
168178
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
@@ -179,6 +189,7 @@
179189
'3.10.0': '23-11-7-1', # confirmed
180190
'3.10.1': '23-11-7-1', # confirmed
181191
'3.11.0': '23-11-10-1', # confirmed
192+
'3.11.1': '23-11-10-1', # confirmed
182193
}
183194
SLURM_REST_API_VERSIONS = {
184195
'23-02-2-1': '0.0.39',

source/resources/lambdas/CreateBuildFiles/CreateBuildFiles.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,10 @@ def lambda_handler(event, context):
139139
else:
140140
raise KeyError(error_message)
141141

142+
if requestType == 'Delete':
143+
cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physicalResourceId=cluster_name)
144+
return
145+
142146
ami_builds = json.loads(environ['AmiBuildsJson'])
143147
assets_bucket = environ['AssetsBucket']
144148
assets_base_key = environ['AssetsBaseKey']

source/resources/parallel-cluster/config/bin/on_compute_node_configured.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ fi
6363
export PATH=/usr/sbin:$PATH
6464

6565
echo "Creating users and groups"
66-
$config_bin_dir/create_users_groups.py -i $config_dir/users_groups.json
66+
if [[ -e $config_dir/users_groups.json ]]; then
67+
$config_bin_dir/create_users_groups.py -i $config_dir/users_groups.json
68+
fi
6769

6870
# ansible_compute_node_vars_yml_s3_url="s3://$assets_bucket/$assets_base_key/config/ansible/ansible_compute_node_vars.yml"
6971

source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/config-submitter-access.yml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,14 @@
3535
cmd: |
3636
set -ex
3737
38-
conf_files=$(find /opt/slurm -name '*.conf')
38+
conf_files=$(find /opt/slurm/etc -name '*.conf')
3939
backup_suffix=".$(date '+%Y-%m-%dT%H:%M:%S')~"
4040
num_changed=0
4141
for conf_file in ${conf_files[*]}; do
42-
sed --in-place=$backup_suffix 's%/opt/slurm/etc%/opt/slurm/{{ cluster_name }}/etc%' $conf_file
43-
sed --in-place=$backup_suffix 's%/opt/slurm/lib%/opt/slurm/{{ cluster_name }}/lib%' $conf_file
42+
sed --in-place=$backup_suffix \
43+
-e 's%/opt/slurm/etc%/opt/slurm/{{ cluster_name }}/etc%' \
44+
-e 's%/opt/slurm/lib%/opt/slurm/{{ cluster_name }}/lib%' \
45+
$conf_file
4446
4547
backup_conf_file="${conf_file}${backup_suffix}"
4648
if diff -q $backup_conf_file $conf_file; then
@@ -56,6 +58,12 @@
5658
else
5759
echo "No conf files changed."
5860
fi
61+
register: change_slurm_conf_result
62+
63+
- name: Show change_slurm_conf_result
64+
debug:
65+
msg: |
66+
{{ change_slurm_conf_result }}
5967
6068
- name: Fix permissions on config dir so users can access it to get the modulefiles
6169
file:

source/resources/playbooks/roles/exostellar_infrastructure_optimizer/files/opt/slurm/etc/exostellar/configure_xio.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
import json
2424
import logging
2525
import logging.handlers
26-
import os
27-
import pycurl
2826
import requests
2927
import yaml
3028

source/resources/playbooks/roles/exostellar_infrastructure_optimizer/tasks/main.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@
143143
cmd: |
144144
set -ex
145145
146+
yum -y install python3.11-pip
147+
python3.11 -m pip install requests PyYaml
146148
{{ exostellar_dir }}/configure_xio.py
147149
148150
- name: Create {{ exostellar_dir }}/xspot.slurm.conf

0 commit comments

Comments
 (0)