Skip to content

Commit 694e464

Browse files
authored
Name compute resources with the amount of memory and number of cores (#265)
Instead of giving the queue and the compute resource (CR) the same name (both derived from the instance type), keep naming the queue after the instance type but name the CR after the instance type's memory size and core count, to give cluster users more information about the compute nodes. Many users cannot tell how much memory and how many cores are available from the instance type name alone. Resolves #264
1 parent 5da1512 commit 694e464

File tree

1 file changed

+20
-3
lines changed

1 file changed

+20
-3
lines changed

source/cdk/cdk_slurm_stack.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2497,12 +2497,20 @@ def create_parallel_cluster_config(self):
24972497
number_of_queues = 0
24982498
number_of_compute_resources = 0
24992499

2500+
disable_smt = self.config['slurm']['ParallelClusterConfig']['DisableSimultaneousMultithreading']
2501+
25002502
# Create 1 queue and compute resource for each instance type and purchase option.
2503+
# The queue is named after the instance type.
2504+
# The CR is named after the amount of memory and number of cores.
25012505
for purchase_option in purchase_options:
25022506
for instance_type in self.instance_types:
25032507
logger.debug(f"Creating queue for {purchase_option} {instance_type}")
25042508
efa_supported = self.plugin.get_EfaSupported(self.cluster_region, instance_type) and self.config['slurm']['ParallelClusterConfig']['EnableEfa']
25052509
mem_gb = int(self.plugin.get_MemoryInMiB(self.cluster_region, instance_type) / 1024)
2510+
core_count = int(self.plugin.get_CoreCount(self.cluster_region, instance_type))
2511+
threads_per_core = int(self.plugin.get_DefaultThreadsPerCore(self.cluster_region, instance_type))
2512+
if not disable_smt:
2513+
core_count *= threads_per_core
25062514
if purchase_option == 'ONDEMAND':
25072515
queue_name_prefix = "od"
25082516
allocation_strategy = 'lowest-price'
@@ -2524,6 +2532,7 @@ def create_parallel_cluster_config(self):
25242532
if number_of_queues >= MAX_NUMBER_OF_QUEUES:
25252533
logger.error(f"Can't create {queue_name} queue because MAX_NUMBER_OF_QUEUES=={MAX_NUMBER_OF_QUEUES} and have {number_of_queues} queues.")
25262534
exit(1)
2535+
# ParallelCluster creates a NodeSet for each queue that contains all NodeNames in the queue.
25272536
nodeset = f"{queue_name}_nodes"
25282537
if purchase_option_partition not in partition_nodesets:
25292538
partition_nodesets[purchase_option_partition] = []
@@ -2532,12 +2541,20 @@ def create_parallel_cluster_config(self):
25322541
if mem_partition not in partition_nodesets:
25332542
partition_nodesets[mem_partition] = []
25342543
partition_nodesets[mem_partition].append(nodeset)
2544+
mem_core_partition = f"{queue_name_prefix}-{mem_gb}-gb-{core_count}-cores"
2545+
if mem_core_partition not in partition_nodesets:
2546+
partition_nodesets[mem_core_partition] = []
2547+
partition_nodesets[mem_core_partition].append(nodeset)
25352548
parallel_cluster_queue = self.create_queue_config(queue_name, allocation_strategy, purchase_option)
25362549
number_of_queues += 1
25372550

2538-
compute_resource_name = f"{queue_name_prefix}-{instance_type}".replace('.', '-')
2539-
compute_resource_name = compute_resource_name.replace('large', 'l')
2540-
compute_resource_name = compute_resource_name.replace('medium', 'm')
2551+
if True:
2552+
# CR must begin with an alpha character, otherwise don't need the queue_name_prefix
2553+
compute_resource_name = f"{queue_name_prefix}-{mem_gb}-gb-{core_count}-cores"
2554+
else:
2555+
compute_resource_name = f"{queue_name_prefix}-{instance_type}".replace('.', '-')
2556+
compute_resource_name = compute_resource_name.replace('large', 'l')
2557+
compute_resource_name = compute_resource_name.replace('medium', 'm')
25412558
if number_of_compute_resources >= MAX_NUMBER_OF_COMPUTE_RESOURCES:
25422559
logger.error(f"Can't create {compute_resource_name} compute resource because MAX_NUMBER_OF_COMPUTE_RESOURCES=={MAX_NUMBER_OF_COMPUTE_RESOURCES} and have {number_of_compute_resources} compute resources")
25432560
exit(1)

0 commit comments

Comments
 (0)