@@ -2497,12 +2497,20 @@ def create_parallel_cluster_config(self):
2497
2497
number_of_queues = 0
2498
2498
number_of_compute_resources = 0
2499
2499
2500
+ disable_smt = self .config ['slurm' ]['ParallelClusterConfig' ]['DisableSimultaneousMultithreading' ]
2501
+
2500
2502
# Create 1 queue and compute resource for each instance type and purchase option.
2503
+ # The queue is named after the instance type.
2504
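+ # The compute resource (CR) is named after the amount of memory in GB and the core count; the core count is multiplied by the threads per core unless simultaneous multithreading is disabled.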
2501
2505
for purchase_option in purchase_options :
2502
2506
for instance_type in self .instance_types :
2503
2507
logger .debug (f"Creating queue for { purchase_option } { instance_type } " )
2504
2508
efa_supported = self .plugin .get_EfaSupported (self .cluster_region , instance_type ) and self .config ['slurm' ]['ParallelClusterConfig' ]['EnableEfa' ]
2505
2509
mem_gb = int (self .plugin .get_MemoryInMiB (self .cluster_region , instance_type ) / 1024 )
2510
+ core_count = int (self .plugin .get_CoreCount (self .cluster_region , instance_type ))
2511
+ threads_per_core = int (self .plugin .get_DefaultThreadsPerCore (self .cluster_region , instance_type ))
2512
+ if not disable_smt :
2513
+ core_count *= threads_per_core
2506
2514
if purchase_option == 'ONDEMAND' :
2507
2515
queue_name_prefix = "od"
2508
2516
allocation_strategy = 'lowest-price'
@@ -2524,6 +2532,7 @@ def create_parallel_cluster_config(self):
2524
2532
if number_of_queues >= MAX_NUMBER_OF_QUEUES :
2525
2533
logger .error (f"Can't create { queue_name } queue because MAX_NUMBER_OF_QUEUES=={ MAX_NUMBER_OF_QUEUES } and have { number_of_queues } queues." )
2526
2534
exit (1 )
2535
+ # ParallelCluster creates a NodeSet for each queue that contains all NodeNames in the queue.
2527
2536
nodeset = f"{ queue_name } _nodes"
2528
2537
if purchase_option_partition not in partition_nodesets :
2529
2538
partition_nodesets [purchase_option_partition ] = []
@@ -2532,12 +2541,20 @@ def create_parallel_cluster_config(self):
2532
2541
if mem_partition not in partition_nodesets :
2533
2542
partition_nodesets [mem_partition ] = []
2534
2543
partition_nodesets [mem_partition ].append (nodeset )
2544
+ mem_core_partition = f"{ queue_name_prefix } -{ mem_gb } -gb-{ core_count } -cores"
2545
+ if mem_core_partition not in partition_nodesets :
2546
+ partition_nodesets [mem_core_partition ] = []
2547
+ partition_nodesets [mem_core_partition ].append (nodeset )
2535
2548
parallel_cluster_queue = self .create_queue_config (queue_name , allocation_strategy , purchase_option )
2536
2549
number_of_queues += 1
2537
2550
2538
- compute_resource_name = f"{ queue_name_prefix } -{ instance_type } " .replace ('.' , '-' )
2539
- compute_resource_name = compute_resource_name .replace ('large' , 'l' )
2540
- compute_resource_name = compute_resource_name .replace ('medium' , 'm' )
2551
+ if True :
2552
+ # The compute resource (CR) name must begin with an alphabetic character; queue_name_prefix is prepended only to satisfy that requirement.
2553
+ compute_resource_name = f"{ queue_name_prefix } -{ mem_gb } -gb-{ core_count } -cores"
2554
+ else :
2555
+ compute_resource_name = f"{ queue_name_prefix } -{ instance_type } " .replace ('.' , '-' )
2556
+ compute_resource_name = compute_resource_name .replace ('large' , 'l' )
2557
+ compute_resource_name = compute_resource_name .replace ('medium' , 'm' )
2541
2558
if number_of_compute_resources >= MAX_NUMBER_OF_COMPUTE_RESOURCES :
2542
2559
logger .error (f"Can't create { compute_resource_name } compute resource because MAX_NUMBER_OF_COMPUTE_RESOURCES=={ MAX_NUMBER_OF_COMPUTE_RESOURCES } and have { number_of_compute_resources } compute resources" )
2543
2560
exit (1 )
0 commit comments