Skip to content

Commit 36434ce

Browse files
committed
fix: retry for updating capacity provider service
1 parent a7ba8ef commit 36434ce

File tree

1 file changed

+61
-31
lines changed

1 file changed

+61
-31
lines changed

src/emd/cfn/shared/ecs_cluster.yaml

Lines changed: 61 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -164,40 +164,70 @@ Resources:
164164
ecs_client = boto3.client('ecs')
165165
cluster_name = os.environ['ECS_CLUSTER_NAME']
166166
capacity_provider_name = event['ResourceProperties']['CapacityProvider']
167+
168+
def try_update_with_retry(operation=None, max_attempts=2, delay_seconds=30):
    """Run a cluster update, retrying while ECS reports an update in progress.

    Args:
        operation: Zero-argument callable to execute. When omitted,
            ``_do_update`` is used, so existing ``try_update_with_retry()``
            calls behave as before.
        max_attempts: Total number of calls allowed, the first one included.
            The default of 2 keeps the "try, wait, try once more" behavior.
        delay_seconds: Pause between attempts; the default stays at 30.

    Returns:
        Whatever ``operation`` returns on its first successful call.

    Raises:
        Exception: Any error whose service error code is not
            ``UpdateInProgressException`` is re-raised immediately; the busy
            error itself is re-raised once ``max_attempts`` calls have failed.
    """
    import time

    if operation is None:
        operation = _do_update

    attempt = 1
    while True:
        try:
            return operation()
        except Exception as e:
            # botocore's ClientError carries the parsed service error on
            # `.response`; compare the error code instead of searching str(e),
            # which would also match unrelated errors that merely mention it.
            response = getattr(e, 'response', None)
            details = response.get('Error') if isinstance(response, dict) else None
            code = details.get('Code') if isinstance(details, dict) else None
            if code != 'UpdateInProgressException' or attempt >= max_attempts:
                raise  # Not the busy error, or no attempts left
            print("Cluster busy, waiting", delay_seconds, "seconds before retry...")
            time.sleep(delay_seconds)
            attempt += 1
180+
181+
def _do_update():
    """Associate the capacity provider with the cluster and make it the default.

    Looks up the cluster named by ``cluster_name``, adds
    ``capacity_provider_name`` to its capacity providers when it is not
    already listed, and submits the full list together with a default
    strategy that contains only ``capacity_provider_name``.

    Returns:
        The response of ``ecs_client.put_cluster_capacity_providers``.

    Raises:
        RuntimeError: If ``describe_clusters`` returns no cluster, so the
            failure reason names the cluster instead of surfacing as
            "list index out of range".
    """
    description = ecs_client.describe_clusters(clusters=[cluster_name])
    clusters = description.get('clusters', [])
    if not clusters:
        raise RuntimeError(
            "ECS cluster %r not found: %s"
            % (cluster_name, description.get('failures', []))
        )

    # Work on a copy so the describe_clusters response is left untouched.
    providers = list(clusters[0].get('capacityProviders', []))
    if capacity_provider_name not in providers:
        providers.append(capacity_provider_name)

    # Every provider that should stay associated is passed back, because
    # this call submits the complete provider list for the cluster.
    return ecs_client.put_cluster_capacity_providers(
        cluster=cluster_name,
        capacityProviders=providers,
        defaultCapacityProviderStrategy=[
            {
                'capacityProvider': capacity_provider_name,
                'weight': 1,
                'base': 0,
            }
        ],
    )
199+
167200
try:
168201
if event['RequestType'] in ['Create', 'Update']:
169-
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
170-
current_capacity_providers = cluster_info.get('capacityProviders', [])
171-
172-
if capacity_provider_name not in current_capacity_providers:
173-
current_capacity_providers.append(capacity_provider_name)
174-
175-
ecs_client.put_cluster_capacity_providers(
176-
cluster=cluster_name,
177-
capacityProviders=current_capacity_providers,
178-
defaultCapacityProviderStrategy=[
179-
{
180-
'capacityProvider': capacity_provider_name,
181-
'weight': 1,
182-
'base': 0
183-
}
184-
]
185-
)
202+
try_update_with_retry()
186203
elif event['RequestType'] == 'Delete':
187-
# Retrieve current capacity providers
188-
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
189-
current_capacity_providers = cluster_info.get('capacityProviders', [])
190-
191-
# Remove only the specific capacity provider
192-
updated_capacity_providers = [
193-
cp for cp in current_capacity_providers if cp != capacity_provider_name
194-
]
195-
196-
ecs_client.put_cluster_capacity_providers(
197-
cluster=cluster_name,
198-
capacityProviders=updated_capacity_providers,
199-
defaultCapacityProviderStrategy=[]
200-
)
204+
def _do_delete():
    """Disassociate only ``capacity_provider_name`` from the cluster.

    Reads the cluster's current capacity providers, drops
    ``capacity_provider_name`` from the list, and submits the remaining
    providers with an empty default strategy.

    Returns:
        The response of ``ecs_client.put_cluster_capacity_providers``, or
        ``None`` when the cluster no longer exists.
    """
    description = ecs_client.describe_clusters(clusters=[cluster_name])
    clusters = description.get('clusters', [])
    if not clusters:
        # The cluster is already gone, so there is nothing left to detach;
        # failing here would only block the Delete request.
        print("Cluster not found during delete, nothing to disassociate")
        return None

    # Remove only the specific capacity provider
    remaining_providers = [
        provider
        for provider in clusters[0].get('capacityProviders', [])
        if provider != capacity_provider_name
    ]

    # NOTE(review): the default strategy is submitted empty even when other
    # providers remain associated - confirm this is intended.
    return ecs_client.put_cluster_capacity_providers(
        cluster=cluster_name,
        capacityProviders=remaining_providers,
        defaultCapacityProviderStrategy=[],
    )
219+
220+
# Retry the delete once when ECS reports that a cluster update is in progress.
try:
    _do_delete()
except Exception as e:
    # Decide from the service error code carried on botocore's ClientError
    # `.response` rather than from a substring of the message, so unrelated
    # errors that merely mention the name are not retried.
    response = getattr(e, 'response', None)
    details = response.get('Error') if isinstance(response, dict) else None
    code = details.get('Code') if isinstance(details, dict) else None
    if code != 'UpdateInProgressException':
        raise
    print("Cluster busy during delete, waiting 30 seconds before retry...")
    import time
    time.sleep(30)
    _do_delete()  # Try once more
201231
cfnresponse.send(event, context, cfnresponse.SUCCESS, {})
202232
except Exception as e:
203233
cfnresponse.send(event, context, cfnresponse.FAILED, {'Error': str(e)})

0 commit comments

Comments
 (0)