@@ -60,6 +60,7 @@ Metadata:
60
60
- BuildkiteAgentScalerServerlessARN
61
61
- BuildkiteAgentScalerVersion
62
62
- LogRetentionDays
63
+ - BuildkiteAgentEnableGracefulShutdown
63
64
64
65
- Label :
65
66
default : Network Configuration
@@ -237,6 +238,14 @@ Parameters:
237
238
Description : The number of days to retain the Cloudwatch Logs of the lambda.
238
239
Default : "1"
239
240
241
+ BuildkiteAgentEnableGracefulShutdown :
242
+ Description : Set to true to enable graceful shutdown of agents when the ASG is updated with replacement. This allows ASGs to be removed in a timely manner during an in-place update of the elastic stack, and allows remaining agents to finish jobs without interruptions.
243
+ Type : String
244
+ AllowedValues :
245
+ - "true"
246
+ - "false"
247
+ Default : "false"
248
+
240
249
BuildkiteAgentTracingBackend :
241
250
Description : The tracing backend to use for CI tracing. See https://buildkite.com/docs/agent/v3/tracing
242
251
Type : String
@@ -937,6 +946,9 @@ Conditions:
937
946
UseCostAllocationTags :
938
947
!Equals [ !Ref EnableCostAllocationTags, "true" ]
939
948
949
+ EnableBuildkiteAgentGracefulShutdown :
950
+ !Equals [ !Ref BuildkiteAgentEnableGracefulShutdown, "true" ]
951
+
940
952
UsePipelineSigningKMSKey :
941
953
!Not [ !Equals [ !Ref PipelineSigningKMSKeyId, "" ] ]
942
954
@@ -1858,6 +1870,147 @@ Resources:
1858
1870
ServiceToken : !GetAtt AzRebalancingSuspenderFunction.Arn
1859
1871
AutoScalingGroupName : !Ref AgentAutoScaleGroup
1860
1872
1873
+ StopBuildkiteAgentsRole :
1874
+ Type : AWS::IAM::Role
1875
+ Condition : EnableBuildkiteAgentGracefulShutdown
1876
+ Properties :
1877
+ PermissionsBoundary :
1878
+ !If [
1879
+ SetInstanceRolePermissionsBoundaryARN,
1880
+ !Ref InstanceRolePermissionsBoundaryARN,
1881
+ !Ref "AWS::NoValue",
1882
+ ]
1883
+ AssumeRolePolicyDocument :
1884
+ Version : 2012-10-17
1885
+ Statement :
1886
+ - Effect : Allow
1887
+ Principal :
1888
+ Service :
1889
+ - lambda.amazonaws.com
1890
+ Action :
1891
+ - sts:AssumeRole
1892
+ ManagedPolicyArns :
1893
+ - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
1894
+ Policies :
1895
+ - PolicyName : DescribeASGs
1896
+ PolicyDocument :
1897
+ Version : 2012-10-17
1898
+ Statement :
1899
+ - Effect : Allow
1900
+ Action :
1901
+ - "autoscaling:DescribeAutoScalingGroups"
1902
+ Resource : "*"
1903
+ - PolicyName : ModifyASGs
1904
+ PolicyDocument :
1905
+ Version : 2012-10-17
1906
+ Statement :
1907
+ - Effect : Allow
1908
+ Action :
1909
+ - "autoscaling:UpdateAutoScalingGroup"
1910
+ Resource : !Sub arn:${AWS::Partition}:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${AWS::StackName}-AgentAutoScaleGroup-*
1911
+ - PolicyName : RunStopBuildkiteDocument
1912
+ PolicyDocument :
1913
+ Version : 2012-10-17
1914
+ Statement :
1915
+ - Effect : Allow
1916
+ Action :
1917
+ - "ssm:SendCommand"
1918
+ Resource :
1919
+ - !Sub arn:${AWS::Partition}:ssm:${AWS::Region}::document/AWS-RunShellScript
1920
+ - PolicyName : StopBuildkiteInstances
1921
+ PolicyDocument :
1922
+ Version : 2012-10-17
1923
+ Statement :
1924
+ - Effect : Allow
1925
+ Action :
1926
+ - "ssm:SendCommand"
1927
+ Resource :
1928
+ - !Sub arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/*
1929
+ Condition :
1930
+ StringEquals :
1931
+ "aws:resourceTag/aws:cloudformation:logical-id" : "AgentAutoScaleGroup"
1932
+
1933
+ StopBuildkiteAgentsFunction :
1934
+ Type : AWS::Lambda::Function
1935
+ Condition : EnableBuildkiteAgentGracefulShutdown
1936
+ Properties :
1937
+ Description : "Gracefully stops all Buildkite agents in a given Auto Scaling group."
1938
+ Code :
1939
+ ZipFile : |
1940
+ import boto3
1941
+ import logging
1942
+ import cfnresponse
1943
+
1944
+ logger = logging.getLogger()
1945
+ logger.setLevel(logging.INFO)
1946
+
1947
+ autoscaling_client = boto3.client("autoscaling")
1948
+ ssm_client = boto3.client("ssm")
1949
+
1950
def handler(event, context):
    """Entry point for the CloudFormation custom resource.

    On an ``Update`` request whose ``AutoScalingGroupName`` property has
    changed, the Auto Scaling group named in ``OldResourceProperties`` is
    scaled to zero and its Buildkite agents are sent a stop signal. All
    other requests (``Create``, ``Delete``, or an ``Update`` that keeps the
    same group name) are acknowledged without touching any group.

    Args:
        event: The custom resource request; ``RequestType`` is always read,
            and ``OldResourceProperties`` / ``ResourceProperties`` are read
            for ``Update`` requests.
        context: The Lambda context object, passed through to
            ``cfnresponse.send``.

    A response (SUCCESS or FAILED) is always sent back through
    ``cfnresponse`` so that the stack operation is not left waiting.
    """
    logger.info(f"Received event: {event}")

    physical_id = "CustomResourcePhysicalID"

    # Create and Delete events need no work; just acknowledge them.
    if event["RequestType"] != "Update":
        cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physical_id)
        return

    try:
        old_group_name = event["OldResourceProperties"]["AutoScalingGroupName"]
        new_group_name = event.get("ResourceProperties", {}).get("AutoScalingGroupName")

        if old_group_name == new_group_name:
            # The group was not replaced, so the "old" name refers to the
            # live group. Scaling it to zero would take down current agents.
            logger.info(
                f"Auto Scaling group {old_group_name} was not replaced; nothing to stop"
            )
        else:
            # Scale ASG down to zero, to allow Buildkite agents to terminate
            force_instance_termination(old_group_name)

            # Stop all Buildkite agents in the old ASG
            stop_bk_agents(old_group_name)
    except Exception as e:
        # Top-level boundary: report the failure to CloudFormation instead of
        # letting the invocation die without a response.
        logger.exception(f"Error: {str(e)}")
        cfnresponse.send(event, context, cfnresponse.FAILED, {"Error": str(e)}, physical_id)
    else:
        # Send success response to CloudFormation
        cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physical_id)
1973
+
1974
def force_instance_termination(autoscaling_group_name):
    """Set the minimum size and desired capacity of the given Auto Scaling group to zero.

    Args:
        autoscaling_group_name: Name of the Auto Scaling group to update.
    """
    logger.info(f"Setting the desired capacity of {autoscaling_group_name} to zero")

    # MinSize is lowered together with DesiredCapacity in the same update call.
    zero_capacity = {"MinSize": 0, "DesiredCapacity": 0}
    autoscaling_client.update_auto_scaling_group(
        AutoScalingGroupName=autoscaling_group_name,
        **zero_capacity,
    )
1982
+
1983
def stop_bk_agents(autoscaling_group_name):
    """Send SIGTERM to the buildkite-agent processes on the group's instances.

    Runs the ``AWS-RunShellScript`` SSM document against every instance
    tagged with ``aws:autoscaling:groupName`` equal to the given name.

    Args:
        autoscaling_group_name: Name of the Auto Scaling group whose
            instances should receive the command.
    """
    # Everything before the "-AgentAutoScaleGroup" marker; used only for
    # log and comment text (presumably the stack name - confirm naming).
    stack_name, _, _ = autoscaling_group_name.partition("-AgentAutoScaleGroup")
    message = f"Stopping BK agents in {stack_name}"

    logger.info(message)

    instance_targets = [
        {
            "Key": "tag:aws:autoscaling:groupName",
            "Values": [autoscaling_group_name],
        },
    ]
    shell_commands = ["sudo kill -s SIGTERM $(/bin/pidof buildkite-agent)"]

    response = ssm_client.send_command(
        Targets=instance_targets,
        DocumentName="AWS-RunShellScript",
        Comment=message,
        Parameters={"commands": shell_commands},
    )
    logger.info(f"SSM command response: {response}")
2002
+ Handler : index.handler
2003
+ Role : !GetAtt StopBuildkiteAgentsRole.Arn
2004
+ Runtime : "python3.12"
2005
+
2006
+ StopBuildkiteAgents :
2007
+ Type : AWS::CloudFormation::CustomResource
2008
+ Condition : EnableBuildkiteAgentGracefulShutdown
2009
+ Version : 1.0
2010
+ Properties :
2011
+ ServiceToken : !GetAtt StopBuildkiteAgentsFunction.Arn
2012
+ AutoScalingGroupName : !Ref AgentAutoScaleGroup
2013
+
1861
2014
SecurityGroup :
1862
2015
Type : AWS::EC2::SecurityGroup
1863
2016
Condition : CreateSecurityGroup
0 commit comments