Skip to content

Commit 2bb42f4

Browse files
authored
Merge pull request #1476 from n-tucker/nt/gracefully-terminate-bk-agents
2 parents 0071d9a + b7727ae commit 2bb42f4

File tree

1 file changed

+153
-0
lines changed

1 file changed

+153
-0
lines changed

templates/aws-stack.yml

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ Metadata:
6060
- BuildkiteAgentScalerServerlessARN
6161
- BuildkiteAgentScalerVersion
6262
- LogRetentionDays
63+
- BuildkiteAgentEnableGracefulShutdown
6364

6465
- Label:
6566
default: Network Configuration
@@ -237,6 +238,14 @@ Parameters:
237238
Description: The number of days to retain the Cloudwatch Logs of the lambda.
238239
Default: "1"
239240

241+
BuildkiteAgentEnableGracefulShutdown:
242+
Description: Set to true to enable graceful shutdown of agents when the ASG is updated with replacement. This allows ASGs to be removed in a timely manner during an in-place update of the elastic stack, and allows remaining agents to finish jobs without interruptions.
243+
Type: String
244+
AllowedValues:
245+
- "true"
246+
- "false"
247+
Default: "false"
248+
240249
BuildkiteAgentTracingBackend:
241250
Description: The tracing backend to use for CI tracing. See https://buildkite.com/docs/agent/v3/tracing
242251
Type: String
@@ -937,6 +946,9 @@ Conditions:
937946
UseCostAllocationTags:
938947
!Equals [ !Ref EnableCostAllocationTags, "true" ]
939948

949+
EnableBuildkiteAgentGracefulShutdown:
950+
!Equals [ !Ref BuildkiteAgentEnableGracefulShutdown, "true" ]
951+
940952
UsePipelineSigningKMSKey:
941953
!Not [ !Equals [ !Ref PipelineSigningKMSKeyId, "" ] ]
942954

@@ -1858,6 +1870,147 @@ Resources:
18581870
ServiceToken: !GetAtt AzRebalancingSuspenderFunction.Arn
18591871
AutoScalingGroupName: !Ref AgentAutoScaleGroup
18601872

1873+
StopBuildkiteAgentsRole:
1874+
Type: AWS::IAM::Role
1875+
Condition: EnableBuildkiteAgentGracefulShutdown
1876+
Properties:
1877+
PermissionsBoundary:
1878+
!If [
1879+
SetInstanceRolePermissionsBoundaryARN,
1880+
!Ref InstanceRolePermissionsBoundaryARN,
1881+
!Ref "AWS::NoValue",
1882+
]
1883+
AssumeRolePolicyDocument:
1884+
Version: 2012-10-17
1885+
Statement:
1886+
- Effect: Allow
1887+
Principal:
1888+
Service:
1889+
- lambda.amazonaws.com
1890+
Action:
1891+
- sts:AssumeRole
1892+
ManagedPolicyArns:
1893+
- arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
1894+
Policies:
1895+
- PolicyName: DescribeASGs
1896+
PolicyDocument:
1897+
Version: 2012-10-17
1898+
Statement:
1899+
- Effect: Allow
1900+
Action:
1901+
- "autoscaling:DescribeAutoScalingGroups"
1902+
Resource: "*"
1903+
- PolicyName: ModifyASGs
1904+
PolicyDocument:
1905+
Version: 2012-10-17
1906+
Statement:
1907+
- Effect: Allow
1908+
Action:
1909+
- "autoscaling:UpdateAutoScalingGroup"
1910+
Resource: !Sub arn:${AWS::Partition}:autoscaling:${AWS::Region}:${AWS::AccountId}:autoScalingGroup:*:autoScalingGroupName/${AWS::StackName}-AgentAutoScaleGroup-*
1911+
- PolicyName: RunStopBuildkiteDocument
1912+
PolicyDocument:
1913+
Version: 2012-10-17
1914+
Statement:
1915+
- Effect: Allow
1916+
Action:
1917+
- "ssm:SendCommand"
1918+
Resource:
1919+
- !Sub arn:${AWS::Partition}:ssm:${AWS::Region}::document/AWS-RunShellScript
1920+
- PolicyName: StopBuildkiteInstances
1921+
PolicyDocument:
1922+
Version: 2012-10-17
1923+
Statement:
1924+
- Effect: Allow
1925+
Action:
1926+
- "ssm:SendCommand"
1927+
Resource:
1928+
- !Sub arn:${AWS::Partition}:ec2:${AWS::Region}:${AWS::AccountId}:instance/*
1929+
Condition:
1930+
StringEquals:
1931+
"aws:resourceTag/aws:cloudformation:logical-id": "AgentAutoScaleGroup"
1932+
1933+
StopBuildkiteAgentsFunction:
1934+
Type: AWS::Lambda::Function
1935+
Condition: EnableBuildkiteAgentGracefulShutdown
1936+
Properties:
1937+
Description: "Gracefully stops all Buildkite agents in a given Auto Scaling group."
1938+
Code:
1939+
ZipFile: |
1940+
import boto3
1941+
import logging
1942+
import cfnresponse
1943+
1944+
logger = logging.getLogger()
1945+
logger.setLevel(logging.INFO)
1946+
1947+
autoscaling_client = boto3.client("autoscaling")
1948+
ssm_client = boto3.client("ssm")
1949+
1950+
def handler(event, context):
    """Handle CloudFormation custom resource events for graceful agent shutdown.

    On an ``Update`` request where the Auto Scaling group name has changed
    (i.e. the group was replaced), the *old* group is scaled to zero and the
    Buildkite agents on its instances are sent SIGTERM. Every other request
    (Create, Delete, or an Update that leaves the group name unchanged) is
    acknowledged without touching any group.

    Args:
        event: The custom resource request. Reads ``RequestType`` and, for
            updates, ``OldResourceProperties["AutoScalingGroupName"]`` and
            ``ResourceProperties["AutoScalingGroupName"]``.
        context: The Lambda context object, passed through to ``cfnresponse``.

    Returns:
        None. The outcome is reported to CloudFormation via ``cfnresponse.send``.
    """
    logger.info(f"Received event: {event}")

    physical_id = "CustomResourcePhysicalID"

    if event["RequestType"] != "Update":
        # For Create and Delete events, just send success response
        cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physical_id)
        return

    try:
        old_group_name = event["OldResourceProperties"]["AutoScalingGroupName"]
        new_group_name = event.get("ResourceProperties", {}).get("AutoScalingGroupName")

        if old_group_name == new_group_name:
            # The group was not replaced, so the "old" group is still the live
            # one. Scaling it to zero here would take down the active fleet.
            logger.info(
                f"Auto Scaling group {old_group_name} was not replaced; nothing to stop"
            )
        else:
            # Scale ASG down to zero, to allow Buildkite agents to terminate
            force_instance_termination(old_group_name)

            # Stop all Buildkite agents in the old ASG
            stop_bk_agents(old_group_name)

        # Send success response to CloudFormation
        cfnresponse.send(event, context, cfnresponse.SUCCESS, {}, physical_id)
    except Exception as e:
        # Top-level boundary: always answer CloudFormation, otherwise the stack
        # operation hangs until the custom resource times out.
        logger.exception(f"Error: {str(e)}")
        cfnresponse.send(event, context, cfnresponse.FAILED, {"Error": str(e)}, physical_id)
1973+
1974+
def force_instance_termination(autoscaling_group_name):
    """Scale the named Auto Scaling group down to zero instances.

    Sets both ``MinSize`` and ``DesiredCapacity`` to 0 in a single
    ``update_auto_scaling_group`` call (the minimum is presumably lowered so
    that a desired capacity of zero is accepted).

    Args:
        autoscaling_group_name: Name of the Auto Scaling group to drain.
    """
    logger.info(f"Setting the desired capacity of {autoscaling_group_name} to zero")

    scale_to_zero = {
        "AutoScalingGroupName": autoscaling_group_name,
        "MinSize": 0,
        "DesiredCapacity": 0,
    }
    autoscaling_client.update_auto_scaling_group(**scale_to_zero)
1982+
1983+
def stop_bk_agents(autoscaling_group_name):
    """Send SIGTERM to the buildkite-agent processes on a group's instances.

    Issues one SSM ``AWS-RunShellScript`` command targeted at every instance
    tagged with the given Auto Scaling group name, then logs the SSM response.

    Args:
        autoscaling_group_name: Name of the Auto Scaling group whose
            instances should have their agents stopped.
    """
    # Everything before "-AgentAutoScaleGroup" is used as the stack name; it is
    # only used for log and command-comment text.
    stack_name = autoscaling_group_name.partition("-AgentAutoScaleGroup")[0]
    description = f"Stopping BK agents in {stack_name}"

    logger.info(description)

    instance_targets = [
        {
            "Key": "tag:aws:autoscaling:groupName",
            "Values": [autoscaling_group_name],
        },
    ]
    shell_parameters = {
        "commands": ["sudo kill -s SIGTERM $(/bin/pidof buildkite-agent)"],
    }

    response = ssm_client.send_command(
        Targets=instance_targets,
        DocumentName="AWS-RunShellScript",
        Comment=description,
        Parameters=shell_parameters,
    )
    logger.info(f"SSM command response: {response}")
2002+
Handler: index.handler
2003+
Role: !GetAtt StopBuildkiteAgentsRole.Arn
2004+
Runtime: "python3.12"
2005+
2006+
StopBuildkiteAgents:
2007+
Type: AWS::CloudFormation::CustomResource
2008+
Condition: EnableBuildkiteAgentGracefulShutdown
2009+
Version: 1.0
2010+
Properties:
2011+
ServiceToken: !GetAtt StopBuildkiteAgentsFunction.Arn
2012+
AutoScalingGroupName: !Ref AgentAutoScaleGroup
2013+
18612014
SecurityGroup:
18622015
Type: AWS::EC2::SecurityGroup
18632016
Condition: CreateSecurityGroup

0 commit comments

Comments
 (0)