Skip to content

Commit 1bbcba2

Browse files
committed
Updates the upgrade test to print any fatal error messages to the job pod termination log.
The package is refactored to pass back any fatal errors, cancel the context, and clean up resources on a fatal error. Additional wait timeouts are added to prevent the test from hanging.
1 parent fd0d7d1 commit 1bbcba2

File tree

6 files changed

+235
-117
lines changed

6 files changed

+235
-117
lines changed

build/e2e_upgrade_test.sh

Lines changed: 35 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -34,23 +34,37 @@ print_failure_logs() {
3434
local testClusterLocation=$2
3535
echo "ERROR: Upgrade test failed on cluster: $testCluster"
3636
gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"
37-
job_pods=$(kubectl get pods -l job-name=upgrade-test-runner -o jsonpath="{.items[*].metadata.name}")
38-
if [[ -z "$job_pods" ]]; then
39-
echo "No pods found for job upgrade-test-runner. They might have failed to schedule or were deleted."
37+
38+
# Get all pods for the job
39+
job_pods_json=$(kubectl get pods -l job-name=upgrade-test-runner -o json)
40+
41+
# Check if any pods were found
42+
if [[ $(echo "$job_pods_json" | jq '.items | length') -eq 0 ]]; then
43+
echo "No pods found for job upgrade-test-runner. They might have failed to schedule or were deleted."
4044
else
41-
kubectl logs --tail=20 "$job_pods" || echo "Unable to retrieve logs for pod: $job_pods"
42-
for pod in $job_pods; do
43-
containers=$(kubectl get pod "$pod" -o jsonpath='{.spec.containers[*].name}')
44-
for container in $containers; do
45-
if [[ "$container" == "sdk-client-test" || "$container" == "upgrade-test-controller" ]]; then
46-
echo "----- Logs from pod: $pod, container: $container -----"
47-
kubectl logs "$pod" -c "$container" || echo "Failed to retrieve logs from $pod/$container"
48-
fi
49-
done
45+
# Get the name of the first (and only) pod
46+
job_pod=$(echo "$job_pods_json" | jq -r '.items[0].metadata.name')
47+
pod_status=$(kubectl get pod "$job_pod" -o jsonpath='{.status.phase}')
48+
49+
echo "--- Pod $job_pod status $pod_status. Retrieving termination message. ---"
50+
# A non-restarting pod will have its termination message in 'state.terminated'.
51+
termination_message=$(kubectl get pod "$job_pod" -o go-template='{{range .status.containerStatuses}}{{if eq .name "upgrade-test-controller"}}{{.state.terminated.message}}{{end}}{{end}}')
52+
53+
if [ -n "$termination_message" ]; then
54+
echo "Fatal Error: $termination_message"
55+
else
56+
echo "No termination message found for pod $job_pod. Dumping logs:"
57+
containers=$(kubectl get pod "$job_pod" -o jsonpath='{.spec.containers[*].name}')
58+
for container in $containers; do
59+
if [[ "$container" == "sdk-client-test" || "$container" == "upgrade-test-controller" ]]; then
60+
echo "----- Logs from pod: $job_pod, container: $container -----"
61+
kubectl logs "$job_pod" -c "$container" --tail=50 || echo "Failed to retrieve logs from $job_pod/$container"
62+
fi
5063
done
64+
fi
5165
fi
5266

53-
echo "Logs from log bucket: https://console.cloud.google.com/logs/query;storageScope=storage,projects%2F${PROJECT_ID}%2Flocations%2Fglobal%2Fbuckets%2F${BUCKET_NAME}%2Fviews%2F_AllLogs?hl=en&inv=1&invt=Ab4o5A&mods=logs_tg_prod&project=${PROJECT_ID}"
67+
echo "Logs from log bucket: https://console.cloud.google.com/logs/query;storageScope=storage,projects%2F${PROJECT_ID}%2Flocations%2Fglobal%2Fbuckets%2F${BUCKET_NAME}%2Fviews%2F_AllLogs?hl=en&inv=1&invt=Ab4o5A&mods=logs_tg_prod&project=${PROJECT_ID}"
5468
}
5569
# ------------------------------------------------------
5670

@@ -100,7 +114,7 @@ do
100114
echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}".
101115
if kubectl get jobs | grep upgrade-test-runner ; then
102116
echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}".
103-
kubectl delete job upgrade-test-runner
117+
kubectl delete job upgrade-test-runner --ignore-not-found=true
104118
kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m
105119
fi
106120

@@ -110,30 +124,30 @@ do
110124
kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge
111125
sleep 5
112126
echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}".
113-
kubectl delete gs -l app=sdk-client-test
127+
kubectl delete gs -l app=sdk-client-test --ignore-not-found=true
114128
fi
115129

116130
if kubectl get po -l app=sdk-client-test | grep ".*"; then
117131
echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}".
118-
kubectl delete po -l app=sdk-client-test
132+
kubectl delete po -l app=sdk-client-test --ignore-not-found=true
119133
kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m
120134
fi
121135

122136
# The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating.
123137
if kubectl get apiservice | grep v1.allocation.agones.dev ; then
124138
echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}".
125-
kubectl delete apiservice v1.allocation.agones.dev
139+
kubectl delete apiservice v1.allocation.agones.dev --ignore-not-found=true
126140
fi
127141

128142
if kubectl get namespace | grep agones-system ; then
129143
echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}".
130-
kubectl delete namespace agones-system
144+
kubectl delete namespace agones-system --ignore-not-found=true
131145
kubectl wait --for=delete ns agones-system --timeout=5m
132146
fi
133147

134148
if kubectl get crds | grep agones ; then
135149
echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}".
136-
kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd
150+
kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd --ignore-not-found=true
137151
fi
138152

139153
echo kubectl apply -f permissions.yaml on cluster "${testCluster}"
@@ -147,9 +161,7 @@ do
147161
kubectl apply -f "${tmpdir}"/upgradeTest.yaml
148162

149163
# We need to wait for job pod to be created and ready before we can wait on the job itself.
150-
# TODO: Once all test clusters are at Kubernetes Version >= 1.31 use `kubectl wait --for=create` instead of sleep.
151-
# kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m
152-
sleep 10s
164+
kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m
153165

154166
# Wait for the pod to become ready (or timeout)
155167
if ! kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m; then
@@ -170,7 +182,7 @@ done
170182

171183
for pid in "${pids[@]}"; do
172184
# This block executes when the process exits and pid status==0
173-
if wait $pid; then
185+
if wait "$pid"; then
174186
outputLog="${waitPids[$pid]}"
175187
# wait for output to finish writing to file
176188
until [ -s "$outputLog" ]; do sleep 1; done

cloudbuild.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ steps:
241241
args: [push]
242242
waitFor:
243243
- push-images
244+
timeout: 5400s # 1.5h
244245

245246
# Wait for us to be the oldest ongoing build before we run upgrade and e2e tests
246247
- name: gcr.io/google.com/cloudsdktool/cloud-sdk

test/upgrade/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ toolchain go1.24.2
66

77
require (
88
agones.dev/agones v1.49.0
9+
golang.org/x/sync v0.15.0
910
k8s.io/api v0.33.1
1011
k8s.io/apimachinery v0.33.1
1112
k8s.io/client-go v0.33.1

test/upgrade/go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKl
141141
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
142142
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
143143
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
144+
golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8=
145+
golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
144146
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
145147
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
146148
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=

0 commit comments

Comments
 (0)