
Commit e3f0ba9

Updates the upgrade test to print any fatal error messages to the job pod termination log.
The package is refactored to pass back any fatal errors, cancel the context, and clean up resources on a fatal error. Additional wait timeouts are added to prevent the test from hanging.
1 parent fd0d7d1 commit e3f0ba9
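
For context, the refactor described above follows a common Kubernetes pattern: on a fatal error the test controller cancels its shared context, lets the other workers clean up, and writes the error to the container's termination-message path so the calling script can read it back with kubectl. The following is a minimal, hypothetical Go sketch of that pattern, not the commit's actual code; it assumes the kubelet's default /dev/termination-log path, uses errgroup from golang.org/x/sync (the dependency added in test/upgrade/go.mod below), and the worker function names are illustrative only.

// Hedged sketch: surface a fatal error via the pod termination message and
// cancel the shared context so other workers shut down.
package main

import (
	"context"
	"errors"
	"log"
	"os"

	"golang.org/x/sync/errgroup"
)

// terminationLog is the kubelet's default terminationMessagePath.
const terminationLog = "/dev/termination-log"

// watchGameServers is a placeholder worker; it exits cleanly once the
// shared context is cancelled by a failing worker.
func watchGameServers(ctx context.Context) error {
	<-ctx.Done()
	return nil
}

// runTestScenarios is a placeholder worker that fails with a fatal error.
func runTestScenarios(ctx context.Context) error {
	return errors.New("game server never reached Ready")
}

func main() {
	g, ctx := errgroup.WithContext(context.Background())
	g.Go(func() error { return watchGameServers(ctx) })
	g.Go(func() error { return runTestScenarios(ctx) })

	// A fatal error from any worker cancels ctx for the others; g.Wait returns it.
	if err := g.Wait(); err != nil {
		// Best effort: record the message where Kubernetes exposes it as
		// .status.containerStatuses[].state.terminated.message.
		if werr := os.WriteFile(terminationLog, []byte(err.Error()), 0o600); werr != nil {
			log.Printf("could not write termination log: %v", werr)
		}
		log.Fatalf("fatal error: %v", err)
	}
	log.Println("upgrade test completed")
}

The e2e_upgrade_test.sh change in this commit reads that same message back on failure via kubectl get pod -o go-template over .status.containerStatuses, as shown in the diff below.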

6 files changed: +238 −116 lines


build/e2e_upgrade_test.sh

Lines changed: 38 additions & 22 deletions
@@ -34,23 +34,41 @@ print_failure_logs() {
   local testClusterLocation=$2
   echo "ERROR: Upgrade test failed on cluster: $testCluster"
   gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"
-  job_pods=$(kubectl get pods -l job-name=upgrade-test-runner -o jsonpath="{.items[*].metadata.name}")
-  if [[ -z "$job_pods" ]]; then
-    echo "No pods found for job upgrade-test-runner. They might have failed to schedule or were deleted."
+
+  # Get all pods for the job
+  job_pods_json=$(kubectl get pods -l job-name=upgrade-test-runner -o json)
+
+  # Check if any pods were found
+  if [[ $(echo "$job_pods_json" | jq '.items | length') -eq 0 ]]; then
+    echo "No pods found for job upgrade-test-runner. They might have failed to schedule or were deleted."
   else
-    kubectl logs --tail=20 "$job_pods" || echo "Unable to retrieve logs for pod: $job_pods"
+    # Get the name of the first pod
+    job_pod=$(echo "$job_pods_json" | jq -r '.items[0].metadata.name')
+    pod_status=$(kubectl get pod "$job_pod" -o jsonpath='{.status.phase}')
+
+    echo "--- Pod $job_pod status $pod_status. Retrieving termination message. ---"
+    # A non-restarting pod will have its termination message in 'state.terminated'.
+    termination_message=$(kubectl get pod "$job_pod" -o go-template='{{range .status.containerStatuses}}{{if eq .name "upgrade-test-controller"}}{{.state.terminated.message}}{{end}}{{end}}')
+
+    if [ -n "$termination_message" ]; then
+      echo "Fatal Error: $termination_message"
+    else
+      echo "No termination message found for pod $job_pod. Dumping logs:"
+      # Log all pods found
+      job_pods=$(echo "$job_pods_json" | jq -r '.items[].metadata.name')
       for pod in $job_pods; do
-        containers=$(kubectl get pod "$pod" -o jsonpath='{.spec.containers[*].name}')
-        for container in $containers; do
-          if [[ "$container" == "sdk-client-test" || "$container" == "upgrade-test-controller" ]]; then
-            echo "----- Logs from pod: $pod, container: $container -----"
-            kubectl logs "$pod" -c "$container" || echo "Failed to retrieve logs from $pod/$container"
-          fi
-        done
+        containers=$(kubectl get pod "$pod" -o jsonpath='{.spec.containers[*].name}')
+        for container in $containers; do
+          if [[ "$container" == "sdk-client-test" || "$container" == "upgrade-test-controller" ]]; then
+            echo "----- Logs from pod: $pod, container: $container -----"
+            kubectl logs "$pod" -c "$container" --tail=50 || echo "Failed to retrieve logs from $pod/$container"
+          fi
+        done
       done
+    fi
   fi

-  echo "Logs from log bucket: https://console.cloud.google.com/logs/query;storageScope=storage,projects%2F${PROJECT_ID}%2Flocations%2Fglobal%2Fbuckets%2F${BUCKET_NAME}%2Fviews%2F_AllLogs?hl=en&inv=1&invt=Ab4o5A&mods=logs_tg_prod&project=${PROJECT_ID}"
+  echo "Logs from log bucket: https://console.cloud.google.com/logs/query;storageScope=storage,projects%2F${PROJECT_ID}%2Flocations%2Fglobal%2Fbuckets%2F${BUCKET_NAME}%2Fviews%2F_AllLogs?hl=en&inv=1&invt=Ab4o5A&mods=logs_tg_prod&project=${PROJECT_ID}"
 }
 # ------------------------------------------------------

@@ -100,7 +118,7 @@ do
   echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}".
   if kubectl get jobs | grep upgrade-test-runner ; then
     echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}".
-    kubectl delete job upgrade-test-runner
+    kubectl delete job upgrade-test-runner --ignore-not-found=true
     kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m
   fi

@@ -110,30 +128,30 @@ do
     kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge
     sleep 5
     echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}".
-    kubectl delete gs -l app=sdk-client-test
+    kubectl delete gs -l app=sdk-client-test --ignore-not-found=true
   fi

   if kubectl get po -l app=sdk-client-test | grep ".*"; then
     echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}".
-    kubectl delete po -l app=sdk-client-test
+    kubectl delete po -l app=sdk-client-test --ignore-not-found=true
     kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m
   fi

   # The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating.
   if kubectl get apiservice | grep v1.allocation.agones.dev ; then
     echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}".
-    kubectl delete apiservice v1.allocation.agones.dev
+    kubectl delete apiservice v1.allocation.agones.dev --ignore-not-found=true
   fi

   if kubectl get namespace | grep agones-system ; then
     echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}".
-    kubectl delete namespace agones-system
+    kubectl delete namespace agones-system --ignore-not-found=true
     kubectl wait --for=delete ns agones-system --timeout=5m
   fi

   if kubectl get crds | grep agones ; then
     echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}".
-    kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd
+    kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd --ignore-not-found=true
   fi

   echo kubectl apply -f permissions.yaml on cluster "${testCluster}"
@@ -147,9 +165,7 @@ do
   kubectl apply -f "${tmpdir}"/upgradeTest.yaml

   # We need to wait for job pod to be created and ready before we can wait on the job itself.
-  # TODO: Once all test clusters are at Kubernetes Version >= 1.31 use `kubectl wait --for=create` instead of sleep.
-  # kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m
-  sleep 10s
+  kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m

   # Wait for the pod to become ready (or timeout)
   if ! kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m; then
@@ -170,7 +186,7 @@ done

 for pid in "${pids[@]}"; do
   # This block executes when the process exits and pid status==0
-  if wait $pid; then
+  if wait "$pid"; then
     outputLog="${waitPids[$pid]}"
     # wait for output to finish writing to file
     until [ -s "$outputLog" ]; do sleep 1; done

cloudbuild.yaml

Lines changed: 1 addition & 0 deletions
@@ -241,6 +241,7 @@ steps:
   args: [push]
   waitFor:
   - push-images
+  timeout: 5400s # 1.5h

 # Wait for us to be the oldest ongoing build before we run upgrade and e2e tests
 - name: gcr.io/google.com/cloudsdktool/cloud-sdk

test/upgrade/go.mod

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ toolchain go1.24.2

 require (
 	agones.dev/agones v1.49.0
+	golang.org/x/sync v0.15.0
 	k8s.io/api v0.33.1
 	k8s.io/apimachinery v0.33.1
 	k8s.io/client-go v0.33.1

test/upgrade/go.sum

Lines changed: 2 additions & 0 deletions
@@ -141,6 +141,8 @@ golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKl
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8=
+golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
