Skip to content

Commit 062d08a

Browse files
[fix] add more checks for retrying on transient error during machine creation (#432)
* add more checks for retrying on transient error during machine creation * only have 429 errors wait a full minute
1 parent 3119045 commit 062d08a

File tree

5 files changed

+146
-35
lines changed

5 files changed

+146
-35
lines changed

controller/linodemachine_controller.go

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ import (
5656

5757
const (
5858
linodeBusyCode = 400
59-
linodeTooManyRequests = 429
6059
defaultDiskFilesystem = string(linodego.FilesystemExt4)
6160

6261
// conditions for preflight instance creation
@@ -262,6 +261,16 @@ func (r *LinodeMachineReconciler) reconcile(
262261
return
263262
}
264263

264+
func retryIfTransient(err error) (ctrl.Result, error) {
265+
if util.IsRetryableError(err) {
266+
if linodego.ErrHasStatus(err, http.StatusTooManyRequests) {
267+
return ctrl.Result{RequeueAfter: reconciler.DefaultLinodeTooManyRequestsErrorRetryDelay}, nil
268+
}
269+
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
270+
}
271+
return ctrl.Result{}, err
272+
}
273+
265274
func (r *LinodeMachineReconciler) reconcileCreate(
266275
ctx context.Context,
267276
logger logr.Logger,
@@ -302,16 +311,15 @@ func (r *LinodeMachineReconciler) reconcileCreate(
302311
createOpts, err := r.newCreateConfig(ctx, machineScope, tags, logger)
303312
if err != nil {
304313
logger.Error(err, "Failed to create Linode machine InstanceCreateOptions")
305-
if util.IsTransientError(err) {
306-
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
307-
}
308-
309-
return ctrl.Result{}, err
314+
return retryIfTransient(err)
310315
}
311316
linodeInstance, err = machineScope.LinodeClient.CreateInstance(ctx, *createOpts)
312317
if err != nil {
313-
if linodego.ErrHasStatus(err, linodeTooManyRequests) || linodego.ErrHasStatus(err, linodego.ErrorFromError) {
318+
if util.IsRetryableError(err) {
314319
logger.Error(err, "Failed to create Linode instance due to API error, requeing")
320+
if linodego.ErrHasStatus(err, http.StatusTooManyRequests) {
321+
return ctrl.Result{RequeueAfter: reconciler.DefaultLinodeTooManyRequestsErrorRetryDelay}, nil
322+
}
315323
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
316324
}
317325
logger.Error(err, "Failed to create Linode machine instance")
@@ -362,11 +370,12 @@ func (r *LinodeMachineReconciler) reconcileInstanceCreate(
362370
instanceConfig, err := r.getDefaultInstanceConfig(ctx, machineScope, linodeInstance.ID)
363371
if err != nil {
364372
logger.Error(err, "Failed to get default instance configuration")
365-
return ctrl.Result{}, err
373+
return retryIfTransient(err)
366374
}
367375

368376
if _, err := machineScope.LinodeClient.UpdateInstanceConfig(ctx, linodeInstance.ID, instanceConfig.ID, linodego.InstanceConfigUpdateOptions{Kernel: machineScope.LinodeMachine.Spec.Configuration.Kernel}); err != nil {
369-
return ctrl.Result{}, err
377+
logger.Error(err, "Failed to update default instance configuration")
378+
return retryIfTransient(err)
370379
}
371380
}
372381

controller/linodemachine_controller_test.go

Lines changed: 110 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"context"
2222
"errors"
2323
"net"
24+
"net/http"
2425
"time"
2526

2627
"github.com/go-logr/logr"
@@ -811,9 +812,10 @@ var _ = Describe("machine-lifecycle", Ordered, Label("machine", "machine-lifecyc
811812
linodeMachine := &infrav1alpha2.LinodeMachine{
812813
ObjectMeta: metadata,
813814
Spec: infrav1alpha2.LinodeMachineSpec{
814-
InstanceID: ptr.To(0),
815-
Type: "g6-nanode-1",
816-
Image: rutil.DefaultMachineControllerLinodeImage,
815+
InstanceID: ptr.To(0),
816+
Type: "g6-nanode-1",
817+
Image: rutil.DefaultMachineControllerLinodeImage,
818+
Configuration: &infrav1alpha2.InstanceConfiguration{Kernel: "test"},
817819
},
818820
}
819821
machineKey := client.ObjectKeyFromObject(linodeMachine)
@@ -930,8 +932,113 @@ var _ = Describe("machine-lifecycle", Ordered, Label("machine", "machine-lifecyc
930932
})),
931933
),
932934
),
935+
Path(
936+
Call("machine is not created because there were too many requests", func(ctx context.Context, mck Mock) {
937+
listInst := mck.LinodeClient.EXPECT().
938+
ListInstances(ctx, gomock.Any()).
939+
Return([]linodego.Instance{}, nil)
940+
mck.LinodeClient.EXPECT().
941+
GetRegion(ctx, gomock.Any()).
942+
After(listInst).
943+
Return(&linodego.Region{Capabilities: []string{"Metadata"}}, nil)
944+
}),
945+
OneOf(
946+
Path(Result("create requeues when failing to create instance config", func(ctx context.Context, mck Mock) {
947+
mck.LinodeClient.EXPECT().
948+
GetImage(ctx, gomock.Any()).
949+
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
950+
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
951+
Expect(err).NotTo(HaveOccurred())
952+
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
953+
Expect(mck.Logs()).To(ContainSubstring("Failed to create Linode machine InstanceCreateOptions"))
954+
})),
955+
Path(Result("create requeues when failing to create instance", func(ctx context.Context, mck Mock) {
956+
getImage := mck.LinodeClient.EXPECT().
957+
GetImage(ctx, gomock.Any()).
958+
Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil)
959+
mck.LinodeClient.EXPECT().CreateInstance(gomock.Any(), gomock.Any()).
960+
After(getImage).
961+
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
962+
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
963+
Expect(err).NotTo(HaveOccurred())
964+
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
965+
Expect(mck.Logs()).To(ContainSubstring("Failed to create Linode instance due to API error"))
966+
})),
967+
Path(Result("create requeues when failing to update instance config", func(ctx context.Context, mck Mock) {
968+
getImage := mck.LinodeClient.EXPECT().
969+
GetImage(ctx, gomock.Any()).
970+
Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil)
971+
createInst := mck.LinodeClient.EXPECT().
972+
CreateInstance(ctx, gomock.Any()).
973+
After(getImage).
974+
Return(&linodego.Instance{
975+
ID: 123,
976+
IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))},
977+
IPv6: "fd00::",
978+
Status: linodego.InstanceOffline,
979+
}, nil)
980+
listInstConfigs := mck.LinodeClient.EXPECT().
981+
ListInstanceConfigs(ctx, 123, gomock.Any()).
982+
After(createInst).
983+
Return([]linodego.InstanceConfig{{
984+
Devices: &linodego.InstanceConfigDeviceMap{
985+
SDA: &linodego.InstanceConfigDevice{DiskID: 100},
986+
},
987+
}}, nil)
988+
mck.LinodeClient.EXPECT().
989+
UpdateInstanceConfig(ctx, 123, 0, gomock.Any()).
990+
After(listInstConfigs).
991+
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
992+
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
993+
Expect(err).NotTo(HaveOccurred())
994+
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
995+
Expect(mck.Logs()).To(ContainSubstring("Failed to update default instance configuration"))
996+
})),
997+
Path(Result("create requeues when failing to get instance config", func(ctx context.Context, mck Mock) {
998+
getImage := mck.LinodeClient.EXPECT().
999+
GetImage(ctx, gomock.Any()).
1000+
Return(&linodego.Image{Capabilities: []string{"cloud-init"}}, nil)
1001+
createInst := mck.LinodeClient.EXPECT().
1002+
CreateInstance(ctx, gomock.Any()).
1003+
After(getImage).
1004+
Return(&linodego.Instance{
1005+
ID: 123,
1006+
IPv4: []*net.IP{ptr.To(net.IPv4(192, 168, 0, 2))},
1007+
IPv6: "fd00::",
1008+
Status: linodego.InstanceOffline,
1009+
}, nil)
1010+
updateInstConfig := mck.LinodeClient.EXPECT().
1011+
UpdateInstanceConfig(ctx, 123, 0, gomock.Any()).
1012+
After(createInst).
1013+
Return(nil, nil).AnyTimes()
1014+
getAddrs := mck.LinodeClient.EXPECT().
1015+
GetInstanceIPAddresses(ctx, 123).
1016+
After(updateInstConfig).
1017+
Return(&linodego.InstanceIPAddressResponse{
1018+
IPv4: &linodego.InstanceIPv4Response{
1019+
Private: []*linodego.InstanceIP{{Address: "192.168.0.2"}},
1020+
Public: []*linodego.InstanceIP{{Address: "172.0.0.2"}},
1021+
},
1022+
IPv6: &linodego.InstanceIPv6Response{
1023+
SLAAC: &linodego.InstanceIP{
1024+
Address: "fd00::",
1025+
},
1026+
},
1027+
}, nil).AnyTimes()
1028+
mck.LinodeClient.EXPECT().
1029+
ListInstanceConfigs(ctx, 123, gomock.Any()).
1030+
After(getAddrs).
1031+
Return(nil, &linodego.Error{Code: http.StatusTooManyRequests})
1032+
res, err := reconciler.reconcile(ctx, mck.Logger(), mScope)
1033+
Expect(err).NotTo(HaveOccurred())
1034+
Expect(res.RequeueAfter).To(Equal(rutil.DefaultLinodeTooManyRequestsErrorRetryDelay))
1035+
Expect(mck.Logs()).To(ContainSubstring("Failed to get default instance configuration"))
1036+
})),
1037+
),
1038+
),
9331039
Path(
9341040
Call("machine is created", func(ctx context.Context, mck Mock) {
1041+
linodeMachine.Spec.Configuration = nil
9351042
}),
9361043
OneOf(
9371044
Path(Result("creates a worker machine without disks", func(ctx context.Context, mck Mock) {

util/helpers.go

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,22 +34,15 @@ func UnwrapError(err error) error {
3434
return err
3535
}
3636

37-
// IsTransientError determines if the error is transient, meaning a controller that
37+
// IsRetryableError determines if the error is retryable, meaning a controller that
3838
// encounters this error should requeue reconciliation to try again later
39-
func IsTransientError(err error) bool {
40-
if linodego.ErrHasStatus(
39+
func IsRetryableError(err error) bool {
40+
return linodego.ErrHasStatus(
4141
err,
4242
http.StatusTooManyRequests,
4343
http.StatusInternalServerError,
4444
http.StatusBadGateway,
4545
http.StatusGatewayTimeout,
46-
http.StatusServiceUnavailable) {
47-
return true
48-
}
49-
50-
if errors.Is(err, http.ErrHandlerTimeout) || errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, io.ErrUnexpectedEOF) {
51-
return true
52-
}
53-
54-
return false
46+
http.StatusServiceUnavailable,
47+
linodego.ErrorFromError) || errors.Is(err, http.ErrHandlerTimeout) || errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, io.ErrUnexpectedEOF)
5548
}

util/helpers_test.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,39 +52,39 @@ func TestIgnoreLinodeAPIError(t *testing.T) {
5252
}
5353
}
5454

55-
func TestIsTransientError(t *testing.T) {
55+
func TestIsRetryableError(t *testing.T) {
5656
t.Parallel()
5757
tests := []struct {
58-
name string
59-
err error
60-
shouldRetry bool
58+
name string
59+
err error
60+
want bool
6161
}{{
62-
name: "unexpected EOF",
63-
err: io.ErrUnexpectedEOF,
64-
shouldRetry: true,
62+
name: "unexpected EOF",
63+
err: io.ErrUnexpectedEOF,
64+
want: true,
6565
}, {
6666
name: "not found Linode API error",
6767
err: &linodego.Error{
6868
Response: nil,
6969
Code: http.StatusNotFound,
7070
Message: "not found",
7171
},
72-
shouldRetry: false,
72+
want: false,
7373
}, {
7474
name: "Rate limiting Linode API error",
7575
err: &linodego.Error{
7676
Response: nil,
7777
Code: http.StatusTooManyRequests,
7878
Message: "rate limited",
7979
},
80-
shouldRetry: true,
80+
want: true,
8181
}}
8282
for _, tt := range tests {
8383
testcase := tt
8484
t.Run(testcase.name, func(t *testing.T) {
8585
t.Parallel()
86-
if testcase.shouldRetry != IsTransientError(testcase.err) {
87-
t.Errorf("wanted %v, got %v", testcase.shouldRetry, IsTransientError(testcase.err))
86+
if testcase.want != IsRetryableError(testcase.err) {
87+
t.Errorf("wanted %v, got %v", testcase.want, IsRetryableError(testcase.err))
8888
}
8989
})
9090
}

util/reconciler/defaults.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ const (
3838
DefaultMachineControllerWaitForRunningTimeout = 20 * time.Minute
3939
// DefaultMachineControllerRetryDelay is the default requeue delay if there is an error.
4040
DefaultMachineControllerRetryDelay = 10 * time.Second
41+
// DefaultLinodeTooManyRequestsErrorRetryDelay is the default requeue delay if there is a Linode API error.
42+
DefaultLinodeTooManyRequestsErrorRetryDelay = time.Minute
4143

4244
// DefaultVPCControllerReconcileDelay is the default requeue delay when a reconcile operation fails.
4345
DefaultVPCControllerReconcileDelay = 5 * time.Second

0 commit comments

Comments
 (0)