@@ -235,26 +235,27 @@ func (s *session) beginTransaction(mode db.AccessMode, config *TransactionConfig
235
235
}, nil
236
236
}
237
237
238
- func (s * session ) runOneTry (mode db.AccessMode , work TransactionWork , config * TransactionConfig ) (interface {}, error ) {
238
+ func (s * session ) runOneTry (mode db.AccessMode , work TransactionWork , config * TransactionConfig ) (interface {}, bool , error ) {
239
239
tx , err := s .beginTransaction (mode , config )
240
240
if err != nil {
241
- return nil , err
241
+ return nil , false , err
242
242
}
243
243
defer func () {
244
244
tx .Close ()
245
245
}()
246
246
247
247
x , err := work (tx )
248
248
if err != nil {
249
- return nil , err
249
+ return nil , false , err
250
250
}
251
251
252
252
err = tx .Commit ()
253
253
if err != nil {
254
- return nil , err
254
+ // Indicate that Commit failed; it is not safe to retry on a network error in this case
255
+ return nil , true , err
255
256
}
256
257
257
- return x , nil
258
+ return x , false , nil
258
259
}
259
260
260
261
func (s * session ) runRetriable (
@@ -278,42 +279,52 @@ func (s *session) runRetriable(
278
279
}
279
280
280
281
var (
281
- maxDeadErrors = s .config .MaxConnectionPoolSize / 2
282
- maxClusterErrors = 1
283
- throttle = throttler (s .throttleTime )
284
- start time.Time
282
+ maxDeadErrors = s .config .MaxConnectionPoolSize / 2
283
+ throttle = throttler (s .throttleTime )
284
+ start time.Time
285
285
)
286
286
for {
287
287
// Always return the current connection before trying (again)
288
288
s .returnConn ()
289
289
s .res = nil
290
290
291
- x , err := s .runOneTry (mode , work , & config )
291
+ x , commitFailure , err := s .runOneTry (mode , work , & config )
292
292
if err == nil {
293
293
return x , nil
294
294
}
295
295
296
296
s .log .Debugf (s .logId , "Retriable transaction evaluating error: %s" , err )
297
297
298
- // If we failed due to connect problem, just give up since the pool tries really hard
299
- if s .conn == nil {
300
- s .log .Errorf (s .logId , "Retriable transaction failed due to no available connection: %s" , err )
301
- return nil , err
302
- }
303
-
304
298
// Check retry timeout
305
299
if start .IsZero () {
306
300
start = s .now ()
307
301
}
308
302
if time .Since (start ) > s .config .MaxTransactionRetryTime {
309
- s .log .Errorf (s .logId , "Retriable transaction failed due to reaching MaxTransactionRetryTime: %s" , s .config .MaxTransactionRetryTime .String ())
303
+ s .log .Errorf (s .logId , "Retriable transaction failed due to reaching MaxTransactionRetryTime (%s): %s" ,
304
+ s .config .MaxTransactionRetryTime .String (), err )
310
305
return nil , err
311
306
}
312
307
313
308
// Failed, check cause and determine next action
314
309
310
+ // If we failed due to a connection problem, wait a bit and retry
311
+ if s .conn == nil {
312
+ throttle = throttle .next ()
313
+ d := throttle .delay ()
314
+ s .log .Debugf (s .logId , "Retrying transaction due to no available connection after sleeping for %s" , d .String ())
315
+ s .sleep (d )
316
+ continue
317
+ }
318
+
315
319
// If the connection is dead just return the connection, get another and try again, no sleep
320
+ // Do not do this if the connection died during commit phase since we don't know if we have
321
+ // successfully committed or not, might corrupt data otherwise!
316
322
if ! s .conn .IsAlive () {
323
+ if commitFailure {
324
+ err = errors .New (fmt .Sprintf ("Retriable transaction failed due to lost connection during commit: %s" , err ))
325
+ s .log .Error (s .logId , err )
326
+ return nil , err
327
+ }
317
328
maxDeadErrors --
318
329
if maxDeadErrors < 0 {
319
330
s .log .Errorf (s .logId , "Retriable transaction failed due to too many dead connections" )
@@ -331,12 +342,10 @@ func (s *session) runRetriable(
331
342
case e .IsRetriableCluster ():
332
343
// Force routing tables to be updated before trying again
333
344
s .router .Invalidate (s .databaseName )
334
- maxClusterErrors --
335
- if maxClusterErrors < 0 {
336
- s .log .Errorf (s .logId , "Retriable transaction failed due to encountering too many cluster errors" )
337
- return nil , err
338
- }
339
- s .log .Debugf (s .logId , "Retrying transaction due to cluster error" )
345
+ throttle = throttle .next ()
346
+ d := throttle .delay ()
347
+ s .log .Debugf (s .logId , "Retrying transaction due to cluster error after sleeping for %s" , d .String ())
348
+ s .sleep (d )
340
349
case e .IsRetriableTransient ():
341
350
throttle = throttle .next ()
342
351
d := throttle .delay ()
0 commit comments