Skip to content

Commit 5c3be9c

Browse files
authored
Use error-specific retry mechanisms in WebSocketConnection and associated classes
1 parent 8fc0b49 commit 5c3be9c

File tree

8 files changed

+79
-122
lines changed

8 files changed

+79
-122
lines changed

service/src/main/java/org/whispersystems/textsecuregcm/WhisperServerService.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,6 @@ public void run(WhisperServerConfiguration config, Environment environment) thro
504504
messageDeliveryQueue);
505505

506506
ScheduledExecutorService recurringJobExecutor = ScheduledExecutorServiceBuilder.of(environment, "recurringJob").threads(6).build();
507-
ScheduledExecutorService websocketScheduledExecutor = ScheduledExecutorServiceBuilder.of(environment, "websocket").threads(8).build();
508507
ExecutorService apnSenderExecutor = ExecutorServiceBuilder.of(environment, "apnSender")
509508
.maxThreads(1).minThreads(1).build();
510509
ExecutorService fcmSenderExecutor = ExecutorServiceBuilder.of(environment, "fcmSender")
@@ -996,7 +995,7 @@ protected void configureServer(final ServerBuilder<?> serverBuilder) {
996995
config.idlePrimaryDeviceReminderConfiguration().minIdleDuration(), Clock.systemUTC()));
997996
webSocketEnvironment.setConnectListener(
998997
new AuthenticatedConnectListener(accountsManager, receiptSender, messagesManager, messageMetrics, pushNotificationManager,
999-
pushNotificationScheduler, redisMessageAvailabilityManager, disconnectionRequestManager, websocketScheduledExecutor,
998+
pushNotificationScheduler, redisMessageAvailabilityManager, disconnectionRequestManager,
1000999
messageDeliveryScheduler, clientReleaseManager, messageDeliveryLoopMonitor, experimentEnrollmentManager));
10011000
webSocketEnvironment.jersey().register(new RateLimitByIpFilter(rateLimiters));
10021001
webSocketEnvironment.jersey().register(new RequestStatisticsFilter(TrafficSource.WEBSOCKET));

service/src/main/java/org/whispersystems/textsecuregcm/redis/ClusterLuaScript.java

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import org.slf4j.Logger;
2222
import org.slf4j.LoggerFactory;
2323
import reactor.core.publisher.Flux;
24-
import reactor.core.publisher.Mono;
2524

2625
public class ClusterLuaScript {
2726

@@ -133,13 +132,7 @@ private <T> Flux<Object> executeReactive(final StatefulRedisClusterConnection<T,
133132
final T[] keys, final T[] args) {
134133

135134
return connection.reactive().evalsha(sha, scriptOutputType, keys, args)
136-
.onErrorResume(e -> {
137-
if (e instanceof RedisNoScriptException) {
138-
return connection.reactive().eval(script, scriptOutputType, keys, args);
139-
}
140-
141-
log.warn("Failed to execute script", e);
142-
return Mono.error(e);
143-
});
135+
.onErrorResume(RedisNoScriptException.class, _ -> connection.reactive().eval(script, scriptOutputType, keys, args))
136+
.doOnError(throwable -> log.warn("Failed to execute script", throwable));
144137
}
145138
}

service/src/main/java/org/whispersystems/textsecuregcm/storage/MessagesCache.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
import reactor.core.publisher.Mono;
5353
import reactor.core.scheduler.Scheduler;
5454
import reactor.core.scheduler.Schedulers;
55+
import reactor.util.retry.RetrySpec;
5556

5657
/**
5758
* Manages short-term storage of messages in Redis. Messages are frequently delivered to their destination and deleted
@@ -521,6 +522,7 @@ private Mono<Pair<List<byte[]>, Long>> getNextMessagePage(final UUID destination
521522
long messageId, int pageSize) {
522523

523524
return getItemsScript.execute(destinationUuid, destinationDevice, pageSize, messageId)
525+
.retryWhen(RetrySpec.backoff(4, Duration.ofSeconds(1)).maxBackoff(Duration.ofSeconds(4)))
524526
.map(queueItems -> {
525527
logger.trace("Processing page: {}", messageId);
526528

service/src/main/java/org/whispersystems/textsecuregcm/websocket/AuthenticatedConnectListener.java

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
import io.micrometer.core.instrument.Tags;
1111
import java.util.Optional;
12-
import java.util.concurrent.ScheduledExecutorService;
1312
import org.slf4j.Logger;
1413
import org.slf4j.LoggerFactory;
1514
import org.whispersystems.textsecuregcm.auth.AuthenticatedDevice;
@@ -19,10 +18,10 @@
1918
import org.whispersystems.textsecuregcm.limits.MessageDeliveryLoopMonitor;
2019
import org.whispersystems.textsecuregcm.metrics.MessageMetrics;
2120
import org.whispersystems.textsecuregcm.metrics.OpenWebSocketCounter;
22-
import org.whispersystems.textsecuregcm.push.RedisMessageAvailabilityManager;
2321
import org.whispersystems.textsecuregcm.push.PushNotificationManager;
2422
import org.whispersystems.textsecuregcm.push.PushNotificationScheduler;
2523
import org.whispersystems.textsecuregcm.push.ReceiptSender;
24+
import org.whispersystems.textsecuregcm.push.RedisMessageAvailabilityManager;
2625
import org.whispersystems.textsecuregcm.storage.Account;
2726
import org.whispersystems.textsecuregcm.storage.AccountsManager;
2827
import org.whispersystems.textsecuregcm.storage.ClientReleaseManager;
@@ -51,7 +50,6 @@ public class AuthenticatedConnectListener implements WebSocketConnectListener {
5150
private final PushNotificationScheduler pushNotificationScheduler;
5251
private final RedisMessageAvailabilityManager redisMessageAvailabilityManager;
5352
private final DisconnectionRequestManager disconnectionRequestManager;
54-
private final ScheduledExecutorService scheduledExecutorService;
5553
private final Scheduler messageDeliveryScheduler;
5654
private final ClientReleaseManager clientReleaseManager;
5755
private final MessageDeliveryLoopMonitor messageDeliveryLoopMonitor;
@@ -69,7 +67,6 @@ public AuthenticatedConnectListener(
6967
final PushNotificationScheduler pushNotificationScheduler,
7068
final RedisMessageAvailabilityManager redisMessageAvailabilityManager,
7169
final DisconnectionRequestManager disconnectionRequestManager,
72-
final ScheduledExecutorService scheduledExecutorService,
7370
final Scheduler messageDeliveryScheduler,
7471
final ClientReleaseManager clientReleaseManager,
7572
final MessageDeliveryLoopMonitor messageDeliveryLoopMonitor,
@@ -83,7 +80,6 @@ public AuthenticatedConnectListener(
8380
this.pushNotificationScheduler = pushNotificationScheduler;
8481
this.redisMessageAvailabilityManager = redisMessageAvailabilityManager;
8582
this.disconnectionRequestManager = disconnectionRequestManager;
86-
this.scheduledExecutorService = scheduledExecutorService;
8783
this.messageDeliveryScheduler = messageDeliveryScheduler;
8884
this.clientReleaseManager = clientReleaseManager;
8985
this.messageDeliveryLoopMonitor = messageDeliveryLoopMonitor;
@@ -126,7 +122,6 @@ public void onWebSocketConnect(final WebSocketSessionContext context) {
126122
maybeAuthenticatedAccount.get(),
127123
maybeAuthenticatedDevice.get(),
128124
context.getClient(),
129-
scheduledExecutorService,
130125
messageDeliveryScheduler,
131126
clientReleaseManager,
132127
messageDeliveryLoopMonitor,

service/src/main/java/org/whispersystems/textsecuregcm/websocket/WebSocketConnection.java

Lines changed: 24 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,15 @@
1818
import java.util.Collections;
1919
import java.util.List;
2020
import java.util.Optional;
21-
import java.util.Random;
2221
import java.util.UUID;
2322
import java.util.concurrent.CompletableFuture;
24-
import java.util.concurrent.ScheduledExecutorService;
25-
import java.util.concurrent.ScheduledFuture;
2623
import java.util.concurrent.Semaphore;
2724
import java.util.concurrent.TimeUnit;
2825
import java.util.concurrent.TimeoutException;
2926
import java.util.concurrent.atomic.AtomicBoolean;
30-
import java.util.concurrent.atomic.AtomicInteger;
3127
import java.util.concurrent.atomic.AtomicLong;
3228
import java.util.concurrent.atomic.AtomicReference;
3329
import java.util.concurrent.atomic.LongAdder;
34-
import javax.annotation.Nullable;
3530
import org.apache.commons.lang3.StringUtils;
3631
import org.eclipse.jetty.util.StaticException;
3732
import org.reactivestreams.Publisher;
@@ -48,10 +43,10 @@
4843
import org.whispersystems.textsecuregcm.metrics.MessageMetrics;
4944
import org.whispersystems.textsecuregcm.metrics.MetricsUtil;
5045
import org.whispersystems.textsecuregcm.metrics.UserAgentTagUtil;
46+
import org.whispersystems.textsecuregcm.push.MessageAvailabilityListener;
5147
import org.whispersystems.textsecuregcm.push.PushNotificationManager;
5248
import org.whispersystems.textsecuregcm.push.PushNotificationScheduler;
5349
import org.whispersystems.textsecuregcm.push.ReceiptSender;
54-
import org.whispersystems.textsecuregcm.push.MessageAvailabilityListener;
5550
import org.whispersystems.textsecuregcm.storage.Account;
5651
import org.whispersystems.textsecuregcm.storage.ClientReleaseManager;
5752
import org.whispersystems.textsecuregcm.storage.Device;
@@ -65,6 +60,7 @@
6560
import reactor.core.publisher.Flux;
6661
import reactor.core.publisher.Mono;
6762
import reactor.core.scheduler.Scheduler;
63+
import reactor.util.retry.Retry;
6864

6965
public class WebSocketConnection implements MessageAvailabilityListener, DisconnectionRequestListener {
7066

@@ -80,7 +76,6 @@ public class WebSocketConnection implements MessageAvailabilityListener, Disconn
8076
"initialQueueLength");
8177
private static final String INITIAL_QUEUE_DRAIN_TIMER_NAME = name(WebSocketConnection.class, "drainInitialQueue");
8278
private static final String SLOW_QUEUE_DRAIN_COUNTER_NAME = name(WebSocketConnection.class, "slowQueueDrain");
83-
private static final String QUEUE_DRAIN_RETRY_COUNTER_NAME = name(WebSocketConnection.class, "queueDrainRetry");
8479
private static final String DISPLACEMENT_COUNTER_NAME = name(WebSocketConnection.class, "displacement");
8580
private static final String NON_SUCCESS_RESPONSE_COUNTER_NAME = name(WebSocketConnection.class,
8681
"clientNonSuccessResponse");
@@ -105,11 +100,6 @@ public class WebSocketConnection implements MessageAvailabilityListener, Disconn
105100
@VisibleForTesting
106101
static final int MESSAGE_SENDER_MAX_CONCURRENCY = 256;
107102

108-
@VisibleForTesting
109-
static final int MAX_CONSECUTIVE_RETRIES = 5;
110-
private static final long RETRY_DELAY_MILLIS = 1_000;
111-
private static final int RETRY_DELAY_JITTER_MILLIS = 500;
112-
113103
private static final int DEFAULT_SEND_FUTURES_TIMEOUT_MILLIS = 5 * 60 * 1000;
114104

115105
private static final Duration CLOSE_WITH_PENDING_MESSAGES_NOTIFICATION_DELAY = Duration.ofMinutes(1);
@@ -130,19 +120,14 @@ public class WebSocketConnection implements MessageAvailabilityListener, Disconn
130120

131121
private final int sendFuturesTimeoutMillis;
132122

133-
private final ScheduledExecutorService scheduledExecutorService;
134-
135123
private final Semaphore processStoredMessagesSemaphore = new Semaphore(1);
136124
private final AtomicReference<StoredMessageState> storedMessageState = new AtomicReference<>(
137125
StoredMessageState.PERSISTED_NEW_MESSAGES_AVAILABLE);
138126
private final AtomicBoolean sentInitialQueueEmptyMessage = new AtomicBoolean(false);
139127
private final LongAdder sentMessageCounter = new LongAdder();
140128
private final AtomicLong queueDrainStartTime = new AtomicLong();
141-
private final AtomicInteger consecutiveRetries = new AtomicInteger();
142-
private final AtomicReference<ScheduledFuture<?>> retryFuture = new AtomicReference<>();
143129
private final AtomicReference<Disposable> messageSubscription = new AtomicReference<>();
144130

145-
private final Random random = new Random();
146131
private final Scheduler messageDeliveryScheduler;
147132

148133
private final ClientReleaseManager clientReleaseManager;
@@ -161,7 +146,6 @@ public WebSocketConnection(ReceiptSender receiptSender,
161146
Account authenticatedAccount,
162147
Device authenticatedDevice,
163148
WebSocketClient client,
164-
ScheduledExecutorService scheduledExecutorService,
165149
Scheduler messageDeliveryScheduler,
166150
ClientReleaseManager clientReleaseManager,
167151
MessageDeliveryLoopMonitor messageDeliveryLoopMonitor,
@@ -176,7 +160,6 @@ public WebSocketConnection(ReceiptSender receiptSender,
176160
authenticatedDevice,
177161
client,
178162
DEFAULT_SEND_FUTURES_TIMEOUT_MILLIS,
179-
scheduledExecutorService,
180163
messageDeliveryScheduler,
181164
clientReleaseManager,
182165
messageDeliveryLoopMonitor, experimentEnrollmentManager);
@@ -192,7 +175,6 @@ public WebSocketConnection(ReceiptSender receiptSender,
192175
Device authenticatedDevice,
193176
WebSocketClient client,
194177
int sendFuturesTimeoutMillis,
195-
ScheduledExecutorService scheduledExecutorService,
196178
Scheduler messageDeliveryScheduler,
197179
ClientReleaseManager clientReleaseManager,
198180
MessageDeliveryLoopMonitor messageDeliveryLoopMonitor,
@@ -207,7 +189,6 @@ public WebSocketConnection(ReceiptSender receiptSender,
207189
this.authenticatedDevice = authenticatedDevice;
208190
this.client = client;
209191
this.sendFuturesTimeoutMillis = sendFuturesTimeoutMillis;
210-
this.scheduledExecutorService = scheduledExecutorService;
211192
this.messageDeliveryScheduler = messageDeliveryScheduler;
212193
this.clientReleaseManager = clientReleaseManager;
213194
this.messageDeliveryLoopMonitor = messageDeliveryLoopMonitor;
@@ -221,12 +202,6 @@ public void start() {
221202
}
222203

223204
public void stop() {
224-
final ScheduledFuture<?> future = retryFuture.get();
225-
226-
if (future != null) {
227-
future.cancel(false);
228-
}
229-
230205
final Disposable subscription = messageSubscription.get();
231206
if (subscription != null) {
232207
subscription.dispose();
@@ -342,7 +317,6 @@ void processStoredMessages() {
342317
}
343318

344319
// Cleared the queue! Send a queue empty message if we need to
345-
consecutiveRetries.set(0);
346320
if (sentInitialQueueEmptyMessage.compareAndSet(false, true)) {
347321
final Tags tags = Tags.of(UserAgentTagUtil.getPlatformTag(client.getUserAgent()));
348322
final long drainDuration = System.currentTimeMillis() - queueDrainStartTime.get();
@@ -362,42 +336,23 @@ void processStoredMessages() {
362336
}
363337
})
364338
// Potentially kick off more work, must happen after we release the semaphore
365-
.whenComplete((ignored, cause) -> processMoreIfRequested(cause));
366-
}
367-
}
368-
369-
/**
370-
* After processing messages, kick off another processing job if more messages came in or if there was an error
371-
*
372-
* @param cause An error that was encountered when processing the message queue, if there was one
373-
*/
374-
private void processMoreIfRequested(final @Nullable Throwable cause) {
375-
if (cause == null) {
376-
// Success, but check if more messages came in while we were processing
377-
if (storedMessageState.get() != StoredMessageState.EMPTY) {
378-
processStoredMessages();
379-
}
380-
return;
381-
}
339+
.whenComplete((ignored, cause) -> {
340+
if (cause != null) {
341+
if (!client.isOpen()) {
342+
logger.debug("Client disconnected before queue cleared");
343+
return;
344+
}
382345

383-
if (!client.isOpen()) {
384-
logger.debug("Client disconnected before queue cleared");
385-
return;
386-
}
346+
client.close(1011, "Failed to retrieve messages");
347+
return;
348+
}
387349

388-
if (consecutiveRetries.incrementAndGet() > MAX_CONSECUTIVE_RETRIES) {
389-
logger.warn("Max consecutive retries exceeded", cause);
390-
client.close(1011, "Failed to retrieve messages");
391-
return;
350+
// Success, but check if more messages came in while we were processing
351+
if (storedMessageState.get() != StoredMessageState.EMPTY) {
352+
processStoredMessages();
353+
}
354+
});
392355
}
393-
394-
logger.debug("Failed to clear queue", cause);
395-
final Tags tags = Tags.of(UserAgentTagUtil.getPlatformTag(client.getUserAgent()));
396-
397-
Metrics.counter(QUEUE_DRAIN_RETRY_COUNTER_NAME, tags).increment();
398-
399-
final long delay = RETRY_DELAY_MILLIS + random.nextInt(RETRY_DELAY_JITTER_MILLIS);
400-
retryFuture.set(scheduledExecutorService.schedule(this::processStoredMessages, delay, TimeUnit.MILLISECONDS));
401356
}
402357

403358
private CompletableFuture<Void> sendMessages(final boolean cachedMessagesOnly) {
@@ -407,7 +362,6 @@ private CompletableFuture<Void> sendMessages(final boolean cachedMessagesOnly) {
407362
messagesManager.getMessagesForDeviceReactive(authenticatedAccount.getIdentifier(IdentityType.ACI), authenticatedDevice, cachedMessagesOnly);
408363

409364
final AtomicBoolean hasSentFirstMessage = new AtomicBoolean();
410-
final AtomicBoolean hasErrored = new AtomicBoolean();
411365

412366
final Disposable subscription = Flux.from(messages)
413367
.name(SEND_MESSAGES_FLUX_NAME)
@@ -423,19 +377,12 @@ private CompletableFuture<Void> sendMessages(final boolean cachedMessagesOnly) {
423377
}
424378
})
425379
.flatMapSequential(envelope ->
426-
Mono.fromFuture(() -> sendMessage(envelope)
427-
.orTimeout(sendFuturesTimeoutMillis, TimeUnit.MILLISECONDS))
428-
.onErrorResume(
429-
// let the first error pass through to terminate the subscription
430-
e -> {
431-
final boolean firstError = !hasErrored.getAndSet(true);
432-
measureSendMessageErrors(e, firstError);
433-
434-
return !firstError;
435-
},
436-
// otherwise just emit nothing
437-
e -> Mono.empty()
438-
), MESSAGE_SENDER_MAX_CONCURRENCY)
380+
Mono.defer(() -> Mono.fromFuture(() -> sendMessage(envelope).orTimeout(sendFuturesTimeoutMillis, TimeUnit.MILLISECONDS)))
381+
.doOnError(this::measureSendMessageErrors)
382+
// Note that this will retry both for "send to client" timeouts and failures to delete messages on
383+
// acknowledgement
384+
.retryWhen(Retry.backoff(4, Duration.ofSeconds(1))),
385+
MESSAGE_SENDER_MAX_CONCURRENCY)
439386
.subscribeOn(messageDeliveryScheduler)
440387
.subscribe(
441388
// no additional consumer of values - it is Flux<Void> by now
@@ -450,7 +397,7 @@ private CompletableFuture<Void> sendMessages(final boolean cachedMessagesOnly) {
450397
return queueCleared;
451398
}
452399

453-
private void measureSendMessageErrors(final Throwable e, final boolean terminal) {
400+
private void measureSendMessageErrors(final Throwable e) {
454401
final String errorType;
455402

456403
if (e instanceof TimeoutException) {
@@ -461,7 +408,7 @@ private void measureSendMessageErrors(final Throwable e, final boolean terminal)
461408
(e instanceof StaticException staticException && "Closed".equals(staticException.getMessage()))) {
462409
errorType = "connectionClosed";
463410
} else {
464-
logger.warn(terminal ? "Send message failure terminated stream" : "Send message failed", e);
411+
logger.warn("Send message failed", e);
465412
errorType = "other";
466413
}
467414

0 commit comments

Comments
 (0)