Skip to content

Commit 9f689e9

Browse files
authored
Add runtime metrics for task time on event loop and log slow execution (prestodb#25009)
## Description 1. Add runtime metrics for task execution time on event loop 2. Add logging for slow execution where the logging threshold can be controlled by config and we will log the query id, task id, and the method name if the runnable is taking too long to finish. ## Motivation and Context 1. This increases the observability on what can be running slow on event loop ## Impact <!---Describe any public API or user-facing feature change or any performance impact--> ## Test Plan 1. running verifier ## Contributor checklist - [ ] Please make sure your submission complies with our [contributing guide](https://github.com/prestodb/presto/blob/master/CONTRIBUTING.md), in particular [code style](https://github.com/prestodb/presto/blob/master/CONTRIBUTING.md#code-style) and [commit standards](https://github.com/prestodb/presto/blob/master/CONTRIBUTING.md#commit-standards). - [ ] PR description addresses the issue accurately and concisely. If the change is non-trivial, a GitHub Issue is referenced. - [ ] Documented new properties (with its default value), SQL syntax, functions, or other functionality. - [ ] If release notes are required, they follow the [release notes guidelines](https://github.com/prestodb/presto/wiki/Release-Notes-Guidelines). - [ ] Adequate tests were added if applicable. - [ ] CI passed. ## Release Notes Please follow [release notes guidelines](https://github.com/prestodb/presto/wiki/Release-Notes-Guidelines) and fill in the release notes below. ``` == NO RELEASE NOTE == ```
1 parent f7af3b5 commit 9f689e9

File tree

8 files changed

+80
-18
lines changed

8 files changed

+80
-18
lines changed

presto-common/src/main/java/com/facebook/presto/common/RuntimeMetricName.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ private RuntimeMetricName()
6868
public static final String TASK_UPDATE_DELIVERED_WALL_TIME_NANOS = "taskUpdateDeliveredWallTimeNanos";
6969
public static final String TASK_UPDATE_SERIALIZED_CPU_TIME_NANOS = "taskUpdateSerializedCpuNanos";
7070
public static final String TASK_PLAN_SERIALIZED_CPU_TIME_NANOS = "taskPlanSerializedCpuNanos";
71+
// Time for event loop to execute a method
72+
public static final String EVENT_LOOP_METHOD_EXECUTION_CPU_TIME_NANOS = "eventLoopMethodExecutionCpuNanos";
7173
// Time taken for a read call to storage
7274
public static final String STORAGE_READ_TIME_NANOS = "storageReadTimeNanos";
7375
// Size of the data retrieved by read call to storage

presto-main-base/src/main/java/com/facebook/presto/execution/SafeEventLoopGroup.java

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,22 @@
1414
package com.facebook.presto.execution;
1515

1616
import com.facebook.airlift.log.Logger;
17+
import com.sun.management.ThreadMXBean;
1718
import io.netty.channel.DefaultEventLoop;
1819
import io.netty.channel.DefaultEventLoopGroup;
1920
import io.netty.channel.EventLoop;
2021
import io.netty.channel.EventLoopGroup;
2122

23+
import java.lang.management.GarbageCollectorMXBean;
24+
import java.lang.management.ManagementFactory;
25+
import java.util.List;
2226
import java.util.concurrent.Executor;
2327
import java.util.concurrent.ThreadFactory;
2428
import java.util.function.Consumer;
2529
import java.util.function.Supplier;
2630

2731
import static java.util.Objects.requireNonNull;
32+
import static java.util.concurrent.TimeUnit.NANOSECONDS;
2833

2934
/***
3035
* One observation about event loop is if submitted task fails, it could kill the thread but the event loop group will not create a new one.
@@ -35,10 +40,15 @@ public class SafeEventLoopGroup
3540
extends DefaultEventLoopGroup
3641
{
3742
private static final Logger log = Logger.get(SafeEventLoopGroup.class);
43+
private static final ThreadMXBean THREAD_MX_BEAN = (ThreadMXBean) ManagementFactory.getThreadMXBean();
44+
private static final List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans();
3845

39-
public SafeEventLoopGroup(int nThreads, ThreadFactory threadFactory)
46+
private final long slowMethodThresholdOnEventLoopInNanos;
47+
48+
public SafeEventLoopGroup(int nThreads, ThreadFactory threadFactory, long slowMethodThresholdOnEventLoopInNanos)
4049
{
4150
super(nThreads, threadFactory);
51+
this.slowMethodThresholdOnEventLoopInNanos = slowMethodThresholdOnEventLoopInNanos;
4252
}
4353

4454
@Override
@@ -47,7 +57,7 @@ protected EventLoop newChild(Executor executor, Object... args)
4757
return new SafeEventLoop(this, executor);
4858
}
4959

50-
public static class SafeEventLoop
60+
public class SafeEventLoop
5161
extends DefaultEventLoop
5262
{
5363
public SafeEventLoop(EventLoopGroup parent, Executor executor)
@@ -73,9 +83,11 @@ protected void run()
7383
while (!this.confirmShutdown());
7484
}
7585

76-
public void execute(Runnable task, Consumer<Throwable> failureHandler)
86+
public void execute(Runnable task, Consumer<Throwable> failureHandler, SchedulerStatsTracker statsTracker, String methodSignature)
7787
{
7888
requireNonNull(task, "task is null");
89+
long initialGCTime = getTotalGCTime();
90+
long start = THREAD_MX_BEAN.getCurrentThreadCpuTime();
7991
this.execute(() -> {
8092
try {
8193
task.run();
@@ -86,6 +98,15 @@ public void execute(Runnable task, Consumer<Throwable> failureHandler)
8698
failureHandler.accept(t);
8799
}
88100
}
101+
finally {
102+
long currentGCTime = getTotalGCTime();
103+
long cpuTimeInNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime() - start - (currentGCTime - initialGCTime);
104+
105+
statsTracker.recordEventLoopMethodExecutionCpuTime(cpuTimeInNanos);
106+
if (cpuTimeInNanos > slowMethodThresholdOnEventLoopInNanos) {
107+
log.warn("Slow method execution on event loop: %s took %s milliseconds", methodSignature, NANOSECONDS.toMillis(cpuTimeInNanos));
108+
}
109+
}
89110
});
90111
}
91112

@@ -108,4 +129,9 @@ public <T> void execute(Supplier<T> task, Consumer<T> successHandler, Consumer<T
108129
});
109130
}
110131
}
132+
133+
private long getTotalGCTime()
134+
{
135+
return gcBeans.stream().mapToLong(GarbageCollectorMXBean::getCollectionTime).sum();
136+
}
111137
}

presto-main-base/src/main/java/com/facebook/presto/execution/SchedulerStatsTracker.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,16 @@ public void recordTaskUpdateSerializedCpuTime(long nanos) {}
2525

2626
@Override
2727
public void recordTaskPlanSerializedCpuTime(long nanos) {}
28+
29+
@Override
30+
public void recordEventLoopMethodExecutionCpuTime(long nanos) {}
2831
};
2932

3033
void recordTaskUpdateDeliveredTime(long nanos);
3134

3235
void recordTaskUpdateSerializedCpuTime(long nanos);
3336

3437
void recordTaskPlanSerializedCpuTime(long nanos);
38+
39+
void recordEventLoopMethodExecutionCpuTime(long nanos);
3540
}

presto-main-base/src/main/java/com/facebook/presto/execution/StageExecutionStateMachine.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import java.util.concurrent.atomic.AtomicReference;
3838
import java.util.function.Supplier;
3939

40+
import static com.facebook.presto.common.RuntimeMetricName.EVENT_LOOP_METHOD_EXECUTION_CPU_TIME_NANOS;
4041
import static com.facebook.presto.common.RuntimeMetricName.GET_SPLITS_TIME_NANOS;
4142
import static com.facebook.presto.common.RuntimeMetricName.SCAN_STAGE_SCHEDULER_BLOCKED_TIME_NANOS;
4243
import static com.facebook.presto.common.RuntimeMetricName.SCAN_STAGE_SCHEDULER_CPU_TIME_NANOS;
@@ -413,6 +414,12 @@ public void recordTaskPlanSerializedCpuTime(long nanos)
413414
runtimeStats.addMetricValue(TASK_PLAN_SERIALIZED_CPU_TIME_NANOS, NANO, max(nanos, 0));
414415
}
415416

417+
@Override
418+
public void recordEventLoopMethodExecutionCpuTime(long nanos)
419+
{
420+
runtimeStats.addMetricValue(EVENT_LOOP_METHOD_EXECUTION_CPU_TIME_NANOS, NANO, max(nanos, 0));
421+
}
422+
416423
@Override
417424
public String toString()
418425
{

presto-main-base/src/main/java/com/facebook/presto/execution/TaskManagerConfig.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import java.math.BigDecimal;
3434
import java.util.concurrent.TimeUnit;
3535

36+
import static java.util.concurrent.TimeUnit.NANOSECONDS;
3637
import static java.util.concurrent.TimeUnit.SECONDS;
3738

3839
@DefunctConfig({
@@ -101,6 +102,20 @@ public class TaskManagerConfig
101102
private Duration highMemoryTaskKillerFrequentFullGCDurationThreshold = new Duration(1, SECONDS);
102103
private double highMemoryTaskKillerHeapMemoryThreshold = 0.9;
103104
private boolean enableEventLoop;
105+
private Duration slowMethodThresholdOnEventLoop = new Duration(10, SECONDS);
106+
107+
@Min(50_000_000L)
108+
public long getSlowMethodThresholdOnEventLoop()
109+
{
110+
return slowMethodThresholdOnEventLoop.roundTo(NANOSECONDS);
111+
}
112+
113+
@Config("task.event-loop-slow-method-threshold")
114+
public TaskManagerConfig setSlowMethodThresholdOnEventLoop(Duration slowMethodThresholdOnEventLoop)
115+
{
116+
this.slowMethodThresholdOnEventLoop = slowMethodThresholdOnEventLoop;
117+
return this;
118+
}
104119

105120
@Config("task.enable-event-loop")
106121
public TaskManagerConfig setEventLoopEnabled(boolean enableEventLoop)

presto-main-base/src/test/java/com/facebook/presto/execution/TestTaskManagerConfig.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ public void testDefaults()
7979
.setHighMemoryTaskKillerFrequentFullGCDurationThreshold(new Duration(1, SECONDS))
8080
.setHighMemoryTaskKillerHeapMemoryThreshold(0.9)
8181
.setTaskUpdateSizeTrackingEnabled(true)
82+
.setSlowMethodThresholdOnEventLoop(new Duration(10, SECONDS))
8283
.setEventLoopEnabled(false));
8384
}
8485

@@ -129,6 +130,7 @@ public void testExplicitPropertyMappings()
129130
.put("experimental.task.high-memory-task-killer-heap-memory-threshold", "0.8")
130131
.put("task.update-size-tracking-enabled", "false")
131132
.put("task.enable-event-loop", "true")
133+
.put("task.event-loop-slow-method-threshold", "1s")
132134
.build();
133135

134136
TaskManagerConfig expected = new TaskManagerConfig()
@@ -174,7 +176,8 @@ public void testExplicitPropertyMappings()
174176
.setHighMemoryTaskKillerFrequentFullGCDurationThreshold(new Duration(2, SECONDS))
175177
.setHighMemoryTaskKillerHeapMemoryThreshold(0.8)
176178
.setTaskUpdateSizeTrackingEnabled(false)
177-
.setEventLoopEnabled(true);
179+
.setEventLoopEnabled(true)
180+
.setSlowMethodThresholdOnEventLoop(new Duration(1, SECONDS));
178181

179182
assertFullMapping(properties, expected);
180183
}

presto-main/src/main/java/com/facebook/presto/server/remotetask/HttpRemoteTaskFactory.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ public class HttpRemoteTaskFactory
109109
private final DecayCounter taskUpdateRequestSize;
110110
private final boolean taskUpdateSizeTrackingEnabled;
111111
private final Optional<SafeEventLoopGroup> eventLoopGroup;
112+
private final long slowMethodThresholdOnEventLoopInNanos;
112113

113114
@Inject
114115
public HttpRemoteTaskFactory(
@@ -194,8 +195,9 @@ else if (binaryTransportEnabled) {
194195
this.taskUpdateRequestSize = new DecayCounter(ExponentialDecay.oneMinute());
195196
this.taskUpdateSizeTrackingEnabled = taskConfig.isTaskUpdateSizeTrackingEnabled();
196197

198+
this.slowMethodThresholdOnEventLoopInNanos = taskConfig.getSlowMethodThresholdOnEventLoop();
197199
this.eventLoopGroup = taskConfig.isEventLoopEnabled() ? Optional.of(new SafeEventLoopGroup(config.getRemoteTaskMaxCallbackThreads(),
198-
new ThreadFactoryBuilder().setNameFormat("task-event-loop-%s").setDaemon(true).build())
200+
new ThreadFactoryBuilder().setNameFormat("task-event-loop-%s").setDaemon(true).build(), this.slowMethodThresholdOnEventLoopInNanos)
199201
{
200202
@Override
201203
protected EventLoop newChild(Executor executor, Object... args)

presto-main/src/main/java/com/facebook/presto/server/remotetask/HttpRemoteTaskWithEventLoop.java

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ public final class HttpRemoteTaskWithEventLoop
232232
private final SchedulerStatsTracker schedulerStatsTracker;
233233

234234
private final SafeEventLoopGroup.SafeEventLoop taskEventLoop;
235+
private final String loggingPrefix;
235236

236237
public static HttpRemoteTaskWithEventLoop createHttpRemoteTaskWithEventLoop(
237238
Session session,
@@ -469,6 +470,7 @@ private HttpRemoteTaskWithEventLoop(Session session,
469470
handleResolver,
470471
connectorTypeSerdeManager,
471472
thriftProtocol);
473+
this.loggingPrefix = format("Query: %s, Task: %s", session.getQueryId(), taskId);
472474
}
473475

474476
// this is a separate method to ensure that the `this` reference is not leaked during construction
@@ -488,7 +490,7 @@ private void initialize()
488490
});
489491

490492
updateTaskStats();
491-
safeExecuteOnEventLoop(this::updateSplitQueueSpace);
493+
safeExecuteOnEventLoop(this::updateSplitQueueSpace, "updateSplitQueueSpace");
492494
}
493495

494496
@Override
@@ -537,7 +539,7 @@ public void start()
537539

538540
taskStatusFetcher.start();
539541
taskInfoFetcher.start();
540-
});
542+
}, "start");
541543
}
542544

543545
@Override
@@ -581,7 +583,7 @@ public void addSplits(Multimap<PlanNodeId, Split> splitsBySource)
581583
needsUpdate = true;
582584
scheduleUpdate();
583585
}
584-
});
586+
}, "addSplits");
585587
}
586588

587589
@Override
@@ -595,7 +597,7 @@ public void noMoreSplits(PlanNodeId sourceId)
595597
noMoreSplits.put(sourceId, true);
596598
needsUpdate = true;
597599
scheduleUpdate();
598-
});
600+
}, "noMoreSplits");
599601
}
600602

601603
@Override
@@ -606,7 +608,7 @@ public void noMoreSplits(PlanNodeId sourceId, Lifespan lifespan)
606608
needsUpdate = true;
607609
scheduleUpdate();
608610
}
609-
});
611+
}, "noMoreSplits with lifeSpan");
610612
}
611613

612614
@Override
@@ -622,7 +624,7 @@ public void setOutputBuffers(OutputBuffers newOutputBuffers)
622624
needsUpdate = true;
623625
scheduleUpdate();
624626
}
625-
});
627+
}, "setOutputBuffers");
626628
}
627629

628630
@Override
@@ -793,7 +795,7 @@ public ListenableFuture<?> whenSplitQueueHasSpace(long weightThreshold)
793795
future.set(null);
794796
}
795797
whenSplitQueueHasSpace.createNewListener().addListener(() -> future.set(null), taskEventLoop);
796-
});
798+
}, "whenSplitQueueHasSpace");
797799
return future;
798800
}
799801

@@ -1042,7 +1044,7 @@ private void sendUpdate()
10421044
future,
10431045
new SimpleHttpResponseHandler<>(new UpdateResponseHandler(sources), request.getUri(), stats.getHttpResponseStats(), REMOTE_TASK_ERROR),
10441046
taskEventLoop);
1045-
});
1047+
}, "sendUpdate");
10461048
}
10471049

10481050
private String getExceededTaskUpdateSizeMessage(byte[] taskUpdateRequestJson)
@@ -1091,7 +1093,7 @@ public void cancel()
10911093
Request request = builder.setUri(uriBuilder.build())
10921094
.build();
10931095
scheduleAsyncCleanupRequest(createCleanupBackoff(), request, "cancel");
1094-
});
1096+
}, "cancel");
10951097
}
10961098

10971099
private void cleanUpTask()
@@ -1129,7 +1131,7 @@ private void cleanUpTask()
11291131
.build();
11301132

11311133
scheduleAsyncCleanupRequest(createCleanupBackoff(), request, "cleanup");
1132-
});
1134+
}, "cleanUpTask");
11331135
}
11341136

11351137
@Override
@@ -1159,7 +1161,7 @@ private void abort(TaskStatus status)
11591161
Request request = builder.setUri(uriBuilder.build())
11601162
.build();
11611163
scheduleAsyncCleanupRequest(createCleanupBackoff(), request, "abort");
1162-
});
1164+
}, "abort");
11631165
}
11641166

11651167
private void scheduleAsyncCleanupRequest(Backoff cleanupBackoff, Request request, String action)
@@ -1407,8 +1409,8 @@ public void onFailure(Throwable throwable)
14071409
}
14081410
}
14091411

1410-
private void safeExecuteOnEventLoop(Runnable r)
1412+
private void safeExecuteOnEventLoop(Runnable r, String methodName)
14111413
{
1412-
taskEventLoop.execute(r, this::failTask);
1414+
taskEventLoop.execute(r, this::failTask, schedulerStatsTracker, loggingPrefix + ", method: " + methodName);
14131415
}
14141416
}

0 commit comments

Comments
 (0)