metrics = taskData.getMetrics();
if (metrics == null) {
continue;
}
for (Metric metric : metrics) {
- if (metric.getName().equals(Constants.MAX_MEMORY_BYTES)) {
- if (metric.getValue() > maxMemoryBytesUsed) {
- maxMemoryBytesUsed = metric.getValue();
+ if (metric.getName().equals(metricName)) {
+ if (metric.getValue() <= 0) {
+ continue;
}
+
+ avgMetric += metric.getValue();
+ numMetrics++;
}
}
}
- return maxMemoryBytesUsed;
+
+ return avgMetric / numMetrics;
}
}
diff --git a/app/views/help/tony/helpTaskGPU.scala.html b/app/views/help/tony/helpTaskGPU.scala.html
new file mode 100644
index 000000000..ec997ff93
--- /dev/null
+++ b/app/views/help/tony/helpTaskGPU.scala.html
@@ -0,0 +1,37 @@
+@*
+* Copyright 2019 LinkedIn Corp.
+*
+* Licensed under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License. You may obtain a copy of
+* the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations under
+* the License.
+*@
+
+ This heuristic shows GPU utilization and GPU memory utilization for each task type.
+ Try to optimize your GPU utilization!
+
+
+ GPU_UTILIZATION shows the percentage of time over the past sample period during which one or more kernels were
+ executing on the GPU.
+
+
+ GPU_FB_MEMORY_USAGE shows the on-board frame buffer memory usage as a percentage. Note that the reported total memory
+ is affected by the ECC (error-correcting code) state. If ECC is enabled, the total available memory is decreased by
+ several percent due to the requisite parity bits. The driver may also reserve a small amount of memory for internal
+ use, even without active work on the GPU.
+
+
+ GPU_MAIN_MEMORY_USAGE (also known as BAR1 memory usage) shows the percentage of memory used to map the FB (device
+ memory) so that it can be directly accessed by the CPU.
+
+
+ The above metrics are collected via the nvidia-smi tool installed on the host. For more detailed information, please
+ visit the nvidia-smi manual.
+
\ No newline at end of file
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 597952c2f..d1fbc6e79 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -30,7 +30,7 @@ object Dependencies {
lazy val jsoupVersion = "1.7.3"
lazy val mysqlConnectorVersion = "5.1.36"
lazy val oozieClientVersion = "4.2.0"
- lazy val tonyVersion = "0.3.6"
+ lazy val tonyVersion = "0.3.16"
lazy val HADOOP_VERSION = "hadoopversion"
lazy val SPARK_VERSION = "sparkversion"
diff --git a/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java b/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java
index b9b5c8a4a..caf495be5 100644
--- a/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java
+++ b/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java
@@ -125,7 +125,9 @@ public void testFetchDataFinishedDir() throws Exception {
private static void testHelper(String appId) throws Exception {
FetcherConfigurationData configData = new FetcherConfigurationData(null, null,
- ImmutableMap.of(Constants.TONY_CONF_DIR, _tonyConfDir));
+ ImmutableMap.of(Constants.TONY_CONF_DIR, _tonyConfDir,
+ TonyConfigurationKeys.TONY_HISTORY_FINISHED_DIR_TIMEZONE,
+ TonyConfigurationKeys.DEFAULT_TONY_HISTORY_FINISHED_DIR_TIMEZONE));
TonyFetcher tonyFetcher = new TonyFetcher(configData);
AnalyticJob job = new AnalyticJob();
diff --git a/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java b/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java
index 116cea7cb..e305e9edd 100644
--- a/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java
+++ b/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java
@@ -28,6 +28,95 @@ public void testGetMaxMemorySomeTasksMissingMetrics() {
taskDataMap.put(1, worker1Data);
Assert.assertEquals(worker1MaxMemoryBytes,
- TonyUtils.getMaxMemoryBytesUsedForTaskType(ImmutableMap.of("worker", taskDataMap), "worker"), 0);
+ TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker",
+ Constants.MAX_MEMORY_BYTES), 0);
+ }
+
+ @Test
+ public void testGetMaxMetricForTaskTypeAndMetricNameMissingTask() {
+ Map<Integer, TonyTaskData> taskDataMap = new TreeMap<>();
+ TonyTaskData worker0Data = new TonyTaskData("worker", 0);
+ // Only a "worker" task is registered (no metrics are attached to it here); the lookup below asks for "ps".
+ taskDataMap.put(0, worker0Data);
+ // A task type that is absent from the map is expected to yield -1.
+ Assert.assertEquals(-1.0d,
+ TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "ps",
+ Constants.MAX_MEMORY_BYTES), 0);
+ }
+
+ @Test
+ public void testGetMaxMetricForTaskTypeAndMetricName() {
+ Map<Integer, TonyTaskData> taskDataMap = new TreeMap<>();
+ TonyTaskData worker0Data = new TonyTaskData("worker", 0);
+ TonyTaskData worker1Data = new TonyTaskData("worker", 1);
+ // worker1 reports the larger value for the first two metrics and a negative value for the third.
+ double worker0MaxGPUUtilization = 20.0d;
+ double worker1MaxGPUUtilization = 21.0d;
+ double worker0MaxGPUFBMemoryUsage = 22.0d;
+ double worker1MaxGPUFBMemoryUsage = 23.0d;
+ double worker0MaxGPUMainMemoryUsage = 2.0d;
+ double worker1MaxGPUMainMemoryUsage = -1.0d;
+ // Attach one sample of each max-GPU metric to both workers.
+ worker0Data.setMetrics(ImmutableList.of(
+ new Metric(Constants.MAX_GPU_UTILIZATION, worker0MaxGPUUtilization),
+ new Metric(Constants.MAX_GPU_FB_MEMORY_USAGE, worker0MaxGPUFBMemoryUsage),
+ new Metric(Constants.MAX_GPU_MAIN_MEMORY_USAGE, worker0MaxGPUMainMemoryUsage)
+ ));
+ worker1Data.setMetrics(ImmutableList.of(
+ new Metric(Constants.MAX_GPU_UTILIZATION, worker1MaxGPUUtilization),
+ new Metric(Constants.MAX_GPU_FB_MEMORY_USAGE, worker1MaxGPUFBMemoryUsage),
+ new Metric(Constants.MAX_GPU_MAIN_MEMORY_USAGE, worker1MaxGPUMainMemoryUsage)
+ ));
+
+ taskDataMap.put(0, worker0Data);
+ taskDataMap.put(1, worker1Data);
+ // Expected maximum per metric: worker1, worker1, then worker0 (2.0 beats worker1's -1.0).
+ Assert.assertEquals(worker1MaxGPUUtilization,
+ TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker",
+ Constants.MAX_GPU_UTILIZATION), 0);
+ Assert.assertEquals(worker1MaxGPUFBMemoryUsage,
+ TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker",
+ Constants.MAX_GPU_FB_MEMORY_USAGE), 0);
+ Assert.assertEquals(worker0MaxGPUMainMemoryUsage,
+ TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker",
+ Constants.MAX_GPU_MAIN_MEMORY_USAGE), 0);
+ }
+
+ @Test
+ public void testGetAvgMetricForTaskTypeAndMetricName() {
+ Map<Integer, TonyTaskData> taskDataMap = new TreeMap<>();
+ TonyTaskData worker0Data = new TonyTaskData("worker", 0);
+ TonyTaskData worker1Data = new TonyTaskData("worker", 1);
+ // worker1 supplies a zero and a negative sample so the exclusion of non-positive values is exercised.
+ double worker0AvgGPUUtilization = 10.0d;
+ double worker1AvgGPUUtilization = 20.0d;
+ double worker0AvgGPUFBMemoryUsage = 30.0d;
+ double worker1AvgGPUFBMemoryUsage = 0.0d;
+ double worker0AvgGPUMainMemoryUsage = 40.0d;
+ double worker1AvgGPUMainMemoryUsage = -1.0d;
+ // Attach one sample of each average-GPU metric to both workers.
+ worker0Data.setMetrics(ImmutableList.of(
+ new Metric(Constants.AVG_GPU_UTILIZATION, worker0AvgGPUUtilization),
+ new Metric(Constants.AVG_GPU_FB_MEMORY_USAGE, worker0AvgGPUFBMemoryUsage),
+ new Metric(Constants.AVG_GPU_MAIN_MEMORY_USAGE, worker0AvgGPUMainMemoryUsage)
+ ));
+ worker1Data.setMetrics(ImmutableList.of(
+ new Metric(Constants.AVG_GPU_UTILIZATION, worker1AvgGPUUtilization),
+ new Metric(Constants.AVG_GPU_FB_MEMORY_USAGE, worker1AvgGPUFBMemoryUsage),
+ new Metric(Constants.AVG_GPU_MAIN_MEMORY_USAGE, worker1AvgGPUMainMemoryUsage)
+ ));
+
+ taskDataMap.put(0, worker0Data);
+ taskDataMap.put(1, worker1Data);
+ // (10 + 20) / 2 = 15; the 0.0 and -1.0 samples are skipped, leaving worker0's value as the whole average.
+ Assert.assertEquals(15.0d,
+ TonyUtils.getAvgMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker",
+ Constants.AVG_GPU_UTILIZATION), 0);
+ Assert.assertEquals(30.0d,
+ TonyUtils.getAvgMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker",
+ Constants.AVG_GPU_FB_MEMORY_USAGE), 0);
+ Assert.assertEquals(40.0d,
+ TonyUtils.getAvgMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker",
+ Constants.AVG_GPU_MAIN_MEMORY_USAGE), 0);
}
}