diff --git a/app-conf/HeuristicConf.xml b/app-conf/HeuristicConf.xml index 9efa2f228..c14f85556 100644 --- a/app-conf/HeuristicConf.xml +++ b/app-conf/HeuristicConf.xml @@ -368,6 +368,12 @@ + + tony + Task GPU + com.linkedin.drelephant.tony.heuristics.TaskGPUHeuristic + views.html.help.tony.helpTaskGPU + diff --git a/app/com/linkedin/drelephant/tony/TonyMetricsAggregator.java b/app/com/linkedin/drelephant/tony/TonyMetricsAggregator.java index 27eb11b85..540e54d22 100644 --- a/app/com/linkedin/drelephant/tony/TonyMetricsAggregator.java +++ b/app/com/linkedin/drelephant/tony/TonyMetricsAggregator.java @@ -60,7 +60,8 @@ public void aggregate(HadoopApplicationData data) { String memoryString = tonyConf.get(TonyConfigurationKeys.getResourceKey(taskType, Constants.MEMORY)); String memoryStringMB = com.linkedin.tony.util.Utils.parseMemoryString(memoryString); long mbRequested = Long.parseLong(memoryStringMB); - double maxMemoryMBUsed = TonyUtils.getMaxMemoryBytesUsedForTaskType(taskMap, taskType) / FileUtils.ONE_MB; + double maxMemoryMBUsed = TonyUtils.getMaxMetricForTaskTypeAndMetricName(taskMap, taskType, + Constants.MAX_MEMORY_BYTES) / FileUtils.ONE_MB; for (TonyTaskData taskData : entry.getValue().values()) { long taskDurationSec = (taskData.getTaskEndTime() - taskData.getTaskStartTime()) / Statistics.SECOND_IN_MS; diff --git a/app/com/linkedin/drelephant/tony/fetchers/TonyFetcher.java b/app/com/linkedin/drelephant/tony/fetchers/TonyFetcher.java index 767b06ba9..82e271bb0 100644 --- a/app/com/linkedin/drelephant/tony/fetchers/TonyFetcher.java +++ b/app/com/linkedin/drelephant/tony/fetchers/TonyFetcher.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.util.Date; import java.util.List; +import java.time.ZoneId; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -36,7 +37,9 @@ public class TonyFetcher implements ElephantFetcher { private static final Logger _LOGGER = Logger.getLogger(TonyFetcher.class); private final Path _intermediateDir; private final Path _finishedDir; + private String _finishedDirTimezone; private final FileSystem _fs; + private ZoneId _zoneId; /** * Constructor for {@link TonyFetcher}. @@ -56,6 +59,13 @@ public TonyFetcher(FetcherConfigurationData fetcherConfig) throws IOException { _intermediateDir = new Path(conf.get(TonyConfigurationKeys.TONY_HISTORY_INTERMEDIATE)); _finishedDir = new Path(conf.get(TonyConfigurationKeys.TONY_HISTORY_FINISHED)); _fs = _finishedDir.getFileSystem(conf); + + _finishedDirTimezone = conf.get(TonyConfigurationKeys.TONY_HISTORY_FINISHED_DIR_TIMEZONE); + if (_finishedDirTimezone == null) { + _finishedDirTimezone = TonyConfigurationKeys.DEFAULT_TONY_HISTORY_FINISHED_DIR_TIMEZONE; + } + _zoneId = ZoneId.of(_finishedDirTimezone); + _LOGGER.info("Using ZoneID: " + _zoneId.getId()); } @Override @@ -69,7 +79,7 @@ public TonyApplicationData fetchData(AnalyticJob job) throws Exception { // application finishes and thus is slightly before the RM's application finish time. So it's possible that the // yyyy/MM/dd derived from the RM's application finish time is a day later and we may not find the history files. // In case we don't find the history files in yyyy/MM/dd, we should check the previous day as well. - String yearMonthDay = ParserUtils.getYearMonthDayDirectory(date); + String yearMonthDay = ParserUtils.getYearMonthDayDirectory(date, _zoneId); Path jobDir = new Path(_finishedDir, yearMonthDay + Path.SEPARATOR + job.getAppId()); if (!_fs.exists(jobDir)) { // check intermediate dir diff --git a/app/com/linkedin/drelephant/tony/heuristics/TaskGPUHeuristic.java b/app/com/linkedin/drelephant/tony/heuristics/TaskGPUHeuristic.java new file mode 100644 index 000000000..3eea27dd2 --- /dev/null +++ b/app/com/linkedin/drelephant/tony/heuristics/TaskGPUHeuristic.java @@ -0,0 +1,122 @@ +/* + * Copyright 2019 LinkedIn Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.linkedin.drelephant.tony.heuristics; + +import com.linkedin.drelephant.analysis.Heuristic; +import com.linkedin.drelephant.analysis.HeuristicResult; +import com.linkedin.drelephant.analysis.HeuristicResultDetails; +import com.linkedin.drelephant.analysis.Severity; +import com.linkedin.drelephant.configurations.heuristic.HeuristicConfigurationData; +import com.linkedin.drelephant.tony.data.TonyApplicationData; +import com.linkedin.drelephant.tony.data.TonyTaskData; +import com.linkedin.drelephant.tony.util.TonyUtils; +import com.linkedin.drelephant.util.Utils; +import com.linkedin.tony.Constants; +import com.linkedin.tony.TonyConfigurationKeys; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.log4j.Logger; + + +/** + * For each type of task (e.g.: worker, ps), this heuristic checks the GPU utilization across all tasks of that type + * and compares against some thresholds. The final severity is the highest severity across all task types. + */ +public class TaskGPUHeuristic implements Heuristic { + private static final Logger _LOGGER = Logger.getLogger(TaskGPUHeuristic.class); + private static final String[] MAX_METRICS_TO_APPLY = { + Constants.MAX_GPU_UTILIZATION, + Constants.MAX_GPU_FB_MEMORY_USAGE, + Constants.MAX_GPU_MAIN_MEMORY_USAGE}; + private static final String[] AVG_METRICS_TO_APPLY = { + Constants.AVG_GPU_UTILIZATION, + Constants.AVG_GPU_FB_MEMORY_USAGE, + Constants.AVG_GPU_MAIN_MEMORY_USAGE + }; + + private HeuristicConfigurationData _heuristicConfData; + + /** + * Constructor for {@link TaskGPUHeuristic}. + * @param heuristicConfData the configuration for this heuristic + */ + public TaskGPUHeuristic(HeuristicConfigurationData heuristicConfData) { + this._heuristicConfData = heuristicConfData; + } + + @Override + public HeuristicResult apply(TonyApplicationData data) { + _LOGGER.debug("Applying TaskGPUHeuristic"); + Map> taskMap = data.getTaskMap(); + Configuration conf = data.getConfiguration(); + + Set taskTypes = com.linkedin.tony.util.Utils.getAllJobTypes(conf); + Severity finalSeverity = Severity.NONE; + List details = new ArrayList<>(); + + for (String taskType : taskTypes) { + details.add(new HeuristicResultDetails("Number of " + taskType + " tasks", + Integer.toString(taskMap.get(taskType).size()))); + + // get number of GPU resource requested, if any + int numGPUsRequested = conf.getInt(TonyConfigurationKeys.getResourceKey(taskType, Constants.GPUS), 0); + if (numGPUsRequested > 0) { + details.add(new HeuristicResultDetails("Number of GPUs requested per " + taskType + " tasks", + Integer.toString(numGPUsRequested))); + } + + // get global max gpu utilization metrics + for (String maxMetricToApply : MAX_METRICS_TO_APPLY) { + double maxMetric = TonyUtils.getMaxMetricForTaskTypeAndMetricName(taskMap, taskType, maxMetricToApply); + + if (maxMetric <= 0 || Double.isNaN(maxMetric)) { + details.add(new HeuristicResultDetails(maxMetricToApply + " in any " + taskType + " task", "Unknown")); + continue; + } + details.add(new HeuristicResultDetails(maxMetricToApply + " in any " + taskType + " task", + String.format("%.2f", maxMetric) + "%")); + } + + // get global average gpu utilization metrics + for (String avgMetricToApply : AVG_METRICS_TO_APPLY) { + double avgMetric = TonyUtils.getAvgMetricForTaskTypeAndMetricName(taskMap, taskType, avgMetricToApply); + + if (avgMetric <= 0 || Double.isNaN(avgMetric)) { + details.add(new HeuristicResultDetails(avgMetricToApply + " in any " + taskType + " task", "Unknown")); + continue; + } + details.add(new HeuristicResultDetails(avgMetricToApply + " in any " + taskType + " task", + String.format("%.2f", avgMetric) + "%")); + } + + // compare to threshold and update severity + // TODO: pass all metrics collected as not severe for now, update heuristic when collected enough data to + // determine the appropriate threshold. + } + + return new HeuristicResult(_heuristicConfData.getClassName(), _heuristicConfData.getHeuristicName(), finalSeverity, + 0, details); + } + + @Override + public HeuristicConfigurationData getHeuristicConfData() { + return _heuristicConfData; + } +} diff --git a/app/com/linkedin/drelephant/tony/heuristics/TaskMemoryHeuristic.java b/app/com/linkedin/drelephant/tony/heuristics/TaskMemoryHeuristic.java index deacde80d..bb75bdef1 100644 --- a/app/com/linkedin/drelephant/tony/heuristics/TaskMemoryHeuristic.java +++ b/app/com/linkedin/drelephant/tony/heuristics/TaskMemoryHeuristic.java @@ -107,7 +107,8 @@ public HeuristicResult apply(TonyApplicationData data) { Long.toString(taskBytesRequested / FileUtils.ONE_MB))); // get global max memory per task - double maxMemoryBytesUsed = TonyUtils.getMaxMemoryBytesUsedForTaskType(taskMap, taskType); + double maxMemoryBytesUsed = TonyUtils.getMaxMetricForTaskTypeAndMetricName(taskMap, taskType, + Constants.MAX_MEMORY_BYTES); if (maxMemoryBytesUsed <= 0) { details.add(new HeuristicResultDetails("Max memory (MB) used in any " + taskType + " task", "Unknown")); continue; diff --git a/app/com/linkedin/drelephant/tony/util/TonyUtils.java b/app/com/linkedin/drelephant/tony/util/TonyUtils.java index 55b708d66..5085518c9 100644 --- a/app/com/linkedin/drelephant/tony/util/TonyUtils.java +++ b/app/com/linkedin/drelephant/tony/util/TonyUtils.java @@ -24,27 +24,74 @@ public class TonyUtils { /** - * Returns the max memory in bytes used by any task of the specified type. - * @param taskMap a map containing data for all tasks - * @param taskType the task type - * @return the max memory in bytes used by any task of the specified type + * Returns the max metric value of any task of the specified type given metricName + * Skips metrics collection for unavailable metric data + * @param taskMap a map containing data for all tasks + * @param taskType the task type + * @param metricName the name of the metric to query + * @return the max metric value of any task of the specified type */ - public static double getMaxMemoryBytesUsedForTaskType(Map> taskMap, - String taskType) { - double maxMemoryBytesUsed = 0; + public static double getMaxMetricForTaskTypeAndMetricName(Map> taskMap, + String taskType, String metricName) { + double maxMetric = 0.0; + if (taskMap.get(taskType) == null) { + return -1.0d; + } + + for (TonyTaskData taskData : taskMap.get(taskType).values()) { + List metrics = taskData.getMetrics(); + if (metrics == null) { + continue; + } + for (Metric metric : metrics) { + if (metric.getName().equals(metricName)) { + if (metric.getValue() <= 0) { + continue; + } + + if (metric.getValue() > maxMetric) { + maxMetric = metric.getValue(); + } + } + } + } + + return maxMetric; + } + + /** + * Returns the average metric value of all task of the specified type given metricName + * Skips metrics collection for unavailable metric data + * @param taskMap a map containing data for all tasks + * @param tasktype the task type + * @param metricsName the name of the metric to query + * @return the average metric value of any task of the specified type + */ + public static double getAvgMetricForTaskTypeAndMetricName(Map> taskMap, + String taskType, String metricName) { + double avgMetric = 0.0; + double numMetrics = 0; + if (taskMap.get(taskType) == null) { + return -1.0d; + } for (TonyTaskData taskData : taskMap.get(taskType).values()) { + double avgMetricPerTask = 0.0; List metrics = taskData.getMetrics(); if (metrics == null) { continue; } for (Metric metric : metrics) { - if (metric.getName().equals(Constants.MAX_MEMORY_BYTES)) { - if (metric.getValue() > maxMemoryBytesUsed) { - maxMemoryBytesUsed = metric.getValue(); + if (metric.getName().equals(metricName)) { + if (metric.getValue() <= 0) { + continue; } + + avgMetric += metric.getValue(); + numMetrics++; } } } - return maxMemoryBytesUsed; + + return avgMetric / numMetrics; } } diff --git a/app/views/help/tony/helpTaskGPU.scala.html b/app/views/help/tony/helpTaskGPU.scala.html new file mode 100644 index 000000000..ec997ff93 --- /dev/null +++ b/app/views/help/tony/helpTaskGPU.scala.html @@ -0,0 +1,37 @@ +@* +* Copyright 2019 LinkedIn Corp. +* +* Licensed under the Apache License, Version 2.0 (the "License"); you may not +* use this file except in compliance with the License. You may obtain a copy of +* the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations under +* the License. +*@ +

+ This heuristic shows GPU utilization and GPU memory utilization for each task type. + Try to optimize your GPU utilization! +

+

+ GPU_UTILIZATION shows the percent of time over the past sample period during which one or more kernels was executing + on the GPU. +

+

+ GPU_FB_MEMORY_USAGE shows the on-board frame buffer memory usage in percentage. Note, reported total memory is + affected by ECC (error-correcting code) state. If ECC is enabled the total available memory is decreased by several + percent, due to the requisite parity bits. The driver may also reserve a small amount of memory for internal use, even + without active work on the GPU. +

+

+ GPU_MAIN_MEMORY_USAGE aka BAR1 memory usage shows the percentage of memory used to map the FB (device memory) so that + it can be directly accessed by CPU. +

+

+ Above metrics are collected via nvidia-smi tools installed on the host, for more detailed information please visit + nvidia-smi manual. +

\ No newline at end of file diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 597952c2f..d1fbc6e79 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -30,7 +30,7 @@ object Dependencies { lazy val jsoupVersion = "1.7.3" lazy val mysqlConnectorVersion = "5.1.36" lazy val oozieClientVersion = "4.2.0" - lazy val tonyVersion = "0.3.6" + lazy val tonyVersion = "0.3.16" lazy val HADOOP_VERSION = "hadoopversion" lazy val SPARK_VERSION = "sparkversion" diff --git a/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java b/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java index b9b5c8a4a..caf495be5 100644 --- a/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java +++ b/test/com/linkedin/drelephant/tony/fetchers/TonyFetcherTest.java @@ -125,7 +125,9 @@ public void testFetchDataFinishedDir() throws Exception { private static void testHelper(String appId) throws Exception { FetcherConfigurationData configData = new FetcherConfigurationData(null, null, - ImmutableMap.of(Constants.TONY_CONF_DIR, _tonyConfDir)); + ImmutableMap.of(Constants.TONY_CONF_DIR, _tonyConfDir, + TonyConfigurationKeys.TONY_HISTORY_FINISHED_DIR_TIMEZONE, + TonyConfigurationKeys.DEFAULT_TONY_HISTORY_FINISHED_DIR_TIMEZONE)); TonyFetcher tonyFetcher = new TonyFetcher(configData); AnalyticJob job = new AnalyticJob(); diff --git a/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java b/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java index 116cea7cb..e305e9edd 100644 --- a/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java +++ b/test/com/linkedin/drelephant/tony/util/TonyUtilsTest.java @@ -28,6 +28,95 @@ public void testGetMaxMemorySomeTasksMissingMetrics() { taskDataMap.put(1, worker1Data); Assert.assertEquals(worker1MaxMemoryBytes, - TonyUtils.getMaxMemoryBytesUsedForTaskType(ImmutableMap.of("worker", taskDataMap), "worker"), 0); + TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker", + Constants.MAX_MEMORY_BYTES), 0); + } + + @Test + public void testGetMaxMetricForTaskTypeAndMetricNameMissingTask() { + Map taskDataMap = new TreeMap<>(); + TonyTaskData worker0Data = new TonyTaskData("worker", 0); + + taskDataMap.put(0, worker0Data); + + Assert.assertEquals(-1.0d, + TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "ps", + Constants.MAX_MEMORY_BYTES), 0); + } + + @Test + public void testGetMaxMetricForTaskTypeAndMetricName() { + Map taskDataMap = new TreeMap<>(); + TonyTaskData worker0Data = new TonyTaskData("worker", 0); + TonyTaskData worker1Data = new TonyTaskData("worker", 1); + + double worker0MaxGPUUtilization = 20.0d; + double worker1MaxGPUUtilization = 21.0d; + double worker0MaxGPUFBMemoryUsage = 22.0d; + double worker1MaxGPUFBMemoryUsage = 23.0d; + double worker0MaxGPUMainMemoryUsage = 2.0d; + double worker1MaxGPUMainMemoryUsage = -1.0d; + + worker0Data.setMetrics(ImmutableList.of( + new Metric(Constants.MAX_GPU_UTILIZATION, worker0MaxGPUUtilization), + new Metric(Constants.MAX_GPU_FB_MEMORY_USAGE, worker0MaxGPUFBMemoryUsage), + new Metric(Constants.MAX_GPU_MAIN_MEMORY_USAGE, worker0MaxGPUMainMemoryUsage) + )); + worker1Data.setMetrics(ImmutableList.of( + new Metric(Constants.MAX_GPU_UTILIZATION, worker1MaxGPUUtilization), + new Metric(Constants.MAX_GPU_FB_MEMORY_USAGE, worker1MaxGPUFBMemoryUsage), + new Metric(Constants.MAX_GPU_MAIN_MEMORY_USAGE, worker1MaxGPUMainMemoryUsage) + )); + + taskDataMap.put(0, worker0Data); + taskDataMap.put(1, worker1Data); + + Assert.assertEquals(worker1MaxGPUUtilization, + TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker", + Constants.MAX_GPU_UTILIZATION), 0); + Assert.assertEquals(worker1MaxGPUFBMemoryUsage, + TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker", + Constants.MAX_GPU_FB_MEMORY_USAGE), 0); + Assert.assertEquals(worker0MaxGPUMainMemoryUsage, + TonyUtils.getMaxMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker", + Constants.MAX_GPU_MAIN_MEMORY_USAGE), 0); + } + + @Test + public void testGetAvgMetricForTaskTypeAndMetricName() { + Map taskDataMap = new TreeMap<>(); + TonyTaskData worker0Data = new TonyTaskData("worker", 0); + TonyTaskData worker1Data = new TonyTaskData("worker", 1); + + double worker0AvgGPUUtilization = 10.0d; + double worker1AvgGPUUtilization = 20.0d; + double worker0AvgGPUFBMemoryUsage = 30.0d; + double worker1AvgGPUFBMemoryUsage = 0.0d; + double worker0AvgGPUMainMemoryUsage = 40.0d; + double worker1AvgGPUMainMemoryUsage = -1.0d; + + worker0Data.setMetrics(ImmutableList.of( + new Metric(Constants.AVG_GPU_UTILIZATION, worker0AvgGPUUtilization), + new Metric(Constants.AVG_GPU_FB_MEMORY_USAGE, worker0AvgGPUFBMemoryUsage), + new Metric(Constants.AVG_GPU_MAIN_MEMORY_USAGE, worker0AvgGPUMainMemoryUsage)) + ); + worker1Data.setMetrics(ImmutableList.of( + new Metric(Constants.AVG_GPU_UTILIZATION, worker1AvgGPUUtilization), + new Metric(Constants.AVG_GPU_FB_MEMORY_USAGE, worker1AvgGPUFBMemoryUsage), + new Metric(Constants.AVG_GPU_MAIN_MEMORY_USAGE, worker1AvgGPUMainMemoryUsage) + )); + + taskDataMap.put(0, worker0Data); + taskDataMap.put(1, worker1Data); + + Assert.assertEquals(15.0d, + TonyUtils.getAvgMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker", + Constants.AVG_GPU_UTILIZATION), 0); + Assert.assertEquals(30.0d, + TonyUtils.getAvgMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker", + Constants.AVG_GPU_FB_MEMORY_USAGE), 0); + Assert.assertEquals(40.0d, + TonyUtils.getAvgMetricForTaskTypeAndMetricName(ImmutableMap.of("worker", taskDataMap), "worker", + Constants.AVG_GPU_MAIN_MEMORY_USAGE), 0); } }