Skip to content

Commit

Permalink
* linkedin#606 Integrated TonY GPU metrics to Dr. Elephant; linkedin#605
Browse files Browse the repository at this point in the history
 fixed NPE in TonYUtils

* removed unused imports

* Added number GPU requested metric; Updated nvidia-smi doc link

* Addressed reviewer's comment

(cherry picked from commit be6d03a)
  • Loading branch information
UWFrankGu committed Jun 26, 2019
1 parent 96e2786 commit 4d4f370
Show file tree
Hide file tree
Showing 10 changed files with 332 additions and 17 deletions.
6 changes: 6 additions & 0 deletions app-conf/HeuristicConf.xml
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,12 @@
<!--task_memory_thresholds>0.8, 0.7, 0.6, 0.5</task_memory_thresholds-->
</params>
</heuristic>
<heuristic>
<applicationtype>tony</applicationtype>
<heuristicname>Task GPU</heuristicname>
<classname>com.linkedin.drelephant.tony.heuristics.TaskGPUHeuristic</classname>
<viewname>views.html.help.tony.helpTaskGPU</viewname>
</heuristic>
<!-- END TONY HEURISTICS -->

</heuristics>
3 changes: 2 additions & 1 deletion app/com/linkedin/drelephant/tony/TonyMetricsAggregator.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ public void aggregate(HadoopApplicationData data) {
String memoryString = tonyConf.get(TonyConfigurationKeys.getResourceKey(taskType, Constants.MEMORY));
String memoryStringMB = com.linkedin.tony.util.Utils.parseMemoryString(memoryString);
long mbRequested = Long.parseLong(memoryStringMB);
double maxMemoryMBUsed = TonyUtils.getMaxMemoryBytesUsedForTaskType(taskMap, taskType) / FileUtils.ONE_MB;
double maxMemoryMBUsed = TonyUtils.getMaxMetricForTaskTypeAndMetricName(taskMap, taskType,
Constants.MAX_MEMORY_BYTES) / FileUtils.ONE_MB;

for (TonyTaskData taskData : entry.getValue().values()) {
long taskDurationSec = (taskData.getTaskEndTime() - taskData.getTaskStartTime()) / Statistics.SECOND_IN_MS;
Expand Down
12 changes: 11 additions & 1 deletion app/com/linkedin/drelephant/tony/fetchers/TonyFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.time.ZoneId;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
Expand All @@ -36,7 +37,9 @@ public class TonyFetcher implements ElephantFetcher<TonyApplicationData> {
private static final Logger _LOGGER = Logger.getLogger(TonyFetcher.class);
private final Path _intermediateDir;
private final Path _finishedDir;
private String _finishedDirTimezone;
private final FileSystem _fs;
private ZoneId _zoneId;

/**
* Constructor for {@link TonyFetcher}.
Expand All @@ -56,6 +59,13 @@ public TonyFetcher(FetcherConfigurationData fetcherConfig) throws IOException {
_intermediateDir = new Path(conf.get(TonyConfigurationKeys.TONY_HISTORY_INTERMEDIATE));
_finishedDir = new Path(conf.get(TonyConfigurationKeys.TONY_HISTORY_FINISHED));
_fs = _finishedDir.getFileSystem(conf);

_finishedDirTimezone = conf.get(TonyConfigurationKeys.TONY_HISTORY_FINISHED_DIR_TIMEZONE);
if (_finishedDirTimezone == null) {
_finishedDirTimezone = TonyConfigurationKeys.DEFAULT_TONY_HISTORY_FINISHED_DIR_TIMEZONE;
}
_zoneId = ZoneId.of(_finishedDirTimezone);
_LOGGER.info("Using ZoneID: " + _zoneId.getId());
}

@Override
Expand All @@ -69,7 +79,7 @@ public TonyApplicationData fetchData(AnalyticJob job) throws Exception {
// application finishes and thus is slightly before the RM's application finish time. So it's possible that the
// yyyy/MM/dd derived from the RM's application finish time is a day later and we may not find the history files.
// In case we don't find the history files in yyyy/MM/dd, we should check the previous day as well.
String yearMonthDay = ParserUtils.getYearMonthDayDirectory(date);
String yearMonthDay = ParserUtils.getYearMonthDayDirectory(date, _zoneId);
Path jobDir = new Path(_finishedDir, yearMonthDay + Path.SEPARATOR + job.getAppId());
if (!_fs.exists(jobDir)) {
// check intermediate dir
Expand Down
122 changes: 122 additions & 0 deletions app/com/linkedin/drelephant/tony/heuristics/TaskGPUHeuristic.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* Copyright 2019 LinkedIn Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.linkedin.drelephant.tony.heuristics;

import com.linkedin.drelephant.analysis.Heuristic;
import com.linkedin.drelephant.analysis.HeuristicResult;
import com.linkedin.drelephant.analysis.HeuristicResultDetails;
import com.linkedin.drelephant.analysis.Severity;
import com.linkedin.drelephant.configurations.heuristic.HeuristicConfigurationData;
import com.linkedin.drelephant.tony.data.TonyApplicationData;
import com.linkedin.drelephant.tony.data.TonyTaskData;
import com.linkedin.drelephant.tony.util.TonyUtils;
import com.linkedin.drelephant.util.Utils;
import com.linkedin.tony.Constants;
import com.linkedin.tony.TonyConfigurationKeys;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;


/**
* For each type of task (e.g.: worker, ps), this heuristic checks the GPU utilization across all tasks of that type
* and compares against some thresholds. The final severity is the highest severity across all task types.
*/
public class TaskGPUHeuristic implements Heuristic<TonyApplicationData> {
private static final Logger _LOGGER = Logger.getLogger(TaskGPUHeuristic.class);
private static final String[] MAX_METRICS_TO_APPLY = {
Constants.MAX_GPU_UTILIZATION,
Constants.MAX_GPU_FB_MEMORY_USAGE,
Constants.MAX_GPU_MAIN_MEMORY_USAGE};
private static final String[] AVG_METRICS_TO_APPLY = {
Constants.AVG_GPU_UTILIZATION,
Constants.AVG_GPU_FB_MEMORY_USAGE,
Constants.AVG_GPU_MAIN_MEMORY_USAGE
};

private HeuristicConfigurationData _heuristicConfData;

/**
* Constructor for {@link TaskGPUHeuristic}.
* @param heuristicConfData the configuration for this heuristic
*/
public TaskGPUHeuristic(HeuristicConfigurationData heuristicConfData) {
this._heuristicConfData = heuristicConfData;
}

@Override
public HeuristicResult apply(TonyApplicationData data) {
_LOGGER.debug("Applying TaskGPUHeuristic");
Map<String, Map<Integer, TonyTaskData>> taskMap = data.getTaskMap();
Configuration conf = data.getConfiguration();

Set<String> taskTypes = com.linkedin.tony.util.Utils.getAllJobTypes(conf);
Severity finalSeverity = Severity.NONE;
List<HeuristicResultDetails> details = new ArrayList<>();

for (String taskType : taskTypes) {
details.add(new HeuristicResultDetails("Number of " + taskType + " tasks",
Integer.toString(taskMap.get(taskType).size())));

// get number of GPU resource requested, if any
int numGPUsRequested = conf.getInt(TonyConfigurationKeys.getResourceKey(taskType, Constants.GPUS), 0);
if (numGPUsRequested > 0) {
details.add(new HeuristicResultDetails("Number of GPUs requested per " + taskType + " tasks",
Integer.toString(numGPUsRequested)));
}

// get global max gpu utilization metrics
for (String maxMetricToApply : MAX_METRICS_TO_APPLY) {
double maxMetric = TonyUtils.getMaxMetricForTaskTypeAndMetricName(taskMap, taskType, maxMetricToApply);

if (maxMetric <= 0 || Double.isNaN(maxMetric)) {
details.add(new HeuristicResultDetails(maxMetricToApply + " in any " + taskType + " task", "Unknown"));
continue;
}
details.add(new HeuristicResultDetails(maxMetricToApply + " in any " + taskType + " task",
String.format("%.2f", maxMetric) + "%"));
}

// get global average gpu utilization metrics
for (String avgMetricToApply : AVG_METRICS_TO_APPLY) {
double avgMetric = TonyUtils.getAvgMetricForTaskTypeAndMetricName(taskMap, taskType, avgMetricToApply);

if (avgMetric <= 0 || Double.isNaN(avgMetric)) {
details.add(new HeuristicResultDetails(avgMetricToApply + " in any " + taskType + " task", "Unknown"));
continue;
}
details.add(new HeuristicResultDetails(avgMetricToApply + " in any " + taskType + " task",
String.format("%.2f", avgMetric) + "%"));
}

// compare to threshold and update severity
// TODO: pass all metrics collected as not severe for now, update heuristic when collected enough data to
// determine the appropriate threshold.
}

return new HeuristicResult(_heuristicConfData.getClassName(), _heuristicConfData.getHeuristicName(), finalSeverity,
0, details);
}

@Override
public HeuristicConfigurationData getHeuristicConfData() {
return _heuristicConfData;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ public HeuristicResult apply(TonyApplicationData data) {
Long.toString(taskBytesRequested / FileUtils.ONE_MB)));

// get global max memory per task
double maxMemoryBytesUsed = TonyUtils.getMaxMemoryBytesUsedForTaskType(taskMap, taskType);
double maxMemoryBytesUsed = TonyUtils.getMaxMetricForTaskTypeAndMetricName(taskMap, taskType,
Constants.MAX_MEMORY_BYTES);
if (maxMemoryBytesUsed <= 0) {
details.add(new HeuristicResultDetails("Max memory (MB) used in any " + taskType + " task", "Unknown"));
continue;
Expand Down
69 changes: 58 additions & 11 deletions app/com/linkedin/drelephant/tony/util/TonyUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,74 @@

public class TonyUtils {
/**
* Returns the max memory in bytes used by any task of the specified type.
* @param taskMap a map containing data for all tasks
* @param taskType the task type
* @return the max memory in bytes used by any task of the specified type
* Returns the max metric value of any task of the specified type given metricName
* Skips metrics collection for unavailable metric data
* @param taskMap a map containing data for all tasks
* @param taskType the task type
* @param metricName the name of the metric to query
* @return the max metric value of any task of the specified type
*/
public static double getMaxMemoryBytesUsedForTaskType(Map<String, Map<Integer, TonyTaskData>> taskMap,
String taskType) {
double maxMemoryBytesUsed = 0;
public static double getMaxMetricForTaskTypeAndMetricName(Map<String, Map<Integer, TonyTaskData>> taskMap,
String taskType, String metricName) {
double maxMetric = 0.0;
if (taskMap.get(taskType) == null) {
return -1.0d;
}

for (TonyTaskData taskData : taskMap.get(taskType).values()) {
List<Metric> metrics = taskData.getMetrics();
if (metrics == null) {
continue;
}
for (Metric metric : metrics) {
if (metric.getName().equals(metricName)) {
if (metric.getValue() <= 0) {
continue;
}

if (metric.getValue() > maxMetric) {
maxMetric = metric.getValue();
}
}
}
}

return maxMetric;
}

/**
* Returns the average metric value of all task of the specified type given metricName
* Skips metrics collection for unavailable metric data
* @param taskMap a map containing data for all tasks
* @param tasktype the task type
* @param metricsName the name of the metric to query
* @return the average metric value of any task of the specified type
*/
public static double getAvgMetricForTaskTypeAndMetricName(Map<String, Map<Integer, TonyTaskData>> taskMap,
String taskType, String metricName) {
double avgMetric = 0.0;
double numMetrics = 0;
if (taskMap.get(taskType) == null) {
return -1.0d;
}
for (TonyTaskData taskData : taskMap.get(taskType).values()) {
double avgMetricPerTask = 0.0;
List<Metric> metrics = taskData.getMetrics();
if (metrics == null) {
continue;
}
for (Metric metric : metrics) {
if (metric.getName().equals(Constants.MAX_MEMORY_BYTES)) {
if (metric.getValue() > maxMemoryBytesUsed) {
maxMemoryBytesUsed = metric.getValue();
if (metric.getName().equals(metricName)) {
if (metric.getValue() <= 0) {
continue;
}

avgMetric += metric.getValue();
numMetrics++;
}
}
}
return maxMemoryBytesUsed;

return avgMetric / numMetrics;
}
}
37 changes: 37 additions & 0 deletions app/views/help/tony/helpTaskGPU.scala.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
@*
* Copyright 2019 LinkedIn Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*@
<p>
This heuristic shows GPU utilization and GPU memory utilization for each task type.
Try to optimize your GPU utilization!
</p>
<p>
GPU_UTILIZATION shows the percent of time over the past sample period during which one or more kernels was executing
on the GPU.
</p>
<p>
GPU_FB_MEMORY_USAGE shows the on-board frame buffer memory usage in percentage. Note, reported total memory is
affected by ECC (error-correcting code) state. If ECC is enabled the total available memory is decreased by several
percent, due to the requisite parity bits. The driver may also reserve a small amount of memory for internal use, even
without active work on the GPU.
</p>
<p>
GPU_MAIN_MEMORY_USAGE aka BAR1 memory usage shows the percentage of memory used to map the FB (device memory) so that
it can be directly accessed by CPU.
</p>
<p>
Above metrics are collected via nvidia-smi tools installed on the host, for more detailed information please visit
<a href="https://developer.nvidia.com/nvidia-system-management-interface">nvidia-smi manual</a>.
</p>
2 changes: 1 addition & 1 deletion project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ object Dependencies {
lazy val jsoupVersion = "1.7.3"
lazy val mysqlConnectorVersion = "5.1.36"
lazy val oozieClientVersion = "4.2.0"
lazy val tonyVersion = "0.3.6"
lazy val tonyVersion = "0.3.16"

lazy val HADOOP_VERSION = "hadoopversion"
lazy val SPARK_VERSION = "sparkversion"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ public void testFetchDataFinishedDir() throws Exception {

private static void testHelper(String appId) throws Exception {
FetcherConfigurationData configData = new FetcherConfigurationData(null, null,
ImmutableMap.of(Constants.TONY_CONF_DIR, _tonyConfDir));
ImmutableMap.of(Constants.TONY_CONF_DIR, _tonyConfDir,
TonyConfigurationKeys.TONY_HISTORY_FINISHED_DIR_TIMEZONE,
TonyConfigurationKeys.DEFAULT_TONY_HISTORY_FINISHED_DIR_TIMEZONE));
TonyFetcher tonyFetcher = new TonyFetcher(configData);

AnalyticJob job = new AnalyticJob();
Expand Down
Loading

0 comments on commit 4d4f370

Please sign in to comment.