Skip to content

Commit

Permalink
Suggest in the Profiler's Overview Page that Kernel Launch time may be due to CPU contention with tf.data.
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 307713646
Change-Id: I4cf6fcc5986858f7c20ddd5f96803c367885851c
  • Loading branch information
tensorflower-gardener committed Apr 22, 2020
1 parent 838b19e commit 1ac4f9e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ constexpr int kHostAnalysisSectionNumber = 3;
// Explanation shown when a large share of the step time is classified as
// "All Others". Note the leading " % ..." — the caller presumably prepends
// the sampled percentage value (same pattern as the StrCat usage below).
const char* kAllOthersPythonExplanation =
" % of the total step time sampled is spent on 'All Others' time. "
"This could be due to Python execution overhead.";
// Suffix appended to the "Kernel Launch" bottleneck statement when tf.data is
// in use (see KernelLaunchAnalysis): long kernel-launch time could be caused
// by CPU contention with the tf.data input pipeline, and the message suggests
// setting TF_GPU_THREAD_MODE=gpu_private as a possible mitigation.
const char* kKernelLaunchTfDataContention =
" It could be due to CPU contention with tf.data. In this case, you may "
"try to set the environment variable TF_GPU_THREAD_MODE=gpu_private.";

template <class Collection>
double GetTimeInMs(const Collection& type_ps, EventType event_type) {
Expand Down Expand Up @@ -357,7 +361,7 @@ double RatioOfHostToDeviceTimeToStepTime(
return 0.0;
}

void KernelLaunchAnalysis(double kernel_launch_percent,
void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
string* kernel_launch_classification,
string* kernel_launch_statement) {
string percent_str = absl::StrFormat("%.1lf", kernel_launch_percent);
Expand All @@ -366,12 +370,18 @@ void KernelLaunchAnalysis(double kernel_launch_percent,
*kernel_launch_statement = absl::StrCat(
percent_str,
" % of the total step time sampled is spent on 'Kernel Launch'.");
if (tfdata_used) {
absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
}
} else if (kernel_launch_percent >=
kModeratelyKernelLaunchBoundThresholdInPercent) {
*kernel_launch_classification = "moderate";
*kernel_launch_statement = absl::StrCat(
percent_str,
" % of the total step time sampled is spent on 'Kernel Launch'.");
if (tfdata_used) {
absl::StrAppend(kernel_launch_statement, kKernelLaunchTfDataContention);
}
} else {
*kernel_launch_classification = "no";
*kernel_launch_statement = "";
Expand Down Expand Up @@ -566,8 +576,8 @@ InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
GenerateHostResult(op_stats.host_op_metrics_db(), &result);

InputPipelineAnalysisRecommendation recommendation = GenerateRecommendation();
BottleneckAnalysis bottleneck_analysis =
ComputeBottleneckAnalysis(result.step_details());
BottleneckAnalysis bottleneck_analysis = ComputeBottleneckAnalysis(
result.input_time_breakdown(), result.step_details());
recommendation.mutable_bottleneck_analysis()->PackFrom(bottleneck_analysis);
*recommendation.mutable_summary_next_step() =
GetSummaryNextStep(bottleneck_analysis.input_classification(),
Expand Down Expand Up @@ -646,6 +656,7 @@ void OutputAnalysis(double output_percent, string* output_classification,
}

BottleneckAnalysis ComputeBottleneckAnalysis(
const InputTimeBreakdown& input_time_breakdown,
const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
any_step_details) {
double total_step_time_ms = 0;
Expand Down Expand Up @@ -700,8 +711,8 @@ BottleneckAnalysis ComputeBottleneckAnalysis(

string kernel_launch_classification;
string kernel_launch_statement;
KernelLaunchAnalysis(kernel_launch_percent, &kernel_launch_classification,
&kernel_launch_statement);
KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
&kernel_launch_classification, &kernel_launch_statement);

string all_other_classification;
string all_other_statement;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ InputPipelineAnalysisRecommendation GenerateRecommendation();

// Returns the performance bottleneck of the program executed.
BottleneckAnalysis ComputeBottleneckAnalysis(
const InputTimeBreakdown& input_time_breakdown,
const ::tensorflow::protobuf::RepeatedPtrField<::google::protobuf::Any>&
any_step_details);

Expand Down
5 changes: 3 additions & 2 deletions tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,9 @@ OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats,
*overview_page.mutable_analysis() = ComputeAnalysisResult(op_stats);
*overview_page.mutable_input_analysis() =
ConvertOpStatsToInputPipelineAnalysis(op_stats, hardware_type);
BottleneckAnalysis bottleneck =
ComputeBottleneckAnalysis(overview_page.input_analysis().step_details());
BottleneckAnalysis bottleneck = ComputeBottleneckAnalysis(
overview_page.input_analysis().input_time_breakdown(),
overview_page.input_analysis().step_details());
*overview_page.mutable_recommendation() = ComputeGenericRecommendation(
bottleneck, op_stats.device_op_metrics_db().precision_stats());
SetCommonRecommendation(bottleneck.input_classification(),
Expand Down

0 comments on commit 1ac4f9e

Please sign in to comment.