From 33a5d996c6a3cc961824668edd908cebd05164aa Mon Sep 17 00:00:00 2001 From: lishid Date: Tue, 15 Apr 2014 12:27:16 -0700 Subject: [PATCH] Minor changes to the help page, adding hadoop security to work with keytab --- README.md | 21 ++-- app/com/linkedin/drelephant/DrElephant.java | 1 - .../linkedin/drelephant/ElephantAnalyser.java | 13 +++ .../linkedin/drelephant/ElephantFetcher.java | 54 +++++++++- .../linkedin/drelephant/ElephantRunner.java | 100 ++++++++++-------- .../drelephant/analysis/Heuristic.java | 1 + .../drelephant/analysis/HeuristicResult.java | 3 +- .../drelephant/analysis/Severity.java | 10 +- .../heuristics/ShuffleSortHeuristic.java | 25 +---- .../drelephant/hadoop/HadoopJobData.java | 9 +- .../drelephant/hadoop/HadoopSecurity.java | 60 +++++++++++ .../drelephant/hadoop/HadoopTaskData.java | 59 ++++++----- .../linkedin/drelephant/math/Statistics.java | 28 ++++- .../drelephant/notifications/EmailThread.java | 6 ++ app/model/JobResult.java | 3 + app/views/helpMapperDataSkew.scala.html | 1 + app/views/helpMapperInputSize.scala.html | 6 +- app/views/helpMapperSpeed.scala.html | 2 +- app/views/helpReducerDataSkew.scala.html | 5 +- app/views/helpReducerTime.scala.html | 10 +- app/views/helpShuffleSort.scala.html | 4 +- app/views/multijob.scala.html | 4 +- app/views/singlejob.scala.html | 4 +- conf/application.conf | 6 +- 24 files changed, 308 insertions(+), 127 deletions(-) create mode 100644 app/com/linkedin/drelephant/hadoop/HadoopSecurity.java diff --git a/README.md b/README.md index 4d04a5cc4..af3c3a697 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,20 @@ ## Dr Elephant -### Compiling +### Compiling & testing locally -* This project uses Ant to compile and package the jar. -* Running ant in the project directory will package the jar file ready for use. +* To be able to build & run the application, download and install [Play framework 2.2.2](http://downloads.typesafe.com/play/2.2.2/play-2.2.2.zip). +* To build and run the application in dev mode, run from command line "play run" in the project directory. +* There is need to investigate the framework to see how one can add parameters to the classpath in dev mode. -### Running +### Deployment -* To run the project, copy the package "dr-elephant.jar" to a machine that has hadoop setup. -* Use the command 'hadoop jar dr-elephant.jar job_id' to start execution. \ No newline at end of file +* To create a deployment package, use "play dist" to create a zip package, or use "play universal:package-zip-tarball" to create a tarball +* To run the deployed package with Hadoop properly, some changes needs to be added to the startup script located at ./bin/dr-elephant + +* in the classpath ("declare -r app\_classpath=...") , add to the end of the string, before the end quotes + + :$HADOOP_HOME/*:$HADOOP_HOME/lib/*:$HADOOP_HOME/conf + +* after the next line ("addJava ... 
;"), add new line + + addJava "-Djava.library.path=$HADOOP_HOME/lib/native/Linux-amd64-64" \ No newline at end of file diff --git a/app/com/linkedin/drelephant/DrElephant.java b/app/com/linkedin/drelephant/DrElephant.java index d67edf271..8ac895cf7 100644 --- a/app/com/linkedin/drelephant/DrElephant.java +++ b/app/com/linkedin/drelephant/DrElephant.java @@ -23,7 +23,6 @@ public static void analyse(String jobId) throws IOException { HadoopJobData jobData = fetcher.getJobData(job_id); ElephantAnalyser analyser = new ElephantAnalyser(); HeuristicResult[] result = analyser.analyse(jobData); - //System.out.println(result); } public DrElephant() throws IOException { diff --git a/app/com/linkedin/drelephant/ElephantAnalyser.java b/app/com/linkedin/drelephant/ElephantAnalyser.java index aac90072c..8462eedcc 100644 --- a/app/com/linkedin/drelephant/ElephantAnalyser.java +++ b/app/com/linkedin/drelephant/ElephantAnalyser.java @@ -48,6 +48,19 @@ public HeuristicResult[] analyse(HadoopJobData data) { return results.toArray(new HeuristicResult[results.size()]); } + public String getJobType(HadoopJobData data) { + String pigVersion = data.getJobConf().getProperty("pig.version"); + if (pigVersion != null && !pigVersion.isEmpty()) { + return "Pig"; + } + String hiveMapredMode = data.getJobConf().getProperty("hive.mapred.mode"); + if (hiveMapredMode != null && !hiveMapredMode.isEmpty()) { + return "Hive"; + } + + return "Unknown/Hadoop"; + } + public static ElephantAnalyser instance() { return instance; } diff --git a/app/com/linkedin/drelephant/ElephantFetcher.java b/app/com/linkedin/drelephant/ElephantFetcher.java index e5066a4d6..d2a557768 100644 --- a/app/com/linkedin/drelephant/ElephantFetcher.java +++ b/app/com/linkedin/drelephant/ElephantFetcher.java @@ -3,11 +3,21 @@ import com.linkedin.drelephant.hadoop.HadoopCounterHolder; import com.linkedin.drelephant.hadoop.HadoopJobData; import com.linkedin.drelephant.hadoop.HadoopTaskData; +import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapred.*; +import org.apache.hadoop.security.authentication.client.AuthenticatedURL; +import org.apache.hadoop.security.authentication.client.AuthenticationException; import org.apache.log4j.Logger; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.Properties; public class ElephantFetcher { private static final Logger logger = Logger.getLogger(ElephantFetcher.class); @@ -49,7 +59,18 @@ public HadoopJobData getJobData(JobID job_id) throws IOException { reducers[i] = new HadoopTaskData(job, reducerTasks[i], true); } - return new HadoopJobData(counterHolder, mappers, reducers) + Properties jobConf = null; + try { + jobConf = getJobConf(job); + } catch (AuthenticationException e) { + e.printStackTrace(); + } + + if (jobConf == null) { + jobConf = new Properties(); + } + + return new HadoopJobData(counterHolder, mappers, reducers, jobConf) .setUsername(username).setStartTime(startTime).setUrl(jobUrl).setJobName(jobName); } @@ -68,4 +89,35 @@ private TaskReport[] getReduceTaskReports(JobID job_id) throws IOException { public JobStatus[] getJobList() throws IOException { return jobClient.getAllJobs(); } + + public Properties getJobConf(RunningJob job) throws IOException, AuthenticationException { + Properties properties = new Properties(); + String jobconfUrl = getJobconfUrl(job); + if 
(jobconfUrl == null) { + return properties; + } + + URL url = new URL(jobconfUrl); + AuthenticatedURL.Token token = new AuthenticatedURL.Token(); + HttpURLConnection conn = new AuthenticatedURL().openConnection(url, token); + String data = IOUtils.toString(conn.getInputStream()); + Document doc = Jsoup.parse(data); + Elements rows = doc.select("table").select("tr"); + for (int i = 1; i < rows.size(); i++) { + Element row = rows.get(i); + Elements cells = row.select("> td"); + if (cells.size() == 2) { + String key = cells.get(0).text().trim(); + String value = cells.get(1).text().trim(); + properties.put(key, value); + } + } + return properties; + } + + private String getJobconfUrl(RunningJob job) { + String jobDetails = job.getTrackingURL(); + String root = jobDetails.substring(0, jobDetails.indexOf("jobdetails.jsp")); + return root + "jobconf.jsp?jobid=" + job.getID().toString(); + } } diff --git a/app/com/linkedin/drelephant/ElephantRunner.java b/app/com/linkedin/drelephant/ElephantRunner.java index 9f2d2f5c7..c5cc3579b 100644 --- a/app/com/linkedin/drelephant/ElephantRunner.java +++ b/app/com/linkedin/drelephant/ElephantRunner.java @@ -4,6 +4,7 @@ import com.linkedin.drelephant.analysis.HeuristicResult; import com.linkedin.drelephant.analysis.Severity; import com.linkedin.drelephant.hadoop.HadoopJobData; +import com.linkedin.drelephant.hadoop.HadoopSecurity; import com.linkedin.drelephant.notifications.EmailThread; import model.JobHeuristicResult; import model.JobResult; @@ -11,6 +12,7 @@ import org.apache.hadoop.mapred.JobStatus; import org.apache.log4j.Logger; +import java.security.PrivilegedAction; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; @@ -22,62 +24,71 @@ public class ElephantRunner implements Runnable { private AtomicBoolean running = new AtomicBoolean(true); private EmailThread emailer = new EmailThread(); private boolean firstRun = true; + private HadoopSecurity hadoopSecurity; @Override public void run() { - Constants.load(); - emailer.start(); - try { - ElephantFetcher fetcher = new ElephantFetcher(); - Set previousJobs = new HashSet(); - long lastRun; - - while (running.get()) { - lastRun = System.currentTimeMillis(); - + hadoopSecurity = new HadoopSecurity(); + hadoopSecurity.doAs(new PrivilegedAction() { + @Override + public Void run() { + Constants.load(); + emailer.start(); try { - logger.info("Fetching job list."); - JobStatus[] jobs = fetcher.getJobList(); - if (jobs == null) { - throw new IllegalArgumentException("Jobtracker returned 'null' for job list"); - } - - Set successJobs = filterSuccessfulJobs(jobs); + ElephantFetcher fetcher = new ElephantFetcher(); + Set previousJobs = new HashSet(); + long lastRun; - successJobs = filterPreviousJobs(successJobs, previousJobs); + while (running.get()) { + lastRun = System.currentTimeMillis(); - logger.info(successJobs.size() + " jobs to analyse."); - - //Analyse all ready jobs - for (JobID jobId : successJobs) { try { - analyzeJob(fetcher, jobId); - previousJobs.add(jobId); + logger.info("Fetching job list."); + hadoopSecurity.checkLogin(); + JobStatus[] jobs = fetcher.getJobList(); + if (jobs == null) { + throw new IllegalArgumentException("Jobtracker returned 'null' for job list"); + } + + Set successJobs = filterSuccessfulJobs(jobs); + + successJobs = filterPreviousJobs(successJobs, previousJobs); + + logger.info(successJobs.size() + " jobs to analyse."); + + //Analyse all ready jobs + for (JobID jobId : successJobs) { + try { + analyzeJob(fetcher, jobId); + previousJobs.add(jobId); + } 
catch (Exception e) { + logger.error("Error analysing job", e); + } + } + logger.info("Finished all jobs. Waiting for refresh."); + } catch (Exception e) { - logger.error("Error analysing job", e); + logger.error("Error getting job list", e); } - } - logger.info("Finished all jobs. Waiting for refresh."); - } catch (Exception e) { - logger.error("Error getting job list", e); - } - - //Wait for long enough - long nextRun = lastRun + WAIT_INTERVAL; - long waitTime = nextRun - System.currentTimeMillis(); - while (running.get() && waitTime > 0) { - try { - Thread.sleep(waitTime); - } catch (InterruptedException e) { - e.printStackTrace(); + //Wait for long enough + long nextRun = lastRun + WAIT_INTERVAL; + long waitTime = nextRun - System.currentTimeMillis(); + while (running.get() && waitTime > 0) { + try { + Thread.sleep(waitTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + waitTime = nextRun - System.currentTimeMillis(); + } } - waitTime = nextRun - System.currentTimeMillis(); + } catch (Exception e) { + logger.error("Error in ElephantRunner", e); } + return null; } - } catch (Exception e) { - logger.error("Error in ElephantRunner", e); - } + }); } private Set filterSuccessfulJobs(JobStatus[] jobs) { @@ -162,6 +173,9 @@ private void analyzeJob(ElephantFetcher fetcher, JobID jobId) throws Exception { result.save(); + //TODO: Save this + logger.info("Job Type: " + ElephantAnalyser.instance().getJobType(jobData)); + emailer.enqueue(result); } diff --git a/app/com/linkedin/drelephant/analysis/Heuristic.java b/app/com/linkedin/drelephant/analysis/Heuristic.java index 542a95a7f..073e90171 100644 --- a/app/com/linkedin/drelephant/analysis/Heuristic.java +++ b/app/com/linkedin/drelephant/analysis/Heuristic.java @@ -4,5 +4,6 @@ public interface Heuristic { public HeuristicResult apply(HadoopJobData data); + public String getHeuristicName(); } diff --git a/app/com/linkedin/drelephant/analysis/HeuristicResult.java b/app/com/linkedin/drelephant/analysis/HeuristicResult.java index e61560e1f..b039a8f9f 100644 --- a/app/com/linkedin/drelephant/analysis/HeuristicResult.java +++ b/app/com/linkedin/drelephant/analysis/HeuristicResult.java @@ -1,6 +1,7 @@ package com.linkedin.drelephant.analysis; -import java.util.*; +import java.util.ArrayList; +import java.util.List; public class HeuristicResult { private String analysis; diff --git a/app/com/linkedin/drelephant/analysis/Severity.java b/app/com/linkedin/drelephant/analysis/Severity.java index caa990d9a..ba0a1635e 100644 --- a/app/com/linkedin/drelephant/analysis/Severity.java +++ b/app/com/linkedin/drelephant/analysis/Severity.java @@ -20,12 +20,12 @@ public enum Severity { private int value; private String text; - private String color; + private String bootstrapColor; - Severity(int value, String text, String color) { + Severity(int value, String text, String bootstrapColor) { this.value = value; this.text = text; - this.color = color; + this.bootstrapColor = bootstrapColor; } public int getValue() { @@ -36,8 +36,8 @@ public String getText() { return text; } - public String getColor() { - return color; + public String getBootstrapColor() { + return bootstrapColor; } public static Severity byValue(int value) { diff --git a/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java b/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java index 5a48137c4..3f9b4852d 100644 --- a/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java +++ 
b/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java @@ -8,9 +8,6 @@ import com.linkedin.drelephant.hadoop.HadoopTaskData; import com.linkedin.drelephant.math.Statistics; -import java.util.Arrays; -import java.util.Collections; - public class ShuffleSortHeuristic implements Heuristic { public static final String heuristicName = "Shuffle & Sort"; @@ -21,7 +18,7 @@ public String getHeuristicName() { @Override public HeuristicResult apply(HadoopJobData data) { - HadoopTaskData[] tasks = createSample(data.getReducerData()); + HadoopTaskData[] tasks = Statistics.createSample(HadoopTaskData.class, data.getReducerData(), Constants.SHUFFLE_SORT_MAX_SAMPLE_SIZE); //Gather data fetchShuffleSort(tasks); @@ -58,26 +55,6 @@ public HeuristicResult apply(HadoopJobData data) { return result; } - private HadoopTaskData[] createSample(HadoopTaskData[] reducers) { - int MAX_SAMPLE_SIZE = Constants.SHUFFLE_SORT_MAX_SAMPLE_SIZE; - - //Skip this process if number of items already smaller than sample size - if (reducers.length <= MAX_SAMPLE_SIZE) { - return reducers; - } - - HadoopTaskData[] result = new HadoopTaskData[MAX_SAMPLE_SIZE]; - - //Shuffle a clone copy - HadoopTaskData[] clone = reducers.clone(); - Collections.shuffle(Arrays.asList(clone)); - - //Take the first n items - System.arraycopy(clone, 0, result, 0, MAX_SAMPLE_SIZE); - - return result; - } - private void fetchShuffleSort(HadoopTaskData[] reducers) { for (HadoopTaskData reducer : reducers) { reducer.fetchTaskDetails(); diff --git a/app/com/linkedin/drelephant/hadoop/HadoopJobData.java b/app/com/linkedin/drelephant/hadoop/HadoopJobData.java index b44a0b784..3432bb1c3 100644 --- a/app/com/linkedin/drelephant/hadoop/HadoopJobData.java +++ b/app/com/linkedin/drelephant/hadoop/HadoopJobData.java @@ -1,6 +1,7 @@ package com.linkedin.drelephant.hadoop; import java.io.IOException; +import java.util.Properties; public class HadoopJobData { private String username = ""; @@ -10,11 +11,13 @@ public class HadoopJobData { private HadoopCounterHolder counterHolder; private HadoopTaskData[] mapperData; private HadoopTaskData[] reducerData; + private Properties jobConf; - public HadoopJobData(HadoopCounterHolder counters, HadoopTaskData[] mappers, HadoopTaskData[] reducers) throws IOException { + public HadoopJobData(HadoopCounterHolder counters, HadoopTaskData[] mappers, HadoopTaskData[] reducers, Properties jobConf) throws IOException { counterHolder = counters; mapperData = mappers; reducerData = reducers; + this.jobConf = jobConf; } public HadoopJobData setUsername(String username) { @@ -49,6 +52,10 @@ public HadoopTaskData[] getReducerData() { return reducerData; } + public Properties getJobConf() { + return jobConf; + } + public String getUsername() { return username; } diff --git a/app/com/linkedin/drelephant/hadoop/HadoopSecurity.java b/app/com/linkedin/drelephant/hadoop/HadoopSecurity.java new file mode 100644 index 000000000..53e34d8c0 --- /dev/null +++ b/app/com/linkedin/drelephant/hadoop/HadoopSecurity.java @@ -0,0 +1,60 @@ +package com.linkedin.drelephant.hadoop; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.log4j.Logger; +import play.Play; + +import java.io.IOException; +import java.security.PrivilegedAction; + +public class HadoopSecurity { + private static final Logger logger = Logger.getLogger(HadoopSecurity.class); + + private UserGroupInformation loginUser = null; + + private String keytabLocation; + private String keytabUser; + private 
boolean securityEnabled = false; + + public HadoopSecurity() { + Configuration conf = new Configuration(); + UserGroupInformation.setConfiguration(conf); + securityEnabled = UserGroupInformation.isSecurityEnabled(); + if (securityEnabled) { + keytabLocation = Play.application().configuration().getString("keytab.location"); + keytabUser = Play.application().configuration().getString("keytab.user"); + checkLogin(); + } + } + + public UserGroupInformation getUGI() { + checkLogin(); + return loginUser; + } + + public void checkLogin() { + // try login + try { + if (loginUser == null) { + logger.info("No login user. Creating login user"); + logger.info("Logging with " + keytabUser + " and " + keytabLocation); + UserGroupInformation.loginUserFromKeytab(keytabUser, keytabLocation); + loginUser = UserGroupInformation.getLoginUser(); + logger.info("Logged in with user " + loginUser); + } else { + loginUser.checkTGTAndReloginFromKeytab(); + } + } catch (IOException e) { + logger.error("Failed to login with kerberos ", e); + } + } + + public T doAs(PrivilegedAction action) { + UserGroupInformation ugi = getUGI(); + if (ugi != null) { + return ugi.doAs(action); + } + return null; + } +} diff --git a/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java b/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java index d23dc0234..926077843 100644 --- a/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java +++ b/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java @@ -95,7 +95,7 @@ public void fetchTaskDetails() { return; } try { - URL url = new URL(this.url); + URL url = new URL(HadoopTaskData.this.url); AuthenticatedURL.Token token = new AuthenticatedURL.Token(); HttpURLConnection conn = new AuthenticatedURL().openConnection(url, token); String data = IOUtils.toString(conn.getInputStream()); @@ -103,30 +103,8 @@ public void fetchTaskDetails() { Elements rows = doc.select("table").select("tr"); for (int i = 1; i < rows.size(); i++) { Element row = rows.get(i); - Elements cells = row.select("> td"); - if (cells.size() < 12) { - continue; - } - boolean succeeded = cells.get(2).html().trim().equals("SUCCEEDED"); - if (succeeded) { - try { - String startTime = cells.get(4).html().trim(); - String shuffleTime = cells.get(5).html().trim(); - String sortTime = cells.get(6).html().trim(); - if (shuffleTime.contains("(")) { - shuffleTime = shuffleTime.substring(0, shuffleTime.indexOf("(") - 1); - } - if (sortTime.contains("(")) { - sortTime = sortTime.substring(0, sortTime.indexOf("(") - 1); - } - long start = dateFormat.parse(startTime).getTime(); - long shuffle = dateFormat.parse(shuffleTime).getTime(); - long sort = dateFormat.parse(sortTime).getTime(); - this.shuffleTime = (shuffle - start); - this.sortTime = (sort - shuffle); - } catch (ParseException e) { - //Ignored //e.printStackTrace(); - } + if (tryExtractDetailFromRow(row)) { + return; } } } catch (IOException e) { @@ -136,6 +114,37 @@ public void fetchTaskDetails() { } } + //Return true if successfully extracted data from row + private boolean tryExtractDetailFromRow(Element row) { + Elements cells = row.select("> td"); + if (cells.size() < 12) { + return false; + } + boolean succeeded = cells.get(2).html().trim().equals("SUCCEEDED"); + if (succeeded) { + try { + String startTime = cells.get(4).html().trim(); + String shuffleTime = cells.get(5).html().trim(); + String sortTime = cells.get(6).html().trim(); + if (shuffleTime.contains("(")) { + shuffleTime = shuffleTime.substring(0, shuffleTime.indexOf("(") - 1); + } + if (sortTime.contains("(")) { + 
sortTime = sortTime.substring(0, sortTime.indexOf("(") - 1); + } + long start = dateFormat.parse(startTime).getTime(); + long shuffle = dateFormat.parse(shuffleTime).getTime(); + long sort = dateFormat.parse(sortTime).getTime(); + HadoopTaskData.this.shuffleTime = (shuffle - start); + HadoopTaskData.this.sortTime = (sort - shuffle); + return true; + } catch (ParseException e) { + //Ignored //e.printStackTrace(); + } + } + return false; + } + private String getTaskDetailsPage(RunningJob job, TaskID taskId) { String jobDetails = job.getTrackingURL(); String root = jobDetails.substring(0, jobDetails.indexOf("jobdetails.jsp")); diff --git a/app/com/linkedin/drelephant/math/Statistics.java b/app/com/linkedin/drelephant/math/Statistics.java index 9bfab3719..32d32124a 100644 --- a/app/com/linkedin/drelephant/math/Statistics.java +++ b/app/com/linkedin/drelephant/math/Statistics.java @@ -2,7 +2,10 @@ import com.linkedin.drelephant.analysis.Severity; +import java.lang.reflect.Array; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.List; public class Statistics { @@ -73,13 +76,13 @@ private static long[][] two_means(long[] values, long middle) { } long[][] result = new long[2][]; - result[0] = toIntArray(smaller); - result[1] = toIntArray(larger); + result[0] = toArray(smaller); + result[1] = toArray(larger); return result; } - private static long[] toIntArray(List input) { + private static long[] toArray(List input) { long[] result = new long[input.size()]; for (int i = 0; i < result.length; i++) { result[i] = input.get(i); @@ -127,4 +130,23 @@ public static Severity getNumTasksSeverity(long numTasks) { return Severity.getSeverityAscending(numTasks, 10, 50, 100, 200); } + + public static T[] createSample(Class clazz, T[] objects, int size) { + //Skip this process if number of items already smaller than sample size + if (objects.length <= size) { + return objects; + } + + @SuppressWarnings("unchecked") + T[] result = (T[]) Array.newInstance(clazz, size); + + //Shuffle a clone copy + T[] clone = objects.clone(); + Collections.shuffle(Arrays.asList(clone)); + + //Take the first n items + System.arraycopy(clone, 0, result, 0, size); + + return result; + } } diff --git a/app/com/linkedin/drelephant/notifications/EmailThread.java b/app/com/linkedin/drelephant/notifications/EmailThread.java index d63ed784b..8eacbbe89 100644 --- a/app/com/linkedin/drelephant/notifications/EmailThread.java +++ b/app/com/linkedin/drelephant/notifications/EmailThread.java @@ -89,6 +89,12 @@ private void sendCriticalEmail(JobResult result) { email.setSubject("Dr. Elephant - Hadoop Job Status Notification"); email.setHtmlMsg(html); email.setDebug(true); + /////////////////// + // + // WARNING: This next line will send out the emails. + // Do NOT uncomment before proper testing and mental meditation. + // + /////////////////// //email.send(); } catch (EmailException e) { e.printStackTrace(); diff --git a/app/model/JobResult.java b/app/model/JobResult.java index 2f33f43a5..e9b782a47 100644 --- a/app/model/JobResult.java +++ b/app/model/JobResult.java @@ -28,6 +28,9 @@ public class JobResult extends Model { @Column public Severity severity; + //@Column(length = 100) + //public String jobType; + @Column(length = 200) public String url; diff --git a/app/views/helpMapperDataSkew.scala.html b/app/views/helpMapperDataSkew.scala.html index d50305d3a..fbf7733b6 100644 --- a/app/views/helpMapperDataSkew.scala.html +++ b/app/views/helpMapperDataSkew.scala.html @@ -32,4 +32,5 @@

Mapper Data Skew

Suggestions

Try to make your input files strictly smaller than the HDFS block size to avoid the small excess pieces creating new map tasks. + Understand why the skew exists in the first place, use CombineFileInputFormat (or its equivalent in whichever language you are using), and turn on slowstart (delayed reducer start) to mitigate its effects.
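A minimal sketch (not part of this patch) of how the two knobs above might be set with the classic mapred API used elsewhere in this change; the 0.80 slow-start threshold is only an example value:

```java
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;

public class MapperSkewTuningSketch {
  public static void configure(JobConf conf) {
    // Pack many small input files into fewer, larger splits.
    // CombineFileInputFormat is abstract in the old API, so a real job
    // would register a concrete subclass here.
    conf.setInputFormat(CombineFileInputFormat.class);

    // "Slowstart": keep reducers from launching until ~80% of the mappers
    // are done, so skewed or straggling mappers do not hold reduce slots idle.
    conf.setFloat("mapred.reduce.slowstart.completed.maps", 0.80f);
  }
}
```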

\ No newline at end of file diff --git a/app/views/helpMapperInputSize.scala.html b/app/views/helpMapperInputSize.scala.html index 79e5580ce..dad7880fa 100644 --- a/app/views/helpMapperInputSize.scala.html +++ b/app/views/helpMapperInputSize.scala.html @@ -36,9 +36,9 @@

Suggestions

Set the number of mappers smaller by specifying a number or combining small files using Pig or Hive.

- For Hadoop/Java jobs: Use "jobConf.setNumMapTasks(NUMBER_OF_MAPPERS);"
- For Apache-Pig jobs: Try "set pig.maxCombinedSplitSize 536870912"
- For Apache-Hive jobs: Use "set hive.merge.mapredfiles=true"
+ For Hadoop/Java jobs: Try to use mapred.max.split.size in the job conf to split the input files appropriately.
+ For Apache-Pig jobs: Lower pig.maxCombinedSplitSize and set mapred.max.split.size to something smaller.
+ For Apache-Hive jobs: Try to use mapred.max.split.size in the job conf to split the input files appropriately.
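For the Hadoop/Java suggestion above, a rough sketch of setting the split-size cap programmatically (not part of this patch; the 256 MB figure is only an example):

```java
import org.apache.hadoop.mapred.JobConf;

public class SplitSizeSketch {
  public static void configure(JobConf conf) {
    // Cap each input split at roughly 256 MB so that very large files
    // are divided across more map tasks.
    conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);
  }
}
```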

Large files/Unsplittable files

diff --git a/app/views/helpMapperSpeed.scala.html b/app/views/helpMapperSpeed.scala.html index 7107461d6..1e618c356 100644 --- a/app/views/helpMapperSpeed.scala.html +++ b/app/views/helpMapperSpeed.scala.html @@ -1,6 +1,6 @@

This analysis shows the effectiveness of your mapper code.
- This should allow you to determine if your mapper is CPU-bound. + This should allow you to determine whether your mapper is CPU-bound or is outputting huge amounts of data.

This result of the analysis shows problems with mappers with significant slow speeds for the amount of data it needs to read. diff --git a/app/views/helpReducerDataSkew.scala.html b/app/views/helpReducerDataSkew.scala.html index b4d5a2cfe..61831cacb 100644 --- a/app/views/helpReducerDataSkew.scala.html +++ b/app/views/helpReducerDataSkew.scala.html @@ -31,5 +31,8 @@

Reducer Data Skew

Suggestions

- ... Ask Hadoop-Dev + This is often caused by skew in the key space (the aggregation key for a group-by, or the join key for a join).
+ If you are using Pig, try a skew join.
+ Otherwise, reconsider what the job is doing and whether there is a better way to do it (one possible approach is sketched below).
+ If none of that helps, then talk to the Hadoop development team.
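One common restructuring, not spelled out in the help text above and shown here only as an illustration, is to "salt" a known hot key in the mapper so its records spread over several reducers, and then combine the partial aggregates in a second pass:

```java
import org.apache.hadoop.io.Text;

import java.util.Random;

public class SaltedKeySketch {
  private static final int SALT_BUCKETS = 8; // example fan-out
  private final Random random = new Random();

  // hotKey -> "hotKey#0" .. "hotKey#7", so a single hot key no longer
  // lands on one reducer; a follow-up job merges the partial results.
  public Text saltKey(String key) {
    return new Text(key + "#" + random.nextInt(SALT_BUCKETS));
  }
}
```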

\ No newline at end of file diff --git a/app/views/helpReducerTime.scala.html b/app/views/helpReducerTime.scala.html index 8644731f9..08daa3658 100644 --- a/app/views/helpReducerTime.scala.html +++ b/app/views/helpReducerTime.scala.html @@ -1,11 +1,11 @@

- This analysis shows the efficiency your reducers.
- This should allow you to better tweak the number of reducers for your job.
+ This analysis shows the efficiency of your reducers.
+ This should allow you to better adjust the number of reducers for your job.
There are two possible situations that need some tuning.

Too many reducers

- This usually happens when the Hadoop job has: + This happens when the Hadoop job has: