From 33a5d996c6a3cc961824668edd908cebd05164aa Mon Sep 17 00:00:00 2001 From: lishid Date: Tue, 15 Apr 2014 12:27:16 -0700 Subject: [PATCH] Minor changes to the help page, adding hadoop security to work with keytab --- README.md | 21 ++-- app/com/linkedin/drelephant/DrElephant.java | 1 - .../linkedin/drelephant/ElephantAnalyser.java | 13 +++ .../linkedin/drelephant/ElephantFetcher.java | 54 +++++++++- .../linkedin/drelephant/ElephantRunner.java | 100 ++++++++++-------- .../drelephant/analysis/Heuristic.java | 1 + .../drelephant/analysis/HeuristicResult.java | 3 +- .../drelephant/analysis/Severity.java | 10 +- .../heuristics/ShuffleSortHeuristic.java | 25 +---- .../drelephant/hadoop/HadoopJobData.java | 9 +- .../drelephant/hadoop/HadoopSecurity.java | 60 +++++++++++ .../drelephant/hadoop/HadoopTaskData.java | 59 ++++++----- .../linkedin/drelephant/math/Statistics.java | 28 ++++- .../drelephant/notifications/EmailThread.java | 6 ++ app/model/JobResult.java | 3 + app/views/helpMapperDataSkew.scala.html | 1 + app/views/helpMapperInputSize.scala.html | 6 +- app/views/helpMapperSpeed.scala.html | 2 +- app/views/helpReducerDataSkew.scala.html | 5 +- app/views/helpReducerTime.scala.html | 10 +- app/views/helpShuffleSort.scala.html | 4 +- app/views/multijob.scala.html | 4 +- app/views/singlejob.scala.html | 4 +- conf/application.conf | 6 +- 24 files changed, 308 insertions(+), 127 deletions(-) create mode 100644 app/com/linkedin/drelephant/hadoop/HadoopSecurity.java diff --git a/README.md b/README.md index 4d04a5cc4..af3c3a697 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,20 @@ ## Dr Elephant -### Compiling +### Compiling & testing locally -* This project uses Ant to compile and package the jar. -* Running ant in the project directory will package the jar file ready for use. +* To be able to build & run the application, download and install [Play framework 2.2.2](http://downloads.typesafe.com/play/2.2.2/play-2.2.2.zip). +* To build and run the application in dev mode, run from command line "play run" in the project directory. +* There is need to investigate the framework to see how one can add parameters to the classpath in dev mode. -### Running +### Deployment -* To run the project, copy the package "dr-elephant.jar" to a machine that has hadoop setup. -* Use the command 'hadoop jar dr-elephant.jar job_id' to start execution. \ No newline at end of file +* To create a deployment package, use "play dist" to create a zip package, or use "play universal:package-zip-tarball" to create a tarball +* To run the deployed package with Hadoop properly, some changes needs to be added to the startup script located at ./bin/dr-elephant + +* in the classpath ("declare -r app\_classpath=...") , add to the end of the string, before the end quotes + + :$HADOOP_HOME/*:$HADOOP_HOME/lib/*:$HADOOP_HOME/conf + +* after the next line ("addJava ... 
;"), add new line + + addJava "-Djava.library.path=$HADOOP_HOME/lib/native/Linux-amd64-64" \ No newline at end of file diff --git a/app/com/linkedin/drelephant/DrElephant.java b/app/com/linkedin/drelephant/DrElephant.java index d67edf271..8ac895cf7 100644 --- a/app/com/linkedin/drelephant/DrElephant.java +++ b/app/com/linkedin/drelephant/DrElephant.java @@ -23,7 +23,6 @@ public static void analyse(String jobId) throws IOException { HadoopJobData jobData = fetcher.getJobData(job_id); ElephantAnalyser analyser = new ElephantAnalyser(); HeuristicResult[] result = analyser.analyse(jobData); - //System.out.println(result); } public DrElephant() throws IOException { diff --git a/app/com/linkedin/drelephant/ElephantAnalyser.java b/app/com/linkedin/drelephant/ElephantAnalyser.java index aac90072c..8462eedcc 100644 --- a/app/com/linkedin/drelephant/ElephantAnalyser.java +++ b/app/com/linkedin/drelephant/ElephantAnalyser.java @@ -48,6 +48,19 @@ public HeuristicResult[] analyse(HadoopJobData data) { return results.toArray(new HeuristicResult[results.size()]); } + public String getJobType(HadoopJobData data) { + String pigVersion = data.getJobConf().getProperty("pig.version"); + if (pigVersion != null && !pigVersion.isEmpty()) { + return "Pig"; + } + String hiveMapredMode = data.getJobConf().getProperty("hive.mapred.mode"); + if (hiveMapredMode != null && !hiveMapredMode.isEmpty()) { + return "Hive"; + } + + return "Unknown/Hadoop"; + } + public static ElephantAnalyser instance() { return instance; } diff --git a/app/com/linkedin/drelephant/ElephantFetcher.java b/app/com/linkedin/drelephant/ElephantFetcher.java index e5066a4d6..d2a557768 100644 --- a/app/com/linkedin/drelephant/ElephantFetcher.java +++ b/app/com/linkedin/drelephant/ElephantFetcher.java @@ -3,11 +3,21 @@ import com.linkedin.drelephant.hadoop.HadoopCounterHolder; import com.linkedin.drelephant.hadoop.HadoopJobData; import com.linkedin.drelephant.hadoop.HadoopTaskData; +import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapred.*; +import org.apache.hadoop.security.authentication.client.AuthenticatedURL; +import org.apache.hadoop.security.authentication.client.AuthenticationException; import org.apache.log4j.Logger; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.Properties; public class ElephantFetcher { private static final Logger logger = Logger.getLogger(ElephantFetcher.class); @@ -49,7 +59,18 @@ public HadoopJobData getJobData(JobID job_id) throws IOException { reducers[i] = new HadoopTaskData(job, reducerTasks[i], true); } - return new HadoopJobData(counterHolder, mappers, reducers) + Properties jobConf = null; + try { + jobConf = getJobConf(job); + } catch (AuthenticationException e) { + e.printStackTrace(); + } + + if (jobConf == null) { + jobConf = new Properties(); + } + + return new HadoopJobData(counterHolder, mappers, reducers, jobConf) .setUsername(username).setStartTime(startTime).setUrl(jobUrl).setJobName(jobName); } @@ -68,4 +89,35 @@ private TaskReport[] getReduceTaskReports(JobID job_id) throws IOException { public JobStatus[] getJobList() throws IOException { return jobClient.getAllJobs(); } + + public Properties getJobConf(RunningJob job) throws IOException, AuthenticationException { + Properties properties = new Properties(); + String jobconfUrl = getJobconfUrl(job); + if 
(jobconfUrl == null) { + return properties; + } + + URL url = new URL(jobconfUrl); + AuthenticatedURL.Token token = new AuthenticatedURL.Token(); + HttpURLConnection conn = new AuthenticatedURL().openConnection(url, token); + String data = IOUtils.toString(conn.getInputStream()); + Document doc = Jsoup.parse(data); + Elements rows = doc.select("table").select("tr"); + for (int i = 1; i < rows.size(); i++) { + Element row = rows.get(i); + Elements cells = row.select("> td"); + if (cells.size() == 2) { + String key = cells.get(0).text().trim(); + String value = cells.get(1).text().trim(); + properties.put(key, value); + } + } + return properties; + } + + private String getJobconfUrl(RunningJob job) { + String jobDetails = job.getTrackingURL(); + String root = jobDetails.substring(0, jobDetails.indexOf("jobdetails.jsp")); + return root + "jobconf.jsp?jobid=" + job.getID().toString(); + } } diff --git a/app/com/linkedin/drelephant/ElephantRunner.java b/app/com/linkedin/drelephant/ElephantRunner.java index 9f2d2f5c7..c5cc3579b 100644 --- a/app/com/linkedin/drelephant/ElephantRunner.java +++ b/app/com/linkedin/drelephant/ElephantRunner.java @@ -4,6 +4,7 @@ import com.linkedin.drelephant.analysis.HeuristicResult; import com.linkedin.drelephant.analysis.Severity; import com.linkedin.drelephant.hadoop.HadoopJobData; +import com.linkedin.drelephant.hadoop.HadoopSecurity; import com.linkedin.drelephant.notifications.EmailThread; import model.JobHeuristicResult; import model.JobResult; @@ -11,6 +12,7 @@ import org.apache.hadoop.mapred.JobStatus; import org.apache.log4j.Logger; +import java.security.PrivilegedAction; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; @@ -22,62 +24,71 @@ public class ElephantRunner implements Runnable { private AtomicBoolean running = new AtomicBoolean(true); private EmailThread emailer = new EmailThread(); private boolean firstRun = true; + private HadoopSecurity hadoopSecurity; @Override public void run() { - Constants.load(); - emailer.start(); - try { - ElephantFetcher fetcher = new ElephantFetcher(); - Set previousJobs = new HashSet(); - long lastRun; - - while (running.get()) { - lastRun = System.currentTimeMillis(); - + hadoopSecurity = new HadoopSecurity(); + hadoopSecurity.doAs(new PrivilegedAction() { + @Override + public Void run() { + Constants.load(); + emailer.start(); try { - logger.info("Fetching job list."); - JobStatus[] jobs = fetcher.getJobList(); - if (jobs == null) { - throw new IllegalArgumentException("Jobtracker returned 'null' for job list"); - } - - Set successJobs = filterSuccessfulJobs(jobs); + ElephantFetcher fetcher = new ElephantFetcher(); + Set previousJobs = new HashSet(); + long lastRun; - successJobs = filterPreviousJobs(successJobs, previousJobs); + while (running.get()) { + lastRun = System.currentTimeMillis(); - logger.info(successJobs.size() + " jobs to analyse."); - - //Analyse all ready jobs - for (JobID jobId : successJobs) { try { - analyzeJob(fetcher, jobId); - previousJobs.add(jobId); + logger.info("Fetching job list."); + hadoopSecurity.checkLogin(); + JobStatus[] jobs = fetcher.getJobList(); + if (jobs == null) { + throw new IllegalArgumentException("Jobtracker returned 'null' for job list"); + } + + Set successJobs = filterSuccessfulJobs(jobs); + + successJobs = filterPreviousJobs(successJobs, previousJobs); + + logger.info(successJobs.size() + " jobs to analyse."); + + //Analyse all ready jobs + for (JobID jobId : successJobs) { + try { + analyzeJob(fetcher, jobId); + previousJobs.add(jobId); + } 
catch (Exception e) { + logger.error("Error analysing job", e); + } + } + logger.info("Finished all jobs. Waiting for refresh."); + } catch (Exception e) { - logger.error("Error analysing job", e); + logger.error("Error getting job list", e); } - } - logger.info("Finished all jobs. Waiting for refresh."); - } catch (Exception e) { - logger.error("Error getting job list", e); - } - - //Wait for long enough - long nextRun = lastRun + WAIT_INTERVAL; - long waitTime = nextRun - System.currentTimeMillis(); - while (running.get() && waitTime > 0) { - try { - Thread.sleep(waitTime); - } catch (InterruptedException e) { - e.printStackTrace(); + //Wait for long enough + long nextRun = lastRun + WAIT_INTERVAL; + long waitTime = nextRun - System.currentTimeMillis(); + while (running.get() && waitTime > 0) { + try { + Thread.sleep(waitTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + waitTime = nextRun - System.currentTimeMillis(); + } } - waitTime = nextRun - System.currentTimeMillis(); + } catch (Exception e) { + logger.error("Error in ElephantRunner", e); } + return null; } - } catch (Exception e) { - logger.error("Error in ElephantRunner", e); - } + }); } private Set filterSuccessfulJobs(JobStatus[] jobs) { @@ -162,6 +173,9 @@ private void analyzeJob(ElephantFetcher fetcher, JobID jobId) throws Exception { result.save(); + //TODO: Save this + logger.info("Job Type: " + ElephantAnalyser.instance().getJobType(jobData)); + emailer.enqueue(result); } diff --git a/app/com/linkedin/drelephant/analysis/Heuristic.java b/app/com/linkedin/drelephant/analysis/Heuristic.java index 542a95a7f..073e90171 100644 --- a/app/com/linkedin/drelephant/analysis/Heuristic.java +++ b/app/com/linkedin/drelephant/analysis/Heuristic.java @@ -4,5 +4,6 @@ public interface Heuristic { public HeuristicResult apply(HadoopJobData data); + public String getHeuristicName(); } diff --git a/app/com/linkedin/drelephant/analysis/HeuristicResult.java b/app/com/linkedin/drelephant/analysis/HeuristicResult.java index e61560e1f..b039a8f9f 100644 --- a/app/com/linkedin/drelephant/analysis/HeuristicResult.java +++ b/app/com/linkedin/drelephant/analysis/HeuristicResult.java @@ -1,6 +1,7 @@ package com.linkedin.drelephant.analysis; -import java.util.*; +import java.util.ArrayList; +import java.util.List; public class HeuristicResult { private String analysis; diff --git a/app/com/linkedin/drelephant/analysis/Severity.java b/app/com/linkedin/drelephant/analysis/Severity.java index caa990d9a..ba0a1635e 100644 --- a/app/com/linkedin/drelephant/analysis/Severity.java +++ b/app/com/linkedin/drelephant/analysis/Severity.java @@ -20,12 +20,12 @@ public enum Severity { private int value; private String text; - private String color; + private String bootstrapColor; - Severity(int value, String text, String color) { + Severity(int value, String text, String bootstrapColor) { this.value = value; this.text = text; - this.color = color; + this.bootstrapColor = bootstrapColor; } public int getValue() { @@ -36,8 +36,8 @@ public String getText() { return text; } - public String getColor() { - return color; + public String getBootstrapColor() { + return bootstrapColor; } public static Severity byValue(int value) { diff --git a/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java b/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java index 5a48137c4..3f9b4852d 100644 --- a/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java +++ 
b/app/com/linkedin/drelephant/analysis/heuristics/ShuffleSortHeuristic.java @@ -8,9 +8,6 @@ import com.linkedin.drelephant.hadoop.HadoopTaskData; import com.linkedin.drelephant.math.Statistics; -import java.util.Arrays; -import java.util.Collections; - public class ShuffleSortHeuristic implements Heuristic { public static final String heuristicName = "Shuffle & Sort"; @@ -21,7 +18,7 @@ public String getHeuristicName() { @Override public HeuristicResult apply(HadoopJobData data) { - HadoopTaskData[] tasks = createSample(data.getReducerData()); + HadoopTaskData[] tasks = Statistics.createSample(HadoopTaskData.class, data.getReducerData(), Constants.SHUFFLE_SORT_MAX_SAMPLE_SIZE); //Gather data fetchShuffleSort(tasks); @@ -58,26 +55,6 @@ public HeuristicResult apply(HadoopJobData data) { return result; } - private HadoopTaskData[] createSample(HadoopTaskData[] reducers) { - int MAX_SAMPLE_SIZE = Constants.SHUFFLE_SORT_MAX_SAMPLE_SIZE; - - //Skip this process if number of items already smaller than sample size - if (reducers.length <= MAX_SAMPLE_SIZE) { - return reducers; - } - - HadoopTaskData[] result = new HadoopTaskData[MAX_SAMPLE_SIZE]; - - //Shuffle a clone copy - HadoopTaskData[] clone = reducers.clone(); - Collections.shuffle(Arrays.asList(clone)); - - //Take the first n items - System.arraycopy(clone, 0, result, 0, MAX_SAMPLE_SIZE); - - return result; - } - private void fetchShuffleSort(HadoopTaskData[] reducers) { for (HadoopTaskData reducer : reducers) { reducer.fetchTaskDetails(); diff --git a/app/com/linkedin/drelephant/hadoop/HadoopJobData.java b/app/com/linkedin/drelephant/hadoop/HadoopJobData.java index b44a0b784..3432bb1c3 100644 --- a/app/com/linkedin/drelephant/hadoop/HadoopJobData.java +++ b/app/com/linkedin/drelephant/hadoop/HadoopJobData.java @@ -1,6 +1,7 @@ package com.linkedin.drelephant.hadoop; import java.io.IOException; +import java.util.Properties; public class HadoopJobData { private String username = ""; @@ -10,11 +11,13 @@ public class HadoopJobData { private HadoopCounterHolder counterHolder; private HadoopTaskData[] mapperData; private HadoopTaskData[] reducerData; + private Properties jobConf; - public HadoopJobData(HadoopCounterHolder counters, HadoopTaskData[] mappers, HadoopTaskData[] reducers) throws IOException { + public HadoopJobData(HadoopCounterHolder counters, HadoopTaskData[] mappers, HadoopTaskData[] reducers, Properties jobConf) throws IOException { counterHolder = counters; mapperData = mappers; reducerData = reducers; + this.jobConf = jobConf; } public HadoopJobData setUsername(String username) { @@ -49,6 +52,10 @@ public HadoopTaskData[] getReducerData() { return reducerData; } + public Properties getJobConf() { + return jobConf; + } + public String getUsername() { return username; } diff --git a/app/com/linkedin/drelephant/hadoop/HadoopSecurity.java b/app/com/linkedin/drelephant/hadoop/HadoopSecurity.java new file mode 100644 index 000000000..53e34d8c0 --- /dev/null +++ b/app/com/linkedin/drelephant/hadoop/HadoopSecurity.java @@ -0,0 +1,60 @@ +package com.linkedin.drelephant.hadoop; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.log4j.Logger; +import play.Play; + +import java.io.IOException; +import java.security.PrivilegedAction; + +public class HadoopSecurity { + private static final Logger logger = Logger.getLogger(HadoopSecurity.class); + + private UserGroupInformation loginUser = null; + + private String keytabLocation; + private String keytabUser; + private 
boolean securityEnabled = false; + + public HadoopSecurity() { + Configuration conf = new Configuration(); + UserGroupInformation.setConfiguration(conf); + securityEnabled = UserGroupInformation.isSecurityEnabled(); + if (securityEnabled) { + keytabLocation = Play.application().configuration().getString("keytab.location"); + keytabUser = Play.application().configuration().getString("keytab.user"); + checkLogin(); + } + } + + public UserGroupInformation getUGI() { + checkLogin(); + return loginUser; + } + + public void checkLogin() { + // try login + try { + if (loginUser == null) { + logger.info("No login user. Creating login user"); + logger.info("Logging with " + keytabUser + " and " + keytabLocation); + UserGroupInformation.loginUserFromKeytab(keytabUser, keytabLocation); + loginUser = UserGroupInformation.getLoginUser(); + logger.info("Logged in with user " + loginUser); + } else { + loginUser.checkTGTAndReloginFromKeytab(); + } + } catch (IOException e) { + logger.error("Failed to login with kerberos ", e); + } + } + + public T doAs(PrivilegedAction action) { + UserGroupInformation ugi = getUGI(); + if (ugi != null) { + return ugi.doAs(action); + } + return null; + } +} diff --git a/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java b/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java index d23dc0234..926077843 100644 --- a/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java +++ b/app/com/linkedin/drelephant/hadoop/HadoopTaskData.java @@ -95,7 +95,7 @@ public void fetchTaskDetails() { return; } try { - URL url = new URL(this.url); + URL url = new URL(HadoopTaskData.this.url); AuthenticatedURL.Token token = new AuthenticatedURL.Token(); HttpURLConnection conn = new AuthenticatedURL().openConnection(url, token); String data = IOUtils.toString(conn.getInputStream()); @@ -103,30 +103,8 @@ public void fetchTaskDetails() { Elements rows = doc.select("table").select("tr"); for (int i = 1; i < rows.size(); i++) { Element row = rows.get(i); - Elements cells = row.select("> td"); - if (cells.size() < 12) { - continue; - } - boolean succeeded = cells.get(2).html().trim().equals("SUCCEEDED"); - if (succeeded) { - try { - String startTime = cells.get(4).html().trim(); - String shuffleTime = cells.get(5).html().trim(); - String sortTime = cells.get(6).html().trim(); - if (shuffleTime.contains("(")) { - shuffleTime = shuffleTime.substring(0, shuffleTime.indexOf("(") - 1); - } - if (sortTime.contains("(")) { - sortTime = sortTime.substring(0, sortTime.indexOf("(") - 1); - } - long start = dateFormat.parse(startTime).getTime(); - long shuffle = dateFormat.parse(shuffleTime).getTime(); - long sort = dateFormat.parse(sortTime).getTime(); - this.shuffleTime = (shuffle - start); - this.sortTime = (sort - shuffle); - } catch (ParseException e) { - //Ignored //e.printStackTrace(); - } + if (tryExtractDetailFromRow(row)) { + return; } } } catch (IOException e) { @@ -136,6 +114,37 @@ public void fetchTaskDetails() { } } + //Return true if successfully extracted data from row + private boolean tryExtractDetailFromRow(Element row) { + Elements cells = row.select("> td"); + if (cells.size() < 12) { + return false; + } + boolean succeeded = cells.get(2).html().trim().equals("SUCCEEDED"); + if (succeeded) { + try { + String startTime = cells.get(4).html().trim(); + String shuffleTime = cells.get(5).html().trim(); + String sortTime = cells.get(6).html().trim(); + if (shuffleTime.contains("(")) { + shuffleTime = shuffleTime.substring(0, shuffleTime.indexOf("(") - 1); + } + if (sortTime.contains("(")) { + 
sortTime = sortTime.substring(0, sortTime.indexOf("(") - 1); + } + long start = dateFormat.parse(startTime).getTime(); + long shuffle = dateFormat.parse(shuffleTime).getTime(); + long sort = dateFormat.parse(sortTime).getTime(); + HadoopTaskData.this.shuffleTime = (shuffle - start); + HadoopTaskData.this.sortTime = (sort - shuffle); + return true; + } catch (ParseException e) { + //Ignored //e.printStackTrace(); + } + } + return false; + } + private String getTaskDetailsPage(RunningJob job, TaskID taskId) { String jobDetails = job.getTrackingURL(); String root = jobDetails.substring(0, jobDetails.indexOf("jobdetails.jsp")); diff --git a/app/com/linkedin/drelephant/math/Statistics.java b/app/com/linkedin/drelephant/math/Statistics.java index 9bfab3719..32d32124a 100644 --- a/app/com/linkedin/drelephant/math/Statistics.java +++ b/app/com/linkedin/drelephant/math/Statistics.java @@ -2,7 +2,10 @@ import com.linkedin.drelephant.analysis.Severity; +import java.lang.reflect.Array; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.List; public class Statistics { @@ -73,13 +76,13 @@ private static long[][] two_means(long[] values, long middle) { } long[][] result = new long[2][]; - result[0] = toIntArray(smaller); - result[1] = toIntArray(larger); + result[0] = toArray(smaller); + result[1] = toArray(larger); return result; } - private static long[] toIntArray(List input) { + private static long[] toArray(List input) { long[] result = new long[input.size()]; for (int i = 0; i < result.length; i++) { result[i] = input.get(i); @@ -127,4 +130,23 @@ public static Severity getNumTasksSeverity(long numTasks) { return Severity.getSeverityAscending(numTasks, 10, 50, 100, 200); } + + public static T[] createSample(Class clazz, T[] objects, int size) { + //Skip this process if number of items already smaller than sample size + if (objects.length <= size) { + return objects; + } + + @SuppressWarnings("unchecked") + T[] result = (T[]) Array.newInstance(clazz, size); + + //Shuffle a clone copy + T[] clone = objects.clone(); + Collections.shuffle(Arrays.asList(clone)); + + //Take the first n items + System.arraycopy(clone, 0, result, 0, size); + + return result; + } } diff --git a/app/com/linkedin/drelephant/notifications/EmailThread.java b/app/com/linkedin/drelephant/notifications/EmailThread.java index d63ed784b..8eacbbe89 100644 --- a/app/com/linkedin/drelephant/notifications/EmailThread.java +++ b/app/com/linkedin/drelephant/notifications/EmailThread.java @@ -89,6 +89,12 @@ private void sendCriticalEmail(JobResult result) { email.setSubject("Dr. Elephant - Hadoop Job Status Notification"); email.setHtmlMsg(html); email.setDebug(true); + /////////////////// + // + // WARNING: This next line will send out the emails. + // Do NOT uncomment before proper testing and mental meditation. + // + /////////////////// //email.send(); } catch (EmailException e) { e.printStackTrace(); diff --git a/app/model/JobResult.java b/app/model/JobResult.java index 2f33f43a5..e9b782a47 100644 --- a/app/model/JobResult.java +++ b/app/model/JobResult.java @@ -28,6 +28,9 @@ public class JobResult extends Model { @Column public Severity severity; + //@Column(length = 100) + //public String jobType; + @Column(length = 200) public String url; diff --git a/app/views/helpMapperDataSkew.scala.html b/app/views/helpMapperDataSkew.scala.html index d50305d3a..fbf7733b6 100644 --- a/app/views/helpMapperDataSkew.scala.html +++ b/app/views/helpMapperDataSkew.scala.html @@ -32,4 +32,5 @@

Mapper Data Skew

Suggestions

Try to make your input files strictly smaller than the HDFS block size to avoid the small excess pieces creating new map tasks. + Understand why the skew exists in the first place, use CombineFileInputFormat (or its equivalent in whichever language you are using), and turn on slowstart (delayed reducer start) to mitigate its effects.
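A minimal sketch (not part of this patch) of how the two knobs above might be set with the classic mapred API used elsewhere in this change; the 0.80 slow-start threshold is only an example value:

```java
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;

public class MapperSkewTuningSketch {
  public static void configure(JobConf conf) {
    // Pack many small input files into fewer, larger splits.
    // CombineFileInputFormat is abstract in the old API, so a real job
    // would register a concrete subclass here.
    conf.setInputFormat(CombineFileInputFormat.class);

    // "Slowstart": keep reducers from launching until ~80% of the mappers
    // are done, so skewed or straggling mappers do not hold reduce slots idle.
    conf.setFloat("mapred.reduce.slowstart.completed.maps", 0.80f);
  }
}
```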

\ No newline at end of file diff --git a/app/views/helpMapperInputSize.scala.html b/app/views/helpMapperInputSize.scala.html index 79e5580ce..dad7880fa 100644 --- a/app/views/helpMapperInputSize.scala.html +++ b/app/views/helpMapperInputSize.scala.html @@ -36,9 +36,9 @@

Suggestions

Set the number of mappers smaller by specifying a number or combining small files using Pig or Hive.

- For Hadoop/Java jobs: Use "jobConf.setNumMapTasks(NUMBER_OF_MAPPERS);"
- For Apache-Pig jobs: Try "set pig.maxCombinedSplitSize 536870912"
- For Apache-Hive jobs: Use "set hive.merge.mapredfiles=true"
+ For Hadoop/Java jobs: Try to use mapred.max.split.size in the job conf to split the input files appropriately.
+ For Apache-Pig jobs: Lower pig.maxCombinedSplitSize and set mapred.max.split.size to something smaller.
+ For Apache-Hive jobs: Try to use mapred.max.split.size in the job conf to split the input files appropriately.
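For the Hadoop/Java suggestion above, a rough sketch of setting the split-size cap programmatically (not part of this patch; the 256 MB figure is only an example):

```java
import org.apache.hadoop.mapred.JobConf;

public class SplitSizeSketch {
  public static void configure(JobConf conf) {
    // Cap each input split at roughly 256 MB so that very large files
    // are divided across more map tasks.
    conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);
  }
}
```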

Large files/Unsplittable files

diff --git a/app/views/helpMapperSpeed.scala.html b/app/views/helpMapperSpeed.scala.html index 7107461d6..1e618c356 100644 --- a/app/views/helpMapperSpeed.scala.html +++ b/app/views/helpMapperSpeed.scala.html @@ -1,6 +1,6 @@

This analysis shows the effectiveness of your mapper code.
- This should allow you to determine if your mapper is CPU-bound. + This should allow you to determine whether your mapper is CPU-bound or is outputting huge amounts of data.

This result of the analysis shows problems with mappers with significant slow speeds for the amount of data it needs to read. diff --git a/app/views/helpReducerDataSkew.scala.html b/app/views/helpReducerDataSkew.scala.html index b4d5a2cfe..61831cacb 100644 --- a/app/views/helpReducerDataSkew.scala.html +++ b/app/views/helpReducerDataSkew.scala.html @@ -31,5 +31,8 @@

Reducer Data Skew

Suggestions

- ... Ask Hadoop-Dev + This is often caused by skew in the key space (the aggregation key for a group-by, or the join key for a join).
+ If you are using Pig, try a skew join.
+ Otherwise, reconsider what the job is doing and whether there is a better way to do it (one possible approach is sketched below).
+ If none of that helps, then talk to the Hadoop development team.
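One common restructuring, not spelled out in the help text above and shown here only as an illustration, is to "salt" a known hot key in the mapper so its records spread over several reducers, and then combine the partial aggregates in a second pass:

```java
import org.apache.hadoop.io.Text;

import java.util.Random;

public class SaltedKeySketch {
  private static final int SALT_BUCKETS = 8; // example fan-out
  private final Random random = new Random();

  // hotKey -> "hotKey#0" .. "hotKey#7", so a single hot key no longer
  // lands on one reducer; a follow-up job merges the partial results.
  public Text saltKey(String key) {
    return new Text(key + "#" + random.nextInt(SALT_BUCKETS));
  }
}
```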

\ No newline at end of file diff --git a/app/views/helpReducerTime.scala.html b/app/views/helpReducerTime.scala.html index 8644731f9..08daa3658 100644 --- a/app/views/helpReducerTime.scala.html +++ b/app/views/helpReducerTime.scala.html @@ -1,11 +1,11 @@

- This analysis shows the efficiency your reducers.
- This should allow you to better tweak the number of reducers for your job.
+ This analysis shows the efficiency of your reducers.
+ This should allow you to better adjust the number of reducers for your job.
There are two possible situations that need some tuning.

Too many reducers

- This usually happens when the Hadoop job has: + This happens when the Hadoop job has: