From 3aefb23f236d097a102eaa19e91c1e547e448ed5 Mon Sep 17 00:00:00 2001 From: Varun Saxena Date: Fri, 11 Jan 2019 18:39:28 +0530 Subject: [PATCH] Integrate CPD into Dr.Elephant for detecting code duplication (#503) (#503) --- baseline.conf | 27 +++++ build.sbt | 5 +- common.sh | 191 ++++++++++++++++++++++++++++++++++++ compile.sh | 197 ++++++++++++++++++++++++++++++------- cpd.sbt | 28 ++++++ project/build.properties | 2 +- project/plugins.sbt | 3 + travis.sh | 206 ++++++++++++++++++++++++++++++++++----- 8 files changed, 595 insertions(+), 64 deletions(-) create mode 100755 baseline.conf create mode 100755 common.sh create mode 100644 cpd.sbt diff --git a/baseline.conf b/baseline.conf new file mode 100755 index 000000000..9437ee66b --- /dev/null +++ b/baseline.conf @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# +# Copyright 2016 LinkedIn Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# + +# +# Configurations for threshold and baseline for various tools. 
+# + +# ********** Baseline/threshold numbers for Copy Paste Detector(CPD) ************* +# Threshold for CPD when run for Java +readonly JAVA_CPD_THRESHOLD=32 +# Threshold for CPD when run for Scala +readonly SCALA_CPD_THRESHOLD=0 diff --git a/build.sbt b/build.sbt index fb36f05a2..5aae3a90d 100644 --- a/build.sbt +++ b/build.sbt @@ -23,6 +23,9 @@ version := "2.1.7" organization := "com.linkedin.drelephant" +// Enable CPD SBT plugin +lazy val root = (project in file(".")).enablePlugins(CopyPasteDetector) + javacOptions in Compile ++= Seq("-source", "1.6", "-target", "1.6") libraryDependencies ++= dependencies map { _.excludeAll(exclusionRules: _*) } @@ -37,4 +40,4 @@ playJavaSettings scalaVersion := "2.10.4" -envVars in Test := Map("PSO_DIR_PATH" -> (baseDirectory.value / "scripts/pso").getAbsolutePath) \ No newline at end of file +envVars in Test := Map("PSO_DIR_PATH" -> (baseDirectory.value / "scripts/pso").getAbsolutePath) diff --git a/common.sh b/common.sh new file mode 100755 index 000000000..b6e7215b7 --- /dev/null +++ b/common.sh @@ -0,0 +1,191 @@ +#!/usr/bin/env bash + +# +# Copyright 2016 LinkedIn Corp. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# + +# +# This script contains common functions and constants which will be used by both +# compile.sh and travis.sh while running different tools. 
+# + +######################################################## +# +# Global constants +# +######################################################## +# Base path for most of the quality tool reports +readonly REPORTS_BASE_PATH="target/scala-2.10/" + +# ******************** Constants for Findbugs ********************* +# Default path for Findbugs report +readonly FINDBUGS_REPORT_PATH=$REPORTS_BASE_PATH"findbugs/report.xml" + +# ************* Constants for Copy Paste Detector(CPD) ************* +# CPD report resides in this path +readonly CPD_REPORT_BASE_PATH=$REPORTS_BASE_PATH"cpd/" +# Default path for CPD report +readonly CPD_REPORT_PATH=$CPD_REPORT_BASE_PATH"cpd.xml" + +# ************************ Other constants ************************** +# Color coded prefixes for ERROR, WARNING, INFO and SUCCESS messages +readonly ERROR_COLOR_PREFIX="[\033[0;31mERROR\033[0m]" +readonly WARNING_COLOR_PREFIX="[\033[0;33mWARNING\033[0m]" +readonly INFO_COLOR_PREFIX="[\033[0;36mINFO\033[0m]" +readonly SUCCESS_COLOR_PREFIX="[\033[0;32mSUCCESS\033[0m]" + +########################################################## +# Get CPD report name based on language. +# +# Arguments: +# arg1: Report location +# arg2: Language for which CPD report will be generated +# (Java or Scala) +# Returns: +# File name where CPD report will be written to. +########################################################## +function getCPDReportName() { + echo $1"cpd-"$2".xml" +} + +########################################################## +# Check if there is a failure due to duplicates in CPD +# report above the configured threshold for the language. 
+# +# Arguments: +# arg1: Language for which CPD is run (Java or Scala) +# arg2: Duplicates threshold for the language +# arg3: Name of the threshold constant for the language +# arg4: CPD report file which contains duplicates +# arg5: Flag which indicates whether to dump CPD report +# Report will be dumped if value of argument is 1 +# Returns: +# 0: Success +# 1: Failure due to threshold +# 2: Failure due to threshold variables not updated +########################################################## +function checkIfCPDFailed() { + duplicates=`grep " 0) { + tag = tag ORS $0; + # Remove section which contains the License + if (/Licensed under the Apache License/) { + p = 0; + } + # Break out of loop if duplication end tag matches + if (/<\/duplication>/) { + break; + } + } + $0 = tag + } p' $1".bak" > $1 + rm -rf $1".bak" +} + +########################################################## +# Change cpdLanguage setting in cpd.sbt from the passed +# language in first argument to language in second +# argument. +# Note: For consistency across platforms not using sed's +# -i option and instead redirecting output and moving +# files. +# +# Arguments: +# arg1: Language setting changed from +# arg2: Language setting changed to +# Returns: +# None +########################################################## +function changeCPDLanguageSetting() { + sed "s/$1/$2/g" cpd.sbt > cpd.sbt.bak + mv cpd.sbt.bak cpd.sbt +} diff --git a/compile.sh b/compile.sh index 6927db244..f826d6d61 100755 --- a/compile.sh +++ b/compile.sh @@ -16,8 +16,15 @@ # the License. # -function print_usage(){ - echo "usage: ./compile.sh PATH_TO_CONFIG_FILE(optional)" +function print_usage() { + echo "" + echo "Usage: ./compile.sh [config_file_path] [additional_options]" + echo " compile.sh takes optionally, custom configuration file path(denoted as config_file_path above) as first argument."\ + "This argument can't be at any other position." 
+ echo " We can also, optionally pass, additional_options, in any order. Additional options are as under:" + echo -e "\tcoverage: Runs Jacoco code coverage and fails the build as per configured threshold" + echo -e "\tfindbugs: Runs Findbugs for Java code" + echo -e "\tcpd: Runs Copy Paste Detector(CPD) for Java and Scala code" } function play_command() { @@ -46,46 +53,139 @@ function require_programs() { fi } +############################################################ +# Generate CPD report based on language in the report path. +# For Scala, also remove duplicates generated due to license +# header as they are false positives. In the end, fail the +# build if failures are found. +# +# Arguments: +# arg1: Language (one of Java or Scala) +# arg2: Duplicates threshold for the language +# arg3: Name of the threshold constant for the language +# Returns: +# None +############################################################ +function processCPDReportByLanguage() { + cpd_result_file=$(getCPDReportName $CPD_REPORT_BASE_PATH $1) + mv $CPD_REPORT_PATH $cpd_result_file + if [ $1 = "Scala" ]; then + removeLicenseHeaderDuplicates $cpd_result_file + fi + echo "CPD report generated at path $cpd_result_file" + checkIfCPDFailed $1 $2 $3 $cpd_result_file "0" + result=$? + if [ $result -gt 0 ]; then + if [ $result -eq 2 ]; then + echo "" + echo -e "$WARNING_COLOR_PREFIX Note: Make sure your local repo is up to date with the branch you want to merge to, otherwise threshold/baseline "\ + "values to be updated in baseline.conf\n\tmight be different and that can lead to CI failure..." + fi + echo "" + exit 1; + fi +} + +########################################################## +# Run CPD for Java and Scala one by one. For Scala, first +# change cpdLanguage setting in cpd.sbt to Language.Scala +# and then run CPD. Ensure that specific CPD reports are +# generated for each language in the report folder. 
+# +# Arguments: +# arg1: Play command OPTS +# Returns: +# None +########################################################## +function runCPD() { + echo "Running CPD for Java" + play_command $1 cpd + if [ $? -ne 0 ]; then + exit 1; + fi + processCPDReportByLanguage "Java" $JAVA_CPD_THRESHOLD "JAVA_CPD_THRESHOLD" + + echo "Running CPD for Scala" + changeCPDLanguageSetting "Language.Java" "Language.Scala" + play_command $OPTS cpd + if [ $? -ne 0 ]; then + # Reset language back to Java + changeCPDLanguageSetting "Language.Scala" "Language.Java" + exit 1; + fi + processCPDReportByLanguage "Scala" $SCALA_CPD_THRESHOLD "SCALA_CPD_THRESHOLD" + # Reset language back to Java + changeCPDLanguageSetting "Language.Scala" "Language.Java" +} + require_programs zip unzip # Default configurations HADOOP_VERSION="2.3.0" SPARK_VERSION="1.4.0" -# User should pass an optional argument which is a path to config file -if [ -z "$1" ]; -then - echo "Using the default configuration" -else - CONF_FILE_PATH=$1 - echo "Using config file: "$CONF_FILE_PATH - # User must give a valid file as argument - if [ -f $CONF_FILE_PATH ]; - then - echo "Reading from config file..." +extra_commands="" +# Indicates whether a custom configuration file is passed as first parameter. +custom_config="n" +run_CPD="n" +# Process command line arguments +while :; do + if [ ! 
-z $1 ]; then + case $1 in + coverage) + extra_commands=$extra_commands" jacoco:cover" + ;; + findbugs) + extra_commands=$extra_commands" findbugs" + ;; + cpd) + run_CPD="y" + ;; + *) + # User may pass the first argument(optional) which is a path to config file + if [[ -z $extra_commands && $custom_config = "n" ]]; then + CONF_FILE_PATH=$1 + + # User must give a valid file as argument + if [ -f $CONF_FILE_PATH ]; then + echo "Using config file: "$CONF_FILE_PATH + else + echo "error: Couldn't find a valid config file at: " $CONF_FILE_PATH + print_usage + exit 1 + fi + + custom_config="y" + source $CONF_FILE_PATH + + # Fetch the Hadoop version + if [ -n "${hadoop_version}" ]; then + HADOOP_VERSION=${hadoop_version} + fi + + # Fetch the Spark version + if [ -n "${spark_version}" ]; then + SPARK_VERSION=${spark_version} + fi + + # Fetch other play opts + if [ -n "${play_opts}" ]; then + PLAY_OPTS=${play_opts} + fi + else + echo "Invalid option: $1" + print_usage + exit 1; + fi + esac + shift else - echo "error: Couldn't find a valid config file at: " $CONF_FILE_PATH - print_usage - exit 1 - fi - - source $CONF_FILE_PATH - - # Fetch the Hadoop version - if [ -n "${hadoop_version}" ]; then - HADOOP_VERSION=${hadoop_version} - fi - - # Fetch the Spark version - if [ -n "${spark_version}" ]; then - SPARK_VERSION=${spark_version} - fi - - # Fetch other play opts - if [ -n "${play_opts}" ]; then - PLAY_OPTS=${play_opts} + break fi +done +if [ $custom_config = "n" ]; then + echo "Using the default configuration" fi echo "Hadoop Version : $HADOOP_VERSION" @@ -137,17 +237,44 @@ else fi trap "exit" SIGINT SIGTERM +set +x +set +v start_script=${project_root}/scripts/start.sh stop_script=${project_root}/scripts/stop.sh app_conf=${project_root}/app-conf pso_dir=${project_root}/scripts/pso +# Import baseline/threshold numbers used across compile.sh and travis.sh +source baseline.conf +# Import common functions used across compile.sh and travis.sh +source common.sh + +# Run the main 
command along with the extra commands passed as arguments to compile.sh +echo "Command is: play $OPTS clean compile test $extra_commands" +play_command $OPTS clean compile test $extra_commands +if [ $? -ne 0 ]; then + echo "Build failed..." + exit 1; +fi + +if [[ $extra_commands == *"findbugs"* ]]; then + # Parse and check findbugs report + checkFindbugsReport +fi + +# Run CPD if passed as an argument +if [ $run_CPD = "y" ]; then + runCPD $OPTS +fi + +set -v +set -x # Echo the value of pwd in the script so that it is clear what is being removed. rm -rf ${project_root}/dist mkdir dist - -play_command $OPTS clean test compile jacoco:cover dist +# Run distribution +play_command $OPTS dist cd target/universal diff --git a/cpd.sbt b/cpd.sbt new file mode 100644 index 000000000..09e2ca855 --- /dev/null +++ b/cpd.sbt @@ -0,0 +1,28 @@ +// +// Copyright 2016 LinkedIn Corp. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy of +// the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. +// + +// +// cpd4sbt plugin settings for integrating with CPD which is used for code duplication +// +import de.johoop.cpd4sbt._ + +// By default language will be Java but this will be changed to run for Scala as well +// while running build through Travis CI. +cpdLanguage := Language.Java + +// Take distinct source directories to ensure whole file is not reported as duplicate +// of itself. 
+cpdSourceDirectories in Compile := (cpdSourceDirectories in Compile).value.distinct diff --git a/project/build.properties b/project/build.properties index bb96499e0..d8f797834 100644 --- a/project/build.properties +++ b/project/build.properties @@ -14,4 +14,4 @@ # the License. # -sbt.version=0.13.2 \ No newline at end of file +sbt.version=0.13.5 diff --git a/project/plugins.sbt b/project/plugins.sbt index cb7a66122..32d4f2f33 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -27,3 +27,6 @@ addSbtPlugin("de.johoop" % "jacoco4sbt" % "2.1.6") // Findbugs plugin addSbtPlugin("de.johoop" % "findbugs4sbt" % "1.4.0") + +// Copy paste detector plugin +addSbtPlugin("de.johoop" % "cpd4sbt" % "1.2.0") diff --git a/travis.sh b/travis.sh index 1325ade24..47d4d5863 100755 --- a/travis.sh +++ b/travis.sh @@ -16,19 +16,113 @@ # the License. # -######################################################## # -# Global constants +# Script to be used for building on Travis CI # -######################################################## -# Base path for most of the quality tool reports -readonly REPORTS_BASE_PATH="target/scala-2.10/" -# Default path for Findbugs report -readonly FINDBUGS_REPORT_PATH=$REPORTS_BASE_PATH"findbugs/report.xml" -# Color coded prefixes for ERROR and SUCCESS messages -readonly SUCCESS_COLOR_PREFIX="[\033[0;32mSUCCESS\033[0m]" -readonly ERROR_COLOR_PREFIX="[\033[0;31mERROR\033[0m]" +############################################################ +# Get files changed in this PR using git commands. +# +# Arguments: +# None +# Returns: +# List of files changed in the PR +############################################################ +function getChangedFiles() { + # Get commit hashes which have been added in the PR + commitHashes=`git rev-list origin/HEAD..HEAD` + # Extract file names changed for each commit hash + changedFiles=$(for hash in $commitHashes; do + fileNamesForHash=`git show --name-only --oneline $hash | awk '{if (NR > 1) print}'` + if [ ! 
-z "${fileNamesForHash}" ]; then + echo "${fileNamesForHash}" + fi + done) + echo "${changedFiles}" | sort | uniq +} + +########################################################## +# Check if there are duplicates in CPD report above the +# configured threshold for the language. +# +# Arguments: +# arg1: CPD report file to be checked for duplicates +# arg2: List of files changed in the PR +# Returns: +# None +########################################################## +function dumpCPDSummaryForChangedFiles() { + reportDump=`cat $1` + for changedFile in $2; do + fileDuplicateCnt=`echo "${reportDump}" | grep $changedFile | wc -l` + if [ $fileDuplicateCnt -gt 0 ]; then + echo -e "\tDuplicate info for file $changedFile:" + echo -e "\t------------------------------------------------------------------------------------"; + echo $reportDump | awk -v filename="$changedFile" '{ + # Iterate over all the duplicates in CPD report + numDuplicates = split($0, duplicates, ".*<\/codefragment>/, "", duplicates[duplicateIdx]); + # Proceed only if filename is found. + if (index(duplicates[duplicateIdx], filename) > 0) { + # Sanitize the report for processing. + sub(/<\/duplication>/, "", duplicates[duplicateIdx]) + sub(/<\/pmd-cpd>/, "", duplicates[duplicateIdx]) + gsub(//, "", duplicates[duplicateIdx]) + gsub(/"\/>/, "", duplicates[duplicateIdx]) + gsub(/