[SPARK-28440][MLLIB][TEST] Use TestingUtils to compare floating point values

## What changes were proposed in this pull request?

Use the `org.apache.spark.mllib.util.TestingUtils` object across the `MLLIB` component to compare floating-point values in tests.
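For context, `TestingUtils` provides implicit `~==`/`~=` comparison operators on `Double` (and on MLlib vectors and matrices), written as `actual ~== expected absTol eps`. The sketch below is illustrative only and not part of this diff; it assumes the MLlib test classpath and shows the before/after assertion style:

```scala
import org.apache.spark.mllib.util.TestingUtils._

val delta = 1e-6
val expected = 0.3
val computed = 0.1 + 0.2 // 0.30000000000000004 due to floating-point rounding

// Before: manual tolerance check; on failure the report is just "false was not true".
assert(math.abs(computed - expected) < delta)

// After: TestingUtils comparison; on failure the message reports both values
// and the tolerance, which makes broken tests easier to diagnose.
assert(computed ~== expected absTol delta)
```

`TestingUtils` also has a relative-tolerance form (`relTol`); the suites touched here only need `absTol`.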

## How was this patch tested?

`build/mvn test`: ran the existing tests against the updated code.
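If a single suite is wanted instead of the full run, the scalatest flags documented for Spark's Maven build can typically narrow it down, along the lines of (the suite name below is just an example):

```
build/mvn test -Dtest=none -DwildcardSuites=org.apache.spark.mllib.stat.CorrelationSuite
```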

Closes apache#25191 from eugen-prokhorenko/mllib-testingutils-double-comparison.

Authored-by: Ievgen Prokhorenko <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
Ievgen Prokhorenko authored and dongjoon-hyun committed Jul 19, 2019
1 parent 127bc89 commit 52ddf03
Showing 9 changed files with 52 additions and 46 deletions.
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD

class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -79,24 +80,24 @@ class MultilabelMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
val hammingLoss = (1.0 / (7 * 3)) * (2 + 2 + 1 + 0 + 0 + 1 + 1)
val strictAccuracy = 2.0 / 7
val accuracy = 1.0 / 7 * (1.0 / 3 + 1.0 /3 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 2)
assert(math.abs(metrics.precision(0.0) - precision0) < delta)
assert(math.abs(metrics.precision(1.0) - precision1) < delta)
assert(math.abs(metrics.precision(2.0) - precision2) < delta)
assert(math.abs(metrics.recall(0.0) - recall0) < delta)
assert(math.abs(metrics.recall(1.0) - recall1) < delta)
assert(math.abs(metrics.recall(2.0) - recall2) < delta)
assert(math.abs(metrics.f1Measure(0.0) - f1measure0) < delta)
assert(math.abs(metrics.f1Measure(1.0) - f1measure1) < delta)
assert(math.abs(metrics.f1Measure(2.0) - f1measure2) < delta)
assert(math.abs(metrics.microPrecision - microPrecisionClass) < delta)
assert(math.abs(metrics.microRecall - microRecallClass) < delta)
assert(math.abs(metrics.microF1Measure - microF1MeasureClass) < delta)
assert(math.abs(metrics.precision - macroPrecisionDoc) < delta)
assert(math.abs(metrics.recall - macroRecallDoc) < delta)
assert(math.abs(metrics.f1Measure - macroF1MeasureDoc) < delta)
assert(math.abs(metrics.hammingLoss - hammingLoss) < delta)
assert(math.abs(metrics.subsetAccuracy - strictAccuracy) < delta)
assert(math.abs(metrics.accuracy - accuracy) < delta)
assert(metrics.precision(0.0) ~== precision0 absTol delta)
assert(metrics.precision(1.0) ~== precision1 absTol delta)
assert(metrics.precision(2.0) ~== precision2 absTol delta)
assert(metrics.recall(0.0) ~== recall0 absTol delta)
assert(metrics.recall(1.0) ~== recall1 absTol delta)
assert(metrics.recall(2.0) ~== recall2 absTol delta)
assert(metrics.f1Measure(0.0) ~== f1measure0 absTol delta)
assert(metrics.f1Measure(1.0) ~== f1measure1 absTol delta)
assert(metrics.f1Measure(2.0) ~== f1measure2 absTol delta)
assert(metrics.microPrecision ~== microPrecisionClass absTol delta)
assert(metrics.microRecall ~== microRecallClass absTol delta)
assert(metrics.microF1Measure ~== microF1MeasureClass absTol delta)
assert(metrics.precision ~== macroPrecisionDoc absTol delta)
assert(metrics.recall ~== macroRecallDoc absTol delta)
assert(metrics.f1Measure ~== macroF1MeasureDoc absTol delta)
assert(metrics.hammingLoss ~== hammingLoss absTol delta)
assert(metrics.subsetAccuracy ~== strictAccuracy absTol delta)
assert(metrics.accuracy ~== accuracy absTol delta)
assert(metrics.labels.sameElements(Array(0.0, 1.0, 2.0)))
}
}
@@ -18,6 +18,7 @@ package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {

@@ -63,7 +64,7 @@ class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
[1] 23
*/
assert(results1.size === 23)
assert(results1.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
assert(results1.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)

val results2 = ar
.setMinConfidence(0)
@@ -84,7 +85,7 @@ class AssociationRulesSuite extends SparkFunSuite with MLlibTestSparkContext {
[1] 23
*/
assert(results2.size === 30)
assert(results2.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
assert(results2.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
}
}

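A note on the two comparison operators appearing in these hunks (this aside is not part of the committed diff): the `count` predicates use `~=`, which simply returns a `Boolean`, while the `assert` calls use `~==`, which is assumed to fail with a message naming both values and the tolerance when they are not close enough. A minimal self-contained sketch of the distinction, under that assumption:

```scala
import org.apache.spark.mllib.util.TestingUtils._

val confidences = Seq(0.9999999, 1.0, 0.5)

// ~= returns a plain Boolean, so it fits predicates such as count/filter/exists.
val nearOne = confidences.count(c => c ~= 1.0 absTol 1e-6) // 2

// ~== is meant for assertions: when the comparison fails, it reports both
// values and the tolerance instead of a bare "false was not true".
assert(confidences.head ~== 1.0 absTol 1e-6)
```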
@@ -18,6 +18,7 @@ package org.apache.spark.mllib.fpm

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.Utils

class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -172,7 +173,7 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
.collect()

assert(rules.size === 23)
assert(rules.count(rule => math.abs(rule.confidence - 1.0D) < 1e-6) == 23)
assert(rules.count(rule => rule.confidence ~= 1.0D absTol 1e-6) == 23)
}

test("FP-Growth using Int type") {
@@ -22,6 +22,7 @@ import breeze.linalg.{diag => brzDiag, DenseMatrix => BDM, DenseVector => BDV}
import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD

class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
@@ -238,7 +239,7 @@ class IndexedRowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {

for (i <- 0 until n; j <- i + 1 until n) {
val trueResult = gram(i, j) / scala.math.sqrt(gram(i, i) * gram(j, j))
assert(math.abs(G(i, j) - trueResult) < 1e-6)
assert(G(i, j) ~== trueResult absTol 1e-6)
}
}

@@ -20,9 +20,9 @@ package org.apache.spark.mllib.random
import org.apache.commons.math3.special.Gamma

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.StatCounter

// TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
class RandomDataGeneratorSuite extends SparkFunSuite {

def apiChecks(gen: RandomDataGenerator[Double]) {
@@ -61,8 +61,8 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
gen.setSeed(seed.toLong)
val sample = (0 until 100000).map { _ => gen.nextValue()}
val stats = new StatCounter(sample)
assert(math.abs(stats.mean - mean) < epsilon)
assert(math.abs(stats.stdev - stddev) < epsilon)
assert(stats.mean ~== mean absTol epsilon)
assert(stats.stdev ~== stddev absTol epsilon)
}
}

Expand Up @@ -23,14 +23,13 @@ import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.rdd.{RandomRDD, RandomRDDPartition}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.StatCounter

/*
* Note: avoid including APIs that do not set the seed for the RNG in unit tests
* in order to guarantee deterministic behavior.
*
* TODO update tests to use TestingUtils for floating point comparison after PR 1367 is merged
*/
class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Serializable {

@@ -43,8 +42,8 @@ class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Serializable {
val stats = rdd.stats()
assert(expectedSize === stats.count)
assert(expectedNumPartitions === rdd.partitions.size)
assert(math.abs(stats.mean - expectedMean) < epsilon)
assert(math.abs(stats.stdev - expectedStddev) < epsilon)
assert(stats.mean ~== expectedMean absTol epsilon)
assert(stats.stdev ~== expectedStddev absTol epsilon)
}

// assume test RDDs are small
@@ -63,8 +62,8 @@ class RandomRDDsSuite extends SparkFunSuite with MLlibTestSparkContext with Serializable {
}}
assert(expectedRows === values.size / expectedColumns)
val stats = new StatCounter(values)
assert(math.abs(stats.mean - expectedMean) < epsilon)
assert(math.abs(stats.stdev - expectedStddev) < epsilon)
assert(stats.mean ~== expectedMean absTol epsilon)
assert(stats.stdev ~== expectedStddev absTol epsilon)
}

test("RandomRDD sizes") {
@@ -26,6 +26,7 @@ import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation,
SpearmanCorrelation}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

@@ -57,15 +58,15 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
val expected = 0.6546537
val default = Statistics.corr(x, y)
val p1 = Statistics.corr(x, y, "pearson")
assert(approxEqual(expected, default))
assert(approxEqual(expected, p1))
assert(expected ~== default absTol 1e-6)
assert(expected ~== p1 absTol 1e-6)

// numPartitions >= size for input RDDs
for (numParts <- List(xData.size, xData.size * 2)) {
val x1 = sc.parallelize(xData, numParts)
val y1 = sc.parallelize(yData, numParts)
val p2 = Statistics.corr(x1, y1)
assert(approxEqual(expected, p2))
assert(expected ~== p2 absTol 1e-6)
}

// RDD of zero variance
@@ -78,14 +79,14 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
val y = sc.parallelize(yData)
val expected = 0.5
val s1 = Statistics.corr(x, y, "spearman")
assert(approxEqual(expected, s1))
assert(expected ~== s1 absTol 1e-6)

// numPartitions >= size for input RDDs
for (numParts <- List(xData.size, xData.size * 2)) {
val x1 = sc.parallelize(xData, numParts)
val y1 = sc.parallelize(yData, numParts)
val s2 = Statistics.corr(x1, y1, "spearman")
assert(approxEqual(expected, s2))
assert(expected ~== s2 absTol 1e-6)
}

// RDD of zero variance => zero variance in ranks
@@ -141,14 +142,14 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {
val a = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
val b = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
val p = Statistics.corr(a, b, method = "pearson")
assert(approxEqual(p, 0.0, 0.01))
assert(p ~== 0.0 absTol 0.01)
}

def approxEqual(v1: Double, v2: Double, threshold: Double = 1e-6): Boolean = {
if (v1.isNaN) {
v2.isNaN
} else {
math.abs(v1 - v2) <= threshold
v1 ~== v2 absTol threshold
}
}

@@ -21,6 +21,7 @@ import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
test("kernel density single sample") {
@@ -29,8 +30,8 @@ class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
val normal = new NormalDistribution(5.0, 3.0)
val acceptableErr = 1e-6
assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
assert(densities(0) ~== normal.density(5.0) absTol acceptableErr)
assert(densities(1) ~== normal.density(6.0) absTol acceptableErr)
}

test("kernel density multiple samples") {
@@ -40,9 +41,9 @@ class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
val normal1 = new NormalDistribution(5.0, 3.0)
val normal2 = new NormalDistribution(10.0, 3.0)
val acceptableErr = 1e-6
assert(math.abs(
densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
assert(math.abs(
densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
assert(
densities(0) ~== ((normal1.density(5.0) + normal2.density(5.0)) / 2) absTol acceptableErr)
assert(
densities(1) ~== ((normal1.density(6.0) + normal2.density(6.0)) / 2) absTol acceptableErr)
}
}
@@ -22,6 +22,7 @@ import scala.collection.mutable
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.StatCounter

object EnsembleTestHelper {
@@ -43,8 +44,8 @@ object EnsembleTestHelper {
values ++= row
}
val stats = new StatCounter(values)
assert(math.abs(stats.mean - expectedMean) < epsilon)
assert(math.abs(stats.stdev - expectedStddev) < epsilon)
assert(stats.mean ~== expectedMean absTol epsilon)
assert(stats.stdev ~== expectedStddev absTol epsilon)
}

def validateClassifier(
