Skip to content

Commit

Permalink
[SPARK-35310][MLLIB] Update to breeze 1.2
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

Update the Breeze dependency to the latest release, 1.2.

### Why are the changes needed?

Minor bug fixes

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests

Closes apache#33449 from srowen/SPARK-35310.

Authored-by: Sean Owen <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
  • Loading branch information
srowen committed Jul 22, 2021
1 parent 07fa38e commit 518f00f
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 32 deletions.
17 changes: 8 additions & 9 deletions dev/deps/spark-deps-hadoop-2.7-hive-2.3
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ RoaringBitmap/0.9.0//RoaringBitmap-0.9.0.jar
ST4/4.0.4//ST4-4.0.4.jar
activation/1.1.1//activation-1.1.1.jar
aircompressor/0.19//aircompressor-0.19.jar
algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar
algebra_2.12/2.0.1//algebra_2.12-2.0.1.jar
annotations/17.0.0//annotations-17.0.0.jar
antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar
antlr4-runtime/4.8//antlr4-runtime-4.8.jar
Expand All @@ -28,9 +28,9 @@ avro-mapred/1.10.2//avro-mapred-1.10.2.jar
avro/1.10.2//avro-1.10.2.jar
blas/2.2.0//blas-2.2.0.jar
bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar
breeze-macros_2.12/1.0//breeze-macros_2.12-1.0.jar
breeze_2.12/1.0//breeze_2.12-1.0.jar
cats-kernel_2.12/2.0.0-M4//cats-kernel_2.12-2.0.0-M4.jar
breeze-macros_2.12/1.2//breeze-macros_2.12-1.2.jar
breeze_2.12/1.2//breeze_2.12-1.2.jar
cats-kernel_2.12/2.1.1//cats-kernel_2.12-2.1.1.jar
chill-java/0.10.0//chill-java-0.10.0.jar
chill_2.12/0.10.0//chill_2.12-0.10.0.jar
commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar
Expand Down Expand Up @@ -182,7 +182,6 @@ libthrift/0.12.0//libthrift-0.12.0.jar
log4j/1.2.17//log4j-1.2.17.jar
logging-interceptor/3.12.12//logging-interceptor-3.12.12.jar
lz4-java/1.8.0//lz4-java-1.8.0.jar
machinist_2.12/0.6.8//machinist_2.12-0.6.8.jar
macro-compat_2.12/1.1.1//macro-compat_2.12-1.1.1.jar
mesos/1.4.0/shaded-protobuf/mesos-1.4.0-shaded-protobuf.jar
metrics-core/4.2.2//metrics-core-4.2.2.jar
Expand Down Expand Up @@ -224,10 +223,10 @@ slf4j-api/1.7.30//slf4j-api-1.7.30.jar
slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar
snakeyaml/1.27//snakeyaml-1.27.jar
snappy-java/1.1.8.4//snappy-java-1.1.8.4.jar
spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar
spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar
spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar
spire_2.12/0.17.0-M1//spire_2.12-0.17.0-M1.jar
spire-macros_2.12/0.17.0//spire-macros_2.12-0.17.0.jar
spire-platform_2.12/0.17.0//spire-platform_2.12-0.17.0.jar
spire-util_2.12/0.17.0//spire-util_2.12-0.17.0.jar
spire_2.12/0.17.0//spire_2.12-0.17.0.jar
stax-api/1.0.1//stax-api-1.0.1.jar
stream/2.9.6//stream-2.9.6.jar
super-csv/2.2.0//super-csv-2.2.0.jar
Expand Down
17 changes: 8 additions & 9 deletions dev/deps/spark-deps-hadoop-3.2-hive-2.3
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ RoaringBitmap/0.9.0//RoaringBitmap-0.9.0.jar
ST4/4.0.4//ST4-4.0.4.jar
activation/1.1.1//activation-1.1.1.jar
aircompressor/0.19//aircompressor-0.19.jar
algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar
algebra_2.12/2.0.1//algebra_2.12-2.0.1.jar
annotations/17.0.0//annotations-17.0.0.jar
antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar
antlr4-runtime/4.8//antlr4-runtime-4.8.jar
Expand All @@ -23,9 +23,9 @@ avro-mapred/1.10.2//avro-mapred-1.10.2.jar
avro/1.10.2//avro-1.10.2.jar
blas/2.2.0//blas-2.2.0.jar
bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar
breeze-macros_2.12/1.0//breeze-macros_2.12-1.0.jar
breeze_2.12/1.0//breeze_2.12-1.0.jar
cats-kernel_2.12/2.0.0-M4//cats-kernel_2.12-2.0.0-M4.jar
breeze-macros_2.12/1.2//breeze-macros_2.12-1.2.jar
breeze_2.12/1.2//breeze_2.12-1.2.jar
cats-kernel_2.12/2.1.1//cats-kernel_2.12-2.1.1.jar
chill-java/0.10.0//chill-java-0.10.0.jar
chill_2.12/0.10.0//chill_2.12-0.10.0.jar
commons-cli/1.2//commons-cli-1.2.jar
Expand Down Expand Up @@ -153,7 +153,6 @@ libthrift/0.12.0//libthrift-0.12.0.jar
log4j/1.2.17//log4j-1.2.17.jar
logging-interceptor/3.12.12//logging-interceptor-3.12.12.jar
lz4-java/1.8.0//lz4-java-1.8.0.jar
machinist_2.12/0.6.8//machinist_2.12-0.6.8.jar
macro-compat_2.12/1.1.1//macro-compat_2.12-1.1.1.jar
mesos/1.4.0/shaded-protobuf/mesos-1.4.0-shaded-protobuf.jar
metrics-core/4.2.2//metrics-core-4.2.2.jar
Expand Down Expand Up @@ -195,10 +194,10 @@ slf4j-api/1.7.30//slf4j-api-1.7.30.jar
slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar
snakeyaml/1.27//snakeyaml-1.27.jar
snappy-java/1.1.8.4//snappy-java-1.1.8.4.jar
spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar
spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar
spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar
spire_2.12/0.17.0-M1//spire_2.12-0.17.0-M1.jar
spire-macros_2.12/0.17.0//spire-macros_2.12-0.17.0.jar
spire-platform_2.12/0.17.0//spire-platform_2.12-0.17.0.jar
spire-util_2.12/0.17.0//spire-util_2.12-0.17.0.jar
spire_2.12/0.17.0//spire_2.12-0.17.0.jar
stax-api/1.0.1//stax-api-1.0.1.jar
stream/2.9.6//stream-2.9.6.jar
super-csv/2.2.0//super-csv-2.2.0.jar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
solverType = WeightedLeastSquares.Cholesky)
val wlsModelWithIntercept = wlsWithIntercept.fit(instances)
val wls = new WeightedLeastSquares(false, 0.0, 0.0, true, true,
solverType = WeightedLeastSquares.Cholesky)
solverType = WeightedLeastSquares.Cholesky, tol = 1e-14, maxIter = 100000)
val wlsModel = wls.fit(instances)

assert(expectedWithIntercept ~== wlsModelWithIntercept.diagInvAtWA relTol 1e-4)
Expand All @@ -169,7 +169,8 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
solver <- Seq(WeightedLeastSquares.Auto, WeightedLeastSquares.QuasiNewton)) {
val singularModel = new WeightedLeastSquares(fitIntercept, regParam = 0.0,
elasticNetParam = 0.0, standardizeFeatures = standardization,
standardizeLabel = standardization, solverType = solver).fit(collinearInstances)
standardizeLabel = standardization, solverType = solver,
tol = 1e-14, maxIter = 100000).fit(collinearInstances)

collinearInstances.collect().foreach { case Instance(l, w, f) =>
val pred = BLAS.dot(singularModel.coefficients, f) + singularModel.intercept
Expand Down Expand Up @@ -202,6 +203,7 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
for (solver <- WeightedLeastSquares.supportedSolvers) {
val wls = new WeightedLeastSquares(fitIntercept, regParam = 0.0, elasticNetParam = 0.0,
standardizeFeatures = standardization, standardizeLabel = standardization,
tol = 1e-14, maxIter = 100000,
solverType = solver).fit(instances)
val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1))
assert(actual ~== expected(idx) absTol 1e-4)
Expand Down Expand Up @@ -305,7 +307,8 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
for (fitIntercept <- Seq(false, true)) {
val wls = new WeightedLeastSquares(fitIntercept = fitIntercept, regParam = 0.5,
elasticNetParam = 0.0, standardizeFeatures = true,
standardizeLabel = true, solverType = WeightedLeastSquares.Cholesky)
standardizeLabel = true, solverType = WeightedLeastSquares.Cholesky,
tol = 1e-14, maxIter = 100000)
.fit(constantFeaturesInstances)
val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1))
assert(actual ~== expectedCholesky(idx) absTol 1e-6)
Expand Down Expand Up @@ -363,7 +366,7 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
(lambda, alpha) <- Seq((0.0, 0.0), (0.5, 0.0), (0.5, 0.5), (0.5, 1.0))) {
val wls = new WeightedLeastSquares(fitIntercept, regParam = lambda, elasticNetParam = alpha,
standardizeFeatures = standardization, standardizeLabel = true,
solverType = WeightedLeastSquares.QuasiNewton)
solverType = WeightedLeastSquares.QuasiNewton, tol = 1e-14, maxIter = 100000)
val model = wls.fit(constantFeaturesInstances)
val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1))
assert(actual ~== expectedQuasiNewton(idx) absTol 1e-6)
Expand Down Expand Up @@ -473,7 +476,7 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
elasticNetParam <- Seq(0.1, 0.5, 1.0)) {
val wls = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam,
standardizeFeatures = standardization, standardizeLabel = true,
solverType = WeightedLeastSquares.Auto)
solverType = WeightedLeastSquares.Auto, tol = 1e-14, maxIter = 100000)
.fit(instances)
val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1))
assert(actual ~== expected(idx) absTol 1e-4)
Expand Down Expand Up @@ -531,7 +534,8 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
standardization <- Seq(false, true)) {
for (solver <- WeightedLeastSquares.supportedSolvers) {
val wls = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0,
standardizeFeatures = standardization, standardizeLabel = true, solverType = solver)
standardizeFeatures = standardization, standardizeLabel = true, solverType = solver,
tol = 1e-14, maxIter = 100000)
.fit(instances)
val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1))
assert(actual ~== expected(idx) absTol 1e-4)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import scala.collection.mutable.ArrayBuilder
import scala.reflect.ClassTag
import scala.util.Random

import breeze.linalg.{squaredDistance => breezeSquaredDistance, DenseMatrix => BDM}
import breeze.linalg.{DenseMatrix => BDM}
import org.json4s.jackson.JsonMethods.{parse => parseJson}

import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
Expand Down Expand Up @@ -295,7 +295,9 @@ class VectorsSuite extends SparkFunSuite with Logging {
val denseVector1 = Vectors.dense(sparseVector1.toArray)
val denseVector2 = Vectors.dense(sparseVector2.toArray)

val squaredDist = breezeSquaredDistance(sparseVector1.asBreeze, sparseVector2.asBreeze)
val squaredDist = sparseVector1.toArray.zip(sparseVector2.toArray).map {
case (a, b) => (a - b) * (a - b)
}.sum

// SparseVector vs. SparseVector
assert(Vectors.sqdist(sparseVector1, sparseVector2) ~== squaredDist relTol 1E-8)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@ import java.nio.charset.StandardCharsets

import scala.io.Source

import breeze.linalg.{squaredDistance => breezeSquaredDistance}
import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vectors}
import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils._
import org.apache.spark.mllib.util.TestingUtils._
Expand All @@ -50,28 +49,34 @@ class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {
val v1 = Vectors.dense(a)
val norm1 = Vectors.norm(v1, 2.0)
val precision = 1e-6

def squaredDistance(v1: Vector, v2: Vector): Double =
v1.toArray.zip(v2.toArray).map {
case (a, b) => (a - b) * (a - b)
}.sum

for (m <- 0 until n) {
val indices = (0 to m).toArray
val values = indices.map(i => a(i))
val v2 = Vectors.sparse(n, indices, values)
val norm2 = Vectors.norm(v2, 2.0)
val v3 = Vectors.sparse(n, indices, indices.map(i => a(i) + 0.5))
val norm3 = Vectors.norm(v3, 2.0)
val squaredDist = breezeSquaredDistance(v1.asBreeze, v2.asBreeze)
val squaredDist = squaredDistance(v1, v2)
val fastSquaredDist1 = fastSquaredDistance(v1, norm1, v2, norm2, precision)
assert((fastSquaredDist1 - squaredDist) <= precision * squaredDist, s"failed with m = $m")
val fastSquaredDist2 =
fastSquaredDistance(v1, norm1, Vectors.dense(v2.toArray), norm2, precision)
assert((fastSquaredDist2 - squaredDist) <= precision * squaredDist, s"failed with m = $m")
val squaredDist2 = breezeSquaredDistance(v2.asBreeze, v3.asBreeze)
val squaredDist2 = squaredDistance(v2, v3)
val fastSquaredDist3 =
fastSquaredDistance(v2, norm2, v3, norm3, precision)
assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m")
if (m > 10) {
val v4 = Vectors.sparse(n, indices.slice(0, m - 10),
indices.map(i => a(i) + 0.5).slice(0, m - 10))
val norm4 = Vectors.norm(v4, 2.0)
val squaredDist = breezeSquaredDistance(v2.asBreeze, v4.asBreeze)
val squaredDist = squaredDistance(v2, v4)
val fastSquaredDist =
fastSquaredDistance(v2, norm2, v4, norm4, precision)
assert((fastSquaredDist - squaredDist) <= precision * squaredDist, s"failed with m = $m")
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -880,7 +880,7 @@
<dependency>
<groupId>org.scalanlp</groupId>
<artifactId>breeze_${scala.binary.version}</artifactId>
<version>1.0</version>
<version>1.2</version>
<exclusions>
<exclusion>
<groupId>org.apache.commons</groupId>
Expand Down

0 comments on commit 518f00f

Please sign in to comment.