Skip to content

Commit

Permalink
[SPARK-17137][ML][WIP] Compress logistic regression coefficients
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

Use the new `compressed` method on matrices to store the logistic regression coefficients as sparse or dense - whichever is requires less memory.

Marked as WIP so we can add some performance test results. Basically, we should see if prediction is slower because of using a sparse matrix over a dense one. This can happen since sparse matrices do not use native BLAS operations when computing the margins.

## How was this patch tested?

Unit tests added.

Author: sethah <[email protected]>

Closes apache#17426 from sethah/SPARK-17137.
  • Loading branch information
sethah authored and dbtsai committed Mar 25, 2017
1 parent e8ddb91 commit be85245
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -399,14 +399,9 @@ class LogisticRegression @Since("1.2.0") (
logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
s"will be zeros. Training is not needed.")
val constantLabelIndex = Vectors.dense(histogram).argmax
// TODO: use `compressed` after SPARK-17471
val coefMatrix = if (numFeatures < numCoefficientSets) {
new SparseMatrix(numCoefficientSets, numFeatures,
Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
} else {
new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numCoefficientSets + 1)(0),
Array.empty[Int], Array.empty[Double], isTransposed = true)
}
val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures,
new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double],
isTransposed = true).compressed
val interceptVec = if (isMultinomial) {
Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity)))
} else {
Expand Down Expand Up @@ -617,26 +612,13 @@ class LogisticRegression @Since("1.2.0") (
denseCoefficientMatrix.update(_ - coefficientMean)
}

// TODO: use `denseCoefficientMatrix.compressed` after SPARK-17471
val compressedCoefficientMatrix = if (isMultinomial) {
denseCoefficientMatrix
} else {
val compressedVector = Vectors.dense(denseCoefficientMatrix.values).compressed
compressedVector match {
case dv: DenseVector => denseCoefficientMatrix
case sv: SparseVector =>
new SparseMatrix(1, numFeatures, Array(0, sv.indices.length), sv.indices, sv.values,
isTransposed = true)
}
}

// center the intercepts when using multinomial algorithm
if ($(fitIntercept) && isMultinomial) {
val interceptArray = interceptVec.toArray
val interceptMean = interceptArray.sum / interceptArray.length
(0 until interceptVec.size).foreach { i => interceptArray(i) -= interceptMean }
}
(compressedCoefficientMatrix, interceptVec.compressed, arrayBuilder.result())
(denseCoefficientMatrix.compressed, interceptVec.compressed, arrayBuilder.result())
}
}

Expand Down Expand Up @@ -713,7 +695,7 @@ class LogisticRegressionModel private[spark] (
// convert to appropriate vector representation without replicating data
private lazy val _coefficients: Vector = {
require(coefficientMatrix.isTransposed,
"LogisticRegressionModel coefficients should be row major.")
"LogisticRegressionModel coefficients should be row major for binomial model.")
coefficientMatrix match {
case dm: DenseMatrix => Vectors.dense(dm.values)
case sm: SparseMatrix => Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vector, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
Expand Down Expand Up @@ -713,8 +713,6 @@ class LogisticRegressionSuite

assert(model2.intercept ~== interceptR relTol 1E-2)
assert(model2.coefficients ~== coefficientsR absTol 1E-3)
// TODO: move this to a standalone test of compression after SPARK-17471
assert(model2.coefficients.isInstanceOf[SparseVector])
}

test("binary logistic regression without intercept with L1 regularization") {
Expand Down Expand Up @@ -2031,29 +2029,61 @@ class LogisticRegressionSuite
// TODO: check num iters is zero when it become available in the model
}

test("compressed storage") {
test("compressed storage for constant label") {
/*
When the label is constant and fit intercept is true, all the coefficients will be
zeros, and so the model coefficients should be stored as sparse data structures, except
when the matrix dimensions are very small.
*/
val moreClassesThanFeatures = Seq(
LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)),
LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)),
LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))).toDF()
val mlr = new LogisticRegression().setFamily("multinomial")
LabeledPoint(4.0, Vectors.dense(Array.fill(5)(0.0))),
LabeledPoint(4.0, Vectors.dense(Array.fill(5)(1.0))),
LabeledPoint(4.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true)
val model = mlr.fit(moreClassesThanFeatures)
assert(model.coefficientMatrix.isInstanceOf[SparseMatrix])
assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4)
assert(model.coefficientMatrix.isColMajor)

// in this case, it should be stored as row major
val moreFeaturesThanClasses = Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)),
LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)),
LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))).toDF()
LabeledPoint(1.0, Vectors.dense(Array.fill(5)(0.0))),
LabeledPoint(1.0, Vectors.dense(Array.fill(5)(1.0))),
LabeledPoint(1.0, Vectors.dense(Array.fill(5)(2.0)))).toDF()
val model2 = mlr.fit(moreFeaturesThanClasses)
assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3)
assert(model2.coefficientMatrix.isRowMajor)

val blr = new LogisticRegression().setFamily("binomial")
val blr = new LogisticRegression().setFamily("binomial").setFitIntercept(true)
val blrModel = blr.fit(moreFeaturesThanClasses)
assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix])
assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2)
}

test("compressed coefficients") {

val trainer1 = new LogisticRegression()
.setRegParam(0.1)
.setElasticNetParam(1.0)

// compressed row major is optimal
val model1 = trainer1.fit(multinomialDataset.limit(100))
assert(model1.coefficientMatrix.isInstanceOf[SparseMatrix])
assert(model1.coefficientMatrix.isRowMajor)

// compressed column major is optimal since there are more classes than features
val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata()
val model2 = trainer1.fit(multinomialDataset
.withColumn("label", col("label").as("label", labelMeta)).limit(100))
assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
assert(model2.coefficientMatrix.isColMajor)

// coefficients are dense without L1 regularization
val trainer2 = new LogisticRegression()
.setElasticNetParam(0.0)
val model3 = trainer2.fit(multinomialDataset.limit(100))
assert(model3.coefficientMatrix.isInstanceOf[DenseMatrix])
}

test("numClasses specified in metadata/inferred") {
val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial")

Expand Down

0 comments on commit be85245

Please sign in to comment.