Commit
Merge pull request apache#809 from shivaram/sgd-cleanup
Clean up scaladoc in ML Lib.
etrain committed Aug 12, 2013
2 parents ea1b4ba + 8b5e3e2 commit 4346f0a
Showing 19 changed files with 173 additions and 61 deletions.
1 change: 1 addition & 0 deletions docs/_layouts/global.html
@@ -74,6 +74,7 @@
<li><a href="api/core/index.html">Spark Java/Scala (Scaladoc)</a></li>
<li><a href="api/pyspark/index.html">Spark Python (Epydoc)</a></li>
<li><a href="api/streaming/index.html">Spark Streaming Java/Scala (Scaladoc) </a></li>
<li><a href="api/mllib/index.html">Spark ML Library (Scaladoc) </a></li>
</ul>
</li>

2 changes: 1 addition & 1 deletion docs/_plugins/copy_api_dirs.rb
@@ -20,7 +20,7 @@

if ENV['SKIP_API'] != '1'
# Build Scaladoc for Java/Scala
projects = ["core", "examples", "repl", "bagel", "streaming"]
projects = ["core", "examples", "repl", "bagel", "streaming", "mllib"]

puts "Moving to project root and building scaladoc."
curr_dir = pwd
mllib/src/main/scala/spark/mllib/classification/LogisticRegression.scala
@@ -27,8 +27,10 @@ import scala.math.round
import org.jblas.DoubleMatrix

/**
* Logistic Regression using Stochastic Gradient Descent.
* Based on Matlab code written by John Duchi.
* Classification model trained using Logistic Regression.
*
* @param weights Weights computed for every feature.
* @param intercept Intercept computed for this model.
*/
class LogisticRegressionModel(
override val weights: Array[Double],
@@ -43,7 +45,10 @@ class LogisticRegressionModel(
}
}

class LogisticRegressionWithSGD (
/**
* Train a classification model for Logistic Regression using Stochastic Gradient Descent.
*/
class LogisticRegressionWithSGD private (
var stepSize: Double,
var numIterations: Int,
var regParam: Double,
@@ -70,10 +75,10 @@ class LogisticRegressionWithSGD (

/**
* Top-level methods for calling Logistic Regression.
* NOTE(shivaram): We use multiple train methods instead of default arguments to support
* Java programs.
*/
object LogisticRegressionWithSGD {
// NOTE(shivaram): We use multiple train methods instead of default arguments to support
// Java programs.

/**
* Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
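The NOTE above concerns Java interoperability: Scala default arguments are not visible to Java callers, so the companion object exposes one train overload per parameter combination. A minimal sketch of the pattern; ExampleTrainer and its signature are hypothetical, not the MLlib API:

    object ExampleTrainer {
      // Full overload: every parameter is explicit, so Java can call it directly.
      def train(data: Seq[(Double, Array[Double])], numIterations: Int, stepSize: Double): String =
        s"trained for $numIterations iterations with step size $stepSize"

      // Reduced overload standing in for a default argument; also Java-friendly.
      def train(data: Seq[(Double, Array[Double])], numIterations: Int): String =
        train(data, numIterations, 1.0)
    }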
8 changes: 7 additions & 1 deletion mllib/src/main/scala/spark/mllib/classification/SVM.scala
@@ -26,7 +26,10 @@ import spark.mllib.util.MLUtils
import org.jblas.DoubleMatrix

/**
* SVM using Stochastic Gradient Descent.
* Model built using SVM.
*
* @param weights Weights computed for every feature.
* @param intercept Intercept computed for this model.
*/
class SVMModel(
override val weights: Array[Double],
@@ -40,6 +43,9 @@ class SVMModel(
}
}

/**
* Train an SVM using Stochastic Gradient Descent.
*/
class SVMWithSGD private (
var stepSize: Double,
var numIterations: Int,
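For orientation, an SVMModel scores a point by the sign of its margin. A minimal sketch with made-up numbers, assuming jblas on the classpath; this mirrors the documented model shape, not the library source:

    import org.jblas.DoubleMatrix
    import scala.math.signum

    // Sketch of the SVM decision rule: label = signum(w . x + intercept).
    val weights   = new DoubleMatrix(3, 1, 0.5, -1.0, 2.0)   // nx1 column matrix
    val point     = new DoubleMatrix(3, 1, 1.0, 0.0, 1.5)
    val intercept = -0.5
    val prediction = signum(point.dot(weights) + intercept)  // here 1.0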
23 changes: 19 additions & 4 deletions mllib/src/main/scala/spark/mllib/optimization/Gradient.scala
@@ -19,18 +19,29 @@ package spark.mllib.optimization

import org.jblas.DoubleMatrix

/**
* Class used to compute the gradient for a loss function, given a single data point.
*/
abstract class Gradient extends Serializable {
/**
* Compute the gradient for a given row of data.
* Compute the gradient and loss given features of a single data point.
*
* @param data - One row of data. Row matrix of size 1xn where n is the number of features.
* @param data - Feature values for one data point. Column matrix of size nx1
* where n is the number of features.
* @param label - Label for this data item.
* @param weights - Column matrix containing weights for every feature.
*
* @return A tuple of 2 elements. The first element is a column matrix containing the computed
* gradient and the second element is the loss computed at this data point.
*
*/
def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double)
}

/**
* Compute gradient and loss for a logistic loss function.
*/
class LogisticGradient extends Gradient {
override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double) = {
@@ -49,7 +60,9 @@ class LogisticGradient {
}
}


/**
* Compute gradient and loss for a least-squares loss function.
*/
class SquaredGradient extends Gradient {
override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double) = {
@@ -62,7 +75,9 @@ class SquaredGradient {
}
}


/**
* Compute gradient and loss for a hinge loss function.
*/
class HingeGradient extends Gradient {
override def compute(data: DoubleMatrix, label: Double, weights: DoubleMatrix):
(DoubleMatrix, Double) = {
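To make the compute contract concrete, here is a hedged sketch of a squared-loss gradient in the documented shape: data as an nx1 column matrix, returning the (gradient, loss) pair. ExampleSquaredGradient is a stand-in, not the library's SquaredGradient, and the loss scaling is illustrative:

    import org.jblas.DoubleMatrix

    // Squared loss for one point: loss = (w.x - y)^2, gradient = 2 * (w.x - y) * x.
    class ExampleSquaredGradient extends Serializable {
      def compute(data: DoubleMatrix, label: Double,
                  weights: DoubleMatrix): (DoubleMatrix, Double) = {
        val diff = data.dot(weights) - label   // prediction error for this point
        (data.mul(2.0 * diff), diff * diff)
      }
    }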
mllib/src/main/scala/spark/mllib/optimization/GradientDescent.scala
@@ -24,12 +24,17 @@ import org.jblas.DoubleMatrix

import scala.collection.mutable.ArrayBuffer

/**
* Class used to solve an optimization problem using Gradient Descent.
* @param gradient Gradient function to be used.
* @param updater Updater to be used to update weights after every iteration.
*/
class GradientDescent(var gradient: Gradient, var updater: Updater) extends Optimizer {

var stepSize: Double = 1.0
var numIterations: Int = 100
var regParam: Double = 0.0
var miniBatchFraction: Double = 1.0
private var stepSize: Double = 1.0
private var numIterations: Int = 100
private var regParam: Double = 0.0
private var miniBatchFraction: Double = 1.0

/**
* Set the step size per-iteration of SGD. Default 1.0.
@@ -97,10 +102,10 @@

}

// Top-level method to run gradient descent.
object GradientDescent extends Logging {
/**
* Run gradient descent in parallel using mini batches.
* Based on Matlab code written by John Duchi.
*
* @param data - Input data for SGD. RDD of form (label, [feature values]).
* @param gradient - Gradient object that will be used to compute the gradient.
Expand Down Expand Up @@ -137,8 +142,8 @@ object GradientDescent extends Logging {
for (i <- 1 to numIterations) {
val (gradientSum, lossSum) = data.sample(false, miniBatchFraction, 42+i).map {
case (y, features) =>
val featuresRow = new DoubleMatrix(features.length, 1, features:_*)
val (grad, loss) = gradient.compute(featuresRow, y, weights)
val featuresCol = new DoubleMatrix(features.length, 1, features:_*)
val (grad, loss) = gradient.compute(featuresCol, y, weights)
(grad, loss)
}.reduce((a, b) => (a._1.addi(b._1), a._2 + b._2))

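The loop above samples a miniBatchFraction of the data, averages the per-point gradients from Gradient.compute, and hands the result to an Updater. A single weight update under a SimpleUpdater-style schedule might look like this sketch (not the library implementation):

    import org.jblas.DoubleMatrix
    import scala.math.sqrt

    // One SGD step: w <- w - (stepSize / sqrt(iter)) * avgGradient.
    def sgdStep(weights: DoubleMatrix, avgGradient: DoubleMatrix,
                stepSize: Double, iter: Int): DoubleMatrix =
      weights.sub(avgGradient.mul(stepSize / sqrt(iter)))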
23 changes: 19 additions & 4 deletions mllib/src/main/scala/spark/mllib/optimization/Updater.scala
@@ -20,10 +20,14 @@ package spark.mllib.optimization
import scala.math._
import org.jblas.DoubleMatrix

/**
* Class used to update weights used in Gradient Descent.
*/
abstract class Updater extends Serializable {
/**
* Compute an updated value for weights given the gradient, stepSize and iteration number.
* Also returns the regularization value computed using the *updated* weights.
* Compute an updated value for weights given the gradient, stepSize, iteration number and
* regularization parameter. Also returns the regularization value computed using the
* *updated* weights.
*
* @param weightsOld - Column matrix of size nx1 where n is the number of features.
* @param gradient - Column matrix of size nx1 where n is the number of features.
@@ -38,6 +42,10 @@ abstract class Updater extends Serializable {
regParam: Double): (DoubleMatrix, Double)
}

/**
* A simple updater that adaptively decreases the learning rate as the inverse
* square root of the iteration number. Does not perform any regularization.
*/
class SimpleUpdater extends Updater {
override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
@@ -48,11 +56,15 @@ class SimpleUpdater extends Updater {
}

/**
* L1 regularization -- corresponding proximal operator is the soft-thresholding function
* That is, each weight component is shrunk towards 0 by shrinkageVal
* Updater that adjusts learning rate and performs L1 regularization.
*
* The corresponding proximal operator used is the soft-thresholding function.
* That is, each weight component is shrunk towards 0 by shrinkageVal.
*
* If w > shrinkageVal, set weight component to w-shrinkageVal.
* If w < -shrinkageVal, set weight component to w+shrinkageVal.
* If -shrinkageVal < w < shrinkageVal, set weight component to 0.
*
* Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal)
*/
class L1Updater extends Updater {
@@ -72,6 +84,9 @@ class L1Updater extends Updater {
}
}

/**
* Updater that adjusts the learning rate and performs L2 regularization.
*/
class SquaredL2Updater extends Updater {
override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix,
stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = {
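The L1Updater shrinkage rule quoted above is the soft-thresholding operator. A tiny self-contained sketch with worked values:

    import scala.math.{abs, max, signum}

    // Each weight component becomes signum(w) * max(0.0, abs(w) - shrinkageVal).
    def softThreshold(w: Double, shrinkageVal: Double): Double =
      signum(w) * max(0.0, abs(w) - shrinkageVal)

    softThreshold( 0.8, 0.3)  //  0.5
    softThreshold(-0.8, 0.3)  // -0.5
    softThreshold( 0.2, 0.3)  //  0.0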
mllib/src/main/scala/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -22,6 +22,15 @@ import spark.SparkContext._

import org.jblas._

/**
* Model representing the result of matrix factorization.
*
* @param rank Rank for the features in this model.
* @param userFeatures RDD of tuples where each tuple represents the userId and
* the features computed for this user.
* @param productFeatures RDD of tuples where each tuple represents the productId
* and the features computed for this product.
*/
class MatrixFactorizationModel(
val rank: Int,
val userFeatures: RDD[(Int, Array[Double])],
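For orientation, the rating such a model predicts for a (user, product) pair is the dot product of the two rank-dimensional feature vectors. A sketch with made-up values, assuming jblas:

    import org.jblas.DoubleMatrix

    // rank = 2; predicted rating = userVec . productVec.
    val userVec    = new DoubleMatrix(2, 1, 0.9, 1.2)
    val productVec = new DoubleMatrix(2, 1, 1.1, 0.4)
    val predictedRating = userVec.dot(productVec)  // 0.9*1.1 + 1.2*0.4 = 1.47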
mllib/src/main/scala/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -24,8 +24,11 @@ import org.jblas.DoubleMatrix

/**
* GeneralizedLinearModel (GLM) represents a model trained using
* GeneralizedLinearAlgorithm. GLMs consist of a weight vector,
* GeneralizedLinearAlgorithm. GLMs consist of a weight vector and
* an intercept.
*
* @param weights Weights computed for every feature.
* @param intercept Intercept computed for this model.
*/
abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept: Double)
extends Serializable {
@@ -43,6 +46,12 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
intercept: Double): Double

/**
* Predict values for the given data set using the model trained.
*
* @param testData RDD representing data points to be predicted
* @return RDD[Double] where each entry contains the corresponding prediction
*/
def predict(testData: spark.RDD[Array[Double]]): RDD[Double] = {
// A small optimization to avoid serializing the entire model. Only the weightsMatrix
// and intercept are needed.
@@ -55,24 +64,33 @@ abstract class GeneralizedLinearModel(val weights: Array[Double], val intercept:
}
}

/**
* Predict values for a single data point using the model trained.
*
* @param testData array representing a single data point
* @return Double prediction from the trained model
*/
def predict(testData: Array[Double]): Double = {
val dataMat = new DoubleMatrix(1, testData.length, testData:_*)
predictPoint(dataMat, weightsMatrix, intercept)
}
}

/**
* GeneralizedLinearAlgorithm abstracts out the training for all GLMs.
* GeneralizedLinearAlgorithm implements methods to train a Generalized Linear Model (GLM).
* This class should be extended with an Optimizer to create a new GLM.
*/
abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
extends Logging with Serializable {

val optimizer: Optimizer

def createModel(weights: Array[Double], intercept: Double): M
/**
* Create a model given the weights and intercept
*/
protected def createModel(weights: Array[Double], intercept: Double): M

var addIntercept: Boolean
protected var addIntercept: Boolean

/**
* Set if the algorithm should add an intercept. Default true.
@@ -82,12 +100,20 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
this
}

/**
* Run the algorithm with the configured parameters on an input
* RDD of LabeledPoint entries.
*/
def run(input: RDD[LabeledPoint]) : M = {
val nfeatures: Int = input.first().features.length
val initialWeights = Array.fill(nfeatures)(1.0)
run(input, initialWeights)
}

/**
* Run the algorithm with the configured parameters on an input RDD
* of LabeledPoint entries starting from the initial weights provided.
*/
def run(input: RDD[LabeledPoint], initialWeights: Array[Double]) : M = {

// Add an extra variable consisting of all 1.0's for the intercept.
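The GLM split documented above, an abstract predictPoint plus shared predict plumbing, can be sketched outside the library as follows. ExampleGLM and ExampleLinearModel are hypothetical stand-ins, assuming jblas:

    import org.jblas.DoubleMatrix

    abstract class ExampleGLM(val weights: Array[Double], val intercept: Double) {
      private val weightsMatrix = new DoubleMatrix(weights.length, 1, weights: _*)

      // Each concrete model defines only how one point is scored.
      def predictPoint(dataMatrix: DoubleMatrix, weightMatrix: DoubleMatrix,
                       intercept: Double): Double

      // Shared plumbing: wrap the raw features, delegate to predictPoint.
      def predict(testData: Array[Double]): Double = {
        val dataMat = new DoubleMatrix(1, testData.length, testData: _*)
        predictPoint(dataMat, weightsMatrix, intercept)
      }
    }

    // A plain linear regressor as the concrete model: prediction = w . x + b.
    class ExampleLinearModel(w: Array[Double], b: Double) extends ExampleGLM(w, b) {
      def predictPoint(x: DoubleMatrix, wMat: DoubleMatrix, b: Double): Double =
        x.dot(wMat) + b
    }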
10 changes: 7 additions & 3 deletions mllib/src/main/scala/spark/mllib/regression/Lasso.scala
@@ -24,8 +24,10 @@ import spark.mllib.util.MLUtils
import org.jblas.DoubleMatrix

/**
* Lasso using Stochastic Gradient Descent.
* Regression model trained using Lasso.
*
* @param weights Weights computed for every feature.
* @param intercept Intercept computed for this model.
*/
class LassoModel(
override val weights: Array[Double],
@@ -39,8 +41,10 @@ class LassoModel(
}
}


class LassoWithSGD (
/**
* Train a regression model with L1-regularization using Stochastic Gradient Descent.
*/
class LassoWithSGD private (
var stepSize: Double,
var numIterations: Int,
var regParam: Double,
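Lasso pairs a squared loss with L1 shrinkage. One illustrative proximal-gradient step for a single weight component, a sketch under those assumptions rather than the library code:

    import scala.math.{abs, max, signum, sqrt}

    // Gradient step on the squared loss, then L1 soft-thresholding.
    def lassoStep(w: Double, grad: Double, stepSize: Double,
                  regParam: Double, iter: Int): Double = {
      val eta = stepSize / sqrt(iter)          // decaying learning rate
      val shifted = w - eta * grad             // plain gradient step
      val shrinkageVal = regParam * eta        // shrinkage scales with regParam
      signum(shifted) * max(0.0, abs(shifted) - shrinkageVal)
    }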
mllib/src/main/scala/spark/mllib/regression/RidgeRegression.scala
@@ -168,10 +168,10 @@ class RidgeRegression private (var lambdaLow: Double, var lambdaHigh: Double)

/**
* Top-level methods for calling Ridge Regression.
* NOTE(shivaram): We use multiple train methods instead of default arguments to support
* Java programs.
*/
object RidgeRegression {
// NOTE(shivaram): We use multiple train methods instead of default arguments to support
// Java programs.

/**
* Train a ridge regression model given an RDD of (response, features) pairs.
10 changes: 7 additions & 3 deletions mllib/src/main/scala/spark/mllib/util/KMeansDataGenerator.scala
@@ -21,12 +21,16 @@ import scala.util.Random

import spark.{RDD, SparkContext}

/**
* Generate test data for KMeans. This object first chooses k cluster centers
* from a d-dimensional Gaussian distribution scaled by factor r and then creates a Gaussian
* cluster with scale 1 around each center.
*/

object KMeansDataGenerator {

/**
* Generate an RDD containing test data for KMeans. This function chooses k cluster centers
* from a d-dimensional Gaussian distribution scaled by factor r, then creates a Gaussian
* cluster with scale 1 around each center.
* Generate an RDD containing test data for KMeans.
*
* @param sc SparkContext to use for creating the RDD
* @param numPoints Number of points that will be contained in the RDD
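The generation scheme the scaladoc describes fits in a few lines; generatePoints is a hypothetical stand-in for the object's RDD-producing method, using plain arrays instead of an RDD:

    import scala.util.Random

    // Draw k centers from a d-dimensional Gaussian scaled by r, then sample
    // each point from a unit-scale Gaussian around a randomly chosen center.
    def generatePoints(k: Int, d: Int, r: Double,
                       numPoints: Int, seed: Long): Array[Array[Double]] = {
      val rand = new Random(seed)
      val centers = Array.fill(k, d)(rand.nextGaussian() * r)
      Array.fill(numPoints) {
        val center = centers(rand.nextInt(k))
        center.map(_ + rand.nextGaussian())
      }
    }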