
Commit 60022bf

yanboliang authored and jkbradley committed
[SPARK-18318][ML] ML, Graph 2.1 QA: API: New Scala APIs, docs
## What changes were proposed in this pull request?

API review for 2.1, except ```LSH``` related classes which are still under development.

## How was this patch tested?

Only doc changes, no new tests.

Author: Yanbo Liang <[email protected]>

Closes apache#16009 from yanboliang/spark-18318.
1 parent: c51c772

File tree: 10 files changed, +31 −23 lines

docs/ml-features.md

+3 −1

@@ -1188,7 +1188,9 @@ categorical features. The number of bins is set by the `numBuckets` parameter. I
 that the number of buckets used will be smaller than this value, for example, if there are too few
 distinct values of the input to create enough distinct quantiles.

-NaN values: Note also that QuantileDiscretizer
+NaN values:
+NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will produce
+a `Bucketizer` model for making predictions. During the transformation, `Bucketizer`
 will raise an error when it finds NaN values in the dataset, but the user can also choose to either
 keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep
 NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets
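A minimal sketch of the documented behavior, assuming the 2.1 `QuantileDiscretizer` API and a hypothetical DataFrame `df` with a numeric `hour` column that may contain NaN:

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

// NaN rows are dropped while quantiles are estimated during fit();
// handleInvalid = "keep" then routes NaN into its own bucket at transform time.
val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("result")
  .setNumBuckets(3)
  .setHandleInvalid("keep")
val bucketed = discretizer.fit(df).transform(df) // df: assumed input DataFrame
```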

mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

+3 −3

@@ -312,7 +312,6 @@ class LogisticRegression @Since("1.2.0") (

   private var optInitialModel: Option[LogisticRegressionModel] = None

-  /** @group setParam */
   private[spark] def setInitialModel(model: LogisticRegressionModel): this.type = {
     this.optInitialModel = Some(model)
     this
@@ -323,8 +322,9 @@
     train(dataset, handlePersistence)
   }

-  protected[spark] def train(dataset: Dataset[_], handlePersistence: Boolean):
-    LogisticRegressionModel = {
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): LogisticRegressionModel = {
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
     val instances: RDD[Instance] =
       dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
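For context, the `lit(1.0)` fallback above means every instance gets unit weight unless a weight column is configured. A hedged sketch of the user-facing side, with "weight" as a hypothetical column name:

```scala
import org.apache.spark.ml.classification.LogisticRegression

// Without setWeightCol, train() weights every instance by the constant 1.0.
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setWeightCol("weight") // hypothetical column of instance weights
```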

mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala

+1 −1

@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DoubleType
 /**
  * Params for Naive Bayes Classifiers.
  */
-private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
+private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {

   /**
    * The smoothing parameter.
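The trait stays internal; the smoothing parameter is reached through the estimator's setters. A brief sketch, assuming the 2.1 `NaiveBayes` API:

```scala
import org.apache.spark.ml.classification.NaiveBayes

// Additive (Laplace) smoothing; 1.0 is the default.
val nb = new NaiveBayes()
  .setSmoothing(1.0)
  .setModelType("multinomial")
```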

mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala

+4 −3

@@ -84,11 +84,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))

   /** @group getParam */
   @Since("2.1.0")
@@ -145,7 +146,7 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
   private[feature] val SKIP_INVALID: String = "skip"
   private[feature] val ERROR_INVALID: String = "error"
   private[feature] val KEEP_INVALID: String = "keep"
-  private[feature] val supportedHandleInvalid: Array[String] =
+  private[feature] val supportedHandleInvalids: Array[String] =
     Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)

   /**
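A short usage sketch of the renamed validator in effect, assuming the 2.1 `Bucketizer` API and a hypothetical "features" input column:

```scala
import org.apache.spark.ml.feature.Bucketizer

// handleInvalid must be one of Bucketizer.supportedHandleInvalids:
// "skip", "error" (the default), or "keep".
val bucketizer = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketed")
  .setSplits(Array(Double.NegativeInfinity, 0.0, 10.0, Double.PositiveInfinity))
  .setHandleInvalid("skip") // drop rows whose value is NaN
```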

mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala

+2

@@ -82,11 +82,13 @@ private[feature] trait ChiSqSelectorParams extends Params
    * Default value is 0.05.
    * @group param
    */
+  @Since("2.1.0")
   final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.",
     ParamValidators.inRange(0, 1))
   setDefault(fpr -> 0.05)

   /** @group getParam */
+  @Since("2.1.0")
   def getFpr: Double = $(fpr)

   /**
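The `fpr` param backs the false-positive-rate selection mode. A hedged sketch, assuming the 2.1 `ChiSqSelector` setters:

```scala
import org.apache.spark.ml.feature.ChiSqSelector

// Keep features whose chi-squared test p-value is below 0.05.
val selector = new ChiSqSelector()
  .setSelectorType("fpr")
  .setFpr(0.05)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
```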

mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala

+7 −4

@@ -72,11 +72,12 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make QuantileDiscretizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
   setDefault(handleInvalid, Bucketizer.ERROR_INVALID)

   /** @group getParam */
@@ -91,8 +92,10 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * possible that the number of buckets used will be smaller than this value, for example, if there
    * are too few distinct values of the input to create enough distinct quantiles.
    *
-   * NaN handling: Note also that
-   * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can
+   * NaN handling:
+   * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
+   * produce a `Bucketizer` model for making predictions. During the transformation,
+   * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
    * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
    * If the user chooses to keep NaN values, they will be handled specially and placed into their own
    * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3],
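As the updated scaladoc notes, fitting yields a `Bucketizer` that does the actual bucketing. A minimal sketch, with `hourDf` standing in for an assumed DataFrame holding a numeric `hour` column:

```scala
import org.apache.spark.ml.feature.{Bucketizer, QuantileDiscretizer}

// QuantileDiscretizer is an Estimator[Bucketizer]: fit() estimates the
// splits (ignoring NaN) and returns a Bucketizer model.
val model: Bucketizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("result")
  .setNumBuckets(4)
  .fit(hourDf) // hourDf: assumed input DataFrame
```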

mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala

+4 −4

@@ -34,15 +34,15 @@ import org.apache.spark.mllib.linalg.CholeskyDecomposition
  * @param objectiveHistory Option containing the objective history when an optimization program is
  *                         used to solve the normal equations. None when an analytic solver is used.
  */
-private[ml] class NormalEquationSolution(
+private[optim] class NormalEquationSolution(
     val coefficients: Array[Double],
     val aaInv: Option[Array[Double]],
     val objectiveHistory: Option[Array[Double]])

 /**
  * Interface for classes that solve the normal equations locally.
  */
-private[ml] sealed trait NormalEquationSolver {
+private[optim] sealed trait NormalEquationSolver {

   /** Solve the normal equations from summary statistics. */
   def solve(
@@ -56,7 +56,7 @@ private[ml] sealed trait NormalEquationSolver {
 /**
  * A class that solves the normal equations directly, using Cholesky decomposition.
  */
-private[ml] class CholeskySolver extends NormalEquationSolver {
+private[optim] class CholeskySolver extends NormalEquationSolver {

   override def solve(
       bBar: Double,
@@ -75,7 +75,7 @@ private[ml] class CholeskySolver extends NormalEquationSolver {
 /**
  * A class for solving the normal equations using Quasi-Newton optimization methods.
  */
-private[ml] class QuasiNewtonSolver(
+private[optim] class QuasiNewtonSolver(
     fitIntercept: Boolean,
     maxIter: Int,
     tol: Double,
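These solvers stay internal, but the underlying idea is ordinary least squares: solve `(X^T X) w = X^T y`, which `CholeskySolver` does by factoring the Gram matrix. A toy illustration using breeze (a Spark dependency), not the internal API:

```scala
import breeze.linalg.{DenseMatrix, DenseVector}

// Normal equations for least squares: (X^T X) w = X^T y.
val x = DenseMatrix((1.0, 2.0), (1.0, 3.0), (1.0, 5.0)) // design matrix
val y = DenseVector(3.0, 4.0, 6.0)                      // targets
val w = (x.t * x) \ (x.t * y)                           // solve the 2x2 system
```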

mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala

+3 −3

@@ -392,13 +392,13 @@ class NaiveBayes private (
 object NaiveBayes {

   /** String name for multinomial model type. */
-  private[spark] val Multinomial: String = "multinomial"
+  private[classification] val Multinomial: String = "multinomial"

   /** String name for Bernoulli model type. */
-  private[spark] val Bernoulli: String = "bernoulli"
+  private[classification] val Bernoulli: String = "bernoulli"

   /* Set of modelTypes that NaiveBayes supports */
-  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)

   /**
    * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
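Callers pass these model types as plain strings. A hedged sketch of the RDD-based API, with `training` standing in for an assumed `RDD[LabeledPoint]`:

```scala
import org.apache.spark.mllib.classification.NaiveBayes

// modelType must be "multinomial" (the default) or "bernoulli".
val model = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")
```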

mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala

+1 −1

@@ -266,7 +266,7 @@ private[spark] object ChiSqSelector {
   val Percentile: String = "percentile"

   /** String name for `fpr` selector type. */
-  private[spark] val FPR: String = "fpr"
+  val FPR: String = "fpr"

   /** Set of selector types that ChiSqSelector supports. */
   val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
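With `FPR` now public inside the (still `private[spark]`) companion object, the string backs the selector-type setting. A sketch, assuming the 2.1 RDD-based setters and an assumed `data: RDD[LabeledPoint]`:

```scala
import org.apache.spark.mllib.feature.ChiSqSelector

// Select features whose chi-squared p-value is below 0.05.
val selector = new ChiSqSelector().setSelectorType("fpr").setFpr(0.05)
val model = selector.fit(data) // data: assumed RDD[LabeledPoint]
```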

mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala

+3 −3

@@ -131,17 +131,17 @@ class HashingTF(val numFeatures: Int) extends Serializable {

 object HashingTF {

-  private[spark] val Native: String = "native"
+  private[HashingTF] val Native: String = "native"

-  private[spark] val Murmur3: String = "murmur3"
+  private[HashingTF] val Murmur3: String = "murmur3"

   private val seed = 42

   /**
    * Calculate a hash code value for the term object using the native Scala implementation.
    * This is the default hash algorithm used in Spark 1.6 and earlier.
    */
-  private[spark] def nativeHash(term: Any): Int = term.##
+  private[HashingTF] def nativeHash(term: Any): Int = term.##

   /**
    * Calculate a hash code value for the term object using
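The hashing internals are now hidden behind the class; users only see the transform. A minimal sketch of the public API:

```scala
import org.apache.spark.mllib.feature.HashingTF

// Terms are hashed into a sparse term-frequency vector. MurmurHash3 is the
// default since 2.0; the "native" Scala hash was the default in 1.6 and earlier.
val tf = new HashingTF(1 << 18)
val vector = tf.transform(Seq("spark", "ml", "spark"))
```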
