
Commit 60022bf

yanboliang authored and jkbradley committed
[SPARK-18318][ML] ML, Graph 2.1 QA: API: New Scala APIs, docs
## What changes were proposed in this pull request?

API review for 2.1, except ```LSH``` related classes which are still under development.

## How was this patch tested?

Only doc changes, no new tests.

Author: Yanbo Liang <[email protected]>

Closes apache#16009 from yanboliang/spark-18318.
1 parent: c51c772

File tree: 10 files changed, +31 −23 lines

docs/ml-features.md

+3 −1

@@ -1188,7 +1188,9 @@ categorical features. The number of bins is set by the `numBuckets` parameter. I
 that the number of buckets used will be smaller than this value, for example, if there are too few
 distinct values of the input to create enough distinct quantiles.

-NaN values: Note also that QuantileDiscretizer
+NaN values:
+NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will produce
+a `Bucketizer` model for making predictions. During the transformation, `Bucketizer`
 will raise an error when it finds NaN values in the dataset, but the user can also choose to either
 keep or remove NaN values within the dataset by setting `handleInvalid`. If the user chooses to keep
 NaN values, they will be handled specially and placed into their own bucket, for example, if 4 buckets
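A minimal sketch of the documented behavior, assuming the 2.1 `QuantileDiscretizer` API and a hypothetical DataFrame `df` with a numeric `hour` column that may contain NaN:

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

// NaN rows are dropped while quantiles are estimated during fit();
// handleInvalid = "keep" then routes NaN into its own bucket at transform time.
val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("result")
  .setNumBuckets(3)
  .setHandleInvalid("keep")
val bucketed = discretizer.fit(df).transform(df) // df: assumed input DataFrame
```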

mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

+3 −3

@@ -312,7 +312,6 @@ class LogisticRegression @Since("1.2.0") (

   private var optInitialModel: Option[LogisticRegressionModel] = None

-  /** @group setParam */
   private[spark] def setInitialModel(model: LogisticRegressionModel): this.type = {
     this.optInitialModel = Some(model)
     this
@@ -323,8 +322,9 @@
     train(dataset, handlePersistence)
   }

-  protected[spark] def train(dataset: Dataset[_], handlePersistence: Boolean):
-    LogisticRegressionModel = {
+  protected[spark] def train(
+      dataset: Dataset[_],
+      handlePersistence: Boolean): LogisticRegressionModel = {
     val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
     val instances: RDD[Instance] =
       dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
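For context, the `lit(1.0)` fallback above means every instance gets unit weight unless a weight column is configured. A hedged sketch of the user-facing side, with "weight" as a hypothetical column name:

```scala
import org.apache.spark.ml.classification.LogisticRegression

// Without setWeightCol, train() weights every instance by the constant 1.0.
val lr = new LogisticRegression()
  .setMaxIter(10)
  .setWeightCol("weight") // hypothetical column of instance weights
```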

mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala

+1 −1

@@ -33,7 +33,7 @@ import org.apache.spark.sql.types.DoubleType
 /**
  * Params for Naive Bayes Classifiers.
  */
-private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
+private[classification] trait NaiveBayesParams extends PredictorParams with HasWeightCol {

   /**
    * The smoothing parameter.
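The trait stays internal; the smoothing parameter is reached through the estimator's setters. A brief sketch, assuming the 2.1 `NaiveBayes` API:

```scala
import org.apache.spark.ml.classification.NaiveBayes

// Additive (Laplace) smoothing; 1.0 is the default.
val nb = new NaiveBayes()
  .setSmoothing(1.0)
  .setModelType("multinomial")
```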

mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala

+4 −3

@@ -84,11 +84,12 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make Bucketizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))

   /** @group getParam */
   @Since("2.1.0")
@@ -145,7 +146,7 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] {
   private[feature] val SKIP_INVALID: String = "skip"
   private[feature] val ERROR_INVALID: String = "error"
   private[feature] val KEEP_INVALID: String = "keep"
-  private[feature] val supportedHandleInvalid: Array[String] =
+  private[feature] val supportedHandleInvalids: Array[String] =
     Array(SKIP_INVALID, ERROR_INVALID, KEEP_INVALID)

   /**
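A short usage sketch of the renamed validator in effect, assuming the 2.1 `Bucketizer` API and a hypothetical "features" input column:

```scala
import org.apache.spark.ml.feature.Bucketizer

// handleInvalid must be one of Bucketizer.supportedHandleInvalids:
// "skip", "error" (the default), or "keep".
val bucketizer = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketed")
  .setSplits(Array(Double.NegativeInfinity, 0.0, 10.0, Double.PositiveInfinity))
  .setHandleInvalid("skip") // drop rows whose value is NaN
```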

mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala

+2

@@ -82,11 +82,13 @@ private[feature] trait ChiSqSelectorParams extends Params
    * Default value is 0.05.
    * @group param
    */
+  @Since("2.1.0")
   final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.",
     ParamValidators.inRange(0, 1))
   setDefault(fpr -> 0.05)

   /** @group getParam */
+  @Since("2.1.0")
   def getFpr: Double = $(fpr)

   /**
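The `fpr` param backs the false-positive-rate selection mode. A hedged sketch, assuming the 2.1 `ChiSqSelector` setters:

```scala
import org.apache.spark.ml.feature.ChiSqSelector

// Keep features whose chi-squared test p-value is below 0.05.
val selector = new ChiSqSelector()
  .setSelectorType("fpr")
  .setFpr(0.05)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
```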

mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala

+7 −4

@@ -72,11 +72,12 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * Default: "error"
    * @group param
    */
+  // TODO: SPARK-18619 Make QuantileDiscretizer inherit from HasHandleInvalid.
   @Since("2.1.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle" +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "invalid entries. Options are skip (filter out rows with invalid values), " +
     "error (throw an error), or keep (keep invalid values in a special additional bucket).",
-    ParamValidators.inArray(Bucketizer.supportedHandleInvalid))
+    ParamValidators.inArray(Bucketizer.supportedHandleInvalids))
   setDefault(handleInvalid, Bucketizer.ERROR_INVALID)

   /** @group getParam */
@@ -91,8 +92,10 @@ private[feature] trait QuantileDiscretizerBase extends Params
    * possible that the number of buckets used will be smaller than this value, for example, if there
    * are too few distinct values of the input to create enough distinct quantiles.
    *
-   * NaN handling: Note also that
-   * QuantileDiscretizer will raise an error when it finds NaN values in the dataset, but the user can
+   * NaN handling:
+   * NaN values will be removed from the column during `QuantileDiscretizer` fitting. This will
+   * produce a `Bucketizer` model for making predictions. During the transformation,
+   * `Bucketizer` will raise an error when it finds NaN values in the dataset, but the user can
    * also choose to either keep or remove NaN values within the dataset by setting `handleInvalid`.
    * If the user chooses to keep NaN values, they will be handled specially and placed into their own
    * bucket, for example, if 4 buckets are used, then non-NaN data will be put into buckets[0-3],
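As the updated scaladoc notes, fitting yields a `Bucketizer` that does the actual bucketing. A minimal sketch, with `hourDf` standing in for an assumed DataFrame holding a numeric `hour` column:

```scala
import org.apache.spark.ml.feature.{Bucketizer, QuantileDiscretizer}

// QuantileDiscretizer is an Estimator[Bucketizer]: fit() estimates the
// splits (ignoring NaN) and returns a Bucketizer model.
val model: Bucketizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("result")
  .setNumBuckets(4)
  .fit(hourDf) // hourDf: assumed input DataFrame
```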

mllib/src/main/scala/org/apache/spark/ml/optim/NormalEquationSolver.scala

+4 −4

@@ -34,15 +34,15 @@ import org.apache.spark.mllib.linalg.CholeskyDecomposition
  * @param objectiveHistory Option containing the objective history when an optimization program is
  *                         used to solve the normal equations. None when an analytic solver is used.
  */
-private[ml] class NormalEquationSolution(
+private[optim] class NormalEquationSolution(
     val coefficients: Array[Double],
     val aaInv: Option[Array[Double]],
     val objectiveHistory: Option[Array[Double]])

 /**
  * Interface for classes that solve the normal equations locally.
  */
-private[ml] sealed trait NormalEquationSolver {
+private[optim] sealed trait NormalEquationSolver {

   /** Solve the normal equations from summary statistics. */
   def solve(
@@ -56,7 +56,7 @@ private[ml] sealed trait NormalEquationSolver {
 /**
  * A class that solves the normal equations directly, using Cholesky decomposition.
  */
-private[ml] class CholeskySolver extends NormalEquationSolver {
+private[optim] class CholeskySolver extends NormalEquationSolver {

   override def solve(
       bBar: Double,
@@ -75,7 +75,7 @@ private[ml] class CholeskySolver extends NormalEquationSolver {
 /**
  * A class for solving the normal equations using Quasi-Newton optimization methods.
  */
-private[ml] class QuasiNewtonSolver(
+private[optim] class QuasiNewtonSolver(
     fitIntercept: Boolean,
     maxIter: Int,
     tol: Double,
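These solvers stay internal, but the underlying idea is ordinary least squares: solve `(X^T X) w = X^T y`, which `CholeskySolver` does by factoring the Gram matrix. A toy illustration using breeze (a Spark dependency), not the internal API:

```scala
import breeze.linalg.{DenseMatrix, DenseVector}

// Normal equations for least squares: (X^T X) w = X^T y.
val x = DenseMatrix((1.0, 2.0), (1.0, 3.0), (1.0, 5.0)) // design matrix
val y = DenseVector(3.0, 4.0, 6.0)                      // targets
val w = (x.t * x) \ (x.t * y)                           // solve the 2x2 system
```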

mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala

+3 −3

@@ -392,13 +392,13 @@ class NaiveBayes private (
 object NaiveBayes {

   /** String name for multinomial model type. */
-  private[spark] val Multinomial: String = "multinomial"
+  private[classification] val Multinomial: String = "multinomial"

   /** String name for Bernoulli model type. */
-  private[spark] val Bernoulli: String = "bernoulli"
+  private[classification] val Bernoulli: String = "bernoulli"

   /* Set of modelTypes that NaiveBayes supports */
-  private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli)
+  private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli)

   /**
    * Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
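Callers pass these model types as plain strings. A hedged sketch of the RDD-based API, with `training` standing in for an assumed `RDD[LabeledPoint]`:

```scala
import org.apache.spark.mllib.classification.NaiveBayes

// modelType must be "multinomial" (the default) or "bernoulli".
val model = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")
```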

mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala

+1 −1

@@ -266,7 +266,7 @@ private[spark] object ChiSqSelector {
   val Percentile: String = "percentile"

   /** String name for `fpr` selector type. */
-  private[spark] val FPR: String = "fpr"
+  val FPR: String = "fpr"

   /** Set of selector types that ChiSqSelector supports. */
   val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
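With `FPR` now public inside the (still `private[spark]`) companion object, the string backs the selector-type setting. A sketch, assuming the 2.1 RDD-based setters and an assumed `data: RDD[LabeledPoint]`:

```scala
import org.apache.spark.mllib.feature.ChiSqSelector

// Select features whose chi-squared p-value is below 0.05.
val selector = new ChiSqSelector().setSelectorType("fpr").setFpr(0.05)
val model = selector.fit(data) // data: assumed RDD[LabeledPoint]
```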

mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala

+3 −3

@@ -131,17 +131,17 @@ class HashingTF(val numFeatures: Int) extends Serializable {

 object HashingTF {

-  private[spark] val Native: String = "native"
+  private[HashingTF] val Native: String = "native"

-  private[spark] val Murmur3: String = "murmur3"
+  private[HashingTF] val Murmur3: String = "murmur3"

   private val seed = 42

   /**
    * Calculate a hash code value for the term object using the native Scala implementation.
    * This is the default hash algorithm used in Spark 1.6 and earlier.
    */
-  private[spark] def nativeHash(term: Any): Int = term.##
+  private[HashingTF] def nativeHash(term: Any): Int = term.##

   /**
    * Calculate a hash code value for the term object using
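The hashing internals are now hidden behind the class; users only see the transform. A minimal sketch of the public API:

```scala
import org.apache.spark.mllib.feature.HashingTF

// Terms are hashed into a sparse term-frequency vector. MurmurHash3 is the
// default since 2.0; the "native" Scala hash was the default in 1.6 and earlier.
val tf = new HashingTF(1 << 18)
val vector = tf.transform(Seq("spark", "ml", "spark"))
```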
