[SPARK-7678] [ML] Fix default random seed in HasSeed
Changed the shared param HasSeed to have a default based on the hashCode of the class name, instead of a random number.
Also, removed fixed random seeds from Word2Vec and ALS.

CC: mengxr

Author: Joseph K. Bradley <[email protected]>

Closes apache#6251 from jkbradley/scala-fixed-seed and squashes the following commits:

0e37184 [Joseph K. Bradley] Fixed Word2VecSuite, ALSSuite in spark.ml to use original fixed random seeds
678ec3a [Joseph K. Bradley] Removed fixed random seeds from Word2Vec and ALS. Changed shared param HasSeed to have default based on hashCode of class name, instead of random number.
jkbradley authored and mengxr committed May 19, 2015
1 parent fb90273 commit 7b16e9f
Showing 6 changed files with 14 additions and 12 deletions.
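
Before the diffs, a minimal standalone sketch of the behavioral difference described in the commit message (simplified, made-up trait names; not the actual spark.ml code, which goes through Params/setDefault):

import scala.util.Random

// Old behavior (sketch): the default seed was drawn once at initialization,
// so it changed from one JVM run to the next.
trait OldDefaultSeed {
  val defaultSeed: Long = Random.nextLong()
}

// New behavior (sketch): the default seed is a function of the concrete class name,
// so it is stable across runs but still differs between algorithms.
trait NewDefaultSeed {
  val defaultSeed: Long = this.getClass.getName.hashCode.toLong
}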
@@ -68,7 +68,6 @@ private[feature] trait Word2VecBase extends Params

   setDefault(stepSize -> 0.025)
   setDefault(maxIter -> 1)
-  setDefault(seed -> 42L)

   /**
    * Validate and transform the input schema.
@@ -53,7 +53,7 @@ private[shared] object SharedParamsCodeGen {
      ParamDesc[Int]("checkpointInterval", "checkpoint interval (>= 1)",
        isValid = "ParamValidators.gtEq(1)"),
      ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")),
-     ParamDesc[Long]("seed", "random seed", Some("Utils.random.nextLong()")),
+     ParamDesc[Long]("seed", "random seed", Some("this.getClass.getName.hashCode.toLong")),
      ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]." +
        " For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
        isValid = "ParamValidators.inRange(0, 1)"),
@@ -232,7 +232,7 @@ private[ml] trait HasFitIntercept extends Params {
}

/**
- * (private[ml]) Trait for shared param seed (default: Utils.random.nextLong()).
+ * (private[ml]) Trait for shared param seed (default: this.getClass.getName.hashCode.toLong).
 */
private[ml] trait HasSeed extends Params {

@@ -242,7 +242,7 @@ private[ml] trait HasSeed extends Params {
   */
  final val seed: LongParam = new LongParam(this, "seed", "random seed")

-  setDefault(seed, Utils.random.nextLong())
+  setDefault(seed, this.getClass.getName.hashCode.toLong)

  /** @group getParam */
  final def getSeed: Long = $(seed)
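
As a hypothetical usage sketch of what this default gives (made-up classes, simplified trait; not the generated spark.ml code): the value is identical for all instances of the same class and stable across JVM runs, while tests that need an exact value still set the seed explicitly, as the test changes below do.

trait HasDefaultSeed {
  // Stable per concrete class across JVM runs; typically differs between classes.
  def defaultSeed: Long = this.getClass.getName.hashCode.toLong
}

class FakeWord2Vec extends HasDefaultSeed
class FakeALS extends HasDefaultSeed

object SeedDefaultDemo extends App {
  val a = new FakeWord2Vec
  val b = new FakeWord2Vec
  val c = new FakeALS
  println(a.defaultSeed == b.defaultSeed)  // true: same class, same default seed
  println(a.defaultSeed == c.defaultSeed)  // almost certainly false: different class names
}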
@@ -148,7 +148,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR

  setDefault(rank -> 10, maxIter -> 10, regParam -> 0.1, numUserBlocks -> 10, numItemBlocks -> 10,
    implicitPrefs -> false, alpha -> 1.0, userCol -> "user", itemCol -> "item",
-    ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10, seed -> 0L)
+    ratingCol -> "rating", nonnegative -> false, checkpointInterval -> 10)

  /**
   * Validates and transforms the input schema.
@@ -52,6 +52,7 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
      .setVectorSize(3)
      .setInputCol("text")
      .setOutputCol("result")
+      .setSeed(42L)
      .fit(docDF)

    model.transform(docDF).select("result", "expected").collect().foreach {
@@ -345,6 +345,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
      .setImplicitPrefs(implicitPrefs)
      .setNumUserBlocks(numUserBlocks)
      .setNumItemBlocks(numItemBlocks)
+      .setSeed(0)
    val alpha = als.getAlpha
    val model = als.fit(training.toDF())
    val predictions = model.transform(test.toDF())
@@ -425,17 +426,18 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
    val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)

    val longRatings = ratings.map(r => Rating(r.user.toLong, r.item.toLong, r.rating))
-    val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4)
+    val (longUserFactors, _) = ALS.train(longRatings, rank = 2, maxIter = 4, seed = 0)
    assert(longUserFactors.first()._1.getClass === classOf[Long])

    val strRatings = ratings.map(r => Rating(r.user.toString, r.item.toString, r.rating))
-    val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4)
+    val (strUserFactors, _) = ALS.train(strRatings, rank = 2, maxIter = 4, seed = 0)
    assert(strUserFactors.first()._1.getClass === classOf[String])
  }

  test("nonnegative constraint") {
    val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
-    val (userFactors, itemFactors) = ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true)
+    val (userFactors, itemFactors) =
+      ALS.train(ratings, rank = 2, maxIter = 4, nonnegative = true, seed = 0)
    def isNonnegative(factors: RDD[(Int, Array[Float])]): Boolean = {
      factors.values.map { _.forall(_ >= 0.0) }.reduce(_ && _)
    }
@@ -459,7 +461,7 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {
  test("partitioner in returned factors") {
    val (ratings, _) = genImplicitTestData(numUsers = 20, numItems = 40, rank = 2, noiseStd = 0.01)
    val (userFactors, itemFactors) = ALS.train(
-      ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4)
+      ratings, rank = 2, maxIter = 4, numUserBlocks = 3, numItemBlocks = 4, seed = 0)
    for ((tpe, factors) <- Seq(("User", userFactors), ("Item", itemFactors))) {
      assert(userFactors.partitioner.isDefined, s"$tpe factors should have partitioner.")
      val part = userFactors.partitioner.get
@@ -476,8 +478,8 @@ class ALSSuite extends FunSuite with MLlibTestSparkContext with Logging {

  test("als with large number of iterations") {
    val (ratings, _) = genExplicitTestData(numUsers = 4, numItems = 4, rank = 1)
-    ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2)
-    ALS.train(
-      ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, implicitPrefs = true)
+    ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2, seed = 0)
+    ALS.train(ratings, rank = 1, maxIter = 50, numUserBlocks = 2, numItemBlocks = 2,
+      implicitPrefs = true, seed = 0)
  }
}
