Skip to content

Commit

Permalink
[SPARK-11978][ML] Move dataset_example.py to examples/ml and rename to dataframe_example.py
Browse files Browse the repository at this point in the history

Since ```Dataset``` has a new meaning in Spark 1.6, we should rename it to avoid confusion.
#9873 finished the work on the Scala example; here we focus on the Python one.
Move dataset_example.py to ```examples/ml``` and rename to ```dataframe_example.py```.
Additionally, this fixes minor remaining issues from #9873.
cc mengxr

Author: Yanbo Liang <[email protected]>

Closes #9957 from yanboliang/SPARK-11978.
  • Loading branch information
yanboliang authored and jkbradley committed Dec 12, 2015
1 parent aea676c commit a0ff6d1
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
#

"""
An example of how to use DataFrame as a dataset for ML. Run with::
bin/spark-submit examples/src/main/python/mllib/dataset_example.py
An example of how to use DataFrame for ML. Run with::
bin/spark-submit examples/src/main/python/ml/dataframe_example.py <input>
"""
from __future__ import print_function

Expand All @@ -28,36 +28,48 @@

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.stat import Statistics


def summarize(dataset):
print("schema: %s" % dataset.schema().json())
labels = dataset.map(lambda r: r.label)
print("label average: %f" % labels.mean())
features = dataset.map(lambda r: r.features)
summary = Statistics.colStats(features)
print("features average: %r" % summary.mean())

if __name__ == "__main__":
if len(sys.argv) > 2:
print("Usage: dataset_example.py <libsvm file>", file=sys.stderr)
print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="DatasetExample")
sc = SparkContext(appName="DataFrameExample")
sqlContext = SQLContext(sc)
if len(sys.argv) == 2:
input = sys.argv[1]
else:
input = "data/mllib/sample_libsvm_data.txt"
points = MLUtils.loadLibSVMFile(sc, input)
dataset0 = sqlContext.inferSchema(points).setName("dataset0").cache()
summarize(dataset0)

# Load input data
print("Loading LIBSVM file with UDT from " + input + ".")
df = sqlContext.read.format("libsvm").load(input).cache()
print("Schema from LIBSVM:")
df.printSchema()
print("Loaded training data as a DataFrame with " +
str(df.count()) + " records.")

# Show statistical summary of labels.
labelSummary = df.describe("label")
labelSummary.show()

# Convert features column to an RDD of vectors.
features = df.select("features").map(lambda r: r.features)
summary = Statistics.colStats(features)
print("Selected features column with average values:\n" +
str(summary.mean()))

# Save the records in a parquet file.
tempdir = tempfile.NamedTemporaryFile(delete=False).name
os.unlink(tempdir)
print("Save dataset as a Parquet file to %s." % tempdir)
dataset0.saveAsParquetFile(tempdir)
print("Load it back and summarize it again.")
dataset1 = sqlContext.parquetFile(tempdir).setName("dataset1").cache()
summarize(dataset1)
print("Saving to " + tempdir + " as Parquet file.")
df.write.parquet(tempdir)

# Load the records back.
print("Loading Parquet file with UDT from " + tempdir)
newDF = sqlContext.read.parquet(tempdir)
print("Schema from Parquet:")
newDF.printSchema()
shutil.rmtree(tempdir)

sc.stop()
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ object DataFrameExample {
def main(args: Array[String]) {
val defaultParams = Params()

val parser = new OptionParser[Params]("DatasetExample") {
head("Dataset: an example app using DataFrame as a Dataset for ML.")
val parser = new OptionParser[Params]("DataFrameExample") {
head("DataFrameExample: an example app using DataFrame for ML.")
opt[String]("input")
.text(s"input path to dataset")
.text(s"input path to dataframe")
.action((x, c) => c.copy(input = x))
checkConfig { params =>
success
Expand Down Expand Up @@ -88,7 +88,7 @@ object DataFrameExample {
// Save the records in a parquet file.
val tmpDir = Files.createTempDir()
tmpDir.deleteOnExit()
val outputDir = new File(tmpDir, "dataset").toString
val outputDir = new File(tmpDir, "dataframe").toString
println(s"Saving to $outputDir as Parquet file.")
df.write.parquet(outputDir)

Expand Down

0 comments on commit a0ff6d1

Please sign in to comment.