Skip to content

Commit

Permalink
[SPARK-11646] WholeTextFileRDD should return Text rather than String
Browse files Browse the repository at this point in the history
If it returns Text, we can reuse this in Spark SQL to provide a WholeTextFile data source and directly convert the Text into UTF8String without extra string decoding and encoding.

Author: Reynold Xin <[email protected]>

Closes apache#9622 from rxin/SPARK-11646.
  • Loading branch information
rxin committed Nov 11, 2015
1 parent 27524a3 commit 95daff6
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 44 deletions.
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -863,10 +863,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
new WholeTextFileRDD(
this,
classOf[WholeTextFileInputFormat],
classOf[String],
classOf[String],
classOf[Text],
classOf[Text],
updateConf,
minPartitions).setName(path)
minPartitions).setName(path).map(record => (record._1.toString, record._2.toString))
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.input
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
Expand All @@ -33,14 +34,13 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext
*/

private[spark] class WholeTextFileInputFormat
extends CombineFileInputFormat[String, String] with Configurable {
extends CombineFileInputFormat[Text, Text] with Configurable {

override protected def isSplitable(context: JobContext, file: Path): Boolean = false

override def createRecordReader(
split: InputSplit,
context: TaskAttemptContext): RecordReader[String, String] = {

context: TaskAttemptContext): RecordReader[Text, Text] = {
val reader =
new ConfigurableCombineFileRecordReader(split, context, classOf[WholeTextFileRecordReader])
reader.setConf(getConf)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ private[spark] class WholeTextFileRecordReader(
split: CombineFileSplit,
context: TaskAttemptContext,
index: Integer)
extends RecordReader[String, String] with Configurable {
extends RecordReader[Text, Text] with Configurable {

private[this] val path = split.getPath(index)
private[this] val fs = path.getFileSystem(
Expand All @@ -58,18 +58,18 @@ private[spark] class WholeTextFileRecordReader(
// True means the current file has been processed, then skip it.
private[this] var processed = false

private[this] val key = path.toString
private[this] var value: String = null
private[this] val key: Text = new Text(path.toString)
private[this] var value: Text = null

override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {}

override def close(): Unit = {}

override def getProgress: Float = if (processed) 1.0f else 0.0f

override def getCurrentKey: String = key
override def getCurrentKey: Text = key

override def getCurrentValue: String = value
override def getCurrentValue: Text = value

override def nextKeyValue(): Boolean = {
if (!processed) {
Expand All @@ -83,7 +83,7 @@ private[spark] class WholeTextFileRecordReader(
ByteStreams.toByteArray(fileIn)
}

value = new Text(innerBuffer).toString
value = new Text(innerBuffer)
Closeables.close(fileIn, false)
processed = true
true
Expand Down
33 changes: 1 addition & 32 deletions core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,11 @@ import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.input.WholeTextFileInputFormat
import org.apache.spark._
import org.apache.spark.executor.DataReadMethod
import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil
import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD
import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager, Utils}
import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.storage.StorageLevel

Expand All @@ -59,7 +58,6 @@ private[spark] class NewHadoopPartition(
* @param inputFormatClass Storage format of the data to be read.
* @param keyClass Class of the key associated with the inputFormatClass.
* @param valueClass Class of the value associated with the inputFormatClass.
* @param conf The Hadoop configuration.
*/
@DeveloperApi
class NewHadoopRDD[K, V](
Expand Down Expand Up @@ -282,32 +280,3 @@ private[spark] object NewHadoopRDD {
}
}
}

/**
 * A [[NewHadoopRDD]] with String keys and String values whose partitions are computed from a
 * [[WholeTextFileInputFormat]].
 *
 * @param sc forwarded to the NewHadoopRDD constructor
 * @param inputFormatClass input format class; instantiated reflectively in getPartitions
 * @param keyClass forwarded to the NewHadoopRDD constructor
 * @param valueClass forwarded to the NewHadoopRDD constructor
 * @param conf forwarded to the NewHadoopRDD constructor
 * @param minPartitions value handed to the input format's setMinPartitions before splitting
 */
private[spark] class WholeTextFileRDD(
sc : SparkContext,
inputFormatClass: Class[_ <: WholeTextFileInputFormat],
keyClass: Class[String],
valueClass: Class[String],
conf: Configuration,
minPartitions: Int)
extends NewHadoopRDD[String, String](sc, inputFormatClass, keyClass, valueClass, conf) {

/**
 * Builds one [[NewHadoopPartition]] per split reported by the input format.
 *
 * @return an array with one partition per raw split, in the order the splits were returned
 */
override def getPartitions: Array[Partition] = {
val inputFormat = inputFormatClass.newInstance
// This local `conf` shadows the constructor parameter of the same name; the value used below
// is whatever getConf returns (getConf is not defined in this class - presumably inherited).
val conf = getConf
// Hand the configuration to the input format when it supports being configured.
inputFormat match {
case configurable: Configurable =>
configurable.setConf(conf)
case _ =>
}
val jobContext = newJobContext(conf, jobId)
// minPartitions must be applied before getSplits is called on the same job context.
inputFormat.setMinPartitions(jobContext, minPartitions)
val rawSplits = inputFormat.getSplits(jobContext).toArray
val result = new Array[Partition](rawSplits.size)
for (i <- 0 until rawSplits.size) {
// The cast is unchecked here: each split is assumed to also be a Writable.
result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
}
result
}
}

56 changes: 56 additions & 0 deletions core/src/main/scala/org/apache/spark/rdd/WholeTextFileRDD.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

/**
 * An RDD over a collection of text files in which every file is surfaced as a single record.
 *
 * Keys and values are Hadoop [[Text]] objects; partitions are derived from the splits that a
 * [[WholeTextFileInputFormat]] reports.
 *
 * @param sc forwarded to the [[NewHadoopRDD]] constructor
 * @param inputFormatClass input format class; a fresh instance is created in `getPartitions`
 * @param keyClass forwarded to the [[NewHadoopRDD]] constructor
 * @param valueClass forwarded to the [[NewHadoopRDD]] constructor
 * @param conf forwarded to the [[NewHadoopRDD]] constructor
 * @param minPartitions value passed to the input format's `setMinPartitions` before splitting
 */
private[spark] class WholeTextFileRDD(
    sc: SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  /**
   * Creates one [[NewHadoopPartition]] for each split returned by the input format, keeping
   * the order in which the splits were reported.
   */
  override def getPartitions: Array[Partition] = {
    val format = inputFormatClass.newInstance
    // Use the configuration exposed by getConf, not the raw constructor argument.
    val hadoopConf = getConf
    format match {
      case c: Configurable => c.setConf(hadoopConf)
      case _ =>
    }

    val jobContext = newJobContext(hadoopConf, jobId)
    // The minimum partition count has to be set before the splits are requested.
    format.setMinPartitions(jobContext, minPartitions)
    val splits = format.getSplits(jobContext).toArray

    Array.tabulate[Partition](splits.length) { idx =>
      new NewHadoopPartition(id, idx, splits(idx).asInstanceOf[InputSplit with Writable])
    }
  }
}

0 comments on commit 95daff6

Please sign in to comment.