Skip to content

Commit

Permalink
Show how to have a sequence file in Java
Browse files Browse the repository at this point in the history
  • Loading branch information
holdenk committed Jun 1, 2014
1 parent 3fa90ec commit 3e30bc2
Showing 1 changed file with 42 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/**
* Illustrates loading a sequence file of people and how many pandas they have seen
*/
package com.oreilly.learningsparkexamples.java;

import java.util.ArrayList;
import java.util.List;
import scala.Tuple2;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

public class BasicSaveSequenceFile {

public static class ConvertToWritableTypes implements PairFunction<Tuple2<String, Integer>, Text, IntWritable> {
public Tuple2<Text, IntWritable> call(Tuple2<String, Integer> record) {
return new Tuple2(new Text(record._1), new IntWritable(record._2));
}
}

public static void main(String[] args) throws Exception {
if (args.length != 2) {
throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
}
String master = args[0];
String fileName = args[1];

JavaSparkContext sc = new JavaSparkContext(
master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
List<Tuple2<String, Integer>> input = new ArrayList();
input.add(new Tuple2("coffee", 1));
input.add(new Tuple2("coffee", 2));
input.add(new Tuple2("pandas", 3));
JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
` }
}

0 comments on commit 3e30bc2

Please sign in to comment.