hadoop-batch-big
#!/bin/sh
#######
# This Hadoop job assumes the key coming out of the mapper is <firsthop,from_call,latitude,longitude>
# and the value is null (all the info we need is in the key).
# The goal is for the reducer to eliminate duplicate positions, which is accomplished by
# shuffling on the full key but partitioning on just the firsthop, so that all rows for the
# same firsthop go to the same reducer.
# The reducer then eliminates the duplicates and outputs <firsthop,[list of positions]>.
#######
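# (For reference: the "shuffle on the full key, partition on just the firsthop" trick is
# what Hadoop streaming's KeyFieldBasedPartitioner provides. The actual settings live in
# streamer.json, which isn't shown here; a hedged sketch of the relevant args, assuming
# the mapper emits the four key fields with a separator the partitioner can split on:
#   -D stream.num.map.output.key.fields=4
#   -D map.output.key.field.separator=,
#   -D mapred.text.key.partitioner.options=-k1,1
#   -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
# i.e. sort on all four fields but route rows to reducers by field 1, the firsthop, alone.)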
cd ~alan/datasci/AWS/elastic-mapreduce-cli
NAME="Hadoop $(date +'%Y-%m-%d-%H:%M:%S')"
NUM=${1:-10}
# original mapper if processing the raw input logs:
#MAPPER=${2:-"s3://aprs-is/code/aprspig.py"}
#INPUT=${4:-"s3n://aprs-is/aprsis-*"}
# meow
#MAPPER=/bin/cat
# since I ran this once as a map-only job, the output has already been parsed and is
# in s3n://aprs-is/reduced/digipeaters.txt/ as key=<firsthop> value=<from_call,lat,lon>.
# in 20/20 hindsight I should have built the key then the way I am building it now.
MAPPER=${2:-"s3://aprs-is/code/aprsmapper_fullkey.py"}
INPUT=${4:-"s3n://aprs-is/reduced/digipeaters.txt/part-*"}
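# (a quick way to sanity-check the streaming pipeline locally, with sort standing in for
# the shuffle; assumes hypothetical local copies of the scripts and a sample input chunk:
#   head -1000 sample-part-00000 | ./aprsmapper_fullkey.py | sort | ./aprsreducer_dedupe.py | head
# )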
# my first-pass reducer would blow up (run out of memory) trying to build up a set() of every position:
#REDUCER=${3:-"s3://aprs-is/code/aprsreducer.py"}
# this one is for a map-only job
#REDUCER=NONE
# this one spits out position reports without deduping.
# see s3n://aprs-is/reduced/digipeaters.json/
#REDUCER=${3:-"s3://aprs-is/code/aprsreducer_nodedupe.py"}
#OUTPUT=${5:-"s3n://aprs-is/reduced/digipeaters.json"}
# my new deduper that expects key=<firsthop,from_call,latitude,longitude>, data=nothing
REDUCER=${3:-"s3://aprs-is/code/aprsreducer_dedupe.py"}
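# (aprsreducer_dedupe.py itself isn't shown here; a minimal sketch of what it plausibly
# does, assuming Python 2, tab-terminated keys, and input already sorted on the full key
# so duplicates arrive adjacent. The real script evidently emits JSON; this sketch joins
# positions with ';' just to show the shape:
#
#   import sys
#   prev_key, cur_hop, positions = None, None, []
#   for line in sys.stdin:
#       key = line.rstrip('\n').split('\t')[0]   # <firsthop,from_call,lat,lon>
#       if key == prev_key:
#           continue                             # sorted input => dupes are adjacent
#       prev_key = key
#       hop, pos = key.split(',', 1)             # pos = from_call,lat,lon
#       if hop != cur_hop:
#           if cur_hop is not None:
#               print '%s\t%s' % (cur_hop, ';'.join(positions))
#           cur_hop, positions = hop, []
#       positions.append(pos)
#   if cur_hop is not None:
#       print '%s\t%s' % (cur_hop, ';'.join(positions))
# )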
#OUTPUT=${5:-"s3n://aprs-is/reduced/firsthops.json"}
OUTPUT=${5:-"s3n://aprs-is/reduced/firsthops-deduped.json"}
# had to use --json and --param substitutions since the emr-cli script can't properly pass some hadoop args.
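# (streamer.json isn't shown here. With the old elastic-mapreduce Ruby CLI, --json takes
# a step definition and each --param does a literal text substitution of its <placeholder>.
# A hedged sketch of what the file plausibly contains:
#   [{"Name": "streaming step",
#     "ActionOnFailure": "TERMINATE_JOB_FLOW",
#     "HadoopJarStep": {
#       "Jar": "/home/hadoop/contrib/streaming/hadoop-streaming.jar",
#       "Args": ["-input", "<input>", "-mapper", "<mapper>", "-reducer", "<reducer>",
#                "-output", "<output>",
#                "-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner"]
#     }}]
# )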
echo "launching $NAME running $MAPPER and $REDUCER with $NUM instances in 5 seconds..."
set -x
sleep 5
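# the -m settings below raise the task timeout to 40 minutes (2,400,000 ms), give the map
# and reduce child JVMs a 1 GB heap, and allow up to 10 tracker failures and 10 reduce
# attempts before the job is killed.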
./elastic-mapreduce --create \
--name "$NAME" \
--num-instances $NUM --instance-type m1.medium \
--bootstrap-action s3n://us-west-2.elasticmapreduce/bootstrap-actions/configure-hadoop \
--args -m,mapred.task.timeout=2400000,-m,mapred.map.child.java.opts=-Xmx1024m,-m,mapred.reduce.child.java.opts=-Xmx1024m,-m,mapred.max.tracker.failures=10,-m,mapred.reduce.max.attempts=10 \
--json ../streamer.json \
--param "<input>=$INPUT" \
--param "<mapper>=$MAPPER" \
--param "<reducer>=$REDUCER" \
--param "<output>=$OUTPUT" \
--trace