-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathapacheElasticSearch.py
43 lines (32 loc) · 1.06 KB
/
apacheElasticSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
import hashlib
import re
def addId(data):
j=json.dumps(data).encode('ascii', 'ignore')
data['doc_id'] = hashlib.sha224(j).hexdigest()
return (data['doc_id'], json.dumps(data))
def parse(str):
s=p.match(str)
d = {}
d['ip']=s.group(1)
d['date']=s.group(4)
d['operation']=s.group(5)
d['uri']=s.group(6)
return d
regex='^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+)\s?(\S+)?\s?(\S+)?" (\d{3}|-) (\d+|-)\s?"?([^"]*)"?\s?"?([^"]*)?"?$'
p=re.compile(regex)
rdd = sc.textFile("/home/ubuntu/walker/apache_logs")
rdd2 = rdd.map(parse)
rdd3 = rdd2.map(addID)
es_write_conf = {
"es.nodes" : "localhost",
"es.port" : "9200",
"es.resource" : 'walker/apache',
"es.input.json": "yes",
"es.mapping.id": "doc_id"
}
rdd3.saveAsNewAPIHadoopFile(
path='-',
outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat", keyClass="org.apache.hadoop.io.NullWritable",
valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
conf=es_write_conf)