Skip to content

Commit

Permalink
Add data processing for preprocessed data
Browse files Browse the repository at this point in the history
  • Loading branch information
deljuven committed Feb 19, 2017
1 parent b88e7cf commit e34e66f
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 78 deletions.
18 changes: 14 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
# -*- coding: utf-8 -*-
import sys

from preprocess import pre_process
from process import process

if __name__ == '__main__':
    # CLI dispatch:
    #   app.py pre  <in_file> <out_file> [interval]
    #   app.py proc <in_file> <out_file> [start [interval]]
    args = sys.argv
    if len(args) < 4:
        # Not enough arguments for either sub-command.
        print('usage: app.py pre|proc <in_file> <out_file> [start] [interval]')
    else:
        if args[1] == 'pre':
            if len(args) > 4:
                # argv values are strings; pre_process does arithmetic on
                # interval, so coerce here.
                pre_process(args[2], args[3], int(args[4]))
            else:
                pre_process(args[2], args[3])
        elif args[1] == 'proc':
            if len(args) == 4:
                process(args[2], args[3])
            elif len(args) == 5:
                # start/interval arrive as strings from argv; process expects numbers
                process(args[2], args[3], int(args[4]))
            elif len(args) == 6:
                process(args[2], args[3], int(args[4]), int(args[5]))
7 changes: 0 additions & 7 deletions plot.cmd

This file was deleted.

107 changes: 107 additions & 0 deletions preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
import re
from datetime import datetime
from operator import itemgetter

# Bounds of the timestamp range handled by this module — presumably the
# first and last request in the raw log.  TODO(review): confirm against
# the actual input data.  parse_time() measures seconds from MIN_TIME.
MIN_TIME = datetime.strptime('1998-04-30 21:30:17', '%Y-%m-%d %H:%M:%S')
MAX_TIME = datetime.strptime('1998-07-26 21:59:55', '%Y-%m-%d %H:%M:%S')
# Number of output lines buffered before each explicit write + flush.
BUFF_COUNT = 10000
# Matches the bracketed "[<timestamp> <second-field>]" part of a log line;
# the second field is presumably a timezone offset — verify against input.
REGULAR = re.compile(r'\[(\S+)\s(\S+)\]')


def pre_process(in_file, out_file=None, interval=0):
    """Aggregate per-timestamp hit counts from *in_file* and write them out.

    :param in_file: path of the raw log file to parse.
    :param out_file: destination path; output() falls back to a default
        when this is None.
    :param interval: bucket width in minutes; 0 keeps one bucket per
        parsed second.
    """
    # CLI callers (app.py) pass interval straight from sys.argv as a
    # string; coerce so that `interval * 60` is arithmetic rather than
    # Python 2 string repetition.
    interval = int(interval)
    result = read_from_file(in_file, {}, interval * 60)
    ordered = sorted(result.items(), key=itemgetter(0))
    hint_order = sorted(result.items(), key=itemgetter(1), reverse=True)
    output(out_file, ordered, hint_order)


def big_file_read_test(in_file, interval=0, flag=False):
    """Benchmark reading (and optionally parsing) *in_file* line by line.

    Timing is written to ./log/read_test.log and echoed to stdout.

    :param in_file: path of the log file to read.
    :param interval: bucket width in seconds; 0 keeps one bucket per second.
    :param flag: when True, also parse each line and count per-bucket hits
        so the parse cost is included in the measurement.
    """
    dict_ = {}
    begin = datetime.now()
    # Don't shadow the in_file parameter with the file object.
    with open(in_file, 'r') as fp, open('./log/read_test.log', 'w') as log:
        index = 0
        for line in fp:
            index += 1
            if flag:
                if interval == 0:
                    key = data_process(line)
                else:
                    # // keeps bucket keys integral on both Python 2 and 3.
                    key = data_process(line) // interval
                dict_[key] = dict_.get(key, 0) + 1
        duration = (datetime.now() - begin).total_seconds()
        log.write("read and parse cost seconds %s" % duration)
        # print() works on both Python 2 and 3; reuse the measured duration
        # instead of re-sampling the clock.
        print(duration)


def read_from_file(file_, dict_, interval):
    """Parse *file_* line by line, accumulating hits per time bucket.

    Timing is written to ./log/pre_proc.log and echoed to stdout.

    :param file_: path of the raw log file.
    :param dict_: dict to accumulate {bucket_key: hit_count} into (mutated
        in place and also returned).
    :param interval: bucket width in seconds; 0 keeps one bucket per second.
    :returns: the populated *dict_*.
    """
    begin = datetime.now()
    with open(file_, 'r') as in_file, open('./log/pre_proc.log', 'w') as log:
        for line in in_file:
            if interval == 0:
                key = data_process(line)
            else:
                # // keeps bucket keys integral on both Python 2 and 3.
                key = data_process(line) // interval
            dict_[key] = dict_.get(key, 0) + 1
        duration = (datetime.now() - begin).total_seconds()
        log.write("read and parse cost seconds %s" % duration)
        # print() works on both Python 2 and 3; reuse the measured duration.
        print(duration)
    return dict_


def parse_time(time_str):
    """Return whole seconds elapsed from MIN_TIME to the stamp in *time_str*.

    *time_str* is the bracketed log field, e.g. ``30/Apr/1998:21:30:17 -0400``;
    everything after the first space (presumably a timezone offset — confirm
    against the input format) is ignored.
    """
    stamp, _, _ = time_str.partition(' ')
    parsed = datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S')
    return int((parsed - MIN_TIME).total_seconds())


def data_process(line):
    """Extract the bracketed timestamp from *line* and convert it to the
    second-offset used as a bucket key (see parse_time)."""
    match = REGULAR.search(line)
    return parse_time(match.group(1))


def output(out_file, statistics, hint_statics):
    """Write '(key, count)' pairs to *out_file* plus a hint-ordered sibling.

    :param out_file: destination path; falls back to './pre_proc.out'
        when falsy.
    :param statistics: (key, count) pairs, sorted by key by the caller.
    :param hint_statics: the same pairs sorted by count, written to
        '<out_file>.hint'.
    """
    if not out_file:
        out_file = './pre_proc.out'
    # NOTE(review): the original suffix here was ".hit"; renamed to
    # ".hint" for consistency with the equivalent writer in process.py.
    hint_file = out_file + ".hint"
    with open(out_file, 'w') as fp, open(hint_file, 'w') as hfp:
        _write_buffered(fp, statistics)
        _write_buffered(hfp, hint_statics)


def _write_buffered(fp, pairs):
    """Write one '<key> <value>' line per pair, flushing the accumulated
    buffer every BUFF_COUNT lines (preserves the original flush cadence)."""
    buff = ""
    count = 0
    for item in pairs:
        buff += "%s %s\n" % (item[0], item[1])
        if count == BUFF_COUNT:
            fp.write(buff)
            fp.flush()
            count = 0
            buff = ""
        count += 1
    # Flush whatever is left after the loop.
    if len(buff) > 0:
        fp.write(buff)
        fp.flush()
86 changes: 19 additions & 67 deletions process.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,30 @@
# -*- coding: utf-8 -*-
import re
from datetime import datetime
from operator import itemgetter

MIN_TIME = datetime.strptime('1998-04-30 21:30:17', '%Y-%m-%d %H:%M:%S')
MAX_TIME = datetime.strptime('1998-7-26 21:59:55', '%Y-%m-%d %H:%M:%S')
BUFF_COUNT = 10000
REGULAR = re.compile(r'\[(\S+)\s(\S+)\]')
BUFF_COUNT = 1000


def process(in_file, out_file=None, start=0, interval=5):
    """Aggregate preprocessed '<seconds> <count>' lines into fixed buckets.

    :param in_file: path of the preprocessed data file.
    :param out_file: destination path; output() supplies a default when None.
    :param start: offset to skip from the beginning of the data —
        presumably whole days; TODO(review) confirm the unit.
    :param interval: bucket width in minutes.
    """
    # CLI callers (app.py) pass these straight from sys.argv as strings;
    # coerce so the arithmetic below is numeric, not string repetition.
    start = int(start)
    interval = int(interval)
    # start days * 1440 minutes/day, divided by the bucket width, gives the
    # first bucket index to keep.  // preserves the integer division the
    # original relied on under Python 2.
    # NOTE(review): read_from_file appears to compare this threshold against
    # raw second offsets — the units look inconsistent; verify.
    result = read_from_file(in_file, {}, start * 1440 // interval, interval * 60)
    ordered = sorted(result.items(), key=itemgetter(0))
    output(out_file, ordered)


def big_file_read_test(in_file, interval=0, flag=False):
dict_ = {}
def read_from_file(file_, dict_, start, interval):
begin = datetime.now()
with open(in_file, 'r') as in_file, open('./log/read_test.log', 'w') as log:
with open(file_, 'r') as in_file, open('./log/process.log', 'w') as log:
index = 0
for line in in_file:
index += 1
if flag:
if interval == 0:
key = data_process(line)
else:
key = data_process(line) / interval
val = dict_.get(key)
if val:
val += 1
else:
val = 1
dict_[key] = val
duration = (datetime.now() - begin).total_seconds()
log.write("read and parse cost seconds %s" % duration)
print (datetime.now() - begin).total_seconds()


def read_from_file(file_, dict_, interval):
begin = datetime.now()
with open(file_, 'r') as in_file, open('./log/proc.log', 'w') as log:
index = 0
for line in in_file:
index += 1
# print "line no %d" % index
if interval == 0:
key = data_process(line)
else:
key = data_process(line) / interval
point = parse_line(line)
if start > point[0]:
continue
key = point[0] / interval
val = dict_.get(key)
if val:
val += 1
val += point[1]
else:
val = 1
dict_[key] = val
Expand All @@ -63,24 +34,12 @@ def read_from_file(file_, dict_, interval):
return dict_


def parse_time(time_str):
to_parse = time_str.split(' ')[0]
delta = datetime.strptime(to_parse, '%d/%b/%Y:%H:%M:%S') - MIN_TIME
return int(delta.total_seconds())


def data_process(line):
data = re.search(REGULAR, line)
return parse_time(data.group(1))


def output(out_file, statistics, hint_statics):
hint_file = out_file + ".hint"
with open(out_file, 'w') as fp, open(hint_file, 'w') as hfp:
def output(out_file, statistics):
if not out_file:
out_file = './interval_5.out'
with open(out_file, 'w') as fp:
pt_count = 0
ht_count = 0
pt_buff = ""
ht_buff = ""
for point in statistics:
pt_buff += "%s %s\n" % (point[0], point[1])
if pt_count == BUFF_COUNT:
Expand All @@ -89,17 +48,10 @@ def output(out_file, statistics, hint_statics):
pt_count = 0
pt_buff = ""
pt_count += 1
for hint in hint_statics:
ht_buff += "%s %s\n" % (hint[0], hint[1])
if ht_count == BUFF_COUNT:
hfp.write(ht_buff)
hfp.flush()
ht_count = 0
ht_buff = ""
ht_count += 1
if len(pt_buff) > 0:
fp.write(pt_buff)
fp.flush()
if len(ht_buff) > 0:
hfp.write(ht_buff)
hfp.flush()


def parse_line(line):
    """Parse one preprocessed line '<key> <count>' into a list of ints.

    Returns a list (not a lazy map object) so callers can index the result
    as point[0] / point[1] on both Python 2 and 3; the tuple() wrap in the
    original was a no-op and map() is not indexable on Python 3.
    """
    return [int(field) for field in line.split(" ")]
6 changes: 6 additions & 0 deletions run_pre.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Run the preprocessing step; stdout/stderr go to ./log/.
# With no argument: write the default all.out.  Otherwise the bucket
# interval is taken from $2.
# NOTE(review): the guard tests $1 but the else-branch uses $2 — callers
# apparently invoke this as "run_pre.sh <flag> <interval>"; confirm.
# [ -z "$1" ] is the quoted, portable form of the original [ ! $1 ],
# which misbehaves when $1 contains spaces or test operators.
if [ -z "$1" ]; then
    pypy app.py pre ./input/result.out ./output/all.out 1> ./log/pre_stat.log 2> ./log/pre_error.log
else
    pypy app.py pre ./input/result.out ./output/all_"$2".out "$2" 1> ./log/pre_stat.log 2> ./log/pre_error.log
fi
6 changes: 6 additions & 0 deletions run_proc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Run the processing step; stdout/stderr go to ./log/.
# With no argument: write the default interval_5.out.  Otherwise the
# value from $2 is passed through to process().
# NOTE(review): the guard tests $1 but the else-branch uses $2 — callers
# apparently invoke this as "run_proc.sh <flag> <value>"; confirm.
# [ -z "$1" ] is the quoted, portable form of the original [ ! $1 ],
# which misbehaves when $1 contains spaces or test operators.
if [ -z "$1" ]; then
    pypy app.py proc ./input/py.out ./output/interval_5.out 1> ./log/proc_stat.log 2> ./log/proc_error.log
else
    pypy app.py proc ./input/py.out ./output/interval_"$2".out "$2" 1> ./log/proc_stat.log 2> ./log/proc_error.log
fi

0 comments on commit e34e66f

Please sign in to comment.