Skip to content

Commit

Permalink
Add data processing for preprocessed data
Browse files Browse the repository at this point in the history
  • Loading branch information
deljuven committed Feb 19, 2017
1 parent b88e7cf commit e34e66f
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 78 deletions.
18 changes: 14 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
# -*- coding: utf-8 -*-
import sys

from preprocess import pre_process
from process import process

if __name__ == '__main__':
    # CLI dispatch:
    #   app.py pre  <in_file> <out_file> [interval]
    #   app.py proc <in_file> <out_file> [start [interval]]
    args = sys.argv
    if len(args) < 4:
        # Not enough arguments for either sub-command.
        print('usage: app.py pre|proc <in_file> <out_file> [start] [interval]')
    else:
        if args[1] == 'pre':
            if len(args) > 4:
                # argv values are strings; pre_process does arithmetic on
                # interval, so coerce here.
                pre_process(args[2], args[3], int(args[4]))
            else:
                pre_process(args[2], args[3])
        elif args[1] == 'proc':
            if len(args) == 4:
                process(args[2], args[3])
            elif len(args) == 5:
                # start/interval arrive as strings from argv; process expects numbers
                process(args[2], args[3], int(args[4]))
            elif len(args) == 6:
                process(args[2], args[3], int(args[4]), int(args[5]))
7 changes: 0 additions & 7 deletions plot.cmd

This file was deleted.

107 changes: 107 additions & 0 deletions preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
import re
from datetime import datetime
from operator import itemgetter

# Bounds of the timestamp range handled by this module — presumably the
# first and last request in the raw log.  TODO(review): confirm against
# the actual input data.  parse_time() measures seconds from MIN_TIME.
MIN_TIME = datetime.strptime('1998-04-30 21:30:17', '%Y-%m-%d %H:%M:%S')
MAX_TIME = datetime.strptime('1998-07-26 21:59:55', '%Y-%m-%d %H:%M:%S')
# Number of output lines buffered before each explicit write + flush.
BUFF_COUNT = 10000
# Matches the bracketed "[<timestamp> <second-field>]" part of a log line;
# the second field is presumably a timezone offset — verify against input.
REGULAR = re.compile(r'\[(\S+)\s(\S+)\]')


def pre_process(in_file, out_file=None, interval=0):
    """Aggregate per-timestamp hit counts from *in_file* and write them out.

    :param in_file: path of the raw log file to parse.
    :param out_file: destination path; output() falls back to a default
        when this is None.
    :param interval: bucket width in minutes; 0 keeps one bucket per
        parsed second.
    """
    # CLI callers (app.py) pass interval straight from sys.argv as a
    # string; coerce so that `interval * 60` is arithmetic rather than
    # Python 2 string repetition.
    interval = int(interval)
    result = read_from_file(in_file, {}, interval * 60)
    ordered = sorted(result.items(), key=itemgetter(0))
    hint_order = sorted(result.items(), key=itemgetter(1), reverse=True)
    output(out_file, ordered, hint_order)


def big_file_read_test(in_file, interval=0, flag=False):
    """Benchmark reading (and optionally parsing) *in_file* line by line.

    Timing is written to ./log/read_test.log and echoed to stdout.

    :param in_file: path of the log file to read.
    :param interval: bucket width in seconds; 0 keeps one bucket per second.
    :param flag: when True, also parse each line and count per-bucket hits
        so the parse cost is included in the measurement.
    """
    dict_ = {}
    begin = datetime.now()
    # Don't shadow the in_file parameter with the file object.
    with open(in_file, 'r') as fp, open('./log/read_test.log', 'w') as log:
        index = 0
        for line in fp:
            index += 1
            if flag:
                if interval == 0:
                    key = data_process(line)
                else:
                    # // keeps bucket keys integral on both Python 2 and 3.
                    key = data_process(line) // interval
                dict_[key] = dict_.get(key, 0) + 1
        duration = (datetime.now() - begin).total_seconds()
        log.write("read and parse cost seconds %s" % duration)
        # print() works on both Python 2 and 3; reuse the measured duration
        # instead of re-sampling the clock.
        print(duration)


def read_from_file(file_, dict_, interval):
    """Parse *file_* line by line, accumulating hits per time bucket.

    Timing is written to ./log/pre_proc.log and echoed to stdout.

    :param file_: path of the raw log file.
    :param dict_: dict to accumulate {bucket_key: hit_count} into (mutated
        in place and also returned).
    :param interval: bucket width in seconds; 0 keeps one bucket per second.
    :returns: the populated *dict_*.
    """
    begin = datetime.now()
    with open(file_, 'r') as in_file, open('./log/pre_proc.log', 'w') as log:
        for line in in_file:
            if interval == 0:
                key = data_process(line)
            else:
                # // keeps bucket keys integral on both Python 2 and 3.
                key = data_process(line) // interval
            dict_[key] = dict_.get(key, 0) + 1
        duration = (datetime.now() - begin).total_seconds()
        log.write("read and parse cost seconds %s" % duration)
        # print() works on both Python 2 and 3; reuse the measured duration.
        print(duration)
    return dict_


def parse_time(time_str):
    """Return whole seconds elapsed from MIN_TIME to the stamp in *time_str*.

    *time_str* is the bracketed log field, e.g. ``30/Apr/1998:21:30:17 -0400``;
    everything after the first space (presumably a timezone offset — confirm
    against the input format) is ignored.
    """
    stamp, _, _ = time_str.partition(' ')
    parsed = datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S')
    return int((parsed - MIN_TIME).total_seconds())


def data_process(line):
    """Extract the bracketed timestamp from *line* and convert it to the
    second-offset used as a bucket key (see parse_time)."""
    match = REGULAR.search(line)
    return parse_time(match.group(1))


def output(out_file, statistics, hint_statics):
    """Write '(key, count)' pairs to *out_file* plus a hint-ordered sibling.

    :param out_file: destination path; falls back to './pre_proc.out'
        when falsy.
    :param statistics: (key, count) pairs, sorted by key by the caller.
    :param hint_statics: the same pairs sorted by count, written to
        '<out_file>.hint'.
    """
    if not out_file:
        out_file = './pre_proc.out'
    # NOTE(review): the original suffix here was ".hit"; renamed to
    # ".hint" for consistency with the equivalent writer in process.py.
    hint_file = out_file + ".hint"
    with open(out_file, 'w') as fp, open(hint_file, 'w') as hfp:
        _write_buffered(fp, statistics)
        _write_buffered(hfp, hint_statics)


def _write_buffered(fp, pairs):
    """Write one '<key> <value>' line per pair, flushing the accumulated
    buffer every BUFF_COUNT lines (preserves the original flush cadence)."""
    buff = ""
    count = 0
    for item in pairs:
        buff += "%s %s\n" % (item[0], item[1])
        if count == BUFF_COUNT:
            fp.write(buff)
            fp.flush()
            count = 0
            buff = ""
        count += 1
    # Flush whatever is left after the loop.
    if len(buff) > 0:
        fp.write(buff)
        fp.flush()
86 changes: 19 additions & 67 deletions process.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,30 @@
# -*- coding: utf-8 -*-
import re
from datetime import datetime
from operator import itemgetter

MIN_TIME = datetime.strptime('1998-04-30 21:30:17', '%Y-%m-%d %H:%M:%S')
MAX_TIME = datetime.strptime('1998-7-26 21:59:55', '%Y-%m-%d %H:%M:%S')
BUFF_COUNT = 10000
REGULAR = re.compile(r'\[(\S+)\s(\S+)\]')
BUFF_COUNT = 1000


def process(in_file, out_file=None, start=0, interval=5):
    """Aggregate preprocessed '<seconds> <count>' lines into fixed buckets.

    :param in_file: path of the preprocessed data file.
    :param out_file: destination path; output() supplies a default when None.
    :param start: offset to skip from the beginning of the data —
        presumably whole days; TODO(review) confirm the unit.
    :param interval: bucket width in minutes.
    """
    # CLI callers (app.py) pass these straight from sys.argv as strings;
    # coerce so the arithmetic below is numeric, not string repetition.
    start = int(start)
    interval = int(interval)
    # start days * 1440 minutes/day, divided by the bucket width, gives the
    # first bucket index to keep.  // preserves the integer division the
    # original relied on under Python 2.
    # NOTE(review): read_from_file appears to compare this threshold against
    # raw second offsets — the units look inconsistent; verify.
    result = read_from_file(in_file, {}, start * 1440 // interval, interval * 60)
    ordered = sorted(result.items(), key=itemgetter(0))
    output(out_file, ordered)


def big_file_read_test(in_file, interval=0, flag=False):
dict_ = {}
def read_from_file(file_, dict_, start, interval):
begin = datetime.now()
with open(in_file, 'r') as in_file, open('./log/read_test.log', 'w') as log:
with open(file_, 'r') as in_file, open('./log/process.log', 'w') as log:
index = 0
for line in in_file:
index += 1
if flag:
if interval == 0:
key = data_process(line)
else:
key = data_process(line) / interval
val = dict_.get(key)
if val:
val += 1
else:
val = 1
dict_[key] = val
duration = (datetime.now() - begin).total_seconds()
log.write("read and parse cost seconds %s" % duration)
print (datetime.now() - begin).total_seconds()


def read_from_file(file_, dict_, interval):
begin = datetime.now()
with open(file_, 'r') as in_file, open('./log/proc.log', 'w') as log:
index = 0
for line in in_file:
index += 1
# print "line no %d" % index
if interval == 0:
key = data_process(line)
else:
key = data_process(line) / interval
point = parse_line(line)
if start > point[0]:
continue
key = point[0] / interval
val = dict_.get(key)
if val:
val += 1
val += point[1]
else:
val = 1
dict_[key] = val
Expand All @@ -63,24 +34,12 @@ def read_from_file(file_, dict_, interval):
return dict_


def parse_time(time_str):
to_parse = time_str.split(' ')[0]
delta = datetime.strptime(to_parse, '%d/%b/%Y:%H:%M:%S') - MIN_TIME
return int(delta.total_seconds())


def data_process(line):
data = re.search(REGULAR, line)
return parse_time(data.group(1))


def output(out_file, statistics, hint_statics):
hint_file = out_file + ".hint"
with open(out_file, 'w') as fp, open(hint_file, 'w') as hfp:
def output(out_file, statistics):
if not out_file:
out_file = './interval_5.out'
with open(out_file, 'w') as fp:
pt_count = 0
ht_count = 0
pt_buff = ""
ht_buff = ""
for point in statistics:
pt_buff += "%s %s\n" % (point[0], point[1])
if pt_count == BUFF_COUNT:
Expand All @@ -89,17 +48,10 @@ def output(out_file, statistics, hint_statics):
pt_count = 0
pt_buff = ""
pt_count += 1
for hint in hint_statics:
ht_buff += "%s %s\n" % (hint[0], hint[1])
if ht_count == BUFF_COUNT:
hfp.write(ht_buff)
hfp.flush()
ht_count = 0
ht_buff = ""
ht_count += 1
if len(pt_buff) > 0:
fp.write(pt_buff)
fp.flush()
if len(ht_buff) > 0:
hfp.write(ht_buff)
hfp.flush()


def parse_line(line):
    """Parse one preprocessed line '<key> <count>' into a list of ints.

    Returns a list (not a lazy map object) so callers can index the result
    as point[0] / point[1] on both Python 2 and 3; the tuple() wrap in the
    original was a no-op and map() is not indexable on Python 3.
    """
    return [int(field) for field in line.split(" ")]
6 changes: 6 additions & 0 deletions run_pre.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Run the preprocessing step; stdout/stderr go to ./log/.
# With no argument: write the default all.out.  Otherwise the bucket
# interval is taken from $2.
# NOTE(review): the guard tests $1 but the else-branch uses $2 — callers
# apparently invoke this as "run_pre.sh <flag> <interval>"; confirm.
# [ -z "$1" ] is the quoted, portable form of the original [ ! $1 ],
# which misbehaves when $1 contains spaces or test operators.
if [ -z "$1" ]; then
    pypy app.py pre ./input/result.out ./output/all.out 1> ./log/pre_stat.log 2> ./log/pre_error.log
else
    pypy app.py pre ./input/result.out ./output/all_"$2".out "$2" 1> ./log/pre_stat.log 2> ./log/pre_error.log
fi
6 changes: 6 additions & 0 deletions run_proc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Run the processing step; stdout/stderr go to ./log/.
# With no argument: write the default interval_5.out.  Otherwise the
# value from $2 is passed through to process().
# NOTE(review): the guard tests $1 but the else-branch uses $2 — callers
# apparently invoke this as "run_proc.sh <flag> <value>"; confirm.
# [ -z "$1" ] is the quoted, portable form of the original [ ! $1 ],
# which misbehaves when $1 contains spaces or test operators.
if [ -z "$1" ]; then
    pypy app.py proc ./input/py.out ./output/interval_5.out 1> ./log/proc_stat.log 2> ./log/proc_error.log
else
    pypy app.py proc ./input/py.out ./output/interval_"$2".out "$2" 1> ./log/proc_stat.log 2> ./log/proc_error.log
fi

0 comments on commit e34e66f

Please sign in to comment.