Skip to content

Commit

Permalink
=
Browse files Browse the repository at this point in the history
  • Loading branch information
panpap committed Mar 10, 2017
1 parent 96472dc commit adb100f
Showing 1 changed file with 31 additions and 25 deletions.
56 changes: 31 additions & 25 deletions awazzaCleanMerge.rb
Original file line number Diff line number Diff line change
@@ -1,34 +1,40 @@
# Extracts the country field from one raw log line.
#
# The line is split on commas and the sixth field counting from the end is
# stored under the "Country" key. No other field is parsed any more; the
# earlier quote/brace based parsing of the remaining columns was retired.
#
# Any error raised while parsing terminates the whole script via `abort`,
# printing the backtrace and the offending line.
#
# @param line [String] one log line (the caller strips the trailing newline)
# @return [Hash] a hash with the single key "Country"; its value is nil when
#   the line has fewer than six comma-separated fields
def tokenize(line)
  row = {}
  begin
    fields = line.split(",")
    # Index from the end with a literal -6 rather than `size - 6`: for a line
    # with fewer than six fields `size - 6` is itself negative, so it would
    # wrap around and silently pick an unrelated field instead of nil.
    row["Country"] = fields[-6]
  rescue StandardError => e
    # StandardError (not Exception) so that interrupts and exit requests are
    # not swallowed and turned into a parse error.
    abort "Error -> "+e.backtrace.to_s+"\n"+line.to_s
  end
  row
end

folderPath="/home/sysadmin/data/log_done/"
month=nil
fw=nil
Dir.entries(folderPath).each {|f|
Dir.entries(folderPath).sort.each {|f|
next if File.directory? f or f.include? ".done" or f.include? ".gz" #remove dirs and .done and .gz files
traceFile=f
#puts traceFile
Expand All @@ -38,14 +44,14 @@ def tokenize(line)
fw.close if fw!=nil
month=timestmp[4...6]
fw=File.new("month_2015"+month,"w")
puts month
end
system("sort "+folderPath+"/"+traceFile+" | uniq > tempSort")
puts month
File.foreach("tempSort") {|line|
next if line.include? "/awazzaredirect/" # Duplicate removal
row=tokenize(line.chop.force_encoding("iso-8859-1"))
next if row["Country"]!="ES"
next if row["Verb"]=="CONNECT"
next if line.include? "CONNECT"
#abort "SCREAM" if row["RequestHeaders"].downcase.include? "cookie"
fw.puts line
}
Expand Down

0 comments on commit adb100f

Please sign in to comment.