forked from LearningJournal/Spark-Programming-In-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLogFileDemo.py
26 lines (21 loc) · 989 Bytes
/
LogFileDemo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from pyspark.sql import *
from pyspark.sql.functions import regexp_extract, substring_index
if __name__ == "__main__":
    # Script entry point: read an Apache access log as plain text, pull a few
    # fields out of each line with a regular expression, and print how many
    # requests came from each referring site.

    # "local[3]" runs Spark in-process with three worker threads.
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("LogFileDemo") \
        .getOrCreate()

    try:
        # spark.read.text yields one row per input line in a single string
        # column named "value".
        file_df = spark.read.text("data/apache_logs.txt")
        file_df.printSchema()

        # One pattern for the whole log line; the capture groups used below are
        #   1  -> first whitespace-delimited token            (aliased "ip")
        #   4  -> bracketed timestamp with numeric UTC offset (aliased "date")
        #   6  -> second token inside the quoted request      (aliased "request")
        #   10 -> quoted token following status and size      (aliased "referrer")
        log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

        # NOTE(review): regexp_extract returns an empty string for lines the
        # pattern does not match, so such lines survive the filter below and
        # are counted under an empty referrer -- confirm this is intended.
        logs_df = file_df.select(
            regexp_extract('value', log_reg, 1).alias('ip'),
            regexp_extract('value', log_reg, 4).alias('date'),
            regexp_extract('value', log_reg, 6).alias('request'),
            regexp_extract('value', log_reg, 10).alias('referrer'),
        )

        # Drop rows whose referrer is the "-" placeholder, cut each referrer
        # back to the text before its third "/" (for a URL such as
        # "http://host/path" that leaves "http://host"), then count rows per
        # resulting value and print up to 100 groups without truncation.
        logs_df \
            .where("trim(referrer) != '-' ") \
            .withColumn("referrer", substring_index("referrer", "/", 3)) \
            .groupBy("referrer") \
            .count() \
            .show(100, truncate=False)
    finally:
        # Always release the Spark context, even if reading or the
        # aggregation raised.
        spark.stop()