Skip to content

Commit

Permalink
new
Browse files Browse the repository at this point in the history
  • Loading branch information
lizhi16 committed Mar 4, 2021
1 parent a4cab52 commit 5b1f2ee
Show file tree
Hide file tree
Showing 111 changed files with 2,185 additions and 131 deletions.
Binary file modified __pycache__/parse2cmds.cpython-36.pyc
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
34 changes: 9 additions & 25 deletions dockerfile_analysis/keywords.py → analysis/tracer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import crawl
import download
import parse2cmds
import parser

basePath = "./results/"

Expand All @@ -10,12 +8,12 @@
"""
def trace_keywords(dockerfile, keywords):
# trace the source of the scripts
sourceEntry = parse2cmds.trace_entry_images(dockerfile)
sourceEntry = trace_entry_images(dockerfile)
write_log(image, sourceEntry, "images")

# identify the keywords
keywords = ["paste ", "split "]
identify = parse2cmds.identify_keywords(dockerfile, keywords)
identify = identify_keywords(dockerfile, keywords)
write_log(image, identify, "keywords")


Expand All @@ -27,7 +25,7 @@ def trace_entry_images(dockerfile):
sources_scripts = {}

# get the entrypoint
entrypoint = parse2cmds.parse_exe_from_dockerfile(dockerfile)
entrypoint = parser.parse_exe_from_dockerfile(dockerfile)
if len(entrypoint) == 0:
return sources_scripts

Expand All @@ -40,7 +38,7 @@ def trace_entry_images(dockerfile):
scripts.append(cmd)

# trace the source of the scripts
commands = parse2cmds.parse_cmds_from_dockerfile(dockerfile)
commands = parser.parse_cmds_from_dockerfile(dockerfile)
for script in scripts:
# from RUN commands
for command in commands:
Expand All @@ -51,7 +49,7 @@ def trace_entry_images(dockerfile):
sources_scripts[script].append(command)

# from COPY or ADD
copy = parse2cmds.parse_add_from_dockerfile(dockerfile)
copy = parser.parse_add_from_dockerfile(dockerfile)
for external in copy:
if script in external:
if script not in sources_scripts:
Expand All @@ -74,10 +72,10 @@ def identify_keywords(dockerfile, keywords):
results = {}

# "RUN" commands
commands = parse2cmds.parse_cmds_from_dockerfile(dockerfile)
commands = parser.parse_cmds_from_dockerfile(dockerfile)

# "ENTRYPOINT" or "CMD"
entrypoints = parse2cmds.parse_exe_from_dockerfile(dockerfile)
entrypoints = parser.parse_exe_from_dockerfile(dockerfile)

# "keywords" is a list of keyword
for keyword in keywords:
Expand All @@ -95,18 +93,4 @@ def identify_keywords(dockerfile, keywords):
if len(identify) != 0:
results[keyword] = identify

return results



# log the detection results
"""
def write_log(image, results, filename):
path = basePath + filename + ".csv"
with open(path, "a+") as log:
for item in results:
log.write(image + ", " + item + ", ")
for obj in results[item]:
log.write(obj.replace("\n", " "))
log.write("\n")
"""
return results
Binary file not shown.
56 changes: 0 additions & 56 deletions dockerfile_analysis/analysis.py.bk

This file was deleted.

26 changes: 0 additions & 26 deletions dockerfile_analysis/cmd2words.py

This file was deleted.

13 changes: 9 additions & 4 deletions crawl.py → handle/crawler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# Crawling dockerhub for all Dockerfile
# =======================================================
# Crawling "Dockerfile" from dockerhub or Github:
# 1. Dockerfile
# 2. build history
# =======================================================

import requests
import filter

# major function
def resolve_images_info(image):
# get Dockerfile in dockerhub
Dockerfile = resolve_Dockerfile_from_dockerhub(image)
Expand All @@ -15,7 +22,6 @@ def resolve_images_info(image):
githubRepo = check_github_repo(user, imageName)
if githubRepo != "":
url = "https://raw.githubusercontent.com" + githubRepo + "/Dockerfile"
#print (url)
Dockerfile = resolve_Dockerfile_from_github(url)
if Dockerfile != "":
#print ("github:", Dockerfile)
Expand All @@ -29,7 +35,6 @@ def resolve_images_info(image):

return None


def get_url(url):
headers = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
Expand Down Expand Up @@ -101,7 +106,7 @@ def resolve_imageHistory(image, tag):
try:
# [0] is needed, also only one result
for commands in content[0]["layers"]:
if "set -ex" in commands["instruction"] or "set -eux" in commands["instruction"] or "exit 101" in commands["instruction"]:
if filter.exsit(commands["instruction"], "meaningless_words", "or"):
continue
if "ENTRYPOINT" not in commands["instruction"] and "CMD" not in commands["instruction"]:
imageHistory = imageHistory + "\n" + commands["instruction"].strip().replace("/bin/sh -c", "RUN").replace(" in ", " ").replace("]", "").replace("[", "")
Expand Down
25 changes: 25 additions & 0 deletions handle/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#################### filter keywords in dockerfile #################

# unwanted commands in docker build history
meaningless_words = [
    "set -ex",
    "set -eux",
    "exit 101",
]


def exsit(text, wordsList, rule):
    """Report whether ``text`` contains the words of a named word list.

    Args:
        text: String to search.
        wordsList: Name of a module-level list in this file (for example
            ``"meaningless_words"``), or the list of words itself.
        rule: ``"or"`` matches when at least one word occurs in ``text``;
            ``"and"`` matches only when every word occurs in ``text``.

    Returns:
        True or False according to ``rule``; False for an unrecognised rule.

    Raises:
        KeyError: If ``wordsList`` is a name with no matching module-level
            variable.
    """
    # Look the name up in the module namespace. A bare vars() call here would
    # return this function's locals, which never hold the word lists.
    words = globals()[wordsList] if isinstance(wordsList, str) else wordsList

    if rule == "or":
        return any(word in text for word in words)
    if rule == "and":
        return all(word in text for word in words)
    return False
45 changes: 36 additions & 9 deletions parse2cmds.py → handle/parser.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,52 @@
################### Handle Dockerfile #####################
################### Parser Dockerfile #####################
import json
import crawl
import crawler
import collections, logging, itertools

import bashlex.parser
import bashlex.ast

from json import dumps
from dockerfile_parser import parser

# major function
def dockerfile2cmds(image):
def dockerfile2bash(dockerfile):
    """Split the RUN instructions of a Dockerfile into lists of shell words.

    Args:
        dockerfile: Passed straight through to ``dockerfile2cmds``
            (presumably the Dockerfile text returned by the crawler --
            confirm against the caller).

    Returns:
        A list with one entry per top-level bashlex AST node; each entry is
        the list of ``word`` values of that node's parts. Empty when
        ``dockerfile2cmds`` yields nothing.
    """
    cmds = []
    commands = dockerfile2cmds(dockerfile)
    if not commands:
        return cmds

    for command in commands.get("RUN", []):
        # get bashlex AST
        try:
            parts = bashlex.parse(command)
        except Exception:
            # One unparsable RUN line must not discard the lines after it.
            continue

        for ast in parts:
            try:
                # All-or-nothing: a node whose parts lack `.word` is skipped.
                cmd = [part.word for part in ast.parts]
            except AttributeError:
                continue
            cmds.append(cmd)

    return cmds

# paras "dockerfile": return from crawler.resolve_images_info(image)
def dockerfile2cmds(dockerfile):
commands = {}

# get dockerfile from dockerhub
dockerfile = crawl.resolve_images_info(image)
if dockerfile == None or dockerfile == "":
print ("[ERR] dockerfile carwling failed...")
print ("[ERR] dockerfile format error...")
return commands

# resolve the dockerfile
#dockerfile = parse2cmds.parse_dockerfile(dockerfile)
try:
dockerfile = parse_dockerfile(dockerfile)
except:
print ("[ERR] Dockerfile resolve failed: ", image)
print ("[ERR] Dockerfile parsing failed...")
return commands

commands["RUN"] = parse_cmds_from_dockerfile(dockerfile)
Expand Down
14 changes: 4 additions & 10 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
# Crawling dockerhub for all Dockerfile, and checking this Dockerfile whether has malicious behaviors
import sys
import threading

from dockerfile_analysis import cmd2words
import parse2cmds
import parser

# using for debug
failed_resolve = []
Expand All @@ -17,7 +15,7 @@ def __init__(self, image):
self.image = image.strip()

def run(self):
commands = parse2cmds.dockerfile2cmds(self.image)
commands = parser.dockerfile2cmds(self.image)
if "RUN" not in commands:
return

Expand All @@ -33,12 +31,12 @@ def main():
index = 1
total = len(images)

cores = 8
cores = 64
analyze_thread = []

for image in images:
# check format
if "/" not in image:
if "/" not in image and not image.startswith("/"):
continue

# output the rate of processing
Expand All @@ -55,10 +53,6 @@ def main():
for t in analyze_thread:
t.join()

if index % 1000 == 0:
log()
results = {}

thread.start()
analyze_thread.append(thread)

Expand Down
18 changes: 18 additions & 0 deletions manifest_analysis/analysis.py.bk
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import crawl
import download
import parser

basePath = "./results/"
"""
judge if urls indicate an image's layers
"""
def identify_urls_layers(image):
    """Return what ``download.judge_url_layers`` reports for an image's tags.

    The tags come from ``crawl.resolve_tags(image)``; when that yields
    nothing, an empty list is returned without calling the downloader.
    """
    found_tags = crawl.resolve_tags(image)
    if len(found_tags) != 0:
        return download.judge_url_layers(image, found_tags)
    return []
File renamed without changes.
Loading

0 comments on commit 5b1f2ee

Please sign in to comment.