Skip to content

Commit

Permalink
cleaned download code
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Hossler committed Sep 21, 2021
1 parent 691527b commit f75938c
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 29 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*__pycache__
13 changes: 6 additions & 7 deletions build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ SHELL ["/bin/bash", "-c"]
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ffmpeg \
git \
git \
git-core \
g++ \
vim \
vim \
zip \
zlib1g-dev \
cuda-command-line-tools-${CUDA/./-} \
Expand All @@ -42,7 +42,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libzmq3-dev \
pkg-config \
software-properties-common \
unzip
unzip \
wget

# Install TensorRT if not building for PowerPC
# NOTE: libnvinfer uses cuda11.1 versions
Expand Down Expand Up @@ -71,17 +72,15 @@ RUN apt update -y && \
RUN ln -s $(which python3) /usr/local/bin/python

RUN python3 -m pip install tensorflow==2.5.0
WORKDIR /app

WORKDIR /app

COPY requirements.txt .
RUN python3 -m pip install -r requirements.txt
RUN python3 -m pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI

ENV TF_CPP_MIN_LOG_LEVEL=2

RUN apt-get update -y && apt-get install -y wget

RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.13.0/protoc-3.13.0-linux-x86_64.zip && \
unzip protoc-3.13.0-linux-x86_64.zip -d /app/protobuf/

Expand Down
6 changes: 3 additions & 3 deletions build/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
* NVIDIA GPU with the latest driver installed
* docker / nvidia-docker

This build has been tested with Nvidia Drivers 460.91.03 and CUDA 11.2. Please update the base image
if you plan on using older versions of CUDA.
This build has been tested with Nvidia Drivers 460.91.03 and CUDA 11.2 on an Ubuntu 20.04 machine.
Please update the base image if you plan on using older versions of CUDA.

## Build
Build the image with:
Expand Down Expand Up @@ -37,4 +37,4 @@ gcloud auth login
tf object detection api

## Updating the instructions
Feel free to submit PRs or issues should you see a scope for improvement.
Feel free to submit PRs or issues should you see a scope for improvement.
40 changes: 21 additions & 19 deletions download_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@
import tensorflow.compat.v1 as tf
from PIL import Image
from psutil import cpu_count
from waymo_open_dataset import dataset_pb2 as open_dataset

from utils import *
from utils import get_module_logger, parse_frame


logger = get_module_logger(__name__)


def create_tf_example(filename, encoded_jpeg, annotations, resize=True):
Expand Down Expand Up @@ -72,11 +76,11 @@ def create_tf_example(filename, encoded_jpeg, annotations, resize=True):

def download_tfr(filename, data_dir):
"""
download a single tf record
download a single tf record
args:
- filename [str]: path to the tf record file
- temp_dir [str]: path to the directory where the raw data will be saved
- data_dir [str]: path to the destination directory
returns:
- local_path [str]: path where the file is saved
Expand Down Expand Up @@ -122,34 +126,32 @@ def process_tfr(path, data_dir):
writer.close()


@ray.remote
def download_and_process(filename, temp_dir, data_dir):
#@ray.remote
def download_and_process(filename, data_dir):
# need to re-import the logger because of multiprocessing
logger = get_module_logger(__name__)
local_path = download_tfr(filename, temp_dir)
local_path = download_tfr(filename, data_dir)
process_tfr(local_path, data_dir)
# remove the original tf record to save space
logger.info(f'Deleting {local_path}')
os.remove(local_path)


if __name__ == "__main__":
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Download and process tf files')
parser.add_argument('--data_dir', required=True,
help='processed data directory')
parser.add_argument('--temp_dir', required=True,
help='raw data directory')
help='data directory')
parser.add_argument('--size', required=False, default=100, type=int,
help='Number of files to download')
args = parser.parse_args()
logger = get_module_logger(__name__)
data_dir = args.data_dir
size = args.size

# open the filenames file
with open('filenames.txt', 'r') as f:
filenames = f.read().splitlines()
logger.info(f'Download {len(filenames)} files. Be patient, this will take a long time.')

data_dir = args.data_dir
temp_dir = args.temp_dir
filenames = f.read().splitlines()
logger.info(f'Download {len(filenames[:size])} files. Be patient, this will take a long time.')

# init ray
ray.init(num_cpus=cpu_count())

workers = [download_and_process.remote(fn, temp_dir, data_dir) for fn in filenames[:100]]
workers = [download_and_process.remote(fn, data_dir) for fn in filenames[:size]]
_ = ray.get(workers)

0 comments on commit f75938c

Please sign in to comment.