Skip to content

Commit

Permalink
Add scripts for downloading COCO2014 tools & data
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffdonahue committed Sep 4, 2015
1 parent 86b1dde commit e71fe69
Show file tree
Hide file tree
Showing 6 changed files with 179 additions and 0 deletions.
24 changes: 24 additions & 0 deletions data/coco/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
For details about the Microsoft COCO ("Common Objects in Context") dataset [1],
visit mscoco.org. This README provides instructions for downloading and
installing the tools and dataset.

1) Download the COCO Python tools (cloned from GitHub into ./coco) by running:

./download_tools.sh

2) Install the tools, and optionally download the data by running:

cd coco/PythonAPI
python setup.py install # follow prompts to download or skip data

3) Download Andrej Karpathy's train/val/test splits of COCO2014 (as text files) using:

./get_coco2014_aux.sh

(or see the COCO README (tools/README) for more information).


[1] Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona,
Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick.
"Microsoft COCO: Common Objects in Context."
arXiv preprint arXiv:1405.0312 (2014).
16 changes: 16 additions & 0 deletions data/coco/download_eval_tools.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# Downloads the coco-caption evaluation tools as a zip archive from GitHub and
# unpacks them into coco-caption-eval/ inside the directory of this script.

# change to directory $DIR where this script is stored
pushd .
DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# quoted so that a checkout path containing spaces still works
cd "$DIR"

OUTFILE=coco_caption_eval.zip
# The steps are chained so that a failed download is never unzipped/renamed,
# and the final message reflects what actually happened.
if wget --no-check-certificate https://github.com/jeffdonahue/coco-caption/archive/master.zip -O "$OUTFILE" \
    && unzip "$OUTFILE" \
    && mv coco-caption-master coco-caption-eval; then
  DOWNLOAD_OK=1
else
  DOWNLOAD_OK=0
fi

# change back to original working directory
popd

if [ "$DOWNLOAD_OK" -eq 1 ]; then
  echo "Downloaded COCO evaluation tools to: $DIR/coco-caption-eval"
else
  echo "Failed to download COCO evaluation tools to: $DIR/coco-caption-eval" >&2
fi
17 changes: 17 additions & 0 deletions data/coco/download_tools.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
#
# Clones the COCO tools repository into coco/ inside the directory of this
# script and prints the follow-up installation instructions.

# change to directory $DIR where this script is stored
pushd .
DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# quoted so that a checkout path containing spaces still works
cd "$DIR"

git clone https://github.com/pdollar/coco.git
CLONE_STATUS=$?

# change back to original working directory
popd

if [ "$CLONE_STATUS" -ne 0 ]; then
  echo "Failed to clone COCO tools to: $DIR/coco" >&2
else
  echo "Cloned COCO tools to: $DIR/coco"
  echo "To setup COCO tools (and optionally download data), run:"
  # setup.py is run from the PythonAPI subdirectory (see step 2 of README.md)
  echo " cd $DIR/coco/PythonAPI"
  echo " python setup.py install"
  echo "and follow the prompts."
fi
23 changes: 23 additions & 0 deletions data/coco/get_coco2014_aux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env bash
#
# Downloads Andrej Karpathy's train/val/test splits of COCO2014 as text files.
# The archive is extracted into the directory of this script and then removed.

# change to directory $DIR where this script is stored
pushd .
DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# quoted so that a checkout path containing spaces still works
cd "$DIR"

FILENAME=coco2014_aux.tar.gz

echo "Downloading..."

# -O pins the output name: without it a leftover archive from an earlier run
# makes wget save to "$FILENAME.1" and tar would extract the stale file.
if wget "http://dl.caffe.berkeleyvision.org/$FILENAME" -O "$FILENAME"; then
  echo "Unzipping to $DIR"
  if tar -xf "$FILENAME" && rm -f "$FILENAME"; then
    echo "Done."
  else
    echo "Failed to extract $FILENAME" >&2
  fi
else
  echo "Failed to download $FILENAME" >&2
fi

# change back to original working directory
popd
37 changes: 37 additions & 0 deletions data/coco/make_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python

# This file is only meant to be run as a script with 0 arguments,
# and depends on steps 1-3 of README.md.
#
# It creates a dummy caption annotation file for the test set, with one image
# entry and two placeholder captions per image file found in the test image
# directory.

import json
import os
import re

# get path to directory where this script is
script_dir = os.path.dirname(os.path.realpath(__file__))

set_name = 'test2014'
image_root = '%s/coco/images/%s' % (script_dir, set_name)
out_filename = '%s/coco/annotations/captions_%s.json' % (script_dir, set_name)
image_ext = 'jpg'
# raw string so that \d and \. reach the regex engine unmodified
imname_re = re.compile(
    r'COCO_%s_(?P<image_id>\d+)\.%s' % (set_name, image_ext))
full_image_ext = '.%s' % image_ext
# number of placeholder captions written for every image
num_dummy_captions = 2


def build_dummy_dataset(image_filenames):
    """Return the dummy caption dataset dict for the given image filenames.

    Args:
        image_filenames: iterable of image file names; each must match
            ``imname_re`` (COCO_<set_name>_<digits>.<image_ext>).

    Returns:
        A dict with the keys 'type', 'images', 'annotations', 'licenses' and
        'info'. 'images' holds one {'file_name', 'id'} entry per filename and
        'annotations' holds ``num_dummy_captions`` entries per image.

    Raises:
        Exception: if a filename does not match ``imname_re``.
    """
    out_data = {'type': 'captions', 'images': [], 'annotations': [],
                'licenses': [], 'info': {}}
    for filename in image_filenames:
        match = imname_re.match(filename)
        if match is None:
            raise Exception('Unsupported filename: %s' % filename)
        image_id = int(match.group('image_id'))
        out_data['images'].append({'file_name': filename, 'id': image_id})
        for dummy_index in range(num_dummy_captions):
            # Use a running counter as the annotation id so that every
            # annotation gets a distinct id (reusing the image index would
            # give both captions of an image the same id).
            annotation = {'caption': 'dummy caption %d' % dummy_index,
                          'id': len(out_data['annotations']),
                          'image_id': image_id}
            out_data['annotations'].append(annotation)
    return out_data


def main():
    """List the test images and write the dummy annotation file."""
    # sorted: os.listdir order is arbitrary, sorting keeps the output stable
    image_filenames = sorted(
        f for f in os.listdir(image_root) if f.endswith(full_image_ext))
    print('Creating dummy annotation file for %d images at: %s' %
          (len(image_filenames), out_filename))
    with open(out_filename, 'w') as out_file:
        json.dump(build_dummy_dataset(image_filenames), out_file)


if __name__ == '__main__':
    main()
62 changes: 62 additions & 0 deletions data/coco/make_trainval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python

# This file is only meant to be run as a script with 0 arguments,
# and depends on steps 1-3 of README.md.
#
# It creates a "trainval" set by combining the COCO 2014 train and val sets.
# The trainval set is intended for use only when training a single final model
# for submission of results on the test set to the COCO evaluation server.

import os
import json

# get path to directory where this script is
script_dir = os.path.dirname(os.path.realpath(__file__))

anno_dir_path = '%s/coco/annotations' % script_dir
image_root = '%s/coco/images' % script_dir
abs_image_root = os.path.abspath(image_root)
out_coco_id_filename = '%s/coco2014_cocoid.trainval.txt' % script_dir
filename_pattern = 'captions_%s2014.json'
in_sets = ['train', 'val']
out_set = 'trainval'
path_pattern = '%s/%s' % (anno_dir_path, filename_pattern)


def merge_datasets(datasets):
    """Merge an iterable of dataset dicts into one dict.

    List values are concatenated in input order (the input lists are not
    modified); every other value is copied from the first dataset that has
    the key and must be equal in all later datasets.

    Raises:
        AssertionError: if a non-list value differs between datasets.
    """
    out_data = {}
    for data in datasets:
        for key, val in data.items():
            if isinstance(val, list):
                out_data.setdefault(key, []).extend(val)
            else:
                out_data.setdefault(key, val)
                assert out_data[key] == val, \
                    'Conflicting values for key: %s' % key
    return out_data


def find_source_set(filename):
    """Return the first name in in_sets occurring in filename, else None."""
    for in_set in in_sets:
        if in_set in filename:
            return in_set
    return None


def _load_input_datasets():
    """Yield the parsed annotation file of every input set, one at a time."""
    for in_set in in_sets:
        filename = path_pattern % in_set
        print('Loading input dataset from: %s' % filename)
        # with-block so the file handle is closed as soon as it is parsed
        with open(filename, 'r') as in_file:
            yield json.load(in_file)


def main():
    """Write the merged annotations, the COCO id list and image symlinks."""
    out_data = merge_datasets(_load_input_datasets())
    filename = path_pattern % out_set
    print('Dumping output dataset to: %s' % filename)
    # with-block guarantees the JSON is flushed and the handle closed
    with open(filename, 'w') as out_file:
        json.dump(out_data, out_file)

    out_ids = [str(im['id']) for im in out_data['images']]
    print('Writing COCO IDs to: %s' % out_coco_id_filename)
    with open(out_coco_id_filename, 'w') as coco_id_file:
        coco_id_file.write('\n'.join(out_ids) + '\n')

    # make a trainval dir with symlinks to all train+val images
    out_dir = '%s/%s2014' % (image_root, out_set)
    # tolerate an existing directory so the script can be re-run
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    print('Writing image symlinks to: %s' % out_dir)
    for im in out_data['images']:
        filename = im['file_name']
        set_name = find_source_set(filename)
        assert set_name is not None, \
            'Cannot determine input set of: %s' % filename
        real_path = '%s/%s2014/%s' % (abs_image_root, set_name, filename)
        link_path = '%s/%s' % (out_dir, filename)
        # skip links left over from an earlier run instead of crashing
        if not os.path.lexists(link_path):
            os.symlink(real_path, link_path)


if __name__ == '__main__':
    main()

0 comments on commit e71fe69

Please sign in to comment.