Add scripts for downloading COCO2014 tools & data

vsubhashini · Sep 4, 2015 · e71fe69 · e71fe69
1 parent 86b1dde
commit e71fe69
Show file tree

Hide file tree

Showing 6 changed files with 179 additions and 0 deletions.
diff --git a/data/coco/README.md b/data/coco/README.md
@@ -0,0 +1,24 @@
+For details about the Microsoft COCO ("Common Objects in Context") dataset [1],
+visit mscoco.org.  This README provides instructions for downloading and
+installing the tools and dataset.
+
+1) Download and extract the COCO Python tools by running:
+
+    ./download_tools.sh
+
+2) Install the tools, and optionally download the data by running:
+
+    cd coco/PythonAPI
+    python setup.py install  # follow prompts to download or skip data
+
+3) Download Andrej Karpathy's train/val/test splits of COCO2014 using:
+
+    ./get_coco2014_aux.sh
+
+Alternatively, see the COCO README (tools/README) for more information.
+
+
+[1] Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona,
+    Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick.
+    "Microsoft COCO: Common Objects in Context."
+    arXiv preprint arXiv:1405.0312 (2014).
diff --git a/data/coco/download_eval_tools.sh b/data/coco/download_eval_tools.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd $DIR
+
+OUTFILE=coco_caption_eval.zip
+wget --no-check-certificate https://github.com/jeffdonahue/coco-caption/archive/master.zip -O $OUTFILE
+unzip $OUTFILE
+mv coco-caption-master coco-caption-eval
+
+# change back to original working directory
+popd
+
+echo "Downloaded COCO evaluation tools to: $DIR/coco-caption-eval"
diff --git a/data/coco/download_tools.sh b/data/coco/download_tools.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# change to directory $DIR where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+cd $DIR
+
+git clone https://github.com/pdollar/coco.git
+
+# change back to original working directory
+popd
+
+echo "Cloned COCO tools to: $DIR/coco"
+echo "To setup COCO tools (and optionally download data), run:"
+echo "    cd $DIR/coco"
+echo "    python setup.py install"
+echo "and follow the prompts."
diff --git a/data/coco/get_coco2014_aux.sh b/data/coco/get_coco2014_aux.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Downloads Andrej Karpathy's train/val/test splits of COCO2014 as text files
+# and unpacks them into the directory where this script is stored.
+#
+# Exits (or returns, when sourced) with a non-zero status if any step fails.
+
+# remember the caller's working directory, then resolve $DIR, the directory
+# where this script is stored
+pushd .
+DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+
+FILENAME=coco2014_aux.tar.gz
+STATUS=0
+
+# Every step runs only if the previous one succeeded.  -O pins the output
+# name so a re-run overwrites a stale archive instead of saving the new
+# download next to it under a different name.
+cd "$DIR" \
+    && echo "Downloading..." \
+    && wget "http://dl.caffe.berkeleyvision.org/$FILENAME" -O "$FILENAME" \
+    && echo "Unzipping to $DIR" \
+    && tar -xf "$FILENAME" \
+    && rm -f "$FILENAME" \
+    && echo "Done." \
+    || STATUS=$?
+
+# change back to original working directory
+popd
+
+[ "$STATUS" -eq 0 ] \
+    || echo "ERROR: failed to download or unpack $FILENAME in: $DIR" >&2
+
+# `return` works when the script is sourced, `exit` when it is executed
+return "$STATUS" 2> /dev/null || exit "$STATUS"
diff --git a/data/coco/make_test.py b/data/coco/make_test.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+# This file is only meant to be run as a script with 0 arguments,
+# and depends on steps 1-3 of README.md.
+#
+# It creates a test set from the image filenames of the test set.
+
+import json
+import os
+import re
+
+# get path to directory where this script is
+script_dir = os.path.dirname(os.path.realpath(__file__))
+
+set_name = 'test2014'
+image_root = '%s/coco/images/%s' % (script_dir, set_name)
+out_filename = '%s/coco/annotations/captions_%s.json' % (script_dir, set_name)
+image_ext = 'jpg'
+imname_re = re.compile('COCO_%s_(?P<image_id>\d+)\.%s' % (set_name, image_ext))
+full_image_ext = '.%s' % image_ext
+image_filenames = filter(lambda f: f.endswith(full_image_ext), os.listdir(image_root))
+print 'Creating dummy annotation file for %d images at: %s' % \
+    (len(image_filenames), out_filename)
+
+out_data = {'type': 'captions', 'images': [], 'annotations': [],
+            'licenses': [], 'info': {}}
+for index, filename in enumerate(image_filenames):
+    match = imname_re.match(filename)
+    if match is None: raise Exception('Unsupported filename: %s' % filename)
+    image_id = int(match.group('image_id'))
+    out_data['images'].append({'file_name': filename, 'id': image_id})
+    for dummy_index in range(2):
+        annotation = {'caption': 'dummy caption %d' % dummy_index,
+                      'id': index, 'image_id': image_id}
+        out_data['annotations'].append(annotation)
+with open(out_filename, 'w') as out_file:
+    json.dump(out_data, out_file)
diff --git a/data/coco/make_trainval.py b/data/coco/make_trainval.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+# This file is only meant to be run as a script with 0 arguments,
+# and depends on steps 1-3 of README.md.
+#
+# It creates a "trainval" set by combining the COCO 2014 train and val sets.
+# The trainval set is intended for use only when training a single final model
+# for submission of results on the test set to the COCO evaluation server.
+
+import os
+import json
+
+# get path to directory where this script is
+script_dir = os.path.dirname(os.path.realpath(__file__))
+
+anno_dir_path = '%s/coco/annotations' % script_dir
+image_root = '%s/coco/images' % script_dir
+abs_image_root = os.path.abspath(image_root)
+out_coco_id_filename = '%s/coco2014_cocoid.trainval.txt' % script_dir
+filename_pattern = 'captions_%s2014.json'
+in_sets = ['train', 'val']
+out_set = 'trainval'
+path_pattern = '%s/%s' % (anno_dir_path, filename_pattern)
+
+out_data = {}
+for in_set in in_sets:
+    filename = path_pattern % in_set
+    print 'Loading input dataset from: %s' % filename
+    data = json.load(open(filename, 'r'))
+    for key, val in data.iteritems():
+        if type(val) == list:
+            if key not in out_data:
+                out_data[key] = []
+            out_data[key] += val
+        else:
+            if key not in out_data:
+                out_data[key] = val
+            assert out_data[key] == val
+filename = path_pattern % out_set
+print 'Dumping output dataset to: %s' % filename
+json.dump(out_data, open(filename, 'w'))
+
+out_ids = [str(im['id']) for im in out_data['images']]
+print 'Writing COCO IDs to: %s' % out_coco_id_filename
+with open(out_coco_id_filename, 'w') as coco_id_file:
+    coco_id_file.write('\n'.join(out_ids) + '\n')
+
+# make a trainval dir with symlinks to all train+val images
+out_dir = '%s/%s2014' % (image_root, out_set)
+os.makedirs(out_dir)
+print 'Writing image symlinks to: %s' % out_dir
+for im in out_data['images']:
+    filename = im['file_name']
+    set_name = None
+    for in_set in in_sets:
+        if in_set in filename:
+            set_name = in_set
+            break
+    assert set_name is not None
+    real_path = '%s/%s2014/%s' % (abs_image_root, set_name, filename)
+    link_path = '%s/%s' % (out_dir, filename)
+    os.symlink(real_path, link_path)