-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
963 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import sys\n", | ||
"sys.path.append('utils/')\n", | ||
"from extract_classifiers import *\n", | ||
"from config import *\n", | ||
"from eval.captioner import *\n", | ||
"%matplotlib inline\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import matplotlib.image as mpimg\n", | ||
"\n", | ||
"coco_template = 'images/ipython_images/coco/COCO_val2014_%012d.jpg' " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Build the lexical feature extractors: one for MSCOCO images, one for ImageNet images.\n", | ||
"\n", | ||
"#These lexical features are trained with MSCOCO images. For images which include a held out class (e.g., \"zebra\") \n", | ||
"#only the held-out class is used as the label. All other class labels are given a label of \"-1\" (or ignore label.)\n", | ||
"#For images which do not include a held out class, multiple labels\n", | ||
"#are mined from visual descriptions (e.g., \"A young girl with a blue sweater\" --> labels: \"young\", \"girl\", \"blue\",\n", | ||
"#\"sweater\") \n", | ||
"coco_model = 'prototxts/train_classifiers_deploy.prototxt'\n", | ||
"coco_model_weights = 'snapshots/attributes_JJ100_NN300_VB100_coco_471_eightCluster_0223_iter_80000.caffemodel'\n", | ||
"\n", | ||
"#These lexical features are trained with MSCOCO images and > 600 ImageNet classes specifically chosen such that they do\n", | ||
"#not overlap with categories seen in MSCOCO. For all MSCOCO images, labels are mined from visual descriptions. For \n", | ||
"#ImageNet images, categories mined from MSCOCO descriptions are given a \"-1\" (ignore label). This is done because\n", | ||
"#in many ImageNet images classes like \"grass\" or \"yellow\" are present\n", | ||
"imagenet_model = 'prototxts/train_classifiers_deploy.imagenet.prototxt'\n", | ||
"imagenet_model_weights = 'snapshots/vgg_multilabel_FT_iter_100000.caffemodel'\n", | ||
"\n", | ||
"#Each extractor gets its own prototxt/weights pair; both are created with device=0 and feature_extract='probs'.\n", | ||
"coco_extractor = VisualFeatureExtractor(coco_model, coco_model_weights, device=0, feature_extract='probs')\n", | ||
"coco_extractor.build_image_processor()\n", | ||
"\n", | ||
"imagenet_extractor = VisualFeatureExtractor(imagenet_model, imagenet_model_weights, device=0, feature_extract='probs')\n", | ||
"imagenet_extractor.build_image_processor()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Extract features for coco and Imagenet images.\n", | ||
"\n", | ||
"def make_feature_dict(features, ims):\n", | ||
"    \"\"\"Return a dict mapping each entry of `ims` to the row of `features` at the same index.\"\"\"\n", | ||
"    return {im: features[row, :] for row, im in enumerate(ims)}\n", | ||
"\n", | ||
"#COCO validation images, addressed by numeric id through coco_template.\n", | ||
"coco_images = [coco_template % im_id for im_id in [380868, 356368, 279846, 531563]]\n", | ||
"coco_features = coco_extractor.extract_batch_features(coco_images)\n", | ||
"feature_dict_coco = make_feature_dict(coco_features, coco_images)\n", | ||
"\n", | ||
"#ImageNet images, addressed by path relative to images/ipython_images/.\n", | ||
"imagenet_images = ['images/ipython_images/' + im_id for im_id in ['otter/n02444819_13167.JPEG',\n", | ||
"                                                                  'otter/n02444819_10502.JPEG',\n", | ||
"                                                                  'alpaca/n02698473_518.JPEG',\n", | ||
"                                                                  'candelabra/n02947818_15145.JPEG',\n", | ||
"                                                                  'baobab/n12189987_4309.JPEG']]\n", | ||
"imagenet_features = imagenet_extractor.extract_batch_features(imagenet_images)\n", | ||
"feature_dict_imagenet = make_feature_dict(imagenet_features, imagenet_images)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Captioning image: 0/4\n", | ||
"4/4 done after word 12\n", | ||
"Captioning image: 0/4\n", | ||
"4/4 done after word 12" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"#Build dcc captioner for coco and Imagenet\n", | ||
"def build_captioner(proto, weights, vocab):\n", | ||
"    \"\"\"Create a Captioner from three file names.\n", | ||
"\n", | ||
"    `proto`, `weights` and `vocab` are appended to models_folder, weights_folder and\n", | ||
"    vocab_root respectively (module-level names that are not defined in this notebook).\n", | ||
"    \"\"\"\n", | ||
"    proto_path = models_folder + proto\n", | ||
"    weights_path = weights_folder + weights\n", | ||
"    vocab_path = vocab_root + vocab\n", | ||
"\n", | ||
"    return Captioner(proto_path, weights_path,\n", | ||
"                     sentence_generation_cont_in='cont_sentence',\n", | ||
"                     sentence_generation_sent_in='input_sentence',\n", | ||
"                     sentence_generation_feature_in=['image_features'],\n", | ||
"                     sentence_generation_out='predict',\n", | ||
"                     vocab_file=vocab_path,\n", | ||
"                     prev_word_restriction=True)\n", | ||
"\n", | ||
"#coco models: same prototxt and vocabulary, weights before and after word transfer\n", | ||
"\n", | ||
"language_prototxt = 'dcc_vgg.wtd.prototxt'\n", | ||
"vocab = 'vocabulary.txt'\n", | ||
"coco_no_transfer_weights = 'dcc_coco_rm1_vgg.471.solver.prototxt_iter_110000.caffemodel'\n", | ||
"coco_transfer_weights = 'dcc_coco_rm1_vgg.471.solver.prototxt_iter_110000.transfer_words_coco1.txt_closeness_embedding.caffemodel'\n", | ||
"\n", | ||
"coco_no_transfer_captioner = build_captioner(language_prototxt, coco_no_transfer_weights, vocab)\n", | ||
"gen_captions_no_transfer = coco_no_transfer_captioner.caption_images(\n", | ||
"    feature_dict_coco, feature_dict_coco.keys(), batch_size=1000)\n", | ||
"\n", | ||
"coco_transfer_captioner = build_captioner(language_prototxt, coco_transfer_weights, vocab)\n", | ||
"gen_captions_transfer = coco_transfer_captioner.caption_images(\n", | ||
"    feature_dict_coco, feature_dict_coco.keys(), batch_size=1000)\n", | ||
"\n", | ||
"#ImageNet models: 80k-vocabulary prototxt, weights before and after word transfer\n", | ||
"\n", | ||
"language_prototxt = 'dcc_vgg.80k.wtd.imagenet.prototxt'\n", | ||
"vocab = 'yt_coco_surface_80k_vocab.txt'\n", | ||
"imagenet_no_transfer_weights = 'vgg_feats.vgg_multilabel_FT_iter_100000_imagenetSentences_iter_110000.caffemodel'\n", | ||
"imagenet_transfer_weights = 'vgg_feats.vgg_multilabel_FT_iter_100000_imagenetSentences_iter_110000.transfer_words_imagenet.txt_closeness_embedding.caffemodel'\n", | ||
"\n", | ||
"imagenet_no_transfer_captioner = build_captioner(language_prototxt, imagenet_no_transfer_weights, vocab)\n", | ||
"gen_captions_no_transfer_imagenet = imagenet_no_transfer_captioner.caption_images(\n", | ||
"    feature_dict_imagenet, feature_dict_imagenet.keys(), batch_size=1000)\n", | ||
"\n", | ||
"imagenet_transfer_captioner = build_captioner(language_prototxt, imagenet_transfer_weights, vocab)\n", | ||
"gen_captions_transfer_imagenet = imagenet_transfer_captioner.caption_images(\n", | ||
"    feature_dict_imagenet, feature_dict_imagenet.keys(), batch_size=1000)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Show every COCO image in its own figure; the title holds the caption entries looked up\n", | ||
"#by image path in gen_captions_no_transfer and gen_captions_transfer.\n", | ||
"for img in coco_images:\n", | ||
"    img_read = mpimg.imread(img)\n", | ||
"    plt.figure()\n", | ||
"    plt.imshow(img_read)\n", | ||
"    plt.title(\"No transfer: %s\\nDCC Transfer: %s\" % (gen_captions_no_transfer[img], gen_captions_transfer[img]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Show every ImageNet image in its own figure; the title holds the caption entries looked up\n", | ||
"#by image path in gen_captions_no_transfer_imagenet and gen_captions_transfer_imagenet.\n", | ||
"for img in imagenet_images:\n", | ||
"    img_read = mpimg.imread(img)\n", | ||
"    plt.figure()\n", | ||
"    plt.imshow(img_read)\n", | ||
"    plt.title(\"No transfer: %s\\nDCC Transfer: %s\" % (gen_captions_no_transfer_imagenet[img],\n", | ||
"                                                       gen_captions_transfer_imagenet[img]))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 2", | ||
"language": "python", | ||
"name": "python2" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
Code for: | ||
|
||
Hendricks, Lisa Anne, et al. "Deep Compositional Captioning: Describing Novel Object Categories without Paired Training Data." CVPR (2016). | ||
|
||
License: BSD 2-Clause license | ||
|
||
You should be able to replicate my results using this code. I am still actively adding to it, so if something is unclear email me. | ||
|
||
To use my code, please make sure you have the following: | ||
|
||
Lisa Anne Hendricks' recurrent branch of Caffe installed: "https://github.com/LisaAnne/lisa-caffe-public/tree/lisa_recurrent". My code will probably work well with other Caffe versions, but I have tested on this version. | ||
|
||
To begin, please copy "utils/config.example.py" to "utils/config.py" and update to reflect the paths in your machine. For example, if you have already downloaded the MSCOCO images and do not want to redownload them, please update "coco_iamges_root" to point to where the MSCOCO images are stored on your machine. If you do not have the MSCOCO images, we will download them in the next step. | ||
|
||
Next, please run: ./setup.sh | ||
|
||
This will download data needed for DCC (MSCOCO), pretrained models and other tools. If you do not already have the MSCOCO annotations, images, and eval tools installed on your machine, you can use this script to download those tools using the following flags: | ||
--download_mscoco_annotations: downloads mscoco annotations | ||
--download_mscoco_images: downloads mscoco images | ||
--download_mscoco_tools: downloads mscoco eval tools | ||
|
||
|
||
|
||
Now that everything is set up, we can evaluate the DCC model. | ||
|
||
1. The first step in DCC is to train lexical models which map images to a set of visual concepts (e.g., "sheep", "grass", "stand"). | ||
- "attributes_JJ100_NN300_VB100_allObjects_coco_vgg_0111_iter_80000.caffemodel": image model trained with MSCOCO images | ||
- "attributes_JJ100_NN300_VB100_coco_471_eightCluster_0223_iter_80000.caffemodel": image model trained with MSCOCO images (Do not use multiple labels for held out classes. We mine MSCOCO labels from descriptions, and therefore images can have multiple labels. However, for the eight held out concepts, we just train with a single label corresponding to the held out class -- e.g., "bus" instead of "bus", "street", "building". We do this to ensure that the visual model does not exploit co-occurrences) | ||
- "attributes_JJ100_NN300_VB100_clusterEight_imagenet_vgg_0112_iter_80000.caffemodel": image model trained with MSCOCO images EXCEPT for objects which are held out during paired training. These categories are trained with ImageNet data. | ||
- "vgg_multilabel_FT_iter_100000.caffemodel": image model trained on all MSCOCO images and over 600 ImageNet objects not in MSCOCO | ||
|
||
The code to train these models will be coming soon, but you can use all my pretrained models. | ||
|
||
2. The next step in DCC is to train language models. | ||
- "mrnn.direct_iter_110000.caffemodel": language model trained on MSCOCO text | ||
- "mrnn.lm.direct_surf_lr0.01_iter_120000.caffemodel": language model trained on WebCorpus text | ||
- "mrnn.lm.direct_imtextyt_lr0.01_iter_120000.caffemodel": language model trained on Caption text | ||
|
||
The code to train these models will be coming soon, but you can use all my pretrained models. | ||
|
||
3. The final training step is to train the caption model. You can find the prototxts to train the caption models in "prototxts". To speed up training, I pre-extract image features. Please look at "extract_features.sh" to see how to extract features. Train the caption models using one of the following bash scripts: | ||
- "run_dcc_coco_baseline_vgg.sh": model with pair supervision | ||
- "run_dcc_coco_rm1_vgg.sh": direct transfer model with in domain text pre-training and in domain image pre-training | ||
- "run_dcc_coco_rm1_vgg.delta.sh": delta transfer model with in domain text pre-training and in domain image pre-training | ||
- "run_dcc_imagenet_rm1_vgg.sh": direct transfer model with in domain text pre-training and out of domain image pre-training | ||
- "run_dcc_imagenet_rm1_vgg.im2txt.sh": direct transfer model with out of domain text pre-training with Caption txt and out of domain image pre-training | ||
- "run_dcc_imagenet_rm1_vgg.sh": direct transfer model with out of domain text pre-training with WebCorpus and out of domain image pre-training | ||
- "run_dcc_imagenet_sentences_vgg.sh": direct transfer model for describing ImageNet objects | ||
|
||
Note that I include all my caption models in "snapshots", so you do not have to retrain these models yourself! | ||
|
||
4. Novel word transfer. Please look at transfer.sh to see how to transfer weights for the direct transfer model and transfer_delta.sh to see how to transfer weights for the delta_transfer model. | ||
|
||
5. Evaluation on MSCOCO. Look at generate_coco.sh. | ||
|
||
6. Generating descriptions for ImageNet images. Look at generate_imagenet.sh. | ||
|
||
Please contact [email protected] if you have any issues. Happy captioning! | ||
|
||
Updated 6/18/2016 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash | ||
|
||
#coco | ||
model='prototxts/train_classifiers_deploy.prototxt' | ||
model_weights='snapshots/attributes_JJ100_NN300_VB100_coco_471_eightCluster_0223_iter_80000.caffemodel' | ||
image_dim=224 | ||
|
||
# Run feature extraction with the image model, weights and input size set above. | ||
# Expansions are double-quoted so a path containing spaces or glob characters stays one argument. | ||
python dcc.py --image_model "$model" \ | ||
--model_weights "$model_weights" \ | ||
--batch_size 16 \ | ||
--image_dim "$image_dim" \ | ||
--extract_features |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env bash | ||
|
||
#coco | ||
|
||
#This will generate results when using in domain data for transfer using the direct transfer method. | ||
|
||
#These numbers are a bit better than what is reported in the paper | ||
#model for direct transfer | ||
deploy_words=dcc_vgg.wtd.prototxt | ||
model_name=dcc_coco_rm1_vgg.471.solver.prototxt_iter_110000.transfer_words_coco1.txt_closeness_embedding.caffemodel | ||
#model for delta transfer | ||
#deploy_words=dcc_vgg.delta.wtd.prototxt | ||
#model_name=dcc_coco_rm1_vgg.delta_freezeLM_iter_50000.transfer_words_coco1.txt_closeness_embedding_delta_1.caffemodel | ||
vocab=vocabulary.txt | ||
precomputed_feats=vgg_feats.attributes_JJ100_NN300_VB100_coco_471_eightCluster_0223_iter_80000.caffemodel.val_val.h5 | ||
image_list=coco2014_cocoid.val_val.txt | ||
split=val_val | ||
|
||
#To generate result using out of domain for transfer: | ||
|
||
#For models trained with out of domain text (vocabulary is larger than coco vocab) you will want to use the following deploy and vocab | ||
#deploy_words=dcc_vgg.80k.wtd.prototxt | ||
#vocab=yt_coco_surface_80k_vocab.txt | ||
#You will also need to use a different model to train with out of domain text. | ||
#model_name=dcc_oodLM_rm1_vgg.im2txt.471.solver_0409_iter_110000.transfer_words_coco1.txt_closeness_embedding.caffemodel | ||
#model_name=dcc_oodLM_rm1_vgg.surf.solver_iter_110000.transfer_words_coco1.txt_closeness_embedding.caffemodel | ||
|
||
#To generate results when image model is trained with out of domain image data | ||
#precomputed_feats=vgg_feats.attributes_JJ100_NN300_VB100_clusterEight_imagenet_vgg_0112_iter_80000.val_val.h5 | ||
|
||
|
||
echo $deploy_words | ||
echo $model_name | ||
echo $vocab | ||
echo $precomputed_feats | ||
echo $image_list | ||
|
||
# Generate MSCOCO captions with the deploy prototxt, weights, vocabulary, features, image list and split chosen above. | ||
# Expansions are double-quoted so each value is passed as exactly one argument. | ||
python dcc.py --language_model "$deploy_words" \ | ||
--model_weights "$model_name" \ | ||
--vocab "$vocab" \ | ||
--precomputed_features "$precomputed_feats" \ | ||
--image_list "$image_list" \ | ||
--split "$split" \ | ||
--generate_coco |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env bash | ||
|
||
deploy_words=dcc_vgg.80k.wtd.imagenet.prototxt | ||
model_name=vgg_feats.vgg_multilabel_FT_iter_100000_imagenetSentences_iter_110000.transfer_words_imagenet.txt_closeness_embedding.caffemodel | ||
vocab=yt_coco_surface_80k_vocab.txt | ||
precomputed_feats=vgg_feats.vgg_multilabel_FT_iter_100000.caffemodel.imagenet_ims_test.h5 | ||
#image_list=test_imagenet_images.txt | ||
image_list=gecko_test_list.txt | ||
language_feature='predict' | ||
|
||
# Generate ImageNet captions with the deploy prototxt, weights, vocabulary, features, image list and language feature chosen above. | ||
# Expansions are double-quoted so each value is passed as exactly one argument. | ||
python dcc.py --language_model "$deploy_words" \ | ||
--model_weights "$model_name" \ | ||
--vocab "$vocab" \ | ||
--precomputed_features "$precomputed_feats" \ | ||
--image_list "$image_list" \ | ||
--language_feature "$language_feature" \ | ||
--generate_imagenet |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Prepend utils/ to the Python module search path. Double quotes are required so the existing | ||
# value is expanded; the ${VAR:+...} form adds the ':' separator only when PYTHONPATH is already set. | ||
export PYTHONPATH="utils/${PYTHONPATH:+:$PYTHONPATH}" | ||
|
||
caffe/build/tools/caffe train -solver prototxts/dcc_coco_baseline_vgg.solver.prototxt -weights trained_models/language_models/mrnn.direct_iter_110000.caffemodel -gpu 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Prepend utils/ to the Python module search path. Double quotes are required so the existing | ||
# value is expanded; the ${VAR:+...} form adds the ':' separator only when PYTHONPATH is already set. | ||
export PYTHONPATH="utils/${PYTHONPATH:+:$PYTHONPATH}" | ||
|
||
caffe/build/tools/caffe train -solver prototxts/dcc_coco_rm1_vgg.solver.freezeLM.prototxt -weights trained_models/language_models/mrnn.direct_iter_110000.caffemodel -gpu 0 | ||
|
||
caffe/build/tools/caffe train -solver prototxts/dcc_coco_rm1_vgg.solver.deltaLM.prototxt -weights snapshots/dcc_coco_rm1_vgg.freezeLM.prototxt_iter_50000.caffemodel -gpu 0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Prepend utils/ to the Python module search path. Double quotes are required so the existing | ||
# value is expanded; the ${VAR:+...} form adds the ':' separator only when PYTHONPATH is already set. | ||
export PYTHONPATH="utils/${PYTHONPATH:+:$PYTHONPATH}" | ||
|
||
caffe/python/train.py --solver prototxts/dcc_coco_rm1_vgg.solver.prototxt --weights snapshots/mrnn.direct_iter_110000.caffemodel --gpu 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Prepend utils/ to the Python module search path. Double quotes are required so the existing | ||
# value is expanded; the ${VAR:+...} form adds the ':' separator only when PYTHONPATH is already set. | ||
export PYTHONPATH="utils/${PYTHONPATH:+:$PYTHONPATH}" | ||
|
||
caffe/build/tools/caffe train -solver prototxts/dcc_oodLM_rm1_vgg.im2txt.solver.prototxt -weights trained_models/language_models/mrnn.lm.direct_imtextyt_lr0.01_iter_120000.caffemodel -gpu 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Prepend utils/ to the Python module search path. Double quotes are required so the existing | ||
# value is expanded; the ${VAR:+...} form adds the ':' separator only when PYTHONPATH is already set. | ||
export PYTHONPATH="utils/${PYTHONPATH:+:$PYTHONPATH}" | ||
|
||
caffe/build/tools/caffe train -solver prototxts/dcc_oodLM_rm1_vgg.surf.solver.prototxt -weights trained_models/language_models/mrnn.lm.direct_surf_lr0.01_iter_120000.caffemodel -gpu 0 |
Oops, something went wrong.