1 parent 7366ad9 · commit 9209365
Showing 53 changed files with 8,326 additions and 0 deletions.
@@ -0,0 +1,172 @@
import numpy as np
# import numpy  # only needed if the commented-out legacy block at the bottom is restored


def i2t_SCAN(sims, npts=None, return_ranks=False):
    """
    Images->Text (Image Annotation)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of image-caption similarities
    (only sims is used here; npts is ignored and reset from sims.shape[0])
    """
    npts = sims.shape[0]
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    for index in range(npts):
        inds = np.argsort(sims[index])[::-1]  # caption indices, most similar first
        # Rank of the best-ranked ground-truth caption (0-based position)
        rank = 1e20
        for i in range(5 * index, 5 * index + 5, 1):
            tmp = np.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
        top1[index] = inds[0]

    # Compute metrics: ranks are 0-based, so ranks < K counts top-K hits (Recall@K, in percent)
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    r20 = 100.0 * len(np.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(np.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(np.where(ranks < 100)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1  # 1-based median rank
    meanr = ranks.mean() + 1               # 1-based mean rank
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)


def t2i_SCAN(sims, npts=None, return_ranks=False):
    """
    Text->Images (Image Search)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of image-caption similarities
    (only sims is used here; npts is ignored and reset from sims.shape[0])
    """
    npts = sims.shape[0]
    ranks = np.zeros(5 * npts)
    top1 = np.zeros(5 * npts)

    # Transpose to (5N, N): one row of image scores per caption query
    sims = sims.T

    for index in range(npts):
        for i in range(5):
            inds = np.argsort(sims[5 * index + i])[::-1]  # image indices, most similar first
            # Rank of the single ground-truth image for this caption (0-based)
            ranks[5 * index + i] = np.where(inds == index)[0][0]
            top1[5 * index + i] = inds[0]

    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    r20 = 100.0 * len(np.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(np.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(np.where(ranks < 100)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)


'''def i2t(images, captions, npts=None, measure='cosine', return_ranks=False):
    """
    Images->Text (Image Annotation)
    Images: (5N, K) matrix of images
    Captions: (5N, K) matrix of captions
    """
    if npts is None:
        npts = int(images.shape[0] / 5)
    index_list = []
    ranks = numpy.zeros(npts)
    top1 = numpy.zeros(npts)
    for index in range(npts):
        # Get query image
        im = images[5 * index].reshape(1, images.shape[1])
        # Compute scores
        d = numpy.dot(im, captions.T).flatten()
        inds = numpy.argsort(d)[::-1]
        index_list.append(inds[0])
        # Score
        rank = 1e20
        for i in range(5 * index, 5 * index + 5, 1):
            tmp = numpy.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
        top1[index] = inds[0]
    # Compute metrics
    r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
    r20 = 100.0 * len(numpy.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(numpy.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(numpy.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(numpy.where(ranks < 100)[0]) / len(ranks)
    medr = numpy.floor(numpy.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)


def t2i(images, captions, npts=None, measure='cosine', return_ranks=False):
    """
    Text->Images (Image Search)
    Images: (5N, K) matrix of images
    Captions: (5N, K) matrix of captions
    """
    if npts is None:
        npts = int(images.shape[0] / 5)
    ims = numpy.array([images[i] for i in range(0, len(images), 5)])
    ranks = numpy.zeros(5 * npts)
    top1 = numpy.zeros(5 * npts)
    for index in range(npts):
        # Get query captions
        queries = captions[5 * index:5 * index + 5]
        # Compute scores
        d = numpy.dot(queries, ims.T)
        inds = numpy.zeros(d.shape)
        for i in range(len(inds)):
            inds[i] = numpy.argsort(d[i])[::-1]
            ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0]
            top1[5 * index + i] = inds[i][0]
    # Compute metrics
    r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
    r20 = 100.0 * len(numpy.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(numpy.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(numpy.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(numpy.where(ranks < 100)[0]) / len(ranks)
    medr = numpy.floor(numpy.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)'''
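

# Quick smoke test -- an illustrative sketch, not part of the original commit.
# It feeds a random similarity matrix for N images and 5N captions through both
# metrics, so the printed recalls should sit near chance level.
if __name__ == "__main__":
    N = 100
    rng = np.random.default_rng(0)
    sims = rng.standard_normal((N, 5 * N))  # (N images, 5N captions)
    (r1, r5, r10, *rest), (ranks, top1) = i2t_SCAN(sims, return_ranks=True)
    print(f"i2t  R@1={r1:.1f}  R@5={r5:.1f}  R@10={r10:.1f}")
    r1, r5, r10, *rest = t2i_SCAN(sims)
    print(f"t2i  R@1={r1:.1f}  R@5={r5:.1f}  R@10={r10:.1f}")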
Empty file.

@@ -0,0 +1,173 @@
from sacred import Experiment

ex = Experiment("METER")


def _loss_names(d):
    """Return the full task-loss weight dict, zeroed except for entries in d."""
    ret = {
        "itm": 0,
        "mlm": 0,
        "mpp": 0,
        "vqa": 0,
        "vcr": 0,
        "vcr_qar": 0,
        "nlvr2": 0,
        "irtr": 0,
        "contras": 0,
        "snli": 0,
    }
    ret.update(d)
    return ret
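

# For example, _loss_names({"irtr": 1}) returns the dict above with only
# "irtr" set to 1, so only the image-text retrieval objective is active
# while every other task weight stays 0.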


@ex.config
def config():
    exp_name = "finetune_irtr_f30k"
    seed = 0
    datasets = "f30k"
    loss_names = _loss_names({"irtr": 1})
    batch_size = 58  # desired effective batch size; the PL trainer accumulates gradients when the per-step batch is smaller
    margin = 0.2

    # Image setting
    # train_transform_keys = ["clip"]
    # val_transform_keys = ["clip"]
    image_size = 224
    patch_size = 32
    # draw_false_image = 1
    image_only = False

    # Text setting
    # vqav2_label_size = 3129
    max_text_len = 32
    tokenizer = "bert-base-uncased"
    vocab_size = 30522
    whole_word_masking = False  # note that whole_word_masking does not work for RoBERTa
    mlm_prob = 0.15
    # draw_false_text = 0

    # Transformer setting
    num_top_layer = 6
    input_image_embed_size = 1024
    input_text_embed_size = 768
    vit = "swin_base_patch4_window7_224_in22k"
    hidden_size = 768
    num_heads = 12
    num_layers = 6
    mlp_ratio = 4
    drop_rate = 0.1

    # Optimizer setting
    optim_type = "adamw"
    learning_rate = 1e-4
    lr_update = 10
    weight_decay = 0.01
    decay_power = 1
    max_epoch = 100
    max_steps = None
    warmup_steps = 10000
    end_lr = 0
    lr_mult_head = 5  # lr multiplier for the downstream heads
    lr_mult_cross_modal = 5  # lr multiplier for the cross-modal module

    # Downstream setting
    get_recall_metric = False

    # PL Trainer setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    checkpoint = '/data3/lihaoxuan/New_Time/TKDE/github/runs/i2t_freeze/epoch=68-step=172499-v1.ckpt'

    # The params below vary with the environment
    data_root = '/data1/lihaoxuan/orignal-datasets/'
    log_dir = "result"
    per_gpu_batchsize = 58  # set this manually per environment, e.g. per_gpu_batchsize=<n> on the command line
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16

    # SCAN
    direction = 'i2t'
    lambda_softmax = 9
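    # Note on batch sizing (a sketch of the usual convention, assumed rather
    # than shown in this file): the trainer derives its gradient-accumulation
    # factor from the effective and per-step batch sizes, roughly
    #   grad_steps = batch_size // (per_gpu_batchsize * num_gpus * num_nodes)
    #              = 58 // (58 * 1 * 1) = 1
    # so shrinking per_gpu_batchsize (or adding GPUs) changes only how the
    # effective batch of 58 is accumulated, not its size.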


@ex.named_config
def coco_config():
    exp_name = "finetune_irtr_coco"
    seed = 0
    datasets = "coco"
    loss_names = _loss_names({"irtr": 1})
    batch_size = 58  # desired effective batch size; the PL trainer accumulates gradients when the per-step batch is smaller
    margin = 0.2

    # Image setting
    # train_transform_keys = ["clip"]
    # val_transform_keys = ["clip"]
    image_size = 224
    patch_size = 32
    # draw_false_image = 1
    image_only = False

    # Text setting
    # vqav2_label_size = 3129
    max_text_len = 32
    tokenizer = "bert-base-uncased"
    vocab_size = 30522
    whole_word_masking = False  # note that whole_word_masking does not work for RoBERTa
    mlm_prob = 0.15
    # draw_false_text = 0

    # Transformer setting
    num_top_layer = 6
    input_image_embed_size = 1024
    input_text_embed_size = 768
    vit = "swin_base_patch4_window7_224_in22k"
    hidden_size = 768
    num_heads = 12
    num_layers = 6
    mlp_ratio = 4
    drop_rate = 0.1

    # Optimizer setting
    optim_type = "adamw"
    learning_rate = 1e-4
    lr_update = 10
    weight_decay = 0.01
    decay_power = 1
    max_epoch = 100
    max_steps = None
    warmup_steps = 10000
    end_lr = 0
    lr_mult_head = 5  # lr multiplier for the downstream heads
    lr_mult_cross_modal = 5  # lr multiplier for the cross-modal module

    # Downstream setting
    get_recall_metric = False

    # PL Trainer setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    checkpoint = '/data3/lihaoxuan/New_Time/TKDE/github/runs/i2t_freeze/last.ckpt'

    # The params below vary with the environment
    data_root = '/data1/lihaoxuan/orignal-datasets/'
    log_dir = "result"
    per_gpu_batchsize = 58  # set this manually per environment, e.g. per_gpu_batchsize=<n> on the command line
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16

    # SCAN
    direction = 'i2t'
    lambda_softmax = 9
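    # Usage sketch: Sacred applies a named config on top of the base config,
    # so a typical launch (assuming a run.py entry point, which this commit
    # does not show) would be:
    #   python run.py with coco_config per_gpu_batchsize=29 num_gpus=2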


@@ -0,0 +1,19 @@
from .vg_caption_datamodule import VisualGenomeCaptionDataModule
from .f30k_caption_karpathy_datamodule import F30KCaptionKarpathyDataModule
from .coco_caption_karpathy_datamodule import CocoCaptionKarpathyDataModule
from .conceptual_caption_datamodule import ConceptualCaptionDataModule
from .sbu_datamodule import SBUCaptionDataModule
from .vqav2_datamodule import VQAv2DataModule
from .nlvr2_datamodule import NLVR2DataModule
from .snli_datamodule import SNLIDataModule

# Registry mapping the config's dataset keys to their datamodule classes
_datamodules = {
    "vg": VisualGenomeCaptionDataModule,
    "f30k": F30KCaptionKarpathyDataModule,
    "coco": CocoCaptionKarpathyDataModule,
    "gcc": ConceptualCaptionDataModule,
    "sbu": SBUCaptionDataModule,
    "vqa": VQAv2DataModule,
    "nlvr2": NLVR2DataModule,
    "snli": SNLIDataModule,
}
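
# Hypothetical lookup sketch (the real call site lives elsewhere in the repo,
# and the constructor arguments here are assumptions):
#   dm_cls = _datamodules[_config["datasets"]]  # e.g. "coco" -> CocoCaptionKarpathyDataModule
#   dm = dm_cls(_config)                        # assuming BaseDataModule takes the config dict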

@@ -0,0 +1,19 @@
from ..datasets import CocoCaptionKarpathyDataset
from .datamodule_base import BaseDataModule


class CocoCaptionKarpathyDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return CocoCaptionKarpathyDataset

    @property
    def dataset_cls_no_false(self):
        return CocoCaptionKarpathyDataset

    @property
    def dataset_name(self):
        return "coco"
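
# A minimal sketch of how another dataset would plug into the same pattern --
# subclass BaseDataModule, expose the dataset class, and register the name in
# _datamodules. MyCaptionDataset and "mydata" are hypothetical placeholders.
#
# class MyCaptionDataModule(BaseDataModule):
#     @property
#     def dataset_cls(self):
#         return MyCaptionDataset
#
#     @property
#     def dataset_name(self):
#         return "mydata"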