
Commit

add code, pic, requirements.txt
LuminosityX authored Jul 12, 2023
1 parent 7366ad9 commit 9209365
Showing 53 changed files with 8,326 additions and 0 deletions.
549 changes: 549 additions & 0 deletions data.py

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions evaluation.py
@@ -0,0 +1,172 @@
import numpy as np
#import numpy


def i2t_SCAN(sims, npts=None, return_ranks=False):
    """
    Images->Text (Image Annotation)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of image-to-caption similarities
    """
    # npts argument is ignored; it is derived from sims directly
    npts = sims.shape[0]
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    for index in range(npts):
        inds = np.argsort(sims[index])[::-1]
        # Score: rank of the best-ranked ground-truth caption among the 5
        rank = 1e20
        for i in range(5 * index, 5 * index + 5, 1):
            tmp = np.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
        top1[index] = inds[0]

    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)

    r20 = 100.0 * len(np.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(np.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(np.where(ranks < 100)[0]) / len(ranks)

    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)

def t2i_SCAN(sims, npts=None, return_ranks=False):
    """
    Text->Images (Image Search)
    Images: (N, n_region, d) matrix of images
    Captions: (5N, max_n_word, d) matrix of captions
    CapLens: (5N) array of caption lengths
    sims: (N, 5N) matrix of image-to-caption similarities
    """
    # npts argument is ignored; it is derived from sims directly
    npts = sims.shape[0]
    ranks = np.zeros(5 * npts)
    top1 = np.zeros(5 * npts)

    # --> (5N(caption), N(image))
    sims = sims.T

    for index in range(npts):
        for i in range(5):
            inds = np.argsort(sims[5 * index + i])[::-1]
            ranks[5 * index + i] = np.where(inds == index)[0][0]
            top1[5 * index + i] = inds[0]

    # Compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)

    r20 = 100.0 * len(np.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(np.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(np.where(ranks < 100)[0]) / len(ranks)

    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)

'''def i2t(images, captions, npts=None, measure='cosine', return_ranks=False):
    """
    Images->Text (Image Annotation)
    Images: (5N, K) matrix of images
    Captions: (5N, K) matrix of captions
    """
    if npts is None:
        npts = int(images.shape[0] / 5)
    index_list = []
    ranks = numpy.zeros(npts)
    top1 = numpy.zeros(npts)
    for index in range(npts):
        # Get query image
        im = images[5 * index].reshape(1, images.shape[1])
        # Compute scores
        d = numpy.dot(im, captions.T).flatten()
        inds = numpy.argsort(d)[::-1]
        index_list.append(inds[0])
        # Score
        rank = 1e20
        for i in range(5 * index, 5 * index + 5, 1):
            tmp = numpy.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
        top1[index] = inds[0]
    # Compute metrics
    r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
    r20 = 100.0 * len(numpy.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(numpy.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(numpy.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(numpy.where(ranks < 100)[0]) / len(ranks)
    medr = numpy.floor(numpy.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)


def t2i(images, captions, npts=None, measure='cosine', return_ranks=False):
    """
    Text->Images (Image Search)
    Images: (5N, K) matrix of images
    Captions: (5N, K) matrix of captions
    """
    if npts is None:
        npts = int(images.shape[0] / 5)
    ims = numpy.array([images[i] for i in range(0, len(images), 5)])
    ranks = numpy.zeros(5 * npts)
    top1 = numpy.zeros(5 * npts)
    for index in range(npts):
        # Get query captions
        queries = captions[5 * index:5 * index + 5]
        # Compute scores
        d = numpy.dot(queries, ims.T)
        inds = numpy.zeros(d.shape)
        for i in range(len(inds)):
            inds[i] = numpy.argsort(d[i])[::-1]
            ranks[5 * index + i] = numpy.where(inds[i] == index)[0][0]
            top1[5 * index + i] = inds[i][0]
    # Compute metrics
    r1 = 100.0 * len(numpy.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(numpy.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(numpy.where(ranks < 10)[0]) / len(ranks)
    r20 = 100.0 * len(numpy.where(ranks < 20)[0]) / len(ranks)
    r50 = 100.0 * len(numpy.where(ranks < 50)[0]) / len(ranks)
    r70 = 100.0 * len(numpy.where(ranks < 70)[0]) / len(ranks)
    r100 = 100.0 * len(numpy.where(ranks < 100)[0]) / len(ranks)
    medr = numpy.floor(numpy.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr), (ranks, top1)
    else:
        return (r1, r5, r10, r20, r50, r70, r100, medr, meanr)'''
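A minimal usage sketch, not part of the commit, showing how the two metric functions above can be called on a dummy similarity matrix with 5 captions per image; the module name in the import simply mirrors the file name evaluation.py:

import numpy as np
from evaluation import i2t_SCAN, t2i_SCAN

N = 100                               # number of images
sims = np.random.rand(N, 5 * N)       # (N images) x (5N captions) similarity scores
i2t_metrics = i2t_SCAN(sims)          # (r1, r5, r10, r20, r50, r70, r100, medr, meanr)
t2i_metrics = t2i_SCAN(sims)
print("i2t R@1/R@5/R@10:", i2t_metrics[:3])
print("t2i R@1/R@5/R@10:", t2i_metrics[:3])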
Binary file added framework.png
Empty file added meter/__init__.py
173 changes: 173 additions & 0 deletions meter/config.py
@@ -0,0 +1,173 @@
from sacred import Experiment

ex = Experiment("METER")


def _loss_names(d):
    ret = {
        "itm": 0,
        "mlm": 0,
        "mpp": 0,
        "vqa": 0,
        "vcr": 0,
        "vcr_qar": 0,
        "nlvr2": 0,
        "irtr": 0,
        "contras": 0,
        "snli": 0,
    }
    ret.update(d)
    return ret


@ex.config
def config():
    exp_name = "finetune_irtr_f30k"
    seed = 0
    datasets = "f30k"
    loss_names = _loss_names({"irtr": 1})
    batch_size = 58  # desired effective batch size; the PL trainer accumulates gradients when the per-step batch is smaller.
    margin = 0.2

    # Image setting
    # train_transform_keys = ["clip"]
    # val_transform_keys = ["clip"]
    image_size = 224
    patch_size = 32
    # draw_false_image = 1
    image_only = False

    # Text Setting
    # vqav2_label_size = 3129
    max_text_len = 32
    tokenizer = "bert-base-uncased"
    vocab_size = 30522
    whole_word_masking = False  # note that whole_word_masking does not work for RoBERTa
    mlm_prob = 0.15
    # draw_false_text = 0

    # Transformer Setting
    num_top_layer = 6
    input_image_embed_size = 1024
    input_text_embed_size = 768
    vit = "swin_base_patch4_window7_224_in22k"
    hidden_size = 768
    num_heads = 12
    num_layers = 6
    mlp_ratio = 4
    drop_rate = 0.1

    # Optimizer Setting
    optim_type = "adamw"
    learning_rate = 1e-4
    lr_update = 10
    weight_decay = 0.01
    decay_power = 1
    max_epoch = 100
    max_steps = None
    warmup_steps = 10000
    end_lr = 0
    lr_mult_head = 5  # lr multiplier for downstream heads
    lr_mult_cross_modal = 5  # lr multiplier for the cross-modal module

    # Downstream Setting
    get_recall_metric = False

    # PL Trainer Setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    checkpoint = '/data3/lihaoxuan/New_Time/TKDE/github/runs/i2t_freeze/epoch=68-step=172499-v1.ckpt'

    # the params below vary with the environment
    data_root = '/data1/lihaoxuan/orignal-datasets/'
    log_dir = "result"
    per_gpu_batchsize = 58  # set this manually for your hardware, e.g. per_gpu_batchsize=<N>
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16

    # SCAN
    direction = 'i2t'
    lambda_softmax = 9


@ex.named_config
def coco_config():
    exp_name = "finetune_irtr_coco"
    seed = 0
    datasets = "coco"
    loss_names = _loss_names({"irtr": 1})
    batch_size = 58  # desired effective batch size; the PL trainer accumulates gradients when the per-step batch is smaller.
    margin = 0.2

    # Image setting
    # train_transform_keys = ["clip"]
    # val_transform_keys = ["clip"]
    image_size = 224
    patch_size = 32
    # draw_false_image = 1
    image_only = False

    # Text Setting
    # vqav2_label_size = 3129
    max_text_len = 32
    tokenizer = "bert-base-uncased"
    vocab_size = 30522
    whole_word_masking = False  # note that whole_word_masking does not work for RoBERTa
    mlm_prob = 0.15
    # draw_false_text = 0

    # Transformer Setting
    num_top_layer = 6
    input_image_embed_size = 1024
    input_text_embed_size = 768
    vit = "swin_base_patch4_window7_224_in22k"
    hidden_size = 768
    num_heads = 12
    num_layers = 6
    mlp_ratio = 4
    drop_rate = 0.1

    # Optimizer Setting
    optim_type = "adamw"
    learning_rate = 1e-4
    lr_update = 10
    weight_decay = 0.01
    decay_power = 1
    max_epoch = 100
    max_steps = None
    warmup_steps = 10000
    end_lr = 0
    lr_mult_head = 5  # lr multiplier for downstream heads
    lr_mult_cross_modal = 5  # lr multiplier for the cross-modal module

    # Downstream Setting
    get_recall_metric = False

    # PL Trainer Setting
    resume_from = None
    fast_dev_run = False
    val_check_interval = 1.0
    test_only = False
    checkpoint = '/data3/lihaoxuan/New_Time/TKDE/github/runs/i2t_freeze/last.ckpt'

    # the params below vary with the environment
    data_root = '/data1/lihaoxuan/orignal-datasets/'
    log_dir = "result"
    per_gpu_batchsize = 58  # set this manually for your hardware, e.g. per_gpu_batchsize=<N>
    num_gpus = 1
    num_nodes = 1
    load_path = ""
    num_workers = 8
    precision = 16

    # SCAN
    direction = 'i2t'
    lambda_softmax = 9
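For orientation, a minimal sketch (an assumption, not part of this commit) of how a sacred config like the one above is typically consumed; the run.py file name and the captured main function are hypothetical:

from meter.config import ex

@ex.automain
def main(_config):
    # sacred injects every value defined in config() as a plain dict
    print(_config["exp_name"], _config["vit"], _config["direction"])

# From the command line, the named config and per-run overrides can be applied, e.g.:
#   python run.py with coco_config per_gpu_batchsize=32 num_gpus=2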



19 changes: 19 additions & 0 deletions meter/datamodules/__init__.py
@@ -0,0 +1,19 @@
from .vg_caption_datamodule import VisualGenomeCaptionDataModule
from .f30k_caption_karpathy_datamodule import F30KCaptionKarpathyDataModule
from .coco_caption_karpathy_datamodule import CocoCaptionKarpathyDataModule
from .conceptual_caption_datamodule import ConceptualCaptionDataModule
from .sbu_datamodule import SBUCaptionDataModule
from .vqav2_datamodule import VQAv2DataModule
from .nlvr2_datamodule import NLVR2DataModule
from .snli_datamodule import SNLIDataModule

_datamodules = {
    "vg": VisualGenomeCaptionDataModule,
    "f30k": F30KCaptionKarpathyDataModule,
    "coco": CocoCaptionKarpathyDataModule,
    "gcc": ConceptualCaptionDataModule,
    "sbu": SBUCaptionDataModule,
    "vqa": VQAv2DataModule,
    "nlvr2": NLVR2DataModule,
    "snli": SNLIDataModule,
}
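A short illustrative sketch (an assumption, not part of this commit): the datasets string from meter/config.py is expected to index this registry to select a LightningDataModule class; the constructor call is omitted here because its signature lives in the separate BaseDataModule.

from meter.datamodules import _datamodules

dm_cls = _datamodules["f30k"]   # e.g. datasets = "f30k" in the default config
print(dm_cls.__name__)          # -> "F30KCaptionKarpathyDataModule"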
19 changes: 19 additions & 0 deletions meter/datamodules/coco_caption_karpathy_datamodule.py
@@ -0,0 +1,19 @@
from ..datasets import CocoCaptionKarpathyDataset
from .datamodule_base import BaseDataModule


class CocoCaptionKarpathyDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return CocoCaptionKarpathyDataset

    @property
    def dataset_cls_no_false(self):
        return CocoCaptionKarpathyDataset

    @property
    def dataset_name(self):
        return "coco"
