forked from Vision-CAIR/MiniGPT-4
Merge pull request Vision-CAIR#406 from junchen14/main
add evaluation script
Showing 27 changed files with 2,051 additions and 88 deletions.
@@ -0,0 +1,79 @@
model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 500
  end_sym: "</s>"
  low_resource: False
  prompt_template: '[INST] {} [/INST]'
  llama_model: ""
  ckpt: ""
  lora_r: 64
  lora_alpha: 16


datasets:
  cc_sbu_align:
    vis_processor:
      train:
        name: "blip2_image_eval"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"

evaluation_datasets:
  refcoco:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  refcocog:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  refcoco+:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  gqa:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  okvqa:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  vizwiz:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  iconvqa:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  vsr:
    eval_file_path: cambridgeltl/vsr_zeroshot
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 10
  hm:
    eval_file_path: /path/to/eval/annotation/path
    img_path: /path/to/eval/image/path
    max_new_tokens: 20
    batch_size: 100

run:
  task: image_text_pretrain
  name: minigptv2_evaluation
  save_path: /path/to/save/folder_path
@@ -0,0 +1,104 @@
## Evaluation Instructions for MiniGPT-v2

### Data preparation

Download the images and annotations for each evaluation dataset:

Image source | Download path
--- | :---:
OKVQA | <a href="https://drive.google.com/drive/folders/1jxIgAhtaLu_YqnZEl8Ym11f7LhX3nptN?usp=sharing">annotations</a> <a href="http://images.cocodataset.org/zips/train2017.zip">images</a>
GQA | <a href="https://drive.google.com/drive/folders/1-dF-cgFwstutS4qq2D9CFQTDS0UTmIft?usp=drive_link">annotations</a> <a href="https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip">images</a>
Hateful Memes | <a href="https://github.com/faizanahemad/facebook-hateful-memes">images and annotations</a>
IconQA | <a href="https://iconqa.github.io/#download">images and annotations</a>
VizWiz | <a href="https://vizwiz.org/tasks-and-datasets/vqa/">images and annotations</a>
RefCOCO | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip">annotations</a>
RefCOCO+ | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip">annotations</a>
RefCOCOg | <a href="https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip">annotations</a>

### Evaluation dataset structure

```
${MINIGPTv2_EVALUATION_DATASET}
├── gqa
│   ├── test_balanced_questions.json
│   ├── testdev_balanced_questions.json
│   └── gqa_images
├── hateful_meme
│   ├── hm_images
│   └── dev.jsonl
├── iconvqa
│   ├── iconvqa_images
│   └── choose_text_val.json
├── vizwiz
│   ├── vizwiz_images
│   └── val.json
├── vsr
│   └── vsr_images
├── okvqa
│   ├── okvqa_test_split.json
│   ├── mscoco_val2014_annotations_clean.json
│   └── OpenEnded_mscoco_val2014_questions_clean.json
├── refcoco
│   ├── instances.json
│   ├── refs(google).p
│   └── refs(unc).p
├── refcoco+
│   ├── instances.json
│   └── refs(unc).p
├── refcocog
│   ├── instances.json
│   ├── refs(google).p
│   └── refs(umd).p
...
```
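
For instance, the RefCOCO-family annotations from the table above can be fetched and unpacked into this layout roughly as follows. This is only a sketch: it assumes `wget` and `unzip` are installed, that `$MINIGPTv2_EVALUATION_DATASET` is set, and that each archive unpacks into its own subfolder.

```
# Hypothetical sketch: fetch RefCOCO-family annotations into the expected layout.
mkdir -p ${MINIGPTv2_EVALUATION_DATASET}
cd ${MINIGPTv2_EVALUATION_DATASET}
for name in refcoco refcoco+ refcocog; do
    # annotation archives listed in the download table above
    wget "https://bvisionweb1.cs.unc.edu/licheng/referit/data/${name}.zip"
    unzip "${name}.zip"
done
```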

### Environment setup

Add the MiniGPT-4 repository to your Python path so the `minigpt4` package can be imported:

```
export PYTHONPATH=$PYTHONPATH:/path/to/directory/of/MiniGPT-4
```

### Config file setup

In [minigpt4/eval_configs/minigptv2_benchmark_evaluation.yaml](../minigpt4/eval_configs/minigptv2_benchmark_evaluation.yaml):

- Set **llama_model** to the path of the LLaMA model.
- Set **ckpt** to the path of the pretrained MiniGPT-v2 checkpoint.
- Set **eval_file_path** to the path of the annotation files for each evaluation dataset.
- Set **img_path** to the image folder for each evaluation dataset.
- Set **save_path** to the folder where evaluation results are saved.
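
For example, the edited fields might look like this (all paths below are hypothetical placeholders, not shipped defaults):

```
model:
  llama_model: "/checkpoints/Llama-2-7b-chat-hf"    # hypothetical local path
  ckpt: "/checkpoints/minigptv2_checkpoint.pth"     # hypothetical local path

evaluation_datasets:
  refcoco:
    eval_file_path: /data/minigptv2_eval/annotations  # parent folder holding refcoco/refcoco_val.json etc.
    img_path: /data/minigptv2_eval/images
    max_new_tokens: 20
    batch_size: 10

run:
  save_path: /results/minigptv2_eval
```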

### Start evaluating RefCOCO, RefCOCO+, RefCOCOg

port=port_number
cfg_path=/path/to/eval_configs/minigptv2_benchmark_evaluation.yaml

Dataset names:

| refcoco | refcoco+ | refcocog |
| ------- | -------- | -------- |

```
torchrun --master-port ${port} --nproc_per_node 1 eval_ref.py \
 --cfg-path ${cfg_path} --dataset refcoco,refcoco+,refcocog --resample
```

The `--resample` flag re-queries predictions that do not match the expected bounding-box format (see the retry loop in `eval_ref.py`). Per-split predictions are written to **save_path** as `{dataset}_{split}.json`, and accuracy at IoU > 0.5 is printed for each split.


### Start evaluating visual question answering

port=port_number
cfg_path=/path/to/eval_configs/minigptv2_benchmark_evaluation.yaml

Dataset names:

| okvqa | vizwiz | iconvqa | gqa | vsr | hm |
| ------- | -------- | -------- | -------- | -------- | -------- |

```
torchrun --master-port ${port} --nproc_per_node 1 eval_vqa.py \
 --cfg-path ${cfg_path} --dataset okvqa,vizwiz,iconvqa,gqa,vsr,hm
```
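
Any comma-separated subset of these names can be passed to `--dataset`. For example, to evaluate OKVQA alone (the port number here is an arbitrary placeholder):

```
torchrun --master-port 29500 --nproc_per_node 1 eval_vqa.py \
 --cfg-path ${cfg_path} --dataset okvqa
```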

@@ -0,0 +1,25 @@
#!/bin/bash --login

export PYTHONPATH=$PYTHONPATH:$(pwd)

# Placeholders: point these at your config, checkpoint, experiment name, and data.
cfg_path=eval_configs/minigpt4_llama2_eval.yaml
CKPT=YOUR_CKPT_PATH
NAME=EXP_NAME
IMG_PATH=YOUR_IMG_PATH
EVAL_FILE_PATH=YOUR_EVAL_FILE_PATH

# Referring-expression evaluation (commented out):
# torchrun --nproc_per_node 1 eval_scripts/eval_ref.py --name ${NAME} \
#     --cfg-path ${cfg_path} \
#     --ckpt ${CKPT} --dataset refcoco,refcoco+,refcocog --lora_r 64 --lora_alpha 16 \
#     --batch_size 64 --max_new_tokens 20 --resample --img_path ${IMG_PATH} --eval_file_path ${EVAL_FILE_PATH}

# Full VQA evaluation (commented out):
# torchrun --nproc_per_node 1 eval_scripts/eval_vqa.py --name ${NAME} \
#     --cfg-path ${cfg_path} \
#     --ckpt ${CKPT} --split val,test --dataset okvqa,vizwiz,iconqa,gqa,vsr,hm --lora_r 64 --lora_alpha 16 \
#     --batch_size 32 --max_new_tokens 20 --resample

# OKVQA only:
torchrun --nproc_per_node 1 eval_scripts/eval_vqa.py --name ${NAME} \
    --cfg-path ${cfg_path} \
    --ckpt ${CKPT} --split val,test --dataset okvqa --lora_r 64 --lora_alpha 16 \
    --batch_size 32 --max_new_tokens 20
@@ -0,0 +1,128 @@
import os
import re
import json
import argparse
from collections import defaultdict
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from minigpt4.common.config import Config
from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser, computeIoU
from minigpt4.conversation.conversation import CONV_VISION_minigptv2

from minigpt4.datasets.datasets.coco_caption import RefCOCOEvalData


def list_of_str(arg):
    return list(map(str, arg.split(',')))


parser = eval_parser()
parser.add_argument("--dataset", type=list_of_str, default='refcoco', help="dataset to evaluate")
parser.add_argument("--res", type=float, default=100.0, help="coordinate resolution used in refcoco")
parser.add_argument("--resample", action='store_true', help="re-query outputs that do not match the bbox format")
args = parser.parse_args()

cfg = Config(args)

eval_dict = {'refcoco': ['val', 'testA', 'testB'],
             'refcoco+': ['val', 'testA', 'testB'],
             'refcocog': ['val', 'test']}


model, vis_processor = init_model(args)
model.eval()
CONV_VISION = CONV_VISION_minigptv2
conv_temp = CONV_VISION.copy()
conv_temp.system = ""

save_path = cfg.run_cfg.save_path


for dataset in args.dataset:
    for split in eval_dict[dataset]:

        eval_file_path = cfg.evaluation_datasets_cfg[dataset]["eval_file_path"]
        img_path = cfg.evaluation_datasets_cfg[dataset]["img_path"]
        batch_size = cfg.evaluation_datasets_cfg[dataset]["batch_size"]
        max_new_tokens = cfg.evaluation_datasets_cfg[dataset]["max_new_tokens"]

        with open(os.path.join(eval_file_path, f"{dataset}/{dataset}_{split}.json"), 'r') as f:
            refcoco = json.load(f)

        data = RefCOCOEvalData(refcoco, vis_processor, img_path)
        eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
        minigpt4_predict = defaultdict(list)
        resamples = []

        for images, questions, img_ids in tqdm(eval_dataloader):
            texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
            answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
            for answer, img_id, question in zip(answers, img_ids, questions):
                answer = answer.replace("<unk>", "").replace(" ", "").strip()
                # expected output format: {<x1><y1><x2><y2>} with coordinates on a [0, res] grid
                pattern = r'\{<\d{1,3}><\d{1,3}><\d{1,3}><\d{1,3}>\}'
                if re.match(pattern, answer):
                    minigpt4_predict[img_id].append(answer)
                else:
                    resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] give me the location of', '').strip()]})

        if args.resample:
            # re-query malformed outputs; on the fifth round (i == 4) remaining
            # answers are accepted unconditionally
            for i in range(20):
                data = RefCOCOEvalData(resamples, vis_processor, img_path)
                resamples = []
                eval_dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)
                for images, questions, img_ids in tqdm(eval_dataloader):
                    texts = prepare_texts(questions, conv_temp)  # wrap the texts with conversation template
                    answers = model.generate(images, texts, max_new_tokens=max_new_tokens, do_sample=False)
                    for answer, img_id, question in zip(answers, img_ids, questions):
                        answer = answer.replace("<unk>", "").replace(" ", "").strip()
                        pattern = r'\{<\d{1,3}><\d{1,3}><\d{1,3}><\d{1,3}>\}'
                        if re.match(pattern, answer) or i == 4:
                            minigpt4_predict[img_id].append(answer)
                        else:
                            resamples.append({'img_id': img_id, 'sents': [question.replace('[refer] give me the location of', '').strip()]})

                if len(resamples) == 0:
                    break

        file_save_path = os.path.join(save_path, f"{dataset}_{split}.json")
        with open(file_save_path, 'w') as f:
            json.dump(minigpt4_predict, f)

        count = 0
        total = len(refcoco)
        res = args.res
        refcoco_dict = {}
        for item in refcoco:
            refcoco_dict[item['img_id']] = item
        for img_id in refcoco_dict:
            item = refcoco_dict[img_id]
            bbox = item['bbox']
            outputs = minigpt4_predict[img_id]
            for output in outputs:
                try:
                    integers = re.findall(r'\d+', output)
                    pred_bbox = [int(num) for num in integers]
                    height = item['height']
                    width = item['width']
                    # rescale predicted coordinates from the [0, res] grid to pixels
                    pred_bbox[0] = pred_bbox[0] / res * width
                    pred_bbox[1] = pred_bbox[1] / res * height
                    pred_bbox[2] = pred_bbox[2] / res * width
                    pred_bbox[3] = pred_bbox[3] / res * height

                    # ground truth is stored as (x, y, w, h); convert to corner format
                    gt_bbox = [0, 0, 0, 0]
                    gt_bbox[0] = bbox[0]
                    gt_bbox[1] = bbox[1]
                    gt_bbox[2] = bbox[0] + bbox[2]
                    gt_bbox[3] = bbox[1] + bbox[3]

                    iou_score = computeIoU(pred_bbox, gt_bbox)
                    if iou_score > 0.5:
                        count += 1
                except Exception:
                    continue

        print(f'{dataset} {split}:', count / total * 100, flush=True)
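
To make the scoring arithmetic concrete, here is a self-contained sketch. The `iou` helper below is a local stand-in written for illustration; the script itself imports `computeIoU` from `minigpt4.common.eval_utils`.

```python
# Standalone sketch of the accuracy check above; iou() is a local
# re-implementation used only for this example.
def iou(a, b):
    # boxes are (x1, y1, x2, y2)
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter)

# A prediction like "{<10><20><60><80>}" on a 640x480 image with res=100
# rescales from the [0, 100] grid to pixel coordinates:
pred = [10 / 100 * 640, 20 / 100 * 480, 60 / 100 * 640, 80 / 100 * 480]
gt_xywh = [70, 90, 310, 300]  # annotation stores (x, y, w, h)
gt = [gt_xywh[0], gt_xywh[1], gt_xywh[0] + gt_xywh[2], gt_xywh[1] + gt_xywh[3]]
print(iou(pred, gt) > 0.5)  # counts toward accuracy iff IoU exceeds 0.5
```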