add vcr

PhoenixZ810 · Jul 9, 2024 · de800cf · de800cf
1 parent 71abe68
commit de800cf
Show file tree

Hide file tree

Showing 8 changed files with 335 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -60,6 +60,7 @@ English | [<a href="/docs/zh-CN/README_zh-CN.md">简体中文</a>] | [<a href="/
 | [**RealWorldQA**](https://x.ai/blog/grok-1.5v)            | RealWorldQA | MCQ                                          | [**POPE**](https://github.com/AoiDragon/POPE) | POPE                                           | Y/N                                            |
 | [**Core-MM**](https://github.com/core-mm/core-mm)-          | CORE_MM | VQA                                               | [**MMT-Bench**](https://mmt-bench.github.io)                 | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI]                | MCQ  |
 | [**MLLMGuard**](https://github.com/Carol-gutianle/MLLMGuard) - | MLLMGuard_DS | VQA | [**AesBench**](https://github.com/yipoh/AesBench) | AesBench_[VAL/TEST] | MCQ |
+| [**VCR-wiki**](https://huggingface.co/datasets/vcr-org/) | VCR_[EN/ZH]_[EASY/HARD][_ALL/_500/_100] | VQA | |  | |
 
 **\*** We only provide a subset of the evaluation results, since some VLMs do not yield reasonable results under the zero-shot setting
 

diff --git a/docs/en/Development.md b/docs/en/Development.md
@@ -19,6 +19,7 @@ Currently, we organize a benchmark as one single TSV file. During inference, the
 | COCO_VAL               | ✅     | ✅     |            |          |      |                         | ✅      |          |             |       |
 | OCRVQA_[TEST/TESTCORE] | ✅     | ✅     |            | ✅        |      |                         | ✅      |          |             |       |
 | TextVQA_VAL            | ✅     | ✅     |            | ✅        |      |                         | ✅      |          |             |       |
+| VCR_[EN/ZH]_[EASY/HARD][_ALL/_500/_100]            | ✅     | ✅     |            | ✅        |      |                         | ✅      |          |             |       |
 
 <div align="center"><b>Table 1. TSV fields of supported datasets.</b></div>
 

diff --git a/docs/ja/README_ja.md b/docs/ja/README_ja.md
@@ -48,6 +48,7 @@ PS: 日本語の README には最新のアップデートがすべて含まれ
 | [**RealWorldQA**](https://x.ai/blog/grok-1.5v)            | RealWorldQA | MCQ                                          | [**POPE**](https://github.com/AoiDragon/POPE) | POPE                                           | Y/N                                            |
 | [**Core-MM**](https://github.com/core-mm/core-mm)-          | CORE_MM | VQA                                               | [**MMT-Bench**](https://mmt-bench.github.io)                 | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI]                | MCQ  |
 | [**MLLMGuard**](https://github.com/Carol-gutianle/MLLMGuard) - | MLLMGuard_DS | VQA | [**AesBench**](https://github.com/yipoh/AesBench) | AesBench_[VAL/TEST] | MCQ |
+| [**VCR-wiki**](https://huggingface.co/datasets/vcr-org/) | VCR_[EN/ZH]_[EASY/HARD][_ALL/_500/_100] | VQA | |  | |
 
 **\*** ゼロショット設定で合理的な結果を出せないVLMの一部の評価結果のみを提供しています
 

diff --git a/docs/zh-CN/Development_zh-CN.md b/docs/zh-CN/Development_zh-CN.md
@@ -19,6 +19,7 @@
 | COCO_VAL               | ✅     | ✅     |            |          |      |                         | ✅      |          |             |       |
 | OCRVQA_[TEST/TESTCORE] | ✅     | ✅     |            | ✅        |      |                         | ✅      |          |             |       |
 | TextVQA_VAL            | ✅     | ✅     |            | ✅        |      |                         | ✅      |          |             |       |
+| VCR_[EN/ZH]_[EASY/HARD][_ALL/_500/_100]            | ✅     | ✅     |            | ✅        |      |                         | ✅      |          |             |       |
 
 <div align="center"><b>表 1. 支持的数据集的 TSV 字段。</b></div>
 

diff --git a/docs/zh-CN/README_zh-CN.md b/docs/zh-CN/README_zh-CN.md
@@ -57,6 +57,7 @@
 | [**RealWorldQA**](https://x.ai/blog/grok-1.5v)            | RealWorldQA | MCQ                                          | [**POPE**](https://github.com/AoiDragon/POPE) | POPE                                           | Y/N                                            |
 | [**Core-MM**](https://github.com/core-mm/core-mm)-          | CORE_MM | VQA                                               | [**MMT-Bench**](https://mmt-bench.github.io)                 | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI]                | MCQ      |
 | [**MLLMGuard**](https://github.com/Carol-gutianle/MLLMGuard) - | MLLMGuard_DS | VQA | [**AesBench**](https://github.com/yipoh/AesBench) | AesBench_[VAL/TEST] | MCQ |
+| [**VCR-wiki**](https://huggingface.co/datasets/vcr-org/) | VCR_[EN/ZH]_[EASY/HARD][_ALL/_500/_100] | VQA | |  | |
 
 **\*** 我们只提供了部分模型上的测试结果，剩余模型无法在 zero-shot 设定下测试出合理的精度
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,13 +1,15 @@
 einops
-gradio==4.15.0
-huggingface_hub
+evaluate
+gradio
+huggingface-hub
 matplotlib
-numpy>=1.23.4
+nltk
+numpy
 omegaconf
 openai==1.3.5
 opencv-python>=4.4.0.46
 openpyxl
-pandas>=1.5.3
+pandas
 pillow
 portalocker
 protobuf
@@ -17,11 +19,12 @@ requests
 rich
 seaborn
 sentencepiece
+setuptools
+spacy
 sty
 tabulate
 tiktoken
 timeout-decorator
-torch>=2.0.1
 tqdm
 transformers
 typing_extensions==4.7.1

diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
@@ -4,14 +4,14 @@
 from .image_caption import ImageCaptionDataset
 from .image_yorn import ImageYORNDataset
 from .image_mcq import ImageMCQDataset, MMMUDataset, CustomMCQDataset
-from .image_vqa import ImageVQADataset, OCRBench, MathVista, LLaVABench, MMVet, CustomVQADataset
+from .image_vqa import ImageVQADataset, OCRBench, MathVista, LLaVABench, MMVet, VCRDataset, CustomVQADataset
 from .mmbench_video import MMBenchVideo
 from .utils import build_judge, extract_answer_from_item, prefetch_answer, DEBUG_MESSAGE
 from ..smp import *
 
 DATASET_CLASSES = [
     ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, MMMUDataset,
-    CustomMCQDataset, ImageVQADataset, OCRBench, MathVista, LLaVABench, MMVet,
+    CustomMCQDataset, ImageVQADataset, OCRBench, MathVista, LLaVABench, MMVet, VCRDataset,
     CustomVQADataset, MMBenchVideo
 ]
 
@@ -27,7 +27,7 @@ def build_dataset(dataset_name, **kwargs):
         return MMBenchVideo(dataset_name, **kwargs)
     datasets = [
         ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset,
-        MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet
+        MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, VCRDataset,
     ]
     for cls in datasets:
         if dataset_name in cls.supported_datasets():
@@ -55,7 +55,7 @@ def build_dataset(dataset_name, **kwargs):
 
 __all__ = [
     'MMBenchVideo', 'ImageYORNDataset', 'ImageMCQDataset', 'MMMUDataset',
-    'ImageCaptionDataset', 'ImageVQADataset', 'OCRBench', 'MathVista', 'LLaVABench', 'MMVet',
+    'ImageCaptionDataset', 'ImageVQADataset', 'OCRBench', 'MathVista', 'LLaVABench', 'MMVet', 'VCRDataset',
     'CustomMCQDataset', 'CustomVQADataset', 'build_dataset', 'img_root_map',
     'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
 ]