Fix type errors in CLIP model builders; switch dataclass to namedtuple

facebookresearch · sophiazhi · Jun 10, 2022 · Jun 13, 2022 · Jun 13, 2022 · Jun 14, 2022
commit f2580294b3421d1aeced475b552261d2a8f40e65
diff --git a/torchmultimodal/architectures/clip.py b/torchmultimodal/architectures/clip.py
@@ -5,8 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import warnings
-from collections import namedtuple
-from typing import Dict
+from typing import Dict, NamedTuple
 
 import torch
 import torch.nn.functional as F
@@ -37,12 +36,11 @@ def __init__(
     ):
         super().__init__()
         self.encoders = encoders
-        self.clip_output = namedtuple("CLIPOutput", encoders.keys())
 
     def forward(
         self,
         modalities: Dict[str, torch.Tensor],
-    ) -> Dict[str, torch.Tensor]:
+    ) -> NamedTuple:
         embeddings = {}
         for key, encoder in self.encoders.items():
             if key not in modalities:
@@ -52,4 +50,5 @@ def forward(
             if key not in self.encoders:
                 warnings.warn(f"Missing encoder for extra input {key}")
 
-        return self.clip_output(**embeddings)
+        clip_output = NamedTuple("CLIPOutput", **{k: torch.Tensor for k in self.encoders.keys()})  # type: ignore
+        return clip_output(**embeddings)
diff --git a/torchmultimodal/models/clip.py b/torchmultimodal/models/clip.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import torch
 from torchmultimodal.architectures.clip import CLIPArchitecture
 from torchmultimodal.modules.encoders.clip_resnet_encoder import ResNetForCLIP
 from torchmultimodal.modules.encoders.clip_text_encoder import CLIPTextEncoder
@@ -22,7 +23,9 @@ def clip_vit_b16():
         num_classes=512,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=512)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 def clip_vit_b32():
@@ -36,7 +39,9 @@ def clip_vit_b32():
         num_classes=512,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=512)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 def clip_vit_l14():
@@ -50,7 +55,9 @@ def clip_vit_l14():
         num_classes=768,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=768, width=768, heads=12)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 def clip_rn50():
@@ -61,7 +68,9 @@ def clip_rn50():
         width=2048,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=1024)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 def clip_rn101():
@@ -72,7 +81,9 @@ def clip_rn101():
         width=2048,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=1024)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 # Note: these models require larger image sizes
@@ -85,7 +96,9 @@ def clip_rn50x4():
         width=2560,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=1024, width=640, heads=12)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 def clip_rn50x16():
@@ -97,7 +110,9 @@ def clip_rn50x16():
         width=3072,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=768, width=768, heads=12)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 def clip_rn50x64():
@@ -109,7 +124,9 @@ def clip_rn50x64():
         width=4096,
     )
     text_encoder = CLIPTextEncoder(embedding_dim=1024, width=1024, heads=16)
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 # Note: these models use torchvision's ResNet
@@ -120,7 +137,9 @@ def clip_rn50_tv():
         num_classes=1024,
     )
     text_encoder = CLIPTextEncoder()
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )
 
 
 def clip_rn101_tv():
@@ -130,4 +149,6 @@ def clip_rn101_tv():
         num_classes=512,
     )
     text_encoder = CLIPTextEncoder()
-    return CLIPArchitecture(vision_encoder, text_encoder)
+    return CLIPArchitecture(
+        encoders=torch.nn.ModuleDict({"vision": vision_encoder, "text": text_encoder})
+    )