-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathbase_model.py
executable file
·232 lines (219 loc) · 9.58 KB
/
base_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
"""
This code is developed based on Jin-Hwa Kim's repository (Bilinear Attention Networks - https://github.com/jnhwkim/ban-vqa) by Xuan B. Nguyen
"""
import torch
import torch.nn as nn
from attention import BiAttention, StackedAttention
from language_model import WordEmbedding, QuestionEmbedding
from classifier import SimpleClassifier
from fc import FCNet
from bc import BCNet
from counting import Counter
from utils import tfidf_loading
from simple_cnn import SimpleCNN
from auto_encoder import Auto_Encoder_Model
import clip
# Create BAN model
class BAN_Model(nn.Module):
    """Bilinear Attention Network that fuses image embeddings with a question.

    The visual embedding comes from whichever encoders are enabled on ``args``
    (``maml``, ``autoencoder``, ``clip``). ``forward`` returns the fused
    feature; call ``classify`` on it to obtain the classifier output.
    """

    def __init__(self, dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier, counter, args, maml_v_emb, ae_v_emb, clip_v_emb=None):
        """Store the sub-modules and the run configuration.

        Args:
            dataset: dataset object, kept on the model as ``self.dataset``.
            w_emb: word-embedding module, called as ``w_emb(q)``.
            q_emb: question-embedding module exposing ``forward_all``.
            v_att: attention module exposing ``forward_all(v_emb, q_emb)``.
            b_net: iterable of per-glimpse modules exposing
                ``forward_with_weights``.
            q_prj: iterable of per-glimpse projection modules.
            c_prj: iterable of counter projections; only registered when
                ``counter`` is not ``None``.
            classifier: module applied by ``classify``.
            counter: optional counter module (may be ``None``).
            args: configuration namespace; reads ``op``, ``gamma``, ``maml``,
                ``autoencoder`` and (optionally) ``clip``.
            maml_v_emb: image encoder used when ``args.maml`` is set.
            ae_v_emb: auto-encoder used when ``args.autoencoder`` is set.
            clip_v_emb: CLIP-style model exposing ``encode_image``, used when
                ``args.clip`` is set.
        """
        super(BAN_Model, self).__init__()
        self.args = args
        self.dataset = dataset
        self.op = args.op
        self.glimpse = args.gamma
        self.w_emb = w_emb
        self.q_emb = q_emb
        self.v_att = v_att
        self.b_net = nn.ModuleList(b_net)
        self.q_prj = nn.ModuleList(q_prj)
        if counter is not None:  # counter projections exist only when a counter is used
            self.c_prj = nn.ModuleList(c_prj)
        self.classifier = classifier
        self.counter = counter
        self.drop = nn.Dropout(.5)
        self.tanh = nn.Tanh()
        if args.maml:
            self.maml_v_emb = maml_v_emb
        if args.autoencoder:
            self.ae_v_emb = ae_v_emb
            # Maps the flattened auto-encoder code (16384 values) to 64 features.
            self.convert = nn.Linear(16384, 64)
        # ``clip`` is optional on ``args`` so configurations that predate it still work.
        if getattr(args, 'clip', False):
            self.clip_v_emb = clip_v_emb

    def forward(self, v, q):
        """Compute the fused question/image feature.

        Args:
            v: indexable collection of image inputs. ``v[0]`` is fed to the
                MAML encoder, ``v[1]`` to the auto-encoder and ``v[2]`` to the
                CLIP image encoder; only entries of enabled encoders are read.
            q: question tokens passed to the word embedding
                (presumably [batch_size, seq_length] - confirm with caller).

        Returns:
            The fused feature ``q_emb.sum(1)``. When ``args.autoencoder`` is
            set, a tuple ``(fused_feature, reconstruction)`` where the second
            item is the auto-encoder's ``reconstruct_pass`` output.

        Raises:
            ValueError: if none of ``maml``, ``autoencoder`` or ``clip`` is
                enabled, since there is no visual embedding to attend over.
        """
        use_maml = self.args.maml
        use_ae = self.args.autoencoder
        use_clip = getattr(self.args, 'clip', False)
        if not (use_maml or use_ae or use_clip):
            raise ValueError(
                'BAN_Model needs at least one visual encoder: enable '
                'args.maml, args.autoencoder or args.clip')

        # get visual feature
        decoder = None
        if use_maml:
            maml_v_emb = self.maml_v_emb(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if use_ae:
            encoder = self.ae_v_emb.forward_pass(v[1])
            decoder = self.ae_v_emb.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        if use_maml and use_ae:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)
        # CLIP, when enabled, takes precedence over the MAML-based embedding.
        if use_clip:
            clip_v_emb = self.clip_v_emb.encode_image(v[2]).unsqueeze(1)
            v_emb = clip_v_emb
        if use_clip and use_ae:
            v_emb = torch.cat((clip_v_emb, ae_v_emb), 2)

        # get textual feature
        w_emb = self.w_emb(q)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]

        # Attention: one residual update of the question embedding per glimpse
        att, _ = self.v_att.forward_all(v_emb, q_emb)  # b x g x v x q
        for g in range(self.glimpse):
            b_emb = self.b_net[g].forward_with_weights(v_emb, q_emb, att[:, g, :, :])  # b x l x h
            q_emb = self.q_prj[g](b_emb.unsqueeze(1)) + q_emb

        if use_ae:
            return q_emb.sum(1), decoder
        return q_emb.sum(1)

    def classify(self, input_feats):
        """Apply the classifier head to features produced by ``forward``."""
        return self.classifier(input_feats)
# Create SAN model
class SAN_Model(nn.Module):
    """Stacked Attention Network that attends over image embeddings with a question.

    The visual embedding comes from the MAML encoder, the auto-encoder, or the
    concatenation of both, depending on ``args.maml`` / ``args.autoencoder``.
    """

    def __init__(self, w_emb, q_emb, v_att, classifier, args, maml_v_emb, ae_v_emb):
        """Store the sub-modules and the run configuration.

        Args:
            w_emb: word-embedding module, called as ``w_emb(q)``.
            q_emb: question-embedding module, called as ``q_emb(w_emb)``.
            v_att: attention module, called as ``v_att(v_emb, q_emb)``.
            classifier: module applied by ``classify``.
            args: configuration namespace; reads ``maml`` and ``autoencoder``.
            maml_v_emb: image encoder used when ``args.maml`` is set.
            ae_v_emb: auto-encoder used when ``args.autoencoder`` is set.
        """
        super(SAN_Model, self).__init__()
        self.args = args
        self.w_emb = w_emb
        self.q_emb = q_emb
        self.v_att = v_att
        self.classifier = classifier
        if args.maml:
            self.maml_v_emb = maml_v_emb
        if args.autoencoder:
            self.ae_v_emb = ae_v_emb
            # Maps the flattened auto-encoder code (16384 values) to 64 features.
            self.convert = nn.Linear(16384, 64)

    def forward(self, v, q):
        """Compute the attended feature for a batch of images and questions.

        Args:
            v: indexable collection of image inputs. ``v[0]`` is fed to the
                MAML encoder and ``v[1]`` to the auto-encoder; only entries of
                enabled encoders are read.
            q: question tokens passed to the word embedding
                (presumably [batch_size, seq_length] - confirm with caller).

        Returns:
            The output of ``v_att``. When ``args.autoencoder`` is set, a tuple
            ``(attention_output, reconstruction)`` where the second item is
            the auto-encoder's ``reconstruct_pass`` output.

        Raises:
            ValueError: if neither ``maml`` nor ``autoencoder`` is enabled,
                since there is no visual embedding to attend over.
        """
        use_maml = self.args.maml
        use_ae = self.args.autoencoder
        if not (use_maml or use_ae):
            raise ValueError(
                'SAN_Model needs at least one visual encoder: enable '
                'args.maml or args.autoencoder')

        # get visual feature
        decoder = None
        if use_maml:
            maml_v_emb = self.maml_v_emb(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if use_ae:
            encoder = self.ae_v_emb.forward_pass(v[1])
            decoder = self.ae_v_emb.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        if use_maml and use_ae:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)

        # get textual feature
        w_emb = self.w_emb(q)
        q_emb = self.q_emb(w_emb)  # [batch, q_dim], return final hidden state

        # Attention
        att = self.v_att(v_emb, q_emb)
        if use_ae:
            return att, decoder
        return att

    def classify(self, input_feats):
        """Apply the classifier head to features produced by ``forward``."""
        return self.classifier(input_feats)
# Build BAN model
def build_BAN(dataset, args, priotize_using_counter=False):
    """Assemble a ``BAN_Model`` from the dataset description and run arguments.

    Args:
        dataset: provides ``dictionary.ntoken``, ``v_dim`` and
            ``num_ans_candidates``.
        args: configuration namespace. Reads ``op``, ``num_hid``, ``rnn``,
            ``gamma``, ``RAD_dir``/``SLAKE_dir``, ``maml``, ``autoencoder``,
            the corresponding ``*_model_path`` / CNN options, and, when
            ``clip`` is enabled, ``clip_vision_encoder``, ``clip_org`` and
            ``clip_path``.
        priotize_using_counter: when not ``None`` this value alone decides
            whether a counter module is built; ``args.use_counter`` is only
            consulted when ``None`` is passed explicitly.

    Returns:
        A ``BAN_Model`` wired with every visual encoder that was enabled.

    Raises:
        ValueError: if MAML or the auto-encoder is enabled but neither
            ``args.RAD_dir`` nor ``args.SLAKE_dir`` is set.
    """
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid, 1, False, .0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)

    # Directory holding the pre-trained weights: RAD takes precedence over SLAKE.
    path_dir = args.RAD_dir or args.SLAKE_dir
    if (args.maml or args.autoencoder) and not path_dir:
        raise ValueError(
            'args.RAD_dir or args.SLAKE_dir must be set to load the '
            'pre-trained MAML / auto-encoder weights')

    # Encoders that are not enabled stay None and are ignored by BAN_Model.
    maml_v_emb = None
    ae_v_emb = None
    clip_v_emb = None

    # build and load pre-trained MAML model
    if args.maml:
        weight_path = path_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)

    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = path_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s'%(weight_path))
        # map_location lets GPU-saved checkpoints load on CPU-only hosts;
        # load_state_dict copies the values into the module's own parameters.
        ae_v_emb.load_state_dict(torch.load(weight_path, map_location='cpu'))

    # build CLIP image encoder, optionally replacing the stock weights
    if getattr(args, 'clip', False):
        pubmedclip, _ = clip.load(args.clip_vision_encoder, jit=False)
        if not args.clip_org:
            checkpoint = torch.load(args.clip_path, map_location='cpu')
            pubmedclip.load_state_dict(checkpoint['state_dict'])
        clip_v_emb = pubmedclip.float()

    # Loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)

    # Optional module: counter for BAN
    use_counter = args.use_counter if priotize_using_counter is None else priotize_using_counter
    if use_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None

    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for _ in range(args.gamma):
        b_net.append(BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], '', .2))
        if use_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], 'ReLU', .0))

    # init classifier
    classifier = SimpleClassifier(
        args.num_hid, args.num_hid * 2, dataset.num_ans_candidates, args)

    # construct VQA model and return; every encoder that was built is passed
    # through so that any enabled combination (including CLIP alone) works.
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier, counter, args,
                     maml_v_emb, ae_v_emb, clip_v_emb)
# Build SAN model
def build_SAN(dataset, args):
    """Assemble a ``SAN_Model`` from the dataset description and run arguments.

    Args:
        dataset: provides ``dictionary.ntoken``, ``v_dim`` and
            ``num_ans_candidates``.
        args: configuration namespace. Reads ``op``, ``num_hid``, ``rnn``,
            ``num_stacks``, ``dropout``, ``maml``, ``autoencoder`` and, when
            an encoder is enabled, ``RAD_dir`` (falling back to ``SLAKE_dir``
            if present) plus the corresponding ``*_model_path`` options.

    Returns:
        A ``SAN_Model`` wired with every visual encoder that was enabled.

    Raises:
        ValueError: if MAML or the auto-encoder is enabled but no weights
            directory is configured.
    """
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid, 1, False, 0.0, args.rnn)
    v_att = StackedAttention(args.num_stacks, dataset.v_dim, args.num_hid, args.num_hid, dataset.num_ans_candidates,
                             args.dropout)

    # Encoders that are not enabled stay None and are ignored by SAN_Model.
    maml_v_emb = None
    ae_v_emb = None

    if args.maml or args.autoencoder:
        # Same lookup order as build_BAN: RAD directory first, then SLAKE.
        path_dir = getattr(args, 'RAD_dir', None) or getattr(args, 'SLAKE_dir', None)
        if not path_dir:
            raise ValueError(
                'args.RAD_dir or args.SLAKE_dir must be set to load the '
                'pre-trained MAML / auto-encoder weights')

    # build and load pre-trained MAML model
    if args.maml:
        weight_path = path_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)

    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = path_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s'%(weight_path))
        # map_location lets GPU-saved checkpoints load on CPU-only hosts;
        # load_state_dict copies the values into the module's own parameters.
        ae_v_emb.load_state_dict(torch.load(weight_path, map_location='cpu'))

    # Loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)

    # init classifier
    classifier = SimpleClassifier(
        args.num_hid, 2 * args.num_hid, dataset.num_ans_candidates, args)

    # construct VQA model and return
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb, ae_v_emb)