Skip to content

Commit

Permalink
add CaiT, new vision transformer out of facebook AI, complete with la…
Browse files Browse the repository at this point in the history
…yerscale, talking heads, and cls -> patch cross attention
  • Loading branch information
lucidrains committed Apr 1, 2021
1 parent 506fcf8 commit 9ef8da4
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 8 deletions.
45 changes: 38 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,36 @@ img = torch.randn(1, 3, 256, 256)
preds = v(img) # (1, 1000)
```

## CaiT

<a href="https://arxiv.org/abs/2103.17239">This paper</a> also notes the difficulty of training vision transformers at greater depths and proposes two solutions. First, it proposes a learned per-channel multiplication of the output of each residual block (LayerScale). Second, it proposes to have the patches attend only to one another for most of the network, and to let the CLS token attend to the patches only in the last few layers.

They also add <a href="https://github.com/lucidrains/x-transformers#talking-heads-attention">Talking Heads</a>, noting improvements.

You can use this scheme as follows:

```python
import torch
from vit_pytorch.cait import CaiT

v = CaiT(
image_size = 256,
patch_size = 32,
num_classes = 1000,
dim = 1024,
depth = 12, # depth of transformer for patch to patch attention only
cls_depth = 2, # depth of cross attention of CLS tokens to patch
heads = 16,
mlp_dim = 2048,
dropout = 0.1,
emb_dropout = 0.1
)

img = torch.randn(1, 3, 256, 256)

preds = v(img) # (1, 1000)
```

## Token-to-Token ViT

<img src="./images/t2t.png" width="400px"></img>
Expand All @@ -164,7 +194,8 @@ v = T2TViT(
)

img = torch.randn(1, 3, 224, 224)
v(img) # (1, 1000)

preds = v(img) # (1, 1000)
```

## Cross ViT
Expand All @@ -177,7 +208,7 @@ v(img) # (1, 1000)
import torch
from vit_pytorch.cross_vit import CrossViT

model = CrossViT(
v = CrossViT(
image_size = 256,
num_classes = 1000,
depth = 4, # number of multi-scale encoding blocks
Expand All @@ -199,7 +230,7 @@ model = CrossViT(

img = torch.randn(1, 3, 256, 256)

pred = model(img) # (1, 1000)
pred = v(img) # (1, 1000)
```

## PiT
Expand All @@ -212,7 +243,7 @@ pred = model(img) # (1, 1000)
import torch
from vit_pytorch.pit import PiT

p = PiT(
v = PiT(
image_size = 224,
patch_size = 14,
dim = 256,
Expand All @@ -228,7 +259,7 @@ p = PiT(

img = torch.randn(1, 3, 224, 224)

preds = p(img) # (1, 1000)
preds = v(img) # (1, 1000)
```

## CvT
Expand All @@ -241,7 +272,7 @@ preds = p(img) # (1, 1000)
import torch
from vit_pytorch.cvt import CvT

model = CvT(
v = CvT(
num_classes = 1000,
s1_emb_dim = 64, # stage 1 - dimension
s1_emb_kernel = 7, # stage 1 - conv kernel
Expand Down Expand Up @@ -272,7 +303,7 @@ model = CvT(

img = torch.randn(1, 3, 224, 224)

pred = model(img) # (1, 1000)
pred = v(img) # (1, 1000)
```

## Masked Patch Prediction
Expand Down
Binary file added images/cait.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
name = 'vit-pytorch',
packages = find_packages(exclude=['examples']),
version = '0.12.0',
version = '0.14.1',
license='MIT',
description = 'Vision Transformer (ViT) - Pytorch',
author = 'Phil Wang',
Expand Down
148 changes: 148 additions & 0 deletions vit_pytorch/cait.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import torch
from torch import nn, einsum
import torch.nn.functional as F

from einops import rearrange, repeat
from einops.layers.torch import Rearrange

# helpers

def exists(val):
    """Return True when ``val`` is anything other than ``None``."""
    return val is not None

# classes

class LayerScale(nn.Module):
    """Wrap ``fn`` and multiply its output by a learned per-channel scale.

    Args:
        dim: number of entries in the scale; it is stored with shape
            ``(1, 1, dim)`` and multiplied against the output of ``fn``.
        fn: callable invoked as ``fn(x, **kwargs)``.
        init_eps: value every entry of the scale starts at.
    """

    def __init__(self, dim, fn, init_eps = 0.1):
        super().__init__()
        # float() keeps the parameter floating point even for an integer init_eps
        self.scale = nn.Parameter(torch.full((1, 1, dim), float(init_eps)))
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(x, **kwargs)`` multiplied elementwise by the scale."""
        return self.fn(x, **kwargs) * self.scale

class PreNorm(nn.Module):
    """Apply ``nn.LayerNorm(dim)`` to the input before handing it to ``fn``.

    Args:
        dim: size of the normalized (last) dimension.
        fn: callable invoked as ``fn(normed_x, **kwargs)``.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Normalize ``x`` and return ``fn`` of the result; kwargs pass through untouched."""
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: input and output feature size.
        hidden_dim: feature size between the two linear layers.
        dropout: probability used by both dropout layers.
    """

    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Run ``x`` through the MLP and return the result."""
        return self.net(x)

class Attention(nn.Module):
    """Multi-head attention with talking-heads mixing around the softmax.

    Queries are projected from ``x`` only. Keys and values are projected
    from ``x`` alone, or from ``x`` concatenated with ``context`` along
    dim 1 when a context is supplied.

    Args:
        dim: feature size of the input and of the output.
        heads: number of attention heads.
        dim_head: feature size of each head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        # a single projection yields keys and values; it is split in forward
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)

        self.attend = nn.Softmax(dim = -1)

        # heads x heads matrices that mix attention maps across heads
        self.mix_heads_pre_attn = nn.Parameter(torch.randn(heads, heads))
        self.mix_heads_post_attn = nn.Parameter(torch.randn(heads, heads))

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context = None):
        """Attend from ``x`` to ``x`` (plus ``context`` when given).

        Returns a tensor with the same sequence length as ``x`` and a last
        dimension of ``dim``.
        """
        h = self.heads

        # the query tokens are always part of the key/value set
        context = x if not exists(context) else torch.cat((x, context), dim = 1)

        qkv = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        dots = einsum('b h i j, h g -> b g i j', dots, self.mix_heads_pre_attn)    # talking heads, pre-softmax
        attn = self.attend(dots)
        attn = einsum('b h i j, h g -> b g i j', attn, self.mix_heads_post_attn)   # talking heads, post-softmax

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    """Stack of ``depth`` residual blocks, each an attention then a feed-forward.

    Both sub-layers are wrapped as ``LayerScale(PreNorm(...))`` and added
    back onto their input.

    Args:
        dim: feature size of the tokens.
        depth: number of (attention, feed-forward) pairs.
        heads: number of attention heads.
        dim_head: feature size of each attention head.
        mlp_dim: hidden size of the feed-forward sub-layer.
        dropout: dropout probability passed to both sub-layers.
    """

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                LayerScale(dim, PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))),
                LayerScale(dim, PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout)))
            ]))

    def forward(self, x, context = None):
        """Run ``x`` through every layer; ``context`` is forwarded to each attention."""
        for attn, ff in self.layers:
            x = attn(x, context = context) + x
            x = ff(x) + x
        return x

class CaiT(nn.Module):
    """CaiT image classifier: patch-only self-attention, then class-token attention.

    The image is cut into square patches that are linearly embedded and
    processed by ``patch_transformer`` with no class token present. A
    learned class token is then run through ``cls_transformer`` with the
    patch tokens supplied as attention context, and its final state is
    classified by ``mlp_head``.

    Args:
        image_size: side length of the (square) input image, in pixels.
        patch_size: side length of each square patch; must divide ``image_size``.
        num_classes: size of the output logits.
        dim: token embedding size.
        depth: number of layers in the patch-to-patch transformer.
        cls_depth: number of layers in the class-token transformer.
        heads: number of attention heads.
        mlp_dim: hidden size of the feed-forward sub-layers.
        dim_head: feature size of each attention head.
        dropout: dropout probability inside the transformers.
        emb_dropout: dropout probability applied to the embedded patches.
        channels: number of input image channels (defaults to 3).
    """

    def __init__(
        self,
        *,
        image_size,
        patch_size,
        num_classes,
        dim,
        depth,
        cls_depth,
        heads,
        mlp_dim,
        dim_head = 64,
        dropout = 0.,
        emb_dropout = 0.,
        channels = 3
    ):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = channels * patch_size ** 2

        # flatten each p1 x p2 patch (with its channels) into one vector, then embed it
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
            nn.Linear(patch_dim, dim),
        )

        # positions cover the patches only; the class token is introduced later
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.patch_transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)
        self.cls_transformer = Transformer(dim, cls_depth, heads, dim_head, mlp_dim, dropout)

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Return class logits for a batch of images."""
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        x += self.pos_embedding[:, :n]
        x = self.dropout(x)

        x = self.patch_transformer(x)

        # one copy of the class token per batch element; the patches are context only
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = self.cls_transformer(cls_tokens, context = x)

        return self.mlp_head(x[:, 0])

0 comments on commit 9ef8da4

Please sign in to comment.