-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunet.py
118 lines (96 loc) · 3.85 KB
/
unet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
""" Parts of the U-Net model """
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Wav2Vec2Model
class UNet(nn.Module):
    """Four-level U-Net followed by a linear projection over the last axis.

    The encoder is one ``DoubleConv`` plus four ``DownSamplingLayer`` stages,
    the decoder is four ``UpSamplingLayer`` stages fed with the matching
    encoder outputs as skip connections, and ``OutConv`` maps the 64 decoder
    channels to ``n_classes`` channels. A final ``nn.Linear`` then maps the
    last axis from ``input_dim`` to ``dim_stft`` features.

    Args:
        input_dim: ``in_features`` of the final linear projection. Because
            ``nn.Linear`` acts on the last axis, the last axis of the tensor
            leaving ``OutConv`` must have this size. The original author's
            note: ssl_dim + stft_dim = 768 + 201 = 969.
        n_classes: Number of output channels produced by ``OutConv``. The
            original author's note: stft_dim = 201.
        input_channels: Number of channels of the tensor passed to
            ``forward``.
        bilinear: Forwarded to every ``UpSamplingLayer``. When true, the
            channel counts of the bottleneck and of the first three decoder
            stages are halved (``factor == 2``).
        dim_stft: ``out_features`` of the final linear projection. Defaults
            to 201, the value that used to be hard-coded.
    """

    def __init__(self,
                 input_dim,  # ssl_dim + stft_dim = 768 + 201 = 969
                 n_classes,  # stft_dim = 201
                 input_channels,
                 bilinear=True,
                 dim_stft=201,
                 ):
        super().__init__()
        self.input_dim = input_dim
        self.n_classes = n_classes
        self.input_channels = input_channels
        self.bilinear = bilinear
        self.dim_stft = dim_stft
        # NOTE: no ``self.type`` attribute is assigned here on purpose. An
        # instance attribute with that name would shadow nn.Module.type(),
        # the method that casts parameters and buffers to another dtype.

        # With bilinear upsampling the decoder convolutions do the channel
        # reduction, so the deepest stages carry half as many channels.
        factor = 2 if bilinear else 1

        # Encoder
        self.inc = DoubleConv(self.input_channels, 64)
        self.down1 = DownSamplingLayer(64, 128)
        self.down2 = DownSamplingLayer(128, 256)
        self.down3 = DownSamplingLayer(256, 512)
        self.down4 = DownSamplingLayer(512, 1024 // factor)

        # Decoder
        self.up1 = UpSamplingLayer(1024, 512 // factor, bilinear)
        self.up2 = UpSamplingLayer(512, 256 // factor, bilinear)
        self.up3 = UpSamplingLayer(256, 128 // factor, bilinear)
        self.up4 = UpSamplingLayer(128, 64, bilinear)

        # Output heads
        self.outc = OutConv(64, self.n_classes)
        self.linear_projection = torch.nn.Linear(self.input_dim, self.dim_stft)

    def forward(self, x):
        """Run the U-Net and project the last axis to ``dim_stft`` features.

        Args:
            x: Input tensor with ``input_channels`` channels.
                NOTE(review): presumably shaped (batch, input_channels, H, W)
                with W == ``input_dim``, since the linear projection is
                applied to the last axis of the U-Net output -- confirm
                against the caller.

        Returns:
            The projected tensor; its last axis has size ``dim_stft``.
        """
        # Encoder: keep every intermediate output for the skip connections.
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)

        # Decoder: each stage merges with the encoder output of equal depth.
        x = self.up1(x5, x4)
        x = self.up2(x, x3)
        x = self.up3(x, x2)
        x = self.up4(x, x1)

        logits = self.outc(x)
        logits = self.linear_projection(logits)
        return logits
class DoubleConv(nn.Module):
    """Two consecutive (3x3 convolution => BatchNorm => ReLU) stages.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by the second stage.
        mid_channels: Number of channels between the two stages. Any falsy
            value (``None``, ``0``) falls back to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        hidden = mid_channels if mid_channels else out_channels

        # Build both stages in order so the layer indices inside the
        # Sequential (and therefore the state_dict keys) stay 0..5.
        stages = []
        for src, dst in ((in_channels, hidden), (hidden, out_channels)):
            stages += [
                # The convolution bias is disabled; BatchNorm follows it.
                nn.Conv2d(src, dst, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(dst),
                nn.ReLU(inplace=True),
            ]
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply both convolution stages to ``x`` and return the result."""
        return self.double_conv(x)
class DownSamplingLayer(nn.Module):
    """Encoder stage: 2x2 max-pooling followed by a ``DoubleConv``.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by the ``DoubleConv``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        shrink = nn.MaxPool2d(2)
        refine = DoubleConv(in_channels, out_channels)
        # Pooling runs first, so the convolutions see the pooled tensor.
        self.maxpool_conv = nn.Sequential(shrink, refine)

    def forward(self, x):
        """Pool ``x`` and pass it through the double convolution."""
        downsampled = self.maxpool_conv(x)
        return downsampled
class UpSamplingLayer(nn.Module):
    """Decoder stage: upsample, merge with a skip tensor, then ``DoubleConv``.

    Args:
        in_channels: Number of channels fed to the ``DoubleConv``, i.e. the
            channel count after concatenating the skip tensor with the
            upsampled tensor.
        out_channels: Number of channels produced by the stage.
        bilinear: When true, upsample with ``nn.Upsample`` (bilinear,
            ``align_corners=True``) and let the ``DoubleConv`` reduce the
            channels through a ``in_channels // 2`` wide middle layer.
            Otherwise upsample with a stride-2 ``nn.ConvTranspose2d`` that
            halves the channel count itself.
    """

    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()
        half = in_channels // 2

        if not bilinear:
            # Learned upsampling: the transposed convolution halves channels.
            self.up = nn.ConvTranspose2d(in_channels, half, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)
            return

        # Fixed bilinear upsampling: the regular convolutions reduce channels.
        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        self.conv = DoubleConv(in_channels, out_channels, half)

    def forward(self, x1, x2):
        """Upsample ``x1``, align it with ``x2`` and convolve the merged tensor.

        Args:
            x1: Tensor coming from the deeper stage; it is upsampled here.
            x2: Skip tensor from the encoder. Axes 2 and 3 are treated as
                height and width, and axis 1 as the channel axis.

        Returns:
            Output of the ``DoubleConv`` applied to ``x2`` and the upsampled
            ``x1`` concatenated along axis 1.
        """
        upsampled = self.up(x1)

        # Size gap between the skip tensor and the upsampled tensor.
        gap_h = x2.size(2) - upsampled.size(2)
        gap_w = x2.size(3) - upsampled.size(3)

        # F.pad takes the last axis first: [left, right, top, bottom]. Any
        # odd remainder goes to the right / bottom side.
        left = gap_w // 2
        top = gap_h // 2
        upsampled = F.pad(upsampled, [left, gap_w - left, top, gap_h - top])

        merged = torch.cat([x2, upsampled], dim=1)
        return self.conv(merged)
class OutConv(nn.Module):
    """Output head: a single 1x1 convolution that remaps the channel count.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels of the output tensor.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        pointwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
        )
        self.conv = pointwise

    def forward(self, x):
        """Apply the 1x1 convolution to ``x`` and return the result."""
        projected = self.conv(x)
        return projected