forked from Anjok07/ultimatevocalremovergui
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
218 lines (188 loc) · 7.25 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math
import torch as th
from torch import nn
from .utils import capture_init, center_trim
class BLSTM(nn.Module):
def __init__(self, dim, layers=1):
super().__init__()
self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
self.linear = nn.Linear(2 * dim, dim)
def forward(self, x):
x = x.permute(2, 0, 1)
x = self.lstm(x)[0]
x = self.linear(x)
x = x.permute(1, 2, 0)
return x
def rescale_conv(conv, reference):
std = conv.weight.std().detach()
scale = (std / reference)**0.5
conv.weight.data /= scale
if conv.bias is not None:
conv.bias.data /= scale
def rescale_module(module, reference):
for sub in module.modules():
if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
rescale_conv(sub, reference)
def upsample(x, stride):
"""
Linear upsampling, the output will be `stride` times longer.
"""
batch, channels, time = x.size()
weight = th.arange(stride, device=x.device, dtype=th.float) / stride
x = x.view(batch, channels, time, 1)
out = x[..., :-1, :] * (1 - weight) + x[..., 1:, :] * weight
return out.reshape(batch, channels, -1)
def downsample(x, stride):
"""
Downsample x by decimation.
"""
return x[:, :, ::stride]
class Demucs(nn.Module):
@capture_init
def __init__(self,
sources=4,
audio_channels=2,
channels=64,
depth=6,
rewrite=True,
glu=True,
upsample=False,
rescale=0.1,
kernel_size=8,
stride=4,
growth=2.,
lstm_layers=2,
context=3,
samplerate=44100):
"""
Args:
sources (int): number of sources to separate
audio_channels (int): stereo or mono
channels (int): first convolution channels
depth (int): number of encoder/decoder layers
rewrite (bool): add 1x1 convolution to each encoder layer
and a convolution to each decoder layer.
For the decoder layer, `context` gives the kernel size.
glu (bool): use glu instead of ReLU
upsample (bool): use linear upsampling with convolutions
Wave-U-Net style, instead of transposed convolutions
rescale (int): rescale initial weights of convolutions
to get their standard deviation closer to `rescale`
kernel_size (int): kernel size for convolutions
stride (int): stride for convolutions
growth (float): multiply (resp divide) number of channels by that
for each layer of the encoder (resp decoder)
lstm_layers (int): number of lstm layers, 0 = no lstm
context (int): kernel size of the convolution in the
decoder before the transposed convolution. If > 1,
will provide some context from neighboring time
steps.
"""
super().__init__()
self.audio_channels = audio_channels
self.sources = sources
self.kernel_size = kernel_size
self.context = context
self.stride = stride
self.depth = depth
self.upsample = upsample
self.channels = channels
self.samplerate = samplerate
self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()
self.final = None
if upsample:
self.final = nn.Conv1d(channels + audio_channels, sources * audio_channels, 1)
stride = 1
if glu:
activation = nn.GLU(dim=1)
ch_scale = 2
else:
activation = nn.ReLU()
ch_scale = 1
in_channels = audio_channels
for index in range(depth):
encode = []
encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()]
if rewrite:
encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation]
self.encoder.append(nn.Sequential(*encode))
decode = []
if index > 0:
out_channels = in_channels
else:
if upsample:
out_channels = channels
else:
out_channels = sources * audio_channels
if rewrite:
decode += [nn.Conv1d(channels, ch_scale * channels, context), activation]
if upsample:
decode += [
nn.Conv1d(channels, out_channels, kernel_size, stride=1),
]
else:
decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)]
if index > 0:
decode.append(nn.ReLU())
self.decoder.insert(0, nn.Sequential(*decode))
in_channels = channels
channels = int(growth * channels)
channels = in_channels
if lstm_layers:
self.lstm = BLSTM(channels, lstm_layers)
else:
self.lstm = None
if rescale:
rescale_module(self, reference=rescale)
def valid_length(self, length):
"""
Return the nearest valid length to use with the model so that
there is no time steps left over in a convolutions, e.g. for all
layers, size of the input - kernel_size % stride = 0.
If the mixture has a valid length, the estimated sources
will have exactly the same length when context = 1. If context > 1,
the two signals can be center trimmed to match.
For training, extracts should have a valid length.For evaluation
on full tracks we recommend passing `pad = True` to :method:`forward`.
"""
for _ in range(self.depth):
if self.upsample:
length = math.ceil(length / self.stride) + self.kernel_size - 1
else:
length = math.ceil((length - self.kernel_size) / self.stride) + 1
length = max(1, length)
length += self.context - 1
for _ in range(self.depth):
if self.upsample:
length = length * self.stride + self.kernel_size - 1
else:
length = (length - 1) * self.stride + self.kernel_size
return int(length)
def forward(self, mix):
x = mix
saved = [x]
for encode in self.encoder:
x = encode(x)
saved.append(x)
if self.upsample:
x = downsample(x, self.stride)
if self.lstm:
x = self.lstm(x)
for decode in self.decoder:
if self.upsample:
x = upsample(x, stride=self.stride)
skip = center_trim(saved.pop(-1), x)
x = x + skip
x = decode(x)
if self.final:
skip = center_trim(saved.pop(-1), x)
x = th.cat([x, skip], dim=1)
x = self.final(x)
x = x.view(x.size(0), self.sources, self.audio_channels, x.size(-1))
return x