Skip to content

Commit

Permalink
long sentence generation
Browse files (browse the repository at this point in the history)
  • Loading branch information
ming024 committed Jul 25, 2020
1 parent 538a26d commit 512293b
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 3 deletions.
2 changes: 1 addition & 1 deletion hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@


# Vocoder
- vocoder = 'waveglow' # 'waveglow' or 'melgan'
+ vocoder = 'melgan' # 'waveglow' or 'melgan'


# Log-scaled duration
Expand Down
Binary file modified synth/LJSpeech/step_300000.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def get_FastSpeech2(num):
return model

def synthesize(model, waveglow, melgan, text, sentence, prefix=''):
+ sentence = sentence[:200] # long filename will result in OS Error

src_len = torch.from_numpy(np.array([text.shape[1]])).to(device)

mel, mel_postnet, log_duration_output, f0_output, energy_output, _, _, mel_len = model(text, src_len)
Expand Down Expand Up @@ -70,6 +72,7 @@ def synthesize(model, waveglow, melgan, text, sentence, prefix=''):
args = parser.parse_args()

sentences = [
"Advanced text to speech models such as Fast Speech can synthesize speech significantly faster than previous auto regressive models with comparable quality. The training of Fast Speech model relies on an auto regressive teacher model for duration prediction and knowledge distillation, which can ease the one to many mapping problem in T T S. However, Fast Speech has several disadvantages, 1, the teacher student distillation pipeline is complicated, 2, the duration extracted from the teacher model is not accurate enough, and the target mel spectrograms distilled from teacher model suffer from information loss due to data simplification, both of which limit the voice quality.",
"Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition",
"in being comparatively modern.",
"For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process",
Expand Down
10 changes: 8 additions & 2 deletions transformer/Models.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ def forward(self, src_seq, mask, return_attns=False):
slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)

# -- Forward
- enc_output = self.src_word_emb(src_seq) + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)
+ if not self.training and src_seq.shape[1] > hp.max_seq_len:
+ enc_output = self.src_word_emb(src_seq) + get_sinusoid_encoding_table(src_seq.shape[1], hp.encoder_hidden)[:src_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to(src_seq.device)
+ else:
+ enc_output = self.src_word_emb(src_seq) + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)

for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer(
Expand Down Expand Up @@ -110,7 +113,10 @@ def forward(self, enc_seq, mask, return_attns=False):
slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)

# -- Forward
- dec_output = enc_seq + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)
+ if not self.training and enc_seq.shape[1] > hp.max_seq_len:
+ dec_output = enc_seq + get_sinusoid_encoding_table(enc_seq.shape[1], hp.decoder_hidden)[:enc_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to(enc_seq.device)
+ else:
+ dec_output = enc_seq + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)

for dec_layer in self.layer_stack:
dec_output, dec_slf_attn = dec_layer(
Expand Down

0 comments on commit 512293b

Please sign in to comment.