long sentence generation

xuhu357 · Jul 25, 2020 · 512293b
1 parent 538a26d
commit 512293b
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 3 deletions.
diff --git a/hparams.py b/hparams.py
@@ -85,7 +85,7 @@
 
 
 # Vocoder
-vocoder = 'waveglow' # 'waveglow' or 'melgan'
+vocoder = 'melgan' # 'waveglow' or 'melgan'
 
 
 # Log-scaled duration

diff --git a/synth/LJSpeech/step_300000.png b/synth/LJSpeech/step_300000.png
diff --git a/synthesize.py b/synthesize.py
@@ -40,6 +40,8 @@ def get_FastSpeech2(num):
     return model
 
 def synthesize(model, waveglow, melgan, text, sentence, prefix=''):
+    sentence = sentence[:200] # long filename will result in OS Error
+
     src_len = torch.from_numpy(np.array([text.shape[1]])).to(device)
 
     mel, mel_postnet, log_duration_output, f0_output, energy_output, _, _, mel_len = model(text, src_len)
@@ -70,6 +72,7 @@ def synthesize(model, waveglow, melgan, text, sentence, prefix=''):
     args = parser.parse_args()
 
     sentences = [
+        "Advanced text to speech models such as Fast Speech can synthesize speech significantly faster than previous auto regressive models with comparable quality. The training of Fast Speech model relies on an auto regressive teacher model for duration prediction and knowledge distillation, which can ease the one to many mapping problem in T T S. However, Fast Speech has several disadvantages, 1, the teacher student distillation pipeline is complicated, 2, the duration extracted from the teacher model is not accurate enough, and the target mel spectrograms distilled from teacher model suffer from information loss due to data simplification, both of which limit the voice quality.",
         "Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition",
         "in being comparatively modern.",
         "For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process",

diff --git a/transformer/Models.py b/transformer/Models.py
@@ -64,7 +64,10 @@ def forward(self, src_seq, mask, return_attns=False):
         slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
 
         # -- Forward
-        enc_output = self.src_word_emb(src_seq) + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)
+        if not self.training and src_seq.shape[1] > hp.max_seq_len:
+            enc_output = self.src_word_emb(src_seq) + get_sinusoid_encoding_table(src_seq.shape[1], hp.encoder_hidden)[:src_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to(src_seq.device)
+        else:
+            enc_output = self.src_word_emb(src_seq) + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)
 
         for enc_layer in self.layer_stack:
             enc_output, enc_slf_attn = enc_layer(
@@ -110,7 +113,10 @@ def forward(self, enc_seq, mask, return_attns=False):
         slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1)
 
         # -- Forward
-        dec_output = enc_seq + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)
+        if not self.training and enc_seq.shape[1] > hp.max_seq_len:
+            dec_output = enc_seq + get_sinusoid_encoding_table(enc_seq.shape[1], hp.decoder_hidden)[:enc_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to(enc_seq.device)
+        else:
+            dec_output = enc_seq + self.position_enc[:, :max_len, :].expand(batch_size, -1, -1)
 
         for dec_layer in self.layer_stack:
             dec_output, dec_slf_attn = dec_layer(