1. Update WaveGlow

2. Add GL option and hyperparameters to TTSModel
2026-03-08 01:32:35 +00:00 · 2019-10-04 15:24:42 +05:30
parent d0d273a698
commit 36c731cad0
4 changed files with 492 additions and 178 deletions
--- a/taco2/denoiser.py
+++ b/taco2/denoiser.py
@@ -7,19 +7,19 @@ class Denoiser(torch.nn.Module):
    """ Removes model bias from audio produced with waveglow """

    def __init__(self, waveglow, filter_length=1024, n_overlap=4,
-                 win_length=1024, mode='zeros'):
+                 win_length=1024, mode='zeros', n_mel_channels=80,):
        super(Denoiser, self).__init__()
        self.stft = STFT(filter_length=filter_length,
                         hop_length=int(filter_length/n_overlap),
                         win_length=win_length).cpu()
        if mode == 'zeros':
            mel_input = torch.zeros(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        elif mode == 'normal':
            mel_input = torch.randn(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        else:
--- a/taco2/hparams.py
+++ b/taco2/hparams.py
@@ -2,76 +2,79 @@
 # import tensorflow as tf
 from dataclasses import dataclass
 from .text import symbols
+
 # from .text_codec import symbols

+
@dataclass
 class HParams(object):
    """docstring for HParams."""
+
    ################################
    # Experiment Parameters        #
    ################################
-    epochs=500
-    iters_per_checkpoint=1000
-    seed=1234
-    dynamic_loss_scaling=True
-    fp16_run=False
-    distributed_run=False
-    dist_backend="nccl"
-    dist_url="tcp://localhost:54321"
-    cudnn_enabled=True
-    cudnn_benchmark=False
-    ignore_layers=["embedding.weight"]
+    epochs = 500
+    iters_per_checkpoint = 1000
+    seed = 1234
+    dynamic_loss_scaling = True
+    fp16_run = False
+    distributed_run = False
+    dist_backend = "nccl"
+    dist_url = "tcp://localhost:54321"
+    cudnn_enabled = True
+    cudnn_benchmark = False
+    ignore_layers = ["embedding.weight"]
    ################################
    # Data Parameters             #
    ################################
-    load_mel_from_disk=False
-    training_files="lists/tts_data_train_processed.txt"
-    validation_files="filelists/tts_data_val_processed.txt"
-    text_cleaners=["english_cleaners"]
+    load_mel_from_disk = False
+    training_files = "lists/tts_data_train_processed.txt"
+    validation_files = "filelists/tts_data_val_processed.txt"
+    text_cleaners = ["english_cleaners"]
    ################################
    # Audio Parameters             #
    ################################
-    max_wav_value=32768.0
-    sampling_rate=16000
-    filter_length=1024
-    hop_length=256
-    win_length=1024
-    n_mel_channels=80
-    mel_fmin=0.0
-    mel_fmax=8000.0
+    max_wav_value = 32768.0
+    sampling_rate = 16000
+    filter_length = 1024
+    hop_length = 256
+    win_length = 1024
+    n_mel_channels: int = 40
+    mel_fmin: float = 0.0
+    mel_fmax: float = 4000.0
    ################################
    # Model Parameters             #
    ################################
-    n_symbols=len(symbols)
-    symbols_embedding_dim=512
+    n_symbols = len(symbols)
+    symbols_embedding_dim = 512
    # Encoder parameters
-    encoder_kernel_size=5
-    encoder_n_convolutions=3
-    encoder_embedding_dim=512
+    encoder_kernel_size = 5
+    encoder_n_convolutions = 3
+    encoder_embedding_dim = 512
    # Decoder parameters
-    n_frames_per_step=1  # currently only 1 is supported
-    decoder_rnn_dim=1024
-    prenet_dim=256
-    max_decoder_steps=1000
-    gate_threshold=0.5
-    p_attention_dropout=0.1
-    p_decoder_dropout=0.1
+    n_frames_per_step = 1  # currently only 1 is supported
+    decoder_rnn_dim = 1024
+    prenet_dim = 256
+    max_decoder_steps = 1000
+    gate_threshold = 0.5
+    p_attention_dropout = 0.1
+    p_decoder_dropout = 0.1
    # Attention parameters
-    attention_rnn_dim=1024
-    attention_dim=128
+    attention_rnn_dim = 1024
+    attention_dim = 128
    # Location Layer parameters
-    attention_location_n_filters=32
-    attention_location_kernel_size=31
+    attention_location_n_filters = 32
+    attention_location_kernel_size = 31
    # Mel-post processing network parameters
-    postnet_embedding_dim=512
-    postnet_kernel_size=5
-    postnet_n_convolutions=5
+    postnet_embedding_dim = 512
+    postnet_kernel_size = 5
+    postnet_n_convolutions = 5
    ################################
    # Optimization Hyperparameters #
    ################################
-    use_saved_learning_rate=False
-    learning_rate=1e-3
-    weight_decay=1e-6
-    grad_clip_thresh=1.0
-    batch_size=4
-    mask_padding=True  # set model's padded outputs to padded values
+    use_saved_learning_rate = False
+    learning_rate = 1e-3
+    weight_decay = 1e-6
+    grad_clip_thresh = 1.0
+    batch_size = 4
+    mask_padding = True  # set model's padded outputs to padded values