mirror of
https://github.com/malarinv/tacotron2
synced 2026-03-08 01:32:35 +00:00
1. Update WaveGlow.
2. Add the `gl` option and hyperparameters to TTSModel.
This commit is contained in:
@@ -7,19 +7,19 @@ class Denoiser(torch.nn.Module):
|
||||
""" Removes model bias from audio produced with waveglow """
|
||||
|
||||
def __init__(self, waveglow, filter_length=1024, n_overlap=4,
|
||||
win_length=1024, mode='zeros'):
|
||||
win_length=1024, mode='zeros', n_mel_channels=80,):
|
||||
super(Denoiser, self).__init__()
|
||||
self.stft = STFT(filter_length=filter_length,
|
||||
hop_length=int(filter_length/n_overlap),
|
||||
win_length=win_length).cpu()
|
||||
if mode == 'zeros':
|
||||
mel_input = torch.zeros(
|
||||
(1, 80, 88),
|
||||
(1, n_mel_channels, 88),
|
||||
dtype=waveglow.upsample.weight.dtype,
|
||||
device=waveglow.upsample.weight.device)
|
||||
elif mode == 'normal':
|
||||
mel_input = torch.randn(
|
||||
(1, 80, 88),
|
||||
(1, n_mel_channels, 88),
|
||||
dtype=waveglow.upsample.weight.dtype,
|
||||
device=waveglow.upsample.weight.device)
|
||||
else:
|
||||
|
||||
@@ -2,76 +2,79 @@
|
||||
# import tensorflow as tf
|
||||
from dataclasses import dataclass
|
||||
from .text import symbols
|
||||
|
||||
# from .text_codec import symbols
|
||||
|
||||
|
||||
@dataclass
|
||||
class HParams(object):
|
||||
"""docstring for HParams."""
|
||||
|
||||
################################
|
||||
# Experiment Parameters #
|
||||
################################
|
||||
epochs=500
|
||||
iters_per_checkpoint=1000
|
||||
seed=1234
|
||||
dynamic_loss_scaling=True
|
||||
fp16_run=False
|
||||
distributed_run=False
|
||||
dist_backend="nccl"
|
||||
dist_url="tcp://localhost:54321"
|
||||
cudnn_enabled=True
|
||||
cudnn_benchmark=False
|
||||
ignore_layers=["embedding.weight"]
|
||||
epochs = 500
|
||||
iters_per_checkpoint = 1000
|
||||
seed = 1234
|
||||
dynamic_loss_scaling = True
|
||||
fp16_run = False
|
||||
distributed_run = False
|
||||
dist_backend = "nccl"
|
||||
dist_url = "tcp://localhost:54321"
|
||||
cudnn_enabled = True
|
||||
cudnn_benchmark = False
|
||||
ignore_layers = ["embedding.weight"]
|
||||
################################
|
||||
# Data Parameters #
|
||||
################################
|
||||
load_mel_from_disk=False
|
||||
training_files="lists/tts_data_train_processed.txt"
|
||||
validation_files="filelists/tts_data_val_processed.txt"
|
||||
text_cleaners=["english_cleaners"]
|
||||
load_mel_from_disk = False
|
||||
training_files = "lists/tts_data_train_processed.txt"
|
||||
validation_files = "filelists/tts_data_val_processed.txt"
|
||||
text_cleaners = ["english_cleaners"]
|
||||
################################
|
||||
# Audio Parameters #
|
||||
################################
|
||||
max_wav_value=32768.0
|
||||
sampling_rate=16000
|
||||
filter_length=1024
|
||||
hop_length=256
|
||||
win_length=1024
|
||||
n_mel_channels=80
|
||||
mel_fmin=0.0
|
||||
mel_fmax=8000.0
|
||||
max_wav_value = 32768.0
|
||||
sampling_rate = 16000
|
||||
filter_length = 1024
|
||||
hop_length = 256
|
||||
win_length = 1024
|
||||
n_mel_channels: int = 40
|
||||
mel_fmin: float = 0.0
|
||||
mel_fmax: float = 4000.0
|
||||
################################
|
||||
# Model Parameters #
|
||||
################################
|
||||
n_symbols=len(symbols)
|
||||
symbols_embedding_dim=512
|
||||
n_symbols = len(symbols)
|
||||
symbols_embedding_dim = 512
|
||||
# Encoder parameters
|
||||
encoder_kernel_size=5
|
||||
encoder_n_convolutions=3
|
||||
encoder_embedding_dim=512
|
||||
encoder_kernel_size = 5
|
||||
encoder_n_convolutions = 3
|
||||
encoder_embedding_dim = 512
|
||||
# Decoder parameters
|
||||
n_frames_per_step=1 # currently only 1 is supported
|
||||
decoder_rnn_dim=1024
|
||||
prenet_dim=256
|
||||
max_decoder_steps=1000
|
||||
gate_threshold=0.5
|
||||
p_attention_dropout=0.1
|
||||
p_decoder_dropout=0.1
|
||||
n_frames_per_step = 1 # currently only 1 is supported
|
||||
decoder_rnn_dim = 1024
|
||||
prenet_dim = 256
|
||||
max_decoder_steps = 1000
|
||||
gate_threshold = 0.5
|
||||
p_attention_dropout = 0.1
|
||||
p_decoder_dropout = 0.1
|
||||
# Attention parameters
|
||||
attention_rnn_dim=1024
|
||||
attention_dim=128
|
||||
attention_rnn_dim = 1024
|
||||
attention_dim = 128
|
||||
# Location Layer parameters
|
||||
attention_location_n_filters=32
|
||||
attention_location_kernel_size=31
|
||||
attention_location_n_filters = 32
|
||||
attention_location_kernel_size = 31
|
||||
# Mel-post processing network parameters
|
||||
postnet_embedding_dim=512
|
||||
postnet_kernel_size=5
|
||||
postnet_n_convolutions=5
|
||||
postnet_embedding_dim = 512
|
||||
postnet_kernel_size = 5
|
||||
postnet_n_convolutions = 5
|
||||
################################
|
||||
# Optimization Hyperparameters #
|
||||
################################
|
||||
use_saved_learning_rate=False
|
||||
learning_rate=1e-3
|
||||
weight_decay=1e-6
|
||||
grad_clip_thresh=1.0
|
||||
batch_size=4
|
||||
mask_padding=True # set model's padded outputs to padded values
|
||||
use_saved_learning_rate = False
|
||||
learning_rate = 1e-3
|
||||
weight_decay = 1e-6
|
||||
grad_clip_thresh = 1.0
|
||||
batch_size = 4
|
||||
mask_padding = True # set model's padded outputs to padded values
|
||||
|
||||
Reference in New Issue
Block a user