diff --git a/README.rst b/README.rst
index 46ccbc2..dc9b7da 100644
--- a/README.rst
+++ b/README.rst
@@ -4,7 +4,7 @@ PAULE
 .. image:: https://zenodo.org/badge/355606517.svg
    :target: https://zenodo.org/badge/latestdoi/355606517
 
-Predictive Articulatory speech synthesis Utilizing Lexical Embeddings (PAULE) a
+Predictive Articulatory speech synthesis Utilizing Lexical Embeddings (PAULE) is a
 python frame work to plan control parameter trajectories for the VocalTractLab
 simulator for a target acoustics or semantic embedding.
 
@@ -54,6 +54,9 @@ To cite the PAULE source code use the DOI 10.5281/zenodo.7252431
 (https://zenodo.org/doi/10.5281/zenodo.7252431), if you want to cite the
 software in general or the specific DOI on Zenodo.
 
+Linked Projects
+---------------
+To generate suitable corpus data for the PAULE model, we have also implemented `create_vtl_corpus `_.
 
 Acknowledgements
 ----------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 9186f23..a278186 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,6 +3,12 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
+
+PAULE
+=====
+
+Predictive Articulatory speech synthesis Utilizing Lexical Embeddings (PAULE) is a Python framework to plan control parameter trajectories for the VocalTractLab simulator for a target acoustics or semantic embedding.
+
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
diff --git a/docs/source/training.rst b/docs/source/training.rst
new file mode 100644
index 0000000..dbb7f35
--- /dev/null
+++ b/docs/source/training.rst
@@ -0,0 +1,23 @@
+Training
+========
+
+Data
+----
+
+PAULE requires data to be in the following format:
+
+
+Training procedure
+------------------
+
+For effective training, you probably want to use a GPU.
+
+PAULE consists of a number of models that are trained separately. The models are:
+
+- `Embedder` Input: Log mel spectrogram, Output: Semantic embedding, which is compared to the target embedding
+
+- `ForwardModel` Input: Normalized control parameters, Output: Log mel spectrogram
+
+- `InverseModel` Input: Log mel spectrogram, Output: Normalized control parameters
+
+- `MelGANs` Input: Log mel spectrogram, Output: Audio waveform
diff --git a/paule/paule.py b/paule/paule.py
index f6bcbc4..63c27d8 100644
--- a/paule/paule.py
+++ b/paule/paule.py
@@ -162,7 +162,7 @@ def __init__(self, *, pred_model=None, pred_optimizer=None, inv_model=None, inv_
         if embedder:
             self.embedder = embedder
         else:
-            self.embedder = EmbeddingModel(num_lstm_layers=2, hidden_size=720).double()
+            self.embedder = EmbeddingModel(num_lstm_layers=2, hidden_size=720, dropout=0.7).double()
             self.embedder.load_state_dict(torch.load(
                 os.path.join(DIR, "pretrained_models/embedder/embed_model_common_voice_syn_rec_2_720_0_dropout_07_noise_6e05_rmse_lr_00001_200.pt"),
                 map_location=self.device))
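
The data flow among the models listed in the new ``docs/source/training.rst`` can be illustrated with a minimal sketch. The class names follow the documentation above, and the embedder's two-layer LSTM with hidden size 720 mirrors the ``EmbeddingModel`` call in ``paule/paule.py``; all other architectural details and the dimensions (30 control parameters, 60 mel channels, a 300-dimensional embedding) are illustrative assumptions, not PAULE's actual implementation:

.. code-block:: python

    import torch
    import torch.nn as nn


    class ForwardModel(nn.Module):
        """Normalized control parameters -> log mel spectrogram (illustrative stand-in)."""

        def __init__(self, cp_dim=30, mel_dim=60, hidden_size=256):
            super().__init__()
            self.lstm = nn.LSTM(cp_dim, hidden_size, num_layers=2, batch_first=True)
            self.head = nn.Linear(hidden_size, mel_dim)

        def forward(self, cp):
            hidden, _ = self.lstm(cp)
            return self.head(hidden)


    class InverseModel(nn.Module):
        """Log mel spectrogram -> normalized control parameters (illustrative stand-in)."""

        def __init__(self, mel_dim=60, cp_dim=30, hidden_size=256):
            super().__init__()
            self.lstm = nn.LSTM(mel_dim, hidden_size, num_layers=2, batch_first=True)
            self.head = nn.Linear(hidden_size, cp_dim)

        def forward(self, mel):
            hidden, _ = self.lstm(mel)
            return self.head(hidden)


    class Embedder(nn.Module):
        """Log mel spectrogram -> fixed-size semantic embedding (illustrative stand-in)."""

        def __init__(self, mel_dim=60, emb_dim=300, hidden_size=720):
            super().__init__()
            self.lstm = nn.LSTM(mel_dim, hidden_size, num_layers=2, batch_first=True)
            self.head = nn.Linear(hidden_size, emb_dim)

        def forward(self, mel):
            hidden, _ = self.lstm(mel)
            # Summarize the whole sequence with the last time step.
            return self.head(hidden[:, -1])


    # One control parameter trajectory: batch of 1, 200 time steps, 30 parameters.
    cp = torch.randn(1, 200, 30)
    mel = ForwardModel()(cp)       # (1, 200, 60) predicted log mel spectrogram
    emb = Embedder()(mel)          # (1, 300) semantic embedding of the prediction
    cp_back = InverseModel()(mel)  # (1, 200, 30) control parameters recovered from acoustics

The sketch only shows the data flow; in PAULE's planning loop, losses between these predictions and the acoustic or semantic targets are propagated back into the control parameter trajectory, which is why the components are differentiable sequence models.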