Analysing Deep Learning-Spectral Envelope Prediction Methods for Singing Synthesis

Analysing Deep Learning-Spectral Envelope Prediction Methods for Singing Synthesis. Bous, F. & Roebel, A. In 2019 27th European Signal Processing Conference (EUSIPCO), pages 1-5, Sep., 2019.

Paper doi abstract bibtex

We conduct an investigation on various hyperparameters regarding neural networks used to generate spectral envelopes for singing synthesis. Two perceptive tests, where the first compares two models directly and the other ranks models with a mean opinion score, are performed. With these tests we show that when learning to predict spectral envelopes, 2d-convolutions are superior over previously proposed 1d-convolutions and that predicting multiple frames in an iterated fashion during training is superior over injecting noise to the input data. An experimental investigation whether learning to predict a probability distribution vs. single samples was performed but turned out to be inconclusive. A network architecture is proposed that incorporates the improvements which we found to be useful and we show in our experiments that this network produces better results than other state-of-the-art methods.

@comment{Conference paper entry. The citation key equals the trailing number of the DOI and is kept unchanged so existing citations keep resolving.}
@InProceedings{8903122,
  author    = {Bous, F. and Roebel, A.},
  title     = {Analysing Deep Learning-Spectral Envelope Prediction Methods for Singing Synthesis},
  booktitle = {2019 27th European Signal Processing Conference ({EUSIPCO})},
  year      = {2019},
  month     = sep,
  pages     = {1--5},
  doi       = {10.23919/EUSIPCO.2019.8903122},
  issn      = {2076-1465},
  url       = {https://www.eurasip.org/proceedings/eusipco/eusipco2019/proceedings/papers/1570534081.pdf},
  keywords  = {convolutional neural nets;learning (artificial intelligence);prediction theory;spectral analysis;speech coding;speech processing;statistical distributions;deep learning-spectral envelope prediction methods;singing synthesis;neural networks;spectral envelopes;perceptive tests;mean opinion score;network architecture;2d-convolution;1d-convolutions;probability distribution;Convolution;Two dimensional displays;Time-frequency analysis;Neural networks;Vocoders;Probability distribution;Europe;Singing synthesis;spectral envelopes;deep learning},
  abstract  = {We conduct an investigation on various hyperparameters regarding neural networks used to generate spectral envelopes for singing synthesis. Two perceptive tests, where the first compares two models directly and the other ranks models with a mean opinion score, are performed. With these tests we show that when learning to predict spectral envelopes, 2d-convolutions are superior over previously proposed 1d-convolutions and that predicting multiple frames in an iterated fashion during training is superior over injecting noise to the input data. An experimental investigation whether learning to predict a probability distribution vs. single samples was performed but turned out to be inconclusive. A network architecture is proposed that incorporates the improvements which we found to be useful and we show in our experiments that this network produces better results than other state-of-the-art methods.},
}

Downloads: 0

{"_id":"f9upCvtxFg65aqJTD","bibbaseid":"bous-roebel-analysingdeeplearningspectralenvelopepredictionmethodsforsingingsynthesis-2019","authorIDs":[],"author_short":["Bous, F.","Roebel, A."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","author":[{"firstnames":["F."],"propositions":[],"lastnames":["Bous"],"suffixes":[]},{"firstnames":["A."],"propositions":[],"lastnames":["Roebel"],"suffixes":[]}],"booktitle":"2019 27th European Signal Processing Conference (EUSIPCO)","title":"Analysing Deep Learning-Spectral Envelope Prediction Methods for Singing Synthesis","year":"2019","pages":"1-5","abstract":"We conduct an investigation on various hyperparameters regarding neural networks used to generate spectral envelopes for singing synthesis. Two perceptive tests, where the first compares two models directly and the other ranks models with a mean opinion score, are performed. With these tests we show that when learning to predict spectral envelopes, 2d-convolutions are superior over previously proposed 1d-convolutions and that predicting multiple frames in an iterated fashion during training is superior over injecting noise to the input data. An experimental investigation whether learning to predict a probability distribution vs. single samples was performed but turned out to be inconclusive. 
A network architecture is proposed that incorporates the improvements which we found to be useful and we show in our experiments that this network produces better results than other stat-of-the-art methods.","keywords":"convolutional neural nets;learning (artificial intelligence);prediction theory;spectral analysis;speech coding;speech processing;statistical distributions;deep learning-spectral envelope prediction methods;singing synthesis;neural networks;spectral envelopes;perceptive tests;mean opinion score;network architecture;2d-convolution;1d-convolutions;probability distribution;Convolution;Two dimensional displays;Time-frequency analysis;Neural networks;Vocoders;Probability distribution;Europe;Singing synthesis;spectral envelopes;deep learning","doi":"10.23919/EUSIPCO.2019.8903122","issn":"2076-1465","month":"Sep.","url":"https://www.eurasip.org/proceedings/eusipco/eusipco2019/proceedings/papers/1570534081.pdf","bibtex":"@InProceedings{8903122,\n author = {F. Bous and A. Roebel},\n booktitle = {2019 27th European Signal Processing Conference (EUSIPCO)},\n title = {Analysing Deep Learning-Spectral Envelope Prediction Methods for Singing Synthesis},\n year = {2019},\n pages = {1-5},\n abstract = {We conduct an investigation on various hyperparameters regarding neural networks used to generate spectral envelopes for singing synthesis. Two perceptive tests, where the first compares two models directly and the other ranks models with a mean opinion score, are performed. With these tests we show that when learning to predict spectral envelopes, 2d-convolutions are superior over previously proposed 1d-convolutions and that predicting multiple frames in an iterated fashion during training is superior over injecting noise to the input data. An experimental investigation whether learning to predict a probability distribution vs. single samples was performed but turned out to be inconclusive. 
A network architecture is proposed that incorporates the improvements which we found to be useful and we show in our experiments that this network produces better results than other stat-of-the-art methods.},\n keywords = {convolutional neural nets;learning (artificial intelligence);prediction theory;spectral analysis;speech coding;speech processing;statistical distributions;deep learning-spectral envelope prediction methods;singing synthesis;neural networks;spectral envelopes;perceptive tests;mean opinion score;network architecture;2d-convolution;1d-convolutions;probability distribution;Convolution;Two dimensional displays;Time-frequency analysis;Neural networks;Vocoders;Probability distribution;Europe;Singing synthesis;spectral envelopes;deep learning},\n doi = {10.23919/EUSIPCO.2019.8903122},\n issn = {2076-1465},\n month = {Sep.},\n url = {https://www.eurasip.org/proceedings/eusipco/eusipco2019/proceedings/papers/1570534081.pdf},\n}\n\n","author_short":["Bous, F.","Roebel, A."],"key":"8903122","id":"8903122","bibbaseid":"bous-roebel-analysingdeeplearningspectralenvelopepredictionmethodsforsingingsynthesis-2019","role":"author","urls":{"Paper":"https://www.eurasip.org/proceedings/eusipco/eusipco2019/proceedings/papers/1570534081.pdf"},"keyword":["convolutional neural nets;learning (artificial intelligence);prediction theory;spectral analysis;speech coding;speech processing;statistical distributions;deep learning-spectral envelope prediction methods;singing synthesis;neural networks;spectral envelopes;perceptive tests;mean opinion score;network architecture;2d-convolution;1d-convolutions;probability distribution;Convolution;Two dimensional displays;Time-frequency analysis;Neural networks;Vocoders;Probability distribution;Europe;Singing synthesis;spectral envelopes;deep 
learning"],"metadata":{"authorlinks":{}},"downloads":0},"bibtype":"inproceedings","biburl":"https://raw.githubusercontent.com/Roznn/EUSIPCO/main/eusipco2019url.bib","creationDate":"2021-02-11T19:15:22.155Z","downloads":0,"keywords":["convolutional neural nets;learning (artificial intelligence);prediction theory;spectral analysis;speech coding;speech processing;statistical distributions;deep learning-spectral envelope prediction methods;singing synthesis;neural networks;spectral envelopes;perceptive tests;mean opinion score;network architecture;2d-convolution;1d-convolutions;probability distribution;convolution;two dimensional displays;time-frequency analysis;neural networks;vocoders;probability distribution;europe;singing synthesis;spectral envelopes;deep learning"],"search_terms":["analysing","deep","learning","spectral","envelope","prediction","methods","singing","synthesis","bous","roebel"],"title":"Analysing Deep Learning-Spectral Envelope Prediction Methods for Singing Synthesis","year":2019,"dataSources":["NqWTiMfRR56v86wRs","r6oz3cMyC99QfiuHW"]}