The Centre for Speech Technology Research, The University of Edinburgh

Publications by Zhizheng Wu

zwu2.bib

@incollection{nicktomijunichi2014antispoofing,
  author = {Evans, Nicholas W. D. and Kinnunen, Tomi and Yamagishi, Junichi and Wu, Zhizheng and Alegre, Federico and De Leon, Phillip},
  title = {Speaker recognition anti-spoofing},
  booktitle = {Handbook of Biometric Anti-spoofing},
  editor = {Marcel, S. and Li, S. and Nixon, M.},
  publisher = {Springer},
  doi = {http://dx.doi.org/10.1007/978-1-4471-6524-8_7},
  abstract = {Progress in the development of spoofing countermeasures for automatic speaker recognition is less advanced than equivalent work related to other biometric modalities. This chapter outlines the potential for even state-of-the-art automatic speaker recognition systems to be spoofed. While the use of a multitude of different datasets, protocols and metrics complicates the meaningful comparison of different vulnerabilities, we review previous work related to impersonation, replay, speech synthesis and voice conversion spoofing attacks. The article also presents an analysis of the early work to develop spoofing countermeasures. The literature shows that there is significant potential for automatic speaker verification systems to be spoofed, that significant further work is required to develop generalised countermeasures, that there is a need for standard datasets, evaluation protocols and metrics and that greater emphasis should be placed on text-dependent scenarios.},
  month = {June},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/chapter7_anti-spoofing.pdf},
  categories = {Speaker recognition, spoofing attack, anti-spoofing, countermeasure}
}
@inproceedings{dnnbmtl_ICASSP15,
  author = {Wu, Z. and Valentini-Botinhao, C. and Watts, O. and King, S.},
  title = {{Deep neural networks employing multi-task learning and stacked bottleneck features for speech synthesis}},
  booktitle = {Proc. ICASSP},
  year = {2015},
  abstract = {Deep neural networks (DNNs) use a cascade of hidden representations to enable the learning of complex mappings from input to output features. They are able to learn the complex mapping from text-based linguistic features to speech acoustic features, and so perform text-to-speech synthesis. Recent results suggest that DNNs can produce more natural synthetic speech than conventional HMM-based statistical parametric systems. In this paper, we show that the hidden representation used within a DNN can be improved through the use of Multi-Task Learning, and that stacking multiple frames of hidden layer activations (stacked bottleneck features) also leads to improvements. Experimental results confirmed the effectiveness of the proposed methods, and in listening tests we find that stacked bottleneck features in particular offer a significant improvement over both a baseline DNN and a benchmark HMM system.},
  month = {April},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/dnnbmtl_ICASSP15.pdf},
  pages = {4460--4464}
}
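The architecture described in the abstract above lends itself to a compact illustration. The sketch below is not the authors' implementation; it is a minimal PyTorch rendering of the two ideas named in the title, with hypothetical layer sizes (601 linguistic inputs, 187 acoustic outputs, a 32-unit bottleneck, a context of 4 frames either side), and it collapses everything into a single module for brevity.

```python
# Illustrative sketch only (hypothetical dimensions, single module): a feedforward
# acoustic model with a bottleneck layer, a secondary output head for multi-task
# learning, and stacking of bottleneck activations from neighbouring frames.
import torch
import torch.nn as nn

class MTLBottleneckDNN(nn.Module):
    def __init__(self, lin_dim=601, bottleneck_dim=32, hidden_dim=1024,
                 main_dim=187, aux_dim=187, context=4):
        super().__init__()
        self.context = context
        stacked_dim = bottleneck_dim * (2 * context + 1)
        self.encoder = nn.Sequential(                     # produces per-frame bottleneck features
            nn.Linear(lin_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, bottleneck_dim), nn.Tanh(),
        )
        self.trunk = nn.Sequential(                       # consumes linguistic + stacked bottleneck
            nn.Linear(lin_dim + stacked_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        )
        self.main_head = nn.Linear(hidden_dim, main_dim)  # primary task: acoustic features
        self.aux_head = nn.Linear(hidden_dim, aux_dim)    # secondary task for multi-task learning

    def forward(self, x):                                 # x: (frames, lin_dim), one utterance
        b = self.encoder(x)
        # stack bottleneck activations of the 2*context+1 surrounding frames (edges replicated)
        idx = torch.arange(x.size(0)).unsqueeze(1) + torch.arange(-self.context, self.context + 1)
        stacked = b[idx.clamp(0, x.size(0) - 1)].reshape(x.size(0), -1)
        h = self.trunk(torch.cat([x, stacked], dim=-1))
        return self.main_head(h), self.aux_head(h)

model = MTLBottleneckDNN()
x = torch.randn(300, 601)                                 # 300 frames of linguistic features
y_main, y_aux = torch.randn(300, 187), torch.randn(300, 187)
pred_main, pred_aux = model(x)
loss = nn.functional.mse_loss(pred_main, y_main) + 0.5 * nn.functional.mse_loss(pred_aux, y_aux)
loss.backward()
```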
@inproceedings{dnncost_IS15,
  author = {Valentini-Botinhao, C. and Wu, Z. and King, S.},
  title = {{Towards minimum perceptual error training for {DNN}-based speech synthesis}},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/dnncost_IS15.pdf},
  abstract = {We propose to use a perceptually-oriented domain to improve the quality of text-to-speech generated by deep neural networks (DNNs). We train a DNN that predicts the parameters required for speech reconstruction but whose cost function is calculated in another domain. In this paper, to represent this perceptual domain we extract an approximated version of the Spectro-Temporal Excitation Pattern that was originally proposed as part of a model of hearing speech in noise. We train DNNs that predict band aperiodicity, fundamental frequency and Mel cepstral coefficients and compare generated speech when the spectral cost function is defined in the Mel cepstral, warped log spectrum or perceptual domains. Objective results indicate that the perceptual domain system achieves the highest quality.}
}
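To make the idea in the abstract concrete: the network still predicts vocoder parameters, but the training cost is measured after both prediction and target have been passed through a fixed, differentiable mapping into another domain. The snippet below is only a sketch of that pattern; the paper's Spectro-Temporal Excitation Pattern approximation is not reproduced, and `perceptual_map` is a hypothetical stand-in (a fixed random linear transform).

```python
# Sketch of computing the training cost in a transformed domain rather than on the raw
# vocoder parameters. `perceptual_map` is a hypothetical stand-in (a fixed random linear
# transform); the paper's STEP-based mapping is not reproduced here.
import torch
import torch.nn as nn

acoustic_dim, perceptual_dim = 60, 40
W = torch.randn(acoustic_dim, perceptual_dim)    # placeholder fixed transform

def perceptual_map(params):
    """Map vocoder parameters into the domain in which the cost is measured."""
    return torch.matmul(params, W)

net = nn.Sequential(nn.Linear(601, 1024), nn.Tanh(), nn.Linear(1024, acoustic_dim))
x, y = torch.randn(300, 601), torch.randn(300, acoustic_dim)

pred = net(x)                                    # the network still outputs vocoder parameters
loss = nn.functional.mse_loss(perceptual_map(pred), perceptual_map(y))
loss.backward()                                  # gradients flow back through the fixed mapping
```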
@inproceedings{Merritt2015RichContext,
  author = {Merritt, Thomas and Yamagishi, Junichi and Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {{Deep neural network context embeddings for model selection in rich-context HMM synthesis}},
  booktitle = {{Proc. Interspeech}},
  address = {Dresden},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/merritt2015RichContext.pdf},
  abstract = {{This paper introduces a novel form of parametric synthesis that uses context embeddings produced by the bottleneck layer of a deep neural network to guide the selection of models in a rich-context HMM-based synthesiser. Rich-context synthesis – in which Gaussian distributions estimated from single linguistic contexts seen in the training data are used for synthesis, rather than more conventional decision tree-tied models – was originally proposed to address over-smoothing due to averaging across contexts. Our previous investigations have confirmed experimentally that averaging across different contexts is indeed one of the largest factors contributing to the limited quality of statistical parametric speech synthesis. However, a possible weakness of the rich context approach as previously formulated is that a conventional tied model is still used to guide selection of Gaussians at synthesis time. Our proposed approach replaces this with context embeddings derived from a neural network.}},
  categories = {{speech synthesis, hidden Markov model, deep neural networks, rich context, embedding}}
}
@inproceedings{wester:human:IS2015,
  author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  title = {Human vs Machine Spoofing Detection on Wideband and Narrowband Data},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/wester:human:IS2015.pdf},
  abstract = {How well do humans detect spoofing attacks directed at automatic speaker verification systems? This paper investigates the performance of humans at detecting spoofing attacks from speech synthesis and voice conversion systems. Two speaker verification tasks, in which the speakers were either humans or machines, were also conducted. The three tasks were carried out with two types of data: wideband (16kHz) and narrowband (8kHz) telephone line simulated data. Spoofing detection by humans was compared to automatic spoofing detection (ASD) algorithms. Listening tests were carefully constructed to ensure the human and automatic tasks were as similar as possible taking into consideration listeners' constraints (e.g., fatigue and memory limitations). Results for human trials show the error rates on narrowband data double compared to those on wideband data. The second verification task, which included only artificial speech, showed equal overall acceptance rates for both 8kHz and 16kHz. In the spoofing detection task, there was a drop in performance on most of the artificial trials as well as on human trials. At 8kHz, 20% of human trials were incorrectly classified as artificial, compared to 12% at 16kHz. The ASD algorithms also showed a drop in performance on 8kHz data, but outperformed human listeners across the board.},
  categories = {spoofing, human performance, automatic spoofing detection}
}
@article{sizov2015joint,
  author = {Sizov, Aleksandr and Khoury, Elie and Kinnunen, Tomi and Wu, Zhizheng and Marcel, Sebastien},
  publisher = {IEEE},
  title = {Joint Speaker Verification and Antispoofing in the i-Vector Space},
  journal = {IEEE Transactions on Information Forensics and Security},
  number = {4},
  volume = {10},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/TIFS2015_joint.pdf},
  pages = {821--832}
}
@inproceedings{wu2015minimum,
  author = {Wu, Zhizheng and King, Simon},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_trajectory_dnn.pdf},
  booktitle = {Proc. Interspeech},
  year = {2015},
  title = {Minimum trajectory error training for deep neural networks, combined with stacked bottleneck features}
}
@inproceedings{wu2015adaptation,
  author = {Wu, Zhizheng and Swietojanski, Pawel and Veaux, Christophe and Renals, Steve and King, Simon},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_dnn_adaptation.pdf},
  booktitle = {Proc. Interspeech},
  year = {2015},
  title = {A study of speaker adaptation for {DNN}-based speech synthesis}
}
@inproceedings{wu2015asvspoof,
  author = {Wu, Zhizheng and Kinnunen, Tomi and Evans, Nicholas and Yamagishi, Junichi and Hanilci, Cemal and Sahidullah, Md and Sizov, Aleksandr},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_asvspoof.pdf},
  booktitle = {Proc. Interspeech},
  year = {2015},
  title = {{ASVspoof} 2015: the First Automatic Speaker Verification Spoofing and Countermeasures Challenge}
}
@inproceedings{tian2015fusion,
  author = {Tian, Xiaohai and Wu, Zhizheng and Lee, Siu-Wa and Nguyen, Quy Hy and Dong, Minghui and Chng, Eng Siong},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/is2015_xiaohai_fusion.pdf},
  booktitle = {Proc. Interspeech},
  year = {2015},
  title = {System Fusion for High-Performance Voice Conversion}
}
@inproceedings{wu2015sas,
  author = {Wu, Zhizheng and Khodabakhsh, Ali and Demiroglu, Cenk and Yamagishi, Junichi and Saito, Daisuke and Toda, Tomoki and King, Simon},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/icassp2015_sas.pdf},
  booktitle = {Proc. ICASSP},
  year = {2015},
  title = {{SAS}: A speaker verification spoofing database containing diverse attacks}
}
@inproceedings{tian2015sparse,
  author = {Tian, Xiaohai and Wu, Zhizheng and Lee, Siu-Wa and Nguyen, Quy Hy and Chng, Eng Siong and Dong, Minghui},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/icassp2015_sparse_warping.pdf},
  booktitle = {Proc. ICASSP},
  year = {2015},
  title = {Sparse representation for frequency warping based voice conversion}
}
@inproceedings{Hu_Interspeech15,
  author = {Hu, Qiong and Wu, Zhizheng and Richmond, Korin and Yamagishi, Junichi and Stylianou, Yannis and Maia, Ranniery},
  title = {Fusion of multiple parameterisations for {DNN}-based sinusoidal speech synthesis with multi-task learning},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Q_Interspeech15.pdf},
  abstract = {It has recently been shown that deep neural networks (DNN) can improve the quality of statistical parametric speech synthesis (SPSS) when using a source-filter vocoder. Our own previous work has furthermore shown that a dynamic sinusoidal model (DSM) is also highly suited to DNN-based SPSS, whereby sinusoids may either be used themselves as a “direct parameterisation” (DIR), or they may be encoded using an “intermediate spectral parameterisation” (INT). The approach in that work was effectively to replace a decision tree with a neural network. However, waveform parameterisation and synthesis steps that have been developed to suit HMMs may not fully exploit DNN capabilities. Here, in contrast, we investigate ways to combine INT and DIR at the levels of both DNN modelling and waveform generation. For DNN training, we propose to use multi-task learning to model cepstra (from INT) and log amplitudes (from DIR) as primary and secondary tasks. Our results show combining these improves modelling accuracy for both tasks. Next, during synthesis, instead of discarding parameters from the second task, a fusion method using harmonic amplitudes derived from both tasks is applied. Preference tests show the proposed method gives improved performance, and that this applies to synthesising both with and without global variance parameters.}
}
@inproceedings{henter2016robust,
  author = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon},
  title = {Robust {TTS} duration modelling using {DNN}s},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472655},
  abstract = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.},
  address = {Shanghai, China},
  month = {March},
  volume = {41},
  year = {2016},
  keywords = {Speech synthesis, duration modelling, robust statistics},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/henter2016robust.pdf},
  booktitle = {Proc. ICASSP},
  pages = {5130--5134},
  categories = {Speech synthesis, duration modelling, robust statistics}
}
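The density power divergence mentioned in the abstract can be written down compactly for the simplest case. The sketch below is a hedged illustration, not the paper's recipe: it uses a plain regression network with a fixed-variance Gaussian rather than the mixture density networks described above, and the dimensions and beta value are arbitrary. The per-example term reduces (up to constants) to the Gaussian negative log-likelihood as beta tends to 0, while for beta > 0 large residuals are exponentially down-weighted.

```python
# Hedged sketch of the density power divergence (beta-divergence) criterion for a
# fixed-variance Gaussian duration model; the paper's MDN formulation and generation
# heuristic are not reproduced, and all dimensions and the beta value are arbitrary.
import math
import torch
import torch.nn as nn

def dpd_gaussian_loss(mu, y, sigma=1.0, beta=0.2):
    """Density power divergence between the data and N(mu, sigma^2).
    Large residuals are exponentially down-weighted, so ill-fitting durations
    (e.g. from alignment errors) contribute little to the gradient."""
    c = (2 * math.pi * sigma ** 2) ** (-beta / 2)
    sq = (y - mu) ** 2 / (2 * sigma ** 2)
    per_example = c * ((1 + beta) ** -0.5 - (1 + 1 / beta) * torch.exp(-beta * sq))
    return per_example.mean()

# hypothetical duration model: phone-level linguistic features -> duration in frames
duration_net = nn.Sequential(nn.Linear(400, 512), nn.Tanh(), nn.Linear(512, 1))
feats, durations = torch.randn(64, 400), torch.rand(64, 1) * 30
loss = dpd_gaussian_loss(duration_net(feats), durations)
loss.backward()
```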
@inproceedings{watts2016hmms,
  author = {Watts, Oliver and Henter, Gustav Eje and Merritt, Thomas and Wu, Zhizheng and King, Simon},
  title = {From {HMM}s to {DNN}s: where do the improvements come from?},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472730},
  abstract = {Deep neural networks (DNNs) have recently been the focus of much text-to-speech research as a replacement for decision trees and hidden Markov models (HMMs) in statistical parametric synthesis systems. Performance improvements have been reported; however, the configuration of systems evaluated makes it impossible to judge how much of the improvement is due to the new machine learning methods, and how much is due to other novel aspects of the systems. Specifically, whereas the decision trees in HMM-based systems typically operate at the state-level, and separate trees are used to handle separate acoustic streams, most DNN-based systems are trained to make predictions simultaneously for all streams at the level of the acoustic frame. This paper isolates the influence of three factors (machine learning method; state vs. frame predictions; separate vs. combined stream predictions) by building a continuum of systems along which only a single factor is varied at a time. We find that replacing decision trees with DNNs and moving from state-level to frame-level predictions both significantly improve listeners' naturalness ratings of synthetic speech produced by the systems. No improvement is found to result from switching from separate-stream to combined-stream predictions.},
  address = {Shanghai, China},
  month = {March},
  volume = {41},
  year = {2016},
  keywords = {speech synthesis, hidden Markov model, decision tree, deep neural network},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/watts2016hmms.pdf},
  booktitle = {Proc. ICASSP},
  pages = {5505--5509},
  categories = {speech synthesis, hidden Markov model, decision tree, deep neural network}
}
@inproceedings{toda2016voice,
  author = {Toda, Tomoki and Chen, Ling-Hui and Saito, Daisuke and Villavicencio, Fernando and Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/toda2016voice.pdf},
  booktitle = {Proc. Interspeech},
  title = {The Voice Conversion Challenge 2016},
  abstract = {This paper describes the Voice Conversion Challenge 2016 devised by the authors to better understand different voice conversion (VC) techniques by comparing their performance on a common dataset. The task of the challenge was speaker conversion, i.e., to transform the voice identity of a source speaker into that of a target speaker while preserving the linguistic content. Using a common dataset consisting of 162 utterances for training and 54 utterances for evaluation from each of 5 source and 5 target speakers, 17 groups working in VC around the world developed their own VC systems for every combination of the source and target speakers, i.e., 25 systems in total, and generated voice samples converted by the developed systems. These samples were evaluated in terms of target speaker similarity and naturalness by 200 listeners in a controlled environment. This paper summarizes the design of the challenge, its result, and a future plan to share views about unsolved problems and challenges faced by the current VC techniques.},
  year = {2016}
}
@inproceedings{wester2016analysis,
  author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/wester2016analysis.pdf},
  booktitle = {Proc. Interspeech},
  title = {Analysis of the Voice Conversion Challenge 2016 Evaluation Results},
  abstract = {The Voice Conversion Challenge 2016 is the first Voice Conversion Challenge in which different voice conversion systems and approaches using the same voice data were compared. This paper describes the design of the evaluation and presents the results together with statistical analyses of those results.},
  year = {2016}
}
@inproceedings{wester2016multidimensional,
  author = {Wester, Mirjam and Wu, Zhizheng and Yamagishi, Junichi},
  title = {Multidimensional scaling of systems in the Voice Conversion Challenge 2016},
  booktitle = {Proc. Speech Synthesis Workshop 9},
  year = {2016},
  address = {Sunnyvale, CA, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/wester2016multidimensional.pdf},
  abstract = {This study investigates how listeners judge the similarity of voice converted voices using a talker discrimination task. The data used is from the Voice Conversion Challenge 2016. 17 participants from around the world took part in building voice converted voices from a shared data set of source and target speakers. This paper describes the evaluation of similarity for four of the source-target pairs (two intra-gender and two cross-gender) in more detail. Multidimensional scaling was performed to illustrate where each system was perceived to be in an acoustic space compared to the source and target speakers and to each other.}
}
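For readers unfamiliar with the analysis, the kind of multidimensional scaling described above can be reproduced in a few lines. The sketch below uses scikit-learn on a random placeholder dissimilarity matrix (it is not challenge data); in a study of this kind the dissimilarities would come from listeners' talker-discrimination judgements between pairs of stimuli.

```python
# Sketch of the analysis style: multidimensional scaling of a listener-derived
# dissimilarity matrix. The matrix below is random placeholder data, not challenge results.
import numpy as np
from sklearn.manifold import MDS

labels = ["source", "target"] + [f"system {chr(65 + i)}" for i in range(5)]
n = len(labels)

rng = np.random.default_rng(0)
d = rng.random((n, n))                    # placeholder pairwise "different speaker" rates
dissim = (d + d.T) / 2                    # symmetrise
np.fill_diagonal(dissim, 0.0)

mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
coords = mds.fit_transform(dissim)        # 2-D positions for plotting the perceptual space
for name, (px, py) in zip(labels, coords):
    print(f"{name:10s} {px:+.2f} {py:+.2f}")
```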
@inproceedings{ronanki2016template,
  author = {Ronanki, Srikanth and Henter, Gustav Eje and Wu, Zhizheng and King, Simon},
  title = {A template-based approach for speech synthesis intonation generation using {LSTM}s},
  booktitle = {Proc. Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ronanki2016template.pdf},
  abstract = {The absence of convincing intonation makes current parametric speech synthesis systems sound dull and lifeless, even when trained on expressive speech data. Typically, these systems use regression techniques to predict the fundamental frequency (F0) frame-by-frame. This approach leads to overly-smooth pitch contours and fails to construct an appropriate prosodic structure across the full utterance. In order to capture and reproduce larger-scale pitch patterns, this paper proposes a template-based approach for automatic F0 generation, where per-syllable pitch-contour templates (from a small, automatically learned set) are predicted by a recurrent neural network (RNN). The use of syllable templates mitigates the over-smoothing problem and is able to reproduce pitch patterns observed in the data. The use of an RNN, paired with connectionist temporal classification (CTC), enables the prediction of structure in the pitch contour spanning the entire utterance. This novel F0 prediction system is used alongside separate LSTMs for predicting phone durations and the other acoustic features, to construct a complete text-to-speech system. We report the results of objective and subjective tests on an expressive speech corpus of children's audiobooks, and include comparisons to a conventional baseline that predicts F0 directly at the frame level.},
  categories = {speech synthesis, intonation modelling, F0 templates, LSTM, CTC}
}
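The pairing of a recurrent network with connectionist temporal classification, as described above, is easy to sketch. The snippet below is illustrative only: the template inventory size, feature dimensions and sequence lengths are hypothetical, and the actual system's inputs, outputs and training data are not reproduced. CTC lets the network emit one template label per syllable without a pre-computed alignment between input steps and syllables.

```python
# Illustrative only (hypothetical dimensions and template inventory): a bidirectional LSTM
# reads per-step linguistic features and is trained with CTC against a shorter sequence of
# per-syllable pitch-template labels, so no step-to-syllable alignment is required.
import torch
import torch.nn as nn

num_templates = 10                               # assumed size of the learned template set
lstm = nn.LSTM(input_size=300, hidden_size=128, batch_first=True, bidirectional=True)
proj = nn.Linear(256, num_templates + 1)         # +1 output class for the CTC blank symbol
ctc = nn.CTCLoss(blank=num_templates)

x = torch.randn(1, 200, 300)                     # one utterance: 200 input steps of features
targets = torch.randint(0, num_templates, (1, 12))   # 12 syllable-template labels

h, _ = lstm(x)
log_probs = proj(h).log_softmax(dim=-1).transpose(0, 1)   # (T, batch, classes) for CTCLoss
loss = ctc(log_probs, targets,
           input_lengths=torch.tensor([200]),
           target_lengths=torch.tensor([12]))
loss.backward()
```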
@inproceedings{merritt2016hybrid,
  author = {Merritt, Thomas and Clark, Robert A J and Wu, Zhizheng and Yamagishi, Junichi and King, Simon},
  title = {Deep neural network-guided unit selection synthesis},
  booktitle = {Proc. ICASSP},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Merritt_ICASSP2016.pdf},
  abstract = {Vocoding of speech is a standard part of statistical parametric speech synthesis systems. It imposes an upper bound on the naturalness that can possibly be achieved. Hybrid systems using parametric models to guide the selection of natural speech units can combine the benefits of robust statistical models with the high level of naturalness of waveform concatenation. Existing hybrid systems use Hidden Markov Models (HMMs) as the statistical model. This paper demonstrates that the superiority of Deep Neural Network (DNN) acoustic models over HMMs in conventional statistical parametric speech synthesis also carries over to hybrid synthesis. We compare various DNN and HMM hybrid configurations, guiding the selection of waveform units in either the vocoder parameter domain, or in the domain of embeddings (bottleneck features).},
  categories = {speech synthesis, hybrid synthesis, deep neural networks, embedding, unit selection}
}
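The selection mechanism that hybrid systems of this kind rely on can be illustrated with a small dynamic-programming example. The sketch below is not the paper's system: it scores candidate units by Euclidean distance to a DNN-predicted target representation (the target cost) plus a simple join cost computed between whole-unit features (real systems typically join on unit edges), and all data are random placeholders.

```python
# Toy dynamic-programming unit selection guided by DNN predictions; random placeholder data.
import numpy as np

def select_units(predicted, candidates, join_weight=1.0):
    """predicted: (T, D) network output per target position.
    candidates: list of T arrays, each (K, D), the candidate unit features.
    Returns the index of the chosen candidate at each position."""
    T = len(candidates)
    target_cost = [np.linalg.norm(c - predicted[t], axis=1) for t, c in enumerate(candidates)]
    best, back = target_cost[0], []
    for t in range(1, T):
        # join cost between every candidate at t-1 and every candidate at t
        join = np.linalg.norm(candidates[t - 1][:, None, :] - candidates[t][None, :, :], axis=2)
        scores = best[:, None] + join_weight * join            # (K_prev, K_curr)
        back.append(scores.argmin(axis=0))                     # best predecessor per candidate
        best = scores.min(axis=0) + target_cost[t]
    path = [int(best.argmin())]                                # trace back the cheapest path
    for bp in reversed(back):
        path.append(int(bp[path[-1]]))
    return list(reversed(path))

rng = np.random.default_rng(1)
predicted = rng.standard_normal((50, 32))                      # DNN predictions for 50 positions
candidates = [rng.standard_normal((8, 32)) for _ in range(50)] # 8 candidate units per position
print(select_units(predicted, candidates)[:10])
```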
@inproceedings{cstr2016blizzard,
  author = {Merritt, Thomas and Ronanki, Srikanth and Wu, Zhizheng and Watts, Oliver},
  title = {The {CSTR} entry to the {Blizzard Challenge} 2016},
  booktitle = {Proc. Blizzard Challenge},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Cstr2016BlizzardEntry.pdf},
  abstract = {This paper describes the text-to-speech system entered by The Centre for Speech Technology Research into the 2016 Blizzard Challenge. This system is a hybrid synthesis system which uses output from a recurrent neural network to drive a unit selection synthesiser. The annual Blizzard Challenge conducts side-by-side testing of a number of speech synthesis systems trained on a common set of speech data. The task of the 2016 Blizzard Challenge is to train on expressively-read children’s storybooks, and to synthesise speech in the same domain. The Challenge therefore presents an opportunity to test the effectiveness of several techniques we have developed when applied to expressive speech data.},
  categories = {hybrid synthesis, statistical parametric speech synthesis, deep neural network, recurrent neural network, unit selection}
}
@inproceedings{ronanki_demo_ssw2016,
  author = {Ronanki, Srikanth and Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {{A Demonstration of the Merlin Open Source Neural Network Speech Synthesis System}},
  booktitle = {Proc. Speech Synthesis Workshop (SSW9)},
  month = {September},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Merlin_demo_paper.pdf},
  abstract = {This demonstration showcases our new Open Source toolkit for neural network-based speech synthesis, Merlin. We wrote Merlin because we wanted free, simple, maintainable code that we understood. No existing toolkits met all of those requirements. Merlin is designed for speech synthesis, but can be put to other uses. It has already also been used for voice conversion, classification tasks, and for predicting head motion from speech.},
  categories = {Merlin, speech synthesis, deep learning}
}
@inproceedings{watts_blizzard2015,
  author = {Watts, Oliver and Ronanki, Srikanth and Wu, Zhizheng and Raitio, Tuomo and Suni, Antti},
  title = {{The NST--GlottHMM entry to the Blizzard Challenge 2015}},
  booktitle = {Proc. Blizzard Challenge Workshop (Interspeech Satellite)},
  year = {2015},
  month = {September},
  key = {watts_blizzard2015},
  address = {Berlin, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bc2015_nst.pdf},
  abstract = {We describe the synthetic voices forming the joint entry into the 2015 Blizzard Challenge of the Natural Speech Technology consortium, Helsinki University, and Aalto University. The 2015 Blizzard Challenge presents an opportunity to test and benchmark some of the tools we have developed to address the problem of how to produce systems in arbitrary new languages with minimal annotated data and language-specific expertise on the part of the system builders. We here explain how our tools were used to address these problems on the different tasks of the challenge, and provide some discussion of the evaluation results. Some additions to the system used to build voices for the previous Challenge are described: acoustic modelling using deep neural networks with jointly-trained duration model, and an unsupervised approach for handling the phenomenon of inherent vowel deletion which occurs in 3 of the 6 target languages.},
  categories = {statistical parametric speech synthesis, unsupervised learning, vector space model, glottal inverse filtering, deep neural network, glottal flow pulse library, schwa-deletion}
}
@inproceedings{Espic2016,
  author = {Espic, Felipe and Valentini-Botinhao, Cassia and Wu, Zhizheng and King, Simon},
  title = {Waveform generation based on signal reshaping for statistical parametric speech synthesis},
  booktitle = {Proc. Interspeech},
  address = {San Francisco, CA, USA},
  abstract = {We propose a new paradigm of waveform generation for Statistical Parametric Speech Synthesis that is based on neither source-filter separation nor sinusoidal modelling. We suggest that one of the main problems of current vocoding techniques is that they perform an extreme decomposition of the speech signal into source and filter, which is an underlying cause of “buzziness”, “musical artifacts”, or “muffled sound” in the synthetic speech. The proposed method avoids making unnecessary assumptions and decompositions as far as possible, and uses only the spectral envelope and F0 as parameters. Prerecorded speech is used as a base signal, which is “reshaped” to match the acoustic specification predicted by the statistical model, without any source-filter decomposition. A detailed description of the method is presented, including implementation details and adjustments. Subjective listening test evaluations of complete DNN-based text-to-speech systems were conducted for two voices: one female and one male. The results show that the proposed method tends to outperform the state-of-the-art standard vocoder STRAIGHT, whilst using fewer acoustic parameters.},
  month = {September},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/0487.PDF},
  pages = {2263--2267},
  categories = {speech synthesis, waveform generation, vocoding, statistical parametric speech synthesis}
}
@inproceedings{wu2016merlin,
  author = {Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {Merlin: An Open Source Neural Network Speech Synthesis System},
  abstract = {We introduce the Merlin speech synthesis toolkit for neural network-based speech synthesis. The system takes linguistic features as input, and employs neural networks to predict acoustic features, which are then passed to a vocoder to produce the speech waveform. Various neural network architectures are implemented, including a standard feedforward neural network, mixture density neural network, recurrent neural network (RNN), long short-term memory (LSTM) recurrent neural network, amongst others. The toolkit is Open Source, written in Python, and is extensible. This paper briefly describes the system, and provides some benchmarking results on a freely available corpus.},
  month = {September},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/master_2.pdf},
  booktitle = {Proc. Speech Synthesis Workshop (SSW9)},
  pages = {218--223}
}
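Among the architectures listed in the abstract, the mixture density network is perhaps the least familiar; the sketch below shows the idea in isolation. It is not Merlin's code: it is a minimal PyTorch head that parameterises a diagonal Gaussian mixture over the acoustic features and is trained with the mixture negative log-likelihood, with all dimensions chosen arbitrarily.

```python
# Not Merlin's code: a minimal mixture-density output layer (diagonal Gaussian mixture)
# trained with the mixture negative log-likelihood; dimensions are arbitrary.
import math
import torch
import torch.nn as nn

class MDNHead(nn.Module):
    def __init__(self, hidden=256, out_dim=60, mixtures=4):
        super().__init__()
        self.out_dim, self.mixtures = out_dim, mixtures
        self.pi = nn.Linear(hidden, mixtures)                      # mixture weights (logits)
        self.mu = nn.Linear(hidden, mixtures * out_dim)            # component means
        self.log_sigma = nn.Linear(hidden, mixtures * out_dim)     # component log std devs

    def forward(self, h):
        B = h.size(0)
        log_pi = self.pi(h).log_softmax(dim=-1)
        mu = self.mu(h).view(B, self.mixtures, self.out_dim)
        log_sigma = self.log_sigma(h).view(B, self.mixtures, self.out_dim)
        return log_pi, mu, log_sigma

def mdn_nll(log_pi, mu, log_sigma, y):
    """Negative log-likelihood of y under the predicted diagonal Gaussian mixture."""
    y = y.unsqueeze(1)                                             # (B, 1, D) vs (B, M, D)
    log_comp = -0.5 * (((y - mu) / log_sigma.exp()) ** 2
                       + 2 * log_sigma + math.log(2 * math.pi)).sum(dim=-1)
    return -torch.logsumexp(log_pi + log_comp, dim=-1).mean()

backbone = nn.Sequential(nn.Linear(425, 256), nn.Tanh())           # linguistic features -> hidden
head = MDNHead()
x, y = torch.randn(32, 425), torch.randn(32, 60)
loss = mdn_nll(*head(backbone(x)), y)
loss.backward()
```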
@inproceedings{watts2015sentence-level,
  author = {Watts, Oliver and Wu, Zhizheng and King, Simon},
  publisher = {International Speech Communication Association},
  title = {Sentence-level control vectors for deep neural network speech synthesis},
  abstract = {This paper describes the use of a low-dimensional vector representation of sentence acoustics to control the output of a feed-forward deep neural network text-to-speech system on a sentence-by-sentence basis. Vector representations for sentences in the training corpus are learned during network training along with other parameters of the model. Although the network is trained on a frame-by-frame basis, the standard frame-level inputs representing linguistic features are supplemented by features from a projection layer which outputs a learned representation of sentence-level acoustic characteristics. The projection layer contains dedicated parameters for each sentence in the training data which are optimised jointly with the standard network weights. Sentence-specific parameters are optimised on all frames of the relevant sentence -- these parameters therefore allow the network to account for sentence-level variation in the data which is not predictable from the standard linguistic inputs. Results show that the global prosodic characteristics of synthetic speech can be controlled simply and robustly at run time by supplementing basic linguistic features with sentence-level control vectors which are novel but designed to be consistent with those observed in the training corpus.},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/watts2015sentence-level.pdf},
  booktitle = {Proc. Interspeech},
  pages = {2217--2221}
}
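The control-vector mechanism described in the abstract amounts to a table of per-sentence parameters that is concatenated with the frame-level linguistic inputs and optimised jointly with the network weights. The sketch below is a hedged illustration of that idea (it is not the paper's projection-layer implementation, and all dimensions are hypothetical); at synthesis time a vector can simply be supplied by hand instead of looked up.

```python
# Sketch of sentence-level control vectors (hypothetical dimensions): a learned per-sentence
# vector is concatenated with every frame's linguistic features and trained jointly with the
# network; at synthesis time an arbitrary vector can be supplied instead.
import torch
import torch.nn as nn

class ControlVectorTTS(nn.Module):
    def __init__(self, num_sentences, lin_dim=425, control_dim=6, out_dim=187):
        super().__init__()
        self.controls = nn.Embedding(num_sentences, control_dim)   # one vector per training sentence
        self.net = nn.Sequential(
            nn.Linear(lin_dim + control_dim, 1024), nn.Tanh(),
            nn.Linear(1024, 1024), nn.Tanh(),
            nn.Linear(1024, out_dim),
        )

    def forward(self, x, sentence_id=None, control=None):
        if control is None:                                        # training: use the sentence's own vector
            control = self.controls(sentence_id)
        control = control.expand(x.size(0), -1)                    # broadcast over all frames
        return self.net(torch.cat([x, control], dim=-1))

model = ControlVectorTTS(num_sentences=1000)
frames = torch.randn(300, 425)                                     # frame-level linguistic features
out_train = model(frames, sentence_id=torch.tensor([17]))          # training-time lookup
out_synth = model(frames, control=torch.zeros(1, 6))               # synthesis with a chosen vector
```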