The Centre for Speech Technology Research, The University of Edinburgh

Publications by Oliver Watts

owatts.bib

@inproceedings{LorenzoAlbayzinProposal2012,
  author = {Lorenzo-Trueba, Jaime and Watts, Oliver and Barra-Chicote, Roberto and Yamagishi, Junichi and King, Simon and Montero, Juan M},
  title = {Simple4All proposals for the Albayzin Evaluations in Speech Synthesis},
  abstract = {Simple4All is a European funded project that aims to streamline the production of multilanguage expressive synthetic voices by means of unsupervised data extraction techniques, allowing the automatic processing of freely available data into flexible task-specific voices. In this paper we describe three different approaches for this task, the first two covering enhancements in expressivity and flexibility with the final one focusing on the development of unsupervised voices. The first technique introduces the principle of speaker adaptation from average models consisting of multiple voices, with the second being an extension of this adaptation concept into allowing the control of the expressive strength of the synthetic voice. Finally, an unsupervised approach to synthesis capable of learning from unlabelled text data is introduced in detail},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/simple4all-proposal.pdf},
  booktitle = {Proc. Iberspeech 2012},
  categories = {Albayzin challenge, expressive speech synthesis}
}
@article{child_speech_journal_2010,
  author = {Watts, O. and Yamagishi, J. and King, S. and Berkling, K.},
  doi = {10.1109/TASL.2009.2035029},
  title = {Synthesis of Child Speech with {HMM} Adaptation and Voice Conversion},
  journal = {IEEE Transactions on Audio, Speech, and Language Processing},
  issn = {1558-7916},
  number = {5},
  month = {July},
  volume = {18},
  pages = {1005--1016},
  year = {2010},
  keywords = {HMM adaptation techniques, child speech synthesis, hidden Markov model, speaker adaptive modeling technique, speaker dependent technique, speaker-adaptive voice, statistical parametric synthesizer, target speaker corpus, voice conversion, hidden Markov models, speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/YamagishiJ_SynthesisofChildSpeech.pdf},
  abstract = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesizer from that data. We chose to build a statistical parametric synthesizer using the hidden Markov model (HMM)-based system HTS, as this technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. Six different configurations of the synthesizer were compared, using both speaker-dependent and speaker-adaptive modeling techniques, and using varying amounts of data. For comparison with HMM adaptation, techniques from voice conversion were used to transform existing synthesizers to the characteristics of the target speaker. Speaker-adaptive voices generally outperformed child speaker-dependent voices in the evaluation. HMM adaptation outperformed voice conversion style techniques when using the full target speaker corpus; with fewer adaptation data, however, no significant listener preference for either HMM adaptation or voice conversion methods was found.}
}
@article{Ekpenyong2013,
  author = {Ekpenyong, Moses and Urua, Eno-Abasi and Watts, Oliver and King, Simon and Yamagishi, Junichi},
  numpages = {9},
  issue_date = {January, 2014},
  doi = {10.1016/j.specom.2013.02.003},
  title = {Statistical Parametric Speech Synthesis for {I}bibio},
  url = {http://dx.doi.org/10.1016/j.specom.2013.02.003},
  journal = {Speech Communication},
  issn = {0167-6393},
  abstract = {Ibibio is a Nigerian tone language, spoken in the south-east coastal region of Nigeria. Like most African languages, it is resource-limited. This presents a major challenge to conventional approaches to speech synthesis, which typically require the training of numerous predictive models of linguistic features such as the phoneme sequence (i.e., a pronunciation dictionary plus a letter-to-sound model) and prosodic structure (e.g., a phrase break predictor). This training is invariably supervised, requiring a corpus of training data labelled with the linguistic feature to be predicted. In this paper, we investigate what can be achieved in the absence of many of these expensive resources, and also with a limited amount of speech recordings. We employ a statistical parametric method, because this has been found to offer good performance even on small corpora, and because it is able to directly learn the relationship between acoustics and whatever linguistic features are available, potentially mitigating the absence of explicit representations of intermediate linguistic layers such as prosody. We present an evaluation that compares systems that have access to varying degrees of linguistic structure. The simplest system only uses phonetic context (quinphones), and this is compared to systems with access to a richer set of context features, with or without tone marking. It is found that the use of tone marking contributes significantly to the quality of synthetic speech. Future work should therefore address the problem of tone assignment using a dictionary and the building of a prediction module for out-of-vocabulary words.},
  month = {January},
  volume = {56},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Moses_Ibibio.pdf},
  pages = {243--251},
  categories = {HTS, Ibibio, Low-resource languages, Speech synthesis}
}
@phdthesis{watts-2012,
  author = {Watts, Oliver},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/oliver_watts_thesis.pdf},
  school = {University of Edinburgh},
  year = {2012},
  abstract = {This thesis introduces a general method for incorporating the distributional analysis of textual and linguistic objects into text-to-speech (TTS) conversion systems. Conventional TTS conversion uses intermediate layers of representation to bridge the gap between text and speech. Collecting the annotated data needed to produce these intermediate layers is a far from trivial task, possibly prohibitively so for languages in which no such resources are in existence. Distributional analysis, in contrast, proceeds in an unsupervised manner, and so enables the creation of systems using textual data that are not annotated. The method therefore aids the building of systems for languages in which conventional linguistic resources are scarce, but is not restricted to these languages. The distributional analysis proposed here places the textual objects analysed in a continuous-valued space, rather than specifying a hard categorisation of those objects. This space is then partitioned during the training of acoustic models for synthesis, so that the models generalise over objects' surface forms in a way that is acoustically relevant. The method is applied to three levels of textual analysis: to the characterisation of sub-syllabic units, word units and utterances. Entire systems for three languages (English, Finnish and Romanian) are built with no reliance on manually labelled data or language-specific expertise. Results of a subjective evaluation are presented.},
  title = {Unsupervised Learning for Text-to-Speech Synthesis}
}
@inproceedings{watts_yamagishi_king_2011,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {Unsupervised continuous-valued word features for phrase-break prediction without a part-of-speech tagger},
  booktitle = {Proc. Interspeech},
  year = {2011},
  abstract = {Part of speech (POS) tags are foremost among the features conventionally used to predict intonational phrase-breaks for text to speech (TTS) conversion. The construction of such systems therefore presupposes the availability of a POS tagger for the relevant language, or of a corpus manually tagged with POS. However, such tools and resources are not available in the majority of the world’s languages, and manually labelling text with POS tags is an expensive and time-consuming process. We therefore propose the use of continuous-valued features that summarise the distributional characteristics of word types as surrogates for POS features. Importantly, such features are obtained in an unsupervised manner from an untagged text corpus. We present results on the phrase-break prediction task, where use of the features closes the gap in performance between a baseline system (using only basic punctuation-related features) and a topline system (incorporating a state-of-the-art POS tagger).},
  month = {August},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_yamagishi_king_2011.pdf},
  pages = {2157--2160}
}
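
A note on the entry above: the continuous-valued word features it describes are obtained purely from distributional statistics of an untagged corpus. The following is a minimal illustrative sketch of that idea (not the authors' implementation), assuming only numpy and a made-up toy corpus: neighbour co-occurrence counts are factorised with an SVD and the leading components serve as continuous word features in place of POS tags.

# Illustrative sketch only: low-dimensional word features from neighbour
# co-occurrence counts, as a surrogate for POS tags (not the paper's code).
import numpy as np

corpus = "the cat sat on the mat . the dog sat on the rug .".split()

vocab = sorted(set(corpus))
index = {w: i for i, w in enumerate(vocab)}

# Count left and right neighbours for each word type.
counts = np.zeros((len(vocab), 2 * len(vocab)))
for prev, cur, nxt in zip(corpus, corpus[1:], corpus[2:]):
    counts[index[cur], index[prev]] += 1                # left context
    counts[index[cur], len(vocab) + index[nxt]] += 1    # right context

# Row-normalise and reduce dimensionality with SVD; the leading columns of U
# (scaled by the singular values) act as continuous-valued word features.
probs = counts / np.maximum(counts.sum(axis=1, keepdims=True), 1)
U, S, Vt = np.linalg.svd(probs, full_matrices=False)
word_features = U[:, :3] * S[:3]

for w in vocab:
    print(w, np.round(word_features[index[w]], 3))

In a real system the counts would come from a large untagged corpus and the resulting features would be appended to the phrase-break predictor's input, but the overall pipeline has this shape.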
@inproceedings{child_synthesis_2009,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon and Berkling, Kay},
  title = {{HMM} Adaptation and Voice Conversion for the Synthesis of Child Speech: A Comparison},
  booktitle = {Proc. Interspeech 2009},
  year = {2009},
  abstract = {This study compares two different methodologies for producing data-driven synthesis of child speech from existing systems that have been trained on the speech of adults. On one hand, an existing statistical parametric synthesiser is transformed using model adaptation techniques, informed by linguistic and prosodic knowledge, to the speaker characteristics of a child speaker. This is compared with the application of voice conversion techniques to convert the output of an existing waveform concatenation synthesiser with no explicit linguistic or prosodic knowledge. In a subjective evaluation of the similarity of synthetic speech to natural speech from the target speaker, the HMM-based systems evaluated are generally preferred, although this is at least in part due to the higher dimensional acoustic features supported by these techniques.},
  month = {September},
  address = {Brighton, U.K.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/child_synthesis_2009.pdf},
  pages = {2627--2630}
}
@inproceedings{leo_08-3,
  author = {Andersson, J. Sebastian and Badino, Leonardo and Watts, Oliver S. and Aylett, Matthew P.},
  title = {The {CSTR/Cereproc B}lizzard Entry 2008: The Inconvenient Data},
  booktitle = {Proc. Blizzard Challenge Workshop (in Proc. Interspeech 2008)},
  year = {2008},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/cstr-cereproc_Blizzard2008.pdf},
  abstract = {In a commercial system, data used for unit selection systems is collected with a heavy emphasis on homogeneous neutral data that has sufficient coverage for the units that will be used in the system. In this year's Blizzard entry CSTR and CereProc present a joint entry where the emphasis has been to explore techniques to deal with data which is not homogeneous (the English entry) and did not have appropriate coverage for a diphone based system (the Mandarin entry where tone/phone combinations were treated as distinct phone categories). In addition, two further problems were addressed: 1) making use of non-homogeneous data for creating a voice that can realise both expressive and neutral speaking styles (the English entry); 2) building a unit selection system with no native understanding of the language but depending instead on external native evaluation (the Mandarin entry).}
}
@inproceedings{hts-child-oliver,
  author = {Watts, Oliver and Yamagishi, Junichi and Berkling, Kay and King, Simon},
  title = {{HMM}-based synthesis of child speech},
  booktitle = {Proc. 1st Workshop on Child, Computer and Interaction (ICMI'08 post-conference workshop)},
  year = {2008},
  month = {October},
  key = {hts-child-oliver},
  address = {Crete, Greece},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/child-hts-oliver.pdf},
  abstract = {The synthesis of child speech presents challenges both in the collection of data and in the building of a synthesiser from that data. Because only limited data can be collected, and the domain of that data is constrained, it is difficult to obtain the type of phonetically-balanced corpus usually used in speech synthesis. As a consequence, building a synthesiser from this data is difficult. Concatenative synthesisers are not robust to corpora with many missing units (as is likely when the corpus content is not carefully designed), so we chose to build a statistical parametric synthesiser using the HMM-based system HTS. This technique has previously been shown to perform well for limited amounts of data, and for data collected under imperfect conditions. We compared 6 different configurations of the synthesiser, using both speaker-dependent and speaker-adaptive modelling techniques, and using varying amounts of data. The output from these systems was evaluated alongside natural and vocoded speech, in a Blizzard-style listening test.},
  categories = {speech synthesis, HMM-based speech synthesis, HTS, speaker adaptation, voice conversion, average voice, child speech}
}
@inproceedings{higher_level,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {The role of higher-level linguistic features in {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2010},
  abstract = {We analyse the contribution of higher-level elements of the linguistic specification of a data-driven speech synthesiser to the naturalness of the synthetic speech which it generates. The system is trained using various subsets of the full feature-set, in which features relating to syntactic category, intonational phrase boundary, pitch accent and boundary tones are selectively removed. Utterances synthesised by the different configurations of the system are then compared in a subjective evaluation of their naturalness. The work presented forms background analysis for an ongoing set of experiments in performing text-to-speech (TTS) conversion based on shallow features: features that can be trivially extracted from text. By building a range of systems, each assuming the availability of a different level of linguistic annotation, we obtain benchmarks for our on-going work.},
  month = {September},
  address = {Makuhari, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100565.pdf},
  pages = {841--844}
}
@inproceedings{watts_zhou_2011,
  author = {Watts, Oliver and Zhou, Bowen},
  title = {Unsupervised features from text for speech synthesis in a speech-to-speech translation system},
  booktitle = {Proc. Interspeech},
  year = {2011},
  abstract = {We explore the use of linguistic features that can be extracted from unannotated text in an unsupervised, language-independent fashion for text to speech (TTS) conversion in the context of a speech-to-speech translation system. The features are intended to act as surrogates for conventional part of speech (POS) features. Unlike POS features, the experimental features assume only the availability of tools and data that must already be in place for the construction of other components of the translation system, and can therefore be used for the TTS module without incurring additional TTS-specific costs. We here describe the use of the experimental features in a speech synthesiser, using six different configurations of the system to allow the comparison of the proposed features with conventional, knowledge-based POS features. We present results of objective and subjective evaluations of the usefulness of the new features.},
  month = {August},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/watts_zhou_2011.pdf},
  pages = {2153--2156}
}
@inproceedings{junichi:interspeech2010,
  author = {Yamagishi, Junichi and Watts, Oliver and King, Simon and Usabaev, Bela},
  title = {Roles of the Average Voice in Speaker-adaptive {HMM}-based Speech Synthesis},
  booktitle = {{Proc. Interspeech}},
  year = {2010},
  abstract = {In speaker-adaptive HMM-based speech synthesis, there are typically a few speakers for which the output synthetic speech sounds worse than that of other speakers, despite having the same amount of adaptation data from within the same corpus. This paper investigates these fluctuations in quality and concludes that as mel-cepstral distance from the average voice becomes larger, the MOS naturalness scores generally become worse. Although this negative correlation is not that strong, it suggests a way to improve the training and adaptation strategies. We also draw comparisons between our findings and the work of other researchers regarding ``vocal attractiveness.''},
  month = {September},
  address = {Makuhari, Japan},
  keywords = {speech synthesis, HMM, average voice, speaker adaptation},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/IS100361.pdf},
  pages = {418--421}
}
@article{junichi:ieee2010,
  author = {Yamagishi, J. and Usabaev, B. and King, S. and Watts, O. and Dines, J. and Tian, J. and Hu, R. and Guan, Y. and Oura, K. and Tokuda, K. and Karhila, R. and Kurimo, M.},
  doi = {10.1109/TASL.2010.2045237},
  title = {Thousands of Voices for {HMM}-based Speech Synthesis -- Analysis and Application of {TTS} Systems Built on Various {ASR} Corpora},
  journal = {IEEE Transactions on Audio, Speech and Language Processing},
  number = {5},
  abstract = {In conventional speech synthesis, large amounts of phonetically balanced speech data recorded in highly controlled recording studio environments are typically required to build a voice. Although using such data is a straightforward solution for high quality synthesis, the number of voices available will always be limited, because recording costs are high. On the other hand, our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an ``average voice model'' plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on ``non-TTS'' corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper, we demonstrate the thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal (WSJ0, WSJ1, and WSJCAM0), Resource Management, Globalphone, and SPEECON databases. We also present the results of associated analysis based on perceptual evaluation, and discuss remaining issues.},
  month = {July},
  volume = {18},
  year = {2010},
  keywords = {Automatic speech recognition (ASR), H Triple S (HTS), SPEECON database, WSJ database, average voice, hidden Markov model (HMM)-based speech synthesis, speaker adaptation, speech synthesis, voice conversion},
  pages = {984--1004}
}
@inproceedings{jyamagis:1000sHTS,
  author = {Yamagishi, J. and Usabaev, Bela and King, Simon and Watts, Oliver and Dines, John and Tian, Jilei and Hu, Rile and Guan, Yong and Oura, Keiichiro and Tokuda, Keiichi and Karhila, Reima and Kurimo, Mikko},
  title = {Thousands of voices for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2009},
  abstract = {Our recent experiments with HMM-based speech synthesis systems have demonstrated that speaker-adaptive HMM-based speech synthesis (which uses an ‘average voice model’ plus model adaptation) is robust to non-ideal speech data that are recorded under various conditions and with varying microphones, that are not perfectly clean, and/or that lack phonetic balance. This enables us to consider building high-quality voices on ‘non-TTS’ corpora such as ASR corpora. Since ASR corpora generally include a large number of speakers, this leads to the possibility of producing an enormous number of voices automatically. In this paper we show thousands of voices for HMM-based speech synthesis that we have made from several popular ASR corpora such as the Wall Street Journal databases (WSJ0/WSJ1/WSJCAM0), Resource Management, Globalphone and Speecon. We report some perceptual evaluation results and outline the outstanding issues.},
  month = {September},
  address = {Brighton, U.K.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2009/index.php},
  pages = {420--423}
}
@inproceedings{letter_based_TTS,
  author = {Watts, Oliver and Yamagishi, Junichi and King, Simon},
  title = {Letter-based speech synthesis},
  booktitle = {Proc. Speech Synthesis Workshop 2010},
  year = {2010},
  abstract = {Initial attempts at performing text-to-speech conversion based on standard orthographic units are presented, forming part of a larger scheme of training TTS systems on features that can be trivially extracted from text. We evaluate the possibility of using the technique of decision-tree-based context clustering conventionally used in HMM-based systems for parameter-tying to handle letter-to-sound conversion. We present the application of a method of compound-feature discovery to corpus-based speech synthesis. Finally, an evaluation of intelligibility of letter-based systems and more conventional phoneme-based systems is presented.},
  month = {September},
  address = {Nara, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2010/ssw7.pdf},
  pages = {317--322}
}
@inproceedings{Lu_SSW8,
  author = {Lu, Heng and King, Simon and Watts, Oliver},
  title = {Combining a Vector Space Representation of Linguistic Context with a Deep Neural Network for Text-To-Speech Synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {Conventional statistical parametric speech synthesis relies on decision trees to cluster together similar contexts, resulting in tied-parameter context-dependent hidden Markov models (HMMs). However, decision tree clustering has a major weakness: it uses hard divisions and subdivides the model space based on one feature at a time, fragmenting the data and failing to exploit interactions between linguistic context features. These linguistic features themselves are also problematic, being noisy and of varied relevance to the acoustics. We propose to combine our previous work on vector-space representations of linguistic context, which have the added advantage of working directly from textual input, and Deep Neural Networks (DNNs), which can directly accept such continuous representations as input. The outputs of the network are probability distributions over speech features. Maximum Likelihood Parameter Generation is then used to create parameter trajectories, which in turn drive a vocoder to generate the waveform. Various configurations of the system are compared, using both conventional and vector space context representations and with the DNN making speech parameter predictions at two different temporal resolutions: frames, or states. Both objective and subjective results are presented.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS3-3_Lu.pdf},
  pages = {281--285}
}
@inproceedings{Mamiya_SSW8,
  author = {Mamiya, Yoshitaka and Stan, Adriana and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Robert and King, Simon},
  title = {Using Adaptation to Improve Speech Transcription Alignment in Noisy and Reverberant Environments},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {When using data retrieved from the internet to create new speech databases, the recording conditions can often be highly variable within and between sessions. This variance influences the overall performance of any automatic speech and text alignment techniques used to process this data. In this paper we discuss the use of speaker adaptation methods to address this issue. Starting from a baseline system for automatic sentence-level segmentation and speech and text alignment based on GMMs and grapheme HMMs, respectively, we employ Maximum A Posteriori (MAP) and Constrained Maximum Likelihood Linear Regression (CMLLR) techniques to model the variation in the data in order to increase the amount of confidently aligned speech. We tested 29 different scenarios, which include reverberation, 8 talker babble noise and white noise, each in various combinations and SNRs. Results show that the MAP-based segmentation's performance is very much influenced by the noise type, as well as the presence or absence of reverberation. On the other hand, the CMLLR adaptation of the acoustic models gives an average 20\% increase in the aligned data percentage for the majority of the studied scenarios.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS1-4_Mamiya.pdf},
  pages = {61--66}
}
@inproceedings{Watts_SSW8,
  author = {Watts, Oliver and Stan, Adriana and Clark, Rob and Mamiya, Yoshitaka and Giurgiu, Mircea and Yamagishi, Junichi and King, Simon},
  title = {Unsupervised and lightly-supervised learning for rapid construction of {TTS} systems in multiple languages from 'found' data: evaluation and analysis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {This paper presents techniques for building text-to-speech front-ends in a way that avoids the need for language-specific expert knowledge, but instead relies on universal resources (such as the Unicode character database) and unsupervised learning from unannotated data to ease system development. The acquisition of expert language-specific knowledge and expert annotated data is a major bottleneck in the development of corpus-based TTS systems in new languages. The methods presented here side-step the need for such resources as pronunciation lexicons, phonetic feature sets, part of speech tagged data, etc. The paper explains how the techniques introduced are applied to the 14 languages of a corpus of `found' audiobook data. Results of an evaluation of the intelligibility of the systems resulting from applying these novel techniques to this data are presented.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS2-3_Watts.pdf},
  pages = {121--126}
}
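
One concrete, language-independent resource named in the Watts_SSW8 abstract above is the Unicode character database, used in place of expert knowledge about a language's script. As a hedged illustration (not the SIMPLE4ALL code), Python's standard unicodedata module can already supply the kind of coarse character classes such a front-end might exploit:

# Illustrative only: coarse, language-independent character classes derived
# from the Unicode character database via Python's unicodedata module.
import unicodedata

def char_class(ch):
    """Map a character to a coarse class using its Unicode general category."""
    cat = unicodedata.category(ch)          # e.g. 'Lu', 'Ll', 'Nd', 'Po', 'Zs'
    if cat.startswith("L"):
        return "letter"
    if cat.startswith("N"):
        return "digit"
    if cat.startswith("Z") or ch in "\t\n":
        return "space"
    if cat.startswith("P"):
        return "punctuation"
    return "other"

for ch in "Și, 3 câini!":
    print(repr(ch), unicodedata.category(ch), char_class(ch))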
@inproceedings{Lorenzo-Trueba_SSW8,
  author = {Lorenzo-Trueba, Jaime and Barra-Chicote, Roberto and Yamagishi, Junichi and Watts, Oliver and Montero, Juan M.},
  title = {Towards Speaking Style Transplantation in Speech Synthesis},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {One of the biggest challenges in speech synthesis is the production of naturally sounding synthetic voices. This means that the resulting voice must be not only of high enough quality but also that it must be able to capture the natural expressiveness imbued in human speech. This paper focuses on solving the expressiveness problem by proposing a set of different techniques that could be used for extrapolating the expressiveness of proven high quality speaking style models into neutral speakers in HMM-based synthesis. As an additional advantage, the proposed techniques are based on adaptation approaches, which means that they can be used with little training data (around 15 minutes of training data are used in each style for this paper). For the final implementation, a set of 4 speaking styles was considered: news broadcasts, live sports commentary, interviews and parliamentary speech. Finally, the implementations of the 5 techniques were tested through a perceptual evaluation that proves that the deviations between neutral and speaking style average models can be learned and used to imbue expressiveness into target neutral speakers as intended.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_PS2-3_Lorenzo-Trueba.pdf},
  pages = {179--183}
}
@inproceedings{Stan_IS13,
  author = {Stan, Adriana and Watts, Oliver and Mamiya, Yoshitaka and Giurgiu, Mircea and Clark, Rob and Yamagishi, Junichi and King, Simon},
  title = {{TUNDRA: A Multilingual Corpus of Found Data for TTS Research Created with Light Supervision}},
  booktitle = {Proc. Interspeech},
  year = {2013},
  month = {August},
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/IS131055.pdf},
  abstract = {Simple4All Tundra (version 1.0) is the first release of a standardised multilingual corpus designed for text-to-speech research with imperfect or found data. The corpus consists of approximately 60 hours of speech data from audiobooks in 14 languages, as well as utterance-level alignments obtained with a lightly-supervised process. Future versions of the corpus will include finer-grained alignment and prosodic annotation, all of which will be made freely available. This paper gives a general outline of the data collected so far, as well as a detailed description of how this has been done, emphasizing the minimal language-specific knowledge and manual intervention used to compile the corpus. To demonstrate its potential use, text-to-speech systems have been built for all languages using unsupervised or lightly supervised methods, also briefly presented in the paper.}
}
@inproceedings{blizzard_13,
  author = {Watts, Oliver and Stan, Adriana and Mamiya, Yoshitaka and Suni, Antti and Burgos, José Martín and Montero, Juan Manuel},
  title = {{The {Simple4All} entry to the Blizzard Challenge 2013}},
  booktitle = {Proc. Blizzard Challenge 2013},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/S4A_blizzard_2013.pdf},
  abstract = {We describe the synthetic voices entered into the 2013 Blizzard Challenge by the SIMPLE4ALL consortium. The 2013 Blizzard Challenge presents an opportunity to test and benchmark some of the tools we have been developing to address two problems of interest: 1) how best to learn from plentiful 'found' data, and 2) how to produce systems in arbitrary new languages with minimal annotated data and language-specific expertise on the part of the system builders. We here explain how our tools were used to address these problems on the different tasks of the challenge, and provide some discussion of the evaluation results.}
}
@inproceedings{Mamiya_13a,
  author = {Mamiya, Yoshitaka and Yamagishi, Junichi and Watts, Oliver and Clark, Robert A.J. and King, Simon and Stan, Adriana},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/0007987.pdf},
  booktitle = {Proc. ICASSP},
  year = {2013},
  abstract = {Audiobooks have been focused on as promising data for training Text-to-Speech (TTS) systems. However, they usually do not have a correspondence between audio and text data. Moreover, they are usually divided only into chapter units. In practice, we have to make a correspondence of audio and text data before we use them for building TTS synthesisers. However aligning audio and text data is time-consuming and involves manual labor. It also requires persons skilled in speech processing. Previously, we have proposed to use graphemes for automatically aligning speech and text data. This paper further integrates a lightly supervised voice activity detection (VAD) technique to detect sentence boundaries as a pre-processing step before the grapheme approach. This lightly supervised technique requires time stamps of speech and silence only for the first fifty sentences. Combining those, we can semi-automatically build TTS systems from audiobooks with minimum manual intervention. From subjective evaluations we analyse how the grapheme-based aligner and/or the proposed VAD technique impact the quality of HMM-based speech synthesisers trained on audiobooks.},
  title = {Lightly supervised {GMM} {VAD} to use audiobook for speech synthesiser}
}
@inproceedings{boros-2014,
  author = {Boroș, Tiberiu and Stan, Adriana and Watts, Oliver and Dumitrescu, Stefan Daniel},
  title = {{RSS-TOBI} - a Prosodically Enhanced {R}omanian Speech Corpus},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
  year = {2014},
  month = {May},
  address = {Reykjavik, Iceland},
  date = {26-31},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/boros-2014.pdf},
  abstract = {This paper introduces a recent development of a Romanian Speech corpus to include prosodic annotations of the speech data in the form of ToBI labels. We describe the methodology of determining the required pitch patterns that are common for the Romanian language, annotate the speech resource, and then provide a comparison of two text-to-speech synthesis systems to establish the benefits of using this type of information to our speech resource. The result is a publicly available speech dataset which can be used to further develop speech synthesis systems or to automatically learn the prediction of ToBI labels from text in Romanian language.},
  categories = {text-to-speech synthesis, Romanian language, ToBI}
}
@inproceedings{watts-2014,
  author = {Watts, Oliver and Gangireddy, Siva and Yamagishi, Junichi and King, Simon and Renals, Steve and Stan, Adriana and Giurgiu, Mircea},
  title = {Neural net word representations for phrase-break prediction without a part of speech tagger},
  booktitle = {Proc. ICASSP},
  year = {2014},
  month = {May},
  pages = {2618--2622},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/watts-2014.pdf},
  abstract = {The use of shared projection neural nets of the sort used in language modelling is proposed as a way of sharing parameters between multiple text-to-speech system components. We experiment with pretraining the weights of such a shared projection on an auxiliary language modelling task and then apply the resulting word representations to the task of phrase-break prediction. Doing so allows us to build phrase-break predictors that rival conventional systems without any reliance on conventional knowledge-based resources such as part of speech taggers.},
  categories = {Speech synthesis, TTS, unsupervised learning, neural net language modelling, multitask learning}
}
@inproceedings{blizzard_14,
  author = {Suni, Antti and Raitio, Tuomo and Gowda, Dhananjaya and Karhila, Reima and Gibson, Matt and Watts, Oliver},
  title = {{The {Simple4All} entry to the Blizzard Challenge 2014}},
  booktitle = {Proc. Blizzard Challenge 2014},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/blizzard_14.pdf},
  abstract = {We describe the synthetic voices entered into the 2014 Blizzard Challenge by the SIMPLE4ALL consortium. The 2014 Blizzard Challenge presents an opportunity to test and benchmark some of the tools we have been developing to address the problem of how to produce systems in arbitrary new languages with minimal annotated data and language-specific expertise on the part of the system builders. We here explain how our tools were used to address these problems on the different tasks of the challenge, and provide some discussion of the evaluation results. Several additions to the system used to build voices for the previous Challenge are described: naive alphabetisation, unsupervised syllabification, and glottal flow pulse prediction using deep neural networks.},
  categories = {statistical parametric speech synthesis, unsupervised learning, vector space model, glottal inverse filtering, deep neural network, glottal flow pulse library}
}
@inproceedings{dnnbmtl_ICASSP15,
  author = {Wu, Z. and Valentini-Botinhao, C. and Watts, O. and King, S.},
  title = {{Deep neural networks employing multi-task learning and stacked bottleneck features for speech synthesis.}},
  booktitle = {Proc. ICASSP},
  year = {2015},
  abstract = {Deep neural networks (DNNs) use a cascade of hidden representations to enable the learning of complex mappings from input to output features. They are able to learn the complex mapping from text-based linguistic features to speech acoustic features, and so perform text-to-speech synthesis. Recent results suggest that DNNs can produce more natural synthetic speech than conventional HMM-based statistical parametric systems. In this paper, we show that the hidden representation used within a DNN can be improved through the use of Multi-Task Learning, and that stacking multiple frames of hidden layer activations (stacked bottleneck features) also leads to improvements. Experimental results confirmed the effectiveness of the proposed methods, and in listening tests we find that stacked bottleneck features in particular offer a significant improvement over both a baseline DNN and a benchmark HMM system.},
  month = {April},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/dnnbmtl_ICASSP15.pdf},
  pages = {4460--4464}
}
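
The "stacked bottleneck features" in the entry above amount to concatenating a window of bottleneck-layer activations from neighbouring frames before passing them to the next network. The sketch below (numpy only, with made-up dimensions; not the paper's code) shows just that stacking step:

# Illustrative sketch: stack a +/- `context` window of bottleneck activations
# around each frame (edge frames are padded by repetition). Not the paper's code.
import numpy as np

def stack_bottleneck(bn, context=4):
    """bn: (num_frames, bn_dim) bottleneck activations.
    Returns (num_frames, (2*context + 1) * bn_dim) stacked features."""
    padded = np.concatenate([np.repeat(bn[:1], context, axis=0),
                             bn,
                             np.repeat(bn[-1:], context, axis=0)])
    windows = [padded[i:i + len(bn)] for i in range(2 * context + 1)]
    return np.concatenate(windows, axis=1)

bn = np.random.randn(100, 32)          # e.g. 100 frames of a 32-unit bottleneck
stacked = stack_bottleneck(bn)
print(stacked.shape)                   # (100, 288)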
@inproceedings{Merritt2015RichContext,
  author = {Merritt, Thomas and Yamagishi, Junichi and Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {{Deep neural network context embeddings for model selection in rich-context HMM synthesis}},
  booktitle = {{Proc. Interspeech}},
  address = {Dresden},
  month = {September},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/merritt2015RichContext.pdf},
  abstract = {{This paper introduces a novel form of parametric synthesis that uses context embeddings produced by the bottleneck layer of a deep neural network to guide the selection of models in a rich-context HMM-based synthesiser. Rich-context synthesis – in which Gaussian distributions estimated from single linguistic contexts seen in the training data are used for synthesis, rather than more conventional decision tree-tied models – was originally proposed to address over-smoothing due to averaging across contexts. Our previous investigations have confirmed experimentally that averaging across different contexts is indeed one of the largest factors contributing to the limited quality of statistical parametric speech synthesis. However, a possible weakness of the rich context approach as previously formulated is that a conventional tied model is still used to guide selection of Gaussians at synthesis time. Our proposed approach replaces this with context embeddings derived from a neural network.}},
  categories = {{speech synthesis, hidden Markov model, deep neural networks, rich context, embedding}}
}
@article{stan-2016,
  author = {Stan, Adriana and Mamiya, Yoshitaka and Yamagishi, Junichi and Bell, Peter and Watts, Oliver and Clark, Rob and King, Simon},
  doi = {10.1016/j.csl.2015.06.006},
  title = {{ALISA}: An automatic lightly supervised speech segmentation and alignment tool},
  url = {http://www.sciencedirect.com/science/article/pii/S0885230815000650},
  journal = {Computer Speech and Language},
  issn = {0885-2308},
  abstract = {This paper describes the ALISA tool, which implements a lightly supervised method for sentence-level alignment of speech with imperfect transcripts. Its intended use is to enable the creation of new speech corpora from a multitude of resources in a language-independent fashion, thus avoiding the need to record or transcribe speech data. The method is designed so that it requires minimum user intervention and expert knowledge, and it is able to align data in languages which employ alphabetic scripts. It comprises a GMM-based voice activity detector and a highly constrained grapheme-based speech aligner. The method is evaluated objectively against a gold standard segmentation and transcription, as well as subjectively through building and testing speech synthesis systems from the retrieved data. Results show that on average, 70% of the original data is correctly aligned, with a word error rate of less than 0.5%. In one case, subjective listening tests show a statistically significant preference for voices built on the gold transcript, but this is small and in other tests, no statistically significant differences between the systems built from the fully supervised training data and the one which uses the proposed method are found.},
  volume = {35},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/stan-2016.pdf},
  pages = {116--133},
  categories = {Speech segmentation, speech and text alignment, grapheme acoustic models, lightly supervised system, imperfect transcripts}
}
@inproceedings{wester2016evaluating,
  author = {Wester, Mirjam and Watts, Oliver and Henter, Gustav Eje},
  title = {Evaluating comprehension of natural and synthetic conversational speech},
  url = {http://www.isca-speech.org/archive/sp2016/pdfs_stamped/41.pdf},
  abstract = {Current speech synthesis methods typically operate on isolated sentences and lack convincing prosody when generating longer segments of speech. Similarly, prevailing TTS evaluation paradigms, such as intelligibility (transcription word error rate) or MOS, only score sentences in isolation, even though overall comprehension is arguably more important for speech-based communication. In an effort to develop more ecologically-relevant evaluation techniques that go beyond isolated sentences, we investigated comprehension of natural and synthetic speech dialogues. Specifically, we tested listener comprehension on long segments of spontaneous and engaging conversational speech (three 10-minute radio interviews of comedians). Interviews were reproduced either as natural speech, synthesised from carefully prepared transcripts, or synthesised using durations from forced-alignment against the natural speech, all in a balanced design. Comprehension was measured using multiple choice questions. A significant difference was measured between the comprehension/retention of natural speech (74\% correct responses) and synthetic speech with forced-aligned durations (61\% correct responses). However, no significant difference was observed between natural and regular synthetic speech (70\% correct responses). Effective evaluation of comprehension remains elusive.},
  address = {Boston, MA},
  month = {June},
  volume = {8},
  year = {2016},
  keywords = {evaluation, comprehension, conversational speech, statistical parametric speech synthesis},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/41.pdf},
  booktitle = {Speech Prosody},
  pages = {736--740},
  categories = {evaluation, comprehension, conversational speech, statistical parametric speech synthesis}
}
@inproceedings{henter2016robust,
  author = {Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon},
  title = {Robust {TTS} duration modelling using {DNN}s},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472655},
  abstract = {Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.},
  address = {Shanghai, China},
  month = {March},
  volume = {41},
  year = {2016},
  keywords = {Speech synthesis, duration modelling, robust statistics},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/henter2016robust.pdf},
  booktitle = {Proc. ICASSP},
  pages = {5130--5134},
  categories = {Speech synthesis, duration modelling, robust statistics}
}
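
For the robust fitting criterion named in the henter2016robust abstract above: the density power divergence (beta-divergence) of Basu et al. trades efficiency for robustness by down-weighting low-density points. As a rough, hedged sketch only (a plain univariate Gaussian fitted numerically; the paper itself uses mixture density networks, and the data and the beta value below are placeholders):

# Hedged sketch of a density power divergence (beta-divergence) fit of a
# univariate Gaussian to durations with outliers; illustration only, not the
# paper's MDN-based duration model.
import numpy as np
from scipy.optimize import minimize
from scipy.stats import norm

def dpd_objective(params, x, beta):
    mu, log_sigma = params
    sigma = np.exp(log_sigma)
    # Closed form of the integral term for a Gaussian:
    #   int N(z; mu, sigma^2)^(1+beta) dz = (2*pi*sigma^2)^(-beta/2) / sqrt(1+beta)
    integral = (2 * np.pi * sigma ** 2) ** (-beta / 2) / np.sqrt(1 + beta)
    empirical = np.mean(norm.pdf(x, mu, sigma) ** beta)
    return integral - (1 + 1 / beta) * empirical

# Durations (in frames) with a few gross outliers, e.g. alignment errors.
x = np.concatenate([np.random.normal(10, 2, 500), [80, 90, 100]])

robust = minimize(dpd_objective, x0=[np.mean(x), np.log(np.std(x))],
                  args=(x, 0.3), method="Nelder-Mead")
print("robust mean/std:", robust.x[0], np.exp(robust.x[1]))
print("ML mean/std:    ", np.mean(x), np.std(x))

The maximum-likelihood estimate is dragged towards the outliers, while the robust fit is expected to stay close to the bulk of the data, which is the behaviour the paper exploits for found speech corpora.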
@inproceedings{watts2016hmms,
  author = {Watts, Oliver and Henter, Gustav Eje and Merritt, Thomas and Wu, Zhizheng and King, Simon},
  title = {From {HMM}s to {DNN}s: where do the improvements come from?},
  url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7472730},
  abstract = {Deep neural networks (DNNs) have recently been the focus of much text-to-speech research as a replacement for decision trees and hidden Markov models (HMMs) in statistical parametric synthesis systems. Performance improvements have been reported; however, the configuration of systems evaluated makes it impossible to judge how much of the improvement is due to the new machine learning methods, and how much is due to other novel aspects of the systems. Specifically, whereas the decision trees in HMM-based systems typically operate at the state-level, and separate trees are used to handle separate acoustic streams, most DNN-based systems are trained to make predictions simultaneously for all streams at the level of the acoustic frame. This paper isolates the influence of three factors (machine learning method; state vs. frame predictions; separate vs. combined stream predictions) by building a continuum of systems along which only a single factor is varied at a time. We find that replacing decision trees with DNNs and moving from state-level to frame-level predictions both significantly improve listeners' naturalness ratings of synthetic speech produced by the systems. No improvement is found to result from switching from separate-stream to combined-stream predictions.},
  address = {Shanghai, China},
  month = {March},
  volume = {41},
  year = {2016},
  keywords = {speech synthesis, hidden Markov model, decision tree, deep neural network},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/watts2016hmms.pdf},
  booktitle = {Proc. ICASSP},
  pages = {5505--5509},
  categories = {speech synthesis, hidden Markov model, decision tree, deep neural network}
}
@inproceedings{cstr2016blizzard,
  author = {Merritt, Thomas and Ronanki, Srikanth and Wu, Zhizheng and Watts, Oliver},
  title = {The {CSTR} entry to the {Blizzard Challenge} 2016},
  booktitle = {Proc. Blizzard Challenge},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Cstr2016BlizzardEntry.pdf},
  abstract = {This paper describes the text-to-speech system entered by The Centre for Speech Technology Research into the 2016 Blizzard Challenge. This system is a hybrid synthesis system which uses output from a recurrent neural network to drive a unit selection synthesiser. The annual Blizzard Challenge conducts side-by-side testing of a number of speech synthesis systems trained on a common set of speech data. The task of the 2016 Blizzard Challenge is to train on expressively-read children’s storybooks, and to synthesise speech in the same domain. The Challenge therefore presents an opportunity to test the effectiveness of several techniques we have developed when applied to expressive speech data.},
  categories = {hybrid synthesis, statistical parametric speech synthesis, deep neural network, recurrent neural network, unit selection}
}
@inproceedings{ribeiro2016wavelet,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi and Clark, Robert A. J.},
  title = {Wavelet-based decomposition of f0 as a secondary task for {DNN-based} speech synthesis with multi-task learning},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year = {2016},
  month = {March},
  address = {Shanghai, China},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ribeiro-et-al-icassp16.pdf},
  abstract = {We investigate two wavelet-based decomposition strategies of the f0 signal and their usefulness as a secondary task for speech synthesis using multi-task deep neural networks (MTL-DNN). The first decomposition strategy uses a static set of scales for all utterances in the training data. We propose a second strategy, where the scale of the mother wavelet is dynamically adjusted to the rate of each utterance. This approach is able to capture f0 variations related to the syllable, word, clitic-group, and phrase units. This method also constrains the wavelet components to be within the frequency range that previous experiments have shown to be more natural. These two strategies are evaluated as a secondary task in multi-task deep neural networks (MTL-DNNs). Results indicate that on an expressive dataset there is a strong preference for the systems using multi-task learning when compared to the baseline system.},
  categories = {speech synthesis, f0 modelling, deep neural network, multi-task learning, continuous wavelet transform}
}
@inproceedings{ribeiro2016syllable,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Syllable-level representations of suprasegmental features for {DNN-based} text-to-speech synthesis},
  booktitle = {Proceedings of Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, United States},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/1034.PDF},
  abstract = {A top-down hierarchical system based on deep neural networks is investigated for the modeling of prosody in speech synthesis. Suprasegmental features are processed separately from segmental features and a compact distributed representation of high-level units is learned at syllable-level. The suprasegmental representation is then integrated into a frame-level network. Objective measures show that balancing segmental and suprasegmental features can be useful for the frame-level network. Additional features incorporated into the hierarchical system are then tested. At the syllable-level, a bag-of-phones representation is proposed and, at the word-level, embeddings learned from text sources are used. It is shown that the hierarchical system is able to leverage new features at higher-levels more efficiently than a system which exploits them directly at the frame-level. A perceptual evaluation of the proposed systems is conducted and followed by a discussion of the results.},
  categories = {speech synthesis, prosody, deep neural networks, suprasegmental representations}
}
@inproceedings{ribeiro2016parallel,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Parallel and cascaded deep neural networks for text-to-speech synthesis},
  booktitle = {9th ISCA Workshop on Speech Synthesis (SSW9)},
  year = {2016},
  month = {September},
  address = {Sunnyvale, United States},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ribeiro-et-al-ssw9.pdf},
  abstract = {An investigation of cascaded and parallel deep neural networks for speech synthesis is conducted. In these systems, suprasegmental linguistic features (syllable-level and above) are processed separately from segmental features (phone-level and below). The suprasegmental component of the networks learns compact distributed representations of high-level linguistic units without any segmental influence. These representations are then integrated into a frame-level system using a cascaded or a parallel approach. In the cascaded network, suprasegmental representations are used as input to the frame-level network. In the parallel network, segmental and suprasegmental features are processed separately and concatenated at a later stage. These experiments are conducted with a standard set of high-dimensional linguistic features as well as a hand-pruned one. It is observed that hierarchical systems are consistently preferred over the baseline feedforward systems. Similarly, parallel networks are preferred over cascaded networks.},
  categories = {speech synthesis, prosody, deep neural networks, embeddings, suprasegmental representations}
}
@inproceedings{ribeiro2017learning,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Learning word vector representations based on acoustic counts},
  booktitle = {Proceedings of Interspeech},
  year = {2017},
  month = {August},
  address = {Stockholm, Sweden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/1340.PDF},
  abstract = {This paper presents a simple count-based approach to learning word vector representations by leveraging statistics of co-occurrences between text and speech. This type of representation requires two discrete sequences of units defined across modalities. Two possible methods for the discretization of an acoustic signal are presented, which are then applied to fundamental frequency and energy contours of a transcribed corpus of speech, yielding a sequence of textual objects (e.g. words, syllables) aligned with a sequence of discrete acoustic events. Constructing a matrix recording the co-occurrence of textual objects with acoustic events and reducing its dimensionality with matrix decomposition results in a set of context-independent representations of word types. These are applied to the task of acoustic modelling for speech synthesis; objective and subjective results indicate that these representations are useful for the generation of acoustic parameters in a text-to-speech (TTS) system. In general, we observe that the more discretization approaches, acoustic signals, and levels of linguistic analysis are incorporated into a TTS system via these count-based representations, the better that TTS system performs.},
  categories = {speech synthesis, text-to-speech, vector representations, word embeddings, deep neural networks}
}
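
The counting procedure in the ribeiro2017learning abstract above (discretise an acoustic contour into events, then count co-occurrences between word types and events) can be illustrated with a toy example. Everything below is an assumption-laden sketch (quantile binning of per-word mean f0, numpy only), not the authors' implementation; dimensionality reduction of the count matrix, e.g. by SVD, would follow.

# Toy illustration of text/acoustic co-occurrence counting: per-word mean f0
# is quantised into discrete "acoustic events", then a word-by-event count
# matrix is accumulated.
import numpy as np

words   = ["the", "cat", "sat", "the", "dog", "sat"]
mean_f0 = np.array([110.0, 180.0, 150.0, 112.0, 170.0, 148.0])  # Hz, per word

# Discretise f0 into three events via quantile bins.
edges  = np.quantile(mean_f0, [1/3, 2/3])
events = np.digitize(mean_f0, edges)          # 0 = low, 1 = mid, 2 = high

vocab = sorted(set(words))
index = {w: i for i, w in enumerate(vocab)}
counts = np.zeros((len(vocab), 3))
for w, e in zip(words, events):
    counts[index[w], e] += 1

for w in vocab:
    print(w, counts[index[w]])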
@inproceedings{ronanki_interspeech17,
  author = {Ronanki, Srikanth and Watts, Oliver and King, Simon},
  title = {{A Hierarchical Encoder-Decoder Model for Statistical Parametric Speech Synthesis}},
  booktitle = {Proc. Interspeech 2017},
  month = {August},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/interspeech_2017_paper.pdf},
  abstract = {Current approaches to statistical parametric speech synthesis using Neural Networks generally require input at the same temporal resolution as the output, typically a frame every 5ms, or in some cases at waveform sampling rate. It is therefore necessary to fabricate highly-redundant frame-level (or sample-level) linguistic features at the input. This paper proposes the use of a hierarchical encoder-decoder model to perform the sequence-to-sequence regression in a way that takes the input linguistic features at their original timescales, and preserves the relationships between words, syllables and phones. The proposed model is designed to make more effective use of supra-segmental features than conventional architectures, as well as being computationally efficient. Experiments were conducted on prosodically-varied audiobook material because the use of supra-segmental features is thought to be particularly important in this case. Both objective measures and results from subjective listening tests, which asked listeners to focus on prosody, show that the proposed method performs significantly better than a conventional architecture that requires the linguistic input to be at the acoustic frame rate. We provide code and a recipe to enable our system to be reproduced using the Merlin toolkit.},
  categories = {Speech synthesis, prosody modeling, hierarchical models and sequence modeling}
}
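
The key mechanic in the ronanki_interspeech17 abstract above is letting each linguistic level keep its own timescale and expanding coarse units only where a finer level needs them, instead of fabricating redundant frame-level inputs. A minimal, assumption-laden sketch of that timescale-bridging step (numpy only; nothing like the full encoder-decoder) is:

# Minimal sketch of timescale bridging: word-level vectors are repeated
# according to how many phones each word contains, so that a phone-level
# model can condition on them directly.
import numpy as np

word_vectors    = np.random.randn(3, 8)     # 3 words, 8-dim word encodings
phones_per_word = np.array([2, 4, 3])       # alignment of phones to words

phone_level_context = np.repeat(word_vectors, phones_per_word, axis=0)
print(phone_level_context.shape)            # (9, 8): one row per phone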
@inproceedings{ronanki_blizzard2017,
  author = {Ronanki, Srikanth and Ribeiro, Sam and Espic, Felipe and Watts, Oliver},
  title = {{The CSTR entry to the Blizzard Challenge 2017}},
  booktitle = {Proc. Blizzard Challenge Workshop (Interspeech Satellite)},
  year = {2017},
  month = {August},
  key = {ronanki_blizzard2017},
  address = {Stockholm, Sweden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/CSTR_Blizzard2017.pdf},
  abstract = {The annual Blizzard Challenge conducts side-by-side testing of a number of speech synthesis systems trained on a common set of speech data. Similar to the 2016 Blizzard Challenge, the task for this year is to train on expressively-read children's story-books, and to synthesise speech in the same domain. The Challenge therefore presents an opportunity to investigate the effectiveness of several techniques we have developed when applied to expressive and prosodically-varied audiobook data. This paper describes the text-to-speech system entered by The Centre for Speech Technology Research into the 2017 Blizzard Challenge. The current system is a hybrid synthesis system which drives a unit selection synthesiser using the output from a neural network-based acoustic and duration model. We assess the performance of our system by reporting the results from formal listening tests provided by the challenge.},
  categories = {Merlin, hybrid speech synthesis, unit selection, deep neural networks}
}
@inproceedings{ronanki_slt2016,
  author = {Ronanki, Srikanth and Watts, Oliver and King, Simon and Henter, Gustav Eje},
  title = {{Median-Based Generation of Synthetic Speech Durations using a Non-Parametric Approach}},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology (SLT)},
  month = {December},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ronanki2016mediandur.pdf},
  abstract = {This paper proposes a new approach to duration modelling for statistical parametric speech synthesis in which a recurrent statistical model is trained to output a phone transition probability at each timestep (acoustic frame). Unlike conventional approaches to duration modelling – which assume that duration distributions have a particular form (e.g., a Gaussian) and use the mean of that distribution for synthesis – our approach can in principle model any distribution supported on the non-negative integers. Generation from this model can be performed in many ways; here we consider output generation based on the median predicted duration. The median is more typical (more probable) than the conventional mean duration, is robust to training-data irregularities, and enables incremental generation. Furthermore, a frame-level approach to duration prediction is consistent with a longer-term goal of modelling durations and acoustic features together. Results indicate that the proposed method is competitive with baseline approaches in approximating the median duration of held-out natural speech.},
  categories = {text-to-speech, speech synthesis, duration modelling, non-parametric models, LSTMs}
}
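The median-based generation step of the paper above can be sketched as follows. This is an assumed parameterisation, not the authors' implementation: given per-frame phone-transition probabilities from a recurrent model, it derives the implied duration distribution and returns its median, i.e. the first frame at which the cumulative probability of having transitioned reaches 0.5.

import numpy as np

def median_duration(transition_probs):
    """transition_probs[t] = P(phone ends at frame t | it has not ended yet)."""
    survival = 1.0     # P(phone has not yet ended)
    cumulative = 0.0   # P(duration <= t)
    for t, p in enumerate(transition_probs, start=1):
        cumulative += survival * p      # P(duration == t)
        survival *= (1.0 - p)           # P(duration > t)
        if cumulative >= 0.5:
            return t
    return len(transition_probs)        # fall back to the maximum duration

# Example: a phone whose predicted transition probability rises over time.
print(median_duration([0.05, 0.1, 0.2, 0.4, 0.6, 0.8]))   # -> 4 frames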
@inproceedings{ronanki_demo_ssw2016,
  author = {Ronanki, Srikanth and Wu, Zhizheng and Watts, Oliver and King, Simon},
  title = {{A Demonstration of the Merlin Open Source Neural Network Speech Synthesis System}},
  booktitle = {Proc. Speech Synthesis Workshop (SSW9)},
  month = {September},
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Merlin_demo_paper.pdf},
  abstract = {This demonstration showcases our new Open Source toolkit for neural network-based speech synthesis, Merlin. We wrote Merlin because we wanted free, simple, maintainable code that we understood. No existing toolkits met all of those requirements. Merlin is designed for speech synthesis, but can be put to other uses. It has already been used for voice conversion, classification tasks, and for predicting head motion from speech.},
  categories = {Merlin, speech synthesis, deep learning}
}
@inproceedings{watts_blizzard2015,
  author = {Watts, Oliver and Ronanki, Srikanth and Wu, Zhizheng and Raitio, Tuomo and Suni, Antti},
  title = {{The NST--GlottHMM entry to the Blizzard Challenge 2015}},
  booktitle = {Proc. Blizzard Challenge Workshop (Interspeech Satellite)},
  year = {2015},
  month = {September},
  key = {watts_blizzard2015},
  address = {Berlin, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/bc2015_nst.pdf},
  abstract = {We describe the synthetic voices forming the joint entry into the 2015 Blizzard Challenge of the Natural Speech Technology consortium, Helsinki University, and Aalto University. The 2015 Blizzard Challenge presents an opportunity to test and benchmark some of the tools we have developed to address the problem of how to produce systems in arbitrary new languages with minimal annotated data and language-specific expertise on the part of the system builders. We here explain how our tools were used to address these problems on the different tasks of the challenge, and provide some discussion of the evaluation results. Some additions to the system used to build voices for the previous Challenge are described: acoustic modelling using deep neural networks with jointly-trained duration model, and an unsupervised approach for handling the phenomenon of inherent vowel deletion which occurs in 3 of the 6 target languages.},
  categories = {statistical parametric speech synthesis, unsupervised learning, vector space model, glottal inverse filtering, deep neural network, glottal flow pulse library, schwa-deletion}
}
@inproceedings{mendelson2017nativization,
  author = {Mendelson, Joseph and Oplustil, Pilar and Watts, Oliver and King, Simon},
  date-modified = {2018-01-19 16:44:20 +0000},
  title = {Nativization of foreign names in TTS for automatic reading of world news in Swahili},
  abstract = {When a text-to-speech (TTS) system is required to speak world news, a large fraction of the words to be spoken will be proper names originating in a wide variety of languages. Phonetization of these names based on target language letter-to-sound rules will typically be inadequate. This is detrimental not only during synthesis, when inappropriate phone sequences are produced, but also during training, if the system is trained on data from the same domain. This is because poor phonetization during forced alignment based on hidden Markov models can pollute the whole model set, resulting in degraded alignment even of normal target-language words. This paper presents four techniques designed to address this issue in the context of a Swahili TTS system: automatic transcription of proper names based on a lexicon from a better-resourced language; the addition of a parallel phone set and special part-of-speech tag exclusively dedicated to proper names; a manually-crafted phone mapping which allows substitutions for potentially more accurate phones in proper names during forced alignment; the addition in proper names of a grapheme-derived frame-level feature, supplementing the standard phonetic inputs to the acoustic model. We present results from objective and subjective evaluations of systems built using these four techniques.},
  month = may,
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/mendelson2017nativization_1.pdf},
  booktitle = {Interspeech 2017}
}
@inproceedings{yoshimura2016hierarchical,
  author = {Yoshimura, Takenori and Henter, {Gustav Eje} and Watts, Oliver and Wester, Mirjam and Yamagishi, Junichi and Tokuda, Keiichi},
  bdsk-url-1 = {http://dx.doi.org/10.21437/Interspeech.2016-847},
  publisher = {International Speech Communication Association},
  doi = {10.21437/Interspeech.2016-847},
  date-modified = {2018-01-19 16:43:35 +0000},
  title = {A Hierarchical Predictor of Synthetic Speech Naturalness Using Neural Networks},
  abstract = {A problem when developing and tuning speech synthesis systems is that there is no well-established method of automatically rating the quality of the synthetic speech. This research attempts to obtain a new automated measure which is trained on the result of large-scale subjective evaluations employing many human listeners, i.e., the Blizzard Challenge. To exploit the data, we experiment with linear regression, feed-forward and convolutional neural network models, and combinations of them to regress from synthetic speech to the perceptual scores obtained from listeners. The biggest improvements were seen when combining stimulus- and system-level predictions.},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/0847.PDF},
  booktitle = {Interspeech 2016},
  pages = {342--346}
}
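A loose sketch of the stimulus- versus system-level idea in the entry above, under assumed inputs (a feature vector and listener MOS per synthetic stimulus, with invented shapes); it is not the paper's models, and plain linear regression stands in for the neural predictors. A stimulus-level regressor is trained, and system-level scores are formed by averaging its per-stimulus predictions.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
features = rng.normal(size=(200, 32))      # one row of features per stimulus
mos = rng.uniform(1, 5, size=200)          # listener scores per stimulus
systems = rng.integers(0, 8, size=200)     # which TTS system produced each

stim_model = LinearRegression().fit(features, mos)
stim_pred = stim_model.predict(features)

# System-level prediction: average the stimulus-level predictions per system.
system_pred = {int(s): float(stim_pred[systems == s].mean())
               for s in np.unique(systems)}
print(system_pred)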
@inproceedings{wu2016merlin,
  author = {Wu, Zhizheng and Watts, Oliver and King, Simon},
  date-modified = {2018-01-19 16:44:16 +0000},
  title = {Merlin: An Open Source Neural Network Speech Synthesis System},
  abstract = {We introduce the Merlin speech synthesis toolkit for neural network-based speech synthesis. The system takes linguistic features as input, and employs neural networks to predict acoustic features, which are then passed to a vocoder to produce the speech waveform. Various neural network architectures are implemented, including a standard feedforward neural network, mixture density neural network, recurrent neural network (RNN), and long short-term memory (LSTM) recurrent neural network, amongst others. The toolkit is Open Source, written in Python, and is extensible. This paper briefly describes the system, and provides some benchmarking results on a freely available corpus.},
  month = sep,
  year = {2016},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/master_2.pdf},
  booktitle = {9th ISCA Speech Synthesis Workshop (2016)},
  pages = {218--223}
}
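The central regression that Merlin performs, mapping frame-level linguistic features to vocoder parameters, can be mimicked with the toy stand-in below. This is not the Merlin API and the dimensions are invented; scikit-learn's MLPRegressor replaces Merlin's own network implementations, and feature extraction and the vocoder are assumed to exist elsewhere.

import numpy as np
from sklearn.neural_network import MLPRegressor

# Hypothetical shapes: 400-dim linguistic input, 187-dim acoustic output
# (e.g. spectral, aperiodicity and log-F0 parameters plus deltas).
rng = np.random.default_rng(0)
X_train = rng.normal(size=(2000, 400))     # frame-level linguistic features
Y_train = rng.normal(size=(2000, 187))     # frame-level acoustic features

dnn = MLPRegressor(hidden_layer_sizes=(256, 256), activation="tanh",
                   max_iter=10)
dnn.fit(X_train, Y_train)

X_test = rng.normal(size=(300, 400))
Y_pred = dnn.predict(X_test)               # pass to a vocoder for a waveform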
@article{chee-yong2015combining,
  author = {{Chee Yong}, Lau and Watts, Oliver and King, Simon},
  date-modified = {2018-01-19 16:43:57 +0000},
  title = {Combining Lightly-supervised Learning and User Feedback to Construct and Improve a Statistical Parametric Speech Synthesizer for Malay},
  journal = {Research Journal of Applied Sciences, Engineering and Technology},
  issn = {2040-7459},
  number = {11},
  month = dec,
  volume = {11},
  pages = {1227--1232},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/lau2015combining.pdf},
  abstract = {In this study, we aim to reduce the human effort in preparing training data for synthesizing human speech and improve the quality of synthetic speech. In spite of the learning-from-data approach used to train the statistical models, the construction of a statistical parametric speech synthesizer involves substantial human effort, especially when using imperfect data or working on a new language. Here, we use lightly-supervised methods for preparing the data and constructing the text-processing front end. This initial system is then iteratively improved using active learning in which feedback from users is used to disambiguate the pronunciation system in our chosen language, Malay. The data are prepared using speaker diarisation and lightly-supervised text-speech alignment. In the front end, grapheme-based units are used. The active learning used small amounts of feedback from a listener to train a classifier. We report evaluations of two systems built from high-quality studio data and lower-quality `found' data respectively, and show that the intelligibility of each can be improved using active learning.},
}
@inproceedings{watts2015sentence-level,
  author = {Watts, Oliver and Wu, Zhizheng and King, Simon},
  publisher = {International Speech Communication Association},
  date-modified = {2018-01-19 16:44:26 +0000},
  title = {Sentence-level control vectors for deep neural network speech synthesis},
  abstract = {This paper describes the use of a low-dimensional vector representation of sentence acoustics to control the output of a feed-forward deep neural network text-to-speech system on a sentence-by-sentence basis. Vector representations for sentences in the training corpus are learned during network training along with other parameters of the model. Although the network is trained on a frame-by-frame basis, the standard frame-level inputs representing linguistic features are supplemented by features from a projection layer which outputs a learned representation of sentence-level acoustic characteristics. The projection layer contains dedicated parameters for each sentence in the training data which are optimised jointly with the standard network weights. Sentence-specific parameters are optimised on all frames of the relevant sentence -- these parameters therefore allow the network to account for sentence-level variation in the data which is not predictable from the standard linguistic inputs. Results show that the global prosodic characteristics of synthetic speech can be controlled simply and robustly at run time by supplementing basic linguistic features with sentence-level control vectors which are novel but designed to be consistent with those observed in the training corpus.},
  month = sep,
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/watts2015sentence-level.pdf},
  booktitle = {INTERSPEECH 2015 16th Annual Conference of the International Speech Communication Association},
  pages = {2217--2221}
}
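The sentence-level control mechanism described above can be sketched as a projection (embedding) table holding one learned vector per training sentence, concatenated to every frame's linguistic features. The architecture, layer sizes and names below are assumptions for illustration in PyTorch, not the authors' exact network.

import torch
import torch.nn as nn

class ControlVectorTTS(nn.Module):
    def __init__(self, n_sentences, ling_dim=400, ctrl_dim=6, acoustic_dim=187):
        super().__init__()
        # One dedicated control vector per training sentence, learned jointly
        # with the rest of the network.
        self.control = nn.Embedding(n_sentences, ctrl_dim)
        self.net = nn.Sequential(
            nn.Linear(ling_dim + ctrl_dim, 1024), nn.Tanh(),
            nn.Linear(1024, 1024), nn.Tanh(),
            nn.Linear(1024, acoustic_dim),
        )

    def forward(self, ling_feats, sentence_ids):
        # ling_feats: (frames, ling_dim); sentence_ids: (frames,) long tensor
        ctrl = self.control(sentence_ids)               # (frames, ctrl_dim)
        return self.net(torch.cat([ling_feats, ctrl], dim=-1))

# At synthesis time a novel control vector (e.g. an average of the training
# vectors, or a nearby hand-chosen point) can be supplied instead of a lookup.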
@inproceedings{kay2015knowledge,
  author = {Kay, Rosie and Watts, Oliver and Barra-Chicote, Roberto and Mayo, Cassie},
  date-modified = {2018-01-19 16:44:10 +0000},
  title = {Knowledge versus data in TTS: evaluation of a continuum of synthesis systems},
  abstract = {Grapheme-based models have been proposed for both ASR and TTS as a way of circumventing the lack of expert-compiled pronunciation lexicons in under-resourced languages. It is a common observation that this should work well in languages employing orthographies with a transparent letter-to-phoneme relationship, such as Spanish. Our experience has shown, however, that there is still a significant difference in intelligibility between grapheme-based systems and conventional ones for this language. This paper explores the contribution of different levels of linguistic annotation to system intelligibility, and the trade-off between those levels and the quantity of data used for training. Ten systems spaced across these two continua of knowledge and data were subjectively evaluated for intelligibility.},
  year = {2015},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/kay2015knowledge.pdf},
  booktitle = {INTERSPEECH 2015, 16th Annual Conference of the International Speech Communication Association, Dresden, Germany, September 6-10, 2015},
  pages = {3335--3339}
}