The Centre for Speech Technology Research, The University of Edinburgh

Publications by Cassia Valentini-Botinhao


@phdthesis{Cassia_PhD13,
  author = {Valentini-Botinhao, Cassia},
  school = {University of Edinburgh},
  title = {Intelligibility enhancement of synthetic speech in noise},
  abstract = {Speech technology can facilitate human-machine interaction and create new communication interfaces. Text-To-Speech (TTS) systems provide speech output for dialogue, notification and reading applications as well as personalized voices for people who have lost the use of their own. TTS systems are built to produce synthetic voices that should sound as natural, expressive and intelligible as possible and, if necessary, be similar to a particular speaker. Although naturalness is an important requirement, providing the correct information in adverse conditions can be crucial to certain applications. Speech that adapts or reacts to different listening conditions can in turn be more expressive and natural. In this work we focus on enhancing the intelligibility of TTS voices in additive noise. For that we adopt the statistical parametric paradigm for TTS in the shape of a hidden Markov model (HMM) based speech synthesis system that allows for flexible enhancement strategies. Little is known about which human speech production mechanisms actually increase intelligibility in noise and how the choice of mechanism relates to noise type, so we approached the problem from another perspective: using mathematical models for hearing speech in noise. To find which models are better at predicting intelligibility of TTS in noise we performed listening evaluations to collect subjective intelligibility scores which we then compared to the models’ predictions. In these evaluations we observed that modifications performed on the spectral envelope of speech can increase intelligibility significantly, particularly if the strength of the modification depends on the noise and its level. We used these findings to inform the decision of which of the models to use when automatically modifying the spectral envelope of the speech according to the noise. We devised two methods, both involving cepstral coefficient modifications. The first was applied during extraction while training the acoustic models and the other when generating a voice using pre-trained TTS models. The latter has the advantage of being able to address fluctuating noise. To increase intelligibility of synthetic speech at generation time we proposed a method for Mel cepstral coefficient modification based on the glimpse proportion measure, the most promising of the models of speech intelligibility that we evaluated. An extensive series of listening experiments demonstrated that this method brings significant intelligibility gains to TTS voices while not requiring additional recordings of clear or Lombard speech. To further improve intelligibility we combined our method with noise-independent enhancement approaches based on the acoustics of highly intelligible speech. This combined solution was as effective for stationary noise as for the challenging competing speaker scenario, obtaining up to 4 dB of equivalent intensity gain. Finally, we proposed an extension to the speech enhancement paradigm to account for not only energetic masking of signals but also for linguistic confusability of words in sentences. We found that word-level confusability, a challenging value to predict, can be used as an additional prior to increase intelligibility even for simple enhancement methods like energy reallocation between words. These findings motivate further research into solutions that can tackle the effect of energetic masking on the auditory system as well as on higher levels of processing.},
  year = {2013},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2013/Cassia_PhD13.pdf},
  categories = {speech synthesis, speech intelligibility in noise}
}
@article{Cassia_CSL13,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S. and Maia, R.},
  doi = {10.1016/j.csl.2013.06.001},
  title = {Intelligibility enhancement of {HMM}-generated speech in additive noise by modifying Mel cepstral coefficients to increase the Glimpse Proportion},
  journal = {Computer Speech and Language},
  number = {2},
  pages = {665--686},
  volume = {28},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Cassia_CSL14.pdf},
  abstract = {This paper describes speech intelligibility enhancement for hidden Markov model (HMM) generated synthetic speech in noise. We present a method for modifying the Mel cepstral coefficients generated by statistical parametric models that have been trained on plain speech. We update these coefficients such that the Glimpse Proportion – an objective measure of the intelligibility of speech in noise – increases, while keeping the speech energy fixed. An acoustic analysis reveals that the modified speech is boosted in the region 1-4kHz, particularly for vowels, nasals and approximants. Results from listening tests employing speech-shaped noise show that the modified speech is as intelligible as a synthetic voice trained on plain speech whose duration, Mel cepstral coefficients and excitation signal parameters have been adapted to Lombard speech from the same speaker. Our proposed method does not require these additional recordings of Lombard speech. In the presence of a competing talker, both modification and adaptation of spectral coefficients give more modest gains.}
}
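
The optimisation target in the entry above, the Glimpse Proportion, is at its core the fraction of spectro-temporal regions in which the speech exceeds the masker by a local SNR criterion (commonly 3 dB). The sketch below illustrates that idea with a plain STFT rather than the gammatone filterbank of the original measure; the function and parameter names are ours, not the authors' implementation.

import numpy as np
from scipy.signal import stft

def glimpse_proportion(speech, noise, fs, criterion_db=3.0):
    # Fraction of time-frequency cells in which the speech level exceeds the
    # noise level by at least `criterion_db` dB. Both signals are assumed to
    # be equal-length arrays sampled at `fs`.
    _, _, S = stft(speech, fs=fs, nperseg=512)
    _, _, N = stft(noise, fs=fs, nperseg=512)
    eps = 1e-12
    local_snr_db = 10.0 * np.log10((np.abs(S) ** 2 + eps) / (np.abs(N) ** 2 + eps))
    return float(np.mean(local_snr_db > criterion_db))
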
@inproceedings{Cassia_IS13,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S. and Stylianou, Y.},
  title = {{Combining perceptually-motivated spectral shaping with loudness and duration modification for intelligibility enhancement of HMM-based synthetic speech in noise}},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2013/Cassia_IS13.pdf},
  abstract = {This paper presents our entry to a speech-in-noise intelligibility enhancement evaluation: the Hurricane Challenge. The system consists of a Text-To-Speech voice manipulated through a combination of enhancement strategies, each of which is known to be individually successful: a perceptually-motivated spectral shaper based on the Glimpse Proportion measure, dynamic range compression, and adaptation to Lombard excitation and duration patterns. We achieved substantial intelligibility improvements relative to unmodified synthetic speech: 4.9 dB in competing speaker and 4.1 dB in speech-shaped noise. An analysis conducted across this and two other similar evaluations shows that the spectral shaper and the compressor (both of which are loudness boosters) contribute most under higher SNR conditions, particularly for speech-shaped noise. Lombard-adapted duration and excitation changes are more beneficial in lower SNR conditions, and for competing speaker noise.}
}
@inproceedings{Cooke_IS13,
  author = {Cooke, M. and Mayo, C. and Valentini-Botinhao, C.},
  title = {{Intelligibility-enhancing speech modifications: the Hurricane Challenge}},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2013/Cooke_IS13.pdf},
  abstract = {Speech output is used extensively, including in situations where correct message reception is threatened by adverse listening conditions. Recently, there has been a growing interest in algorithmic modifications that aim to increase the intelligibility of both natural and synthetic speech when presented in noise. The Hurricane Challenge is the first large-scale open evaluation of algorithms designed to enhance speech intelligibility. Eighteen systems operating on a common data set were subjected to extensive listening tests and compared to unmodified natural and text-to-speech (TTS) baselines. The best-performing systems achieved gains over unmodified natural speech of 4.4 and 5.1 dB in competing speaker and stationary noise respectively, while TTS systems made gains of 5.6 and 5.1 dB over their baseline. Surprisingly, for most conditions the largest gains were observed for noise-independent algorithms, suggesting that performance in this task can be further improved by exploiting information in the masking signal.}
}
@inproceedings{Cassia_ICASSP13,
  author = {Valentini-Botinhao, C. and Godoy, E. and Stylianou, Y. and Sauert, B. and King, S. and Yamagishi, J.},
  title = {{Improving intelligibility in noise of HMM-generated speech via noise-dependent and -independent methods}},
  booktitle = {Proc. ICASSP},
  address = {Vancouver, Canada},
  month = {May},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Cassia_ICASSP13.pdf},
  abstract = {In order to improve the intelligibility of HMM-generated Text-to-Speech (TTS) in noise, this work evaluates several speech enhancement methods, exploring combinations of noise-independent and -dependent approaches as well as algorithms previously developed for natural speech. We evaluate one noise-dependent method proposed for TTS, based on the glimpse proportion measure, and three approaches originally proposed for natural speech - one that estimates the noise and is based on the speech intelligibility index, and two noise-independent methods based on different spectral shaping techniques followed by dynamic range compression. We demonstrate how these methods influence the average spectra for different phone classes. We then present results of a listening experiment with speech-shaped noise and a competing speaker. A few methods made the TTS voice even more intelligible than the natural one. Although noise-dependent methods did not improve gains, the intelligibility differences found in distinct noises motivate such dependency.}
}
@inproceedings{Tang_SPIN13,
  author = {Tang, Y. and Cooke, M. and Valentini-Botinhao, C.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Tang_SPIN13.pdf},
  booktitle = {Proc. SPIN},
  title = {A distortion-weighted glimpse-based intelligibility metric for modified and synthetic speech},
  year = {2013}
}
@article{Cooke_SPCOM13,
  author = {Cooke, M. and Mayo, C. and Valentini-Botinhao, C. and Stylianou, Y. and Sauert, B. and Tang, Y.},
  title = {Evaluating the intelligibility benefit of speech modifications in known noise conditions},
  journal = {Speech Communication},
  abstract = {The use of live and recorded speech is widespread in applications where correct message reception is important. Furthermore, the deployment of synthetic speech in such applications is growing. Modifications to natural and synthetic speech have therefore been proposed which aim at improving intelligibility in noise. The current study compares the benefits of speech modification algorithms in a large-scale speech intelligibility evaluation and quantifies the equivalent intensity change, defined as the amount in decibels that unmodified speech would need to be adjusted by in order to achieve the same intelligibility as modified speech. Listeners identified keywords in phonetically-balanced sentences representing ten different types of speech: plain and Lombard speech, five types of modified speech, and three forms of synthetic speech. Sentences were masked by either a stationary or a competing speech masker. Modification methods varied in the manner and degree to which they exploited estimates of the masking noise. The best-performing modifications led to equivalent intensity changes of around 5 dB in moderate and high noise levels for the stationary masker, and 3--4 dB in the presence of competing speech. These gains exceed those produced by Lombard speech. Synthetic speech in noise was always less intelligible than plain natural speech, but modified synthetic speech reduced this deficit by a significant amount.},
  volume = {55},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cooke_SPCOM13.pdf},
  number = {4},
  pages = {572--585}
}
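
The equivalent intensity change defined in the abstract above can be estimated directly from plain-speech psychometric data: find the SNR at which unmodified speech would reach the score obtained by the modified speech, and take the difference from the SNR actually tested. A minimal sketch, with made-up numbers rather than data from the paper:

import numpy as np

def equivalent_intensity_change(plain_snrs_db, plain_scores, modified_score, test_snr_db):
    # Interpolate the plain-speech psychometric function to find the SNR at
    # which plain speech would match `modified_score`, then return the gain
    # relative to the SNR at which the modified speech was actually tested.
    matched_snr = np.interp(modified_score, plain_scores, plain_snrs_db)
    return matched_snr - test_snr_db

# Illustrative numbers only (not from the paper):
snrs = [-9.0, -4.0, 1.0]        # SNRs at which plain speech was tested
scores = [0.25, 0.55, 0.85]     # plain-speech keyword scores at those SNRs
print(equivalent_intensity_change(snrs, scores, modified_score=0.55, test_snr_db=-9.0))  # 5.0 dB
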
@inproceedings{CassiaICASSP12,
  author = {Valentini-Botinhao, C. and Maia, R. and Yamagishi, J. and King, S. and Zen, H.},
  doi = {10.1109/ICASSP.2012.6288794},
  title = {{Cepstral analysis based on the Glimpse proportion measure for improving the intelligibility of {HMM}-based synthetic speech in noise}},
  booktitle = {Proc. ICASSP},
  address = {Kyoto, Japan},
  month = {March},
  pages = {3997--4000},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_ICASSP12.pdf},
  abstract = {In this paper we introduce a new cepstral coefficient extraction method based on an intelligibility measure for speech in noise, the Glimpse Proportion measure. This new method aims to increase the intelligibility of speech in noise by modifying the clean speech, and has applications in scenarios such as public announcement and car navigation systems. We first explain how the Glimpse Proportion measure operates and further show how we approximated it to integrate it into an existing spectral envelope parameter extraction method commonly used in the HMM-based speech synthesis framework. We then demonstrate how this new method changes the modelled spectrum according to the characteristics of the noise and show results for a listening test with vocoded and HMM-based synthetic speech. The test indicates that the proposed method can significantly improve intelligibility of synthetic speech in speech shaped noise.},
  categories = {HMM-based speech synthesis, intelligibility enhancement, speech analysis}
}
@inproceedings{Cassia_IS11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  title = {Can Objective Measures Predict the Intelligibility of Modified {HMM}-based Synthetic Speech in Noise?},
  booktitle = {Proc. Interspeech},
  month = {August},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_IS11.pdf},
  abstract = {{Synthetic speech can be modified to improve intelligibility in noise. In order to perform modifications automatically, it would be useful to have an objective measure that could predict the intelligibility of modified synthetic speech for human listeners. We analysed the impact on intelligibility – and on how well objective measures predict it – when we separately modify speaking rate, fundamental frequency, line spectral pairs and spectral peaks. Shifting LSPs can increase intelligibility for human listeners; other modifications had weaker effects. Among the objective measures we evaluated, the Dau model and the Glimpse proportion were the best predictors of human performance.}},
  categories = {HMM-based speech synthesis, objective measures of intelligibility}
}
@inproceedings{Cassia_ICASSP11,
  author = {Valentini-Botinhao, Cassia and Yamagishi, Junichi and King, Simon},
  doi = {10.1109/ICASSP.2011.5947507},
  title = {Evaluation of objective measures for intelligibility prediction of {HMM}-based synthetic speech in noise},
  booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2011 IEEE International Conference on},
  issn = {1520-6149},
  month = {May},
  pages = {5112--5115},
  year = {2011},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/Cassia_ICASSP11.pdf},
  abstract = {{In this paper we evaluate four objective measures of speech with regards to intelligibility prediction of synthesized speech in diverse noisy situations. We evaluated three intelligibility measures, the Dau measure, the glimpse proportion and the Speech Intelligibility Index (SII) and a quality measure, the Perceptual Evaluation of Speech Quality (PESQ). For the generation of synthesized speech we used a state of the art HMM-based speech synthesis system. The noisy conditions comprised four additive noises. The measures were compared with subjective intelligibility scores obtained in listening tests. The results show the Dau and the glimpse measures to be the best predictors of intelligibility, with correlations of around 0.83 to subjective scores. All measures gave less accurate predictions of intelligibility for synthetic speech than have previously been found for natural speech; in particular the SII measure. In additional experiments, we processed the synthesized speech by an ideal binary mask before adding noise. The Glimpse measure gave the most accurate intelligibility predictions in this situation.}},
  categories = {HMM-based speech synthesis, objective measures of intelligibility}
}
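
The evaluation methodology in the two entries above comes down to correlating each objective measure's predictions with the subjective scores collected in the listening tests. A trivial sketch with placeholder numbers (not data from the papers):

import numpy as np
from scipy.stats import pearsonr

subjective = np.array([0.21, 0.38, 0.55, 0.67, 0.80])  # listener keyword accuracy per condition
predicted = np.array([0.30, 0.41, 0.50, 0.72, 0.78])   # objective measure per condition
r, p_value = pearsonr(subjective, predicted)
print(f"correlation with listeners: r = {r:.2f} (p = {p_value:.3f})")
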
@inproceedings{CassiaSAPA12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Evaluating speech intelligibility enhancement for {HMM}-based synthetic speech in noise}},
  booktitle = {Proc. SAPA Workshop},
  year = {2012},
  month = {September},
  address = {Portland, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Sapa12.pdf},
  abstract = {It is possible to increase the intelligibility of speech in noise by enhancing the clean speech signal. In this paper we demonstrate the effects of modifying the spectral envelope of synthetic speech according to the environmental noise. To achieve this, we modify Mel cepstral coefficients according to an intelligibility measure that accounts for glimpses of speech in noise: the Glimpse Proportion measure. We evaluate this method against a baseline synthetic voice trained only with normal speech and a topline voice trained with Lombard speech, as well as natural speech. The intelligibility of these voices was measured when mixed with speech-shaped noise and with a competing speaker at three different levels. The Lombard voices, both natural and synthetic, were more intelligible than the normal voices in all conditions. For speech-shaped noise, the proposed modified voice was as intelligible as the Lombard synthetic voice without requiring any recordings of Lombard speech, which are hard to obtain. However, in the case of competing talker noise, the Lombard synthetic voice was more intelligible than the proposed modified voice.},
  categories = {HMM-based speech synthesis, intelligibility enhancement}
}
@inproceedings{CassiaLista12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Using an intelligibility measure to create noise robust cepstral coefficients for {HMM}-based speech synthesis}},
  booktitle = {Proc. LISTA Workshop},
  address = {Edinburgh, UK},
  month = {May},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_Lista12.pdf},
  categories = {HMM-based speech synthesis, intelligibility enhancement}
}
@inproceedings{CassiaWocci12,
  author = {Valentini-Botinhao, C. and Degenkolb-Weyers, S. and Maier, A. and Noeth, E. and Eysholdt, U. and Bocklet, T.},
  title = {{Automatic detection of sigmatism in children}},
  booktitle = {Proc. WOCCI},
  address = {Portland, USA},
  month = {September},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cassia_WOCCI12.pdf},
  abstract = {We propose in this paper an automatic system to detect sigmatism from the speech signal. Sigmatism occurs when the tongue is positioned incorrectly during articulation of sibilant phones like /s/ and /z/. For our task we extracted various sets of features from speech: Mel frequency cepstral coefficients, energies in specific bandwidths of the spectral envelope, and the so-called supervectors, which are the parameters of an adapted speaker model. We then trained several classifiers on a speech database of German adults simulating three different types of sigmatism. Recognition results were calculated at a phone, word and speaker level for both the simulated database and for a database of pathological speakers. For the simulated database, we achieved recognition rates of up to 86%, 87% and 94% at a phone, word and speaker level. The best classifier was then integrated as part of a Java applet that allows patients to record their own speech, either by pronouncing isolated phones, a specific word or a list of words, and provides them with feedback on whether the sibilant phones are being correctly pronounced.}
}
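
One of the simpler feature/classifier combinations described above can be sketched as MFCCs averaged over a sibilant segment and fed to an SVM. This is a hypothetical illustration only: the actual system also used band energies and speaker-adaptation supervectors, and all names, paths and parameters below are placeholders.

import librosa
from sklearn.svm import SVC

def segment_mfcc(wav_path, start_s, end_s, n_mfcc=13):
    # Mean MFCC vector over one sibilant segment (boundaries assumed known,
    # e.g. from a forced alignment).
    y, sr = librosa.load(wav_path, sr=None)
    segment = y[int(start_s * sr):int(end_s * sr)]
    return librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

# train_feats: (n_segments, 13) array of such vectors; train_labels: 0 = normal,
# 1 = sigmatism. Then:
#   clf = SVC(kernel="rbf").fit(train_feats, train_labels)
#   prediction = clf.predict(segment_mfcc("test.wav", 0.40, 0.70).reshape(1, -1))
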
@inproceedings{CassiaIS12,
  author = {Valentini-Botinhao, C. and Yamagishi, J. and King, S.},
  title = {{Mel cepstral coefficient modification based on the Glimpse Proportion measure for improving the intelligibility of {HMM}-generated synthetic speech in noise}},
  booktitle = {Proc. Interspeech},
  address = {Portland, USA},
  month = {September},
  year = {2012},
  abstract = {We propose a method that modifies the Mel cepstral coefficients of HMM-generated synthetic speech in order to increase the intelligibility of the generated speech when heard by a listener in the presence of a known noise. This method is based on an approximation we previously proposed for the Glimpse Proportion measure. Here we show how to update the Mel cepstral coefficients using this measure as an optimization criterion and how to control the amount of distortion by limiting the frequency resolution of the modifications. To evaluate the method we built eight different voices from normal read-text speech data from a male speaker. Some voices were also built from Lombard speech data produced by the same speaker. Listening experiments with speech-shaped noise and with a single competing talker indicate that our method significantly improves intelligibility when compared to unmodified synthetic speech. The voices built from Lombard speech outperformed the proposed method particularly for the competing talker case. However, compared to a voice using only the spectral parameters from Lombard speech, the proposed method obtains similar or higher performance.},
  categories = {HMM-based speech synthesis, intelligibility enhancement, Mel cepstral coefficients}
}
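
Stripped to its essentials, the generation-time strategy above is gradient ascent on the Mel cepstral coefficients with an intelligibility measure as the objective, restricted to the lower-order coefficients so as to limit the spectral resolution of the change and leaving the energy term untouched. The sketch below uses crude finite differences and a single per-coefficient offset shared across frames; the paper derives analytic updates, so treat this purely as an illustration of the idea.

import numpy as np

def enhance_mceps(mceps, objective, n_modifiable=10, step=0.01, eps=1e-4):
    # mceps: (n_frames, order) array; objective: callable mapping an MCEP array
    # to a scalar intelligibility score (higher is better), e.g. an approximated
    # Glimpse Proportion for a known noise.
    updated = mceps.copy()
    for k in range(1, min(n_modifiable, mceps.shape[1])):  # skip c0, the energy term
        perturbed = updated.copy()
        perturbed[:, k] += eps
        grad = (objective(perturbed) - objective(updated)) / eps
        updated[:, k] += step * grad                        # same offset for every frame
    return updated
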
@inproceedings{Valentini-Botinhao_SSW8,
  author = {Valentini-Botinhao, Cassia and Wester, Mirjam and Yamagishi, Junichi and King, Simon},
  title = {Using neighbourhood density and selective {SNR} boosting to increase the intelligibility of synthetic speech in noise},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  address = {Barcelona, Spain},
  month = {August},
  pages = {133--138},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Cassia_SSW13.pdf},
  abstract = {Motivated by the fact that words are not equally confusable, we explore the idea of using word-level intelligibility predictions to selectively boost the harder-to-understand words in a sentence, aiming to improve overall intelligibility in the presence of noise. First, the intelligibility of a set of words from dense and sparse phonetic neighbourhoods was evaluated in isolation. The resulting intelligibility scores were used to inform two sentence-level experiments. In the first experiment the signal-to-noise ratio of one word was boosted to the detriment of another word. Sentence intelligibility did not generally improve. The intelligibility of words in isolation and in a sentence were found to be significantly different, both in clean and in noisy conditions. For the second experiment, one word was selectively boosted while slightly attenuating all other words in the sentence. This strategy was successful for words that were poorly recognised in that particular context. However, a reliable predictor of word-in-context intelligibility remains elusive, since this involves – as our results indicate – semantic, syntactic and acoustic information about the word and the sentence.}
}
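
The second experiment above, boosting one word while slightly attenuating the rest at fixed sentence energy, can be sketched as a simple energy reallocation over known word boundaries (for example from a forced alignment). This is not the authors' implementation; names and gains are illustrative.

import numpy as np

def boost_word(sentence, word_bounds, target_idx, boost_db=3.0):
    # sentence: 1-D float waveform; word_bounds: list of (start, end) sample
    # indices, one pair per word; target_idx: index of the word to boost.
    out = sentence.copy()
    start, end = word_bounds[target_idx]
    out[start:end] *= 10.0 ** (boost_db / 20.0)   # raise the target word
    # Rescale the whole sentence so its total energy matches the original,
    # which slightly attenuates all the other words.
    out *= np.sqrt(np.sum(sentence ** 2) / np.sum(out ** 2))
    return out
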
@inproceedings{Valentini_IS14,
  author = {Valentini-Botinhao, C. and Wester, M.},
  title = {Using linguistic predictability and the {Lombard} effect to increase the intelligibility of synthetic speech in noise},
  abstract = {In order to predict which words in a sentence are harder to understand in noise it is necessary to consider not only audibility but also semantic or linguistic information. This paper focuses on using linguistic predictability to inform an intelligibility enhancement method that uses Lombard-adapted synthetic speech to modify low-predictability words in Speech Perception in Noise (SPIN) test sentences. Word intelligibility in the presence of speech-shaped noise was measured using plain, Lombard and a combination of the two synthetic voices. The findings show that the Lombard voice increases intelligibility in noise but the intelligibility gap between words in high- and low-predictability contexts still remains. Using a Lombard voice when a word is unpredictable is a good strategy, but if a word is predictable from its context the Lombard benefit only occurs when other words in the sentence are also modified.},
  address = {Singapore},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Valentini_Wester_IS14.pdf},
  booktitle = {Proc. Interspeech},
  pages = {2063--2067},
  categories = {intelligibility enhancement, speech in noise, HMM-based speech synthesis, SPIN test}
}
@inproceedings{postfilter_IS14,
  author = {Chen, L.-H. and Raitio, T. and Valentini-Botinhao, C. and Yamagishi, J. and Ling, Z.-H.},
  title = {{DNN-Based Stochastic Postfilter for HMM-Based Speech Synthesis}},
  booktitle = {Proc. Interspeech},
  address = {Singapore},
  month = {September},
  pages = {1954--1958},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/postfilter_IS14.pdf},
  abstract = {In this paper we propose a deep neural network to model the conditional probability of the spectral differences between natural and synthetic speech. This allows us to reconstruct the spectral fine structures in speech generated by HMMs. We compared the new stochastic data-driven postfilter with global variance based parameter generation and modulation spectrum enhancement. Our results confirm that the proposed method significantly improves the segmental quality of synthetic speech compared to the conventional methods.}
}
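
The postfilter above learns a mapping from synthetic spectral features to the natural-minus-synthetic difference, which is added back at synthesis time. Below is a deterministic feed-forward sketch of that mapping; the paper's model is stochastic (it models a conditional probability), and the layer sizes and names here are illustrative assumptions.

import torch
import torch.nn as nn

class SpectralPostfilter(nn.Module):
    def __init__(self, dim=60, hidden=512):
        super().__init__()
        # Predicts the spectral residual (natural minus synthetic) per frame.
        self.net = nn.Sequential(
            nn.Linear(dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, dim),
        )

    def forward(self, synthetic_frames):  # (batch, dim) spectral features
        return synthetic_frames + self.net(synthetic_frames)
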
@inproceedings{salb_IS14,
  author = {Valentini-Botinhao, C. and Toman, M. and Pucher, M. and Schabus, D. and Yamagishi, J.},
  title = {{Intelligibility Analysis of Fast Synthesized Speech}},
  booktitle = {Proc. Interspeech},
  address = {Singapore},
  month = {September},
  pages = {2922--2926},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/salb_IS14.pdf},
  abstract = {In this paper we analyse the effect of speech corpus and compression method on the intelligibility of synthesized speech at fast rates. We recorded English and German language voice talents at a normal and a fast speaking rate and trained an HSMM-based synthesis system based on the normal and the fast data of each speaker. We compared three compression methods: scaling the variance of the state duration model, interpolating the duration models of the fast and the normal voices, and applying a linear compression method to generated speech. Word recognition results for the English voices show that generating speech at normal speaking rate and then applying linear compression resulted in the most intelligible speech at all tested rates. A similar result was found when evaluating the intelligibility of the natural speech corpus. For the German voices, interpolation was found to be better at moderate speaking rates but the linear method was again more successful at very high rates, for both blind and sighted participants. These results indicate that using fast speech data does not necessarily create more intelligible voices and that linear compression can more reliably provide higher intelligibility, particularly at higher rates.}
}
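
The "linear compression" condition above amounts to uniformly time-compressing already-generated speech to the target rate. A minimal sketch using a phase-vocoder time stretch as a stand-in (an assumption on our part; the paper does not tie the method to this particular tool):

import librosa
import soundfile as sf

def linear_compress(wav_path, speedup=2.0, out_path="fast.wav"):
    # Uniformly speed up the waveform by `speedup` without changing pitch.
    y, sr = librosa.load(wav_path, sr=None)
    y_fast = librosa.effects.time_stretch(y, rate=speedup)
    sf.write(out_path, y_fast, sr)
    return out_path
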