The Centre for Speech Technology Research, The University of Edinburgh

Publications by Cassie Mayo

catherin.bib

@inproceedings{Cooke_IS13,
  author = {Cooke, M. and Mayo, C. and Valentini-Botinhao, C.},
  title = {Intelligibility-enhancing speech modifications: the {Hurricane Challenge}},
  booktitle = {Proc. Interspeech},
  year = {2013},
  month = aug,
  address = {Lyon, France},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2013/Cooke_IS13.pdf},
  abstract = {Speech output is used extensively, including in situations where correct message reception is threatened by adverse listening conditions. Recently, there has been a growing interest in algorithmic modifications that aim to increase the intelligibility of both natural and synthetic speech when presented in noise. The Hurricane Challenge is the first large-scale open evaluation of algorithms designed to enhance speech intelligibility. Eighteen systems operating on a common data set were subjected to extensive listening tests and compared to unmodified natural and text-to-speech (TTS) baselines. The best-performing systems achieved gains over unmodified natural speech of 4.4 and 5.1 dB in competing speaker and stationary noise respectively, while TTS systems made gains of 5.6 and 5.1 dB over their baseline. Surprisingly, for most conditions the largest gains were observed for noise-independent algorithms, suggesting that performance in this task can be further improved by exploiting information in the masking signal.}
}
@article{Cooke_SPCOM13,
  author = {Cooke, M. and Mayo, C. and Valentini-Botinhao, C. and Stylianou, Y. and Sauert, B. and Tang, Y.},
  title = {Evaluating the intelligibility benefit of speech modifications in known noise conditions},
  journal = {Speech Communication},
  volume = {55},
  number = {4},
  pages = {572--585},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Cooke_SPCOM13.pdf},
  abstract = {The use of live and recorded speech is widespread in applications where correct message reception is important. Furthermore, the deployment of synthetic speech in such applications is growing. Modifications to natural and synthetic speech have therefore been proposed which aim at improving intelligibility in noise. The current study compares the benefits of speech modification algorithms in a large-scale speech intelligibility evaluation and quantifies the equivalent intensity change, defined as the amount in decibels that unmodified speech would need to be adjusted by in order to achieve the same intelligibility as modified speech. Listeners identified keywords in phonetically-balanced sentences representing ten different types of speech: plain and Lombard speech, five types of modified speech, and three forms of synthetic speech. Sentences were masked by either a stationary or a competing speech masker. Modification methods varied in the manner and degree to which they exploited estimates of the masking noise. The best-performing modifications led to equivalent intensity changes of around 5 dB in moderate and high noise levels for the stationary masker, and 3--4 dB in the presence of competing speech. These gains exceed those produced by Lombard speech. Synthetic speech in noise was always less intelligible than plain natural speech, but modified synthetic speech reduced this deficit by a significant amount.}
}
@inproceedings{mayo:99,
  author = {Mayo, C.},
  title = {Perceptual weighting and phonemic awareness in pre-reading and early-reading children},
  booktitle = {XIVth International Congress of Phonetic Sciences, San Francisco},
  year = {1999},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1999/0479.pdf},
  categories = {speech perception, development, cue weighting, phonemic awareness, literacy}
}
@inproceedings{mayo:12,
  author = {Mayo, C. and Aubanel, V. and Cooke, M.},
  title = {Effect of prosodic changes on speech intelligibility},
  booktitle = {Proc. Interspeech},
  address = {Portland, OR, USA},
  year = {2012}
}
@inproceedings{mayoturk:98,
  author = {Mayo, C.},
  title = {The developmental relationship between perceptual weighting and phonemic awareness},
  booktitle = {LabPhon 6, University of York, UK},
  year = {1998},
  categories = {speech perception, development, cue weighting, phonemic awareness, literacy}
}
@inproceedings{mayoturk:03,
  author = {Mayo, C. and Turk, A.},
  title = {Is the development of cue weighting strategies in children's speech perception context-dependent?},
  booktitle = {XVth International Congress of Phonetic Sciences, Barcelona},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/icphs-0677.pdf},
  categories = {speech perception, development, cue weighting}
}
@inproceedings{clark:podsiadlo:mayo:king:blizzard2007,
  author = {Clark, Robert A. J. and Podsiadlo, Monika and Fraser, Mark and Mayo, Catherine and King, Simon},
  title = {Statistical Analysis of the {B}lizzard {C}hallenge 2007 Listening Test Results},
  booktitle = {Proc. Blizzard 2007 (in Proc. Sixth {ISCA} Workshop on Speech Synthesis)},
  year = {2007},
  month = aug,
  address = {Bonn, Germany},
  keywords = {Blizzard},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2007/blz3_003.pdf},
  abstract = {Blizzard 2007 is the third Blizzard Challenge, in which participants build voices from a common dataset. A large listening test is conducted which allows comparison of systems in terms of naturalness and intelligibility. New sections were added to the listening test for 2007 to test the perceived similarity of the speaker's identity between natural and synthetic speech. In this paper, we present the results of the listening test and the subsequent statistical analysis.},
  categories = {blizzard,listening test}
}
@inproceedings{gibbonmayo:08,
  author = {Gibbon, F. and Mayo, C.},
  title = {Adults' perception of conflicting acoustic cues associated with {EPG}-defined undifferentiated gestures},
  booktitle = {4th International {EPG} Symposium, Edinburgh, UK},
  year = {2008},
  categories = {speech perception, cue weighting, undifferentiated gestures, electropalatography}
}
@article{mayosturkwatson:01,
  author = {Mayo, C. and Turk, A. and Watson, J.},
  title = {Flexibility of acoustic cue weighting in children's speech perception},
  journal = {Journal of the Acoustical Society of America},
  volume = {109},
  pages = {2313},
  year = {2001},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2001/JASA-Mayo-Turk-Watson.pdf},
  categories = {speech perception, development, cue weighting}
}
@inproceedings{mayoturk:04b,
  author = {Mayo, C. and Turk, A.},
  title = {The Development of Perceptual Cue Weighting Within and Across Monosyllabic Words},
  booktitle = {LabPhon 9, University of Illinois at Urbana-Champaign},
  year = {2004},
  categories = {speech perception, development, cue weighting}
}
@article{mayoturk-jasa05,
  author = {Mayo, C. and Turk, A.},
  title = {The influence of spectral distinctiveness on acoustic cue weighting in children's and adults' speech perception},
  journal = {Journal of the Acoustical Society of America},
  volume = {118},
  pages = {1730--1741},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mayo-turk-2005-7.pdf}
}
@inproceedings{mayoaylettladd:97,
  author = {Mayo, C. and Aylett, M. and Ladd, D. R.},
  title = {Prosodic transcription of Glasgow English: an evaluation study of {GlaToBI}},
  booktitle = {Intonation: Theory, Models and Applications},
  year = {1997},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/1997/esca2.pdf},
  categories = {intonation, perceptual evaluation, Glasgow English, transcription, ToBI}
}
@article{mayoscobbiehewlettwaters:03,
  author = {Mayo, C. and Scobbie, J. and Hewlett, N. and Waters, D.},
  title = {The influence of phonemic awareness development on acoustic cue weighting in children's speech perception},
  journal = {Journal of Speech, Language and Hearing Research},
  volume = {46},
  pages = {1184--1196},
  year = {2003},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2003/JSLHR1184-Mayo.pdf},
  categories = {speech perception, development, cue weighting, phonemic awareness, literacy}
}
@inproceedings{mayoturk-psp05,
  author = {Mayo, C. and Turk, A.},
  title = {No Available Theories Currently Explain All Adult-Child Cue Weighting Differences},
  booktitle = {Proc. ISCA Workshop on Plasticity in Speech Perception},
  address = {London, UK},
  year = {2005},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/mayoday2.pdf}
}
@inproceedings{koutsogiannaki:12,
  author = {Koutsogiannaki, M. and Pettinato, M. and Mayo, C. and Kandia, V. and Stylianou, Y.},
  title = {Can modified casual speech reach the intelligibility of clear speech?},
  booktitle = {Proc. Interspeech},
  address = {Portland, OR, USA},
  year = {2012}
}
@inproceedings{mayoturk:99,
  author = {Mayo, C.},
  title = {The development of phonemic awareness and perceptual weighting in relation to early and later literacy acquisition},
  booktitle = {20th Annual Child Phonology Conference, Bangor, Wales},
  year = {1999},
  categories = {speech perception, development, cue weighting, phonemic awareness, literacy}
}
@inproceedings{aubanel:12,
  author = {Aubanel, V. and Cooke, M. and Foster, E. and Garcia-Lecumberri, M. L. and Mayo, C.},
  title = {Effects of the availability of visual information and presence of competing conversations on speech production},
  booktitle = {Proc. Interspeech},
  address = {Portland, OR, USA},
  year = {2012}
}
@phdthesis{mayo:00,
  author = {Mayo, C.},
  title = {The relationship between phonemic awareness and cue weighting in speech perception: longitudinal and cross-sectional child studies},
  school = {Queen Margaret University College},
  year = {2000},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2000/thesis.pdf},
  categories = {speech perception, development, cue weighting, phonemic awareness, literacy}
}
@inproceedings{mayoturk:98b,
  author = {Mayo, C.},
  title = {A longitudinal study of perceptual weighting and phonemic awareness},
  booktitle = {Chicago Linguistics Society 34},
  year = {1998},
  categories = {speech perception, development, cue weighting, phonemic awareness, literacy}
}
@inproceedings{mayoclarkking-isp05,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Multidimensional Scaling of Listener Responses to Synthetic Speech},
  booktitle = {Proc. Interspeech 2005},
  year = {2005},
  month = sep,
  address = {Lisbon, Portugal},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2005/ie-speech-2005.pdf}
}
@inproceedings{mayoturkwatson:02,
  author = {Mayo, C. and Turk, A. and Watson, J.},
  title = {Development of cue weighting strategies in children's speech perception},
  booktitle = {Proceedings of TIPS: Temporal Integration in the Perception of Speech, Aix-en-Provence},
  year = {2002},
  categories = {speech perception, development, cue weighting}
}
@inproceedings{karaiskos:king:clark:mayo:blizzard2008,
  author = {Karaiskos, Vasilis and King, Simon and Clark, Robert A. J. and Mayo, Catherine},
  title = {The {Blizzard Challenge} 2008},
  booktitle = {Proc. Blizzard Challenge Workshop},
  year = {2008},
  month = sep,
  address = {Brisbane, Australia},
  keywords = {Blizzard},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2008/summary_Blizzard2008.pdf},
  abstract = {The Blizzard Challenge 2008 was the fourth annual Blizzard Challenge. This year, participants were asked to build two voices from a UK English corpus and one voice from a Mandarin Chinese corpus. This is the first time that a language other than English has been included and also the first time that a large UK English corpus has been available. In addition, the English corpus contained somewhat more expressive speech than that found in corpora used in previous Blizzard Challenges. To assist participants with limited resources or limited experience in UK-accented English or Mandarin, unaligned labels were provided for both corpora and for the test sentences. Participants could use the provided labels or create their own. An accent-specific pronunciation dictionary was also available for the English speaker. A set of test sentences was released to participants, who were given a limited time in which to synthesise them and submit the synthetic speech. An online listening test was conducted, to evaluate naturalness, intelligibility and degree of similarity to the original speaker.}
}
@article{mayo:clark:king:10,
  author = {Mayo, C. and Clark, R. A. J. and King, S.},
  title = {Listeners' Weighting of Acoustic Cues to Synthetic Speech Naturalness: A Multidimensional Scaling Analysis},
  journal = {Speech Communication},
  volume = {53},
  number = {3},
  pages = {311--326},
  year = {2011},
  doi = {10.1016/j.specom.2010.10.003},
  keywords = {Speech synthesis; Evaluation; Speech perception; Acoustic cue weighting; Multidimensional scaling},
  abstract = {The quality of current commercial speech synthesis systems is now so high that system improvements are being made at subtle sub- and supra-segmental levels. Human perceptual evaluation of such subtle improvements requires a highly sophisticated level of perceptual attention to specific acoustic characteristics or cues. However, it is not well understood what acoustic cues listeners attend to by default when asked to evaluate synthetic speech. It may, therefore, be potentially quite difficult to design an evaluation method that allows listeners to concentrate on only one dimension of the signal, while ignoring others that are perceptually more important to them. The aim of the current study was to determine which acoustic characteristics of unit-selection synthetic speech are most salient to listeners when evaluating the naturalness of such speech. This study made use of multidimensional scaling techniques to analyse listeners' pairwise comparisons of synthetic speech sentences. Results indicate that listeners place a great deal of perceptual importance on the presence of artifacts and discontinuities in the speech, somewhat less importance on aspects of segmental quality, and very little importance on stress/intonation appropriateness. These relative differences in importance will impact on listeners' ability to attend to these different acoustic characteristics of synthetic speech, and should therefore be taken into account when designing appropriate methods of synthetic speech evaluation.}
}
@article{mayoturk:04,
  author = {Mayo, C. and Turk, A.},
  title = {Adult-child differences in acoustic cue weighting are influenced by segmental context: Children are not always perceptually biased towards transitions},
  journal = {Journal of the Acoustical Society of America},
  volume = {115},
  pages = {3184--3194},
  year = {2004},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2004/mayo-turk-2004a.pdf},
  categories = {speech perception, development, cue weighting}
}
@inproceedings{godoy_mayo_stylianou_interspeech13,
  author = {Godoy, Elizabeth and Mayo, Catherine and Stylianou, Yannis},
  title = {Linking Loudness Increases in Normal and {Lombard} Speech to Decreasing Vowel Formant Separation},
  booktitle = {Proc. Interspeech},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/IS130186.PDF},
  categories = {Lombard effect, loudness, vowel formant separation},
  abstract = {The increased vocal effort associated with the Lombard reflex produces speech that is perceived as louder and judged to be more intelligible in noise than normal speech. Previous work illustrates that, on average, Lombard increases in loudness result from boosting spectral energy in a frequency band spanning the range of formants F1-F3, particularly for voiced speech. Observing additionally that increases in loudness across spoken sentences are spectro-temporally localized, the goal of this work is to further isolate these regions of maximal loudness by linking them to specific formant trends, explicitly considering here the vowel formant separation. For both normal and Lombard speech, this work illustrates that, as loudness increases in frequency bands containing formants (e.g. F1-F2 or F2-F3), the observed separation between formant frequencies decreases. From a production standpoint, these results seem to highlight a physiological trait associated with how humans increase the loudness of their speech, namely moving vocal tract resonances closer together. Particularly, for Lombard speech, this phenomena is exaggerated: that is, the Lombard speech is louder and formants in corresponding spectro-temporal regions are even closer together}
}
@article{mayo_gibbon_clark_jslhr13,
  author = {Mayo, Catherine and Gibbon, Fiona and Clark, Robert A. J.},
  title = {Phonetically Trained and Untrained Adults' Transcription of Place of Articulation for Intervocalic Lingual Stops With Intermediate Acoustic Cues},
  journal = {Journal of Speech, Language and Hearing Research},
  volume = {56},
  pages = {779--791},
  year = {2013},
  doi = {10.1044/1092-4388(2012/10-0280)},
  keywords = {speech perception, intermediate acoustic cues, phonetic transcription, multilevel logistic regression},
  abstract = {Purpose: In this study, the authors aimed to investigate how listener training and the presence of intermediate acoustic cues influence transcription variability for conflicting cue speech stimuli. Method: Twenty listeners with training in transcribing disordered speech, and 26 untrained listeners, were asked to make forced-choice labeling decisions for synthetic vowel–consonant–vowel (VCV) sequences "a doe" and "a go". Both the VC and CV transitions in these stimuli ranged through intermediate positions, from appropriate for /d/ to appropriate for /g/. Results: Both trained and untrained listeners gave more weight to the CV transitions than to the VC transitions. However, listener behavior was not uniform: The results showed a high level of inter- and intratranscriber inconsistency, with untrained listeners showing a nonsignificant tendency to be more influenced than trained listeners by CV transitions. Conclusions: Listeners do not assign consistent categorical labels to the type of intermediate, conflicting transitional cues that were present in the stimuli used in the current study and that are also present in disordered articulations. Although listener inconsistency in assigning labels to intermediate productions is not increased as a result of phonetic training, neither is it reduced by such training.}
}
@inproceedings{wester:icassp:14,
  author = {Wester, Mirjam and Mayo, Cassie},
  title = {Accent rating by native and non-native listeners},
  booktitle = {Proceedings of ICASSP},
  year = {2014},
  month = may,
  pages = {7749--7753},
  address = {Florence, Italy},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/wester:icassp:14.pdf},
  abstract = {This study investigates the influence of listener native language with respect to talker native language on perception of degree of foreign accent in English. Listeners from native English, Finnish, German and Mandarin backgrounds rated the accentedness of native English, Finnish, German and Mandarin talkers producing a controlled set of English sentences. Results indicate that non-native listeners, like native listeners, are able to classify non-native talkers as foreign-accented, and native talkers as unaccented. However, while non-native talkers received higher accentedness ratings than native talkers from all listener groups, non-native listeners judged talkers with non-native accents less harshly than did native English listeners. Similarly, non-native listeners assigned higher degrees of foreign accent to native English talkers than did native English listeners. It seems that non-native listeners give accentedness ratings that are less extreme, or closer to the centre of the rating scale in both directions, than those used by native listeners.},
  categories = {accent rating}
}
@inproceedings{henter2014measuring,
  author = {Henter, Gustav Eje and Merritt, Thomas and Shannon, Matt and Mayo, Catherine and King, Simon},
  title = {Measuring the perceptual effects of modelling assumptions in speech synthesis using stimuli constructed from repeated natural speech},
  booktitle = {Proc. Interspeech},
  year = {2014},
  month = sep,
  volume = {15},
  pages = {1504--1508},
  keywords = {speech synthesis, acoustic modelling, stream independence, diagonal covariance matrices, repeated speech},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/henter2014measuring.pdf},
  abstract = {Acoustic models used for statistical parametric speech synthesis typically incorporate many modelling assumptions. It is an open question to what extent these assumptions limit the naturalness of synthesised speech. To investigate this question, we recorded a speech corpus where each prompt was read aloud multiple times. By combining speech parameter trajectories extracted from different repetitions, we were able to quantify the perceptual effects of certain commonly used modelling assumptions. Subjective listening tests show that taking the source and filter parameters to be conditionally independent, or using diagonal covariance matrices, significantly limits the naturalness that can be achieved. Our experimental results also demonstrate the shortcomings of mean-based parameter generation.}
}
@inproceedings{kay2015knowledge,
  author = {Kay, Rosie and Watts, Oliver and Barra-Chicote, Roberto and Mayo, Cassie},
  title = {Knowledge versus data in {TTS}: evaluation of a continuum of synthesis systems},
  booktitle = {INTERSPEECH 2015, 16th Annual Conference of the International Speech Communication Association, Dresden, Germany, September 6-10, 2015},
  year = {2015},
  pages = {3335--3339},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/kay2015knowledge.pdf},
  date-modified = {2018-01-19 16:44:10 +0000},
  abstract = {Grapheme-based models have been proposed for both ASR and TTS as a way of circumventing the lack of expert-compiled pronunciation lexicons in under-resourced languages. It is a common observation that this should work well in languages employing orthographies with a transparent letter-to-phoneme relationship, such as Spanish. Our experience has shown, however, that there is still a significant difference in intelligibility between grapheme-based systems and conventional ones for this language. This paper explores the contribution of different levels of linguistic annotation to system intelligibility, and the trade-off between those levels and the quantity of data used for training. Ten systems spaced across these two continua of knowledge and data were subjectively evaluated for intelligibility.}
}