The Centre for Speech Technology Research, The University of Edinburgh

Publications by Rasmus Dall


@inproceedings{Dall_Veaux_Yamagishi_King_Interspeech2012,
  author = {Dall, Rasmus and Veaux, Christophe and Yamagishi, Junichi and King, Simon},
  title = {Analysis of speaker clustering techniques for {HMM}-based speech synthesis},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2012},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/Dall_Veaux_Yamagishi_King_Interspeech2012.pdf},
  abstract = {This paper describes a method for speaker clustering, with the application of building average voice models for speaker-adaptive HMM-based speech synthesis that are a good basis for adapting to specific target speakers. Our main hypothesis is that using perceptually similar speakers to build the average voice model will be better than using unselected speakers, even if the amount of data available from perceptually similar speakers is smaller. We measure the perceived similarities among a group of 30 female speakers in a listening test and then apply multiple linear regression to automatically predict these listener judgements of speaker similarity and thus to identify similar speakers automatically. We then compare a variety of average voice models trained on either speakers who were perceptually judged to be similar to the target speaker, or speakers selected by the multiple linear regression, or a large global set of unselected speakers. We find that the average voice model trained on perceptually similar speakers provides better performance than the global model, even though the latter is trained on more data, confirming our main hypothesis. However, the average voice model using speakers selected automatically by the multiple linear regression does not reach the same level of performance.},
  categories = {Statistical parametric speech synthesis, hidden Markov models, speaker adaptation}
}
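
A minimal sketch of the regression step the abstract describes: fit a multiple linear regression from acoustic distance features to listener similarity judgements, then rank candidate speakers for average-voice training. The features, scores and data below are simulated stand-ins, not the paper's actual predictors.

# Sketch: predict perceived speaker similarity from acoustic features,
# then pick the most similar speakers for average-voice training.
# All data here is synthetic/illustrative.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
n_pairs = 200
# Hypothetical per-pair predictors, e.g. distances in F0 mean, F0 range
# and spectral (MFCC) space between a candidate speaker and the target.
X = rng.random((n_pairs, 3))
# Listener similarity judgements on a 1-5 scale (simulated).
y = 5.0 - 3.0 * X[:, 0] - 1.0 * X[:, 2] + rng.normal(0, 0.3, n_pairs)

model = LinearRegression().fit(X, y)

# Score 30 candidate speakers against the target and keep the top 10
# as training data for the average voice model.
candidates = rng.random((30, 3))
predicted_similarity = model.predict(candidates)
top10 = np.argsort(predicted_similarity)[::-1][:10]
print("selected speakers:", top10)

Note the paper's own result: speakers selected this way underperformed speakers selected by actual listener judgements, so the predicted ranking is only an approximation of perceptual similarity.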
@inproceedings{Dall_Yamagishi_King_SpeechProsody2014,
  author = {Dall, Rasmus and Yamagishi, Junichi and King, Simon},
  title = {Rating Naturalness in Speech Synthesis: The Effect of Style and Expectation},
  booktitle = {Proc. Speech Prosody},
  month = {May},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Dall_Yamagishi_King_SpeechProsody2014.pdf},
  abstract = {In this paper we present evidence that speech produced spontaneously in a conversation is considered more natural than read prompts. We also explore the relationship between participants' expectations of the speech style under evaluation and their actual ratings. In successive listening tests, subjects rated the naturalness of either spontaneously produced, read aloud or written sentences, with instructions toward either conversational, reading or general naturalness. It was found that, when presented with spontaneous or read aloud speech, participants consistently rated spontaneous speech more natural - even when asked to rate naturalness in the reading case. Presented with only text, participants generally preferred transcriptions of spontaneous utterances, except when asked to evaluate naturalness in terms of reading aloud. This has implications for the application of MOS-scale naturalness ratings in Speech Synthesis, and potentially for the type of data suitable for use in general TTS and dialogue systems, and specifically in Conversational TTS, in which the goal is to reproduce speech as it is produced in a spontaneous conversational setting.},
  categories = {speech synthesis, evaluation, naturalness, MOS, spontaneous speech, read speech, TTS}
}
@inproceedings{dall_IS14,
  author = {Dall, Rasmus and Wester, Mirjam and Corley, Martin},
  title = {The Effect of Filled Pauses and Speaking Rate on Speech Comprehension in Natural, Vocoded and Synthetic Speech},
  abstract = {It has been shown that in natural speech filled pauses can be beneficial to a listener. In this paper, we attempt to discover whether listeners react in a similar way to filled pauses in synthetic and vocoded speech compared to natural speech. We present two experiments focusing on reaction time to a target word. In the first, we replicate earlier work in natural speech, namely that listeners respond faster to a target word following a filled pause than following a silent pause. This is replicated in vocoded but not in synthetic speech. Our second experiment investigates the effect of speaking rate on reaction times as this was potentially a confounding factor in the first experiment. Evidence suggests that slower speech rates lead to slower reaction times in synthetic \emph{and} in natural speech. Moreover, in synthetic speech the response to a target word after a filled pause is slower than after a silent pause. This finding, combined with an overall slower reaction time, demonstrates a shortfall in current synthesis techniques. Remedying this could help make synthesis less demanding and more pleasant for the listener, and reaction time experiments could thus provide a measure of improvement in synthesis techniques.},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Dall_Wester_Corley_IS14.pdf},
  booktitle = {Proc. Interspeech},
  categories = {HMM-synthesis, speech synthesis, reaction time, filled pause, disfluency, speaking rate, speech perception}
}
@inproceedings{Dall_Tomalin_IS14,
  author = {Dall, Rasmus and Tomalin, Marcus and Wester, Mirjam and Byrne, William and King, Simon},
  title = {Investigating Automatic \& Human Filled Pause Insertion for Speech Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Dall_Tomalin_Wester.pdf},
  abstract = {Filled pauses are pervasive in conversational speech and have been shown to serve several psychological and structural purposes. Despite this, they are seldom modelled overtly by state-of-the-art speech synthesis systems. This paper seeks to motivate the incorporation of filled pauses into speech synthesis systems by exploring their use in conversational speech, and by comparing the performance of several automatic systems inserting filled pauses into fluent text. Two initial experiments are described which seek to determine whether people's predicted insertion points are consistent with actual practice and/or with each other. The experiments also investigate whether there are `right' and `wrong' places to insert filled pauses. The results show good consistency between people's predictions of usage and their actual practice, as well as a perceptual preference for the `right' placement. The third experiment contrasts the performance of several automatic systems that insert filled pauses into fluent sentences. The best performance (determined by F-score) was achieved through the by-word interpolation of probabilities predicted by Recurrent Neural Network and 4-gram Language Models. The results offer insights into the use and perception of filled pauses by humans, and how automatic systems can be used to predict their locations.},
  categories = {filled pause, HMM TTS, SVM, RNN}
}
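
The best system above interpolates, word by word, the filled-pause probabilities of an RNN language model and a 4-gram language model. A toy sketch of that decision rule follows; the two scoring functions, the interpolation weight and the insertion threshold are stand-ins, not the paper's trained models.

# Sketch of by-word interpolation of two language models for filled
# pause insertion. The scoring functions below are placeholders for a
# Recurrent Neural Network LM and a 4-gram LM.
def p_fp_rnn(context):
    """Placeholder for P(filled pause | context) under an RNN LM."""
    return 0.08 if context and context[-1] == "and" else 0.01

def p_fp_ngram(context):
    """Placeholder for P(filled pause | context) under a 4-gram LM."""
    return 0.05 if context and context[-1] == "and" else 0.02

def insert_filled_pauses(words, weight=0.5, threshold=0.04):
    """Insert 'um' wherever the interpolated probability exceeds the threshold."""
    out = []
    for i, w in enumerate(words):
        context = words[max(0, i - 3):i]  # 3-word history, 4-gram style
        p = weight * p_fp_rnn(context) + (1 - weight) * p_fp_ngram(context)
        if p > threshold:
            out.append("um")
        out.append(w)
    return out

print(" ".join(insert_filled_pauses("i went there and then left".split())))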
@inproceedings{Aylett_Dall_Ghoshal_Henter_Merritt_Interspeech2014,
  author = {Aylett, Matthew and Dall, Rasmus and Ghoshal, Arnab and Henter, Gustav Eje and Merritt, Thomas},
  title = {A Flexible Front-End for {HTS}},
  booktitle = {Proc. Interspeech},
  month = {September},
  pages = {1283--1287},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Aylett_Dall_Ghoshal_Henter_Merritt_Interspeech2014.pdf},
  abstract = {Parametric speech synthesis techniques depend on full context acoustic models generated by language front-ends, which analyse linguistic and phonetic structure. HTS, the leading parametric synthesis system, can use a number of different front-ends to generate full context models for synthesis and training. In this paper we explore the use of a new text processing front-end that has been added to the speech recognition toolkit Kaldi as part of an ongoing project to produce a new parametric speech synthesis system, Idlak. The use of XML specification files, a modular design, and modern coding and testing approaches, make the Idlak front-end ideal for adding, altering and experimenting with the contexts used in full context acoustic models. The Idlak front-end was evaluated against the standard Festival front-end in the HTS system. Results from the Idlak front-end compare well with the more mature Festival front-end (Idlak - 2.83 MOS vs Festival - 2.85 MOS), although a slight reduction in naturalness perceived by non-native English speakers can be attributed to Festival’s insertion of non-punctuated pauses.},
  categories = {speech synthesis, text processing, parametric synthesis, Kaldi, Idlak}
}
@incollection{dalessandro_et_al_2014_reactive,
  editor = {Rybarczyk, Yves and Cardoso, Tiago and Rosas, João and Camarinha-Matos, Luis M.},
  author = {d’Alessandro, Nicolas and Tilmanne, Joëlle and Astrinaki, Maria and Hueber, Thomas and Dall, Rasmus and Ravet, Thierry and Moinet, Alexis and Cakmak, Huseyin and Babacan, Onur and Barbulescu, Adela and Parfait, Valentin and Huguenin, Victor and Kalaycı, Emine Sümeyye and Hu, Qiong},
  publisher = {Springer Berlin Heidelberg},
  title = {Reactive Statistical Mapping: Towards the Sketching of Performative Control with Data},
  series = {IFIP Advances in Information and Communication Technology},
  booktitle = {Innovative and Creative Developments in Multimodal Interaction Systems},
  abstract = {This paper presents the results of our participation in the ninth eNTERFACE workshop on multimodal user interfaces. Our target for this workshop was to bring some technologies currently used in speech recognition and synthesis to a new level, i.e. being the core of a new HMM-based mapping system. The idea of statistical mapping has been investigated, more precisely how to use Gaussian Mixture Models and Hidden Markov Models for realtime and reactive generation of new trajectories from inputted labels and for realtime regression in a continuous-to-continuous use case. As a result, we have developed several proofs of concept, including an incremental speech synthesiser, software for exploring stylistic spaces for gait and facial motion in real-time, reactive audiovisual laughter and a prototype demonstrating the realtime reconstruction of lower body gait motion strictly from upper body motion, with conservation of the stylistic properties. This project has been the opportunity to formalise HMM-based mapping, integrate several of these innovations into the Mage library and explore the development of a realtime gesture recognition tool.},
  volume = {425},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/dAllessandro_Tilmanne_Astrinaki_Hueber_Dall_Ravet_Moinet_Cakmak_Babacan_Barbulescu_Parfait_Huguenin_Kalayci_Hu_enterface2013.pdf},
  pages = {20--49},
  categories = {Statistical Modelling, Hidden Markov Models, Motion Capture, Speech, Singing, Laughter, Realtime Systems, Mapping}
}
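
A minimal sketch of the GMM-based continuous-to-continuous mapping the abstract mentions: fit a joint Gaussian Mixture Model over paired source/target features, then map a new source value to the conditional expectation of the target. The 1-D setting, data and component count are illustrative assumptions, not the workshop systems.

# Sketch of GMM regression: E[y | x] under a joint GMM is the
# responsibility-weighted sum of per-component linear regressions.
import numpy as np
from scipy.stats import norm
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, (500, 1))
y = np.sin(3 * x) + rng.normal(0, 0.05, x.shape)  # toy source/target pairs
gmm = GaussianMixture(n_components=8, covariance_type="full", random_state=0)
gmm.fit(np.hstack([x, y]))

def map_x_to_y(x_new):
    """Conditional mean of y given x under the joint GMM."""
    mu, cov, w = gmm.means_, gmm.covariances_, gmm.weights_
    # Component responsibilities from the marginal density over x.
    px = np.array([w[k] * norm.pdf(x_new, mu[k, 0], np.sqrt(cov[k, 0, 0]))
                   for k in range(len(w))])
    r = px / px.sum()
    # Per-component conditional mean: mu_y + cov_yx / cov_xx * (x - mu_x).
    cond = np.array([mu[k, 1] + cov[k, 1, 0] / cov[k, 0, 0] * (x_new - mu[k, 0])
                     for k in range(len(w))])
    return float(r @ cond)

print(map_x_to_y(0.5), float(np.sin(1.5)))  # prediction vs. true value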
@inproceedings{wester:artificial:IS2015,
  author = {Wester, Mirjam and Aylett, Matthew and Tomalin, Marcus and Dall, Rasmus},
  title = {Artificial Personality and Disfluency},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/wester:artificial:IS2015.pdf},
  abstract = {The focus of this paper is artificial voices with different personalities. Previous studies have shown links between an individual's use of disfluencies in their speech and their perceived personality. Here, filled pauses (uh and um) and discourse markers (like, you know, I mean) have been included in synthetic speech as a way of creating an artificial voice with different personalities. We discuss the automatic insertion of filled pauses and discourse markers (i.e., fillers) into otherwise fluent texts. The automatic system is compared to a ground truth of human "acted" filler insertion. Perceived personality (as defined by the big five personality dimensions) of the synthetic speech is assessed by means of a standardised questionnaire. Synthesis without fillers is compared to synthesis with either spontaneous or synthetic fillers. Our findings explore how the inclusion of disfluencies influences the way in which subjects rate the perceived personality of an artificial voice.},
  categories = {artificial personality, TTS, disfluency}
}
@inproceedings{tomalin:diss:2015,
  author = {Tomalin, Marcus and Wester, Mirjam and Dall, Rasmus and Byrne, Bill and King, Simon},
  title = {A Lattice-based Approach to Automatic Filled Pause Insertion},
  booktitle = {Proc. DiSS 2015},
  year = {2015},
  month = {August},
  address = {Edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/tomalin:diss:2015.pdf},
  abstract = {This paper describes a novel method for automatically inserting filled pauses (e.g., UM) into fluent texts. Although filled pauses are known to serve a wide range of psychological and structural functions in conversational speech, they have not traditionally been modelled overtly by state-of-the-art speech synthesis systems. However, several recent systems have started to model disfluencies specifically, and so there is an increasing need to create disfluent speech synthesis input by automatically inserting filled pauses into otherwise fluent text. The approach presented here interpolates Ngrams and Full-Output Recurrent Neural Network Language Models (f-RNNLMs) in a lattice-rescoring framework. It is shown that the interpolated system outperforms separate Ngram and f-RNNLM systems, where performance is analysed using the Precision, Recall, and F-score metrics.},
  categories = {Disfluency, Filled Pauses, f-RNNLMs, Ngrams, Lattices}
}
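
A toy sketch of the lattice-rescoring idea from this paper: each edge carries log-probabilities from two language models, and the best path is taken under their linear interpolation. The lattice contents, scores and interpolation weight below are made up for illustration.

# Toy lattice: node -> list of (next_node, word, logp_ngram, logp_rnnlm).
# Numeric node order is also a topological order in this example.
lattice = {
    0: [(1, "i", -0.1, -0.1)],
    1: [(3, "think", -0.5, -0.4), (2, "um", -1.2, -0.7)],
    2: [(3, "think", -0.6, -0.5)],
    3: [],
}

def best_path(lattice, weight=0.5):
    """Best word sequence under interpolated edge scores (Viterbi over a DAG)."""
    best = {0: (0.0, [])}
    for node in sorted(lattice):
        if node not in best:
            continue
        score, words = best[node]
        for nxt, word, lp_ng, lp_rnn in lattice[node]:
            s = score + weight * lp_ng + (1 - weight) * lp_rnn
            if nxt not in best or s > best[nxt][0]:
                best[nxt] = (s, words + [word])
    return best

score, words = best_path(lattice)[3]
print(score, " ".join(words))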
@inproceedings{Wester:diss:2015,
  author = {Wester, Mirjam and Corley, Martin and Dall, Rasmus},
  title = {The Temporal Delay Hypothesis: Natural, Vocoded and Synthetic Speech},
  booktitle = {Proc. DiSS 2015},
  year = {2015},
  month = {August},
  address = {Edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/wester:diss:2015.pdf},
  abstract = {Including disfluencies in synthetic speech is being explored as a way of making synthetic speech sound more natural and conversational. How to measure whether the resulting speech is actually more natural, however, is not straightforward. Conventional approaches to synthetic speech evaluation fall short: listeners are either primed to prefer stimuli with filled pauses or, when not primed, prefer more fluent speech. Psycholinguistic reaction time experiments may circumvent this issue. In this paper, we revisit one such reaction time experiment. For natural speech, delays in word onset were found to facilitate word recognition regardless of the type of delay; be they a filled pause (um), silence or a tone. We expand these experiments by examining the effect of using vocoded and synthetic speech. Our results partially replicate previous findings. For natural and vocoded speech, if the delay is a silent pause, significant increases in the speed of word recognition are found. If the delay comprises a filled pause there is a significant increase in reaction time for vocoded speech but not for natural speech. For synthetic speech, no clear effects of delay on word recognition are found. We hypothesise this is because it takes longer (requires more cognitive resources) to process synthetic speech than natural or vocoded speech.},
  categories = {delay hypothesis, disfluency}
}
@inproceedings{dall:diss2015,
  author = {Dall, Rasmus and Wester, Mirjam and Corley, Martin},
  title = {Disfluencies in change detection in natural, vocoded and synthetic speech},
  booktitle = {Proc. DiSS 2015},
  year = {2015},
  month = {August},
  address = {Edinburgh},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/dall:diss:2015.pdf},
  abstract = {In this paper, we investigate the effect of filled pauses, a discourse marker and silent pauses in a change detection experiment in natural, vocoded and synthetic speech. In natural speech, change detection has been found to increase in the presence of filled pauses; we extend this work by replicating earlier findings and exploring the effect of a discourse marker, like, and of silent pauses. Furthermore, we report how the use of "unnatural" speech, namely synthetic and vocoded, affects change detection rates. It was found that filled pauses, the discourse marker and silent pauses all increase change detection rates in natural speech; however, this effect appeared in neither synthetic nor vocoded speech. Rather, change detection rates decreased in both types of "unnatural" speech compared to natural speech. The natural results suggest that while each type of pause increases detection rates, the type of pause may have a further effect. The "unnatural" results suggest that it is not the full pipeline of synthetic speech that causes the degradation, but rather that something in the pre-processing, i.e. vocoding, of the speech database limits the resulting synthesis.},
  categories = {change detection, filled pauses, speech synthesis}
}
@inproceedings{dall2016testing,
  author = {Dall, Rasmus and Brognaux, Sandrine and Richmond, Korin and Valentini-Botinhao, Cassia and Henter, Gustav Eje and Hirschberg, Julia and Yamagishi, Junichi},
  title = {Testing the consistency assumption: pronunciation variant forced alignment in read and spontaneous speech synthesis},
  abstract = {Forced alignment for speech synthesis traditionally aligns a phoneme sequence predetermined by the front-end text processing system. This sequence is not altered during alignment, i.e., it is forced, despite possibly being faulty. The consistency assumption is the assumption that these mistakes do not degrade models, as long as the mistakes are consistent across training and synthesis. We present evidence that in the alignment of both standard read prompts and spontaneous speech this phoneme sequence is often wrong, and that this is likely to have a negative impact on acoustic models. A lattice-based forced alignment system allowing for pronunciation variation is implemented, resulting in improved phoneme identity accuracy for both types of speech. A perceptual evaluation of HMM-based voices showed that spontaneous models trained on this improved alignment also improved standard synthesis, despite breaking the consistency assumption.},
  month = {March},
  pages = {5155--5159},
  year = {2016},
  categories = {speech synthesis, TTS, forced alignment, HMM},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2016/dall2016testing.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}
}
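
A sketch of what "allowing for pronunciation variation" means in practice: rather than forcing the front-end's single phone string, the aligner is offered every variant the lexicon licenses and picks the sequence that best matches the audio. The two-entry lexicon below is illustrative, not the paper's.

# Sketch: enumerate the pronunciation variants an alignment lattice
# would contain for a word sequence.
from itertools import product

lexicon = {
    "the":    [["dh", "ax"], ["dh", "iy"]],
    "record": [["r", "eh", "k", "ax", "d"], ["r", "ih", "k", "ao", "d"]],
}

def pronunciation_variants(words):
    """Yield every phone sequence the alignment lattice would contain."""
    per_word = [lexicon.get(w, [[w]]) for w in words]
    for combo in product(*per_word):
        yield [phone for prons in combo for phone in prons]

for seq in pronunciation_variants(["the", "record"]):
    print(" ".join(seq))

# A real system scores the variants against the audio with its acoustic
# model and keeps the best path, rather than enumerating exhaustively.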
@inproceedings{Dall2016f,
  author = {Dall, Rasmus and Hashimoto, Kei and Oura, Keiichiro and Nankaku, Yoshihiko and Tokuda, Keiichi},
  title = {{Redefining the Linguistic Context Feature Set for HMM and DNN TTS Through Position and Parsing}},
  booktitle = {Proc. Interspeech},
  year = {2016},
  address = {San Francisco, CA, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Dall_Hashimoto_Oura_Nankaku_Tokuda_Interspeech2016.pdf},
  abstract = {In this paper we present an investigation of a number of alternative linguistic feature context sets for HMM and DNN text-to-speech synthesis. The representation of positional values is explored through two alternatives to the standard set of absolute values, namely relational and categorical values. In a preference test the categorical representation was found to be preferred for both HMM and DNN synthesis. Subsequently, features based on probabilistic context free grammar and dependency parsing are presented. These features represent the phrase level relations between words in the sentences, and in a preference evaluation it was found that these features all improved upon the base set, with a combination of both parsing methods best overall. As the features primarily affected the F0 prediction, this illustrates the potential of syntactic structure to improve prosody in TTS.},
  categories = {Speech Synthesis, TTS, PCFG, dependency parse, parsing, HMM, DNN, linguistic features}
}
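
A small sketch of the three positional encodings the abstract contrasts, using "position of the current unit within its parent" as the example. The category boundaries are an illustrative guess, not the paper's exact definition.

# Absolute, relational and categorical encodings of the same position.
def absolute_position(index, length):
    return index + 1                       # e.g. 3rd of 5; length unused here

def relational_position(index, length):
    return (index + 1) / length            # e.g. 0.6

def categorical_position(index, length):
    if length == 1:
        return "only"
    if index == 0:
        return "first"
    if index == length - 1:
        return "last"
    return "middle"

for i in range(5):
    print(absolute_position(i, 5), round(relational_position(i, 5), 2),
          categorical_position(i, 5))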
@inproceedings{Dall2016e,
  author = {Dall, Rasmus and Tomalin, Marcus and Wester, Mirjam},
  title = {{Synthesising Filled Pauses: Representation and Datamixing}},
  booktitle = {Proc. SSW9},
  year = {2016},
  address = {Cupertino, CA, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Dall_Tomalin_Wester_SSW2016.pdf},
  abstract = {Filled pauses occur frequently in spontaneous human speech, yet modern text-to-speech synthesis systems rarely model these disfluencies overtly, and consequently they do not output convincing synthetic filled pauses. This paper presents a text-to-speech system that is specifically designed to model these particular disfluencies more effectively. A preparatory investigation shows that a synthetic voice trained exclusively on spontaneous speech is perceived to be inferior in quality to a voice trained entirely on read speech, even though the latter does not handle filled pauses well. This motivates an investigation into the phonetic representation of filled pauses which shows that, in a preference test, the use of a distinct phone for filled pauses is preferred over the standard /V/ phone and the alternative /@/ phone. In addition, we present a variety of data-mixing techniques to combine the strengths of standard synthesis systems trained on read speech corpora with the supplementary advantages offered by systems trained on spontaneous speech. In a MUSHRA-style test, it is found that the best overall quality is obtained by combining the two types of corpora using a source marking technique. Specifically, general speech is synthesised with a standard mark, while filled pauses are synthesised with a spontaneous mark, which has the added benefit of also producing filled pauses that are comparatively well synthesised.},
  categories = {TTS, Filled Pauses, HMM, Phonetic Representation, Speech Synthesis}
}
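
A minimal sketch of the source-marking idea described above, assuming a two-way read/spontaneous mark attached to training labels and chosen per token at synthesis time. The field names and helper functions are hypothetical.

# Tag every training label with its corpus of origin, then request the
# "read" mark for ordinary words and the "spont" mark for filled pauses.
FILLED_PAUSES = {"um", "uh"}

def mark_training_label(label, corpus):
    """corpus is 'read' or 'spont'; the mark becomes a context feature."""
    return f"{label}|src={corpus}"

def marks_for_synthesis(words):
    """Choose which source mark to synthesise each word with."""
    return [(w, "spont" if w in FILLED_PAUSES else "read") for w in words]

print(mark_training_label("th-ax", "read"))
print(marks_for_synthesis("i um think so".split()))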
@inproceedings{Dall2016d,
  author = {Dall, Rasmus and Gonzalvo, Xavi},
  title = {{JNDSLAM: A SLAM extension for Speech Synthesis}},
  booktitle = {Proc. Speech Prosody},
  year = {2016},
  address = {Boston, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Dall_Gonzalvo_SpeechProsody2016.pdf},
  abstract = {Pitch movement is a large component of speech prosody, and despite being directly modelled in statistical parametric speech synthesis systems, very flat intonation contours are still produced. We present an open-source, fully data-driven approach to pitch contour stylisation suitable for speech synthesis, based on the SLAM approach. Modifications are proposed based on the Just Noticeable Difference in pitch and tailored to the needs of speech synthesis for describing the movement of the pitch. In an anchored Mean Opinion Score (MOS) test using oracle labels, the proposed method shows an improvement over standard synthesis. Long Short-Term Memory Neural Networks were then used to predict the contour labels, but initial experiments achieved low prediction rates. We conclude that using current linguistic features for pitch stylisation label mapping is not feasible unless additional features are added. Furthermore, an open-source implementation is released.},
  categories = {HMM, TTS, LSTM, prosody, pitch contour, speech synthesis}
}
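
A sketch of the JND-thresholded stylisation idea: express F0 in semitones relative to a reference and label a unit's pitch movement as flat unless it exceeds a just-noticeable difference. The 1.5-semitone JND, the fixed reference and the three-way label set are assumptions for illustration, not the released implementation.

# JND-gated pitch movement labelling in the spirit of SLAM.
import numpy as np

JND_SEMITONES = 1.5  # assumed threshold

def to_semitones(f0_hz, reference_hz):
    """Convert F0 values in Hz to semitones relative to a reference."""
    return 12.0 * np.log2(np.asarray(f0_hz, dtype=float) / reference_hz)

def stylise(f0_hz, reference_hz=200.0):
    """Label the overall movement of one unit's F0 contour."""
    st = to_semitones(f0_hz, reference_hz)
    movement = st[-1] - st[0]
    if abs(movement) < JND_SEMITONES:
        return "flat"
    return "rise" if movement > 0 else "fall"

print(stylise([190, 200, 230]))   # rise: movement exceeds the JND
print(stylise([200, 202, 201]))   # flat: movement below the JND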