The Centre for Speech Technology Research, The University of Edinburgh

Publications by M. Sam Ribeiro

mribeir2.bib

@inproceedings{garner2014translation,
  author = {Garner, Philip N and Clark, Rob and Goldman, Jean-Philippe and Honnet, Pierre-Edouard and Ivanova, Maria and Lazaridis, Alexandros and Liang, Hui and Pfister, Beat and Ribeiro, Manuel Sam and Wehrli, Eric and others},
  title = {Translation and Prosody in Swiss Languages},
  booktitle = {Nouveaux cahiers de linguistique française, 31. 3rd Swiss Workshop on Prosody},
  year = {2014},
  month = {September},
  address = {Geneva, Switzerland},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Garner:14.pdf},
  abstract = {The SIWIS project aims to investigate spoken language translation, where both the speaker characteristics and prosody are translated. This means the translation carries not only spoken content, but also speaker identification, emotion and intent. We describe the background of the project, and present some initial approaches and results. These include the design and collection of a Swiss bilingual database that both enables research in Swiss accented speech processing, and facilitates reliable evaluation.},
  categories = {automatic speech recognition, text-to-speech synthesis, speech-to-speech translation, prosody}
}
@inproceedings{ribeiro2015multilevel,
  author = {Ribeiro, Manuel Sam and Clark, Robert A. J.},
  title = {A Multi-Level Representation of f0 using the Continuous Wavelet Transform and the Discrete Cosine Transform},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year = {2015},
  month = {April},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/ribeiro_and_clark_icassp15.pdf},
  abstract = {We propose a representation of f0 using the Continuous Wavelet Transform (CWT) and the Discrete Cosine Transform (DCT). The CWT decomposes the signal into various scales of selected frequencies, while the DCT compactly represents complex contours as a weighted sum of cosine functions. The proposed approach has the advantage of combining signal decomposition and higher-level representations, thus modeling low frequencies at higher levels and high frequencies at lower levels. Objective results indicate that this representation improves f0 prediction over traditional short-term approaches. Subjective results show that improvements are seen over the typical MSD-HMM and are comparable to the recently proposed CWT-HMM, while using fewer parameters. These results are discussed and future lines of research are proposed.},
  categories = {prosody, HMM-based synthesis, f0 modeling, continuous wavelet transform, discrete cosine transform}
}
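
A minimal, illustrative sketch of the representation described in the abstract above: an interpolated log-f0 contour is decomposed into a few wavelet scales (a Mexican-hat/Ricker wavelet is used here for illustration), and each scale's contour over a linguistic unit is then compressed into a small number of DCT coefficients. The scales, unit boundaries, and toy contour below are assumptions for the example, not the paper's configuration.

import numpy as np
from scipy.fft import dct, idct

def ricker(points, a):
    """Mexican-hat (Ricker) mother wavelet sampled at `points` positions with width `a`."""
    t = np.arange(points) - (points - 1) / 2.0
    norm = 2.0 / (np.sqrt(3.0 * a) * np.pi ** 0.25)
    return norm * (1.0 - (t / a) ** 2) * np.exp(-0.5 * (t / a) ** 2)

def cwt_scales(signal, widths):
    """Continuous wavelet transform approximated by convolving with Ricker wavelets of several widths."""
    out = np.empty((len(widths), len(signal)))
    for i, w in enumerate(widths):
        wavelet = ricker(min(10 * int(w), len(signal)), w)
        out[i] = np.convolve(signal, wavelet, mode="same")
    return out

# Toy log-f0 contour at 5 ms frames (in practice: interpolated through unvoiced regions).
frames = 400
t = np.linspace(0, 2.0, frames)
log_f0 = np.log(120 + 20 * np.sin(2 * np.pi * 1.5 * t) + 2 * np.random.randn(frames))

# Dyadic widths roughly corresponding to phone/syllable/word/phrase-sized variation
# (illustrative placeholders, not the scales used in the paper).
widths = [4, 8, 16, 32, 64]
scales = cwt_scales(log_f0 - log_f0.mean(), widths)

# Represent each scale within a hypothetical syllable spanning frames 100..160
# by its first few DCT coefficients: a compact, higher-level description of the contour.
syl = slice(100, 160)
n_coef = 5
dct_per_scale = np.array([dct(scales[i, syl], norm="ortho")[:n_coef] for i in range(len(widths))])
print("DCT coefficients per scale:", dct_per_scale.shape)   # (5 scales, 5 coefficients)

# The contour over the unit can be approximated back from those coefficients.
approx = idct(np.pad(dct_per_scale[2], (0, (syl.stop - syl.start) - n_coef)), norm="ortho")
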
@inproceedings{ribeiro2015perceptual,
  author = {Ribeiro, Manuel Sam and Yamagishi, Junichi and Clark, Robert A. J.},
  title = {A Perceptual Investigation of Wavelet-based Decomposition of f0 for Text-to-Speech Synthesis},
  booktitle = {Proceedings of Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/ribeiro_et_al_IS15.pdf},
  abstract = {The Continuous Wavelet Transform (CWT) has been recently proposed to model f0 in the context of speech synthesis. It was shown that systems using signal decomposition with the CWT tend to outperform systems that model the signal directly. The f0 signal is typically decomposed into various scales of differing frequency. In these experiments, we reconstruct f0 with selected frequencies and ask native listeners to judge the naturalness of synthesized utterances with respect to natural speech. Results indicate that HMM-generated f0 is comparable to the CWT low frequencies, suggesting it mostly generates utterances with neutral intonation. Middle frequencies achieve very high levels of naturalness, while very high frequencies are mostly noise.},
  categories = {speech synthesis, prosody, f0 modeling, continuous wavelet transform, perceptual experiments}
}
@inproceedings{ribeiro2016wavelet,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi and Clark, Robert A. J.},
  title = {Wavelet-based decomposition of f0 as a secondary task for {DNN-based} speech synthesis with multi-task learning},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year = {2016},
  month = {March},
  address = {Shanghai, China},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ribeiro-et-al-icassp16.pdf},
  abstract = {We investigate two wavelet-based decomposition strategies of the f0 signal and their usefulness as a secondary task for speech synthesis using multi-task deep neural networks (MTL-DNN). The first decomposition strategy uses a static set of scales for all utterances in the training data. We propose a second strategy, where the scale of the mother wavelet is dynamically adjusted to the rate of each utterance. This approach is able to capture f0 variations related to the syllable, word, clitic-group, and phrase units. This method also constrains the wavelet components to be within the frequency range that previous experiments have shown to be more natural. These two strategies are evaluated as a secondary task in multi-task deep neural networks (MTL-DNNs). Results indicate that on an expressive dataset there is a strong preference for the systems using multi-task learning when compared to the baseline system.},
  categories = {speech synthesis, f0 modelling, deep neural network, multi-task learning, continuous wavelet transform}
}
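
As a hedged illustration of the multi-task set-up described above (not the authors' actual configuration), the sketch below builds a shared feed-forward trunk with a primary head predicting frame-level acoustic features and a secondary head predicting wavelet-decomposed f0. The layer sizes, feature dimensions, and the 0.5 secondary-task weight are invented for the example; PyTorch is assumed.

import torch
import torch.nn as nn

class MTLAcousticModel(nn.Module):
    def __init__(self, n_ling=600, n_hidden=1024, n_acoustic=187, n_cwt_f0=5):
        super().__init__()
        # Shared trunk over frame-level linguistic input features.
        self.trunk = nn.Sequential(
            nn.Linear(n_ling, n_hidden), nn.Tanh(),
            nn.Linear(n_hidden, n_hidden), nn.Tanh(),
        )
        self.primary = nn.Linear(n_hidden, n_acoustic)   # primary task: acoustic features
        self.secondary = nn.Linear(n_hidden, n_cwt_f0)   # secondary task: wavelet scales of f0

    def forward(self, x):
        h = self.trunk(x)
        return self.primary(h), self.secondary(h)

model = MTLAcousticModel()
opt = torch.optim.Adam(model.parameters(), lr=1e-4)
mse = nn.MSELoss()

# One toy training step on random tensors standing in for a mini-batch of frames.
x = torch.randn(256, 600)
y_acoustic, y_cwt = torch.randn(256, 187), torch.randn(256, 5)
pred_a, pred_c = model(x)
loss = mse(pred_a, y_acoustic) + 0.5 * mse(pred_c, y_cwt)   # secondary task down-weighted
opt.zero_grad(); loss.backward(); opt.step()
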
@inproceedings{goldman2016siwis,
  author = {Goldman, Jean-Philippe and Honnet, Pierre-Edouard and Clark, Rob and Garner, Philip N and Ivanova, Maria and Lazaridis, Alexandros and Liang, Hui and Macedo, Tiago and Pfister, Beat and Ribeiro, Manuel Sam and others},
  title = {{The SIWIS database: a multilingual speech database with acted emphasis}},
  booktitle = {Proceedings of Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, United States},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/1003.PDF},
  abstract = {We describe here a collection of speech data of bilingual and trilingual speakers of English, French, German and Italian. In the context of speech-to-speech translation (S2ST), this database is designed for several purposes and studies: training CLSA systems (cross-language speaker adaptation), conveying emphasis through S2ST systems, and evaluating TTS systems. More precisely, 36 speakers judged as accentless (22 bilingual and 14 trilingual speakers) were recorded for a set of 171 prompts in two or three languages, amounting to a total of 24 hours of speech. These sets of prompts include 100 sentences from news, 25 sentences from Europarl, the same 25 sentences with one acted emphasised word, 20 semantically unpredictable sentences, and finally a 240-word long text. All in all, this yielded 64 bilingual session pairs of the six possible combinations of the four languages. The database is freely available for non-commercial use and scientific research purposes.},
  categories = {speech-to-speech translation, speech corpus, bilingual speakers, emphasis}
}
@inproceedings{ribeiro2016syllable,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Syllable-level representations of suprasegmental features for {DNN-based} text-to-speech synthesis},
  booktitle = {Proceedings of Interspeech},
  year = {2016},
  month = {September},
  address = {San Francisco, United States},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/1034.PDF},
  abstract = {A top-down hierarchical system based on deep neural networks is investigated for the modeling of prosody in speech synthesis. Suprasegmental features are processed separately from segmental features and a compact distributed representation of high-level units is learned at the syllable level. The suprasegmental representation is then integrated into a frame-level network. Objective measures show that balancing segmental and suprasegmental features can be useful for the frame-level network. Additional features incorporated into the hierarchical system are then tested. At the syllable level, a bag-of-phones representation is proposed and, at the word level, embeddings learned from text sources are used. It is shown that the hierarchical system is able to leverage new features at higher levels more efficiently than a system which exploits them directly at the frame level. A perceptual evaluation of the proposed systems is conducted and followed by a discussion of the results.},
  categories = {speech synthesis, prosody, deep neural networks, suprasegmental representations}
}
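
One of the syllable-level features mentioned in the abstract above, the bag-of-phones representation, can be pictured with a small sketch: each syllable is mapped to a fixed-size vector of counts over the phone inventory. The inventory and example syllables below are placeholders, not the feature set used in the paper.

import numpy as np

# Hypothetical phone inventory; a real system would use its lexicon's phone set.
PHONES = ["sil", "p", "t", "k", "b", "d", "g", "s", "z", "m", "n", "l", "r",
          "a", "e", "i", "o", "u"]
PHONE_INDEX = {p: i for i, p in enumerate(PHONES)}

def bag_of_phones(syllable_phones):
    """Fixed-size count vector over the phone inventory for one syllable."""
    vec = np.zeros(len(PHONES))
    for p in syllable_phones:
        vec[PHONE_INDEX[p]] += 1
    return vec

# Two hypothetical syllables mapped to their count vectors.
print(bag_of_phones(["b", "a", "t"]))
print(bag_of_phones(["t", "e", "r"]))
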
@inproceedings{ribeiro2016parallel,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Parallel and cascaded deep neural networks for text-to-speech synthesis},
  booktitle = {9th ISCA Workshop on Speech Synthesis (SSW9)},
  year = {2016},
  month = {September},
  address = {Sunnyvale, United States},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/ribeiro-et-al-ssw9.pdf},
  abstract = {An investigation of cascaded and parallel deep neural networks for speech synthesis is conducted. In these systems, suprasegmental linguistic features (syllable-level and above) are processed separately from segmental features (phone-level and below). The suprasegmental component of the networks learns compact distributed representations of high-level linguistic units without any segmental influence. These representations are then integrated into a frame-level system using a cascaded or a parallel approach. In the cascaded network, suprasegmental representations are used as input to the frame-level network. In the parallel network, segmental and suprasegmental features are processed separately and concatenated at a later stage. These experiments are conducted with a standard set of high-dimensional linguistic features as well as a hand-pruned one. It is observed that hierarchical systems are consistently preferred over the baseline feedforward systems. Similarly, parallel networks are preferred over cascaded networks.},
  categories = {speech synthesis, prosody, deep neural networks, embeddings, suprasegmental representations}
}
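
To make the two integration strategies above concrete, here is an illustrative sketch with invented layer sizes (PyTorch assumed): a suprasegmental sub-network learns a compact syllable-level representation, which is either appended to the segmental input of the frame-level network (cascaded) or processed alongside a separate segmental branch and concatenated at a later hidden layer (parallel).

import torch
import torch.nn as nn

n_supra, n_seg, n_emb, n_hid, n_out = 120, 480, 32, 512, 187

# Suprasegmental sub-network: compact representation of syllable-level (and above) features.
supra_net = nn.Sequential(nn.Linear(n_supra, 256), nn.Tanh(), nn.Linear(256, n_emb), nn.Tanh())

# Cascaded: the syllable embedding is concatenated with segmental inputs at the bottom.
cascaded = nn.Sequential(nn.Linear(n_seg + n_emb, n_hid), nn.Tanh(), nn.Linear(n_hid, n_out))

# Parallel: segmental features get their own branch; branches merge at a later hidden layer.
seg_branch = nn.Sequential(nn.Linear(n_seg, n_hid), nn.Tanh())
merge = nn.Sequential(nn.Linear(n_hid + n_emb, n_hid), nn.Tanh(), nn.Linear(n_hid, n_out))

supra_x = torch.randn(256, n_supra)   # syllable-level linguistic features (upsampled to frames)
seg_x = torch.randn(256, n_seg)       # frame-level segmental features

emb = supra_net(supra_x)
y_cascaded = cascaded(torch.cat([seg_x, emb], dim=1))
y_parallel = merge(torch.cat([seg_branch(seg_x), emb], dim=1))
print(y_cascaded.shape, y_parallel.shape)
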
@inproceedings{ribeiro2017learning,
  author = {Ribeiro, Manuel Sam and Watts, Oliver and Yamagishi, Junichi},
  title = {Learning word vector representations based on acoustic counts},
  booktitle = {Proceedings of Interspeech},
  year = {2017},
  month = {August},
  address = {Stockholm, Sweden},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/1340.PDF},
  abstract = {This paper presents a simple count-based approach to learning word vector representations by leveraging statistics of co-occurrences between text and speech. This type of representation requires two discrete sequences of units defined across modalities. Two possible methods for the discretization of an acoustic signal are presented, which are then applied to fundamental frequency and energy contours of a transcribed corpus of speech, yielding a sequence of textual objects (e.g. words, syllables) aligned with a sequence of discrete acoustic events. Constructing a matrix recording the co-occurrence of textual objects with acoustic events and reducing its dimensionality with matrix decomposition results in a set of context-independent representations of word types. These are applied to the task of acoustic modelling for speech synthesis; objective and subjective results indicate that these representations are useful for the generation of acoustic parameters in a text-to-speech (TTS) system. In general, we observe that the more discretization approaches, acoustic signals, and levels of linguistic analysis are incorporated into a TTS system via these count-based representations, the better that TTS system performs.},
  categories = {speech synthesis, text-to-speech, vector representations, word embeddings, deep neural networks}
}
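
The count-based recipe in the abstract above can be sketched at toy scale: align a word sequence with a sequence of discrete acoustic events (for example, quantised f0/energy movements), build a word-by-event co-occurrence matrix, and reduce it with a truncated SVD to obtain context-independent word vectors. The event labels, toy corpus, and dimensionality below are assumptions for illustration, not the paper's data or settings.

import numpy as np

# Parallel sequences: each word token aligned with a discretised acoustic event
# (e.g. a cluster index or a rise/fall/level label produced by quantising the contour).
words  = ["the", "cat", "sat", "on", "the", "mat", "the", "dog", "ran"]
events = ["level", "rise", "fall", "level", "level", "fall", "rise", "rise", "fall"]

word_types  = sorted(set(words))
event_types = sorted(set(events))
W = {w: i for i, w in enumerate(word_types)}
E = {e: j for j, e in enumerate(event_types)}

# Co-occurrence matrix: how often each word type is realised with each acoustic event.
counts = np.zeros((len(word_types), len(event_types)))
for w, e in zip(words, events):
    counts[W[w], E[e]] += 1

# Truncated SVD of the (log-scaled) count matrix gives low-dimensional,
# context-independent word representations usable as extra TTS input features.
U, S, Vt = np.linalg.svd(np.log1p(counts), full_matrices=False)
dim = 2
word_vectors = U[:, :dim] * S[:dim]
for w in word_types:
    print(w, word_vectors[W[w]])
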
@inproceedings{cstr2017blizzard,
  author = {Ronanki, Srikanth and Ribeiro, Manuel Sam and Espic, Felipe and Watts, Oliver},
  title = {The {CSTR} entry to the {Blizzard Challenge} 2017},
  booktitle = {Proc. Blizzard Challenge},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/CSTR_Blizzard2017.pdf},
  abstract = {The annual Blizzard Challenge conducts side-by-side testing of a number of speech synthesis systems trained on a common set of speech data. Similar to the 2016 Blizzard Challenge, the task for this year is to train on expressively read children’s story-books, and to synthesise speech in the same domain. The Challenge therefore presents an opportunity to investigate the effectiveness of several techniques we have developed when applied to expressive and prosodically-varied audiobook data. This paper describes the text-to-speech system entered by The Centre for Speech Technology Research into the 2017 Blizzard Challenge. The current system is a hybrid synthesis system which drives a unit selection synthesiser using the output from a neural network based acoustic and duration model. We assess the performance of our system by reporting the results from formal listening tests provided by the challenge.},
  categories = {Merlin, hybrid speech synthesis, unit selection, deep neural networks}
}