The Centre for Speech Technology Research, The University of Edinburgh

Publications by M. Sam Ribeiro

s1250520.bib

% Garner et al. (2014): overview of the SIWIS spoken language translation
% project, presented at the 3rd Swiss Workshop on Prosody (Geneva).
% "Swiss" is brace-protected in the title so sentence-casing styles keep its
% capital; the cedilla in "francaise" is written as a BibTeX special character.
@inproceedings{garner2014translation,
  author     = {Garner, Philip N. and Clark, Rob and Goldman, Jean-Philippe and Honnet, Pierre-Edouard and Ivanova, Maria and Lazaridis, Alexandros and Liang, Hui and Pfister, Beat and Ribeiro, Manuel Sam and Wehrli, Eric and others},
  title      = {Translation and Prosody in {Swiss} Languages},
  booktitle  = {Nouveaux cahiers de linguistique fran{\c{c}}aise, 31. 3rd Swiss Workshop on Prosody},
  address    = {Geneva, Switzerland},
  month      = {September},
  year       = {2014},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Garner:14.pdf},
  abstract   = {The SIWIS project aims to investigate spoken language translation, where both the speaker characteristics and prosody are translated. This means the translation carries not only spoken content, but also speaker identification, emotion and intent. We describe the background of the project, and present some initial approaches and results. These include the design and collection of a Swiss bilingual database that both enables research in Swiss accented speech processing, and facilitates reliable evaluation.},
  categories = {automatic speech recognition, text-to-speech synthesis, speech-to-speech translation, prosody}
}
% Ribeiro and Clark (ICASSP 2015, Brisbane): multi-level f0 representation
% combining the Continuous Wavelet Transform with the Discrete Cosine Transform.
@inproceedings{ribeiro2015multilevel,
  author     = {Ribeiro, Manuel Sam and Clark, Robert A. J.},
  title      = {A Multi-Level Representation of f0 using the Continuous Wavelet Transform and the Discrete Cosine Transform},
  booktitle  = {IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP},
  address    = {Brisbane, Australia},
  month      = {April},
  year       = {2015},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/ribeiro_and_clark_icassp15.pdf},
  abstract   = {We propose a representation of f0 using the Continuous Wavelet Transform (CWT) and the Discrete Cosine Transform (DCT). The CWT decomposes the signal into various scales of selected frequencies, while the DCT compactly represents complex contours as a weighted sum of cosine functions. The proposed approach has the advantage of combining signal decomposition and higher-level representations, thus modeling low-frequencies at higher levels and high-frequencies at lower-levels. Objective results indicate that this representation improves f0 prediction over traditional short-term approaches. Subjective results show that improvements are seen over the typical MSD-HMM and are comparable to the recently proposed CWT-HMM, while using less parameters. These results are discussed and future lines of research are proposed.},
  categories = {prosody, HMM-based synthesis, f0 modeling, continuous wavelet transform, discrete cosine transform}
}
% Ribeiro, Yamagishi and Clark (Interspeech 2015, Dresden): listening tests on
% f0 reconstructed from selected frequency scales of a wavelet decomposition.
@inproceedings{ribeiro2015perceptual,
  author     = {Ribeiro, Manuel Sam and Yamagishi, Junichi and Clark, Robert A. J.},
  title      = {A Perceptual Investigation of Wavelet-based Decomposition of f0 for Text-to-Speech Synthesis},
  booktitle  = {Proc. Interspeech},
  address    = {Dresden, Germany},
  month      = {September},
  year       = {2015},
  pdf        = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/ribeiro_et_al_IS15.pdf},
  abstract   = {The Continuous Wavelet Transform (CWT) has been recently proposed to model f0 in the context of speech synthesis. It was shown that systems using signal decomposition with the CWT tend to outperform systems that model the signal directly. The f0 signal is typically decomposed into various scales of differing frequency. In these experiments, we reconstruct f0 with selected frequencies and ask native listeners to judge the naturalness of synthesized utterances with respect to natural speech. Results indicate that HMM-generated f0 is comparable to the CWT low frequencies, suggesting it mostly generates utterances with neutral intonation. Middle frequencies achieve very high levels of naturalness, while very high frequencies are mostly noise.},
  categories = {speech synthesis, prosody, f0 modeling, continuous wavelet transform, perceptual experiments}
}