The Centre for Speech Technology Research, The University of Edinburgh

Publications by David Braude


@inproceedings{benyoussef:IS2013,
  author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David A.},
  title = {Articulatory features for speech-driven head motion synthesis},
  booktitle = {Proc. Interspeech},
  year = {2013},
  abstract = {This study investigates the use of articulatory features for speech-driven head motion synthesis, as opposed to the prosodic features such as F0 and energy that have mainly been used in the literature. In the proposed approach, multi-stream HMMs are trained jointly on synchronous streams of speech and head motion data. Articulatory features can be regarded as an intermediate parametrisation of speech that is expected to have a close link with head movement. Measured head and articulatory movements acquired by EMA were synchronously recorded with speech. Measured articulatory data were compared to those predicted from speech using an HMM-based inversion mapping system trained in a semi-supervised fashion. Canonical correlation analysis (CCA) on a data set of free speech from 12 people shows that articulatory features are more correlated with head rotation than prosodic and/or cepstral speech features. It is also shown that head motion synthesised using articulatory features gives higher correlations with the original head motion than when only prosodic features are used.},
  month = {August},
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/BenYoussef-et-al_IS13.pdf},
  pages = {2758--2762}
}
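
The canonical correlation analysis described in the abstract above can be illustrated with a minimal sketch using scikit-learn's CCA. The array names, feature dimensions, and random data below are assumptions for illustration only, not the paper's actual features or results.

# Hypothetical sketch: correlating articulatory features with head rotation via CCA.
# Feature dimensions and array names are illustrative, not taken from the paper.
import numpy as np
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
n_frames = 5000
articulatory = rng.standard_normal((n_frames, 12))   # e.g. EMA coil coordinates per frame
head_rotation = rng.standard_normal((n_frames, 3))    # pitch, yaw, roll per frame

cca = CCA(n_components=3)
art_c, head_c = cca.fit_transform(articulatory, head_rotation)

# Canonical correlations: correlation between each pair of canonical variates.
canon_corr = [np.corrcoef(art_c[:, i], head_c[:, i])[0, 1] for i in range(3)]
print(canon_corr)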
@inproceedings{braude2013template,
  author = {Braude, David A. and Shimodaira, Hiroshi and Ben Youssef, Atef},
  title = {Template-Warping Based Speech Driven Head Motion Synthesis},
  booktitle = {Proc. Interspeech},
  year = {2013},
  abstract = {We propose a method for synthesising head motion from speech using a combination of an Input-Output Markov model (IOMM) and Gaussian mixture models trained in a supervised manner. A key difference of this approach compared to others is that the head motion in each angle is modelled as a series of templates of motion rather than recovering a frame-wise function. The templates were chosen to reflect natural patterns in the head motion, and states for the IOMM were chosen based on statistics of the templates. This reduces the search space for the trajectories and prevents impossible motions such as discontinuities. For synthesis our system warps the templates to account for the acoustic features and the other angles' warping parameters. We show our system is capable of recovering the statistics of the motion that were chosen for the states. Our system was then compared to a baseline that used a frame-wise mapping based on previously published work. A subjective preference test including multiple speakers showed that participants prefer the segment-based approach. Both systems were trained on storytelling free speech.},
  month = {August},
  address = {Lyon, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Braude-etal_IS13.pdf},
  keywords = {Head motion synthesis, GMMs, IOMM},
  pages = {2763--2767}
}
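
The template-warping idea summarised above can be sketched in a few lines: a stored motion template for one head angle is stretched in time and scaled in amplitude to fit a target segment. This is only an illustrative sketch; in the paper the warp parameters are predicted by the IOMM/GMM models, whereas here the function and parameter names are hypothetical.

# Hypothetical sketch of warping a head-motion template to a target segment.
# In the paper's approach the warp parameters would be predicted from acoustic
# features; here they are plain function arguments.
import numpy as np

def warp_template(template, target_length, amplitude_scale, offset):
    """Stretch a 1-D motion template (one head angle) in time and scale it in amplitude."""
    src_t = np.linspace(0.0, 1.0, num=len(template))
    dst_t = np.linspace(0.0, 1.0, num=target_length)
    stretched = np.interp(dst_t, src_t, template)   # time warping by resampling
    return amplitude_scale * stretched + offset     # amplitude warping

# Example: a nod-like template warped to a 120-frame segment.
template = np.sin(np.linspace(0.0, np.pi, num=50))
segment = warp_template(template, target_length=120, amplitude_scale=5.0, offset=-2.0)
print(segment.shape)  # (120,)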
@inproceedings{benyoussef:iva2013,
  author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David A.},
  title = {Head Motion Analysis and Synthesis over Different Tasks},
  booktitle = {Proc. Intelligent Virtual Agents},
  abstract = {It is known that subjects vary in their head movements. This paper presents an analysis of this variation over different tasks and speakers and of its impact on head motion synthesis. Measured head and articulatory movements, acquired by an ElectroMagnetic Articulograph (EMA) and synchronously recorded with audio, were used. A data set of speech from 12 people recorded on different tasks confirms that head motion varies across tasks and speakers. Experimental results confirmed that the proposed models were capable of learning and synthesising task-dependent head motions from speech. Subjective evaluation of synthesised head motion shows that models trained on the matched task are preferred over models trained on a mismatched task, and that models trained on free speech predict motion preferred by participants over models trained on read speech.},
  month = {September},
  year = {2013},
  organization = {Springer},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/BenYoussef-et-al_IVA13.pdf},
  pages = {285--294}
}
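
One simple way to quantify the task- and speaker-dependence of head motion analysed above is to compare the spread of rotation angles per speaker and per task. The sketch below assumes head rotation stored in a pandas DataFrame with hypothetical column names; it is not the paper's analysis pipeline.

# Hypothetical sketch: per-speaker, per-task spread of head rotation angles.
# Column names ("speaker", "task", "pitch", "yaw", "roll") are assumptions.
import pandas as pd

def head_motion_spread(df: pd.DataFrame) -> pd.DataFrame:
    """Standard deviation of each rotation angle, grouped by speaker and task."""
    return df.groupby(["speaker", "task"])[["pitch", "yaw", "roll"]].std()

# Example with toy data: two speakers, two tasks.
toy = pd.DataFrame({
    "speaker": ["s1"] * 4 + ["s2"] * 4,
    "task": ["read", "read", "free", "free"] * 2,
    "pitch": [0.1, 0.2, 1.5, -1.0, 0.0, 0.1, 2.0, -2.5],
    "yaw":   [0.0, 0.1, 0.8, -0.9, 0.2, 0.1, 1.1, -1.3],
    "roll":  [0.0, 0.0, 0.3, -0.2, 0.1, 0.0, 0.5, -0.4],
})
print(head_motion_spread(toy))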
@inproceedings{braude:iva2013,
  author = {Braude, David A. and Shimodaira, Hiroshi and Ben Youssef, Atef},
  title = {The {University of Edinburgh} Head-Motion and Audio Storytelling ({U}o{E}-{H}A{S}) Dataset},
  booktitle = {Proc. Intelligent Virtual Agents},
  year = {2013},
  abstract = {In this paper we announce the release of a large dataset of storytelling monologue with motion capture for the head and body. Initial tests on the dataset indicate that head motion is more dependent on the speaker than on the style of speech.},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/Braude-etal_IVA2013.pdf},
  organization = {Springer},
  pages = {466--467}
}
@inproceedings{benyoussef_shimodaira_icassp2014,
  author = {Ben Youssef, Atef and Shimodaira, Hiroshi and Braude, David},
  title = {Speech driven Talking Head from Estimated Articulatory Features},
  booktitle = {Proc. ICASSP},
  address = {Florence, Italy},
  month = {May},
  pages = {4606--4610},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/benyoussef_etal_icassp2014.pdf},
  abstract = {In this paper, we present a talking head in which the lip and head motion are controlled using articulatory movements estimated from speech. A phone-sized HMM-based inversion mapping is employed and trained in a semi-supervised fashion. The advantage of using articulatory features is that they can drive the lip motions and have a close link with head movements. Speech inversion normally requires training data recorded with an electromagnetic articulograph (EMA), which restricts the naturalness of head movements. The present study considers a more realistic recording condition where the training data for the target speaker are recorded with an ordinary motion capture system rather than EMA. Different temporal clustering techniques are investigated for the HMM-based mapping, as well as a GMM-based frame-wise mapping as a baseline system. Objective and subjective experiments show that the synthesised motions are more natural using an HMM system than a GMM one, and that estimated EMA features outperform prosodic features.},
  categories = {acoustic-articulatory, inversion mapping, MLPG, talking heads}
}
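
The GMM-based frame-wise mapping used as a baseline in the paper above can be sketched as joint-density GMM regression: fit a GMM on concatenated (acoustic, motion) frames, then for each new acoustic frame take the posterior-weighted conditional means of the motion part. This is a generic sketch of that technique, not the paper's implementation; feature dimensions, variable names, and the random data are assumptions.

# Hypothetical sketch of a GMM-based frame-wise mapping from acoustic features
# to motion features via joint-density GMM regression (MMSE estimate).
# Dimensions and variable names are illustrative only.
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture

def fit_joint_gmm(acoustic, motion, n_components=8, seed=0):
    """Fit a full-covariance GMM on concatenated (acoustic, motion) frames."""
    joint = np.hstack([acoustic, motion])
    return GaussianMixture(n_components=n_components, covariance_type="full",
                           random_state=seed).fit(joint)

def predict_motion(gmm, acoustic, dim_x):
    """Posterior-weighted conditional means of the motion part given acoustic frames."""
    preds = np.zeros((len(acoustic), gmm.means_.shape[1] - dim_x))
    # Per-component marginal likelihoods of the acoustic part.
    lik = np.stack([
        gmm.weights_[k] * multivariate_normal.pdf(
            acoustic, mean=gmm.means_[k, :dim_x],
            cov=gmm.covariances_[k, :dim_x, :dim_x])
        for k in range(gmm.n_components)], axis=1)
    post = lik / lik.sum(axis=1, keepdims=True)
    for k in range(gmm.n_components):
        mu_x, mu_y = gmm.means_[k, :dim_x], gmm.means_[k, dim_x:]
        S = gmm.covariances_[k]
        Sxx, Syx = S[:dim_x, :dim_x], S[dim_x:, :dim_x]
        # Conditional mean of motion given acoustic: mu_y + Syx Sxx^{-1} (x - mu_x).
        cond_mean = mu_y + (acoustic - mu_x) @ np.linalg.solve(Sxx, Syx.T)
        preds += post[:, k:k + 1] * cond_mean
    return preds

# Toy usage with random data: 13-d acoustic frames mapped to 3-d head rotation.
rng = np.random.default_rng(0)
X, Y = rng.standard_normal((2000, 13)), rng.standard_normal((2000, 3))
gmm = fit_joint_gmm(X, Y)
print(predict_motion(gmm, X[:5], dim_x=13).shape)  # (5, 3)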