The Centre for Speech Technology Research, The University of Edinburgh

Publications by Qiong Hu

s1164800.bib

@inproceedings{Hu_SSW8,
  author = {Hu, Qiong and Richmond, Korin and Yamagishi, Junichi and Latorre, Javier},
  title = {An experimental comparison of multiple vocoder types},
  booktitle = {8th ISCA Workshop on Speech Synthesis},
  year = {2013},
  abstract = {This paper presents an experimental comparison of a broad range of the leading vocoder types that have been previously described. We use a reference implementation of each of these to create stimuli for a listening test using copy synthesis. The listening test is performed using both Lombard and normal read speech stimuli, and with two types of question for comparison. Multi-dimensional Scaling (MDS) is conducted on the listener responses to analyse similarities in terms of quality between the vocoders. Our MDS and clustering results show that the vocoders which use a sinusoidal synthesis approach are perceptually distinguishable from the source-filter vocoders. To help further interpret the axes of the resulting MDS space, we test for correlations with standard acoustic quality metrics and find that one axis is strongly correlated with PESQ scores. We also find that both speech style and the format of the listening test question may influence test results. Finally, we present preference test results comparing each vocoder with natural speech.},
  month = {August},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/ssw8_OS4-3_Hu.pdf},
  pages = {155--160}
}
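
The MDS analysis in the abstract above can be illustrated in outline: given a matrix of perceptual dissimilarities between vocoders, metric MDS embeds each vocoder as a point in a low-dimensional space. A minimal sketch with scikit-learn, where the vocoder names and dissimilarity values are invented for illustration rather than taken from the paper's listening-test data:

# Sketch of the MDS step described in the abstract above. The
# dissimilarity matrix is hypothetical; in the paper it is derived
# from listener responses.
import numpy as np
from sklearn.manifold import MDS

vocoders = ["sinusoidal_A", "sinusoidal_B", "source_filter_A", "source_filter_B"]
# Symmetric perceptual dissimilarities (0 = identical), invented for illustration.
D = np.array([[0.0, 0.2, 0.7, 0.8],
              [0.2, 0.0, 0.6, 0.7],
              [0.7, 0.6, 0.0, 0.3],
              [0.8, 0.7, 0.3, 0.0]])

mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
coords = mds.fit_transform(D)  # 2-D coordinates, one row per vocoder
for name, (x, y) in zip(vocoders, coords):
    print(f"{name}: ({x:.2f}, {y:.2f})")

Clusters in the resulting space can then be inspected for the sinusoidal versus source-filter split the paper reports, and each axis tested for correlation with metrics such as PESQ.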
@inproceedings{Hu_IC14,
  author = {Hu, Qiong and Stylianou, Yannis and Richmond, Korin and Maia, Ranniery and Yamagishi, Junichi and Latorre, Javier},
  title = {A Fixed Dimension and Perceptually Based Dynamic Sinusoidal Model of Speech},
  booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  address = {Florence, Italy},
  month = {May},
  pages = {6311--6315},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Qiong_IC14.pdf},
  abstract = {This paper presents a fixed- and low-dimensional, perceptually based dynamic sinusoidal model of speech referred to as PDM (Perceptual Dynamic Model). To decrease and fix the number of sinusoidal components typically used in the standard sinusoidal model, we propose to use only one dynamic sinusoidal component per critical band. For each band, the sinusoid with the maximum spectral amplitude is selected and associated with the centre frequency of that critical band. The model is expanded at low frequencies by incorporating sinusoids at the boundaries of the corresponding bands while at the higher frequencies a modulated noise component is used. A listening test is conducted to compare speech reconstructed with PDM and state-of-the-art models of speech, where all models are constrained to use an equal number of parameters. The results show that PDM is clearly preferred in terms of quality over the other systems.}
}
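
The central selection step of PDM, as described in the abstract, is to keep one sinusoid per critical band, the one with maximum spectral amplitude, and associate it with the band's centre frequency. A minimal sketch, assuming a magnitude spectrum and a set of band edges; the paper's exact critical-band definition and its low-frequency boundary sinusoids and high-frequency modulated noise are not reproduced here:

# Per-band sinusoid selection: within each critical band, keep the
# peak-amplitude bin and associate it with the band's centre frequency.
# Band edges here are illustrative, not the paper's exact definition.
import numpy as np

def select_band_sinusoids(spectrum, freqs, band_edges):
    """spectrum: magnitude spectrum; freqs: bin frequencies (Hz);
    band_edges: ascending band boundaries (Hz). Returns one
    (centre_frequency, amplitude) pair per band."""
    selected = []
    for lo, hi in zip(band_edges[:-1], band_edges[1:]):
        idx = np.where((freqs >= lo) & (freqs < hi))[0]
        if idx.size == 0:
            continue
        peak = idx[np.argmax(spectrum[idx])]   # max-amplitude sinusoid in band
        centre = 0.5 * (lo + hi)               # band centre frequency
        selected.append((centre, spectrum[peak]))
    return selected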
@inproceedings{Hu_Interspeech14,
  author = {Hu, Qiong and Stylianou, Yannis and Maia, Ranniery and Richmond, Korin and Yamagishi, Junichi and Latorre, Javier},
  title = {An investigation of the application of dynamic sinusoidal models to statistical parametric speech synthesis},
  booktitle = {Proc. Interspeech},
  year = {2014},
  abstract = {This paper applies a dynamic sinusoidal synthesis model to statistical parametric speech synthesis (HTS). For this, we utilise regularised cepstral coefficients to represent both the static amplitude and dynamic slope of selected sinusoids for statistical modelling. During synthesis, a dynamic sinusoidal model is used to reconstruct speech. A preference test is conducted to compare the selection of different sinusoids for cepstral representation. Our results show that when integrated with HTS, a relatively small number of sinusoids selected according to a perceptual criterion can produce quality comparable to using all harmonics. A Mean Opinion Score (MOS) test shows that our proposed statistical system is preferred to one using mel-cepstra from pitch synchronous spectral analysis.},
  month = {September},
  address = {Singapore},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/Q_Interspeech14.pdf},
  pages = {780--784}
}
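
The regularised cepstral coefficients mentioned in the abstract can be sketched as a ridge-regularised least-squares fit of a cosine (cepstral) basis to log amplitudes observed at the selected sinusoid frequencies. The order and penalty weight below are illustrative assumptions, not the paper's settings:

# Regularised discrete cepstrum: fit a cosine-basis envelope to log
# amplitudes at selected sinusoid frequencies by penalised least squares.
import numpy as np

def regularised_cepstrum(freqs_norm, log_amps, order=24, lam=5e-4):
    """freqs_norm: sinusoid frequencies normalised to [0, 0.5]
    (cycles/sample); log_amps: log amplitudes at those frequencies."""
    M = np.cos(2 * np.pi * np.outer(freqs_norm, np.arange(order + 1)))
    M[:, 1:] *= 2.0                               # discrete-cepstrum basis
    R = lam * np.diag(np.arange(order + 1) ** 2)  # penalise high quefrencies
    c = np.linalg.solve(M.T @ M + R, M.T @ log_amps)
    return c                                      # coefficients c_0..c_order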
@incollection{dalessandro_et_al_2014_reactive,
  editor = {Rybarczyk, Yves and Cardoso, Tiago and Rosas, João and Camarinha-Matos, Luis M.},
  author = {d’Alessandro, Nicolas and Tilmanne, Joëlle and Astrinaki, Maria and Hueber, Thomas and Dall, Rasmus and Ravet, Thierry and Moinet, Alexis and Cakmak, Huseyin and Babacan, Onur and Barbulescu, Adela and Parfait, Valentin and Huguenin, Victor and Kalaycı, Emine Sümeyye and Hu, Qiong},
  publisher = {Springer Berlin Heidelberg},
  title = {Reactive Statistical Mapping: Towards the Sketching of Performative Control with Data},
  series = {IFIP Advances in Information and Communication Technology},
  booktitle = {Innovative and Creative Developments in Multimodal Interaction Systems},
  abstract = {This paper presents the results of our participation in the ninth eNTERFACE workshop on multimodal user interfaces. Our target for this workshop was to bring some technologies currently used in speech recognition and synthesis to a new level, i.e. being the core of a new HMM-based mapping system. The idea of statistical mapping has been investigated, more precisely how to use Gaussian Mixture Models and Hidden Markov Models for realtime and reactive generation of new trajectories from input labels and for realtime regression in a continuous-to-continuous use case. As a result, we have developed several proofs of concept, including an incremental speech synthesiser, software for exploring stylistic spaces for gait and facial motion in real time, reactive audiovisual laughter and a prototype demonstrating the realtime reconstruction of lower body gait motion strictly from upper body motion, with conservation of the stylistic properties. This project has been the opportunity to formalise HMM-based mapping, integrate several of these innovations into the Mage library and explore the development of a realtime gesture recognition tool.},
  volume = {425},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/dAllessandro_Tilmanne_Astrinaki_Hueber_Dall_Ravet_Moinet_Cakmak_Babacan_Barbulescu_Parfait_Huguenin_Kalayci_Hu_enterface2013.pdf},
  pages = {20--49},
  categories = {Statistical Modelling, Hidden Markov Models, Motion Capture, Speech, Singing, Laughter, Realtime Systems, Mapping}
}
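
For the continuous-to-continuous use case mentioned in this abstract, a standard formulation of GMM-based statistical mapping is to train a GMM on joint input-output vectors and predict the conditional expectation E[y | x] by Gaussian conditioning per mixture component. A minimal sketch under that assumption (the parameters are taken to be trained elsewhere; this is not the Mage implementation):

# GMM regression: given a GMM over joint vectors [x; y], predict
# E[y | x] by conditioning each Gaussian component on x.
import numpy as np

def gmm_regress(x, weights, means, covs, dx):
    """weights: (K,); means: (K, dx+dy); covs: (K, dx+dy, dx+dy);
    dx: dimensionality of the input part x."""
    num, den = 0.0, 0.0
    for w, mu, S in zip(weights, means, covs):
        Sxx, Sxy = S[:dx, :dx], S[:dx, dx:]
        diff = x - mu[:dx]
        # responsibility of this component for x (unnormalised Gaussian)
        p = w * np.exp(-0.5 * diff @ np.linalg.solve(Sxx, diff)) \
            / np.sqrt(np.linalg.det(2 * np.pi * Sxx))
        # conditional mean of y given x for this component
        cond_mean = mu[dx:] + Sxy.T @ np.linalg.solve(Sxx, diff)
        num += p * cond_mean
        den += p
    return num / den  # E[y | x]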
@inproceedings{Hu_ICASSP15,
  author = {Hu, Qiong and Stylianou, Yannis and Maia, Ranniery and Richmond, Korin and Yamagishi, Junichi},
  title = {Methods for applying dynamic sinusoidal models to statistical parametric speech synthesis},
  booktitle = {Proc. ICASSP},
  year = {2015},
  month = {April},
  address = {Brisbane, Australia},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/IC15_Qiong.pdf},
  abstract = {Sinusoidal vocoders can generate high quality speech, but they have not been extensively applied to statistical parametric speech synthesis. This paper presents two ways of using dynamic sinusoidal models for statistical speech synthesis, enabling the sinusoid parameters to be modelled in HMM-based synthesis. In the first method, features extracted from a fixed- and low-dimensional, perception-based dynamic sinusoidal model (PDM) are statistically modelled directly. In the second method, we convert both static amplitude and dynamic slope from all the harmonics of a signal, which we term the Harmonic Dynamic Model (HDM), to intermediate parameters (regularised cepstral coefficients) for modelling. During synthesis, HDM is then used to reconstruct speech. We have compared the voice quality of these two methods to the STRAIGHT cepstrum-based vocoder with mixed excitation in formal listening tests. Our results show that HDM with intermediate parameters can generate quality comparable to STRAIGHT, while PDM direct modelling seems promising in terms of producing good speech quality without resorting to intermediate parameters such as cepstra.}
}
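
For reference, a common form of the dynamic sinusoidal model underlying both PDM and HDM represents each analysis frame with a static amplitude plus a linear (dynamic) slope term; the exact parameterisation used in the paper may differ:

\[ \hat{s}(t) = \mathrm{Re}\sum_{k=1}^{K} \left( a_k + t\, b_k \right) e^{\,j 2\pi f_k t}, \]

where \(a_k\) is the static complex amplitude, \(b_k\) the dynamic slope and \(f_k\) the frequency of the \(k\)-th sinusoid. HDM takes the \(f_k\) at all harmonics of the fundamental frequency, while PDM restricts them to one per critical band.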
@inproceedings{Hu_Interspeech15,
  author = {Hu, Qiong and Wu, Zhizheng and Richmond, Korin and Yamagishi, Junichi and Stylianou, Yannis and Maia, Ranniery},
  title = {Fusion of multiple parameterisations for {DNN}-based sinusoidal speech synthesis with multi-task learning},
  booktitle = {Proc. Interspeech},
  year = {2015},
  month = {September},
  address = {Dresden, Germany},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2015/Q_Interspeech15.pdf},
  abstract = {It has recently been shown that deep neural networks (DNN) can improve the quality of statistical parametric speech synthesis (SPSS) when using a source-filter vocoder. Our own previous work has furthermore shown that a dynamic sinusoidal model (DSM) is also highly suited to DNN-based SPSS, whereby sinusoids may either be used themselves as a “direct parameterisation” (DIR), or they may be encoded using an “intermediate spectral parameterisation” (INT). The approach in that work was effectively to replace a decision tree with a neural network. However, waveform parameterisation and synthesis steps that have been developed to suit HMMs may not fully exploit DNN capabilities. Here, in contrast, we investigate ways to combine INT and DIR at the levels of both DNN modelling and waveform generation. For DNN training, we propose to use multi-task learning to model cepstra (from INT) and log amplitudes (from DIR) as primary and secondary tasks. Our results show combining these improves modelling accuracy for both tasks. Next, during synthesis, instead of discarding parameters from the second task, a fusion method using harmonic amplitudes derived from both tasks is applied. Preference tests show the proposed method gives improved performance, and that this applies to synthesising both with and without global variance parameters.}
}
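
The multi-task arrangement described above can be sketched as a shared network trunk with two output heads, cepstra (INT) as the primary task and sinusoid log amplitudes (DIR) as the secondary task, trained with a weighted sum of per-task losses. A minimal PyTorch sketch; the layer sizes, activations and task weight are illustrative assumptions, not the paper's configuration:

# Shared trunk, two heads: INT (cepstra) primary, DIR (log amplitudes)
# secondary, combined with a weighted multi-task loss.
import torch
import torch.nn as nn

class MultiTaskAcousticModel(nn.Module):
    def __init__(self, d_ling=300, d_cep=50, d_amp=80, d_hid=512):
        super().__init__()
        self.trunk = nn.Sequential(
            nn.Linear(d_ling, d_hid), nn.Tanh(),
            nn.Linear(d_hid, d_hid), nn.Tanh())
        self.cep_head = nn.Linear(d_hid, d_cep)   # INT: cepstra
        self.amp_head = nn.Linear(d_hid, d_amp)   # DIR: log amplitudes

    def forward(self, x):
        h = self.trunk(x)
        return self.cep_head(h), self.amp_head(h)

def multitask_loss(cep_pred, amp_pred, cep_tgt, amp_tgt, w_secondary=0.5):
    mse = nn.functional.mse_loss
    return mse(cep_pred, cep_tgt) + w_secondary * mse(amp_pred, amp_tgt)

At synthesis time the paper fuses harmonic amplitudes derived from both heads rather than discarding the secondary output.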
@inproceedings{hu2016initial,
  author = {Hu, Qiong and Yamagishi, Junichi and Richmond, Korin and Subramanian, Kartick and Stylianou, Yannis},
  title = {Initial investigation of speech synthesis based on complex-valued neural networks},
  abstract = {Although frequency analysis often leads us to a speech signal in the complex domain, the acoustic models we frequently use are designed for real-valued data. Phase is usually ignored or modelled separately from spectral amplitude. Here, we propose a complex-valued neural network (CVNN) for directly modelling the results of the frequency analysis in the complex domain (such as the complex amplitude). We also introduce a phase encoding technique to map real-valued data (e.g. cepstra or log amplitudes) into the complex domain so we can use the same CVNN processing seamlessly. In this paper, a fully complex-valued neural network, namely a neural network where all of the weight matrices, activation functions and learning algorithms are in the complex domain, is applied for speech synthesis. Results show its ability to model both complex-valued and real-valued data.},
  month = {March},
  pages = {5630--5634},
  year = {2016},
  keywords = {complex-valued neural network, speech synthesis, complex amplitude, phase modelling},
  pdf = {http://www.cstr.ed.ac.uk/downloads/publications/2016/hu2016initial.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}
}
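
A fully complex-valued forward pass, together with a phase-encoding step for real-valued inputs, can be sketched as follows. The unit-circle encoding and the elementwise complex tanh are common CVNN choices assumed here for illustration, not necessarily those of the paper:

# Phase-encode real values onto the unit circle, then run them through
# two complex-valued layers with a complex activation.
import numpy as np

def phase_encode(x, lo, hi):
    """Map real values in [lo, hi] onto the upper unit semicircle."""
    theta = np.pi * (x - lo) / (hi - lo)
    return np.exp(1j * theta)

def cvnn_forward(z, W1, b1, W2, b2):
    """Two complex-valued layers; numpy's tanh accepts complex input."""
    h = np.tanh(W1 @ z + b1)
    return W2 @ h + b2

rng = np.random.default_rng(0)
z = phase_encode(rng.uniform(0, 1, 8), 0.0, 1.0)   # complex-domain input
W1 = rng.normal(size=(16, 8)) + 1j * rng.normal(size=(16, 8))
b1 = np.zeros(16, dtype=complex)
W2 = rng.normal(size=(4, 16)) + 1j * rng.normal(size=(4, 16))
b2 = np.zeros(4, dtype=complex)
print(cvnn_forward(z, W1, b1, W2, b2))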
@phdthesis{qiong-2016,
  author = {Hu, Qiong},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/Thesis_Qiong_Hu.pdf},
  school = {University of Edinburgh},
  year = {2016},
  abstract = {This study focuses on improving the voice quality of statistical speech synthesis based on sinusoidal models. Although our study shows that, for copy synthesis, a sinusoidal model with complex amplitudes can generate higher quality speech than a source-filter one, the component sinusoids are correlated with each other, and the number of parameters is high and varies from frame to frame. Therefore, a perceptually based dynamic sinusoidal model (PDM) is proposed for application to statistical speech synthesis. We then extensively discuss methods for using dynamic sinusoidal models in HMM-based statistical speech synthesis, and present two parameterisation approaches: INT and DIR. To further improve the voice quality of SPSS, we apply a deep neural network (DNN) model to the proposed vocoder and investigate ways of combining INT and DIR at the levels of both DNN modelling and waveform generation. Finally, an alternative statistical model, referred to as a complex-valued neural network (CVNN), which treats complex coefficients as a whole, is proposed to model the complex amplitude explicitly.},
  title = {Statistical parametric speech synthesis based on sinusoidal models}
}