The Centre for Speech Technology Research, The University of Edinburgh

Publications by Joris Driesen

jdriesen.bib

@inproceedings{jdriesen_asru13,
  author = {Driesen, Joris and Renals, Steve},
  title = {Lightly Supervised Automatic Subtitling of Weather Forecasts},
  booktitle = {Proc. Automatic Speech Recognition and Understanding Workshop},
  address = {Olomouc, Czech Republic},
  month = dec,
  year = {2013},
  doi = {10.1109/ASRU.2013.6707772},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/asru13.pdf},
  abstract = {Since subtitling television content is a costly process, there are large potential advantages to automating it, using automatic speech recognition (ASR). However, training the necessary acoustic models can be a challenge, since the available training data usually lacks verbatim orthographic transcriptions. If there are approximate transcriptions, this problem can be overcome using light supervision methods. In this paper, we perform speech recognition on broadcasts of Weatherview, BBC's daily weather report, as a first step towards automatic subtitling. For training, we use a large set of past broadcasts, using their manually created subtitles as approximate transcriptions. We discuss and compare two different light supervision methods, applying them to this data. The best training set finally obtained with these methods is used to create a hybrid deep neural network-based recognition system, which yields high recognition accuracies on three separate Weatherview evaluation sets.}
}
@inproceedings{jdriesen:iwslt_german,
  author = {Driesen, Joris and Bell, Peter and Sinclair, Mark and Renals, Steve},
  title = {Description of the {UEDIN} system for {German} {ASR}},
  booktitle = {Proc. IWSLT},
  address = {Heidelberg, Germany},
  month = dec,
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/german_iwslt.pdf},
  abstract = {In this paper we describe the ASR system for German built at the University of Edinburgh (UEDIN) for the 2013 IWSLT evaluation campaign. For ASR, the major challenge to overcome, was to find suitable acoustic training data. Due to the lack of expertly transcribed acoustic speech data for German, acoustic model training had to be performed on publicly available data crawled from the internet. For evaluation, lack of a manual segmentation into utterances was handled in two different ways: by generating an automatic segmentation, and by treating entire input files as a single segment. Demonstrating the latter method is superior in the current task, we obtained a WER of 28.16\% on the dev set and 36.21\% on the test set.}
}
@inproceedings{bell14_xling_mlan,
  author = {Bell, Peter and Driesen, Joris and Renals, Steve},
  title = {Cross-lingual adaptation with multi-task adaptive networks},
  booktitle = {Proc. Interspeech},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell_xling_mlan_is2014.pdf},
  abstract = {Posterior-based or bottleneck features derived from neural networks trained on out-of-domain data may be successfully applied to improve speech recognition performance when data is scarce for the target domain or language. In this paper we combine this approach with the use of a hierarchical deep neural network (DNN) network structure -- which we term a multi-level adaptive network (MLAN) -- and the use of multitask learning. We have applied the technique to cross-lingual speech recognition experiments on recordings of TED talks and European Parliament sessions in English (source language) and German (target language). We demonstrate that the proposed method can lead to improvements over standard methods, even when the quantity of training data for the target language is relatively high. When the complete method is applied, we achieve relative WER reductions of around 13\% compared to a monolingual hybrid DNN baseline.}
}
@inproceedings{bell14_iwslt,
  author = {Bell, Peter and Swietojanski, Pawel and Driesen, Joris and Sinclair, Mark and McInnes, Fergus and Renals, Steve},
  title = {The {UEDIN} {ASR} Systems for the {IWSLT} 2014 Evaluation},
  booktitle = {Proc. IWSLT},
  address = {South Lake Tahoe, USA},
  month = dec,
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell14_iwslt.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) ASR systems for the 2014 IWSLT Evaluation. Notable features of the English system include deep neural network acoustic models in both tandem and hybrid configuration with the use of multi-level adaptive networks, LHUC adaptation and Maxout units. The German system includes lightly supervised training and a new method for dictionary generation. Our voice activity detection system now uses a semi-Markov model to incorporate a prior on utterance lengths. There are improvements of up to 30\% relative WER on the tst2013 English test set.}
}