The Centre for Speech Technology Research, The University of Edinburgh

Publications by Fergus McInnes

fmi.bib

@inproceedings{hasler2012,
  author = {Hasler, Eva and Bell, Peter and Ghoshal, Arnab and Haddow, Barry and Koehn, Philipp and McInnes, Fergus and Renals, Steve and Swietojanski, Pawel},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2012/paper_50.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) systems for the IWSLT 2012 Evaluation. We participated in the ASR (English), MT (English-French, German-English) and SLT (English-French) tracks.},
  year = {2012},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  title = {The {UEDIN} system for the {IWSLT} 2012 evaluation}
}
@inproceedings{mcinnes_cogsci2011,
  author = {McInnes, Fergus R. and Goldwater, Sharon J.},
  title = {Unsupervised Extraction of Recurring Words from Infant-Directed Speech},
  booktitle = {Proceedings of CogSci 2011},
  year = {2011},
  month = {July},
  address = {Boston, Massachusetts},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2011/mcinnes_cogsci2011.pdf},
  abstract = {To date, most computational models of infant word segmentation have worked from phonemic or phonetic input, or have used toy datasets. In this paper, we present an algorithm for word extraction that works directly from naturalistic acoustic input: infant-directed speech from the CHILDES corpus. The algorithm identifies recurring acoustic patterns that are candidates for identification as words or phrases, and then clusters together the most similar patterns. The recurring patterns are found in a single pass through the corpus using an incremental method, where only a small number of utterances are considered at once. Despite this limitation, we show that the algorithm is able to extract a number of recurring words, including some that infants learn earliest, such as "Mommy" and the child’s name. We also introduce a novel information-theoretic evaluation measure.},
  categories = {language acquisition, word segmentation, speech recognition, computational modelling}
}
@inproceedings{renals-icassp91,
  author = {Renals, S. and McKelvie, D. and McInnes, F.},
  title = {A comparative study of continuous speech recognition using neural networks and hidden {Markov} models},
  booktitle = {Proc. IEEE ICASSP},
  year = {1991},
  address = {Toronto},
  pages = {369--372}
}
@inproceedings{Williams_1989_d,
  author = {Williams, Briony J. and Hiller, S. M. and McInnes, F. and Dalby, J.},
  address = {Paris, France},
  booktitle = {Proceedings of the European Conference on Speech Communication and Technology},
  year = {1989},
  categories = {prosody, recognition},
  title = {A Knowledge-Based Nasal Classifier for Use in Continuous Speech Recognition}
}
@inproceedings{bell13_lecture_transcription,
  author = {Bell, Peter and Yamamoto, Hitoshi and Swietojanski, Pawel and Wu, Youzheng and McInnes, Fergus and Hori, Chiori and Renals, Steve},
  title = {A lecture transcription system combining neural network acoustic and language models},
  booktitle = {Proc. Interspeech},
  address = {Lyon, France},
  month = {August},
  year = {2013},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/lecture_transcription_is2013.pdf},
  abstract = {This paper presents a new system for automatic transcription of lectures. The system combines a number of novel features, including deep neural network acoustic models using multi-level adaptive networks to incorporate out-of-domain information, and factored recurrent neural network language models. We demonstrate that the system achieves large improvements on the TED lecture transcription task from the 2012 IWSLT evaluation -- our results are currently the best reported on this task, showing a relative WER reduction of more than 16\% compared to the closest competing system from the evaluation.}
}
@inproceedings{bourlard_slam2013,
  author = {Bourlard, H. and Ferras, M. and Pappas, N. and Popescu-Belis, A. and Renals, S. and McInnes, F. and Bell, P. and Ingram, S. and Guillemot, M.},
  title = {Processing and Linking Audio Events in Large Multimedia Archives: The {EU} {inEvent} Project},
  booktitle = {Proceedings of SLAM 2013 (First Workshop on Speech, Language and Audio in Multimedia)},
  year = {2013},
  month = {August},
  address = {Marseille, France},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bourlard_slam2013.pdf},
  abstract = {In the inEvent EU project, we aim at structuring, retrieving, and sharing large archives of networked, and dynamically changing, multimedia recordings, mainly consisting of meetings, videoconferences, and lectures. More specifically, we are developing an integrated system that performs audiovisual processing of multimedia recordings, and labels them in terms of interconnected "hyper-events" (a notion inspired by hyper-texts). Each hyper-event is composed of simpler facets, including audio-video recordings and metadata, which are then easier to search, retrieve and share. In the present paper, we mainly cover the audio processing aspects of the system, including speech recognition, speaker diarization and linking (across recordings), the use of these features for hyper-event indexing and recommendation, and the search portal. We present initial results for feature extraction from lecture recordings using the TED talks.},
  categories = {networked multimedia events, audio processing: speech recognition, speaker diarization and linking, multimedia indexing and searching, hyper-events}
}
@inproceedings{bhatt_acmmm2013,
  author = {Bhatt, C. and Popescu-Belis, A. and Habibi, M. and Ingram, S. and Masneri, S. and McInnes, F. and Pappas, N. and Schreer, O.},
  title = {Multi-factor Segmentation for Topic Visualization and Recommendation: the {MUST-VIS} System},
  booktitle = {Proceedings of ACM Multimedia 2013},
  year = {2013},
  month = {October},
  address = {Barcelona, Spain},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bhatt_acmmm2013.pdf},
  abstract = {This paper presents the MUST-VIS system for the MediaMixer/VideoLectures.NET Temporal Segmentation and Annotation Grand Challenge. The system allows users to visualize a lecture as a series of segments represented by keyword clouds, with relations to other similar lectures and segments. Segmentation is performed using a multi-factor algorithm which takes advantage of the audio (through automatic speech recognition and word-based segmentation) and video (through the detection of actions such as writing on the blackboard). The similarity across segments and lectures is computed using a content-based recommendation algorithm. Overall, the graph-based representation of segment similarity appears to be a promising and cost-effective approach to navigating lecture databases.},
  categories = {content analysis and retrieval, multimedia information systems, lecture segmentation, lecture recommendations}
}
@inproceedings{bell13_iwslt,
  author = {Bell, Peter and McInnes, Fergus and Gangireddy, Siva Reddy and Sinclair, Mark and Birch, Alexandra and Renals, Steve},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2013/bell13_iwslt_system.pdf},
  booktitle = {Proc. International Workshop on Spoken Language Translation},
  title = {The {UEDIN} English {ASR} System for the {IWSLT} 2013 Evaluation},
  abstract = {This paper describes the University of Edinburgh (UEDIN) English ASR system for the IWSLT 2013 Evaluation. Notable features of the system include deep neural network acoustic models in both tandem and hybrid configuration, cross-domain adaptation with multi-level adaptive networks, and the use of a recurrent neural network language model. Improvements to our system since the 2012 evaluation -- which include the use of a significantly improved n-gram language model -- result in a 19\% relative WER reduction on the \tstD set.},
  year = {2013}
}
@inproceedings{sgangireddy_interspeech14,
  author = {Gangireddy, Siva Reddy and McInnes, Fergus and Renals, Steve},
  title = {Feed Forward Pre-Training for Recurrent Neural Network Language Models},
  booktitle = {Proc. Interspeech},
  abstract = {The recurrent neural network language model (RNNLM) has been demonstrated to consistently reduce perplexities and automatic speech recognition (ASR) word error rates across a variety of domains. In this paper we propose a pre-training method for the RNNLM, by sharing the output weights of the feed forward neural network language model (NNLM) with the RNNLM. This is accomplished by first fine-tuning the weights of the NNLM, which are then used to initialise the output weights of an RNNLM with the same number of hidden units. We have carried out text-based experiments on the Penn Treebank Wall Street Journal data, and ASR experiments on the TED talks data used in the International Workshop on Spoken Language Translation (IWSLT) evaluation campaigns. Across the experiments, we observe small improvements in perplexity and ASR word error rate.},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/srg-interspeech14.pdf},
  pages = {2620--2624},
  categories = {Language Modelling, Recurrent Neural Network, Pre-training, Automatic Speech Recognition, TED talks}
}
@inproceedings{sinclairbell_interspeech14,
  author = {Sinclair, Mark and Bell, Peter and Birch, Alexandra and McInnes, Fergus},
  title = {A semi-Markov model for speech segmentation with an utterance-break prior},
  booktitle = {Proc. Interspeech},
  month = {September},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/interspeech2014.pdf},
  abstract = {Speech segmentation is the problem of finding the end points of a speech utterance for passing to an automatic speech recognition (ASR) system. The quality of this segmentation can have a large impact on the accuracy of the ASR system; in this paper we demonstrate that it can have an even larger impact on downstream natural language processing tasks – in this case, machine translation. We develop a novel semi-Markov model which allows the segmentation of audio streams into speech utterances which are optimised for the desired distribution of sentence lengths for the target domain. We compare this with existing state-of-the-art methods and show that it is able to achieve not only improved ASR performance, but also to yield significant benefits to a speech translation task.},
  categories = {speech activity detection, speech segmentation, machine translation, speech recognition}
}
@inproceedings{bell14_iwslt,
  author = {Bell, Peter and Swietojanski, Pawel and Driesen, Joris and Sinclair, Mark and McInnes, Fergus and Renals, Steve},
  title = {The {UEDIN} {ASR} Systems for the {IWSLT} 2014 Evaluation},
  booktitle = {Proc. IWSLT},
  address = {South Lake Tahoe, USA},
  month = {December},
  year = {2014},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2014/bell14_iwslt.pdf},
  abstract = {This paper describes the University of Edinburgh (UEDIN) ASR systems for the 2014 IWSLT Evaluation. Notable features of the English system include deep neural network acoustic models in both tandem and hybrid configuration with the use of multi-level adaptive networks, LHUC adaptation and Maxout units. The German system includes lightly supervised training and a new method for dictionary generation. Our voice activity detection system now uses a semi-Markov model to incorporate a prior on utterance lengths. There are improvements of up to 30\% relative WER on the tst2013 English test set.}
}