The Centre for Speech Technology Research, The University of Edinburgh

Publications by Ondrej Klejch

s1569734.bib

@inproceedings{liepins17_summa_platform,
  author = {Liepins, Renars and Germann, Ulrich and Barzdins, Guntis and Birch, Alexandra and Renals, Steve and Weber, Susanne and van der Kreeft, Peggy and Bourlard, Hervé and Prieto, João and Klejch, Ondřej and Bell, Peter and Lazaridis, Alexandros and Mendes, Alfonso and Riedel, Sebastian and Almeida, {Mariana S. C.} and Balage, Pedro and Cohen, Shay and Dwojak, Tomasz and Garner, Phil and Giefer, Andreas and Junczys-Dowmunt, Marcin and Imrani, Hina and Nogueira, David and Ali, Ahmed and Miranda, Sebastião and Popescu-Belis, Andrei and Miculicich Werlen, Lesly and Papasarantopoulos, Nikos and Obamuyide, Abiola and Jones, Clive and Dalvi, Fahim and Vlachos, Andreas and Wang, Yang and Tong, Sibo and Sennrich, Rico and Pappas, Nikolaos and Narayan, Shashi and Damonte, Marco and Durrani, Nadir and Khurana, Sameer and Abdelali, Ahmed and Sajjad, Hassan and Vogel, Stephan and Sheppey, David and Hernon, Chris},
  publisher = {Association for Computational Linguistics (ACL)},
  isbn = {978-1-945626-34-0},
  title = {The SUMMA Platform Prototype},
  booktitle = {Proceedings of the EACL 2017 Software Demonstrations},
  abstract = {We present the first prototype of the SUMMA Platform: an integrated platform for multilingual media monitoring. The platform contains a rich suite of low-level and high-level natural language processing technologies: automatic speech recognition of broadcast media, machine translation, automated tagging and classification of named entities, semantic parsing to detect relationships between entities, and automatic construction / augmentation of factual knowledge bases. Implemented on the Docker platform, it can easily be deployed, customised, and scaled to large volumes of incoming media streams.},
  month = {April},
  year = {2017},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/E17_3029.pdf},
  pages = {116--119}
}
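
The SUMMA abstract above describes an integrated pipeline of independently deployable language-processing services (speech recognition, machine translation, entity tagging, and so on). Purely as a toy illustration of that composition pattern, the Python sketch below chains placeholder stages over an incoming document; every stage name and document field here is invented, and the real platform runs each stage as a separate Docker service rather than as in-process functions.

from typing import Callable, Dict, List

# Hypothetical stand-ins for the components named in the abstract; each is a
# plain function here, whereas the platform deploys them as Docker services.
def transcribe(doc: Dict) -> Dict:
    doc["transcript"] = "<asr output for {}>".format(doc["media_url"])
    return doc

def translate(doc: Dict) -> Dict:
    doc["translation"] = "<mt output>"
    return doc

def tag_entities(doc: Dict) -> Dict:
    doc["entities"] = ["<named entity>"]
    return doc

PIPELINE: List[Callable[[Dict], Dict]] = [transcribe, translate, tag_entities]

def process(doc: Dict) -> Dict:
    # Run each stage in order, threading the enriched document through.
    for stage in PIPELINE:
        doc = stage(doc)
    return doc

print(process({"media_url": "http://example.org/news.mp4"}))
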
@inproceedings{klejch2016slt,
  author = {Klejch, Ondrej and Bell, Peter and Renals, Steve},
  title = {Punctuated transcription of multi-genre broadcasts using acoustic and lexical approaches},
  abstract = {In this paper we investigate the punctuated transcription of multi-genre broadcast media. We examine four systems, three of which are based on lexical features, the fourth of which uses acoustic features by integrating punctuation into the speech recognition acoustic models. We also explore the combination of these component systems using voting and log-linear interpolation. We performed experiments on the English-language MGB Challenge data, which comprises about 1,600 hours of BBC television recordings. Our results indicate that a lexical system based on a neural machine translation approach is significantly better than the other systems, achieving an F-Measure of 62.6% on reference text, with a relative degradation of 19% on ASR output. Our analysis of the results in terms of specific punctuation marks indicated that using a longer context improves the prediction of question marks, and that acoustic information improves the prediction of exclamation marks. Finally, we show that even though the systems are complementary, their straightforward combination does not yield better F-measures than a single system using neural machine translation.},
  year = {2016},
  month = {December},
  address = {San Diego, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2016/slt-2016.pdf},
  booktitle = {Proc. IEEE Workshop on Spoken Language Technology},
  categories = {punctuation, speech recognition, neural machine translation, rich transcription}
}
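
The SLT paper above compares lexical and acoustic punctuation systems, the strongest being a neural machine translation model. As a heavily reduced sketch of the lexical approach only, the PyTorch snippet below tags the punctuation slot after each word with a bidirectional LSTM; the label inventory, vocabulary size, and dimensions are illustrative assumptions rather than values from the paper, and the paper's best system is a full encoder-decoder rather than a tagger.

import torch
import torch.nn as nn

# Hypothetical label inventory: "no punctuation" plus four marks.
PUNCT = ["<none>", ",", ".", "?", "!"]

class PunctuationTagger(nn.Module):
    """BiLSTM that scores the punctuation mark following each word."""
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True,
                            bidirectional=True)
        self.out = nn.Linear(2 * hidden_dim, len(PUNCT))

    def forward(self, word_ids):               # (batch, seq_len)
        hidden, _ = self.lstm(self.embed(word_ids))
        return self.out(hidden)                # (batch, seq_len, len(PUNCT))

# Toy usage: logits for the punctuation slot after each of 10 words.
model = PunctuationTagger(vocab_size=1000)
logits = model(torch.randint(0, 1000, (2, 10)))
print(logits.shape)                            # torch.Size([2, 10, 5])
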
@inproceedings{klejch2017icassp,
  author = {Klejch, Ondrej and Bell, Peter and Renals, Steve},
  title = {Sequence-to-sequence models for punctuated transcription combining lexical and acoustic features},
  abstract = {In this paper we present an extension of our previously described neural machine translation based system for punctuated transcription. This extension allows the system to map from per-frame acoustic features to word-level representations by replacing the traditional encoder in the encoder-decoder architecture with a hierarchical encoder. Furthermore, we show that a system combining lexical and acoustic features significantly outperforms systems using only a single source of features on all measured punctuation marks. The combination of lexical and acoustic features achieves a significant improvement in F-Measure of 1.5 absolute over the purely lexical neural machine translation based system.},
  year = {2017},
  month = {March},
  address = {New Orleans, USA},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/icassp-2017.pdf},
  booktitle = {Proc. IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  categories = {punctuation, speech recognition, neural machine translation, rich transcription}
}
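
The ICASSP paper's central extension is a hierarchical encoder that maps per-frame acoustic features to word-level vectors and fuses them with lexical embeddings. The PyTorch sketch below is a minimal reading of that idea, assuming for simplicity a fixed number of frames per word (the real system relies on word time alignments); all dimensions are illustrative.

import torch
import torch.nn as nn

class HierarchicalAcousticEncoder(nn.Module):
    """Maps per-frame acoustic features to one vector per word, then fuses
    that vector with a lexical word embedding (a sketch of the hierarchical
    encoder described in the abstract; sizes are illustrative)."""
    def __init__(self, n_feats=40, vocab_size=1000, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.frame_lstm = nn.LSTM(n_feats, hidden_dim, batch_first=True)
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.word_lstm = nn.LSTM(embed_dim + hidden_dim, hidden_dim,
                                 batch_first=True)

    def forward(self, frames, word_ids):
        # frames: (batch, n_words, frames_per_word, n_feats) -- toy fixed-length
        # segmentation standing in for real word time alignments.
        b, w, f, d = frames.shape
        _, (h, _) = self.frame_lstm(frames.reshape(b * w, f, d))
        acoustic = h[-1].reshape(b, w, -1)            # one vector per word
        fused = torch.cat([self.embed(word_ids), acoustic], dim=-1)
        out, _ = self.word_lstm(fused)                # word-level representations
        return out

# Toy usage: 2 utterances, 10 words each, 20 frames of 40-dim features per word.
enc = HierarchicalAcousticEncoder()
out = enc(torch.randn(2, 10, 20, 40), torch.randint(0, 1000, (2, 10)))
print(out.shape)  # torch.Size([2, 10, 128])

The per-word frame LSTM plays the role of the lower encoder level: its final state is the word-level acoustic summary that is concatenated with the word embedding before the upper, word-level LSTM.
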
@inproceedings{tsunoo2017asru,
  author = {Tsunoo, Emiru and Klejch, Ondrej and Bell, Peter and Renals, Steve},
  publisher = {IEEE},
  title = {Hierarchical recurrent neural network for story segmentation using fusion of lexical and acoustic features},
  abstract = {A broadcast news stream consists of a number of stories, and it is an important task in news analysis to find story boundaries automatically. We capture the topic structure using a hierarchical model based on a Recurrent Neural Network (RNN) sentence modeling layer and a bidirectional Long Short-Term Memory (LSTM) topic modeling layer, with a fusion of acoustic and lexical features. Both features are accumulated with RNNs and trained jointly within the model to be fused at the sentence level. We conduct experiments on the topic detection and tracking (TDT4) task, comparing combinations of the two modalities trained with a limited amount of parallel data. Further, we utilize sufficient additional text data for training to polish our model. Experimental results indicate that the hierarchical RNN topic modeling takes advantage of the fusion scheme, especially with additional text training data, achieving a higher F1-measure than conventional state-of-the-art methods.},
  year = {2017},
  month = {December},
  address = {Okinawa, Japan},
  pdf = {http://www.cstr.inf.ed.ac.uk/downloads/publications/2017/ASRU2017Tsunoo_v7.pdf},
  booktitle = {Proc. IEEE Automatic Speech Recognition and Understanding Workshop},
  categories = {spoken document processing, recurrent neural network, topic modeling, story segmentation, multimodal features}
}
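
The ASRU paper above stacks sentence-level RNN encoders for the lexical and acoustic streams, fuses them per sentence, and feeds the fused sentence vectors to a bidirectional LSTM topic layer that scores story boundaries. A toy PyTorch sketch of that hierarchy follows, with invented sizes and fixed-length inputs standing in for real sentence and frame segmentation.

import torch
import torch.nn as nn

class StorySegmenter(nn.Module):
    """Sentence-level RNN encoders for lexical and acoustic streams, fused per
    sentence and fed to a bidirectional LSTM that scores each sentence as a
    story boundary (a sketch of the paper's hierarchy; sizes illustrative)."""
    def __init__(self, vocab_size=1000, n_feats=40, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.word_rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.frame_rnn = nn.LSTM(n_feats, hidden_dim, batch_first=True)
        self.topic_lstm = nn.LSTM(2 * hidden_dim, hidden_dim, batch_first=True,
                                  bidirectional=True)
        self.boundary = nn.Linear(2 * hidden_dim, 1)

    def forward(self, words, frames):
        # words:  (batch, n_sents, words_per_sent)        -- toy fixed lengths
        # frames: (batch, n_sents, frames_per_sent, n_feats)
        b, s, w = words.shape
        _, (hw, _) = self.word_rnn(self.embed(words).reshape(b * s, w, -1))
        _, (ha, _) = self.frame_rnn(frames.reshape(b * s, frames.shape[2], -1))
        sent = torch.cat([hw[-1], ha[-1]], dim=-1).reshape(b, s, -1)  # fusion
        topics, _ = self.topic_lstm(sent)
        return self.boundary(topics).squeeze(-1)   # boundary logit per sentence

# Toy usage: 2 streams, 6 sentences of 12 words, 50 frames of 40-dim features.
seg = StorySegmenter()
logits = seg(torch.randint(0, 1000, (2, 6, 12)), torch.randn(2, 6, 50, 40))
print(logits.shape)  # torch.Size([2, 6])
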