
E-Mail: florian.mock@uni-jena.de
Room: 08N09
Phone: +49-3641-9-46485
Publications
2022
Mock, Florian; Kretschmer, Fleming; Kriese, Anton; Böcker, Sebastian; Marz, Manja
Taxonomic classification of DNA sequences beyond sequence similarity using deep neural networks Journal Article
In: Proc Natl Acad Sci, vol. 119, iss. 35, pp. e2122636119, 2022.
% Journal article (Proc Natl Acad Sci, 2022) presenting BERTax. The issue is stored in `number`, the field standard styles print.
@article{Mock2022,
  title = {Taxonomic classification of {DNA} sequences beyond sequence similarity using deep neural networks},
  author = {Florian Mock and Fleming Kretschmer and Anton Kriese and Sebastian Böcker and Manja Marz},
  doi = {10.1073/pnas.2122636119},
  year = {2022},
  date = {2022-08-30},
  journal = {Proc Natl Acad Sci},
  volume = {119},
  number = {35},
  pages = {e2122636119},
  abstract = {Taxonomic classification, that is, the assignment to biological clades with shared ancestry, is a common task in genetics, mainly based on a genome similarity search of large genome databases. The classification quality depends heavily on the database, since representative relatives must be present. Many genomic sequences cannot be classified at all or only with a high misclassification rate. Here we present BERTax, a deep neural network program based on natural language processing to precisely classify the superkingdom and phylum of DNA sequences taxonomically without the need for a known representative relative from a database. We show BERTax to be at least on par with the state-of-the-art approaches when taxonomically similar species are part of the training data. For novel organisms, however, BERTax clearly outperforms any existing approach. Finally, we show that BERTax can also be combined with database approaches to further increase the prediction quality in almost all cases. Since BERTax is not based on similar entries in databases, it allows precise taxonomic classification of a broader range of genomic sequences, thus increasing the overall information gain.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
2021
Mock, Florian; Kretschmer, Fleming; Kriese, Anton; Böcker, Sebastian; Marz, Manja
BERTax: taxonomic classification of DNA sequences with Deep Neural Networks Journal Article
In: bioRxiv, 2021.
% bioRxiv preprint (2021) describing BERTax. Braces in the title keep the tool name and the acronym DNA from being recased by the style.
@article{Mock:21a,
  title = {{BERTax}: taxonomic classification of {DNA} sequences with Deep Neural Networks},
  author = {Florian Mock and Fleming Kretschmer and Anton Kriese and Sebastian Böcker and Manja Marz},
  url = {https://github.com/f-kretschmer/bertax},
  doi = {10.1101/2021.07.09.451778},
  year = {2021},
  date = {2021-07-10},
  urldate = {2021-07-10},
  journal = {bioRxiv},
  publisher = {Cold Spring Harbor Laboratory},
  abstract = {Taxonomic classification, i.e., the identification and assignment to groups of biological organisms with the same origin and characteristics, is a common task in genetics. Nowadays, taxonomic classification is mainly based on genome similarity search to large genome databases. In this process, the classification quality depends heavily on the database since representative relatives have to be known already. Many genomic sequences cannot be classified at all or only with a high misclassification rate.
Here we present BERTax, a program that uses a deep neural network to precisely classify the superkingdom, phylum, and genus of DNA sequences taxonomically without the need for a known representative relative from a database. For this, BERTax uses the natural language processing model BERT trained to represent DNA. We show BERTax to be at least on par with the state-of-the-art approaches when taxonomically similar species are part of the training data. In case of an entirely novel organism, however, BERTax clearly outperforms any existing approach. Finally, we show that BERTax can also be combined with database approaches to further increase the prediction quality.
Since BERTax is not based on homologous entries in databases, it allows precise taxonomic classification of a broader range of genomic sequences. This leads to a higher number of correctly classified sequences and thus increases the overall information gain.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Here we present BERTax, a program that uses a deep neural network to precisely classify the superkingdom, phylum, and genus of DNA sequences taxonomically without the need for a known representative relative from a database. For this, BERTax uses the natural language processing model BERT trained to represent DNA. We show BERTax to be at least on par with the state-of-the-art approaches when taxonomically similar species are part of the training data. In case of an entirely novel organism, however, BERTax clearly outperforms any existing approach. Finally, we show that BERTax can also be combined with database approaches to further increase the prediction quality.
Since BERTax is not based on homologous entries in databases, it allows precise taxonomic classification of a broader range of genomic sequences. This leads to a higher number of correctly classified sequences and thus increases the overall information gain.
Mock, Florian; Marz, Manja
Sequence Classification with Machine Learning at the Example of Viral Host Prediction Incollection
In: Frishman, Dmitrij; Marz, Manja (Eds.): Virus Bioinformatics, CRC Press, 2021.
% Book chapter in "Virus Bioinformatics" (CRC Press, 2021). Fields are grouped as people, titles, publication data, identifiers, then export metadata.
@incollection{Mock:21,
  author    = {Florian Mock and Manja Marz},
  editor    = {Dmitrij Frishman and Manja Marz},
  title     = {Sequence Classification with Machine Learning at the Example of Viral Host Prediction},
  booktitle = {Virus Bioinformatics},
  publisher = {CRC Press},
  year      = {2021},
  date      = {2021-01-01},
  urldate   = {2021-01-01},
  doi       = {10.1201/9781003097679-10},
  abstract  = {Sequence classification is a common task in modern virus bioinformatics research. DNA, RNA, or protein sequences are either filtered for certain properties or the properties of a sequence are to be determined. This task is a very diverse problem. The previous knowledge about the data and also the amount of usable data differ for each project. Also the classification task itself is highly diverse. An additional difficulty is that even today for most biological questions, especially in virology, we lack some set of measurable properties (features) that always explain our observations. Here, we introduce machine learning for viral sequence classification. Together with the reader, we build a deep neural network (DNN) pipeline to classify the host of an influenza A virus from its genome sequence with great accuracy. This result may be somewhat surprising since, despite years of research, we lack a set of properties that lead to highly accurate predictions, and currently, more exceptions are often found than new features. Deep learning can automatically identify a trainable set of features and their dependencies with higher predictive power than previous approaches. This work may serve as a starting point to encourage researchers in virology to use machine learning. Using viral host prediction as an example, we will be discussing classical pitfalls such as data quantity and quality.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {incollection}
}
Pappas, Nikolaos; Roux, Simon; Hölzer, Martin; Lamkiewicz, Kevin; Mock, Florian; Marz, Manja; Dutilh, Bas E.
Virus Bioinformatics Incollection
In: Reference Module in Life Sciences, vol. 1, pp. 124-132, Elsevier, 2021, ISBN: 978-0-12-809633-8.
% Chapter in "Reference Module in Life Sciences" (Elsevier, 2021). The page range uses the BibTeX double hyphen.
@incollection{Pappas:20,
  title = {Virus Bioinformatics},
  author = {Nikolaos Pappas and Simon Roux and Martin Hölzer and Kevin Lamkiewicz and Florian Mock and Manja Marz and Bas E. Dutilh},
  doi = {10.1016/B978-0-12-814515-9.00034-5},
  isbn = {978-0-12-809633-8},
  year = {2021},
  date = {2021-01-01},
  urldate = {2021-01-01},
  booktitle = {Reference Module in Life Sciences},
  volume = {1},
  pages = {124--132},
  publisher = {Elsevier},
  abstract = {Since the discovery of computers, bioinformatics and computational biology have been instrumental in a wide range of discoveries in virology. These include early mathematical models of virus-host interaction, and more recently the analysis of viral nucleotide and protein sequences to track their function, epidemiology, and evolution. The genomics revolution has provided an unprecedented amount of sequence information from both viruses and their hosts. In this article, we discuss how bioinformatics allows viral sequence data to be analyzed and interpreted, including an overview of commonly used tools and examples of applications.},
  keywords = {},
  pubstate = {published},
  tppubtype = {incollection}
}
2020
Collatz, Maximilian; Mock, Florian; Barth, Emanuel; Hölzer, Martin; Sachse, Konrad; Marz, Manja
EpiDope: A Deep Neural Network for linear B-cell epitope prediction Journal Article
In: Bioinformatics, vol. 37, no. 4, pp. 448–455, 2020.
% Journal article (Bioinformatics 37(4)) presenting EpiDope. The page range uses the BibTeX double hyphen rather than a literal en dash, and braces keep the title's proper nouns cased.
@article{Collatz:20,
  title = {{EpiDope}: A Deep Neural Network for linear {B-cell} epitope prediction},
  author = {Maximilian Collatz and Florian Mock and Emanuel Barth and Martin Hölzer and Konrad Sachse and Manja Marz},
  editor = {Lenore Cowen},
  url = {http://github.com/mcollatz/EpiDope},
  doi = {10.1093/bioinformatics/btaa773},
  year = {2020},
  date = {2020-09-11},
  urldate = {2020-09-11},
  journal = {Bioinformatics},
  volume = {37},
  number = {4},
  pages = {448--455},
  publisher = {Oxford University Press (OUP)},
  abstract = {By binding to specific structures on antigenic proteins, the so-called epitopes, B-cell antibodies can neutralize pathogens. The identification of B-cell epitopes is of great value for the development of specific serodiagnostic assays and the optimization of medical therapy. However, identifying diagnostically or therapeutically relevant epitopes is a challenging task that usually involves extensive laboratory work. In this study, we show that the time, cost and labor-intensive process of epitope detection in the lab can be significantly reduced using in silico prediction.
Here, we present EpiDope, a python tool which uses a deep neural network to detect linear B-cell epitope regions on individual protein sequences. With an area under the curve between 0.67 ± 0.07 in the receiver operating characteristic curve, EpiDope exceeds all other currently used linear B-cell epitope prediction tools. Our software is shown to reliably predict linear B-cell epitopes of a given protein sequence, thus contributing to a significant reduction of laboratory experiments and costs required for the conventional approach.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Here, we present EpiDope, a python tool which uses a deep neural network to detect linear B-cell epitope regions on individual protein sequences. With an area under the curve between 0.67 ± 0.07 in the receiver operating characteristic curve, EpiDope exceeds all other currently used linear B-cell epitope prediction tools. Our software is shown to reliably predict linear B-cell epitopes of a given protein sequence, thus contributing to a significant reduction of laboratory experiments and costs required for the conventional approach.
Mock, Florian; Viehweger, Adrian; Barth, Emanuel; Marz, Manja
VIDHOP, viral host prediction with Deep Learning Journal Article
In: Bioinformatics, vol. 37, no. 3, pp. 318–325, 2020.
% Journal article (Bioinformatics 37(3)) presenting VIDHOP. The page range uses the BibTeX double hyphen rather than a literal en dash, and braces keep the acronym in the title cased.
@article{Mock:20,
  title = {{VIDHOP}, viral host prediction with Deep Learning},
  author = {Florian Mock and Adrian Viehweger and Emanuel Barth and Manja Marz},
  editor = {Jinbo Xu},
  url = {https://github.com/flomock/vidhop},
  doi = {10.1093/bioinformatics/btaa705},
  year = {2020},
  date = {2020-08-10},
  urldate = {2020-08-10},
  journal = {Bioinformatics},
  volume = {37},
  number = {3},
  pages = {318--325},
  publisher = {Oxford University Press (OUP)},
  abstract = {Zoonosis, the natural transmission of infections from animals to humans, is a far-reaching global problem. The recent outbreaks of Zikavirus, Ebolavirus and Coronavirus are examples of viral zoonosis, which occur more frequently due to globalization. In case of a virus outbreak, it is helpful to know which host organism was the original carrier of the virus to prevent further spreading of viral infection. Recent approaches aim to predict a viral host based on the viral genome, often in combination with the potential host genome and arbitrarily selected features. These methods are limited in the number of different hosts they can predict or the accuracy of the prediction.
Here, we present a fast and accurate deep learning approach for viral host prediction, which is based on the viral genome sequence only. We tested our deep neural network (DNN) on three different virus species (influenza A virus, rabies lyssavirus and rotavirus A). We achieved for each virus species an AUC between 0.93 and 0.98, allowing highly accurate predictions while using only fractions (100–400 bp) of the viral genome sequences. We show that deep neural networks are suitable to predict the host of a virus, even with a limited amount of sequences and highly unbalanced available data. The trained DNNs are the core of our virus–host prediction tool VIrus Deep learning HOst Prediction (VIDHOP). VIDHOP also allows the user to train and use models for other viruses.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Here, we present a fast and accurate deep learning approach for viral host prediction, which is based on the viral genome sequence only. We tested our deep neural network (DNN) on three different virus species (influenza A virus, rabies lyssavirus and rotavirus A). We achieved for each virus species an AUC between 0.93 and 0.98, allowing highly accurate predictions while using only fractions (100–400 bp) of the viral genome sequences. We show that deep neural networks are suitable to predict the host of a virus, even with a limited amount of sequences and highly unbalanced available data. The trained DNNs are the core of our virus–host prediction tool VIrus Deep learning HOst Prediction (VIDHOP). VIDHOP also allows the user to train and use models for other viruses.
2019
Mostajo, Nelly F.; Lataretu, Marie; Krautwurst, Sebastian; Mock, Florian; Desirò, Daniel; Lamkiewicz, Kevin; Collatz, Maximilian; Schoen, Andreas; Weber, Friedemann; Marz, Manja; Hölzer, Martin
A comprehensive annotation and differential expression analysis of short and long non-coding RNAs in 16 bat genomes Journal Article
In: NAR Genomics Bioinf, vol. 2, no. 1, pp. lqz006, 2019.
% Journal article (NAR Genomics Bioinf 2(1)) on ncRNA annotation in 16 bat genomes. Braces keep the acronym in the title from being lowercased by sentence-casing styles.
@article{Mostajo:20,
  title = {A comprehensive annotation and differential expression analysis of short and long non-coding {RNAs} in 16 bat genomes},
  author = {Nelly F. Mostajo and Marie Lataretu and Sebastian Krautwurst and Florian Mock and Daniel Desirò and Kevin Lamkiewicz and Maximilian Collatz and Andreas Schoen and Friedemann Weber and Manja Marz and Martin Hölzer},
  url = {https://www.rna.uni-jena.de/supplements/bats/index.html},
  doi = {10.1093/nargab/lqz006},
  year = {2019},
  date = {2019-09-30},
  urldate = {2019-09-30},
  journal = {NAR Genomics Bioinf},
  volume = {2},
  number = {1},
  pages = {lqz006},
  abstract = {Although bats are increasingly becoming the focus of scientific studies due to their unique properties, these exceptional animals are still among the least studied mammals. Assembly quality and completeness of bat genomes vary a lot and especially non-coding RNA (ncRNA) annotations are incomplete or simply missing. Accordingly, standard bioinformatics pipelines for gene expression analysis often ignore ncRNAs such as microRNAs or long antisense RNAs. The main cause of this problem is the use of incomplete genome annotations. We present a complete screening for ncRNAs within 16 bat genomes. NcRNAs affect a remarkable variety of vital biological functions, including gene expression regulation, RNA processing, RNA interference and, as recently described, regulatory processes in viral infections. Within all investigated bat assemblies, we annotated 667 ncRNA families including 162 snoRNAs and 193 miRNAs as well as rRNAs, tRNAs, several snRNAs and lncRNAs, and other structural ncRNA elements. We validated our ncRNA candidates by six RNA-Seq data sets and show significant expression patterns that have never been described before in a bat species on such a large scale. Our annotations will be usable as a resource (rna.uni-jena.de/supplements/bats) for deeper studying of bat evolution, ncRNAs repertoire, gene expression and regulation, ecology and important host–virus interactions.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}