2024
Spangenberg, Jannes; Mündnich, Stefan; Busch, Anne; Pastore, Stefan; Wierczeiko, Anna; Goettsch, Winfried; Dietrich, Vincent; Pryszcz, Leszek P.; Cruciani, Sonia; Novoa, Eva Maria; Joshi, Kandarp; Perera, Ranjan; Di Giorgio, Salvatore; Arrubarrena, Paola; Tellioglu, Irem; Poon, Chi-Lam; Wan, Yuk Kei; Göke, Jonathan; Hildebrandt, Andreas; Dieterich, Christoph; Helm, Mark; Marz, Manja; Gerber, Susanne; Alagna, Nicolo
The RMaP challenge of predicting RNA modifications by nanopore sequencing Journal Article
In: Communications Chemistry, vol. 8, iss. 1, 2024.
Abstract | Links | BibTeX | Tags: machine learning, nanopore, nucleic acid modifications, RNA / transcriptomics
@article{nokey_79,
  title      = {The {RMaP} challenge of predicting {RNA} modifications by nanopore sequencing},
  author     = {Jannes Spangenberg and Stefan Mündnich and Anne Busch and Stefan Pastore and Anna Wierczeiko and Winfried Goettsch and Vincent Dietrich and Leszek P. Pryszcz and Sonia Cruciani and Eva Maria Novoa and Kandarp Joshi and Ranjan Perera and Di Giorgio, Salvatore and Paola Arrubarrena and Irem Tellioglu and Chi-Lam Poon and Yuk Kei Wan and Jonathan Göke and Andreas Hildebrandt and Christoph Dieterich and Mark Helm and Manja Marz and Susanne Gerber and Nicolo Alagna},
  doi        = {10.1038/s42004-025-01507-0},
  year       = {2024},
  date       = {2024-12-04},
  urldate    = {2024-12-04},
  journal    = {Communications Chemistry},
  volume     = {8},
  number     = {1},
  abstract   = {The field of epitranscriptomics is undergoing a technology-driven revolution. During past decades, RNA modifications like N6-methyladenosine (m6A), pseudouridine (ψ), and 5-methylcytosine (m5C) became acknowledged for playing critical roles in cellular processes. Direct RNA sequencing by Oxford Nanopore Technologies (ONT) enabled the detection of modifications in native RNA, by detecting noncanonical RNA nucleosides properties in raw data. Consequently, the field’s cutting edge has a heavy component in computer science, opening new avenues of cooperation across the community, as exchanging data is as impactful as exchanging samples. Therefore, we seize the occasion to bring scientists together within the RNA Modification and Processing (RMaP) challenge to advance solutions for RNA modification detection and discuss ideas, problems and approaches. We show several computational methods to detect the most researched mRNA modifications (m6A, ψ, and m5C). Results demonstrate that a low prediction error and a high prediction accuracy can be achieved on these modifications across different approaches and algorithms. The RMaP challenge marks a substantial step towards improving algorithms’ comparability, reliability, and consistency in RNA modification prediction. It points out the deficits in this young field that need to be addressed in further challenges.},
  keywords   = {machine learning, nanopore, nucleic acid modifications, RNA / transcriptomics},
  pubstate   = {published},
  tppubtype  = {article}
}
Höner zu Siederdissen, Christian; Spangenberg, Jannes; Bisdorf, Kevin; Krautwurst, Sebastian; Srivastava, Akash; Marz, Manja; Taubert, Martin
Nanopore sequencing enables novel detection of deuterium incorporation in DNA Journal Article
In: Computational and Structural Biotechnology Journal, vol. 23, 2024.
Abstract | Links | BibTeX | Tags: bacteria, DNA / genomics, machine learning, metagenomics, nanopore, nucleic acid modifications
@article{nokey_74,
  title      = {Nanopore sequencing enables novel detection of deuterium incorporation in {DNA}},
  author     = {Christian {Höner zu Siederdissen} and Jannes Spangenberg and Kevin Bisdorf and Sebastian Krautwurst and Akash Srivastava and Manja Marz and Martin Taubert},
  doi        = {10.1016/j.csbj.2024.09.027},
  year       = {2024},
  date       = {2024-10-03},
  urldate    = {2024-10-03},
  journal    = {Computational and Structural Biotechnology Journal},
  volume     = {23},
  abstract   = {Identifying active microbes is crucial to understand their role in ecosystem functions. Metabolic labeling with heavy, non-radioactive isotopes, i.e., stable isotope probing (SIP), can track active microbes by detecting heavy isotope incorporation in biomolecules such as DNA. However, the detection of heavy isotope-labeled nucleotides directly during sequencing has, to date, not been achieved. In this study, Oxford nanopore sequencing was utilized to detect heavy isotopes incorporation in DNA molecules. Two isotopes widely used in SIP experiments were employed to label a bacterial isolate: deuterium (D, as D2O) and carbon-13 (13C, as glucose). We hypothesize that labeled DNA is distinguishable from unlabeled DNA by changes in the nanopore signal. To verify this distinction, we employed a Bayesian classifier trained on signal distributions of short oligonucleotides (k-mers) from labeled and unlabeled sequencing reads. Our results show a clear distinction between D-labeled and unlabeled reads, based on changes in median and median absolute deviation (MAD) of the nanopore signals for different k-mers. In contrast, 13C-labeled DNA cannot be distinguished from unlabeled DNA. For D, the model employed correctly predicted more than 85% of the reads. Even when metabolic labeling was conducted with only 30% D2O, 80% of the obtained reads were correctly classified with a 5% false discovery rate. Our work demonstrates the feasibility of direct detection of deuterium incorporation in DNA molecules during Oxford nanopore sequencing. This finding represents a first step in establishing the combined use of nanopore sequencing and SIP for tracking active organisms in microbial ecology.},
  keywords   = {bacteria, DNA / genomics, machine learning, metagenomics, nanopore, nucleic acid modifications},
  pubstate   = {published},
  tppubtype  = {article}
}
2022
Mock, Florian
Context sensitive neural networks for the classification of DNA, RNA and protein sequences PhD Thesis
2022.
Links | BibTeX | Tags: classification, machine learning
@phdthesis{nokey_37,
  title      = {Context sensitive neural networks for the classification of {DNA}, {RNA} and protein sequences},
  author     = {Florian Mock},
  school     = {Friedrich-Schiller-Universität Jena},
  url        = {https://suche.thulb.uni-jena.de/Record/1820176673},
  year       = {2022},
  date       = {2022-09-05},
  keywords   = {classification, machine learning},
  pubstate   = {published},
  tppubtype  = {phdthesis}
}
Mock, Florian; Kretschmer, Fleming; Kriese, Anton; Böcker, Sebastian; Marz, Manja
Taxonomic classification of DNA sequences beyond sequence similarity using deep neural networks Journal Article
In: Proc Natl Acad Sci, vol. 119, iss. 35, pp. e2122636119, 2022.
Abstract | Links | BibTeX | Tags: classification, DNA / genomics, machine learning
@article{Mock2022,
  title      = {Taxonomic classification of {DNA} sequences beyond sequence similarity using deep neural networks},
  author     = {Florian Mock and Fleming Kretschmer and Anton Kriese and Sebastian Böcker and Manja Marz},
  doi        = {10.1073/pnas.2122636119},
  year       = {2022},
  date       = {2022-08-30},
  journal    = {Proceedings of the National Academy of Sciences},
  volume     = {119},
  number     = {35},
  pages      = {e2122636119},
  abstract   = {Taxonomic classification, that is, the assignment to biological clades with shared ancestry, is a common task in genetics, mainly based on a genome similarity search of large genome databases. The classification quality depends heavily on the database, since representative relatives must be present. Many genomic sequences cannot be classified at all or only with a high misclassification rate. Here we present BERTax, a deep neural network program based on natural language processing to precisely classify the superkingdom and phylum of DNA sequences taxonomically without the need for a known representative relative from a database. We show BERTax to be at least on par with the state-of-the-art approaches when taxonomically similar species are part of the training data. For novel organisms, however, BERTax clearly outperforms any existing approach. Finally, we show that BERTax can also be combined with database approaches to further increase the prediction quality in almost all cases. Since BERTax is not based on similar entries in databases, it allows precise taxonomic classification of a broader range of genomic sequences, thus increasing the overall information gain.},
  keywords   = {classification, DNA / genomics, machine learning},
  pubstate   = {published},
  tppubtype  = {article}
}
2021
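Collatz, Maximilian
Two Stories about Trying to Trace the Untraceable: B-Cell Epitope Prediction and Deciphering Circadian Clocks PhD Thesis
2021.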
Links | BibTeX | Tags: differential expression analysis, machine learning, RNA / transcriptomics
@phdthesis{nokey,
  title      = {Two Stories about Trying to Trace the Untraceable: {B-Cell} Epitope Prediction and Deciphering Circadian Clocks},
  author     = {Maximilian Collatz},
  school     = {Friedrich-Schiller-Universität Jena},
  url        = {https://suche.thulb.uni-jena.de/Record/1767090838},
  year       = {2021},
  date       = {2021-07-30},
  urldate    = {2021-01-01},
  keywords   = {differential expression analysis, machine learning, RNA / transcriptomics},
  pubstate   = {published},
  tppubtype  = {phdthesis}
}
Warnat-Herresthal, Stefanie; Schultze, Hartmut; Shastry, Krishnaprasad Lingadahalli; Manamohan, Sathyanarayanan; Mukherjee, Saikat; Garg, Vishesh; Sarveswara, Ravi; Händler, Kristian; Pickkers, Peter; Aziz, N. Ahmad; Ktena, Sofia; Tran, Florian; Bitzer, Michael; Ossowski, Stephan; Casadei, Nicolas; Herr, Christian; Petersheim, Daniel; Behrends, Uta; Kern, Fabian; Fehlmann, Tobias; Schommers, Philipp; Lehmann, Clara; Augustin, Max; Rybniker, Jan; Altmüller, Janine; Mishra, Neha; Bernardes, Joana P.; Krämer, Benjamin; Bonaguro, Lorenzo; Schulte-Schrepping, Jonas; De Domenico, Elena; Siever, Christian; Kraut, Michael; Desai, Milind; Monnet, Bruno; Saridaki, Maria; Siegel, Charles Martin; Drews, Anna; Nuesch-Germano, Melanie; Theis, Heidi; Heyckendorf, Jan; Schreiber, Stefan; Kim-Hellmuth, Sarah; COVID-19 Aachen Study (COVAS); Nattermann, Jacob; Skowasch, Dirk; Kurth, Ingo; Keller, Andreas; Bals, Robert; Nürnberg, Peter; Rieß, Olaf; Rosenstiel, Philip; Netea, Mihai G.; Theis, Fabian; Mukherjee, Sach; Backes, Michael; Aschenbrenner, Anna C.; Ulas, Thomas; Deutsche COVID-19 Omics Initiative (DeCOI); Breteler, Monique M. B.; Giamarellos-Bourboulis, Evangelos J.; Kox, Matthijs; Becker, Matthias; Cheran, Sorin; Woodacre, Michael S.; Goh, Eng Lim; Schultze, Joachim L.
Swarm Learning for decentralized and confidential clinical machine learning Journal Article
In: Nature, vol. 594, no. 7862, pp. 265–270, 2021.
Abstract | Links | BibTeX | Tags: coronavirus, machine learning, viruses
@article{Warnat-Herresthal:21,
  title      = {{Swarm Learning} for decentralized and confidential clinical machine learning},
  author     = {Stefanie Warnat-Herresthal and Hartmut Schultze and Krishnaprasad Lingadahalli Shastry and Sathyanarayanan Manamohan and Saikat Mukherjee and Vishesh Garg and Ravi Sarveswara and Kristian Händler and Peter Pickkers and N. Ahmad Aziz and Sofia Ktena and Florian Tran and Michael Bitzer and Stephan Ossowski and Nicolas Casadei and Christian Herr and Daniel Petersheim and Uta Behrends and Fabian Kern and Tobias Fehlmann and Philipp Schommers and Clara Lehmann and Max Augustin and Jan Rybniker and Janine Altmüller and Neha Mishra and Joana P. Bernardes and Benjamin Krämer and Lorenzo Bonaguro and Jonas Schulte-Schrepping and De Domenico, Elena and Christian Siever and Michael Kraut and Milind Desai and Bruno Monnet and Maria Saridaki and Charles Martin Siegel and Anna Drews and Melanie Nuesch-Germano and Heidi Theis and Jan Heyckendorf and Stefan Schreiber and Sarah Kim-Hellmuth and {COVID-19 Aachen Study (COVAS)} and Jacob Nattermann and Dirk Skowasch and Ingo Kurth and Andreas Keller and Robert Bals and Peter Nürnberg and Olaf Rieß and Philip Rosenstiel and Mihai G. Netea and Fabian Theis and Sach Mukherjee and Michael Backes and Anna C. Aschenbrenner and Thomas Ulas and {Deutsche COVID-19 Omics Initiative (DeCOI)} and Monique M. B. Breteler and Evangelos J. Giamarellos-Bourboulis and Matthijs Kox and Matthias Becker and Sorin Cheran and Michael S. Woodacre and Eng Lim Goh and Joachim L. Schultze},
  doi        = {10.1038/s41586-021-03583-3},
  year       = {2021},
  date       = {2021-05-26},
  urldate    = {2021-05-26},
  journal    = {Nature},
  volume     = {594},
  number     = {7862},
  pages      = {265--270},
  abstract   = {Fast and reliable detection of patients with severe and heterogeneous illnesses is a major goal of precision medicine. Patients with leukaemia can be identified using machine learning on the basis of their blood transcriptomes. However, there is an increasing divide between what is technically possible and what is allowed, because of privacy legislation. Here, to facilitate the integration of any medical data from any data owner worldwide without violating privacy laws, we introduce Swarm Learning—a decentralized machine-learning approach that unites edge computing, blockchain-based peer-to-peer networking and coordination while maintaining confidentiality without the need for a central coordinator, thereby going beyond federated learning. To illustrate the feasibility of using Swarm Learning to develop disease classifiers using distributed data, we chose four use cases of heterogeneous diseases (COVID-19, tuberculosis, leukaemia and lung pathologies). With more than 16,400 blood transcriptomes derived from 127 clinical studies with non-uniform distributions of cases and controls and substantial study biases, as well as more than 95,000 chest X-ray images, we show that Swarm Learning classifiers outperform those developed at individual sites. In addition, Swarm Learning completely fulfils local confidentiality regulations by design. We believe that this approach will notably accelerate the introduction of precision medicine.},
  keywords   = {coronavirus, machine learning, viruses},
  pubstate   = {published},
  tppubtype  = {article}
}
Mock, Florian; Marz, Manja
Sequence Classification with Machine Learning at the Example of Viral Host Prediction Book Section
In: Frishman, Dmitrij; Marz, Manja (Ed.): Virus Bioinformatics, CRC Press, 2021.
Abstract | Links | BibTeX | Tags: classification, machine learning, virus host interaction, viruses
@incollection{Mock:21,
  author     = {Florian Mock and Manja Marz},
  editor     = {Dmitrij Frishman and Manja Marz},
  title      = {Sequence Classification with Machine Learning at the Example of Viral Host Prediction},
  booktitle  = {Virus Bioinformatics},
  publisher  = {CRC Press},
  year       = {2021},
  date       = {2021-01-01},
  urldate    = {2021-01-01},
  doi        = {10.1201/9781003097679-10},
  keywords   = {classification, machine learning, virus host interaction, viruses},
  abstract   = {Sequence classification is a common task in modern virus bioinformatics research. DNA, RNA, or protein sequences are either filtered for certain properties or the properties of a sequence are to be determined. This task is a very diverse problem. The previous knowledge about the data and also the amount of usable data differ for each project. Also the classification task itself is highly diverse. An additional difficulty is that even today for most biological questions, especially in virology, we lack some set of measurable properties (features) that always explain our observations. Here, we introduce machine learning for viral sequence classification. Together with the reader, we build a deep neural network (DNN) pipeline to classify the host of an influenza A virus from its genome sequence with great accuracy. This result may be somewhat surprising since, despite years of research, we lack a set of properties that lead to highly accurate predictions, and currently, more exceptions are often found than new features. Deep learning can automatically identify a trainable set of features and their dependencies with higher predictive power than previous approaches. This work may serve as a starting point to encourage researchers in virology to use machine learning. Using viral host prediction as an example, we will be discussing classical pitfalls such as data quantity and quality.},
  pubstate   = {published},
  tppubtype  = {incollection}
}
2020
Collatz, Maximilian; Mock, Florian; Barth, Emanuel; Hölzer, Martin; Sachse, Konrad; Marz, Manja
EpiDope: A Deep Neural Network for linear B-cell epitope prediction Journal Article
In: Bioinformatics, vol. 37, no. 4, pp. 448–455, 2020.
Abstract | Links | BibTeX | Tags: machine learning, software, virus host interaction, viruses
@article{Collatz:20,
  title      = {{EpiDope}: A Deep Neural Network for linear {B-cell} epitope prediction},
  author     = {Maximilian Collatz and Florian Mock and Emanuel Barth and Martin Hölzer and Konrad Sachse and Manja Marz},
  editor     = {Lenore Cowen},
  url        = {https://github.com/rnajena/EpiDope},
  doi        = {10.1093/bioinformatics/btaa773},
  year       = {2020},
  date       = {2020-09-11},
  urldate    = {2020-09-11},
  journal    = {Bioinformatics},
  volume     = {37},
  number     = {4},
  pages      = {448--455},
  publisher  = {Oxford University Press (OUP)},
  abstract   = {By binding to specific structures on antigenic proteins, the so-called epitopes, B-cell antibodies can neutralize pathogens. The identification of B-cell epitopes is of great value for the development of specific serodiagnostic assays and the optimization of medical therapy. However, identifying diagnostically or therapeutically relevant epitopes is a challenging task that usually involves extensive laboratory work. In this study, we show that the time, cost and labor-intensive process of epitope detection in the lab can be significantly reduced using in silico prediction.
Here, we present EpiDope, a python tool which uses a deep neural network to detect linear B-cell epitope regions on individual protein sequences. With an area under the curve between 0.67 ± 0.07 in the receiver operating characteristic curve, EpiDope exceeds all other currently used linear B-cell epitope prediction tools. Our software is shown to reliably predict linear B-cell epitopes of a given protein sequence, thus contributing to a significant reduction of laboratory experiments and costs required for the conventional approach.},
  keywords   = {machine learning, software, virus host interaction, viruses},
  pubstate   = {published},
  tppubtype  = {article}
}
Here, we present EpiDope, a python tool which uses a deep neural network to detect linear B-cell epitope regions on individual protein sequences. With an area under the curve between 0.67 ± 0.07 in the receiver operating characteristic curve, EpiDope exceeds all other currently used linear B-cell epitope prediction tools. Our software is shown to reliably predict linear B-cell epitopes of a given protein sequence, thus contributing to a significant reduction of laboratory experiments and costs required for the conventional approach.
Mock, Florian; Viehweger, Adrian; Barth, Emanuel; Marz, Manja
VIDHOP, viral host prediction with Deep Learning Journal Article
In: Bioinformatics, vol. 37, no. 3, pp. 318–325, 2020.
Abstract | Links | BibTeX | Tags: machine learning, software, virus host interaction, viruses
@article{Mock:20,
  title      = {{VIDHOP}, viral host prediction with Deep Learning},
  author     = {Florian Mock and Adrian Viehweger and Emanuel Barth and Manja Marz},
  editor     = {Jinbo Xu},
  url        = {https://github.com/rnajena/vidhop},
  doi        = {10.1093/bioinformatics/btaa705},
  year       = {2020},
  date       = {2020-08-10},
  urldate    = {2020-08-10},
  journal    = {Bioinformatics},
  volume     = {37},
  number     = {3},
  pages      = {318--325},
  publisher  = {Oxford University Press (OUP)},
  abstract   = {Zoonosis, the natural transmission of infections from animals to humans, is a far-reaching global problem. The recent outbreaks of Zikavirus, Ebolavirus and Coronavirus are examples of viral zoonosis, which occur more frequently due to globalization. In case of a virus outbreak, it is helpful to know which host organism was the original carrier of the virus to prevent further spreading of viral infection. Recent approaches aim to predict a viral host based on the viral genome, often in combination with the potential host genome and arbitrarily selected features. These methods are limited in the number of different hosts they can predict or the accuracy of the prediction.
Here, we present a fast and accurate deep learning approach for viral host prediction, which is based on the viral genome sequence only. We tested our deep neural network (DNN) on three different virus species (influenza A virus, rabies lyssavirus and rotavirus A). We achieved for each virus species an AUC between 0.93 and 0.98, allowing highly accurate predictions while using only fractions (100–400 bp) of the viral genome sequences. We show that deep neural networks are suitable to predict the host of a virus, even with a limited amount of sequences and highly unbalanced available data. The trained DNNs are the core of our virus–host prediction tool VIrus Deep learning HOst Prediction (VIDHOP). VIDHOP also allows the user to train and use models for other viruses.},
  keywords   = {machine learning, software, virus host interaction, viruses},
  pubstate   = {published},
  tppubtype  = {article}
}
Here, we present a fast and accurate deep learning approach for viral host prediction, which is based on the viral genome sequence only. We tested our deep neural network (DNN) on three different virus species (influenza A virus, rabies lyssavirus and rotavirus A). We achieved for each virus species an AUC between 0.93 and 0.98, allowing highly accurate predictions while using only fractions (100–400 bp) of the viral genome sequences. We show that deep neural networks are suitable to predict the host of a virus, even with a limited amount of sequences and highly unbalanced available data. The trained DNNs are the core of our virus–host prediction tool VIrus Deep learning HOst Prediction (VIDHOP). VIDHOP also allows the user to train and use models for other viruses.
Samuel, Sheeba; Shadaydeh, Maha; Böcker, Sebastian; Brügmann, Bernd; Bucher, Solveig Franziska; Deckert, Volker; Denzler, Joachim; Dittrich, Peter; Eggeling, Ferdinand; Güllmar, Daniel; Guntinas-Lichius, Orlando; König-Ries, Birgitta; Löffler, Frank; Maicher, Lutz; Marz, Manja; Migliavacca, Mirco; Reichenbach, Jürgen R.; Reichstein, Markus; Römermann, Christine; Wittig, Andrea
A virtual "Werkstatt" for digitization in the sciences Journal Article
In: Res Ideas Outcomes, vol. 6, pp. e54106, 2020.
Abstract | Links | BibTeX | Tags: machine learning
@article{Samuel:20,
  title      = {A virtual {``Werkstatt''} for digitization in the sciences},
  author     = {Sheeba Samuel and Maha Shadaydeh and Sebastian Böcker and Bernd Brügmann and Solveig Franziska Bucher and Volker Deckert and Joachim Denzler and Peter Dittrich and Ferdinand Eggeling and Daniel Güllmar and Orlando Guntinas-Lichius and Birgitta König-Ries and Frank Löffler and Lutz Maicher and Manja Marz and Mirco Migliavacca and Jürgen R. Reichenbach and Markus Reichstein and Christine Römermann and Andrea Wittig},
  doi        = {10.3897/rio.6.e54106},
  year       = {2020},
  date       = {2020-05-11},
  urldate    = {2020-01-01},
  journal    = {Research Ideas and Outcomes},
  volume     = {6},
  pages      = {e54106},
  publisher  = {Pensoft Publishers},
  abstract   = {Data is central in almost all scientific disciplines nowadays. Furthermore, intelligent systems have developed rapidly in recent years, so that in many disciplines the expectation is emerging that with the help of intelligent systems, significant challenges can be overcome and science can be done in completely new ways. In order for this to succeed, however, first, fundamental research in computer science is still required, and, second, generic tools must be developed on which specialized solutions can be built. In this paper, we introduce a recently started collaborative project funded by the Carl Zeiss Foundation, a virtual manufactory for digitization in the sciences, the “Werkstatt”, which is being established at the Michael Stifel Center Jena (MSCJ) for data-driven and simulation science to address fundamental questions in computer science and applications. The Werkstatt focuses on three key areas, which include generic tools for machine learning, knowledge generation using machine learning processes, and semantic methods for the data life cycle, as well as the application of these topics in different disciplines. Core and pilot projects address the key aspects of the topics and form the basis for sustainable work in the Werkstatt.},
  keywords   = {machine learning},
  pubstate   = {published},
  tppubtype  = {article}
}
2019
Viehweger, Adrian; Krautwurst, Sebastian; Koenig, Brigitte; Marz, Manja
An encoding of genome content for machine learning Journal Article
In: bioRxiv, pp. 524280, 2019.
Abstract | Links | BibTeX | Tags: assembly, machine learning, metagenomics
@article{Viehweger:19,
  author     = {Adrian Viehweger and Sebastian Krautwurst and Brigitte Koenig and Manja Marz},
  title      = {An encoding of genome content for machine learning},
  journal    = {bioRxiv},
  pages      = {524280},
  publisher  = {Cold Spring Harbor Laboratory},
  year       = {2019},
  date       = {2019-01-18},
  urldate    = {2019-01-18},
  doi        = {10.1101/524280},
  url        = {https://github.com/phiweger/nanotext},
  keywords   = {assembly, machine learning, metagenomics},
  abstract   = {An ever-growing number of metagenomes can be used for biomining and the study of microbial functions. The use of learning algorithms in this context has been hindered, because they often need input in the form of low-dimensional, dense vectors of numbers. We propose such a representation for genomes called nanotext that scales to very large data sets.
The underlying model is learned from a corpus of nearly 150 thousand genomes spanning 750 million protein domains. We treat the protein domains in a genome like words in a document, assuming that protein domains in a similar context have similar “meaning”. This meaning can be distributed by a neural net over a vector of numbers.
The resulting vectors efficiently encode function, preserve known phylogeny, capture subtle functional relationships and are robust against genome incompleteness. The “functional” distance between two vectors complements nucleotide-based distance, so that genomes can be identified as similar even though their nucleotide identity is low. nanotext can thus encode (meta)genomes for direct use in downstream machine learning tasks. We show this by predicting plausible culture media for metagenome assembled genomes (MAGs) from the Tara Oceans Expedition using their genome content only. nanotext is freely released under a BSD licence (https://github.com/phiweger/nanotext).},
  pubstate   = {published},
  tppubtype  = {article}
}
The underlying model is learned from a corpus of nearly 150 thousand genomes spanning 750 million protein domains. We treat the protein domains in a genome like words in a document, assuming that protein domains in a similar context have similar “meaning”. This meaning can be distributed by a neural net over a vector of numbers.
The resulting vectors efficiently encode function, preserve known phylogeny, capture subtle functional relationships and are robust against genome incompleteness. The “functional” distance between two vectors complements nucleotide-based distance, so that genomes can be identified as similar even though their nucleotide identity is low. nanotext can thus encode (meta)genomes for direct use in downstream machine learning tasks. We show this by predicting plausible culture media for metagenome assembled genomes (MAGs) from the Tara Oceans Expedition using their genome content only. nanotext is freely released under a BSD licence (https://github.com/phiweger/nanotext).
2018
Lamkiewicz, Kevin; Barth, Emanuel; Marz, Manja; Ibrahim, Bashar
Identification of potential microRNAs associated with Herpesvirus family based on bioinformatic analysis Journal Article
In: bioRxiv, pp. 417782, 2018.
Abstract | Links | BibTeX | Tags: machine learning, ncRNAs, RNA / transcriptomics, viruses
@article{Lamkiewicz:18,
  title      = {Identification of potential {microRNAs} associated with {Herpesvirus} family based on bioinformatic analysis},
  author     = {Kevin Lamkiewicz and Emanuel Barth and Manja Marz and Bashar Ibrahim},
  doi        = {10.1101/417782},
  year       = {2018},
  date       = {2018-11-09},
  urldate    = {2018-11-09},
  journal    = {bioRxiv},
  pages      = {417782},
  publisher  = {Cold Spring Harbor Laboratory},
  abstract   = {MicroRNAs (miRNAs) are known key regulators of gene expression on posttranscriptional level in many organisms encoded in mammals, plants and also several viral families. To date, no homologous gene of a virus-originated miRNA is known in other organisms. To date, only a few homologous miRNA between two different viruses are known, however, no gene of a virus-originated miRNA is known in any organism of other kingdoms. This can be attributed to the fact that classical miRNA detection approaches such as homology-based predictions fail at viruses due to their highly diverse genomes and their high mutation rate.
Here, we applied the virus-derived precursor miRNA (pre-miRNA) prediction pipeline ViMiFi, which combines information about sequence conservation and machine learning-based approaches, on Human Herpesvirus 7 (HHV7) and Epstein-Barr virus (EBV). ViMiFi was able to predict 61 candidates in EBV, which has 25 known pre-miRNAs. From these 25, ViMiFi identified 20. It was further able to predict 18 candidates in the HHV7 genome, in which no miRNA had been described yet. We also studied the undescribed candidates of both viruses for potential functions and found similarities with human snRNAs and miRNAs from mammals and plants.},
  keywords   = {machine learning, ncRNAs, RNA / transcriptomics, viruses},
  pubstate   = {published},
  tppubtype  = {article}
}
Here, we applied the virus-derived precursor miRNA (pre-miRNA) prediction pipeline ViMiFi, which combines information about sequence conservation and machine learning-based approaches, on Human Herpesvirus 7 (HHV7) and Epstein-Barr virus (EBV). ViMiFi was able to predict 61 candidates in EBV, which has 25 known pre-miRNAs. From these 25, ViMiFi identified 20. It was further able to predict 18 candidates in the HHV7 genome, in which no miRNA had been described yet. We also studied the undescribed candidates of both viruses for potential functions and found similarities with human snRNAs and miRNAs from mammals and plants.
