
E-Mail: sebastian.krautwurst@uni-jena.de
Room: 08S01
Phone: +49-3641-9-46483
Publications
2022
Fuesslin, Valeria; Krautwurst, Sebastian; Srivastava, Akash; Winter, Doris; Liedigk, Britta; Thye, Thorsten; Herrera-León, Silvia; Wohl, Shirlee; May, Jürgen; Fobil, Julius N.; Eibach, Daniel; Marz, Manja; Schuldt, Kathrin
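Prediction of Antibiotic Susceptibility Profiles of Vibrio cholerae Isolates From Whole Genome Illumina and Nanopore Sequencing Data: CholerAegon Journal Article
In: Front Microbiol, vol. 13, pp. 909692, 2022.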
@article{Fuesslin2022,
  title = {Prediction of Antibiotic Susceptibility Profiles of \textit{Vibrio cholerae} Isolates From Whole Genome {Illumina} and Nanopore Sequencing Data: {CholerAegon}},
  author = {Valeria Fuesslin and Sebastian Krautwurst and Akash Srivastava and Doris Winter and Britta Liedigk and Thorsten Thye and Silvia Herrera-León and Shirlee Wohl and Jürgen May and Julius N. Fobil and Daniel Eibach and Manja Marz and Kathrin Schuldt},
  url = {https://github.com/RaverJay/CholerAegon},
  doi = {10.3389/fmicb.2022.909692},
  year = {2022},
  date = {2022-06-22},
  journal = {Front Microbiol},
  volume = {13},
  pages = {909692},
  abstract = {During the last decades, antimicrobial resistance (AMR) has become a global public health concern. Nowadays multi-drug resistance is commonly observed in strains of Vibrio cholerae, the etiological agent of cholera. In order to limit the spread of pathogenic drug-resistant bacteria and to maintain treatment options the analysis of clinical samples and their AMR profiles are essential. Particularly, in low-resource settings a timely analysis of AMR profiles is often impaired due to lengthy culturing procedures for antibiotic susceptibility testing or lack of laboratory capacity. In this study, we explore the applicability of whole genome sequencing for the prediction of AMR profiles of V. cholerae. We developed the pipeline CholerAegon for the in silico prediction of AMR profiles of 82 V. cholerae genomes assembled from long and short sequencing reads. By correlating the predicted profiles with results from phenotypic antibiotic susceptibility testing we show that the prediction can replace in vitro susceptibility testing for five of seven antibiotics. Because of the relatively low costs, possibility for real-time data analyses, and portability, the Oxford Nanopore Technologies MinION sequencing platform—especially in light of an upcoming less error-prone technology for the platform—appears to be well suited for pathogen genomic analyses such as the one described here. Together with CholerAegon, it can leverage pathogen genomics to improve disease surveillance and to control further spread of antimicrobial resistance.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
2021
Brandt, Christian; Krautwurst, Sebastian; Spott, Riccardo; Lohde, Mara; Jundzill, Mateusz; Marquet, Mike; Hölzer, Martin
poreCov - An Easy to Use, Fast, and Robust Workflow for SARS-CoV-2 Genome Reconstruction via Nanopore Sequencing Journal Article
In: Front Genet, vol. 12, pp. 711437, 2021.
@article{Brandt2021,
  title = {{poreCov} - An Easy to Use, Fast, and Robust Workflow for {SARS-CoV-2} Genome Reconstruction via Nanopore Sequencing},
  author = {Christian Brandt and Sebastian Krautwurst and Riccardo Spott and Mara Lohde and Mateusz Jundzill and Mike Marquet and Martin Hölzer},
  url = {https://github.com/replikation/poreCov},
  doi = {10.3389/fgene.2021.711437},
  year = {2021},
  date = {2021-07-28},
  urldate = {2021-07-28},
  journal = {Front Genet},
  volume = {12},
  pages = {711437},
  abstract = {In response to the SARS-CoV-2 pandemic, a highly increased sequencing effort has been established worldwide to track and trace ongoing viral evolution. Technologies, such as nanopore sequencing via the ARTIC protocol are used to reliably generate genomes from raw sequencing data as a crucial base for molecular surveillance. However, for many labs that perform SARS-CoV-2 sequencing, bioinformatics is still a major bottleneck, especially if hundreds of samples need to be processed in a recurring fashion. Pipelines developed for short-read data cannot be applied to nanopore data. Therefore, specific long-read tools and parameter settings need to be orchestrated to enable accurate genotyping and robust reference-based genome reconstruction of SARS-CoV-2 genomes from nanopore data. Here we present poreCov, a highly parallel workflow written in Nextflow, using containers to wrap all the tools necessary for a routine SARS-CoV-2 sequencing lab into one program. The ease of installation, combined with concise summary reports that clearly highlight all relevant information, enables rapid and reliable analysis of hundreds of SARS-CoV-2 raw sequence data sets or genomes. poreCov is freely available on GitHub under the GNUv3 license: github.com/replikation/poreCov.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Brandt, Christian; Krautwurst, Sebastian; Spott, Riccardo; Lohde, Mara; Jundzill, Mateusz; Marquet, Mike; Hölzer, Martin
poreCov - an easy to use, fast, and robust workflow for SARS-CoV-2 genome reconstruction via nanopore sequencing Journal Article
In: bioRxiv, 2021, (Now published in Frontiers in Genetics: https://dx.doi.org/10.3389/fgene.2021.711437).
@article{Brandt:21,
  title = {{poreCov} - an easy to use, fast, and robust workflow for {SARS-CoV-2} genome reconstruction via nanopore sequencing},
  author = {Christian Brandt and Sebastian Krautwurst and Riccardo Spott and Mara Lohde and Mateusz Jundzill and Mike Marquet and Martin Hölzer},
  url = {https://github.com/replikation/poreCov},
  doi = {10.1101/2021.05.07.443089},
  year = {2021},
  date = {2021-05-07},
  urldate = {2021-05-07},
  journal = {bioRxiv},
  publisher = {Cold Spring Harbor Laboratory},
  abstract = {In response to the SARS-CoV-2 pandemic, a highly increased sequencing effort has been established worldwide to track and trace ongoing viral evolution. Technologies such as nanopore sequencing via the ARTIC protocol are used to reliably generate genomes from raw sequencing data as a crucial base for molecular surveillance. However, for many labs that perform SARS-CoV-2 sequencing, bioinformatics is still a major bottleneck, especially if hundreds of samples need to be processed in a recurring fashion. Pipelines developed for short-read data cannot be applied to nanopore data. Therefore, specific long-read tools and parameter settings need to be orchestrated to enable accurate genotyping and robust reference-based genome reconstruction of SARS-CoV-2 genomes from nanopore data. Here we present poreCov, a highly parallel workflow written in Nextflow, using containers to wrap all the tools necessary for a routine SARS-CoV-2 sequencing lab into one program. The ease of installation, combined with concise summary reports that clearly highlight all relevant information, enables rapid and reliable analysis of hundreds of SARS-CoV-2 raw sequence data sets or genomes. poreCov is freely available on GitHub under the GNUv3 license: github.com/replikation/poreCov.},
  note = {Now published in Frontiers in Genetics: https://dx.doi.org/10.3389/fgene.2021.711437},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Krautwurst, Sebastian; Dijkman, Ronald; Thiel, Volker; Krumbholz, Andi; Marz, Manja
Direct RNA Sequencing for Complete Viral Genomes Incollection
In: Frishman, Dmitrij; Marz, Manja (Ed.): Virus Bioinformatics, CRC Press, 2021.
@incollection{Krautwurst:21,
  title = {Direct {RNA} Sequencing for Complete Viral Genomes},
  author = {Sebastian Krautwurst and Ronald Dijkman and Volker Thiel and Andi Krumbholz and Manja Marz},
  editor = {Dmitrij Frishman and Manja Marz},
  url = {https://www.taylorfrancis.com/chapters/edit/10.1201/9781003097679-3/direct-rna-sequencing-complete-viral-genomes-sebastian-krautwurst-ronald-dijkman-volker-thiel-andi-krumbholz-manja-marz},
  doi = {10.1201/9781003097679-3},
  year = {2021},
  date = {2021-01-01},
  urldate = {2021-01-01},
  booktitle = {Virus Bioinformatics},
  publisher = {CRC Press},
  abstract = {Determination of nucleotide sequences present in biological samples (termed “sequencing”) has become a key method in almost all fields of bioscience, including virology. Since the advent of high-throughput sequencing (“second-generation sequencing”), it is possible to sequence millions of DNA fragments (“reads”) in parallel at very high accuracy, enabling the inference of single nucleotide polymorphisms (SNPs) between virus strains.
In this chapter, we provide details on how the long-read sequencing technologies (“third-generation sequencing”) which were developed in recent years have expanded the toolkit for researchers beyond the possibilities of short-read sequencing, with a focus on virus sequencing. With increased read lengths, it is possible to sequence full viral transcripts and genomes in single contiguous reads, enabling detailed studies of transcript isoforms, haplotypes, and viral quasispecies. In comparison, long-read technologies have generally higher raw read error rates, but an accurate assembly of transcripts and genomes is facilitated or made unnecessary due to the long contiguous sequences. One of the technologies, namely nanopore sequencing, also uniquely allows for direct RNA sequencing without the need for the creation or amplification of complementary DNA. This enables accurate capture of RNA content in a sample “as is,” e.g., in cells infected by RNA viruses. The protocol also leaves RNA modifications intact, which can be inferred during sequencing. Nanopore sequencing can be implemented at low costs and with constant genome coverage using cDNA amplicon sequencing methods, e.g., for highly parallel screening during virus outbreaks.},
  keywords = {},
  pubstate = {published},
  tppubtype = {incollection}
}
In this chapter, we provide details on how the long-read sequencing technologies (“third-generation sequencing”) which were developed in recent years have expanded the toolkit for researchers beyond the possibilities of short-read sequencing, with a focus on virus sequencing. With increased read lengths, it is possible to sequence full viral transcripts and genomes in single contiguous reads, enabling detailed studies of transcript isoforms, haplotypes, and viral quasispecies. In comparison, long-read technologies have generally higher raw read error rates, but an accurate assembly of transcripts and genomes is facilitated or made unnecessary due to the long contiguous sequences. One of the technologies, namely nanopore sequencing, also uniquely allows for direct RNA sequencing without the need for the creation or amplification of complementary DNA. This enables accurate capture of RNA content in a sample “as is,” e.g., in cells infected by RNA viruses. The protocol also leaves RNA modifications intact, which can be inferred during sequencing. Nanopore sequencing can be implemented at low costs and with constant genome coverage using cDNA amplicon sequencing methods, e.g., for highly parallel screening during virus outbreaks.
2019
Mostajo, Nelly F.; Lataretu, Marie; Krautwurst, Sebastian; Mock, Florian; Desirò, Daniel; Lamkiewicz, Kevin; Collatz, Maximilian; Schoen, Andreas; Weber, Friedemann; Marz, Manja; Hölzer, Martin
A comprehensive annotation and differential expression analysis of short and long non-coding RNAs in 16 bat genomes Journal Article
In: NAR Genomics Bioinf, vol. 2, no. 1, pp. lqz006, 2019.
@article{Mostajo:20,
  title = {A comprehensive annotation and differential expression analysis of short and long non-coding {RNAs} in 16 bat genomes},
  author = {Nelly F. Mostajo and Marie Lataretu and Sebastian Krautwurst and Florian Mock and Daniel Desirò and Kevin Lamkiewicz and Maximilian Collatz and Andreas Schoen and Friedemann Weber and Manja Marz and Martin Hölzer},
  url = {https://www.rna.uni-jena.de/supplements/bats/index.html},
  doi = {10.1093/nargab/lqz006},
  year = {2019},
  date = {2019-09-30},
  urldate = {2019-09-30},
  journal = {NAR Genomics Bioinf},
  volume = {2},
  number = {1},
  pages = {lqz006},
  abstract = {Although bats are increasingly becoming the focus of scientific studies due to their unique properties, these exceptional animals are still among the least studied mammals. Assembly quality and completeness of bat genomes vary a lot and especially non-coding RNA (ncRNA) annotations are incomplete or simply missing. Accordingly, standard bioinformatics pipelines for gene expression analysis often ignore ncRNAs such as microRNAs or long antisense RNAs. The main cause of this problem is the use of incomplete genome annotations. We present a complete screening for ncRNAs within 16 bat genomes. NcRNAs affect a remarkable variety of vital biological functions, including gene expression regulation, RNA processing, RNA interference and, as recently described, regulatory processes in viral infections. Within all investigated bat assemblies, we annotated 667 ncRNA families including 162 snoRNAs and 193 miRNAs as well as rRNAs, tRNAs, several snRNAs and lncRNAs, and other structural ncRNA elements. We validated our ncRNA candidates by six RNA-Seq data sets and show significant expression patterns that have never been described before in a bat species on such a large scale. Our annotations will be usable as a resource (rna.uni-jena.de/supplements/bats) for deeper studying of bat evolution, ncRNAs repertoire, gene expression and regulation, ecology and important host–virus interactions.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Viehweger, Adrian; Krautwurst, Sebastian; Lamkiewicz, Kevin; Madhugiri, Ramakanth; Ziebuhr, John; Hölzer, Martin; Marz, Manja
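Direct RNA nanopore sequencing of full-length coronavirus genomes provides novel insights into structural variants and enables modification analysis Journal Article
In: Genome Res, vol. 29, pp. 1545–1554, 2019.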
@article{Viehweger:19a,
  title = {Direct {RNA} nanopore sequencing of full-length coronavirus genomes provides novel insights into structural variants and enables modification analysis},
  author = {Adrian Viehweger and Sebastian Krautwurst and Kevin Lamkiewicz and Ramakanth Madhugiri and John Ziebuhr and Martin Hölzer and Manja Marz},
  doi = {10.1101/gr.247064.118},
  year = {2019},
  date = {2019-08-22},
  urldate = {2019-08-22},
  journal = {Genome Res},
  volume = {29},
  pages = {1545--1554},
  publisher = {Cold Spring Harbor Laboratory},
  abstract = {Sequence analyses of RNA virus genomes remain challenging owing to the exceptional genetic plasticity of these viruses. Because of high mutation and recombination rates, genome replication by viral RNA-dependent RNA polymerases leads to populations of closely related viruses, so-called “quasispecies.” Standard (short-read) sequencing technologies are ill-suited to reconstruct large numbers of full-length haplotypes of (1) RNA virus genomes and (2) subgenome-length (sg) RNAs composed of noncontiguous genome regions. Here, we used a full-length, direct RNA sequencing (DRS) approach based on nanopores to characterize viral RNAs produced in cells infected with a human coronavirus. By using DRS, we were able to map the longest (∼26-kb) contiguous read to the viral reference genome. By combining Illumina and Oxford Nanopore sequencing, we reconstructed a highly accurate consensus sequence of the human coronavirus (HCoV)-229E genome (27.3 kb). Furthermore, by using long reads that did not require an assembly step, we were able to identify, in infected cells, diverse and novel HCoV-229E sg RNAs that remain to be characterized. Also, the DRS approach, which circumvents reverse transcription and amplification of RNA, allowed us to detect methylation sites in viral RNAs. Our work paves the way for haplotype-based analyses of viral quasispecies by showing the feasibility of intra-sample haplotype separation. Even though several technical challenges remain to be addressed to exploit the potential of the nanopore technology fully, our work illustrates that DRS may significantly advance genomic studies of complex virus populations, including predictions on long-range interactions in individual full-length viral RNA haplotypes.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
Viehweger, Adrian; Krautwurst, Sebastian; Koenig, Brigitte; Marz, Manja
An encoding of genome content for machine learning Journal Article
In: bioRxiv, pp. 524280, 2019.
@article{Viehweger:19,
  author     = {Adrian Viehweger and Sebastian Krautwurst and Brigitte Koenig and Manja Marz},
  title      = {An encoding of genome content for machine learning},
  journal    = {bioRxiv},
  publisher  = {Cold Spring Harbor Laboratory},
  pages      = {524280},
  year       = {2019},
  date       = {2019-01-18},
  urldate    = {2019-01-18},
  doi        = {10.1101/524280},
  url        = {https://github.com/phiweger/nanotext},
  abstract   = {An ever-growing number of metagenomes can be used for biomining and the study of microbial functions. The use of learning algorithms in this context has been hindered, because they often need input in the form of low-dimensional, dense vectors of numbers. We propose such a representation for genomes called nanotext that scales to very large data sets.
The underlying model is learned from a corpus of nearly 150 thousand genomes spanning 750 million protein domains. We treat the protein domains in a genome like words in a document, assuming that protein domains in a similar context have similar “meaning”. This meaning can be distributed by a neural net over a vector of numbers.
The resulting vectors efficiently encode function, preserve known phylogeny, capture subtle functional relationships and are robust against genome incompleteness. The “functional” distance between two vectors complements nucleotide-based distance, so that genomes can be identified as similar even though their nucleotide identity is low. nanotext can thus encode (meta)genomes for direct use in downstream machine learning tasks. We show this by predicting plausible culture media for metagenome assembled genomes (MAGs) from the Tara Oceans Expedition using their genome content only. nanotext is freely released under a BSD licence (https://github.com/phiweger/nanotext).},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article}
}
The underlying model is learned from a corpus of nearly 150 thousand genomes spanning 750 million protein domains. We treat the protein domains in a genome like words in a document, assuming that protein domains in a similar context have similar “meaning”. This meaning can be distributed by a neural net over a vector of numbers.
The resulting vectors efficiently encode function, preserve known phylogeny, capture subtle functional relationships and are robust against genome incompleteness. The “functional” distance between two vectors complements nucleotide-based distance, so that genomes can be identified as similar even though their nucleotide identity is low. nanotext can thus encode (meta)genomes for direct use in downstream machine learning tasks. We show this by predicting plausible culture media for metagenome assembled genomes (MAGs) from the Tara Oceans Expedition using their genome content only. nanotext is freely released under a BSD licence (https://github.com/phiweger/nanotext).