Models and Information-Theoretic Bounds for Nanopore Sequencing. Mao, W., Diggavi, S. N., & Kannan, S. IEEE Transactions on Information Theory, 64(4):3216-3236, April, 2018.
Arxiv doi abstract bibtex 6 downloads Nanopore sequencing is an emerging new technology for sequencing Deoxyribonucleic acid (DNA), which can read long fragments of DNA (~50000 bases), in contrast to most current short-read sequencing technologies which can only read hundreds of bases. While nanopore sequencers can acquire long reads, the high error rates (20%-30%) pose a technical challenge. In a nanopore sequencer, a DNA is migrated through a nanopore, and current variations are measured. The DNA sequence is inferred from this observed current pattern using an algorithm called a base-caller. In this paper, we propose a mathematical model for the “channel” from the input DNA sequence to the observed current, and calculate bounds on the information extraction capacity of the nanopore sequencer. This model incorporates impairments, such as (non-linear) intersymbol interference, deletions, and random response. These information bounds have two-fold application: 1) The decoding rate with a uniform input distribution can be used to calculate the average size of the plausible list of DNA sequences given an observed current trace. This bound can be used to benchmark existing base-calling algorithms, as well as serving a performance objective to design better nanopores. 2) When the nanopore sequencer is used as a reader in a DNA storage system, the storage capacity is quantified by our bounds.
% Journal article: Mao, Diggavi and Kannan, IEEE Transactions on Information Theory 64(4), 2018.
% The non-standard fields tags, type and url_arxiv are kept unchanged; presumably they are
% consumed by the BibBase publication-list tool -- confirm before renaming or removing them.
% In the abstract, the approximate-count tilde is written as \textasciitilde because a bare
% tilde is a non-breaking space in LaTeX and would be rendered as a blank.
@article{8301564,
  abstract = {Nanopore sequencing is an emerging new technology for sequencing Deoxyribonucleic acid (DNA), which can read long fragments of DNA ({\textasciitilde}50000 bases), in contrast to most current short-read sequencing technologies which can only read hundreds of bases. While nanopore sequencers can acquire long reads, the high error rates (20\%--30\%) pose a technical challenge. In a nanopore sequencer, a DNA is migrated through a nanopore, and current variations are measured. The DNA sequence is inferred from this observed current pattern using an algorithm called a base-caller. In this paper, we propose a mathematical model for the ``channel'' from the input DNA sequence to the observed current, and calculate bounds on the information extraction capacity of the nanopore sequencer. This model incorporates impairments, such as (non-linear) intersymbol interference, deletions, and random response. These information bounds have two-fold application: 1) The decoding rate with a uniform input distribution can be used to calculate the average size of the plausible list of DNA sequences given an observed current trace. This bound can be used to benchmark existing base-calling algorithms, as well as serving a performance objective to design better nanopores. 2) When the nanopore sequencer is used as a reader in a DNA storage system, the storage capacity is quantified by our bounds.},
  archiveprefix = {arXiv},
  author = {Mao, W. and Diggavi, S. N. and Kannan, S.},
  doi = {10.1109/TIT.2018.2809001},
  eprint = {1705.11154},
  issn = {1557-9654},
  journal = {IEEE Transactions on Information Theory},
  keywords = {biology computing;DNA;genomics;intersymbol interference;molecular biophysics;nanobiotechnology;nanopore sequencing;sequencing Deoxyribonucleic acid;sequencing technologies;nanopore sequencer;input DNA sequences;base-calling algorithms;DNA storage system;storage capacity;Sequential analysis;Decoding;Nanobioscience;Current measurement;Reliability;Mathematical model;Deoxyribonucleic acid (DNA) sequencing;bioinformatics;base calling;channel with synchronization errors;deletion channel;finite state channels},
  month = apr,
  number = {4},
  pages = {3216--3236},
  tags = {journal,IT,BioInf,NDS},
  title = {Models and Information-Theoretic Bounds for Nanopore Sequencing},
  type = {2},
  url_arxiv = {https://arxiv.org/abs/1705.11154},
  volume = {64},
  year = {2018}
}
Downloads: 6
{"_id":"XjaYQ7DZZoY8iN3g3","bibbaseid":"mao-diggavi-kannan-modelsandinformationtheoreticboundsfornanoporesequencing-2018","author_short":["Mao, W.","Diggavi, S. N.","Kannan, S."],"bibdata":{"bibtype":"article","type":"2","abstract":"Nanopore sequencing is an emerging new technology for sequencing Deoxyribonucleic acid (DNA), which can read long fragments of DNA ( 50000 bases), in contrast to most current short-read sequencing technologies which can only read hundreds of bases. While nanopore sequencers can acquire long reads, the high error rates (20%-30%) pose a technical challenge. In a nanopore sequencer, a DNA is migrated through a nanopore, and current variations are measured. The DNA sequence is inferred from this observed current pattern using an algorithm called a base-caller. In this paper, we propose a mathematical model for the “channel” from the input DNA sequence to the observed current, and calculate bounds on the information extraction capacity of the nanopore sequencer. This model incorporates impairments, such as (non-linear) intersymbol interference, deletions, and random response. These information bounds have two-fold application: 1) The decoding rate with a uniform input distribution can be used to calculate the average size of the plausible list of DNA sequences given an observed current trace. This bound can be used to benchmark existing base-calling algorithms, as well as serving a performance objective to design better nanopores. 
2) When the nanopore sequencer is used as a reader in a DNA storage system, the storage capacity is quantified by our bounds.","author":[{"firstnames":["W."],"propositions":[],"lastnames":["Mao"],"suffixes":[]},{"firstnames":["S.","N."],"propositions":[],"lastnames":["Diggavi"],"suffixes":[]},{"firstnames":["S."],"propositions":[],"lastnames":["Kannan"],"suffixes":[]}],"doi":"10.1109/TIT.2018.2809001","issn":"1557-9654","journal":"IEEE Transactions on Information Theory","keywords":"biology computing;DNA;genomics;intersymbol interference;molecular biophysics;nanobiotechnology;nanopore sequencing;sequencing Deoxyribonucleic acid;sequencing technologies;nanopore sequencer;input DNA sequences;base-calling algorithms;DNA storage system;storage capacity;DNA;Sequential analysis;Decoding;Nanobioscience;Current measurement;Reliability;Mathematical model;Deoxyribonucleic acid (DNA) sequencing;bioinformatics;base calling;channel with synchronization errors;deletion channel;finite state channels","month":"April","number":"4","pages":"3216-3236","tags":"journal,IT,BioInf,NDS","title":"Models and Information-Theoretic Bounds for Nanopore Sequencing","url_arxiv":"https://arxiv.org/abs/1705.11154","volume":"64","year":"2018","bibtex":"@article{8301564,\n abstract = {Nanopore sequencing is an emerging new technology for sequencing Deoxyribonucleic acid (DNA), which can read long fragments of DNA (~50000 bases), in contrast to most current short-read sequencing technologies which can only read hundreds of bases. While nanopore sequencers can acquire long reads, the high error rates (20\\%-30\\%) pose a technical challenge. In a nanopore sequencer, a DNA is migrated through a nanopore, and current variations are measured. The DNA sequence is inferred from this observed current pattern using an algorithm called a base-caller. 
In this paper, we propose a mathematical model for the “channel” from the input DNA sequence to the observed current, and calculate bounds on the information extraction capacity of the nanopore sequencer. This model incorporates impairments, such as (non-linear) intersymbol interference, deletions, and random response. These information bounds have two-fold application: 1) The decoding rate with a uniform input distribution can be used to calculate the average size of the plausible list of DNA sequences given an observed current trace. This bound can be used to benchmark existing base-calling algorithms, as well as serving a performance objective to design better nanopores. 2) When the nanopore sequencer is used as a reader in a DNA storage system, the storage capacity is quantified by our bounds.},\n author = {W. {Mao} and S. N. {Diggavi} and S. {Kannan}},\n doi = {10.1109/TIT.2018.2809001},\n issn = {1557-9654},\n journal = {IEEE Transactions on Information Theory},\n keywords = {biology computing;DNA;genomics;intersymbol interference;molecular biophysics;nanobiotechnology;nanopore sequencing;sequencing Deoxyribonucleic acid;sequencing technologies;nanopore sequencer;input DNA sequences;base-calling algorithms;DNA storage system;storage capacity;DNA;Sequential analysis;Decoding;Nanobioscience;Current measurement;Reliability;Mathematical model;Deoxyribonucleic acid (DNA) sequencing;bioinformatics;base calling;channel with synchronization errors;deletion channel;finite state channels},\n month = {April},\n number = {4},\n pages = {3216-3236},\n tags = {journal,IT,BioInf,NDS},\n title = {Models and Information-Theoretic Bounds for Nanopore Sequencing},\n type = {2},\n url_arxiv = {https://arxiv.org/abs/1705.11154},\n volume = {64},\n year = {2018}\n}\n\n","author_short":["Mao, W.","Diggavi, S. 
N.","Kannan, S."],"key":"8301564","id":"8301564","bibbaseid":"mao-diggavi-kannan-modelsandinformationtheoreticboundsfornanoporesequencing-2018","role":"author","urls":{" arxiv":"https://arxiv.org/abs/1705.11154"},"keyword":["biology computing;DNA;genomics;intersymbol interference;molecular biophysics;nanobiotechnology;nanopore sequencing;sequencing Deoxyribonucleic acid;sequencing technologies;nanopore sequencer;input DNA sequences;base-calling algorithms;DNA storage system;storage capacity;DNA;Sequential analysis;Decoding;Nanobioscience;Current measurement;Reliability;Mathematical model;Deoxyribonucleic acid (DNA) sequencing;bioinformatics;base calling;channel with synchronization errors;deletion channel;finite state channels"],"metadata":{"authorlinks":{}},"downloads":6,"html":""},"bibtype":"article","biburl":"https://bibbase.org/network/files/e2kjGxYgtBo8SWSbC","dataSources":["hicKnsKYNEFXC4CgH","jxCYzXXYRqw2fiEXQ","wCByFFrQMyRwfzrJ6","yuqM5ah4HMsTyDrMa","YaM87hGQiepg5qijZ","n9wmfkt5w8CPqCepg","soj2cS6PgG8NPmWGr","FaDBDiyFAJY5pL28h","ycfdiwWPzC2rE6H77"],"keywords":["biology computing;dna;genomics;intersymbol interference;molecular biophysics;nanobiotechnology;nanopore sequencing;sequencing deoxyribonucleic acid;sequencing technologies;nanopore sequencer;input dna sequences;base-calling algorithms;dna storage system;storage capacity;dna;sequential analysis;decoding;nanobioscience;current measurement;reliability;mathematical model;deoxyribonucleic acid (dna) sequencing;bioinformatics;base calling;channel with synchronization errors;deletion channel;finite state channels"],"search_terms":["models","information","theoretic","bounds","nanopore","sequencing","mao","diggavi","kannan"],"title":"Models and Information-Theoretic Bounds for Nanopore Sequencing","year":2018,"downloads":6}