% Bibliography: DNA-based data storage publications (2020--2024).
% Conventions used in this file: one field per line, lowercase field names,
% bare three-letter month macros, `and others` for truncated author lists,
% and braces around title words whose capitalisation must be preserved.

% NOTE(review): the citation key below originally contained spaces
% ("san miguel", "et al."), which BibTeX cannot parse; they were replaced
% with underscores. Update any \cite that used the old spelling.
% NOTE(review): volume 8 equals the month number and may be an export
% artefact -- confirm against the publisher record before relying on it.
@article{lin_volkel_cao_hook_polak_clark_san_miguel_timp_tuck_velev_et_al_2024,
  author    = {Lin, Kevin N. and Volkel, Kevin and Cao, Cyrus and Hook, Paul W. and Polak, Rachel E. and Clark, Andrew S. and San Miguel, Adriana and Timp, Winston and Tuck, James M. and Velev, Orlin D. and others},
  title     = {A primordial {DNA} store and compute engine},
  journal   = {Nature Nanotechnology},
  publisher = {Springer Science and Business Media LLC},
  volume    = {8},
  year      = {2024},
  month     = aug,
  issn      = {1748-3387 1748-3395},
  doi       = {10.1038/s41565-024-01771-6},
  url       = {http://dx.doi.org/10.1038/s41565-024-01771-6},
}

@article{volkel_lin_hook_timp_keung_tuck_2023,
  author    = {Volkel, Kevin D. and Lin, Kevin N. and Hook, Paul W. and Timp, Winston and Keung, Albert J. and Tuck, James M.},
  editor    = {Kelso, Janet},
  title     = {{FrameD}: framework for {DNA}-based data storage design, verification, and validation},
  journal   = {Bioinformatics},
  volume    = {39},
  number    = {10},
  year      = {2023},
  month     = oct,
  issn      = {1367-4811},
  doi       = {10.1093/bioinformatics/btad572},
  url       = {https://doi.org/10.1093/bioinformatics/btad572},
  abstractNote = {Motivation: DNA-based data storage is a quickly growing field that hopes to harness the massive theoretical information density of DNA molecules to produce a competitive next-generation storage medium suitable for archival data. In recent years, many DNA-based storage system designs have been proposed. Given that no common infrastructure exists for simulating these storage systems, comparing many different designs along with many different error models is increasingly difficult. To address this challenge, we introduce FrameD, a simulation infrastructure for DNA storage systems that leverages the underlying modularity of DNA storage system designs to provide a framework to express different designs while being able to reuse common components. Results: We demonstrate the utility of FrameD and the need for a common simulation platform using a case study. Our case study compares designs that utilize strand copies differently, some that align strand copies using multiple sequence alignment algorithms and others that do not. We found that the choice to include multiple sequence alignment in the pipeline is dependent on the error rate and the type of errors being injected and is not always beneficial. In addition to supporting a wide range of designs, FrameD provides the user with transparent parallelism to deal with a large number of reads from sequencing and the need for many fault injection iterations. We believe that FrameD fills a void in the tools publicly available to the DNA storage community by providing a modular and extensible framework with support for massive parallelism. As a result, it will help accelerate the design process of future DNA-based storage systems. Availability and implementation: The source code for FrameD along with the data generated during the demonstration of FrameD is available in a public Github repository at https://github.com/dna-storage/framed, (https://dx.doi.org/10.5281/zenodo.7757762).},
}

@article{volkel_tomek_keung_tuck_2022,
  author    = {Volkel, Kevin and Tomek, Kyle J. and Keung, Albert J. and Tuck, James M.},
  title     = {{DINOS}: {Data} {INspired} {Oligo} {Synthesis} for {DNA} Data Storage},
  journal   = {ACM Journal on Emerging Technologies in Computing Systems},
  publisher = {Association for Computing Machinery (ACM)},
  volume    = {18},
  number    = {3},
  year      = {2022},
  month     = jul,
  issn      = {1550-4840},
  doi       = {10.1145/3510853},
  url       = {http://dx.doi.org/10.1145/3510853},
  abstractNote = {As interest in DNA-based information storage grows, the costs of synthesis have been identified as a key bottleneck. A potential direction is to tune synthesis for data. Data strands tend to be composed of a small set of recurring code word sequences, and they contain longer sequences of repeated data. To exploit these properties, we propose a new framework called DINOS. DINOS consists of three key parts: (i) The first is a hierarchical strand assembly algorithm, inspired by gene assembly techniques that can assemble arbitrary data strands from a small set of primitive blocks. (ii) The assembly algorithm relies on our novel formulation for how to construct primitive blocks, spanning a variety of useful configurations from a set of code words and overhangs. Each primitive block is a code word flanked by a pair of overhangs that are created by a cyclic pairing process that keeps the number of primitive blocks small. Using these primitive blocks, any data strand of arbitrary length can be assembled, theoretically. We show a minimal system for a binary code with as few as six primitive blocks, and we generalize our processes to support an arbitrary set of overhangs and code words. (iii) We exploit our hierarchical assembly approach to identify redundant sequences and coalesce the reactions that create them to make assembly more efficient. We evaluate DINOS and describe its key characteristics. For example, the number of reactions needed to make a strand can be reduced by increasing the number of overhangs or the number of code words, but increasing the number of overhangs offers a small advantage over increasing code words while requiring substantially fewer primitive blocks. However, density is improved more by increasing the number of code words. We also find that a simple redundancy coalescing technique is able to reduce reactions by 90.6\% and 41.2\% on average for decompressed and compressed data, respectively, even when the smallest data fragments being assembled are 16 bits. With a simple padding heuristic that finds even more redundancy, we can further decrease reactions for the same operating point up to 91.1\% and 59\% for decompressed and compressed data, respectively, on average. Our approach offers greater density by up to 80\% over a prior general purpose gene assembly technique. Finally, in an analysis of synthesis costs in which we make 1 GB volume using de novo synthesis versus making only the primitive blocks with de novo synthesis and otherwise assembling using DINOS, we estimate DINOS as $10^{5}\times$ cheaper than de novo synthesis.},
}

@article{tomek_volkel_indermaur_tuck_keung_2021,
  author    = {Tomek, Kyle J. and Volkel, Kevin and Indermaur, Elaine W. and Tuck, James M. and Keung, Albert J.},
  title     = {Promiscuous molecules for smarter file operations in {DNA}-based data storage},
  journal   = {Nature Communications},
  volume    = {12},
  number    = {1},
  year      = {2021},
  month     = jun,
  issn      = {2041-1723},
  doi       = {10.1038/s41467-021-23669-w},
  url       = {https://doi.org/10.1038/s41467-021-23669-w},
  abstractNote = {DNA holds significant promise as a data storage medium due to its density, longevity, and resource and energy conservation. These advantages arise from the inherent biomolecular structure of DNA which differentiates it from conventional storage media. The unique molecular architecture of DNA storage also prompts important discussions on how data should be organized, accessed, and manipulated and what practical functionalities may be possible. Here we leverage thermodynamic tuning of biomolecular interactions to implement useful data access and organizational features. Specific sets of environmental conditions including distinct DNA concentrations and temperatures were screened for their ability to switchably access either all DNA strands encoding full image files from a GB-sized background database or subsets of those strands encoding low resolution, File Preview, versions. We demonstrate File Preview with four JPEG images and provide an argument for the substantial and practical economic benefit of this generalizable strategy to organize data.},
}

@article{lin_volkel_tuck_keung_2020,
  author    = {Lin, Kevin N. and Volkel, Kevin and Tuck, James M. and Keung, Albert J.},
  title     = {Dynamic and scalable {DNA}-based information storage},
  journal   = {Nature Communications},
  publisher = {Springer Science and Business Media LLC},
  volume    = {11},
  number    = {1},
  year      = {2020},
  month     = jun,
  issn      = {2041-1723},
  doi       = {10.1038/s41467-020-16797-2},
  url       = {https://doi.org/10.1038/s41467-020-16797-2},
  abstractNote = {The physical architectures of information storage systems often dictate how information is encoded, databases are organized, and files are accessed. Here we show that a simple architecture comprised of a T7 promoter and a single-stranded overhang domain (ss-dsDNA), can unlock dynamic DNA-based information storage with powerful capabilities and advantages. The overhang provides a physical address for accessing specific DNA strands as well as implementing a range of in-storage file operations. It increases theoretical storage densities and capacities by expanding the encodable sequence space and simplifies the computational burden in designing sets of orthogonal file addresses. Meanwhile, the T7 promoter enables repeatable information access by transcribing information from DNA without destroying it. Furthermore, saturation mutagenesis around the T7 promoter and systematic analyses of environmental conditions reveal design criteria that can be used to optimize information access. This simple but powerful ss-dsDNA architecture lays the foundation for information storage with versatile capabilities.},
}