% Conference paper introducing VCFC (compression and indexing of genetic
% variant data), from the 2020 IEEE International Conference on
% Bioinformatics and Biomedicine. Typed as inproceedings because the venue
% is a conference proceedings, not a journal.
% NOTE(review): the exported abstract had a garbled glyph in front of "2X";
% it is restored here as "approximately 2X" -- confirm against the published
% abstract.
@inproceedings{ferriter_mueller_bahmani_pan_2020,
  author       = {Ferriter, Kyle and Mueller, Frank and Bahmani, Amir and Pan, Cuiping},
  title        = {{VCFC}: Structural and Semantic Compression and Indexing of Genetic Variant Data},
  booktitle    = {2020 {IEEE} International Conference on Bioinformatics and Biomedicine},
  year         = {2020},
  pages        = {200--203},
  issn         = {2156-1133},
  doi          = {10.1109/BIBM49941.2020.9313221},
  abstractNote = {Personalized genomic datasets are growing in size at an accelerating pace presenting a dilemma between the need for fast retrieval requiring ``near data'' and cost of storage, which decreases for ``distant media'' with larger capacity but longer access time. Instead of database technology, the bioinformatics community has developed an industry standard for compressing and indexing of genetic variant files that store the difference between a person's genome to a human reference genome. These standardizations rely on generic data compression schemes. This work contributes novel domain-specific compression and indexing algorithms that retain the structure and semantics of genetic variation data while supporting common query patterns. A line-based run-length partial compression technique for variant genotype data using a novel indexing strategy is developed and shown to perform well on large sample sets compared to the industry standard. The evaluation over genomic datasets indicates compression at a comparable size for our data representation while resulting in speedup of approximately 2X in indexed queries compared to the industry standard. This underlines that our representation could replace existing standards resulting in reduced computational cost at equivalent storage size.},
}