@article{hu_lokhandwala_te_tseng_2019, title={Dynamic Multi-Resolution Data Storage}, DOI={10.1145/3352460.3358282}, abstractNote={Approximate computing that works on less precise data leads to significant performance gains and energy-cost reductions for compute kernels. However, without leveraging the full-stack design of computer systems, modern computer architectures undermine the potential of approximate computing. In this paper, we present Varifocal Storage, a dynamic multi-resolution storage system that tackles challenges in performance, quality, flexibility, and cost for computer systems supporting diverse application demands. Varifocal Storage dynamically adjusts the dataset resolution within a storage device, thereby mitigating the performance bottleneck of exchanging/preparing data for approximate compute kernels. Varifocal Storage introduces the Autofocus and iFilter mechanisms to provide quality control inside the storage device and make programs more adaptive to diverse datasets. Varifocal Storage also offers flexible, efficient support for approximate and exact computing without exceeding the costs of conventional storage systems by (1) saving the raw dataset in the storage device, and (2) targeting operators that complement the power of existing SSD controllers to dynamically generate lower-resolution datasets. We evaluate the performance of Varifocal Storage by running applications on a heterogeneous computer with our prototype SSD. The results show that Varifocal Storage can speed up data-resolution adjustments by 2.02×, or by 1.74× without programmer input. Compared to conventional approximate-computing architectures, Varifocal Storage speeds up the overall execution time by 1.52×.}, journal={MICRO '52: The 52nd Annual IEEE/ACM International Symposium on Microarchitecture}, author={Hu, Yu-Ching and Lokhandwala, Murtuza Taher and Te, I and Tseng, Hung-Wei}, year={2019}, pages={196–210} }

@article{matam_koo_zha_tseng_annavaram_2019, title={GraphSSD: Graph Semantics Aware SSD}, DOI={10.1145/3307650.3322275}, abstractNote={Graph analytics play a key role in a number of applications such as social networks, drug discovery, and recommendation systems. Because large graphs may exceed the capacity of main memory, application performance is bounded by storage access time. Out-of-core graph processing frameworks try to tackle this storage access bottleneck through techniques such as graph sharding and sub-graph partitioning. Even with these techniques, the need to access data across different graph shards or sub-graphs causes storage systems to become a significant performance hurdle. In this paper, we propose a graph-semantics-aware solid-state drive (SSD) framework, called GraphSSD, which is a full-system solution for storing, accessing, and performing graph analytics on SSDs. Rather than treating storage as a collection of blocks, GraphSSD considers the graph structure when deciding on graph layout, access, and update mechanisms. GraphSSD replaces the conventional logical-to-physical page-mapping mechanism in an SSD with a novel vertex-to-page mapping scheme and exploits detailed knowledge of the flash properties to minimize page accesses. GraphSSD also supports efficient graph updates (vertex and edge modifications) by minimizing unnecessary page-movement overheads. GraphSSD provides a simple programming interface that enables application developers to access graphs as native data in their applications, thereby simplifying code development.
It also augments the NVMe (non-volatile memory express) interface with a minimal set of changes to map the graph-access APIs to appropriate storage-access mechanisms. Our evaluation results show that the GraphSSD framework improves performance by up to 1.85× for the basic graph data-fetch functions, and on average by 1.40×, 1.42×, 1.60×, 1.56×, and 1.29× for the widely used breadth-first search, connected components, random walk, maximal independent set, and PageRank applications, respectively.}, journal={Proceedings of the 2019 46th International Symposium on Computer Architecture (ISCA '19)}, author={Matam, Kiran Kumar and Koo, Gunjae and Zha, Haipeng and Tseng, Hung-Wei and Annavaram, Murali}, year={2019}, pages={116–128} }

@article{te_lokhandwala_hu_tseng_2018, title={Pensieve: A Machine-Learning-Assisted SSD Layer for Extending the Lifetime}, ISSN={1063-6404}, DOI={10.1109/ICCD.2018.00016}, abstractNote={As capacity per unit cost drops, flash-based SSDs have become popular in various computing scenarios. However, the limited number of program/erase cycles still severely constrains the cost-effectiveness of flash-based storage solutions. This paper proposes Pensieve, a machine-learning-assisted SSD firmware layer that transparently helps reduce the demand for program and erase operations. Pensieve efficiently classifies incoming write data into different compression categories without hints from software systems. Data in the same category may share a dictionary to compress the content, allowing Pensieve to further avoid duplication. As Pensieve does not require any modification to the software stack, it is compatible with existing applications, file systems, and operating systems. With modern SSD architectures, implementing a Pensieve-compliant SSD also requires no additional hardware, providing a drop-in upgrade for existing storage systems. Experimental results on our prototype Pensieve SSD show that Pensieve can reduce the number of program operations by 19% while delivering competitive performance.}, journal={2018 IEEE 36th International Conference on Computer Design (ICCD)}, author={Te, I and Lokhandwala, Murtuza and Hu, Yu-Ching and Tseng, Hung-Wei}, year={2018}, pages={35–42} }

@article{jin_tseng_papakonstantinou_swanson_2017, title={KAML: A Flexible, High-Performance Key-Value SSD}, ISSN={1530-0897}, DOI={10.1109/hpca.2017.15}, abstractNote={Modern solid-state drives (SSDs) unnecessarily confine host programs to the conventional block I/O interface, leading to suboptimal performance and resource under-utilization. Recent attempts to replace or extend this interface with a key-value-oriented interface and/or built-in support for transactions offer some improvements, but the details of their implementations make them a poor match for many applications. This paper presents the key-addressable, multi-log SSD (KAML), an SSD with a key-value interface that uses a novel multi-log architecture and stores data as variable-sized records rather than fixed-sized sectors. Exposing a key-value interface allows applications to remove a layer of indirection between application-level keys (e.g., database record IDs or file inode numbers) and data stored in the SSD. KAML also provides native transaction support tuned for fine-grained locking, achieving improved performance compared to previous designs that require page-level locking.
Finally, KAML includes a caching layer analogous to a conventional page cache that leverages host DRAM to improve performance and provides additional transactional features. We have implemented a prototype of KAML on a commercial SSD prototyping platform, and our results show that, compared with existing key-value stores, KAML improves the performance of online transaction processing (OLTP) workloads by 1.1×–4.0× and of NoSQL key-value store applications by 1.1×–3.0×.}, journal={2017 23rd IEEE International Symposium on High Performance Computer Architecture (HPCA)}, author={Jin, Yanqin and Tseng, Hung-Wei and Papakonstantinou, Yannis and Swanson, Steven}, year={2017}, pages={373–384} }

@inproceedings{liu_tseng_gahagan_li_jin_swanson_2016, title={Hippogriff: Efficiently Moving Data in Heterogeneous Computing Systems}, DOI={10.1109/iccd.2016.7753307}, abstractNote={Data movement between compute and storage devices (e.g., the GPU and the SSD) has been a long-neglected problem in heterogeneous systems, and the inefficiency of existing systems causes significant losses in both performance and energy efficiency. This paper presents Hippogriff, which provides a high-level programming model to simplify data movement between compute and storage, and dynamically schedules data transfers based on system load. By eliminating unnecessary data movement, Hippogriff speeds up single-program workloads by 1.17× and saves 17% of energy. For multi-program workloads, Hippogriff shows a 1.25× speedup. Hippogriff also improves the performance of a GPU-based MapReduce framework by 27%.}, booktitle={Proceedings of the 34th IEEE International Conference on Computer Design (ICCD)}, author={Liu, Y. and Tseng, H. W. and Gahagan, M. and Li, J. and Jin, Y. Q. and Swanson, S.}, year={2016}, pages={376–379} }

@article{li_tseng_lin_papakonstantinou_swanson_2016, title={HippogriffDB: Balancing I/O and GPU Bandwidth in Big Data Analytics}, volume={9}, ISSN={2150-8097}, DOI={10.14778/3007328.3007331}, abstractNote={As data sets grow and conventional processor performance scaling slows, data analytics is moving towards heterogeneous architectures that incorporate hardware accelerators (notably GPUs) to continue scaling performance. However, existing GPU-based databases fail to handle big-data applications efficiently: their execution model suffers from scalability limitations on GPUs, whose memory capacity is limited, and existing systems fail to consider the discrepancy between fast GPUs and slow storage, which can counteract the benefit of GPU accelerators.}, number={14}, journal={Proceedings of the VLDB Endowment}, author={Li, Jing and Tseng, Hung-Wei and Lin, Chunbin and Papakonstantinou, Yannis and Swanson, Steven}, year={2016}, month={Oct}, pages={1647–1658} }

@inproceedings{liu_tseng_swanson_2016, title={SPMario: Scale Up MapReduce with I/O-Oriented Scheduling for the GPU}, DOI={10.1109/iccd.2016.7753309}, abstractNote={The popularity of GPUs in general-purpose computation has prompted efforts to scale up MapReduce systems with GPUs, but the lack of efficient I/O handling results in underutilization of shared system resources in existing systems. This paper presents SPMario, a scale-up GPU MapReduce framework that speeds up job execution and boosts utilization of system resources with its new I/O-oriented scheduling.
The evaluation on a set of representative benchmarks against a highly optimized baseline system shows that, for single-job cases, SPMario can speed up job execution by up to 2.28×, and boost GPU utilization by 2.12× and I/O utilization by 2.51×. When scheduling two jobs together, I/O-oriented scheduling outperforms round-robin scheduling by up to 13.54% in total execution time, and by up to 12.27% and 14.92% in GPU and I/O utilization, respectively.}, booktitle={Proceedings of the 34th IEEE International Conference on Computer Design (ICCD)}, author={Liu, Y. and Tseng, H. W. and Swanson, S.}, year={2016}, pages={384–387} }