@article{hsu_nair_menzies_freeh_2018, title={Micky: A Cheaper Alternative for Selecting Cloud Instances}, DOI={10.1109/CLOUD.2018.00058}, abstractNote={Most cloud computing optimizers explore and improve one workload at a time. When optimizing many workloads, the single-optimizer approach can be prohibitively expensive. Accordingly, we examine "collective optimizer" that concurrently explore and improve a set of workloads significantly reducing the measurement costs. Our large-scale empirical study shows that there is often a single cloud configuration which is surprisingly near-optimal for most workloads. Consequently, we create a collective-optimizer, MICKY, that reformulates the task of finding the near-optimal cloud configuration as a multi-armed bandit problem. MICKY efficiently balances exploration (of new cloud configurations) and exploitation (of known good cloud configuration). Our experiments show that MICKY can achieve on average 8.6 times reduction in measurement cost as compared to the state-of-the-art method while finding near-optimal solutions. Hence we propose MICKY as the basis of a practical collective optimization method for finding good cloud configurations (based on various constraints such as budget and tolerance to near-optimal configurations)}, journal={PROCEEDINGS 2018 IEEE 11TH INTERNATIONAL CONFERENCE ON CLOUD COMPUTING (CLOUD)}, author={Hsu, Chin-Jung and Nair, Vivek and Menzies, Tim and Freeh, Vincent}, year={2018}, pages={409–416} } @inproceedings{hsu_freeh_villanustre_2017, title={Trilogy: data placement to improve performance and robustness of cloud computing}, DOI={10.1109/bigdata.2017.8258202}, abstractNote={Infrastructure as a Service, one of the most disruptive aspects of cloud computing, enables configuring a cluster for each application for each workload. When the workload changes, a cluster will be either underutilized (wasting resources) or unable to meet demand (incurring opportunity costs). 
Consequently, efficient cluster resizing requires proper data replication and placement. Our work reveals that coarse-grain, workload-aware replication addresses over-utilization but cannot resolve under-utilization. With fine-grain partitioning of the dataset, data replication can reduce both under- and over-utilization. In our empirical studies, compared to a naïve uniform data replication, a coarse-grain workload-aware replication increases throughput by 81% on a highly-skewed workload. A fine-grain scheme further reaches 166% increase. Furthermore, a surprisingly small increase in granularity is sufficient to obtain most benefits. Evaluations also show that maximizing the number of unique partitions per node increases robustness to tolerate workload deviation while minimizing this number reduces storage footprint.}, booktitle={2017 IEEE International Conference on Big Data (Big Data)}, author={Hsu, C. J. and Freeh, V. W. and Villanustre, F.}, year={2017}, pages={2442–2451} } @inproceedings{hsu_panta_ra_freeh_2016, title={Inside-out: Reliable performance prediction for distributed storage systems in the cloud}, DOI={10.1109/srds.2016.025}, abstractNote={Many storage systems are undergoing a significant shift from dedicated appliance-based model to software-defined storage (SDS) because the latter is flexible, scalable and cost-effective for modern workloads. However, it is challenging to provide a reliable guarantee of end-to-end performance in SDS due to complex software stack, time-varying workload and performance interference among tenants. Therefore, modeling and monitoring the performance of storage systems is critical for ensuring reliable QoS guarantees. Existing approaches such as performance benchmarking and analytical modeling are inadequate because they are not efficient in exploring large configuration space, and cannot support elastic operations and diverse storage services in SDS. 
This paper presents Inside-Out, an automatic model building tool that creates accurate performance models for distributed storage services. Inside-Out is a black-box approach. It builds high-level performance models by applying machine learning techniques to low-level system performance metrics collected from individual components of the distributed SDS system. Inside-Out uses a two-level learning method that combines two machine learning models to automatically filter irrelevant features, boost prediction accuracy and yield consistent prediction. Our in-depth evaluation shows that Inside-Out is a robust solution that enables SDS to predict end-to-end performance even in challenging conditions, e.g., changes in workload, storage configuration, available cloud resources, size of the distributed storage service, and amount of interference due to multi-tenants. Our experiments show that Inside-Out can predict end-to-end performance with 91.1% accuracy on average. Its prediction accuracy is consistent across diverse storage environments.}, booktitle={Proceedings of 2016 IEEE 35th Symposium on Reliable Distributed Systems (SRDS)}, author={Hsu, C. J. and Panta, R. K. and Ra, M. R. and Freeh, V. W.}, year={2016}, pages={127–136} } @article{kc_freeh_2015, title={Dynamically controlling node-level parallelism in Hadoop}, DOI={10.1109/cloud.2015.49}, abstractNote={Hadoop is a widely used large scale data processing framework. Applications run in Hadoop as containers, the concurrency of which affects completion time of an application as well as system resource usage. When there are too many concurrent containers, resource bottlenecks occur and when there are too few, system resources are underutilized. The default and best practice settings underutilize resources which results in longer application completion times. In this work, we develop an approach to dynamically change the parallelism for concurrent containers to suit an application. 
Our approach ensures efficient utilization of resources and avoids bottlenecks for all types of MapReduce applications. Our approach improves performance of MapReduce applications by as much as 28% and 60% respectively when compared to the best practice and default settings.}, journal={2015 IEEE 8TH INTERNATIONAL CONFERENCE ON CLOUD COMPUTING}, author={Kc, Kamal and Freeh, Vincent W.}, year={2015}, pages={309–316} } @article{kc_hsu_freeh_2015, title={Evaluation of MapReduce in a large cluster}, DOI={10.1109/cloud.2015.68}, abstractNote={MapReduce is a widely used framework that runs large scale data processing applications. However, there are very few systematic studies of MapReduce on large clusters and thus there is a lack of reference for expected behavior or issues while running applications in a large cluster. This paper describes our findings of running applications on Pivotal's Analytics Workbench, which consists of a 540-node Hadoop cluster. Our experience sheds light on how applications behave in a large-scale cluster. This paper discusses our experiences in three areas. The first describes scaling behavior of applications as the dataset size increases. The second discusses the appropriate settings for parallelism and overlap of map and reduce tasks. The third area discusses general observations. These areas have not been reported or studied previously. Our findings show that IO-intensive applications do not scale as data size increases and MapReduce applications require different amounts of parallelism and overlap to minimize completion time. 
Additionally, our observations also highlight the need for appropriate memory allocation for a MapReduce component and the importance of decreasing log file size.}, journal={2015 IEEE 8TH INTERNATIONAL CONFERENCE ON CLOUD COMPUTING}, author={Kc, Kamal and Hsu, Chin-Jung and Freeh, Vincent W.}, year={2015}, pages={461–468} } @inbook{kc_freeh_2014, title={Tuning Hadoop Map Slot Value Using CPU Metric}, ISBN={9783319130200 9783319130217}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-13021-7_11}, DOI={10.1007/978-3-319-13021-7_11}, abstractNote={Hadoop is a widely used open source mapreduce framework. Its performance is critical because it increases the usefulness of products and services for a large number of companies who have adopted Hadoop for their business purposes. One of the configuration parameters that influences the resource allocation and thus the performance of a Hadoop application is map slot value (MSV). MSV determines the number of map tasks that run concurrently on a node. For a given architecture, a Hadoop application has an MSV for which its performance is best. Furthermore, there is not a single map slot value that is best for all applications. A Hadoop application’s performance suffers when MSV is not the best. Therefore, knowing the best MSV is important for an application. In this work, we find a low-overhead method to predict the best MSV using a new Hadoop counter that measures per-map task CPU utilization. 
Our experiments on a variety of Hadoop applications show that using a single MSV for all applications results in performance degradation up to 132 % when compared to using the best MSV for each application.}, booktitle={Big Data Benchmarks, Performance Optimization, and Emerging Hardware}, publisher={Springer International Publishing}, author={Kc, Kamal and Freeh, Vincent W.}, year={2014}, pages={141–153} } @article{lim_freeh_lowenthal_2011, title={Adaptive, transparent CPU scaling algorithms leveraging inter-node MPI communication regions}, volume={37}, ISSN={["1872-7336"]}, DOI={10.1016/j.parco.2011.07.001}, abstractNote={Although users of high-performance computing are most interested in raw performance, both energy and power consumption have become critical concerns. Because the CPU is often the major power consumer, some microprocessors allow frequency and voltage scaling, which enables a system to efficiently reduce CPU performance and power. When the CPU is not on the critical path, such dynamic frequency and voltage scaling can produce significant energy savings with little performance penalty. This paper presents an MPI runtime system that dynamically reduces CPU frequency and voltage during communication phases in MPI programs. It dynamically identifies such phases and, without a priori knowledge, selects the CPU frequency in order to minimize energy-delay product. All analysis and subsequent frequency and voltage scaling is within MPI and so is entirely transparent to the application. This means that the large number of existing MPI programs, as well as new ones being developed, can use our system without modification. Results show that the median reduction in energy-delay product for twelve benchmarks is 8%, the median energy reduction is 11%, and the median increase in execution time increase is only 2%.}, number={10-11}, journal={PARALLEL COMPUTING}, author={Lim, Min Yeol and Freeh, Vincent W. 
and Lowenthal, David K.}, year={2011}, pages={667–683} } @inbook{tran_etheridge_bletsch_jiang_freeh_ning_2011, title={On the Expressiveness of Return-into-libc Attacks}, ISBN={9783642236433 9783642236440}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-23644-0_7}, DOI={10.1007/978-3-642-23644-0_7}, abstractNote={Return-into-libc (RILC) is one of the most common forms of code-reuse attacks. In this attack, an intruder uses a buffer overflow or other exploit to redirect control flow through existing (libc) functions within the legitimate program. While dangerous, it is generally considered limited in its expressive power since it only allows the attacker to execute straight-line code. In other words, RILC attacks are believed to be incapable of arbitrary computation—they are not Turing complete. Consequently, to address this limitation, researchers have developed other code-reuse techniques, such as return-oriented programming (ROP). In this paper, we make the counterargument and demonstrate that the original RILC technique is indeed Turing complete. Specifically, we present a generalized RILC attack called Turing complete RILC (TC-RILC) that allows for arbitrary computations. We demonstrate that TC-RILC satisfies formal requirements of Turing-completeness. In addition, because it depends on the well-defined semantics of libc functions, we also show that a TC-RILC attack can be portable between different versions (or even different families) of operating systems and naturally has negative implications for some existing anti-ROP defenses. 
The development of TC-RILC on both Linux and Windows platforms demonstrates the expressiveness and practicality of the generalized RILC attack.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Tran, Minh and Etheridge, Mark and Bletsch, Tyler and Jiang, Xuxian and Freeh, Vincent and Ning, Peng}, year={2011}, pages={121–141} } @inbook{zhou_zhang_jiang_freeh_2011, title={Taming Information-Stealing Smartphone Applications (on Android)}, ISBN={9783642215988 9783642215995}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-21599-5_7}, DOI={10.1007/978-3-642-21599-5_7}, abstractNote={Smartphones have been becoming ubiquitous and mobile users are increasingly relying on them to store and handle personal information. However, recent studies also reveal the disturbing fact that users’ personal information is put at risk by (rogue) smartphone applications. Existing solutions exhibit limitations in their capabilities in taming these privacy-violating smartphone applications. In this paper, we argue for the need of a new privacy mode in smartphones. The privacy mode can empower users to flexibly control in a fine-grained manner what kinds of personal information will be accessible to an application. Also, the granted access can be dynamically adjusted at runtime in a fine-grained manner to better suit a user’s needs in various scenarios (e.g., in a different time or location). We have developed a system called TISSA that implements such a privacy mode on Android. The evaluation with more than a dozen of information-leaking Android applications demonstrates its effectiveness and practicality. 
Furthermore, our evaluation shows that TISSA introduces negligible performance overhead.}, booktitle={Trust and Trustworthy Computing}, publisher={Springer Berlin Heidelberg}, author={Zhou, Yajin and Zhang, Xinwen and Jiang, Xuxian and Freeh, Vincent W.}, year={2011}, pages={93–107} } @article{freeh_kappiah_lowenthal_bletsch_2008, title={Just-in-time dynamic voltage scaling: Exploiting inter-node slack to save energy in MPI programs}, volume={68}, ISSN={["1096-0848"]}, DOI={10.1016/j.jpdc.2008.04.007}, abstractNote={Although users of high-performance computing are most interested in raw performance, both energy and power consumption have become critical concerns. As a result, improving energy efficiency of nodes on HPC machines has become important, and the prevalence of power-scalable clusters, where the frequency and voltage can be dynamically modified, has increased. On power-scalable clusters, one opportunity for saving energy with little or no loss of performance exists when the computational load is not perfectly balanced. This situation occurs frequently, as keeping the load balanced between nodes is one of the long-standing fundamental problems in parallel and distributed computing. Indeed, despite the large body of research aimed at balancing load both statically and dynamically, this problem is quite difficult to solve. This paper presents a system called Jitter that reduces the frequency and voltage on nodes that are assigned less computation and, therefore, have idle or slack time. This saves energy on these nodes, and the goal of Jitter is to attempt to ensure that they arrive “just in time” so that they avoid increasing overall execution time. Specifically, we dynamically determine which nodes have enough slack time such that they can execute at a reduced frequency with little performance cost—which will greatly reduce the consumed energy on that node. 
In particular, Jitter saves 12.8% energy with 0.4% time increase–which is essentially the same as a hand-tuned solution–on the Aztec benchmark.}, number={9}, journal={JOURNAL OF PARALLEL AND DISTRIBUTED COMPUTING}, author={Freeh, Vincent W. and Kappiah, Nandini and Lowenthal, David K. and Bletsch, Tyler K.}, year={2008}, month={Sep}, pages={1175–1185} } @article{freeh_lowenthal_pan_kappiah_springer_rountree_femal_2007, title={Analyzing the energy-time trade-off in high-performance computing applications}, volume={18}, ISSN={["1558-2183"]}, DOI={10.1109/TPDS.2007.1026}, abstractNote={Although users of high-performance computing are most interested in raw performance both energy and power consumption has become critical concerns. One approach to lowering energy and power is to use high-performance cluster nodes that have several power-performance states so that the energy-time trade-off can be dynamically adjusted. This paper analyzes the energy-time trade-off of a wide range of applications-serial and parallel-on a power-scalable cluster. We use a cluster of frequency and voltage-scalable AMD-64 nodes, each equipped with a power meter. We study the effects of memory and communication bottlenecks via direct measurement of time and energy. We also investigate metrics that can, at runtime, predict when each type of bottleneck occurs. Our results show that, for programs that have a memory or communication bottleneck, a power-scalable cluster can save significant energy with only a small time penalty. Furthermore, we find that, for some programs, it is possible to both consume less energy and execute in less time by increasing the number of nodes while reducing the frequency-voltage setting of each node}, number={6}, journal={IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS}, author={Freeh, Vincent W. and Lowenthal, David K. and Pan, Feng and Kappiah, Nandini and Springer, Rob and Rountree, Barry L. 
and Femal, Mark E.}, year={2007}, month={Jun}, pages={835–848} } @inbook{femal_freeh_2005, title={Safe overprovisioning: Using power limits to increase aggregate throughput}, volume={3471}, DOI={10.1007/11574859_11}, abstractNote={Management of power in data centers is driven by the need to not exceed circuit capacity. The methods employed in the oversight of these power circuits are typically static and ad-hoc. New power-scalable system components allow for dynamically controlling power consumption with an accompanying effect on performance. Because the incremental performance gain from operating in a higher performance state is less than the increase in power, it is possible to overprovision the hardware infrastructure to increase throughput and yet still remain below an aggregate power limit. In overprovisioning, if each component operates at maximum power the limit would be exceeded with disastrous results. However, safe overprovisioning regulates power consumption locally to meet the global power budget. Host-based and network-centric models are proposed to monitor and coordinate the distribution of power with the fundamental goal of increasing throughput. This research work presents the advantages of overprovisioning and describes a general framework and an initial prototype. Initial results with a synthetic benchmark indicate throughput increases of nearly 6% from a statically assigned, power managed environment and over 30% from an unmanaged environment.}, booktitle={Power-aware computer systems: 4th International Workshop, PACS 2004, Portland, OR, USA, December 5, 2004 (Lecture notes in computer science; 3471)}, publisher={Berlin: Springer}, author={Femal, M. E. and Freeh, V. W.}, editor={Falsafi, Babak and Vijaykumar, T. N.}, year={2005}, pages={150–164} }