@comment{
  Publications on RDF query processing over MapReduce, newest first.
  Conventions: one field per line; acronyms in titles are brace-protected;
  page ranges use a double hyphen; DOIs are stored bare, without a resolver prefix.
  Fields named internal-note and abstractNote are ignored by standard styles.
}

@incollection{anyanwu_ravindra_kim_2014,
  author        = {Anyanwu, Kemafor and Ravindra, Padmashree and Kim, HyeongSik},
  title         = {Algebraic optimization of {RDF} graph pattern queries on {MapReduce}},
  booktitle     = {Large Scale and Big Data: Processing and Management},
  publisher     = {CRC Press},
  year          = {2014},
  pages         = {183--227},
  doi           = {10.1201/b17112-7},
  internal-note = {Review: publisher inferred from the DOI prefix 10.1201; confirm against the book's front matter.},
}

@article{ravindra_anyanwu_2014,
  author       = {Ravindra, Padmashree and Anyanwu, Kemafor},
  title        = {Nesting Strategies for Enabling Nimble {MapReduce} Dataflows for Large {RDF} Data},
  journal      = {International Journal on Semantic Web and Information Systems},
  volume       = {10},
  number       = {1},
  pages        = {1--26},
  year         = {2014},
  issn         = {1552-6291},
  doi          = {10.4018/ijswis.2014010101},
  abstractNote = {Graph and semi-structured data are usually modeled in relational processing frameworks as ``thin'' relations (node, edge, node) and processing such data involves a lot of join operations. Intermediate results of joins with multi-valued attributes or relationships, contain redundant subtuples due to repetition of single-valued attributes. The amount of redundant content is high for real-world multi-valued relationships in social network (millions of Twitter followers of popular celebrities) or biological (multiple references to related proteins) datasets. In MapReduce-based platforms such as Apache Hive and Pig, redundancy in intermediate results contributes avoidable costs to the overall I/O, sorting, and network transfer overhead of join-intensive workloads due to longer workflows. Consequently, providing techniques for dealing with such redundancy will enable more nimble execution of such workflows. This paper argues for the use of a nested data model for representing intermediate data concisely using nesting-aware dataflow operators that allow for lazy and partial unnesting strategies. This approach reduces the overall I/O and network footprint of a workflow by concisely representing intermediate results during most of a workflow's execution, until complete unnesting is absolutely necessary. The proposed strategies are integrated into Apache Pig and experimental evaluation over real-world and synthetic benchmark datasets confirms their superiority over relational-style MapReduce systems such as Apache Pig and Hive.},
}

@inproceedings{ravindra_2014,
  author    = {Ravindra, Padmashree},
  title     = {Towards optimization of {RDF} analytical queries on {MapReduce}},
  booktitle = {2014 {IEEE} 30th International Conference on Data Engineering Workshops ({ICDEW})},
  year      = {2014},
  pages     = {335--339},
}

@article{anyanwu_kim_ravindra_2013,
  author       = {Anyanwu, Kemafor and Kim, HyeongSik and Ravindra, Padmashree},
  title        = {Algebraic Optimization for Processing Graph Pattern Queries in the Cloud},
  journal      = {{IEEE} Internet Computing},
  volume       = {17},
  number       = {2},
  pages        = {52--61},
  year         = {2013},
  issn         = {1941-0131},
  doi          = {10.1109/mic.2012.22},
  abstractNote = {MapReduce platforms such as Hadoop are now the de facto standard for large-scale data processing, but they have significant limitations for join-intensive workloads typical in Semantic Web processing. This article overviews an algebraic optimization approach based on a Nested TripleGroup Data Model and Algebra (NTGA) that minimizes overall processing costs by reducing the number of MapReduce cycles. It also presents an approach for integrating NTGA-based processing of graph pattern queries into Apache Pig and compares it to execution plans using relational-style algebra operators.},
}

@inproceedings{sridhar_ravindra_anyanwu_2009,
  author        = {Sridhar, R. and Ravindra, Padmashree and Anyanwu, Kemafor},
  title         = {{RAPID}: Enabling scalable ad-hoc analytics on the semantic web},
  booktitle     = {Semantic Web -- {ISWC} 2009, Proceedings},
  series        = {Lecture Notes in Computer Science},
  volume        = {5823},
  year          = {2009},
  pages         = {715--730},
  doi           = {10.1007/978-3-642-04930-9_45},
  abstractNote  = {As the amount of available RDF data continues to increase steadily, there is growing interest in developing efficient methods for analyzing such data. While recent efforts have focused on developing efficient methods for traditional data processing, analytical processing which typically involves more complex queries has received much less attention. The use of cost effective parallelization techniques such as Google's Map-Reduce offer significant promise for achieving Web scale analytics. However, currently available implementations are designed for simple data processing on structured data. In this paper, we present a language, RAPID, for scalable ad-hoc analytical processing of RDF data on Map-Reduce frameworks. It builds on Yahoo's Pig Latin by introducing primitives based on a specialized join operator, the MD-join, for expressing analytical tasks in a manner that is more amenable to parallel processing, as well as primitives for coping with semi-structured nature of RDF data. Experimental evaluation results demonstrate significant performance improvements for analytical processing of RDF data over existing Map-Reduce based techniques.},
  internal-note = {Review: series name inferred from the Springer DOI and volume number; confirm. The former url field was the dx.doi.org resolver form of the doi and was dropped as redundant.},
}