@comment{
  Publications by Anyanwu, Kim, Ravindra and Fu, 2013 to 2017.
  Conventions used in this file:
    - one field per line, lowercase field names, brace-delimited values;
    - page ranges use a double hyphen;
    - titles are stored in Title Case, with acronyms and product names
      brace-protected so that sentence-casing styles keep their capitals;
    - DOIs are stored bare, without a resolver prefix.
}

@inproceedings{kim_ravindra_anyanwu_2017,
  author    = {Kim, HyeongSik and Ravindra, Padmashree and Anyanwu, Kemafor},
  title     = {A Semantics-Aware Storage Framework for Scalable Processing of Knowledge Graphs on {Hadoop}},
  booktitle = {2017 {IEEE} International Conference on Big Data},
  year      = {2017},
  pages     = {193--202},
  doi       = {10.1109/bigdata.2017.8257927},
  abstract  = {Knowledge graphs are graph-based data models which employ named nodes and edges to capture differentiation among entities and relationships in richly diverse data collections such as in the biomedical domain. The flexibility of knowledge graphs allows for heterogeneous collections to be linked and integrated in precise ways. However, resulting data models often have irregular structures which are not easy to manage using platforms for structured, schema-first data models like the relational model. To facilitate exchange, inter-operability and reuse of data, standards such as Resource Description Framework (RDF) have been increasingly adopted for representation. Domains such as the biomedical now have large collections of publicly available RDF graphs as well as benchmark workloads. To achieve scalability in data processing, some efforts are being made to build on distributed processing platforms such as Hadoop and Spark. However, while some distributed graph platforms have emerged for certain classes of mining workloads for non-semantic graphs (without typed edges and nodes), knowledge graph processing, which often involves ontological inferencing, continues to be plagued by scalability and efficiency challenges. In this paper, we present the design of a Hadoop-based storage architecture for knowledge graphs that overcomes some of the challenges of big RDF data processing. The rationale of the design strategy is to go beyond the traditional approach of exploiting structural properties of graphs while storing to include exploitation of semantic properties of knowledge graphs. Our system SemStorm is a Hadoop-based indexed, polymorphic, signatured file organization that supports efficient storage of data collections with significant data heterogeneity. Naive storage models for such data place more demands for meta-data management than traditional systems can support. The polymorphic file organization is further coupled with a nested, column-oriented file format to enable discriminatory data access based on queries. A major hallmark of SemStorm is the enabling of semantic-awareness in storage framework. The idea is to exploit the knowledge represented in ontologies that accompany data for optimizing data storage models such as identifying and managing data (sometimes implicit) redundancies. Another major advantage of SemStorm is that it derives optimized storage models for data autonomically, i.e., without user input. Extensive experiments conducted on real-world and synthetic benchmark datasets show that SemStorm is up to 10X faster than existing approaches.},
}

@inproceedings{ravindra_kim_anyanwu_2015,
  author    = {Ravindra, Padmashree and Kim, HyeongSik and Anyanwu, Kemafor},
  title     = {Rewriting Complex {SPARQL} Analytical Queries for Efficient Cloud-Based Processing},
  booktitle = {2015 {IEEE} International Conference on Big Data},
  year      = {2015},
  pages     = {32--37},
  doi       = {10.1109/bigdata.2015.7363738},
  abstract  = {Many emerging Semantic Web applications combine and aggregate data across domains for analysis. Such analytical queries compute aggregates over multiple groupings of data, resulting in query plans with complex grouping-aggregation constraints. In the context of an RDF analytical query, each such grouping maps to a graph pattern subquery with multiple join operations, and related groups often result in overlapping graph patterns within the same query. In this paper, we propose a holistic approach to optimize RDF analytical queries by refactoring queries to achieve shared execution of common subexpressions that enables parallel evaluation of groupings as well as aggregations. Such a rewriting enables shorter execution workflows, particularly beneficial for scale-out processing on distributed Cloud systems with multiple I/O phases. Experiments on real-world and synthetic benchmarks confirm that such a rewriting can achieve more efficient execution plans when compared to relational-style SPARQL query plans executed on popular Cloud systems.},
}

@incollection{anyanwu_ravindra_kim_2014,
  author    = {Anyanwu, Kemafor and Ravindra, Padmashree and Kim, HyeongSik},
  title     = {Algebraic Optimization of {RDF} Graph Pattern Queries on {MapReduce}},
  booktitle = {Large Scale and Big Data: Processing and Management},
  publisher = {CRC Press},
  year      = {2014},
  pages     = {183--227},
  doi       = {10.1201/b17112-7},
}

@article{anyanwu_kim_ravindra_2013,
  author   = {Anyanwu, Kemafor and Kim, HyeongSik and Ravindra, Padmashree},
  title    = {Algebraic Optimization for Processing Graph Pattern Queries in the Cloud},
  journal  = {{IEEE} Internet Computing},
  volume   = {17},
  number   = {2},
  year     = {2013},
  pages    = {52--61},
  issn     = {1941-0131},
  doi      = {10.1109/mic.2012.22},
  abstract = {MapReduce platforms such as Hadoop are now the de facto standard for large-scale data processing, but they have significant limitations for join-intensive workloads typical in Semantic Web processing. This article overviews an algebraic optimization approach based on a Nested TripleGroup Data Model and Algebra (NTGA) that minimizes overall processing costs by reducing the number of MapReduce cycles. It also presents an approach for integrating NTGA-based processing of graph pattern queries into Apache Pig and compares it to execution plans using relational-style algebra operators.},
}

@inproceedings{kim_anyanwu_2013,
  author    = {Kim, HyeongSik and Anyanwu, Kemafor},
  title     = {Optimizing Queries over Semantically Integrated Datasets on {MapReduce} Platforms},
  booktitle = {2013 {IEEE} International Conference on Big Data},
  year      = {2013},
  doi       = {10.1109/bigdata.2013.6691788},
  abstract  = {Life science databases generally consist of multiple heterogeneous datasets that have been integrated using complex ontologies. Querying such databases typically involves complex graph patterns, and evaluating such patterns poses challenges when MapReduce-based platforms are used to scale up processing, translating to long execution workflows with large amount of disk and network I/O costs. In this poster, we focus on optimizing UNION queries (e.g., unions of conjunctives for inference) and present an algebraic interpretation of the query rewritings which are more amenable to efficient processing on MapReduce.},
}

@inproceedings{fu_kim_anyanwu_2013,
  author    = {Fu, H. Z. and Kim, HyeongSik and Anyanwu, Kemafor},
  title     = {Scaling Concurrency of Personalized Semantic Search over Large {RDF} Data},
  booktitle = {2013 {IEEE} International Conference on Big Data},
  year      = {2013},
  doi       = {10.1109/bigdata.2013.6691622},
  abstract  = {Recent keyword search techniques on Semantic Web are moving away from shallow, information retrieval-style approaches that merely find ``keyword matches'' towards more interpretive approaches that attempt to induce structure from keyword queries. The process of query interpretation is usually guided by structures in data, and schema and is often supported by a graph exploration procedure. However, graph exploration-based interpretive techniques are impractical for multi-tenant scenarios for large databases because separate expensive graph exploration states need to be maintained for different user queries. This leads to significant memory overhead in situations of large numbers of concurrent requests. This limitation could negatively impact the possibility of achieving the ultimate goal of personalizing search. In this paper, we propose a lightweight interpretation approach that employs indexing to improve throughput and concurrency with much less memory overhead. It is also more amenable to distributed or partitioned execution. The approach is implemented in a system called ``SKI'' and an experimental evaluation of SKI's performance on the DBPedia and Billion Triple Challenge datasets shows orders-of-magnitude performance improvement over existing techniques.},
}