@article{hasan_ogan_starly_2021, title={Hybrid Blockchain Architecture for Cloud Manufacturing-as-a-service (CMaaS) Platforms with Improved Data Storage and Transaction Efficiency}, volume={53}, ISSN={2351-9789}, DOI={10.1016/j.promfg.2021.06.060}, abstractNote={Blockchain based decentralized Cloud Manufacturing-as-a-Service (CMaaS) platforms enable customers to gain access to a large capacity of manufacturing nodes over cryptographically secure networks. In recent times, the Ethereum network has emerged as a popular blockchain framework for providing provenance and traceability of proprietary manufacturing data in decentralized CMaaS. However, the Ethereum ecosystem was only designed to store and transmit low volume financial transaction data and little has been done to make it an efficient repository of large manufacturing data streams in CMaaS systems. In this paper, the authors build on their previous work and report the design, implementation, and validation of middleware software architectures that allow Ethereum based distributed CMaaS platforms to harness the benefits of the secure asset models of the Ethereum ecosystem and the immutable big data storage capabilities of the decentralized BigchainDB database platform. A novel hybrid blockchain architecture enabled by efficient communication protocols and blockchain oracles is proposed. This architecture allows the transfer and immutable storage of large manufacturing data streams onto global BigchainDB nodes allowing data rich manufacturing transactions to bypass the transaction fees of the Ethereum ecosystem. Additionally, a machine learning based time series inference model is proposed which enables the forecast of Ethereum gas price into the future. This allows the CMaaS platform smart contracts to judiciously assign gas price limits and hence save on transactions ensuing from transfer or creation of assets. 
The outcomes of this research show that the designed hybrid architecture can lead to the reduction of significant number of computational steps and hence transaction fees on Ethereum by offloading large volume data onto BigchainDB nodes. A Random Forest regressor based time series inference model has been shown to exhibit superior performance in the prediction of Ethereum gas price, that allows the CMaaS to avoid executing transactions in periods of high gas prices within the Ethereum ecosystem.}, journal={49TH SME NORTH AMERICAN MANUFACTURING RESEARCH CONFERENCE (NAMRC 49, 2021)}, author={Hasan, Mahmud and Ogan, Kemafor and Starly, Binil}, year={2021}, pages={594–605} } @inbook{evaluating_generalized_path_queries_2019, url={http://dx.doi.org/10.1007/978-3-030-33220-4_8}, DOI={10.1007/978-3-030-33220-4_8}, abstractNote={Path querying on Semantic Networks is gaining increased focus because of its broad applicability. Some graph databases offer support for variants of path queries e.g. shortest path. However, many applications have the need for the set version of various path problem i.e. finding paths between multiple source and multiple destination nodes (subject to different kinds of constraints). Further, the sets of source and destination nodes may be described declaratively as patterns, rather than given explicitly. Such queries lead to the requirement of integrating graph pattern matching with path problem solving. There are currently existing limitations in support of such queries (either inability to express some classes, incomplete results, inability to complete query evaluation unless graph patterns are extremely selective, etc). In this paper, we propose a framework for evaluating generalized path queries - gpqs that integrate an algebraic technique for solving path problems with SPARQL graph pattern matching. 
The integrated algebraic querying technique enables more scalable and efficient processing of gpqs, including the possibility of support for a broader range of path constraints. We present the approach and implementation strategy and compare performance and query expressiveness with a popular graph engine.}, booktitle={Lecture Notes in Computer Science}, year={2019} } @inproceedings{semantic query transformations for increased parallelization in distributed knowledge graph query processing_2019, url={http://dx.doi.org/10.1145/3295500.3356212}, DOI={10.1145/3295500.3356212}, abstractNote={Ontologies have become an increasingly popular semantic layer for integrating multiple heterogeneous datasets. However, significant challenges remain with supporting efficient and scalable processing of queries with data linked with ontologies (ontological queries). Ontological query processing queries requires explicitly defined query patterns be expanded to capture implicit ones, based on available ontology inference axioms. However, in practice such as in the biomedical domain, the complexity of the ontological axioms results in significantly large query expansions which present day query processing infrastructure cannot support. In particular, it remains unclear how to effectively parallelize such queries. In this paper, we propose data and query transformations that enable inter-operator parallelism of ontological queries on Hadoop platforms. Our transformation techniques exploit ontological axioms, second order data types and operator rewritings to eliminate expensive query substructures for increased parallelizability. 
Comprehensive experiments conducted on benchmark datasets show up to 25X performance improvement over existing approaches.}, booktitle={Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, year={2019}, month={Nov} } @inproceedings{scalable exploratory search on knowledge graphs using apache spark_2018, url={http://dx.doi.org/10.1109/wetice.2018.00036}, DOI={10.1109/wetice.2018.00036}, abstractNote={Faceted search is a popular exploratory search paradigm on Big Knowledge Graphs. Translating exploration steps into database queries for processing leads to several joins when dealing with knowledge graphs as opposed to filter conditions when dealing with structured data. Further, existing engines handle each exploration step as independent queries in spite of data dependencies that often exist between steps. In this work, we propose an incremental query execution model RAPIDFacet, that exploits the iterative nature of faceted search and reuses intermediate results. The approach is built on top of Apache Spark which naturally supports iterative models and the Nested Triplegroup Data Model and Algebra (NTGA) which uses a coarse grained data model to avoid joins. Evaluations showed up to 150x faster execution than existing approaches.}, booktitle={2018 IEEE 27th International Conference on Enabling Technologies: Infrastructure for Collaborative Enterprises (WETICE)}, year={2018}, month={Jun} } @inproceedings{kim_ravindra_anyanwu_2017, title={A semantics-aware storage framework for scalable processing of knowledge graphs on hadoop}, DOI={10.1109/bigdata.2017.8257927}, abstractNote={Knowledge graphs are graph-based data models which employ named nodes and edges to capture differentiation among entities and relationships in richly diverse data collections such as in the biomedical domain. The flexibility of knowledge graphs allows for heterogeneous collections to be linked and integrated in precise ways. 
However, resulting data models often have irregular structures which are not easy to manage using platforms for structured, schema-first data models like the relational model. To facilitate exchange, inter-operability and reuse of data, standards such as Resource Description Framework (RDF) have been increasingly adopted for representation. Domains such as the biomedical now have large collections of publicly available RDF graphs as well as benchmark workloads. To achieve scalability in data processing, some efforts are being made to build on distributed processing platforms such as Hadoop and Spark. However, while some distributed graph platforms have emerged for certain classes of mining workloads for non-semantic graphs (without typed edges and nodes), knowledge graph processing, which often involves ontological inferencing, continues to be plagued by scalability and efficiency challenges. In this paper, we present the design of a Hadoop-based storage architecture for knowledge graphs that overcomes some of the challenges of big RDF data processing. The rationale of the design strategy is to go beyond the traditional approach of exploiting structural properties of graphs while storing to include exploitation of semantic properties of knowledge graphs. Our system SemStorm is a Hadoop-based indexed, polymorphic, signatured file organization that supports efficient storage of data collections with significant data heterogeneity. Naive storage models for such data place more demands for meta-data management than traditional systems can support. The polymorphic file organization is further coupled with a nested, column-oriented file format to enable discriminatory data access based on queries. A major hallmark of SemStorm is the enabling of semantic-awareness in storage framework. 
The idea is to exploit the knowledge represented in ontologies that accompany data for optimizing data storage models such as identifying and managing data (sometimes implicit) redundancies. Another major advantage of SemStorm is that it derives optimized storage models for data autonomically, i.e., without user input. Extensive experiments conducted on real-world and synthetic benchmark datasets show that SemStorm is up to 10X faster than existing approaches.}, booktitle={2017 IEEE International Conference on Big Data (Big Data)}, author={Kim, H. and Ravindra, P. and Anyanwu, K.}, year={2017}, pages={193–202} } @inproceedings{type-based semantic optimization for scalable rdf graph pattern matching_2017, url={http://dx.doi.org/10.1145/3038912.3052655}, DOI={10.1145/3038912.3052655}, abstractNote={Scalable query processing relies on early and aggressive determination and pruning of query-irrelevant data. Besides the traditional space-pruning techniques such as indexing, type-based optimizations that exploit integrity constraints defined on the types can be used to rewrite queries into more efficient ones. However, such optimizations are only applicable in strongly-typed data and query models which make it a challenge for semi-structured models such as RDF. Consequently, developing techniques for enabling typebased query optimizations will contribute new insight to improving the scalability of RDF processing systems. In this paper, we address the challenge of type-based query optimization for RDF graph pattern queries. The approach comprises of (i) a novel type system for RDF data induced from data and ontologies and (ii) a query optimization and evaluation framework for evaluating graph pattern queries using type-based optimizations. An implementation of this approach integrated into Apache Pig is presented and evaluated. 
Comprehensive experiments conducted on real-world and synthetic benchmark datasets show that our approach is up to 500X faster than existing approaches}, booktitle={Proceedings of the 26th International Conference on World Wide Web}, year={2017}, month={Apr} } @inproceedings{ravindra_kim_anyanwu_2015, title={Rewriting complex SPARQL analytical queries for efficient cloud-based processing}, DOI={10.1109/bigdata.2015.7363738}, abstractNote={Many emerging Semantic Web applications combine and aggregate data across domains for analysis. Such analytical queries compute aggregates over multiple groupings of data, resulting in query plans with complex grouping-aggregation constraints. In the context of an RDF analytical query, each such grouping maps to a graph pattern subquery with multiple join operations, and related groups often result in overlapping graph patterns within the same query. In this paper, we propose a holistic approach to optimize RDF analytical queries by refactoring queries to achieve shared execution of common subexpressions that enables parallel evaluation of groupings as well as aggregations. Such a rewriting enables shorter execution workflows, particularly beneficial for scale-out processing on distributed Cloud systems with multiple I/O phases. Experiments on real-world and synthetic benchmarks confirm that such a rewriting can achieve more efficient execution plans when compared to relational-style SPARQL query plans executed on popular Cloud systems.}, booktitle={Proceedings 2015 IEEE International Conference on Big Data}, author={Ravindra, P. and Kim, H. and Anyanwu, K.}, year={2015}, pages={32–37} } @article{anyanwu_ravindra_kim_2014, title={Algebraic optimization of RDF graph pattern queries on MapReduce}, DOI={10.1201/b17112-7}, journal={Large Scale and Big Data: Processing and Management}, author={Anyanwu, K. and Ravindra, P. 
and Kim, H.}, year={2014}, pages={183–227} } @inbook{sessoms_anyanwu_2014, place={Berlin Heidelberg}, series={Lecture Notes in Computer Science}, title={Enabling a Package Query Paradigm on the Semantic Web: Model and Algorithms}, ISBN={9783642544255 9783662459423}, ISSN={0302-9743 1611-3349}, DOI={10.1007/978-3-642-54426-2_1}, abstractNote={The traditional search model of finding links on the Web is unsatisfactory for the increasingly complex tasks that seek to leverage the diverse, increasingly structured and semantically annotated data on the Web. A good example is when users seek to find collections or packages of resources that meet some constraints e.g., a collection of learning resources that cover some topics and have a good average rating or a collection of tourist attractions in a city such that total cost and total travel time for visiting all attractions meet the given constraints. For such queries, the goal is the return a set of constraint-qualifying collections or packages. However, using the traditional “set of links” query paradigm, such queries can only be satisfied by issuing multiple queries, reviewing answer lists and manually assembling packages to suit a user’s desired constraints.}, booktitle={Transactions on Large-Scale Data- and Knowledge-Centered Systems XIII}, publisher={Springer}, author={Sessoms, Matthew and Anyanwu, Kemafor}, editor={Hameurlain, A. and Küng, J. and Wagner, R.}, year={2014}, pages={1–32}, collection={Lecture Notes in Computer Science} } @article{ravindra_anyanwu_2014, title={Nesting Strategies for Enabling Nimble MapReduce Dataflows for Large RDF Data}, volume={10}, ISSN={1552-6291}, DOI={10.4018/ijswis.2014010101}, abstractNote={Graph and semi-structured data are usually modeled in relational processing frameworks as “thin” relations (node, edge, node) and processing such data involves a lot of join operations. 
Intermediate results of joins with multi-valued attributes or relationships, contain redundant subtuples due to repetition of single-valued attributes. The amount of redundant content is high for real-world multi-valued relationships in social network (millions of Twitter followers of popular celebrities) or biological (multiple references to related proteins) datasets. In MapReduce-based platforms such as Apache Hive and Pig, redundancy in intermediate results contributes avoidable costs to the overall I/O, sorting, and network transfer overhead of join-intensive workloads due to longer workflows. Consequently, providing techniques for dealing with such redundancy will enable more nimble execution of such workflows. This paper argues for the use of a nested data model for representing intermediate data concisely using nesting-aware dataflow operators that allow for lazy and partial unnesting strategies. This approach reduces the overall I/O and network footprint of a workflow by concisely representing intermediate results during most of a workflow's execution, until complete unnesting is absolutely necessary. The proposed strategies are integrated into Apache Pig and experimental evaluation over real-world and synthetic benchmark datasets confirms their superiority over relational-style MapReduce systems such as Apache Pig and Hive.}, number={1}, journal={INTERNATIONAL JOURNAL ON SEMANTIC WEB AND INFORMATION SYSTEMS}, author={Ravindra, Padmashree and Anyanwu, Kemafor}, year={2014}, pages={1–26} } @inbook{maali_ravindra_anyanwu_decker_2014, title={SYRql: A Dataflow Language for Large Scale Processing of RDF Data}, ISBN={9783319119632 9783319119649}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-319-11964-9_10}, DOI={10.1007/978-3-319-11964-9_10}, abstractNote={The recent big data movement resulted in a surge of activity on layering declarative languages on top of distributed computation platforms. 
In the Semantic Web realm, this surge of analytics languages was not reflected despite the significant growth in the available RDF data. Consequently, when analysing large RDF datasets, users are left with two main options: using SPARQL or using an existing non-RDF-specific big data language, both with its own limitations. The pure declarative nature of SPARQL and the high cost of evaluation can be limiting in some scenarios. On the other hand, existing big data languages are designed mainly for tabular data and, therefore, applying them to RDF data results in verbose, unreadable, and sometimes inefficient scripts. In this paper, we introduce SYRql, a dataflow language designed to process RDF data at a large scale. SYRql blends concepts from both SPARQL and existing big data languages. We formally define a closed algebra that underlies SYRql and discuss its properties and some unique optimisation opportunities this algebra provides. Furthermore, we describe an implementation that translates SYRql scripts into a series of MapReduce jobs and compare the performance to other big data processing languages.}, booktitle={The Semantic Web – ISWC 2014}, publisher={Springer International Publishing}, author={Maali, Fadi and Ravindra, Padmashree and Anyanwu, Kemafor and Decker, Stefan}, year={2014}, pages={147–163} } @inproceedings{anyanwu_2013, title={A vision for SPARQL multi-query optimization on MapReduce}, DOI={10.1109/icdew.2013.6547420}, abstractNote={MapReduce has emerged as a key component of large scale data analysis in the cloud. However, it presents challenges for SPARQL query processing because of the absence of traditional join optimization machinery like statistics, indexes and techniques for translation of join-intensive workloads to efficient MapReduce workflows. Further, MapReduce is primarily a batch processing paradigm. Therefore, it is plausible that many workloads will include a batch of queries or new queries could be generated from given queries e.g. 
due to query rewriting of inferencing queries. Consequently, the issue of multi-query optimization deserves some focus and this paper lays out a vision for rule-based multi-query optimization based on a recently proposed data model and algebra, Nested TripleGroup Data Model and Algebra, for efficient SPARQL query processing on MapReduce.}, booktitle={2013 IEEE 29th International Conference on Data Engineering Workshops (ICDEW)}, author={Anyanwu, K.}, year={2013}, pages={25–26} } @article{anyanwu_kim_ravindra_2013, title={Algebraic Optimization for Processing Graph Pattern Queries in the Cloud}, volume={17}, ISSN={1941-0131}, DOI={10.1109/mic.2012.22}, abstractNote={MapReduce platforms such as Hadoop are now the de facto standard for large-scale data processing, but they have significant limitations for join-intensive workloads typical in Semantic Web processing. This article overviews an algebraic optimization approach based on a Nested TripleGroup Data Model and Algebra (NTGA) that minimizes overall processing costs by reducing the number of MapReduce cycles. It also presents an approach for integrating NTGA-based processing of graph pattern queries into Apache Pig and compares it to execution plans using relational-style algebra operators.}, number={2}, journal={IEEE INTERNET COMPUTING}, author={Anyanwu, Kemafor and Kim, HyeongSik and Ravindra, Padmashree}, year={2013}, pages={52–61} } @inproceedings{kim_anyanwu_2013, title={Optimizing queries over semantically integrated datasets on mapreduce platforms}, DOI={10.1109/bigdata.2013.6691788}, abstractNote={Life science databases generally consist of multiple heterogeneous datasets that have been integrated using complex ontologies. Querying such databases typically involves complex graph patterns, and evaluating such patterns poses challenges when MapReduce-based platforms are used to scale up processing, translating to long execution workflows with large amount of disk and network I/O costs. 
In this poster, we focus on optimizing UNION queries (e.g., unions of conjunctives for inference) and present an algebraic interpretation of the query rewritings which are more amenable to efficient processing on MapReduce.}, booktitle={2013 IEEE International Conference on Big Data}, author={Kim, H. and Anyanwu, K.}, year={2013} } @inproceedings{fu_kim_anyanwu_2013, title={Scaling concurrency of personalized semantic search over large RDF data}, DOI={10.1109/bigdata.2013.6691622}, abstractNote={Recent keyword search techniques on Semantic Web are moving away from shallow, information retrieval-style approaches that merely find “keyword matches” towards more interpretive approaches that attempt to induce structure from keyword queries. The process of query interpretation is usually guided by structures in data, and schema and is often supported by a graph exploration procedure. However, graph exploration-based interpretive techniques are impractical for multi-tenant scenarios for large databases because separate expensive graph exploration states need to be maintained for different user queries. This leads to significant memory overhead in situations of large numbers of concurrent requests. This limitation could negatively impact the possibility of achieving the ultimate goal of personalizing search. In this paper, we propose a lightweight interpretation approach that employs indexing to improve throughput and concurrency with much less memory overhead. It is also more amenable to distributed or partitioned execution. The approach is implemented in a system called “SKI” and an experimental evaluation of SKI's performance on the DBPedia and Billion Triple Challenge datasets shows orders-of-magnitude performance improvement over existing techniques.}, booktitle={2013 IEEE International Conference on Big Data}, author={Fu, H. Z. and Kim, H. 
and Anyanwu, K.}, year={2013} } @inbook{sessoms_anyanwu_2013, title={SkyPackage: From Finding Items to Finding a Skyline of Packages on the Semantic Web}, ISBN={9783642379956 9783642379963}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-37996-3_4}, DOI={10.1007/978-3-642-37996-3_4}, abstractNote={Enabling complex querying paradigms over the wealth of available Semantic Web data will significantly impact the relevance and adoption of Semantic Web technologies in a broad range of domains. While the current predominant paradigm is to retrieve a list of items, in many cases the actual intent is satisfied by reviewing the lists and assembling compatible items into lists or packages of resources such that each package collectively satisfies the need, such as assembling different collections of places to visit during a vacation. Users may place constraints on individual items, and the compatibility of items within a package is based on global constraints placed on packages, like total distance or time to travel between locations in a package. Finding such packages using the traditional item-querying model requires users to review lists of possible multiple queries and assemble and compare packages manually. In this paper, we propose three algorithms for supporting such a package query model as a first class paradigm. Since package constraints may involve multiple criteria, several competing packages are possible. Therefore, we propose the idea of computing a skyline of package results as an extension to a popular query model for multi-criteria decision-making called skyline queries, which to date has only focused on computing item skylines. We formalize the semantics of the logical query operator, Sky-Package, and propose three algorithms for the physical operator implementation. 
A comparative evaluation of the algorithms over real world and synthetic-benchmark RDF datasets is provided.}, booktitle={Semantic Technology}, publisher={Springer Berlin Heidelberg}, author={Sessoms, Matthew and Anyanwu, Kemafor}, year={2013}, pages={49–64} } @inbook{hong_anyanwu_2012, title={HIP: Information Passing for Optimizing Join-Intensive Data Processing Workloads on Hadoop}, ISBN={9783642325960 9783642325977}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-32597-7_33}, DOI={10.1007/978-3-642-32597-7_33}, abstractNote={Hadoop-based data processing platforms translate join intensive queries into multiple “jobs” (MapReduce cycles). Such multi-job workflows lead to a significant amount of data movement through the disk, network and memory fabric of a Hadoop cluster which could negatively impact performance and scalability. Consequently, techniques that minimize sizes of intermediate results will be useful in this context. In this paper, we present an information passing technique (HIP) that can minimize the size of intermediate data on Hadoop-based data processing platforms.}, booktitle={Lecture Notes in Computer Science}, publisher={Springer Berlin Heidelberg}, author={Hong, Seokyong and Anyanwu, Kemafor}, year={2012}, pages={384–391} } @inproceedings{scan-sharing for optimizing rdf graph pattern matching on mapreduce_2012, url={http://dx.doi.org/10.1109/cloud.2012.14}, DOI={10.1109/cloud.2012.14}, abstractNote={Recently, the number and size of RDF data collections has increased rapidly making the issue of scalable processing techniques crucial. The MapReduce model has become a de facto standard for large scale data processing using a cluster of machines in the cloud. Generally, RDF query processing creates join-intensive workloads, resulting in lengthy MapReduce workflows with expensive I/O, data transfer, and sorting costs. 
However, the MapReduce computation model provides limited static optimization techniques used in relational databases (e.g., indexing and cost-based optimization). Consequently, dynamic optimization techniques for such join-intensive tasks on MapReduce need to be investigated. In some previous efforts, we propose a Nested Triple Group data model and Algebra (NTGA) for efficient graph pattern query processing in the cloud. Here, we extend this work with a scan-sharing technique that is used to optimize the processing of graph patterns with repeated properties. Specifically, our scan-sharing technique eliminates the need for repeated scanning of input relations when properties are used repeatedly in graph patterns. A formal foundation underlying this scan sharing technique is discussed as well as an implementation strategy that has been integrated in the Apache Pig framework is presented. We also present a comprehensive evaluation demonstrating performance benefits of our NTGA plus scan-sharing approach.}, booktitle={2012 IEEE Fifth International Conference on Cloud Computing}, year={2012}, month={Jun} } @inproceedings{to_nest_or_not_to_nest_2012, url={http://dx.doi.org/10.1145/2237867.2237872}, DOI={10.1145/2237867.2237872}, abstractNote={Many queries on RDF datasets involve triple patterns whose properties are multi-valued. When processing such queries using flat data models and their associated algebras, intermediate results could contain a lot of redundancy. In the context of processing using MapReduce based platforms such as Hadoop, such redundancy could account for a non-trivial proportion of overall disk I/O, sorting and network data transfer costs. Further, when MapReduce workflows consist of multiple cycles as is typical when processing RDF graph pattern queries, these costs could compound over multiple cycles. However, it may be possible to avoid such overhead if nested data models and algebras are used. 
In this short paper, we present some on-going research into the use of a nested TripleGroup data model and Algebra (NTGA) for MapReduce based RDF graph processing. The NTGA operators fully subscribe to the NTG data model. This is in contrast to systems such as Pig where the data model supports some nesting but the algebra is primarily tuple based (requiring the flattening of nested objects before other operators can be applied). This full subscription to the nested data model by NTGA also enables support for different unnesting strategies including delayed and partial unnesting. We present a preliminary evaluation of these strategies for efficient management of multi-valued properties while processing graph pattern queries in Apache Pig.}, booktitle={Proceedings of the 4th International Workshop on Semantic Web Information Management - SWIM '12}, year={2012} } @inbook{ravindra_kim_anyanwu_2011, title={An Intermediate Algebra for Optimizing RDF Graph Pattern Matching on MapReduce}, ISBN={9783642210631 9783642210648}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-21064-8_4}, DOI={10.1007/978-3-642-21064-8_4}, abstractNote={Existing MapReduce systems support relational style join operators which translate multi-join query plans into several Map-Reduce cycles. This leads to high I/O and communication costs due to the multiple data transfer steps between map and reduce phases. SPARQL graph pattern matching is dominated by join operations, and is unlikely to be efficiently processed using existing techniques. This cost is prohibitive for RDF graph pattern matching queries which typically involve several join operations. In this paper, we propose an approach for optimizing graph pattern matching by reinterpreting certain join tree structures as grouping operations. This enables a greater degree of parallelism in join processing resulting in more “bushy” like query execution plans with fewer Map-Reduce cycles. 
This approach requires that the intermediate results are managed as sets of groups of triples or TripleGroups. We therefore propose a data model and algebra - Nested TripleGroup Algebra for capturing and manipulating TripleGroups. The relationship with the traditional relational style algebra used in Apache Pig is discussed. A comparative performance evaluation of the traditional Pig approach and RAPID+ (Pig extended with NTGA) for graph pattern matching queries on the BSBM benchmark dataset is presented. Results show up to 60% performance improvement of our approach over traditional Pig for some tasks.}, booktitle={The Semanic Web: Research and Applications}, publisher={Springer Berlin Heidelberg}, author={Ravindra, Padmashree and Kim, HyeongSik and Anyanwu, Kemafor}, year={2011}, pages={46–61} } @inproceedings{cosi_2011, url={http://dx.doi.org/10.1145/1963192.1963291}, DOI={10.1145/1963192.1963291}, abstractNote={The demo will present CoSi, a system that enables context-sensitive interpretation of keyword queries on RDF databases. The techniques for representing, managing and exploiting query history are central to achieving this objective. The demonstration will show the effectiveness of our approach for capturing a user's querying context from their query history. Further, it will show how context is utilized to influence the interpretation of a new query. 
The demonstration is based on DBPedia, the RDF representation of Wikipedia.}, booktitle={Proceedings of the 20th international conference companion on World wide web - WWW '11}, year={2011} } @inbook{fu_anyanwu_2011, title={Effectively Interpreting Keyword Queries on RDF Databases with a Rear View}, ISBN={9783642250729 9783642250736}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-25073-6_13}, DOI={10.1007/978-3-642-25073-6_13}, abstractNote={Effective techniques for keyword search over RDF databases incorporate an explicit interpretation phase that maps keywords in a keyword query to structured query constructs. Because of the ambiguity of keyword queries, it is often not possible to generate a unique interpretation for a keyword query. Consequently, heuristics geared toward generating the top-K likeliest user-intended interpretations have been proposed. However, heuristics currently proposed fail to capture any user-dependent characteristics, but rather depend on database-dependent properties such as occurrence frequency of subgraph pattern connecting keywords. This leads to the problem of generating top-K interpretations that are not aligned with user intentions. In this paper, we propose a context-aware approach for keyword query interpretation that personalizes the interpretation process based on a user’s query context. Our approach addresses the novel problem of using a sequence of structured queries corresponding to interpretations of keyword queries in the query history as contextual information for biasing the interpretation of a new query. 
Experimental results presented over DBPedia dataset show that our approach outperforms the state-of-the-art technique on both efficiency and effectiveness, particularly for ambiguous queries.}, booktitle={The Semantic Web – ISWC 2011}, publisher={Springer Berlin Heidelberg}, author={Fu, Haizhou and Anyanwu, Kemafor}, year={2011}, pages={193–208} } @inbook{chen_gao_anyanwu_2011, title={Efficiently Evaluating Skyline Queries on RDF Databases}, ISBN={9783642210631 9783642210648}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-21064-8_9}, DOI={10.1007/978-3-642-21064-8_9}, abstractNote={Skyline queries are a class of preference queries that compute the pareto-optimal tuples from a set of tuples and are valuable for multi-criteria decision making scenarios. While this problem has received significant attention in the context of single relational table, skyline queries over joins of multiple tables that are typical of storage models for RDF data has received much less attention. A naïve approach such as a join-first-skyline-later strategy splits the join and skyline computation phases which limit opportunities for optimization. Other existing techniques for multi-relational skyline queries assume storage and indexing techniques that are not typically used with RDF which would require a preprocessing step for data transformation. In this paper, we present an approach for optimizing skyline queries over RDF data stored using a vertically partitioned schema model. It is based on the concept of a "Header Point" which maintains a concise summary of the already visited regions of the data space. This summary allows some fraction of non-skyline tuples to be pruned from advancing to the skyline processing phase, thus reducing the overall cost of expensive dominance checks required in the skyline phase. 
We further present more aggressive pruning rules that result in the computation of near-complete skylines in significantly less time than the complete algorithm. A comprehensive performance evaluation of different algorithms is presented using datasets with different types of data distributions generated by a benchmark data generator.}, booktitle={The Semantic Web: Research and Applications}, publisher={Springer Berlin Heidelberg}, author={Chen, Ling and Gao, Sidan and Anyanwu, Kemafor}, year={2011}, pages={123–138} } @article{kim_ravindra_anyanwu_2011, title={From SPARQL to MapReduce: The Journey Using a Nested TripleGroup Algebra}, volume={4}, url={http://www.vldb.org/pvldb/vol4/p1426-kim.pdf}, number={12}, journal={Proc. VLDB Endow.}, author={Kim, HyeongSik and Ravindra, Padmashree and Anyanwu, Kemafor}, year={2011}, pages={1426–1429} } @inproceedings{an_agglomerative_query_model_for_discovery_in_linked_data_2010, title={An Agglomerative Query Model for Discovery in Linked Data}, url={http://dx.doi.org/10.1145/1859127.1859131}, DOI={10.1145/1859127.1859131}, abstractNote={Data on the Web is increasingly being used for discovery and exploratory tasks. Unlike traditional fact-finding tasks that require only the typical single-query and response paradigm, these tasks involve a multistage search process in which bits of information are accumulated over a series of related queries. The ability and effectiveness of users to connect the dots between these pieces of information are crucial to enable discovery. In this paper, we introduce the notion of agglomerative querying for supporting "search processes" and present its motivation, challenges and formalization. We focus on a specific class of agglomerative querying called association agglomerative querying which is very natural for linked data models such as RDF. We present a preliminary implementation approach for processing such queries and discuss its relationship with SPARQL query processing. 
Finally, we present empirical results for proving the effectiveness of our approach on the DBLP dataset and future directions.}, booktitle={Proceedings of the 13th International Workshop on the Web and Databases - WebDB '10}, year={2010} } @inproceedings{fu_gao_anyanwu_2010, title={Disambiguating Keyword Queries on RDF Databases Using "Deep" Segmentation}, DOI={10.1109/ICSC.2010.90}, abstractNote={Keyword search on (semi)structured databases is an increasingly popular research topic. But existing techniques do not deal well with the problems presented by the queries that are ambiguous. Recent approaches for RDF databases try to improve the quality of results by introducing an explicit top-k “interpretation” phase in which queries are translated into an ordered list of “most likely intended” structured (SPARQL) queries before query execution. However, even these recent techniques only address keyword query ambiguity in a limited fashion by identifying fine-grained semantic units or segments of a query. This enables some reduction in the space of interpretations, pruning away incorrect interpretations, but the reduction in interpretation space is not as aggressive as it could be. In this paper, we propose a “deep segmentation” technique for keyword queries issued against an RDF database. This approach achieves a more aggressive pruning of irrelevant interpretations from the space of interpretations considered and therefore produces better quality query interpretations even in the presence of significant query ambiguity. We present results for a comprehensive human-based evaluation that is based on a metric that we introduce called degree of ambiguity (DOTA) that has not been considered by previous efforts. The experimental results show that our approach outperforms existing techniques in terms of quality even when queries are very ambiguous.}, booktitle={2010 IEEE Fourth International Conference on Semantic Computing}, author={Fu, H. and Gao, S. 
and Anyanwu, K.}, year={2010}, pages={236–243} } @inbook{li_panetto_berio_anyanwu_2010, place={Berlin Heidelberg}, series={Lecture Notes in Computer Science}, title={EI2N’10 \& SeDeS’10 - PC Co-chairs Message}, ISBN={9783642169601 9783642169618}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-16961-8_35}, DOI={10.1007/978-3-642-16961-8_35}, abstractNote={After the successful fourth edition in 2009, the fifth edition of the Enterprise Integration, Interoperability and Networking workshop (EI2N’2010) has been organized as part of the OTM’2010 Federated Conferences and is supported by the IFAC Technical Committee 5.3 ”Enterprise Integration and Networking”, the IFIP TC 8 WG 8.1 ”Design and Evaluation of Information Systems”, the SIG INTEROP Grande-Région on ”Enterprise Systems Interoperability” and the French CNRS National Research Group GDR MACS.}, booktitle={On the Move to Meaningful Internet Systems: OTM 2010 Workshops}, publisher={Springer}, author={Li, Qing and Panetto, Hervé and Berio, Giuseppe and Anyanwu, Kemafor}, editor={Meersman, R. and Dillon, T. 
and Herrero, P.}, year={2010}, pages={180–181}, collection={Lecture Notes in Computer Science} } @inbook{li_panetto_berio_anyanwu_2010a, place={Berlin Heidelberg}, series={Lecture Notes in Computer Science}, title={EI2N’10 \& SeDeS’10 - PC Co-chairs Message}, ISBN={9783642169601 9783642169618}, ISSN={0302-9743 1611-3349}, url={http://dx.doi.org/10.1007/978-3-642-16961-8_79}, DOI={10.1007/978-3-642-16961-8_79}, abstractNote={After the successful fourth edition in 2009, the fifth edition of the Enterprise Integration, Interoperability and Networking workshop (EI2N’2010) has been organised as part of the OTM’2010 Federated Conferences and is supported by the IFAC Technical Committee 5.3 ”Enterprise Integration and Networking”, the IFIP TC 8 WG 8.1 ”Design and Evaluation of Information Systems”, the SIG INTEROP Grande-Région on ”Enterprise Systems Interoperability” and the French CNRS National Research Group GDR MACS. Collaboration is necessary for enterprises to prosper in the current extreme dynamic and heterogeneous business environment. Enterprise integration, interoperability and networking are the major disciplines that have studied how to do companies to collaborate and communicate in the most effective way. These disciplines are well-established and are supported by international conferences, initiatives, groups, task forces and governmental projects all over the world where different domains of knowledge have been considered from different points of views and a variety of objectives (e.g., technological or managerial). Enterprise Integration involves breaking down organizational barriers to improve synergy within the enterprise so that business goals are achieved in a more productive and efficient way. 
The past decade of enterprise integration research and industrial implementation has seen the emergence of important new areas, such as research into interoperability and networking, which involve breaking down organizational barriers to improve synergy within the enterprise and among enterprises. The ambition to achieve dynamic, efficient and effective cooperation of enterprises within networks of companies, or in an entire industry sector, requires the improvement of existing, or the development of new, theories and technologies. Enterprise Modelling, Architecture, and semantic techniques are the pillars supporting the achievement of Enterprise Integration and Interoperability. Internet of Things and Cloud Computing now present new opportunities to realize inter enterprise and intra enterprise integration. For these reasons, the workshop’s objective is to foster discussions among representatives of these neighbouring disciplines and to discover new research paths within the enterprise integration community. After peer reviews, 6 papers have been accepted out of 12 submissions to this workshop. Prof. Michael Sobolewski (Polish-Japanese Institute of IT, Poland) has been invited as EI2N plenary keynote on ”Exerted Enterprise Computing: from Protocol-oriented Networking to Exertion-oriented Networking”. In addition to the presentations of the accepted papers, groups have been organised into what E2IN traditionally calls ”workshop cafés”, to discuss and debate the presented topics. This year discussion enabled putting forward new research related to ”interoperability issues in collaborative information systems”. These groups reported the results of the respective discussions during a plenary session that was jointly organised with the CoopIS’2010 conference, in order to share the vision for future research on this top domain. 
The papers published in this volume of proceedings present samples of current research in the enterprise modelling, systems interoperability, services management, cloud integration and, more globally, systems engineering and enterprise architecture domains. Some new architecting principles that has gained currency in the recent past is semantic technique, service oriented architecture and cloud computing with their principles, reference models and technology, and if applied correctly can be an important contributor to the future of interoperable, networked and collaborative enterprises. The success of this complex field also depends on the maturity and coherency of the management of the involved enterprises, a topic covered by the second workshop café. As a special track of EI2N’2010, SeDeS’2010 is the first international workshop on Semantics & Decision Support. The call for papers saw 12 submissions, among which the Programme Committee has selected 4 papers to be presented at EI2N’2010. The selected papers cover the topics of ontology-based decision making applications in the fields of eGovernment, eLearning, business rule management and Human Resource Management.}, booktitle={On the Move to Meaningful Internet Systems: OTM 2010 Workshops}, publisher={Springer}, author={Li, Qing and Panetto, Hervé and Berio, Giuseppe and Anyanwu, Kemafor}, editor={Meersman, R. and Dillon, T. and Herrero, P.}, year={2010}, pages={563–564}, collection={Lecture Notes in Computer Science} } @inproceedings{sridhar_ravindra_anyanwu_2009, title={{RAPID}: Enabling scalable ad-hoc analytics on the semantic web}, volume={5823}, url={http://dx.doi.org/10.1007/978-3-642-04930-9_45}, DOI={10.1007/978-3-642-04930-9_45}, abstractNote={As the amount of available RDF data continues to increase steadily, there is growing interest in developing efficient methods for analyzing such data. 
While recent efforts have focused on developing efficient methods for traditional data processing, analytical processing which typically involves more complex queries has received much less attention. The use of cost effective parallelization techniques such as Google’s Map-Reduce offer significant promise for achieving Web scale analytics. However, currently available implementations are designed for simple data processing on structured data. In this paper, we present a language, RAPID, for scalable ad-hoc analytical processing of RDF data on Map-Reduce frameworks. It builds on Yahoo’s Pig Latin by introducing primitives based on a specialized join operator, the MD-join, for expressing analytical tasks in a manner that is more amenable to parallel processing, as well as primitives for coping with semi-structured nature of RDF data. Experimental evaluation results demonstrate significant performance improvements for analytical processing of RDF data over existing Map-Reduce based techniques.}, booktitle={Semantic web - iswc 2009, proceedings}, author={Sridhar, R. and Ravindra, P. and Anyanwu, K.}, year={2009}, pages={715–730} } @inbook{maduko_anyanwu_sheth_schliekelman_2008, place={Berlin Heidelberg}, series={Lecture Notes in Computer Science}, title={Graph Summaries for Subgraph Frequency Estimation}, ISBN={9783540682332 9783540682349}, url={http://dx.doi.org/10.1007/978-3-540-68234-9_38}, DOI={10.1007/978-3-540-68234-9_38}, abstractNote={A fundamental problem related to graph structured databases is searching for substructures. One issue with respect to optimizing such searches is the ability to estimate the frequency of substructures within a query graph. In this work, we present and evaluate two techniques for estimating the frequency of subgraphs from a summary of the data graph. In the first technique, we assume that edge occurrences on edge sequences are position independent and summarize only the most informative dependencies. 
In the second technique, we prune small subgraphs using a valuation scheme that blends information about their importance and estimation power. In both techniques, we assume conditional independence to estimate the frequencies of larger subgraphs. We validate the effectiveness of our techniques through experiments on real and synthetic datasets.}, booktitle={The Semantic Web: Research and Applications. ESWC 2008.}, publisher={Springer}, author={Maduko, Angela and Anyanwu, Kemafor and Sheth, Amit and Schliekelman, Paul}, editor={Bechhofer, S. and Hauswirth, M. and Hoffmann, J. and Koubarakis, M.}, year={2008}, month={May}, pages={508–523}, collection={Lecture Notes in Computer Science} } @inproceedings{anyanwu_murukannaiah_maduko_2008, title={Structure Discovery Queries in Disk-Based Semantic Web Databases}, DOI={10.1109/SKG.2008.108}, abstractNote={Link analysis tasks are fundamental to analytical applications in scientific research, business, national security, etc. Such tasks involve finding associations or interactions between entities e.g. people, chemical or genes. In graph theoretic terms, this amounts to finding arbitrary sub-graph structures that link a given set of entities. On the other hand, the traditional graph pattern matching query paradigm focuses on finding sub-graphs that match the structure given in a query. Consequently, an important problem is developing methods for evaluating such queries, particularly, when data resides on disk. In such cases, query processing techniques must avoid loading the whole database graph into memory and must utilize indexing and query processing techniques that mitigate the inherently I/O bound nature of navigating disk based graphs. In this paper, we present a computational framework for efficiently evaluating a class of structure discovery queries. 
It is based on an algebraic approach to solving path problems that leads to a natural disk storage model for graph data using traditional B+ tree index structures. We present some very promising preliminary evaluation results which show a very significant improvement in query performance over other approaches.}, booktitle={2008 Fourth International Conference on Semantics, Knowledge and Grid}, author={Anyanwu, K. and Murukannaiah, P. K. and Maduko, A.}, year={2008}, pages={336–342} } @inproceedings{estimating_the_cardinality_of_rdf_graph_patterns_2007, title={Estimating the Cardinality of {RDF} Graph Patterns}, url={http://dx.doi.org/10.1145/1242572.1242782}, DOI={10.1145/1242572.1242782}, abstractNote={Most RDF query languages allow for graph structure search through a conjunction of triples which is typically processed using join operations. A key factor in optimizing joins is determining the join order which depends on the expected cardinality of intermediate results. This work proposes a pattern-based summarization framework for estimating the cardinality of RDF graph patterns. We present experiments on real world and synthetic datasets which confirm the feasibility of our approach.}, booktitle={Proceedings of the 16th international conference on World Wide Web - WWW '07}, year={2007} } @inproceedings{sparq2l_2007, url={http://dx.doi.org/10.1145/1242572.1242680}, DOI={10.1145/1242572.1242680}, abstractNote={Many applications in analytical domains often have the need to "connect the dots" i.e., query about the structure of data. In bioinformatics for example, it is typical to want to query about interactions between proteins. The aim of such queries is to "extract" relationships between entities i.e. paths from a data graph. Often, such queries will specify certain constraints that qualifying results must satisfy e.g. paths involving a set of mandatory nodes. 
Unfortunately, most present day Semantic Web query languages including the current draft of the anticipated recommendation SPARQL, lack the ability to express queries about arbitrary path structures in data. In addition, many systems that support some limited form of path queries rely on main memory graph algorithms limiting their applicability to very large scale graphs. In this paper, we present an approach for supporting Path Extraction queries. Our proposal comprises (i) a query language SPARQ2L which extends SPARQL with path variables and path variable constraint expressions, and (ii) a novel query evaluation framework based on efficient algebraic techniques for solving path problems which allows for path queries to be efficiently evaluated on disk resident RDF graphs. The effectiveness of our proposal is demonstrated by a performance evaluation of our approach on both real world based and synthetic dataset.}, booktitle={Proceedings of the 16th international conference on World Wide Web - WWW '07}, year={2007} } @inproceedings{semrank_2005, url={http://dx.doi.org/10.1145/1060745.1060766}, DOI={10.1145/1060745.1060766}, abstractNote={While the idea that querying mechanisms for complex relationships (otherwise known as Semantic Associations) should be integral to Semantic Web search technologies has recently gained some ground, the issue of how search results will be ranked remains largely unaddressed. Since it is expected that the number of relationships between entities in a knowledge base will be much larger than the number of entities themselves, the likelihood that Semantic Association searches would result in an overwhelming number of results for users is increased, therefore elevating the need for appropriate ranking schemes. Furthermore, it is unlikely that ranking schemes for ranking entities (documents, resources, etc.) 
may be applied to complex structures such as Semantic Associations. In this paper, we present an approach that ranks results based on how predictable a result might be for users. It is based on a relevance model SemRank, which is a rich blend of semantic and information-theoretic techniques with heuristics that supports the novel idea of modulative searches, where users may vary their search modes to effect changes in the ordering of results depending on their need. We also present the infrastructure used in the SSARK system to support the computation of SemRank values for resulting Semantic Associations and their ordering.}, booktitle={Proceedings of the 14th international conference on World Wide Web - WWW '05}, year={2005} } @article{semantic_association_identification_and_knowledge_discovery_for_national_security_applications_2005, title={Semantic Association Identification and Knowledge Discovery for National Security Applications}, url={http://dx.doi.org/10.4018/jdm.2005010103}, DOI={10.4018/jdm.2005010103}, abstractNote={Public and private organizations have access to a vast amount of internal, deep Web and open Web information. Transforming this heterogeneous and distributed information into actionable and insightful information is the key to the emerging new classes of business intelligence and national security applications. Although the role of semantics in search and integration has been often talked about, in this paper we discuss semantic approaches to support analytics on vast amounts of heterogeneous data. In particular, we bring together novel academic research and commercialized Semantic Web technology. The academic research related to semantic association identification is built upon commercial Semantic Web technology for semantic metadata extraction. 
A prototypical demonstration of this research and technology is presented in the context of an aviation security application of significance to national security.}, journal={Journal of Database Management}, year={2005}, month={Jan} } @inproceedings{rho_queries_2003, title={{$\rho$}-Queries: Enabling Querying for Semantic Associations on the Semantic Web}, url={http://dx.doi.org/10.1145/775152.775249}, DOI={10.1145/775152.775249}, abstractNote={This paper presents the notion of Semantic Associations as complex relationships between resource entities. These relationships capture both a connectivity of entities as well as similarity of entities based on a specific notion of similarity called r-isomorphism. It formalizes these notions for the RDF data model, by introducing a notion of a Property Sequence as a type. In the context of a graph model such as that for RDF, Semantic Associations amount to specific certain graph signatures. Specifically, they refer to sequences (i.e. directed paths) here called Property Sequences, between entities, networks of Property Sequences (i.e. undirected paths), or subgraphs of r-isomorphic Property Sequences. The ability to query about the existence of such relationships is fundamental to tasks in analytical domains such as national security and business intelligence, where tasks often focus on finding complex yet meaningful and obscured relationships between entities. However, support for such queries is lacking in contemporary query systems, including those for RDF.}, booktitle={Proceedings of the twelfth international conference on World Wide Web - WWW '03}, year={2003} } @article{rho_operator_2002, title={The {$\rho$} Operator}, url={http://dx.doi.org/10.1145/637411.637418}, DOI={10.1145/637411.637418}, abstractNote={In this paper, we introduce an approach that supports querying for Semantic Associations on the Semantic Web. 
Semantic Associations capture complex relationships between entities involving sequences of predicates, and sets of predicate sequences that interact in complex ways. Detecting such associations is at the heart of many research and analytical activities that are crucial to applications in national security and business intelligence. This in combination with the improving ability to identify entities in documents as part of automatic semantic annotation, gives a very powerful capability for semantic analysis of large amounts of heterogeneous content.The approach for supporting Semantic Associations discussed in this paper has four main facets. First, it generalizes these associations into three main classes based on their structural properties, allowing us to reason about them in a domain-independent manner. The second is the provision of an operator ρ for expressing queries about such associations. Third, it uses a graph data model for knowledge representation, allowing the semantic associations search techniques to be built upon the graph algorithms for paths, while integrating knowledge from the schema into the search process. The fourth facet is the use of a notion of context, which allows for restricting the search space and for context-driven ranking of results. Just as a Web search engine looks for relevant documents in the current Web, ρ can be seen as discovering and ranking complex relationships in the Semantic Web.In this paper, we demonstrate the need for supporting such complex semantic relationships. We also give a formal basis to the notion of Semantic Associations and give a brief discussion on our overall approach for discovering and ranking them.}, journal={ACM SIGMOD Record}, year={2002}, month={Dec} }