@article{abedin_chubet_gibney_thankachan_2023, title={Contextual Pattern Matching in Less Space}, ISSN={["1068-0314"]}, DOI={10.1109/DCC55655.2023.00024}, abstractNote={We revisit the Contextual Pattern Matching Problem, defined as follows: preprocess a text T[1, n], so that given a query consisting of a string P and a length P, the occurrences of all distinct strings XPY where |X|=|Y|=P can be reported. This problem was introduced by Navarro, who presented an O($\overline{r}\log(n/\overline{r}))$ space data structure, where $\overline{r}$ is the maximum of the number of runs in the BWT of the text $\mathrm{T}[1,n]$ and its reverse. His solution reports all c contextual occurrences in $O(|P|+c\log n)$ time. However, the only known bounds on $\overline{r}$ are $\overline{r}=O(r\log^{2}n)$ where r is the number of runs in the BWT of T, making it desirable to avoid using structures with space dependent on $\overline{r}$. We demonstrate that this is possible without a significant sacrifice in query time by providing an $O(r\log(n/r))$ space solution that answers queries in $O(|P|+c\log P\cdot\log(n/r))$ time.}, journal={2023 DATA COMPRESSION CONFERENCE, DCC}, author={Abedin, Paniz and Chubet, Oliver and Gibney, Daniel and Thankachan, Sharma V.}, year={2023}, pages={160–167} } @article{gibney_macnichol_thankachan_2023, title={Non-overlapping Indexing in BWT-Runs Bounded Space}, volume={14240}, ISBN={["978-3-031-43979-7"]}, ISSN={["1611-3349"]}, DOI={10.1007/978-3-031-43980-3_21}, abstractNote={We revisit the non-overlapping indexing problem for an efficient repetition-aware solution. The problem is to index a text T[1..n], such that whenever a pattern P[1..p] comes as a query, we can report the largest set of non-overlapping occurrences of P in T. A previous index by Cohen and Porat [ISAAC 2009] takes linear space and optimal $$O(p+\mathsf {occ_{no}})$$ query time, where $$\mathsf {occ_{no}}$$ denotes the output size. We present an index of size O(r), where r denotes the number of runs in the Burrows Wheeler Transform (BWT) of T. The parameter r is significantly smaller than n for highly repetitive texts. The query time of our index is $$O(p\log \log _w \sigma +\textsf{sort}(\mathsf {occ_{no}}))$$ , where $$\sigma $$ denotes the alphabet size, w denotes the machine word size in bits and $$\textsf{sort}(x)$$ denotes the time for sorting x integers within the range [1, n].}, journal={STRING PROCESSING AND INFORMATION RETRIEVAL, SPIRE 2023}, author={Gibney, Daniel and Macnichol, Paul and Thankachan, Sharma V.}, year={2023}, pages={260–270} } @article{shah_sheng_thankachan_vitter_2023, title={Ranked Document Retrieval in External Memory}, volume={19}, ISSN={["1549-6333"]}, DOI={10.1145/3559763}, abstractNote={ The ranked (or top- k ) document retrieval problem is defined as follows: preprocess a collection {T 1 ,T 2 ,… ,T d } of d strings (called documents) of total length n into a data structure, such that for any given query (P,k) , where P is a string (called pattern) of length p ≥ 1 and k ∈ [1,d] is an integer, the identifiers of those k documents that are most relevant to P can be reported, ideally in the sorted order of their relevance. The seminal work by Hon et al. [FOCS 2009 and Journal of the ACM 2014] presented an O(n) -space (in words) data structure with O(p+k log k) query time. The query time was later improved to O(p+k) [SODA 2012] and further to O(p/ log σn+k ) [SIAM Journal on Computing 2017] by Navarro and Nekrich, where σ is the alphabet size. We revisit this problem in the external memory model and present three data structures. The first one takes O(n) -space and answer queries in O(p/B + log B n + k/B+ log * (n/B) ) I/Os, where B is the block size. The second one takes O(n log * (n/B) ) space and answer queries in optimal O(p/B + log B n + k/B) I/Os. In both cases, the answers are reported in the unsorted order of relevance. To handle sorted top- k document retrieval, we present an O(n log (d/B)) space data structure with optimal query cost. }, number={1}, journal={ACM TRANSACTIONS ON ALGORITHMS}, author={Shah, Rahul and Sheng, Cheng and Thankachan, Sharma and Vitter, Jeffrey}, year={2023}, month={Jan} } @article{gibney_thankachan_aluru_2022, title={On the Hardness of Sequence Alignment on De Bruijn Graphs}, ISSN={["1557-8666"]}, DOI={10.1089/cmb.2022.0411}, abstractNote={The problem of aligning a sequence to a walk in a labeled graph is of fundamental importance to Computational Biology. For an arbitrary graph G=(V,E) and a pattern P of length m, a lower bound based on the Strong Exponential Time Hypothesis implies that an algorithm for finding a walk in G exactly matching P significantly faster than O(|E|m) time is unlikely. However, for many special graphs, such as de Bruijn graphs, the problem can be solved in linear time. For approximate matching, the picture is more complex. When edits (substitutions, insertions, and deletions) are only allowed to the pattern, or when the graph is acyclic, the problem is solvable in O(|E|m) time. When edits are allowed to arbitrary cyclic graphs, the problem becomes NP-complete, even on binary alphabets. Moreover, NP-completeness continues to hold even when edits are restricted to only substitutions. Despite the popularity of the de Bruijn graphs in Computational Biology, the complexity of approximate pattern matching on the de Bruijn graphs remained unknown. We investigate this problem and show that the properties that make the de Bruijn graphs amenable to efficient exact pattern matching do not extend to approximate matching, even when restricted to the substitutions only case with alphabet size four. Specifically, we prove that determining the existence of a matching walk in a de Bruijn graph is NP-complete when substitutions are allowed to the graph. We also demonstrate that an algorithm significantly faster than O(|E|m) is unlikely for the de Bruijn graphs in the case where substitutions are only allowed to the pattern. This stands in contrast to pattern-to-text matching where exact matching is solvable in linear time, such as on the de Bruijn graphs, but approximate matching under substitutions is solvable in subquadratic Õ(nm) time, where n is the text's length.}, journal={JOURNAL OF COMPUTATIONAL BIOLOGY}, author={Gibney, Daniel and Thankachan, Sharma V. and Aluru, Srinivas}, year={2022}, month={Nov} }