@article{bennett_martin_lahiri_2022, title={Fitting sparse Markov models through a collapsed Gibbs sampler}, ISSN={["1613-9658"]}, DOI={10.1007/s00180-022-01310-8}, journal={COMPUTATIONAL STATISTICS}, author={Bennett, Iris and Martin, Donald E. K. and Lahiri, Soumendra Nath}, year={2022}, month={Dec} } @article{martin_2020, title={Distributions of pattern statistics in sparse Markov models}, volume={72}, ISSN={0020-3157 1572-9052}, url={http://dx.doi.org/10.1007/S10463-019-00714-6}, DOI={10.1007/s10463-019-00714-6}, number={4}, journal={Annals of the Institute of Statistical Mathematics}, publisher={Springer Science and Business Media LLC}, author={Martin, Donald E. K.}, year={2020}, month={Aug}, pages={895–913} } @article{martin_2019, title={Computation of exact probabilities associated with overlapping pattern occurrences}, volume={11}, ISSN={["1939-0068"]}, DOI={10.1002/wics.1477}, abstractNote={Searching for patterns in data is important because it can lead to the discovery of sequence segments that play a functional role. The complexity of pattern statistics that are used in data analysis and the need of the sampling distribution of those statistics for inference renders efficient computation methods as paramount. This article gives an overview of the main methods used to compute distributions of statistics of overlapping pattern occurrences, specifically, generating functions, correlation functions, the Goulden‐Jackson cluster method, recursive equations, and Markov chain embedding. The underlying data sequence will be assumed to be higher‐order Markovian, which includes sparse Markov models and variable length Markov chains as special cases. Also considered will be recent developments for extending the computational capabilities of the Markov chain‐based method through an algorithm for minimizing the size of the chain's state space, as well as improved data modeling capabilities through sparse Markov models. An application to compute a distribution used as a test statistic in sequence alignment will serve to illustrate the usefulness of the methodology.}, number={5}, journal={WILEY INTERDISCIPLINARY REVIEWS-COMPUTATIONAL STATISTICS}, author={Martin, Donald E. K.}, year={2019}, month={Sep} } @article{martin_2019, title={Minimal auxiliary Markov chains through sequential elimination of states}, volume={48}, ISSN={["1532-4141"]}, DOI={10.1080/03610918.2017.1406505}, abstractNote={ABSTRACT When using an auxiliary Markov chain to compute the distribution of a pattern statistic, the computational complexity is directly related to the number of Markov chain states. Theory related to minimal deterministic finite automata have been applied to large state spaces to reduce the number of Markov chain states so that only a minimal set remains. In this paper, a characterization of equivalent states is given so that extraneous states are deleted during the process of forming the state space, improving computational efficiency. The theory extends the applicability of Markov chain based methods for computing the distribution of pattern statistics.}, number={4}, journal={COMMUNICATIONS IN STATISTICS-SIMULATION AND COMPUTATION}, author={Martin, Donald E. K.}, year={2019}, month={Apr}, pages={1040–1054} } @article{martin_noe_2017, title={Faster exact distributions of pattern statistics through sequential elimination of states}, volume={69}, ISSN={["1572-9052"]}, DOI={10.1007/s10463-015-0540-y}, number={1}, journal={ANNALS OF THE INSTITUTE OF STATISTICAL MATHEMATICS}, author={Martin, Donald E. K. and Noe, Laurent}, year={2017}, month={Feb}, pages={231–248} } @article{coleman_martin_reich_2015, title={Multiple window discrete scan statistic for higher-order Markovian sequences}, volume={42}, ISSN={["1360-0532"]}, DOI={10.1080/02664763.2015.1005061}, abstractNote={Accurate and efficient methods to detect unusual clusters of abnormal activity are needed in many fields such as medicine and business. Often the size of clusters is unknown; hence, multiple (variable) window scan statistics are used to identify clusters using a set of different potential cluster sizes. We give an efficient method to compute the exact distribution of multiple window discrete scan statistics for higher-order, multi-state Markovian sequences. We define a Markov chain to efficiently keep track of probabilities needed to compute p-values for the statistic. The state space of the Markov chain is set up by a criterion developed to identify strings that are associated with observing the specified values of the statistic. Using our algorithm, we identify cases where the available approximations do not perform well. We demonstrate our methods by detecting unusual clusters of made free throw shots by National Basketball Association players during the 2009–2010 regular season.}, number={8}, journal={JOURNAL OF APPLIED STATISTICS}, author={Coleman, Deidra A. and Martin, Donald E. K. and Reich, Brian J.}, year={2015}, month={Aug}, pages={1690–1705} } @article{martin_2015, title={p-values for the Discrete Scan Statistic through Slack Variables}, volume={44}, ISSN={["1532-4141"]}, DOI={10.1080/03610918.2013.777457}, abstractNote={The discrete scan statistic is used in many areas of applied probability and statistics to study local clumping of patterns. Testing based on the statistic requires tail probabilities. Whereas the distribution has been studied extensively, most of the results are approximations, due to the difficulties associated with the computation. Results for exact p-values for the statistic have been given for a binary sequence that is independent or first-order Markovian. We give an algorithm to obtain probabilities for the statistic over multi-state trials that are Markovian of a general order of dependence, and explore the algorithm's usefulness.}, number={9}, journal={COMMUNICATIONS IN STATISTICS-SIMULATION AND COMPUTATION}, author={Martin, D. E. K.}, year={2015}, pages={2223–2239} } @article{noe_martin_2014, title={A Coverage Criterion for Spaced Seeds and Its Applications to Support Vector Machine String Kernels and k-Mer Distances}, volume={21}, ISSN={["1557-8666"]}, DOI={10.1089/cmb.2014.0173}, abstractNote={Spaced seeds have been recently shown to not only detect more alignments, but also to give a more accurate measure of phylogenetic distances, and to provide a lower misclassification rate when used with Support Vector Machines (SVMs). We confirm by independent experiments these two results, and propose in this article to use a coverage criterion to measure the seed efficiency in both cases in order to design better seed patterns. We show first how this coverage criterion can be directly measured by a full automaton-based approach. We then illustrate how this criterion performs when compared with two other criteria frequently used, namely the single-hit and multiple-hit criteria, through correlation coefficients with the correct classification/the true distance. At the end, for alignment-free distances, we propose an extension by adopting the coverage criterion, show how it performs, and indicate how it can be efficiently computed.}, number={12}, journal={JOURNAL OF COMPUTATIONAL BIOLOGY}, author={Noe, Laurent and Martin, Donald E. K.}, year={2014}, month={Dec}, pages={947–963} } @article{martin_aston_2013, title={Distribution of Statistics of Hidden State Sequences Through the Sum-Product Algorithm}, volume={15}, ISSN={["1573-7713"]}, DOI={10.1007/s11009-012-9289-4}, number={4}, journal={METHODOLOGY AND COMPUTING IN APPLIED PROBABILITY}, author={Martin, Donald E. K. and Aston, John A. D.}, year={2013}, month={Dec}, pages={897–918} } @article{aston_peng_martin_2012, title={Implied distributions in multiple change point problems}, volume={22}, ISSN={["1573-1375"]}, DOI={10.1007/s11222-011-9268-6}, abstractNote={A method for efficiently calculating exact marginal, conditional and joint distributions for change points defined by general finite state Hidden Markov Models is proposed. The distributions are not subject to any approximation or sampling error once parameters of the model have been estimated. It is shown that, in contrast to sampling methods, very little computation is needed. The method provides probabilities associated with change points within an interval, as well as at specific points.}, number={4}, journal={STATISTICS AND COMPUTING}, author={Aston, J. A. D. and Peng, J. Y. and Martin, D. E. K.}, year={2012}, month={Jul}, pages={981–993} } @article{martin_coleman_2011, title={Distribution of Clump Statistics for a Collection of Words}, volume={48}, ISSN={0021-9002 1475-6072}, url={http://dx.doi.org/10.1017/S0021900200008615}, DOI={10.1017/S0021900200008615}, abstractNote={We give an efficient method based on minimal deterministic finite automata for computing the exact distribution of the number of occurrences and coverage of clumps (maximal sets of overlapping words) of a collection of words. In addition, we compute probabilities for the number of h -clumps, word groupings where gaps of a maximal length h between occurrences of words are allowed. The method facilitates the computation of p -values for testing procedures. A word is allowed to contain other words of the collection, making the computation more general, but also more difficult. The underlying sequence is assumed to be Markovian of an arbitrary order.}, number={04}, journal={Journal of Applied Probability}, publisher={Cambridge University Press (CUP)}, author={Martin, Donald E. K. and Coleman, Deidra A.}, year={2011}, month={Dec}, pages={1049–1059} } @article{martin_coleman_2011, title={Distribution of clump statistics for a collection of words}, volume={48}, DOI={10.1239/jap/1324046018}, abstractNote={We give an efficient method based on minimal deterministic finite automata for computing the exact distribution of the number of occurrences and coverage of clumps (maximal sets of overlapping words) of a collection of words. In addition, we compute probabilities for the number of h-clumps, word groupings where gaps of a maximal length h between occurrences of words are allowed. The method facilitates the computation of p-values for testing procedures. A word is allowed to contain other words of the collection, making the computation more general, but also more difficult. The underlying sequence is assumed to be Markovian of an arbitrary order.}, number={4}, journal={Journal of Applied Probability}, author={Martin, D. E. K. and Coleman, D. A.}, year={2011}, pages={1049–1059} } @article{martin_2008, title={Application of auxiliary Markov chains to start-up demonstration tests}, volume={184}, ISSN={0377-2217}, url={http://dx.doi.org/10.1016/j.ejor.2006.12.009}, DOI={10.1016/j.ejor.2006.12.009}, abstractNote={We use auxiliary Markov chains to derive probabilistic results for five types of start-up demonstration tests, with start-ups that are Markovian of a general order. Four of the tests are based on consecutive (or total) successful start-ups and consecutive (or total) failures; the fifth has two rejection criteria. For each test type, we obtain the probability of the test ending with acceptance of the unit, the probability distribution and moments of the number of start-ups in the test, the probability of acceptance (or rejection) of the equipment in a specified number of trials, and the conditional distribution of the number of start-ups in the test given that the unit is accepted or rejected. Numerical examples are given. Though the results are for these specific types of start-up demonstration tests, the method of derivation may be used for tests with other stopping criteria, and in other situations as well.}, number={2}, journal={European Journal of Operational Research}, publisher={Elsevier BV}, author={Martin, Donald E.K.}, year={2008}, month={Jan}, pages={574–583} } @article{martina_aston_2008, title={Waiting time distribution of generalized later patterns}, volume={52}, ISSN={["0167-9473"]}, DOI={10.1016/j.csda.2008.04.019}, abstractNote={In this paper the concept of later waiting time distributions for patterns in multi-state trials is generalized to cover a collection of compound patterns that must all be counted pattern-specific numbers of times, and a practical method is given to compute the generalized distribution. The solution given applies to overlapping counting and two types of non-overlapping counting, and the underlying sequences are assumed to be Markovian of a general order. Patterns are allowed to be weighted so that an occurrence is counted multiple times, and patterns may be completely included in longer patterns. Probabilities are computed through an auxiliary Markov chain. As the state space associated with the auxiliary chain can be quite large if its setup is handled in a naïve fashion, an algorithm is given for generating a “minimal” state space that leaves out states that can never be reached. For the case of overlapping counting, a formula that relates probabilities for intersections of events to probabilities for unions of subsets of the events is also used, so that the distribution is also computed in terms of probabilities for competing patterns. A detailed example is given to illustrate the methodology.}, number={11}, journal={COMPUTATIONAL STATISTICS & DATA ANALYSIS}, author={Martina, Donald E. K. and Aston, John A. D.}, year={2008}, month={Jul}, pages={4879–4890} } @article{aston_martin_2007, title={DISTRIBUTIONS ASSOCIATED WITH GENERAL RUNS AND PATTERNS IN HIDDEN MARKOV MODELS}, volume={1}, ISSN={["1932-6157"]}, DOI={10.1214/07-AOAS125}, abstractNote={This paper gives a method for computing distributions associated with patterns in the state sequence of a hidden Markov model, conditional on observing all or part of the observation sequence. Probabilities are computed for very general classes of patterns (competing patterns and generalized later patterns), and thus, the theory includes as special cases results for a large class of problems that have wide application. The unobserved state sequence is assumed to be Markovian with a general order of dependence. An auxiliary Markov chain is associated with the state sequence and is used to simplify the computations. Two examples are given to illustrate the use of the methodology. Whereas the first application is more to illustrate the basic steps in applying the theory, the second is a more detailed application to DNA sequences, and shows that the methods can be adapted to include restrictions related to biological knowledge.}, number={2}, journal={ANNALS OF APPLIED STATISTICS}, author={Aston, John A. D. and Martin, Donald E. K.}, year={2007}, month={Dec}, pages={585–611} } @article{martin_2006, title={A recursive algorithm for computing the distribution of the number of successes in higher-order Markovian trials}, volume={50}, ISSN={0167-9473}, url={http://dx.doi.org/10.1016/j.csda.2004.09.005}, DOI={10.1016/j.csda.2004.09.005}, abstractNote={This paper presents a recursive method of computing the distribution of the number of successes in a sequence of binary trials that are Markovian of a general order. Waiting-time distributions are also obtained. Recurrence relations among probabilities of partitioned events are used to compute the desired probabilities. An advantage of computing the probabilities in the manner given is that the algorithm may be easily programmed on a computer, requiring no computer algebra system, as in computation based on conditional probability generating functions. The difference between calculated probabilities for various model orders emphasizes the importance of selecting a proper model order for a Markovian data set.}, number={3}, journal={Computational Statistics & Data Analysis}, publisher={Elsevier BV}, author={Martin, Donald E.K.}, year={2006}, month={Feb}, pages={604–610} } @article{martin_2006, title={Hot-hand effects in sports and a recursive method of computing probabilities for streaks}, volume={33}, ISSN={0305-0548}, url={http://dx.doi.org/10.1016/j.cor.2004.09.023}, DOI={10.1016/j.cor.2004.09.023}, abstractNote={We give a recursive method of computing probabilities associated with the waiting time to the first occurrence of a run of arbitrary length in Markovian trials of a general order. Using data from the Oakland Athletics’ 2002 season as an example, we show that the assumed model order can make a large difference in computed probabilities. The algorithm is then applied to the computation of probabilities associated with strikes and nonstrikes in bowling. After showing that there is significant deviation from a model of Bernoulli trials for data from the 2003–2004 Professional Bowlers Association tour, we suggest a criterion based on the longest success run for choosing the order of Markovian dependence that gives the best fit to the streakiness characteristics of an individual bowler's data.}, number={7}, journal={Computers & Operations Research}, publisher={Elsevier BV}, author={Martin, Donald E.K.}, year={2006}, month={Jul}, pages={1983–2001} }