KolmogorovSmirnovTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.DoubleSupplier;
import java.util.function.DoubleUnaryOperator;
import java.util.function.IntToDoubleFunction;
import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
import org.apache.commons.numbers.combinatorics.Factorial;
import org.apache.commons.numbers.core.ArithmeticUtils;
import org.apache.commons.numbers.core.Sum;
import org.apache.commons.rng.UniformRandomProvider;
/**
* Implements the Kolmogorov-Smirnov (K-S) test for equality of continuous distributions.
*
* <p>The one-sample test uses a D statistic based on the maximum deviation of the empirical
* distribution of sample data points from the distribution expected under the null hypothesis.
*
* <p>The two-sample test uses a D statistic based on the maximum deviation of the two empirical
* distributions of sample data points. The two-sample tests evaluate the null hypothesis that
* the two samples {@code x} and {@code y} come from the same underlying distribution.
*
* <p>References:
* <ol>
* <li>
* Marsaglia, G., Tsang, W. W., & Wang, J. (2003).
* <a href="https://doi.org/10.18637/jss.v008.i18">Evaluating Kolmogorov's Distribution.</a>
* Journal of Statistical Software, 8(18), 1–4.
* <li>Simard, R., & L’Ecuyer, P. (2011).
* <a href="https://doi.org/10.18637/jss.v039.i11">Computing the Two-Sided Kolmogorov-Smirnov Distribution.</a>
* Journal of Statistical Software, 39(11), 1–18.
* <li>Sekhon, J. S. (2011).
* <a href="https://doi.org/10.18637/jss.v042.i07">
* Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
* The Matching package for R.</a>
* Journal of Statistical Software, 42(7), 1–52.
* <li>Viehmann, T (2021).
* <a href="https://doi.org/10.48550/arXiv.2102.08037">
* Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test.</a>
* arXiv:2102.08037
* <li>Hodges, J. L. (1958).
* <a href="https://doi.org/10.1007/BF02589501">
* The significance probability of the smirnov two-sample test.</a>
* Arkiv for Matematik, 3(5), 469-486.
* </ol>
*
* <p>Note that [1] contains an error in computing h, refer to <a
* href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
*
* @see <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
* Kolmogorov-Smirnov (K-S) test (Wikipedia)</a>
* @since 1.1
*/
public final class KolmogorovSmirnovTest {
/** Name for sample 1. */
private static final String SAMPLE_1_NAME = "Sample 1";
/** Name for sample 2. */
private static final String SAMPLE_2_NAME = "Sample 2";
/** When the largest sample size exceeds this value, 2-sample test AUTO p-value
* uses an asymptotic distribution to compute the p-value. */
private static final int LARGE_SAMPLE = 10000;
/** Maximum finite factorial. */
private static final int MAX_FACTORIAL = 170;
/** Maximum length of an array. This is used to determine if two arrays can be concatenated
* to create a sampler from the joint distribution. The limit is copied from the limit
* of java.util.ArrayList. */
private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
/** The maximum least common multiple (lcm) to attempt the exact p-value computation.
* The integral d value is in [0, n*m] in steps of the greatest common denominator (gcd),
* thus lcm = n*m/gcd is the number of possible different p-values.
* Some methods have a lower limit due to computation limits. This should be larger
* than LARGE_SAMPLE^2 so all AUTO p-values attempt an exact computation, i.e.
* at least 10000^2 ~ 2^26.56. */
private static final long MAX_LCM_TWO_SAMPLE_EXACT_P = 1L << 31;
/** Placeholder to use for the two-sample sign array when the value can be ignored. */
private static final int[] IGNORED_SIGN = new int[1];
/** Placeholder to use for the two-sample ties D array when the value can be ignored. */
private static final long[] IGNORED_D = new long[2];
/** Default instance. */
private static final KolmogorovSmirnovTest DEFAULT = new KolmogorovSmirnovTest(
AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, false, null, 1000);
/** Alternative hypothesis. */
private final AlternativeHypothesis alternative;
/** Method to compute the p-value. */
private final PValueMethod pValueMethod;
/** Use a strict inequality for the two-sample exact p-value. */
private final boolean strictInequality;
/** Source of randomness. */
private final UniformRandomProvider rng;
/** Number of iterations . */
private final int iterations;
/**
* Result for the one-sample Kolmogorov-Smirnov test.
*
* <p>This class is immutable.
*
* @since 1.1
*/
public static class OneResult extends BaseSignificanceResult {
/** Sign of the statistic. */
private final int sign;
/**
* Create an instance.
*
* @param statistic Test statistic.
* @param sign Sign of the statistic.
* @param p Result p-value.
*/
OneResult(double statistic, int sign, double p) {
super(statistic, p);
this.sign = sign;
}
/**
* Gets the sign of the statistic. This is 1 for \(D^+\) and -1 for \(D^-\).
* For a two sided-test this is zero if the magnitude of \(D^+\) and \(D^-\) was equal;
* otherwise the sign indicates the larger of \(D^+\) or \(D^-\).
*
* @return the sign
*/
public int getSign() {
return sign;
}
}
/**
* Result for the two-sample Kolmogorov-Smirnov test.
*
* <p>This class is immutable.
*
* @since 1.1
*/
public static final class TwoResult extends OneResult {
/** Flag to indicate there were significant ties.
* Note that in extreme cases there may be significant ties despite {@code upperD == D}
* due to rounding when converting the integral statistic to a double. For this
* reason the presence of ties is stored as a flag. */
private final boolean significantTies;
/** Upper bound of the D statistic from all possible paths through regions with ties. */
private final double upperD;
/** The p-value of the upper D value. */
private final double upperP;
/**
* Create an instance.
*
* @param statistic Test statistic.
* @param sign Sign of the statistic.
* @param p Result p-value.
* @param significantTies Flag to indicate there were significant ties.
* @param upperD Upper bound of the D statistic from all possible paths through
* regions with ties.
* @param upperP The p-value of the upper D value.
*/
TwoResult(double statistic, int sign, double p, boolean significantTies, double upperD, double upperP) {
super(statistic, sign, p);
this.significantTies = significantTies;
this.upperD = upperD;
this.upperP = upperP;
}
/**
* {@inheritDoc}
*
* <p><strong>Ties</strong>
*
* <p>The presence of ties in the data creates a distribution for the D values generated
* by all possible orderings of the tied regions. This statistic is computed using the
* path with the <em>minimum</em> effect on the D statistic.
*
* <p>For a one-sided statistic \(D^+\) or \(D^-\), this is the lower bound of the D statistic.
*
* <p>For a two-sided statistic D, this may be <em>below</em> the lower bound of the
* distribution of all possible D values. This case occurs when the number of ties
* is very high and is identified by a {@link #getPValue() p-value} of 1.
*
* <p>If the two-sided statistic is zero this only occurs in the presence of ties:
* either the two arrays are identical, are 'identical' data of a single value
* (sample sizes may be different), or have a sequence of ties of 'identical' data
* with a net zero effect on the D statistic, e.g.
* <pre>
* [1,2,3] vs [1,2,3]
* [0,0,0,0] vs [0,0,0]
* [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
* </pre>
*/
@Override
public double getStatistic() {
// Note: This method is here for documentation
return super.getStatistic();
}
/**
* Returns {@code true} if there were ties between samples that occurred
* in a region which could change the D statistic if the ties were resolved to
* a defined order.
*
* <p>Ties between the data can be interpreted as if the values were different
* but within machine epsilon. In this case the order within the tie region is not known.
* If the most extreme ordering of any tied regions (e.g. all tied values of {@code x}
* before all tied values of {@code y}) could create a larger D statistic this
* method will return {@code true}.
*
* <p>If there were no ties, or all possible orderings of tied regions create the same
* D statistic, this method returns {@code false}.
*
* <p>Note it is possible that this method returns {@code true} when {@code D == upperD}
* due to rounding when converting the computed D statistic to a double. This will
* only occur for large sample sizes {@code n} and {@code m} where the product
* {@code n*m >= 2^53}.
*
* @return true if the D statistic could be changed by resolution of ties
* @see #getUpperD()
*/
public boolean hasSignificantTies() {
return significantTies;
}
/**
* Return the upper bound of the D statistic from all possible paths through regions with ties.
*
* <p>This will return a value equal to or greater than {@link #getStatistic()}.
*
* @return the upper bound of D
* @see #hasSignificantTies()
*/
public double getUpperD() {
return upperD;
}
/**
* Return the p-value of the upper bound of the D statistic.
*
* <p>If computed, this will return a value equal to or less than
* {@link #getPValue() getPValue}. It may be orders of magnitude smaller.
*
* <p>Note: This p-value corresponds to the most extreme p-value from all possible
* orderings of tied regions. It is <strong>not</strong> recommended to use this to
* reject the null hypothesis. The upper bound of D and the corresponding p-value
* provide information that must be interpreted in the context of the sample data and
* used to inform a decision on the suitability of the test to the data.
*
* <p>This value is set to {@link Double#NaN NaN} if the {@link #getPValue() p-value} was
* {@linkplain PValueMethod#ESTIMATE estimated}. The estimated p-value will have been created
* using a distribution of possible D values given the underlying joint distribution of
* the sample data. Comparison of the p-value to the upper p-value is not applicable.
*
* @return the p-value of the upper bound of D (or NaN)
* @see #getUpperD()
*/
public double getUpperPValue() {
return upperP;
}
}
/**
* @param alternative Alternative hypothesis.
* @param method P-value method.
* @param strict true to use a strict inequality.
* @param rng Source of randomness.
* @param iterations Number of iterations.
*/
private KolmogorovSmirnovTest(AlternativeHypothesis alternative, PValueMethod method, boolean strict,
UniformRandomProvider rng, int iterations) {
this.alternative = alternative;
this.pValueMethod = method;
this.strictInequality = strict;
this.rng = rng;
this.iterations = iterations;
}
/**
* Return an instance using the default options.
*
* <ul>
* <li>{@link AlternativeHypothesis#TWO_SIDED}
* <li>{@link PValueMethod#AUTO}
* <li>{@link Inequality#NON_STRICT}
* <li>{@linkplain #with(UniformRandomProvider) RNG = none}
* <li>{@linkplain #withIterations(int) Iterations = 1000}
* </ul>
*
* @return default instance
*/
public static KolmogorovSmirnovTest withDefaults() {
return DEFAULT;
}
/**
* Return an instance with the configured alternative hypothesis.
*
* @param v Value.
* @return an instance
*/
public KolmogorovSmirnovTest with(AlternativeHypothesis v) {
return new KolmogorovSmirnovTest(Objects.requireNonNull(v), pValueMethod, strictInequality, rng, iterations);
}
/**
* Return an instance with the configured p-value method.
*
* <p>For the one-sample two-sided test Kolmogorov's asymptotic approximation can be used;
* otherwise the p-value uses the distribution of the D statistic.
*
* <p>For the two-sample test the exact p-value can be computed for small sample sizes;
* otherwise the p-value resorts to the asymptotic approximation. Alternatively a p-value
* can be estimated from the combined distribution of the samples. This requires a source
* of randomness.
*
* @param v Value.
* @return an instance
* @see #with(UniformRandomProvider)
*/
public KolmogorovSmirnovTest with(PValueMethod v) {
return new KolmogorovSmirnovTest(alternative, Objects.requireNonNull(v), strictInequality, rng, iterations);
}
/**
* Return an instance with the configured inequality.
*
* <p>Computes the p-value for the two-sample test as \(P(D_{n,m} > d)\) if strict;
* otherwise \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample
* Kolmogorov-Smirnov statistic, either the two-sided \(D_{n,m}\) or one-sided
* \(D_{n,m}^+\) or \(D_{n,m}^-\).
*
* @param v Value.
* @return an instance
*/
public KolmogorovSmirnovTest with(Inequality v) {
return new KolmogorovSmirnovTest(alternative, pValueMethod,
Objects.requireNonNull(v) == Inequality.STRICT, rng, iterations);
}
/**
* Return an instance with the configured source of randomness.
*
* <p>Applies to the two-sample test when the p-value method is set to
* {@link PValueMethod#ESTIMATE ESTIMATE}. The randomness
* is used for sampling of the combined distribution.
*
* <p>There is no default source of randomness. If the randomness is not
* set then estimation is not possible and an {@link IllegalStateException} will be
* raised in the two-sample test.
*
* @param v Value.
* @return an instance
* @see #with(PValueMethod)
*/
public KolmogorovSmirnovTest with(UniformRandomProvider v) {
return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality,
Objects.requireNonNull(v), iterations);
}
/**
* Return an instance with the configured number of iterations.
*
* <p>Applies to the two-sample test when the p-value method is set to
* {@link PValueMethod#ESTIMATE ESTIMATE}. This is the number of sampling iterations
* used to estimate the p-value. The p-value is a fraction using the {@code iterations}
* as the denominator. The number of significant digits in the p-value is
* upper bounded by log<sub>10</sub>(iterations); small p-values have fewer significant
* digits. A large number of iterations is recommended when using a small critical
* value to reject the null hypothesis.
*
* @param v Value.
* @return an instance
* @throws IllegalArgumentException if the number of iterations is not strictly positive
*/
public KolmogorovSmirnovTest withIterations(int v) {
return new KolmogorovSmirnovTest(alternative, pValueMethod, strictInequality, rng,
Arguments.checkStrictlyPositive(v));
}
/**
* Computes the one-sample Kolmogorov-Smirnov test statistic.
*
* <ul>
* <li>two-sided: \(D_n=\sup_x |F_n(x)-F(x)|\)
* <li>greater: \(D_n^+=\sup_x (F_n(x)-F(x))\)
* <li>less: \(D_n^-=\sup_x (F(x)-F_n(x))\)
* </ul>
*
* <p>where \(F\) is the distribution cumulative density function ({@code cdf}),
* \(n\) is the length of {@code x} and \(F_n\) is the empirical distribution that puts
* mass \(1/n\) at each of the values in {@code x}.
*
* <p>The cumulative distribution function should map a real value {@code x} to a probability
* in [0, 1]. To use a reference distribution the CDF can be passed using a method reference:
* <pre>
* UniformContinuousDistribution dist = UniformContinuousDistribution.of(0, 1);
* UniformRandomProvider rng = RandomSource.KISS.create(123);
* double[] x = dist.sampler(rng).samples(100);
* double d = KolmogorovSmirnovTest.withDefaults().statistic(x, dist::cumulativeProbability);
* </pre>
*
* @param cdf Reference cumulative distribution function.
* @param x Sample being evaluated.
* @return Kolmogorov-Smirnov statistic
* @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
* @see #test(double[], DoubleUnaryOperator)
*/
public double statistic(double[] x, DoubleUnaryOperator cdf) {
return computeStatistic(x, cdf, IGNORED_SIGN);
}
/**
* Computes the two-sample Kolmogorov-Smirnov test statistic.
*
* <ul>
* <li>two-sided: \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
* <li>greater: \(D_{n,m}^+=\sup_x (F_n(x)-F_m(x))\)
* <li>less: \(D_{n,m}^-=\sup_x (F_m(x)-F_n(x))\)
* </ul>
*
* <p>where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
* empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
* is the empirical distribution that puts mass \(1/m\) at each of the values in {@code y}.
*
* @param x First sample.
* @param y Second sample.
* @return Kolmogorov-Smirnov statistic
* @throws IllegalArgumentException if either {@code x} or {@code y} does not
* have length at least 2; or contain NaN values.
* @see #test(double[], double[])
*/
public double statistic(double[] x, double[] y) {
final int n = checkArrayLength(x);
final int m = checkArrayLength(y);
// Clone to avoid destructive modification of input
final long dnm = computeIntegralKolmogorovSmirnovStatistic(x.clone(), y.clone(),
IGNORED_SIGN, IGNORED_D);
// Re-use the method to compute D in [0, 1] for consistency
return computeD(dnm, n, m, ArithmeticUtils.gcd(n, m));
}
/**
* Performs a one-sample Kolmogorov-Smirnov test evaluating the null hypothesis
* that {@code x} conforms to the distribution cumulative density function ({@code cdf}).
*
* <p>The test is defined by the {@link AlternativeHypothesis}:
* <ul>
* <li>Two-sided evaluates the null hypothesis that the two distributions are
* identical, \(F_n(i) = F(i)\) for all \( i \); the alternative is that the are not
* identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.
* <li>Greater evaluates the null hypothesis that the \(F_n(i) <= F(i)\) for all \( i \);
* the alternative is \(F_n(i) > F(i)\) for at least one \( i \). The statistic is \( D_n^+ \).
* <li>Less evaluates the null hypothesis that the \(F_n(i) >= F(i)\) for all \( i \);
* the alternative is \(F_n(i) < F(i)\) for at least one \( i \). The statistic is \( D_n^- \).
* </ul>
*
* <p>The p-value method defaults to exact. The one-sided p-value uses Smirnov's stable formula:
*
* <p>\[ P(D_n^+ \ge x) = x \sum_{j=0}^{\lfloor n(1-x) \rfloor} \binom{n}{j} \left(\frac{j}{n} + x\right)^{j-1} \left(1-x-\frac{j}{n} \right)^{n-j} \]
*
* <p>The two-sided p-value is computed using methods described in
* Simard & L’Ecuyer (2011). The two-sided test supports an asymptotic p-value
* using Kolmogorov's formula:
*
* <p>\[ \lim_{n\to\infty} P(\sqrt{n}D_n > z) = 2 \sum_{i=1}^\infty (-1)^{i-1} e^{-2 i^2 z^2} \]
*
* @param x Sample being being evaluated.
* @param cdf Reference cumulative distribution function.
* @return test result
* @throws IllegalArgumentException if {@code data} does not have length at least 2; or contains NaN values.
* @see #statistic(double[], DoubleUnaryOperator)
*/
public OneResult test(double[] x, DoubleUnaryOperator cdf) {
final int[] sign = {0};
final double d = computeStatistic(x, cdf, sign);
final double p;
if (alternative == AlternativeHypothesis.TWO_SIDED) {
PValueMethod method = pValueMethod;
if (method == PValueMethod.AUTO) {
// No switch to the asymptotic for large n
method = PValueMethod.EXACT;
}
if (method == PValueMethod.ASYMPTOTIC) {
// Kolmogorov's asymptotic formula using z = sqrt(n) * d
p = KolmogorovSmirnovDistribution.ksSum(Math.sqrt(x.length) * d);
} else {
// exact
p = KolmogorovSmirnovDistribution.Two.sf(d, x.length);
}
} else {
// one-sided: always use exact
p = KolmogorovSmirnovDistribution.One.sf(d, x.length);
}
return new OneResult(d, sign[0], p);
}
/**
* Performs a two-sample Kolmogorov-Smirnov test on samples {@code x} and {@code y}.
* Test the empirical distributions \(F_n\) and \(F_m\) where \(n\) is the length
* of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the empirical distribution
* that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) is the empirical
* distribution that puts mass \(1/m\) of the {@code y} values.
*
* <p>The test is defined by the {@link AlternativeHypothesis}:
* <ul>
* <li>Two-sided evaluates the null hypothesis that the two distributions are
* identical, \(F_n(i) = F_m(i)\) for all \( i \); the alternative is that they are not
* identical. The statistic is \( max(D_n^+, D_n^-) \) and the sign of \( D \) is provided.
* <li>Greater evaluates the null hypothesis that the \(F_n(i) <= F_m(i)\) for all \( i \);
* the alternative is \(F_n(i) > F_m(i)\) for at least one \( i \). The statistic is \( D_n^+ \).
* <li>Less evaluates the null hypothesis that the \(F_n(i) >= F_m(i)\) for all \( i \);
* the alternative is \(F_n(i) < F_m(i)\) for at least one \( i \). The statistic is \( D_n^- \).
* </ul>
*
* <p>If the {@linkplain PValueMethod p-value method} is auto, then an exact p computation
* is attempted if both sample sizes are less than 10000 using the methods presented in
* Viehmann (2021) and Hodges (1958); otherwise an asymptotic p-value is returned.
* The two-sided p-value is \(\overline{F}(d, \sqrt{mn / (m + n)})\) where \(\overline{F}\)
* is the complementary cumulative density function of the two-sided one-sample
* Kolmogorov-Smirnov distribution. The one-sided p-value uses an approximation from
* Hodges (1958) Eq 5.3.
*
* <p>\(D_{n,m}\) has a discrete distribution. This makes the p-value associated with the
* null hypothesis \(H_0 : D_{n,m} \gt d \) differ from \(H_0 : D_{n,m} \ge d \)
* by the mass of the observed value \(d\). These can be distinguished using an
* {@link Inequality} parameter. This is ignored for large samples.
*
* <p>If the data contains ties there is no defined ordering in the tied region to use
* for the difference between the two empirical distributions. Each ordering of the
* tied region <em>may</em> create a different D statistic. All possible orderings
* generate a distribution for the D value. In this case the tied region is traversed
* entirely and the effect on the D value evaluated at the end of the tied region.
* This is the path with the least change on the D statistic. The path with the
* greatest change on the D statistic is also computed as the upper bound on D.
* If these two values are different then the tied region is known to generate a
* distribution for the D statistic and the p-value is an over estimate for the cases
* with a larger D statistic. The presence of any significant tied regions is returned
* in the result.
*
* <p>If the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE} then the p-value is
* estimated by repeat sampling of the joint distribution of {@code x} and {@code y}.
* The p-value is the frequency that a sample creates a D statistic
* greater than or equal to (or greater than for strict inequality) the observed value.
* In this case a source of randomness must be configured or an {@link IllegalStateException}
* will be raised. The p-value for the upper bound on D will not be estimated and is set to
* {@link Double#NaN NaN}. This estimation procedure is not affected by ties in the data
* and is increasingly robust for larger datasets. The method is modeled after
* <a href="https://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
* in the R {@code Matching} package (Sekhon (2011)).
*
* @param x First sample.
* @param y Second sample.
* @return test result
* @throws IllegalArgumentException if either {@code x} or {@code y} does not
* have length at least 2; or contain NaN values.
* @throws IllegalStateException if the p-value method is {@link PValueMethod#ESTIMATE ESTIMATE}
* and there is no source of randomness.
* @see #statistic(double[], double[])
*/
public TwoResult test(double[] x, double[] y) {
final int n = checkArrayLength(x);
final int m = checkArrayLength(y);
PValueMethod method = pValueMethod;
final int[] sign = {0};
final long[] tiesD = {0, 0};
final double[] sx = x.clone();
final double[] sy = y.clone();
final long dnm = computeIntegralKolmogorovSmirnovStatistic(sx, sy, sign, tiesD);
// Compute p-value. Note that the p-value is not invalidated by ties; it is the
// D statistic that could be invalidated by resolution of the ties. So compute
// the exact p even if ties are present.
if (method == PValueMethod.AUTO) {
// Use exact for small samples
method = Math.max(n, m) < LARGE_SAMPLE ?
PValueMethod.EXACT :
PValueMethod.ASYMPTOTIC;
}
final int gcd = ArithmeticUtils.gcd(n, m);
final double d = computeD(dnm, n, m, gcd);
final boolean significantTies = tiesD[1] > dnm;
final double d2 = significantTies ? computeD(tiesD[1], n, m, gcd) : d;
final double p;
double p2;
// Allow bootstrap estimation of the p-value
if (method == PValueMethod.ESTIMATE) {
p = estimateP(sx, sy, dnm);
p2 = Double.NaN;
} else {
final boolean exact = method == PValueMethod.EXACT;
p = p2 = twoSampleP(dnm, n, m, gcd, d, exact);
if (significantTies) {
// Compute the upper bound on D.
// The p-value is also computed. The alternative is to save the options
// in the result with (upper dnm, n, m) and compute it on-demand.
// Note detection of whether the exact P computation is possible is based on
// n and m, thus this will use the same computation.
p2 = twoSampleP(tiesD[1], n, m, gcd, d2, exact);
}
}
return new TwoResult(d, sign[0], p, significantTies, d2, p2);
}
/**
* Estimates the <i>p-value</i> of a two-sample Kolmogorov-Smirnov test evaluating the
* null hypothesis that {@code x} and {@code y} are samples drawn from the same
* probability distribution.
*
* <p>This method will destructively modify the input arrays (via a sort).
*
* <p>This method estimates the p-value by repeatedly sampling sets of size
* {@code x.length} and {@code y.length} from the empirical distribution
* of the combined sample. The memory requirement is an array copy of each of
* the input arguments.
*
* <p>When using strict inequality, this is equivalent to the algorithm implemented in
* the R function {@code ks.boot} as described in Sekhon (2011) Journal of Statistical
* Software, 42(7), 1–52 [3].
*
* @param x First sample.
* @param y Second sample.
* @param dnm Integral D statistic.
* @return p-value
* @throws IllegalStateException if the source of randomness is null.
*/
private double estimateP(double[] x, double[] y, long dnm) {
if (rng == null) {
throw new IllegalStateException("No source of randomness");
}
// Test if the random statistic is greater (strict), or greater or equal to d
final long d = strictInequality ? dnm : dnm - 1;
final long plus;
final long minus;
if (alternative == AlternativeHypothesis.GREATER_THAN) {
plus = d;
minus = Long.MIN_VALUE;
} else if (alternative == AlternativeHypothesis.LESS_THAN) {
plus = Long.MAX_VALUE;
minus = -d;
} else {
// two-sided
plus = d;
minus = -d;
}
// Test dnm=0. This occurs for example when x == y.
if (0 < minus || 0 > plus) {
// Edge case where all possible d will be outside the inclusive bounds
return 1;
}
// Sample randomly with replacement from the combined distribution.
final DoubleSupplier gen = createSampler(x, y, rng);
int count = 0;
for (int i = iterations; i > 0; i--) {
for (int j = 0; j < x.length; j++) {
x[j] = gen.getAsDouble();
}
for (int j = 0; j < y.length; j++) {
y[j] = gen.getAsDouble();
}
if (testIntegralKolmogorovSmirnovStatistic(x, y, plus, minus)) {
count++;
}
}
return count / (double) iterations;
}
/**
* Computes the magnitude of the one-sample Kolmogorov-Smirnov test statistic.
* The sign of the statistic is optionally returned in {@code sign}. For the two-sided case
* the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
* had the larger magnitude.
*
* @param x Sample being evaluated.
* @param cdf Reference cumulative distribution function.
* @param sign Sign of the statistic (non-zero length).
* @return Kolmogorov-Smirnov statistic
* @throws IllegalArgumentException if {@code data} does not have length at least 2;
* or contains NaN values.
*/
private double computeStatistic(double[] x, DoubleUnaryOperator cdf, int[] sign) {
final int n = checkArrayLength(x);
final double nd = n;
final double[] sx = sort(x.clone(), "Sample");
// Note: ties in the data do not matter as we compare the empirical CDF
// immediately before the value (i/n) and at the value (i+1)/n. For ties
// of length m this would be (i-m+1)/n and (i+1)/n and the result is the same.
double d = 0;
if (alternative == AlternativeHypothesis.GREATER_THAN) {
// Compute D+
for (int i = 0; i < n; i++) {
final double yi = cdf.applyAsDouble(sx[i]);
final double dp = (i + 1) / nd - yi;
d = dp > d ? dp : d;
}
sign[0] = 1;
} else if (alternative == AlternativeHypothesis.LESS_THAN) {
// Compute D-
for (int i = 0; i < n; i++) {
final double yi = cdf.applyAsDouble(sx[i]);
final double dn = yi - i / nd;
d = dn > d ? dn : d;
}
sign[0] = -1;
} else {
// Two sided.
// Compute both (as unsigned) and return the sign indicating the largest result.
double plus = 0;
double minus = 0;
for (int i = 0; i < n; i++) {
final double yi = cdf.applyAsDouble(sx[i]);
final double dn = yi - i / nd;
final double dp = (i + 1) / nd - yi;
minus = dn > minus ? dn : minus;
plus = dp > plus ? dp : plus;
}
sign[0] = Double.compare(plus, minus);
d = Math.max(plus, minus);
}
return d;
}
/**
* Computes the two-sample Kolmogorov-Smirnov test statistic. The statistic is integral
* and has a value in {@code [0, n*m]}. Not all values are possible; the smallest
* increment is the greatest common divisor of {@code n} and {@code m}.
*
* <p>This method will destructively modify the input arrays (via a sort).
*
* <p>The sign of the statistic is returned in {@code sign}. For the two-sided case
* the sign is 0 if the magnitude of D+ and D- was equal; otherwise it indicates which D
* had the larger magnitude. If the two-sided statistic is zero the two arrays are
* identical, or are 'identical' data of a single value (sample sizes may be different),
* or have a sequence of ties of 'identical' data with a net zero effect on the D statistic
* e.g.
* <pre>
* [1,2,3] vs [1,2,3]
* [0,0,0,0] vs [0,0,0]
* [0,0,0,0,1,1,1,1] vs [0,0,0,1,1,1]
* </pre>
*
* <p>This method detects ties in the input data. Not all ties will invalidate the returned
* statistic. Ties between the data can be interpreted as if the values were different
* but within machine epsilon. In this case the path through the tie region is not known.
* All paths through the tie region end at the same point. This method will compute the
* most extreme path through each tie region and the D statistic for these paths. If the
* ties D statistic is a larger magnitude than the returned D statistic then at least
* one tie region lies at a point on the full path that could result in a different
* statistic in the absence of ties. This signals the P-value computed using the returned
* D statistic is one of many possible p-values given the data and how ties are resolved.
* Note: The tiesD value may be smaller than the returned D statistic as it is not set
* to the maximum of D or tiesD. The only result of interest is when {@code tiesD > D}
* due to a tie region that can change the output D. On output {@code tiesD[0] != 0}
* if there were ties between samples and {@code tiesD[1] = D} of the most extreme path
* through the ties.
*
* @param x First sample (destructively modified by sort).
* @param y Second sample (destructively modified by sort).
* @param sign Sign of the statistic (non-zero length).
* @param tiesD Integral statistic for the most extreme path through any ties (length at least 2).
* @return integral Kolmogorov-Smirnov statistic
* @throws IllegalArgumentException if either {@code x} or {@code y} contain NaN values.
*/
private long computeIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, int[] sign, long[] tiesD) {
// Sort the sample arrays
sort(x, SAMPLE_1_NAME);
sort(y, SAMPLE_2_NAME);
final int n = x.length;
final int m = y.length;
// CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
// Scale by n*m to use increments of m and n for x and y.
// Find the max difference between cdf_x and cdf_y.
int i = 0;
int j = 0;
long d = 0;
long plus = 0;
long minus = 0;
// Ties: store the D+,D- for most extreme path though tie region(s)
long tplus = 0;
long tminus = 0;
do {
// No NaNs so compare using < and >
if (x[i] < y[j]) {
final double z = x[i];
do {
i++;
d += m;
} while (i < n && x[i] == z);
plus = d > plus ? d : plus;
} else if (x[i] > y[j]) {
final double z = y[j];
do {
j++;
d -= n;
} while (j < m && y[j] == z);
minus = d < minus ? d : minus;
} else {
// Traverse to the end of the tied section for d.
// Also compute the most extreme path through the tied region.
final double z = x[i];
final long dd = d;
int k = i;
do {
i++;
} while (i < n && x[i] == z);
k = i - k;
d += k * (long) m;
// Extreme D+ path
tplus = d > tplus ? d : tplus;
k = j;
do {
j++;
} while (j < m && y[j] == z);
k = j - k;
d -= k * (long) n;
// Extreme D- path must start at the original d
tminus = Math.min(tminus, dd - k * (long) n);
// End of tied section
if (d > plus) {
plus = d;
} else if (d < minus) {
minus = d;
}
}
} while (i < n && j < m);
// The presence of any ties is flagged by a non-zero value for D+ or D-.
// Note we cannot use the selected tiesD value as in the one-sided case it may be zero
// and the non-selected D value will be non-zero.
tiesD[0] = tplus | tminus;
// For simplicity the correct tiesD is not returned (correct value is commented).
// The only case that matters is tiesD > D which is evaluated by the caller.
// Note however that the distance of tiesD < D is a measure of how little the
// tied region matters.
if (alternative == AlternativeHypothesis.GREATER_THAN) {
sign[0] = 1;
// correct = max(tplus, plus)
tiesD[1] = tplus;
return plus;
} else if (alternative == AlternativeHypothesis.LESS_THAN) {
sign[0] = -1;
// correct = -min(tminus, minus)
tiesD[1] = -tminus;
return -minus;
} else {
// Two sided.
sign[0] = Double.compare(plus, -minus);
d = Math.max(plus, -minus);
// correct = max(d, max(tplus, -tminus))
tiesD[1] = Math.max(tplus, -tminus);
return d;
}
}
/**
* Test if the two-sample integral Kolmogorov-Smirnov statistic is strictly greater
* than the specified values for D+ and D-. Note that D- should have a negative sign
* to impose an inclusive lower bound.
*
* <p>This method will destructively modify the input arrays (via a sort).
*
* <p>For a two-sided alternative hypothesis {@code plus} and {@code minus} should have the
* same magnitude with opposite signs.
*
* <p>For a one-sided alternative hypothesis the value of {@code plus} or {@code minus}
* should have the expected value of the statistic, and the opposite D should have the maximum
* or minimum long value. The {@code minus} should be negatively signed:
*
* <ul>
* <li>greater: {@code plus} = D, {@code minus} = {@link Long#MIN_VALUE}
* <li>greater: {@code minus} = -D, {@code plus} = {@link Long#MAX_VALUE}
* </ul>
*
* <p>Note: This method has not been specialized for the one-sided case. Specialization
* would eliminate a conditional branch for {@code d > Long.MAX_VALUE} or
* {@code d < Long.MIN_VALUE}. Since these branches are never possible in the one-sided case
* this should be efficiently chosen by branch prediction in a processor pipeline.
*
* @param x First sample (destructively modified by sort; must not contain NaN).
* @param y Second sample (destructively modified by sort; must not contain NaN).
* @param plus Limit on D+ (inclusive upper bound).
* @param minus Limit on D- (inclusive lower bound).
* @return true if the D value exceeds the provided limits
*/
private static boolean testIntegralKolmogorovSmirnovStatistic(double[] x, double[] y, long plus, long minus) {
// Sort the sample arrays
Arrays.sort(x);
Arrays.sort(y);
final int n = x.length;
final int m = y.length;
// CDFs range from 0 to 1 using increments of 1/n and 1/m for x and y respectively.
// Scale by n*m to use increments of m and n for x and y.
// Find the any difference that exceeds the specified bounds.
int i = 0;
int j = 0;
long d = 0;
do {
// No NaNs so compare using < and >
if (x[i] < y[j]) {
final double z = x[i];
do {
i++;
d += m;
} while (i < n && x[i] == z);
if (d > plus) {
return true;
}
} else if (x[i] > y[j]) {
final double z = y[j];
do {
j++;
d -= n;
} while (j < m && y[j] == z);
if (d < minus) {
return true;
}
} else {
// Traverse to the end of the tied section for d.
final double z = x[i];
do {
i++;
d += m;
} while (i < n && x[i] == z);
do {
j++;
d -= n;
} while (j < m && y[j] == z);
// End of tied section
if (d > plus || d < minus) {
return true;
}
}
} while (i < n && j < m);
// Note: Here d requires returning to zero. This is applicable to the one-sided
// cases since d may have always been above zero (favours D+) or always below zero
// (favours D-). This is ignored as the method is not called when dnm=0 is
// outside the inclusive bounds.
return false;
}
/**
* Creates a sampler to sample randomly from the combined distribution of the two samples.
*
* @param x First sample.
* @param y Second sample.
* @param rng Source of randomness.
* @return the sampler
*/
private static DoubleSupplier createSampler(double[] x, double[] y,
UniformRandomProvider rng) {
return createSampler(x, y, rng, MAX_ARRAY_SIZE);
}
/**
* Creates a sampler to sample randomly from the combined distribution of the two
* samples. This will copy the input data for the sampler.
*
* @param x First sample.
* @param y Second sample.
* @param rng Source of randomness.
* @param maxArraySize Maximum size of a single array.
* @return the sampler
*/
static DoubleSupplier createSampler(double[] x, double[] y,
UniformRandomProvider rng,
int maxArraySize) {
final int n = x.length;
final int m = y.length;
final int len = n + m;
// Overflow safe: len > maxArraySize
if (len - maxArraySize > 0) {
// Support sampling with maximum length arrays
// (where a concatenated array is not possible)
// by choosing one or the other.
// - generate i in [-n, m)
// - return i < 0 ? x[n + i] : y[i]
// The sign condition is a 50-50 branch.
// Perform branchless by extracting the sign bit to pick the array.
// Copy the source data.
final double[] xx = x.clone();
final double[] yy = y.clone();
final IntToDoubleFunction nextX = i -> xx[n + i];
final IntToDoubleFunction nextY = i -> yy[i];
// Arrange function which accepts the negative index at position [1]
final IntToDoubleFunction[] next = {nextY, nextX};
return () -> {
final int i = rng.nextInt(-n, m);
return next[i >>> 31].applyAsDouble(i);
};
}
// Concatenate arrays
final double[] z = new double[len];
System.arraycopy(x, 0, z, 0, n);
System.arraycopy(y, 0, z, n, m);
return () -> z[rng.nextInt(len)];
}
/**
* Compute the D statistic from the integral D value.
*
* @param dnm Integral D-statistic value (in [0, n*m]).
* @param n First sample size.
* @param m Second sample size.
* @param gcd Greatest common divisor of n and m.
* @return D-statistic value (in [0, 1]).
*/
private static double computeD(long dnm, int n, int m, int gcd) {
// Note: Integer division using the gcd is intentional
final long a = dnm / gcd;
final int b = m / gcd;
return a / ((double) n * b);
}
/**
* Computes \(P(D_{n,m} > d)\) for the 2-sample Kolmogorov-Smirnov statistic.
*
* @param dnm Integral D-statistic value (in [0, n*m]).
* @param n First sample size.
* @param m Second sample size.
* @param gcd Greatest common divisor of n and m.
* @param d D-statistic value (in [0, 1]).
* @param exact whether to compute the exact probability; otherwise approximate.
* @return probability
* @see #twoSampleExactP(long, int, int, int, boolean, boolean)
* @see #twoSampleApproximateP(double, int, int, boolean)
*/
private double twoSampleP(long dnm, int n, int m, int gcd, double d, boolean exact) {
// Exact computation returns -1 if it cannot compute.
double p = -1;
if (exact) {
p = twoSampleExactP(dnm, n, m, gcd, strictInequality, alternative == AlternativeHypothesis.TWO_SIDED);
}
if (p < 0) {
p = twoSampleApproximateP(d, n, m, alternative == AlternativeHypothesis.TWO_SIDED);
}
return p;
}
/**
* Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
* d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic, either the two-sided
* \(D_{n,m}\) or one-sided \(D_{n,m}^+\}. See
* {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
*
* <p>The returned probability is exact. If the value cannot be computed this returns -1.
*
* <p>Note: This requires the greatest common divisor of n and m. The integral D statistic
* in the range [0, n*m] is separated by increments of the gcd. The method will only
* compute p-values for valid values of D by calculating for D/gcd.
* Strict inquality is performed using the next valid value for D.
*
* @param dnm Integral D-statistic value (in [0, n*m]).
* @param n First sample size.
* @param m Second sample size.
* @param gcd Greatest common divisor of n and m.
* @param strict whether or not the probability to compute is expressed as a strict inequality.
* @param twoSided whether D refers to D or D+.
* @return probability that a randomly selected m-n partition of m + n generates D
* greater than (resp. greater than or equal to) {@code d} (or -1)
*/
static double twoSampleExactP(long dnm, int n, int m, int gcd, boolean strict, boolean twoSided) {
// Create the statistic in [0, lcm]
// For strict inequality D > d the result is the same if we compute for D >= (d+1)
final long d = dnm / gcd + (strict ? 1 : 0);
// P-value methods compute for d <= lcm (least common multiple)
final long lcm = (long) n * (m / gcd);
if (d > lcm) {
return 0;
}
// Note: Some methods require m >= n, others n >= m
final int a = Math.min(n, m);
final int b = Math.max(n, m);
if (twoSided) {
// Any two-sided statistic dnm cannot be less than min(n, m) in the absence of ties.
if (d * gcd <= a) {
return 1;
}
// Here d in [2, lcm]
if (n == m) {
return twoSampleTwoSidedPOutsideSquare(d, n);
}
return twoSampleTwoSidedPStabilizedInner(d, b, a, gcd);
}
// Any one-sided statistic cannot be less than 0
if (d <= 0) {
return 1;
}
// Here d in [1, lcm]
if (n == m) {
return twoSampleOneSidedPOutsideSquare(d, n);
}
return twoSampleOneSidedPOutside(d, a, b, gcd);
}
/**
* Computes \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic.
*
* <p>The returned probability is exact, implemented using the stabilized inner method
* presented in Viehmann (2021).
*
* <p>This is optimized for {@code m <= n}. If {@code m > n} and index-out-of-bounds
* exception can occur.
*
* @param d Integral D-statistic value (in [2, lcm])
* @param n Larger sample size.
* @param m Smaller sample size.
* @param gcd Greatest common divisor of n and m.
* @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
* greater than or equal to {@code d}
*/
private static double twoSampleTwoSidedPStabilizedInner(long d, int n, int m, int gcd) {
// Check the computation is possible.
// Note that the possible paths is binom(m+n, n).
// However the computation is stable above this limit.
// Possible d values (requiring a unique p-value) = max(dnm) / gcd = lcm(n, m).
// Possible terms to compute <= n * size(cij)
// Simple limit based on the number of possible different p-values
if ((long) n * (m / gcd) > MAX_LCM_TWO_SAMPLE_EXACT_P) {
return -1;
}
// This could be updated to use d in [1, lcm].
// Currently it uses d in [gcd, n*m].
// Largest intermediate value is (dnm + im + n) which is within 2^63
// if n and m are 2^31-1, i = n, dnm = n*m: (2^31-1)^2 + (2^31-1)^2 + 2^31-1 < 2^63
final long dnm = d * gcd;
// Viehmann (2021): Updated for i in [0, n], j in [0, m]
// C_i,j = 1 if |i/n - j/m| >= d
// = 0 if |i/n - j/m| < d and (i=0 or j=0)
// = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j) otherwise
// P2 = C_n,m
// Note: The python listing in Viehmann used d in [0, 1]. This uses dnm in [0, nm]
// so updates the scaling to compute the ranges. Also note that the listing uses
// dist > d or dist < -d where this uses |dist| >= d to compute P(D >= d) (non-strict inequality).
// The provided listing is explicit in the values for each j in the range.
// It can be optimized given the known start and end j for each iteration as only
// j where |i/n - j/m| < d must be processed:
// startJ where: im - jn < dnm : jn > im - dnm
// j = floor((im - dnm) / n) + 1 in [0, m]
// endJ where: jn - im >= dnm
// j = ceil((dnm + im) / n) in [0, m+1]
// First iteration with i = 0
// j = ceil(dnm / n)
int endJ = Math.min(m + 1, (int) ((dnm + n - 1) / n));
// Only require 1 array to store C_i-1,j as the startJ only ever increases
// and we update lower indices using higher ones.
// The maximum value *written* is j=m or less using j/m <= 2*d : j = ceil(2*d*m)
// Viehmann uses: size = int(2*m*d + 2) == ceil(2*d*m) + 1
// The maximum value *read* is j/m <= 2*d. This may be above m. This occurs when
// j - lastStartJ > m and C_i-1,j = 1. This can be avoided if (startJ - lastStartJ) <= 1
// which occurs if m <= n, i.e. the window only slides 0 or 1 in j for each increment i
// and we can maintain Cij as 1 larger than ceil(2*d*m) + 1.
final double[] cij = new double[Math.min(m + 1, 2 * endJ + 2)];
// Each iteration fills C_i,j with values and the remaining values are
// kept as 1 for |i/n - j/m| >= d
//assert (endJ - 1) * (long) n < dnm : "jn >= dnm for j < endJ";
for (int j = endJ; j < cij.length; j++) {
//assert j * (long) n >= dnm : "jn < dnm for j >= endJ";
cij[j] = 1;
}
int startJ = 0;
int length = endJ;
double val = -1;
long im = 0;
for (int i = 1; i <= n; i++) {
im += m;
final int lastStartJ = startJ;
// Compute C_i,j for startJ <= j < endJ
// startJ = floor((im - dnm) / n) + 1 in [0, m]
// endJ = ceil((dnm + im) / n) in [0, m+1]
startJ = im < dnm ? 0 : Math.min(m, (int) ((im - dnm) / n) + 1);
endJ = Math.min(m + 1, (int) ((dnm + im + n - 1) / n));
if (startJ >= endJ) {
// No possible paths inside the boundary
return 1;
}
//assert startJ - lastStartJ <= 1 : "startJ - lastStartJ > 1";
// Initialize previous value C_i,j-1
val = startJ == 0 ? 0 : 1;
//assert startJ == 0 || Math.abs(im - (startJ - 1) * (long) n) >= dnm : "|im - jn| < dnm for j < startJ";
//assert endJ > m || Math.abs(im - endJ * (long) n) >= dnm : "|im - jn| < dnm for j >= endJ";
for (int j = startJ; j < endJ; j++) {
//assert j == 0 || Math.abs(im - j * (long) n) < dnm : "|im - jn| >= dnm for startJ <= j < endJ";
// C_i,j = C_i-1,j * i/(i+j) + C_i,j-1 * j/(i+j)
// Note: if (j - lastStartJ) >= cij.length this creates an IOOB exception.
// In this case cij[j - lastStartJ] == 1. Only happens when m >= n.
// Fixed using c_i-1,j = (j - lastStartJ >= cij.length ? 1 : cij[j - lastStartJ]
val = (cij[j - lastStartJ] * i + val * j) / ((double) i + j);
cij[j - startJ] = val;
}
// Must keep the remaining values in C_i,j as 1 to allow
// cij[j - lastStartJ] * i == i when (j - lastStartJ) > lastLength
final int lastLength = length;
length = endJ - startJ;
for (int j = lastLength - length - 1; j >= 0; j--) {
cij[length + j] = 1;
}
}
// Return the most recently written value: C_n,m
return val;
}
/**
* Computes \(P(D_{n,m}^+ \ge d)\), where \(D_{n,m}^+\) is the 2-sample one-sided
* Kolmogorov-Smirnov statistic.
*
* <p>The returned probability is exact, implemented using the outer method
* presented in Hodges (1958).
*
* <p>This method will fail-fast and return -1 if the computation of the
* numbers of paths overflows.
*
* @param d Integral D-statistic value (in [0, lcm])
* @param n First sample size.
* @param m Second sample size.
* @param gcd Greatest common divisor of n and m.
* @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
* greater than or equal to {@code d}
*/
private static double twoSampleOneSidedPOutside(long d, int n, int m, int gcd) {
// Hodges, Fig.2
// Lower boundary: (nx - my)/nm >= d : (nx - my) >= dnm
// B(x, y) is the number of ways from (0, 0) to (x, y) without previously
// reaching the boundary.
// B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
// Total paths:
// sum_y { B(x, y) binom(m+n-x-y, n-y) }
// Normalized by binom(m+n, n). Check this is possible.
final long lm = m;
if (n + lm > Integer.MAX_VALUE) {
return -1;
}
final double binom = binom(m + n, n);
if (binom == Double.POSITIVE_INFINITY) {
return -1;
}
// This could be updated to use d in [1, lcm].
// Currently it uses d in [gcd, n*m].
final long dnm = d * gcd;
// Visit all x in [0, m] where (nx - my) >= d for each increasing y in [0, n].
// x = ceil( (d + my) / n ) = (d + my + n - 1) / n
// y = ceil( (nx - d) / m ) = (nx - d + m - 1) / m
// Note: n m integer, d in [0, nm], the intermediate cannot overflow a long.
// x | y=0 = (d + n - 1) / n
final int x0 = (int) ((dnm + n - 1) / n);
if (x0 >= m) {
return 1 / binom;
}
// The y above is the y *on* the boundary. Set the limit as the next y above:
// y | x=m = 1 + floor( (nx - d) / m ) = 1 + (nm - d) / m
final int maxy = (int) ((n * lm - dnm + m) / m);
// Compute x and B(x, y) for visited B(x,y)
final int[] xy = new int[maxy];
final double[] bxy = new double[maxy];
xy[0] = x0;
bxy[0] = 1;
for (int y = 1; y < maxy; y++) {
final int x = (int) ((dnm + lm * y + n - 1) / n);
// B(x, y) = binom(x+y, y) - [number of ways which previously reached the boundary]
// Add the terms to subtract as a negative sum.
final Sum b = Sum.create();
for (int yy = 0; yy < y; yy++) {
// Here: previousX = x - xy[yy] : previousY = y - yy
// bxy[yy] is the paths to (previousX, previousY)
// binom represent the paths from (previousX, previousY) to (x, y)
b.addProduct(bxy[yy], -binom(x - xy[yy] + y - yy, y - yy));
}
b.add(binom(x + y, y));
xy[y] = x;
bxy[y] = b.getAsDouble();
}
// sum_y { B(x, y) binom(m+n-x-y, n-y) }
final Sum sum = Sum.create();
for (int y = 0; y < maxy; y++) {
sum.addProduct(bxy[y], binom(m + n - xy[y] - y, n - y));
}
// No individual term should have overflowed since binom is finite.
// Any sum above 1 is floating-point error.
return KolmogorovSmirnovDistribution.clipProbability(sum.getAsDouble() / binom);
}
/**
* Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample one-sided
* Kolmogorov-Smirnov statistic.
*
* <p>The returned probability is exact, implemented using the outer method
* presented in Hodges (1958).
*
* @param d Integral D-statistic value (in [1, lcm])
* @param n Sample size.
* @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
* greater than or equal to {@code d}
*/
private static double twoSampleOneSidedPOutsideSquare(long d, int n) {
// Hodges (1958) Eq. 2.3:
// p = binom(2n, n-a) / binom(2n, n)
// a in [1, n] == d * n == dnm / n
final int a = (int) d;
// Rearrange:
// p = ( 2n! / ((n-a)! (n+a)!) ) / ( 2n! / (n! n!) )
// = n! n! / ( (n-a)! (n+a)! )
// Perform using pre-computed factorials if possible.
if (n + a <= MAX_FACTORIAL) {
final double x = Factorial.doubleValue(n);
final double y = Factorial.doubleValue(n - a);
final double z = Factorial.doubleValue(n + a);
return (x / y) * (x / z);
}
// p = n! / (n-a)! * n! / (n+a)!
// n * (n-1) * ... * (n-a+1)
// = -----------------------------
// (n+a) * (n+a-1) * ... * (n+1)
double p = 1;
for (int i = 0; i < a && p != 0; i++) {
p *= (n - i) / (1.0 + n + i);
}
return p;
}
/**
* Computes \(P(D_{n,n}^+ \ge d)\), where \(D_{n,n}^+\) is the 2-sample two-sided
* Kolmogorov-Smirnov statistic.
*
* <p>The returned probability is exact, implemented using the outer method
* presented in Hodges (1958).
*
* @param d Integral D-statistic value (in [1, n])
* @param n Sample size.
* @return probability that a randomly selected m-n partition of n + n generates \(D_{n,n}\)
* greater than or equal to {@code d}
*/
private static double twoSampleTwoSidedPOutsideSquare(long d, int n) {
// Hodges (1958) Eq. 2.4:
// p = 2 [ binom(2n, n-a) - binom(2n, n-2a) + binom(2n, n-3a) - ... ] / binom(2n, n)
// a in [1, n] == d * n == dnm / n
// As per twoSampleOneSidedPOutsideSquare, divide by binom(2n, n) and each term
// can be expressed as a product:
// ( n - i n - i n - i )
// p = 2 * ( prod_i=0^a --------- - prod_i=0^2a --------- + prod_i=0^3a --------- + ... )
// ( 1 + n + i 1 + n + i 1 + n + i )
// for ja in [1, ..., n/a]
// Avoid repeat computation of terms by extracting common products:
// p = 2 * ( p0a * (1 - p1a * (1 - p2a * (1 - ... ))) )
// where each term pja is prod_i={ja}^{ja+a} for all j in [1, n / a]
// The first term is the one-sided p.
final double p0a = twoSampleOneSidedPOutsideSquare(d, n);
if (p0a == 0) {
// Underflow - nothing more to do
return 0;
}
// Compute the inner-terms from small to big.
// j = n / (d/n) ~ n*n / d
// j is a measure of how extreme the d value is (small j is extreme d).
// When j is above 0 a path may traverse from the lower boundary to the upper boundary.
final int a = (int) d;
double p = 0;
for (int j = n / a; j > 0; j--) {
double pja = 1;
final int jaa = j * a + a;
// Since p0a did not underflow we avoid the check for pj != 0
for (int i = j * a; i < jaa; i++) {
pja *= (n - i) / (1.0 + n + i);
}
p = pja * (1 - p);
}
p = p0a * (1 - p);
return Math.min(1, 2 * p);
}
/**
* Compute the binomial coefficient binom(n, k).
*
* @param n N.
* @param k K.
* @return binom(n, k)
*/
private static double binom(int n, int k) {
return BinomialCoefficientDouble.value(n, k);
}
/**
* Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
* is the 2-sample Kolmogorov-Smirnov statistic. See
* {@link #statistic(double[], double[])} for the definition of \(D_{n,m}\).
*
* <p>Specifically, what is returned is \(1 - CDF(d, \sqrt{mn / (m + n)})\) where CDF
* is the cumulative density function of the two-sided one-sample Kolmogorov-Smirnov
* distribution.
*
* @param d D-statistic value.
* @param n First sample size.
* @param m Second sample size.
* @param twoSided True to compute the two-sided p-value; else one-sided.
* @return approximate probability that a randomly selected m-n partition of m + n generates
* \(D_{n,m}\) greater than {@code d}
*/
static double twoSampleApproximateP(double d, int n, int m, boolean twoSided) {
final double nn = Math.min(n, m);
final double mm = Math.max(n, m);
if (twoSided) {
// Smirnov's asymptotic formula:
// P(sqrt(N) D_n > x)
// N = m*n/(m+n)
return KolmogorovSmirnovDistribution.Two.sf(d, (int) Math.round(mm * nn / (mm + nn)));
}
// one-sided
// Use Hodges Eq 5.3. Requires m >= n
// Correct for m=n, m an integral multiple of n, and 'on the average' for m nearly equal to n
final double z = d * Math.sqrt(nn * mm / (nn + mm));
return Math.exp(-2 * z * z - 2 * z * (mm + 2 * nn) / Math.sqrt(mm * nn * (mm + nn)) / 3);
}
/**
* Verifies that {@code array} has length at least 2.
*
* @param array Array to test.
* @return the length
* @throws IllegalArgumentException if array is too short
*/
private static int checkArrayLength(double[] array) {
final int n = array.length;
if (n <= 1) {
throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
}
return n;
}
/**
* Sort the input array. Throws an exception if NaN values are
* present. It is assumed the array is non-zero length.
*
* @param x Input array.
* @param name Name of the array.
* @return a reference to the input (sorted) array
* @throws IllegalArgumentException if {@code x} contains NaN values.
*/
private static double[] sort(double[] x, String name) {
Arrays.sort(x);
// NaN will be at the end
if (Double.isNaN(x[x.length - 1])) {
throw new InferenceException(name + " contains NaN");
}
return x;
}
}