You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by er...@apache.org on 2018/11/23 11:43:04 UTC

[3/8] commons-rng git commit: RNG-62: Added a CombinationSampler

RNG-62: Added a CombinationSampler

Complements the PermutationSampler as the sample order is C(n, k) rather
than P(n, k). Can be used for faster subset sampling if the subset order
is not important.

Project: http://git-wip-us.apache.org/repos/asf/commons-rng/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-rng/commit/17ddb9f3
Tree: http://git-wip-us.apache.org/repos/asf/commons-rng/tree/17ddb9f3
Diff: http://git-wip-us.apache.org/repos/asf/commons-rng/diff/17ddb9f3

Branch: refs/heads/master
Commit: 17ddb9f3376a1cc2e798246fedcf73320902354c
Parents: 37638f1
Author: Alex Herbert <a....@sussex.ac.uk>
Authored: Wed Nov 21 15:15:28 2018 +0000
Committer: Gilles <er...@apache.org>
Committed: Thu Nov 22 15:17:05 2018 +0100

----------------------------------------------------------------------
 .../rng/sampling/CombinationSampler.java        | 167 ++++++++++++++++
 .../rng/sampling/CombinationSamplerTest.java    | 190 +++++++++++++++++++
 2 files changed, 357 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-rng/blob/17ddb9f3/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java
----------------------------------------------------------------------
diff --git a/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java b/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java
new file mode 100644
index 0000000..04b4376
--- /dev/null
+++ b/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.rng.sampling;
+
+import org.apache.commons.rng.UniformRandomProvider;
+
+/**
+ * Class for representing combinations of a sequence of integers.
+ *
+ * <p>A combination is a selection of items from a collection, such that (unlike
+ * permutations) the order of selection <strong>does not matter</strong>. This
+ * sampler can be used to generate a combination in an unspecified order and is
+ * faster than the corresponding {@link PermutationSampler}.
+ *
+ * <p>Note that the sample order is unspecified. For example a sample
+ * combination of 2 from 4 may return {@code [0,1]} or {@code [1,0]} as the two are
+ * equivalent, and the order of a given combination may change in subsequent samples.
+ *
+ * <p>The sampler can be used to generate indices to select subsets where the
+ * order of the subset is not important.
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Combination">Combination
+ *      definition</a>
+ * @see PermutationSampler
+ */
+public class CombinationSampler {
+    /** Domain of the combination. */
+    private final int[] domain;
+    /** Size of the combination. */
+    private final int size;
+    /** The number of steps of a full shuffle to perform. */
+    private final int steps;
+    /**
+     * The position to copy the domain from after a partial shuffle.
+     *
+     * <p>The copy is either in the range [0 : size] or [domain.length - size :
+     * domain.length].
+     */
+    private final int from;
+    /** RNG. */
+    private final UniformRandomProvider rng;
+
+    /**
+     * Creates a generator of combinations.
+     *
+     * <p>The {@link #sample()} method will generate an integer array of
+     * length {@code k} whose entries are selected randomly, without
+     * repetition, from the integers 0, 1, ..., {@code n}-1 (inclusive).
+     * The returned array represents a combination of {@code n} taken
+     * {@code k}.
+     *
+     * <p>In contrast to a permutation, the returned array is <strong>not
+     * guaranteed</strong> to be in a random order. The {@link #sample()}
+     * method returns the array in an unspecified order.
+     *
+     * <p>If {@code n <= 0} or {@code k <= 0} or {@code k > n} then no combination
+     * is required and an exception is raised.
+     *
+     * @param rng Generator of uniformly distributed random numbers.
+     * @param n   Domain of the combination.
+     * @param k   Size of the combination.
+     * @throws IllegalArgumentException if {@code n <= 0} or {@code k <= 0} or
+     *                                  {@code k > n}.
+     */
+    public CombinationSampler(UniformRandomProvider rng, int n, int k) {
+        if (n <= 0) {
+            throw new IllegalArgumentException("n <= 0 : n=" + n);
+        }
+        if (k <= 0) {
+            throw new IllegalArgumentException("k <= 0 : k=" + k);
+        }
+        if (k > n) {
+            throw new IllegalArgumentException("k > n : k=" + k + ", n=" + n);
+        }
+
+        domain = PermutationSampler.natural(n);
+        size = k;
+        // The sample can be optimised by only performing the first k or (n - k) steps
+        // from a full Fisher-Yates shuffle from the end of the domain to the start.
+        // The upper positions will then contain a random sample from the domain. The
+        // lower half is then by definition also a random sample (just not in a random order).
+        // The sample is then picked using the upper or lower half depending which
+        // makes the number of steps smaller.
+        if (k <= n / 2) {
+            // Upper half
+            steps = k;
+            from = n - k;
+        } else {
+            // Lower half
+            steps = n - k;
+            from = 0;
+        }
+        this.rng = rng;
+    }
+
+    /**
+     * Return a combination of {@code k} whose entries are selected randomly,
+     * without repetition, from the integers 0, 1, ..., {@code n}-1 (inclusive).
+     *
+     * <p>The order of the returned array is not guaranteed to be in a random order
+     * as the order of a combination <strong>does not matter</strong>.
+     *
+     * @return a random combination.
+     */
+    public int[] sample() {
+        // Shuffle from the end but limit to a number of steps.
+        // The subset C(n, k) is then either those positions that have
+        // been sampled, or those that haven't been, depending
+        // on the number of steps.
+        // Note: if n==k the number of steps is zero and the result
+        // is just a clone of the domain.
+        for (int i = domain.length - 1,
+                j = 0; i > 0 && j < steps; i--, j++) {
+            // Swap index i with any position down to 0 (including itself)
+            swap(domain, i, rng.nextInt(i + 1));
+        }
+        final int[] result = new int[size];
+        System.arraycopy(domain, from, result, 0, size);
+        return result;
+    }
+
+    /**
+     * Swaps the two specified elements in the specified array.
+     *
+     * @param array the array
+     * @param i     the first index
+     * @param j     the second index
+     */
+    private static void swap(int[] array, int i, int j) {
+        final int tmp = array[i];
+        array[i] = array[j];
+        array[j] = tmp;
+    }
+
+    /**
+     * Gets the domain size (n) of the combination C(n, k).
+     *
+     * @return the number of elements (n) in the domain
+     */
+    public int getN() {
+        return domain.length;
+    }
+
+    /**
+     * Gets the sample size (k) of the combination C(n, k).
+     *
+     * @return the number of elements (k) in the combination
+     */
+    public int getK() {
+        return size;
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-rng/blob/17ddb9f3/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java
----------------------------------------------------------------------
diff --git a/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java b/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java
new file mode 100644
index 0000000..2d43204
--- /dev/null
+++ b/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing sampleissions and
+ * limitations under the License.
+ */
+package org.apache.commons.rng.sampling;
+
+import java.util.Arrays;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.commons.math3.stat.inference.ChiSquareTest;
+import org.apache.commons.math3.util.CombinatoricsUtils;
+import org.apache.commons.rng.UniformRandomProvider;
+import org.apache.commons.rng.simple.RandomSource;
+
+/**
+ * Tests for {@link CombinationSampler}.
+ */
+public class CombinationSamplerTest {
+    private final UniformRandomProvider rng = RandomSource.create(RandomSource.XOR_SHIFT_1024_S);
+
+    @Test
+    public void testSampleIsInDomain() {
+        final int n = 6;
+        for (int k = 1; k <= n; k++) {
+            final CombinationSampler sampler = new CombinationSampler(rng, n, k);
+            Assert.assertEquals("Incorrect n", n, sampler.getN());
+            Assert.assertEquals("Incorrect k", k, sampler.getK());
+            final int[] random = sampler.sample();
+            for (int s : random) {
+                assertIsInDomain(n, s);
+            }
+        }
+    }
+
+    @Test
+    public void testUniformWithKlessThanHalfN() {
+        final int n = 8;
+        final int k = 2;
+        assertUniformSamples(n, k);
+    }
+
+    @Test
+    public void testUniformWithKmoreThanHalfN() {
+        final int n = 8;
+        final int k = 6;
+        assertUniformSamples(n, k);
+    }
+
+    @Test
+    public void testSampleWhenNequalsKIsNotShuffled() {
+        // Check n == k boundary case.
+        // This is allowed but the sample is not shuffled.
+        for (int n = 1; n < 3; n++) {
+            final int k = n;
+            final CombinationSampler sampler = new CombinationSampler(rng, n, k);
+            Assert.assertEquals("Incorrect n", n, sampler.getN());
+            Assert.assertEquals("Incorrect k", k, sampler.getK());
+            final int[] sample = sampler.sample();
+            Assert.assertEquals("Incorrect sample length", n, sample.length);
+            for (int i = 0; i < n; i++) {
+                Assert.assertEquals("Sample was shuffled", i, sample[i]);
+            }
+        }
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testKgreaterThanNThrows() {
+        // Must fail for k > n.
+        final int n = 2;
+        final int k = 3;
+        new CombinationSampler(rng, n, k);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testNequalsZeroThrows() {
+        // Must fail for n = 0.
+        final int n = 0;
+        final int k = 3;
+        new CombinationSampler(rng, n, k);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testKequalsZeroThrows() {
+        // Must fail for k = 0.
+        final int n = 2;
+        final int k = 0;
+        new CombinationSampler(rng, n, k);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testNisNegativeThrows() {
+        // Must fail for n <= 0.
+        final int n = -1;
+        final int k = 3;
+        new CombinationSampler(rng, n, k);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testKisNegativeThrows() {
+        // Must fail for k <= 0.
+        final int n = 0;
+        final int k = -1;
+        new CombinationSampler(rng, n, k);
+    }
+
+    //// Support methods.
+
+    /**
+     * Asserts the sample value is in the range 0 to n-1.
+     *
+     * @param n     the n
+     * @param value the sample value
+     */
+    private static final void assertIsInDomain(int n, int value) {
+        if (value < 0 || value >= n) {
+            Assert.fail("sample " + value + " not in the domain " + n);
+        }
+    }
+
+    private void assertUniformSamples(int n, int k) {
+        // The C(n, k) should generate a sample of unspecified order.
+        // To test this each combination is allocated a unique code
+        // based on setting k of the first n-bits in an integer.
+        // Codes are positive for all combinations of bits that use k-bits,
+        // otherwise they are negative.
+        final int totalBitCombinations = 1 << n;
+        int[] codeLookup = new int[totalBitCombinations];
+        Arrays.fill(codeLookup, -1); // initialise as negative
+        int codes = 0;
+        for (int i = 0; i < totalBitCombinations; i++) {
+            if (Integer.bitCount(i) == k) {
+                // This is a valid sample so allocate a code
+                codeLookup[i] = codes++;
+            }
+        }
+
+        // The number of combinations C(n, k) is the binomial coefficient
+        Assert.assertEquals("Incorrect number of combination codes",
+                CombinatoricsUtils.binomialCoefficient(n, k), codes);
+
+        final long[] observed = new long[codes];
+        final int numSamples = 6000;
+
+        final CombinationSampler sampler = new CombinationSampler(rng, n, k);
+        for (int i = 0; i < numSamples; i++) {
+            observed[findCode(codeLookup, sampler.sample())]++;
+        }
+
+        // Chi squared test of uniformity
+        final double numExpected = numSamples / (double) codes;
+        final double[] expected = new double[codes];
+        Arrays.fill(expected, numExpected);
+        final ChiSquareTest chiSquareTest = new ChiSquareTest();
+        // Pass if we cannot reject null hypothesis that distributions are the same.
+        Assert.assertFalse(chiSquareTest.chiSquareTest(expected, observed, 0.001));
+    }
+
+    private static int findCode(int[] codeLookup, int[] sample) {
+        // Each sample index is used to set a bit in an integer.
+        // The resulting bits should be a valid code.
+        int bits = 0;
+        for (int s : sample) {
+            // This shift will be from 0 to n-1 since it is from the
+            // domain of size n.
+            bits |= (1 << s);
+        }
+        if (bits >= codeLookup.length) {
+            Assert.fail("Bad bit combination: " + Arrays.toString(sample));
+        }
+        final int code = codeLookup[bits];
+        if (code < 0) {
+            Assert.fail("Bad bit code: " + Arrays.toString(sample));
+        }
+        return code;
+    }
+}