You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by er...@apache.org on 2018/11/23 11:43:04 UTC
[3/8] commons-rng git commit: RNG-62: Added a CombinationSampler
RNG-62: Added a CombinationSampler
Complements the PermutationSampler as the sample order is C(n, k) rather
than P(n, k). Can be used for faster subset sampling if the subset order
is not important.
Project: http://git-wip-us.apache.org/repos/asf/commons-rng/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-rng/commit/17ddb9f3
Tree: http://git-wip-us.apache.org/repos/asf/commons-rng/tree/17ddb9f3
Diff: http://git-wip-us.apache.org/repos/asf/commons-rng/diff/17ddb9f3
Branch: refs/heads/master
Commit: 17ddb9f3376a1cc2e798246fedcf73320902354c
Parents: 37638f1
Author: Alex Herbert <a....@sussex.ac.uk>
Authored: Wed Nov 21 15:15:28 2018 +0000
Committer: Gilles <er...@apache.org>
Committed: Thu Nov 22 15:17:05 2018 +0100
----------------------------------------------------------------------
.../rng/sampling/CombinationSampler.java | 167 ++++++++++++++++
.../rng/sampling/CombinationSamplerTest.java | 190 +++++++++++++++++++
2 files changed, 357 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-rng/blob/17ddb9f3/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java
----------------------------------------------------------------------
diff --git a/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java b/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java
new file mode 100644
index 0000000..04b4376
--- /dev/null
+++ b/commons-rng-sampling/src/main/java/org/apache/commons/rng/sampling/CombinationSampler.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.rng.sampling;
+
+import org.apache.commons.rng.UniformRandomProvider;
+
+/**
+ * Class for representing combinations of a sequence of integers.
+ *
+ * <p>A combination is a selection of items from a collection, such that (unlike
+ * permutations) the order of selection <strong>does not matter</strong>. This
+ * sampler can be used to generate a combination in an unspecified order and is
+ * faster than the corresponding {@link PermutationSampler}.
+ *
+ * <p>Note that the sample order is unspecified. For example a sample
+ * combination of 2 from 4 may return {@code [0,1]} or {@code [1,0]} as the two are
+ * equivalent, and the order of a given combination may change in subsequent samples.
+ *
+ * <p>The sampler can be used to generate indices to select subsets where the
+ * order of the subset is not important.
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Combination">Combination
+ * definition</a>
+ * @see PermutationSampler
+ */
+public class CombinationSampler {
+ /** Domain of the combination. */
+ private final int[] domain;
+ /** Size of the combination. */
+ private final int size;
+ /** The number of steps of a full shuffle to perform. */
+ private final int steps;
+ /**
+ * The position to copy the domain from after a partial shuffle.
+ *
+ * <p>The copy is either in the range [0 : size] or [domain.length - size :
+ * domain.length].
+ */
+ private final int from;
+ /** RNG. */
+ private final UniformRandomProvider rng;
+
+ /**
+ * Creates a generator of combinations.
+ *
+ * <p>The {@link #sample()} method will generate an integer array of
+ * length {@code k} whose entries are selected randomly, without
+ * repetition, from the integers 0, 1, ..., {@code n}-1 (inclusive).
+ * The returned array represents a combination of {@code n} taken
+ * {@code k}.
+ *
+ * <p>In contrast to a permutation, the returned array is <strong>not
+ * guaranteed</strong> to be in a random order. The {@link #sample()}
+ * method returns the array in an unspecified order.
+ *
+ * <p>If {@code n <= 0} or {@code k <= 0} or {@code k > n} then no combination
+ * is required and an exception is raised.
+ *
+ * @param rng Generator of uniformly distributed random numbers.
+ * @param n Domain of the combination.
+ * @param k Size of the combination.
+ * @throws IllegalArgumentException if {@code n <= 0} or {@code k <= 0} or
+ * {@code k > n}.
+ */
+ public CombinationSampler(UniformRandomProvider rng, int n, int k) {
+ if (n <= 0) {
+ throw new IllegalArgumentException("n <= 0 : n=" + n);
+ }
+ if (k <= 0) {
+ throw new IllegalArgumentException("k <= 0 : k=" + k);
+ }
+ if (k > n) {
+ throw new IllegalArgumentException("k > n : k=" + k + ", n=" + n);
+ }
+
+ domain = PermutationSampler.natural(n);
+ size = k;
+ // The sample can be optimised by only performing the first k or (n - k) steps
+ // from a full Fisher-Yates shuffle from the end of the domain to the start.
+ // The upper positions will then contain a random sample from the domain. The
+ // lower half is then by definition also a random sample (just not in a random order).
+ // The sample is then picked using the upper or lower half depending which
+ // makes the number of steps smaller.
+ if (k <= n / 2) {
+ // Upper half
+ steps = k;
+ from = n - k;
+ } else {
+ // Lower half
+ steps = n - k;
+ from = 0;
+ }
+ this.rng = rng;
+ }
+
+ /**
+ * Return a combination of {@code k} whose entries are selected randomly,
+ * without repetition, from the integers 0, 1, ..., {@code n}-1 (inclusive).
+ *
+ * <p>The order of the returned array is not guaranteed to be in a random order
+ * as the order of a combination <strong>does not matter</strong>.
+ *
+ * @return a random combination.
+ */
+ public int[] sample() {
+ // Shuffle from the end but limit to a number of steps.
+ // The subset C(n, k) is then either those positions that have
+ // been sampled, or those that haven't been, depending
+ // on the number of steps.
+ // Note: if n==k the number of steps is zero and the result
+ // is just a clone of the domain.
+ for (int i = domain.length - 1,
+ j = 0; i > 0 && j < steps; i--, j++) {
+ // Swap index i with any position down to 0 (including itself)
+ swap(domain, i, rng.nextInt(i + 1));
+ }
+ final int[] result = new int[size];
+ System.arraycopy(domain, from, result, 0, size);
+ return result;
+ }
+
+ /**
+ * Swaps the two specified elements in the specified array.
+ *
+ * @param array the array
+ * @param i the first index
+ * @param j the second index
+ */
+ private static void swap(int[] array, int i, int j) {
+ final int tmp = array[i];
+ array[i] = array[j];
+ array[j] = tmp;
+ }
+
+ /**
+ * Gets the domain size (n) of the combination C(n, k).
+ *
+ * @return the number of elements (n) in the domain
+ */
+ public int getN() {
+ return domain.length;
+ }
+
+ /**
+ * Gets the sample size (k) of the combination C(n, k).
+ *
+ * @return the number of elements (k) in the combination
+ */
+ public int getK() {
+ return size;
+ }
+}
http://git-wip-us.apache.org/repos/asf/commons-rng/blob/17ddb9f3/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java
----------------------------------------------------------------------
diff --git a/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java b/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java
new file mode 100644
index 0000000..2d43204
--- /dev/null
+++ b/commons-rng-sampling/src/test/java/org/apache/commons/rng/sampling/CombinationSamplerTest.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing sampleissions and
+ * limitations under the License.
+ */
+package org.apache.commons.rng.sampling;
+
+import java.util.Arrays;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.commons.math3.stat.inference.ChiSquareTest;
+import org.apache.commons.math3.util.CombinatoricsUtils;
+import org.apache.commons.rng.UniformRandomProvider;
+import org.apache.commons.rng.simple.RandomSource;
+
+/**
+ * Tests for {@link CombinationSampler}.
+ */
+public class CombinationSamplerTest {
+ private final UniformRandomProvider rng = RandomSource.create(RandomSource.XOR_SHIFT_1024_S);
+
+ @Test
+ public void testSampleIsInDomain() {
+ final int n = 6;
+ for (int k = 1; k <= n; k++) {
+ final CombinationSampler sampler = new CombinationSampler(rng, n, k);
+ Assert.assertEquals("Incorrect n", n, sampler.getN());
+ Assert.assertEquals("Incorrect k", k, sampler.getK());
+ final int[] random = sampler.sample();
+ for (int s : random) {
+ assertIsInDomain(n, s);
+ }
+ }
+ }
+
+ @Test
+ public void testUniformWithKlessThanHalfN() {
+ final int n = 8;
+ final int k = 2;
+ assertUniformSamples(n, k);
+ }
+
+ @Test
+ public void testUniformWithKmoreThanHalfN() {
+ final int n = 8;
+ final int k = 6;
+ assertUniformSamples(n, k);
+ }
+
+ @Test
+ public void testSampleWhenNequalsKIsNotShuffled() {
+ // Check n == k boundary case.
+ // This is allowed but the sample is not shuffled.
+ for (int n = 1; n < 3; n++) {
+ final int k = n;
+ final CombinationSampler sampler = new CombinationSampler(rng, n, k);
+ Assert.assertEquals("Incorrect n", n, sampler.getN());
+ Assert.assertEquals("Incorrect k", k, sampler.getK());
+ final int[] sample = sampler.sample();
+ Assert.assertEquals("Incorrect sample length", n, sample.length);
+ for (int i = 0; i < n; i++) {
+ Assert.assertEquals("Sample was shuffled", i, sample[i]);
+ }
+ }
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testKgreaterThanNThrows() {
+ // Must fail for k > n.
+ final int n = 2;
+ final int k = 3;
+ new CombinationSampler(rng, n, k);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testNequalsZeroThrows() {
+ // Must fail for n = 0.
+ final int n = 0;
+ final int k = 3;
+ new CombinationSampler(rng, n, k);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testKequalsZeroThrows() {
+ // Must fail for k = 0.
+ final int n = 2;
+ final int k = 0;
+ new CombinationSampler(rng, n, k);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testNisNegativeThrows() {
+ // Must fail for n <= 0.
+ final int n = -1;
+ final int k = 3;
+ new CombinationSampler(rng, n, k);
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testKisNegativeThrows() {
+ // Must fail for k <= 0.
+ final int n = 0;
+ final int k = -1;
+ new CombinationSampler(rng, n, k);
+ }
+
+ //// Support methods.
+
+ /**
+ * Asserts the sample value is in the range 0 to n-1.
+ *
+ * @param n the n
+ * @param value the sample value
+ */
+ private static final void assertIsInDomain(int n, int value) {
+ if (value < 0 || value >= n) {
+ Assert.fail("sample " + value + " not in the domain " + n);
+ }
+ }
+
+ private void assertUniformSamples(int n, int k) {
+ // The C(n, k) should generate a sample of unspecified order.
+ // To test this each combination is allocated a unique code
+ // based on setting k of the first n-bits in an integer.
+ // Codes are positive for all combinations of bits that use k-bits,
+ // otherwise they are negative.
+ final int totalBitCombinations = 1 << n;
+ int[] codeLookup = new int[totalBitCombinations];
+ Arrays.fill(codeLookup, -1); // initialise as negative
+ int codes = 0;
+ for (int i = 0; i < totalBitCombinations; i++) {
+ if (Integer.bitCount(i) == k) {
+ // This is a valid sample so allocate a code
+ codeLookup[i] = codes++;
+ }
+ }
+
+ // The number of combinations C(n, k) is the binomial coefficient
+ Assert.assertEquals("Incorrect number of combination codes",
+ CombinatoricsUtils.binomialCoefficient(n, k), codes);
+
+ final long[] observed = new long[codes];
+ final int numSamples = 6000;
+
+ final CombinationSampler sampler = new CombinationSampler(rng, n, k);
+ for (int i = 0; i < numSamples; i++) {
+ observed[findCode(codeLookup, sampler.sample())]++;
+ }
+
+ // Chi squared test of uniformity
+ final double numExpected = numSamples / (double) codes;
+ final double[] expected = new double[codes];
+ Arrays.fill(expected, numExpected);
+ final ChiSquareTest chiSquareTest = new ChiSquareTest();
+ // Pass if we cannot reject null hypothesis that distributions are the same.
+ Assert.assertFalse(chiSquareTest.chiSquareTest(expected, observed, 0.001));
+ }
+
+ private static int findCode(int[] codeLookup, int[] sample) {
+ // Each sample index is used to set a bit in an integer.
+ // The resulting bits should be a valid code.
+ int bits = 0;
+ for (int s : sample) {
+ // This shift will be from 0 to n-1 since it is from the
+ // domain of size n.
+ bits |= (1 << s);
+ }
+ if (bits >= codeLookup.length) {
+ Assert.fail("Bad bit combination: " + Arrays.toString(sample));
+ }
+ final int code = codeLookup[bits];
+ if (code < 0) {
+ Assert.fail("Bad bit code: " + Arrays.toString(sample));
+ }
+ return code;
+ }
+}