You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by ki...@apache.org on 2012/10/17 22:13:49 UTC

git commit: CRUNCH-98. Sampling Scala PCollection.

Updated Branches:
  refs/heads/master 2576896c9 -> 14132b093


CRUNCH-98. Sampling Scala PCollection.


Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/14132b09
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/14132b09
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/14132b09

Branch: refs/heads/master
Commit: 14132b093cd5201b0f323d7fb9c7a9ab4a58a679
Parents: 2576896
Author: Kiyan Ahmadizadeh <ki...@wibidata.com>
Authored: Tue Oct 16 15:29:41 2012 -0700
Committer: Kiyan Ahmadizadeh <ki...@wibidata.com>
Committed: Tue Oct 16 16:10:47 2012 -0700

----------------------------------------------------------------------
 .../apache/crunch/scrunch/PCollectionTest.scala    |   28 +++++++++++++++
 .../org/apache/crunch/scrunch/PCollection.scala    |    8 ++++
 2 files changed, 36 insertions(+), 0 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
index 4c25298..94ac917 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
@@ -69,4 +69,32 @@ class PCollectionTest extends CrunchTestSupport with JUnitSuite {
     assertEquals("Wrong last line in Shakespeare.", lastLineInShakespeare,
         lines(linesInShakespeare - 1))
   }
+
+  /**
+   * Checks that sample(0.10) keeps about ten percent of the Shakespeare lines (+- 50 tolerance).
+   */
+  @Test def testSampling {
+    // Sample the Shakespeare collection with an acceptance probability of 0.10 (no seed).
+    val shakespeare = shakespeareCollection
+    val sampledCollection = shakespeare.sample(0.10)
+    val length = sampledCollection.length().value()
+    // The sampled length should be about ten percent of linesInShakespeare; allow +- 50.
+    // NOTE(review): no seed is given, so the length presumably varies per run -- confirm.
+    val lower = linesInShakespeare * 0.10 - 50
+    val upper = linesInShakespeare * 0.10 + 50
+    assertTrue("Sampled collection contains too few elements.", lower <= length)
+    assertTrue("Sampled collection contains too many elements.", length <= upper)
+  }
+
+  /**
+   * Checks that sample(0.10, 1L) yields exactly 380 elements from the Shakespeare collection.
+   */
+  @Test def testSamplingWithSeed {
+    // Get the collection to sample from.
+    val shakespeare = shakespeareCollection
+    // Seed 1L is expected to select 380 elements; NOTE(review): presumably RNG-dependent.
+    val sampledCollection = shakespeare.sample(0.10, 1L)
+    val length = sampledCollection.length().value()
+    assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length)
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
index 89959ea..ac2242f 100644
--- a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
+++ b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
@@ -76,6 +76,14 @@ class PCollection[S](val native: JCollection[S]) extends PCollectionLike[S, PCol
 
   def min()(implicit converter: Converter[S, S]) = PObject(Aggregate.min(native))(converter)
 
+  def sample(acceptanceProbability: Double) = { // Samples via native.sample, then wraps.
+    wrap(native.sample(acceptanceProbability))
+  }
+
+  def sample(acceptanceProbability: Double, seed: Long) = { // Seeded overload; wraps result.
+    wrap(native.sample(acceptanceProbability, seed))
+  }
+
   def pType = native.getPType()
 }