You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by ki...@apache.org on 2012/10/17 22:13:49 UTC
git commit: CRUNCH-98. Sampling Scala PCollection.
Updated Branches:
refs/heads/master 2576896c9 -> 14132b093
CRUNCH-98. Sampling Scala PCollection.
Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/14132b09
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/14132b09
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/14132b09
Branch: refs/heads/master
Commit: 14132b093cd5201b0f323d7fb9c7a9ab4a58a679
Parents: 2576896
Author: Kiyan Ahmadizadeh <ki...@wibidata.com>
Authored: Tue Oct 16 15:29:41 2012 -0700
Committer: Kiyan Ahmadizadeh <ki...@wibidata.com>
Committed: Tue Oct 16 16:10:47 2012 -0700
----------------------------------------------------------------------
.../apache/crunch/scrunch/PCollectionTest.scala | 28 +++++++++++++++
.../org/apache/crunch/scrunch/PCollection.scala | 8 ++++
2 files changed, 36 insertions(+), 0 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
index 4c25298..94ac917 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
@@ -69,4 +69,32 @@ class PCollectionTest extends CrunchTestSupport with JUnitSuite {
assertEquals("Wrong last line in Shakespeare.", lastLineInShakespeare,
lines(linesInShakespeare - 1))
}
+
+ /**
+ * Tests that sampling a PCollection with an acceptance probability keeps a proportional share.
+ */
+ @Test def testSampling {
+ // Get the collection and sample it with an acceptance probability of 0.10 (no seed given).
+ val shakespeare = shakespeareCollection
+ val sampledCollection = shakespeare.sample(0.10)
+ val length = sampledCollection.length().value()
+ // The sampled count should be about ten percent of linesInShakespeare; an absolute tolerance
+ // of +/- 50 is allowed. NOTE(review): unseeded, so this bound may presumably fail on rare runs.
+ val lower = linesInShakespeare * 0.10 - 50
+ val upper = linesInShakespeare * 0.10 + 50
+ assertTrue("Sampled collection contains too few elements.", lower <= length)
+ assertTrue("Sampled collection contains too many elements.", length <= upper)
+ }
+
+ /**
+ * Tests that sampling a PCollection with an acceptance probability and a fixed seed is exact.
+ */
+ @Test def testSamplingWithSeed {
+ // Get the collection to sample from.
+ val shakespeare = shakespeareCollection
+ // Sample with acceptance probability 0.10 and seed 1L; exactly 380 elements are expected.
+ val sampledCollection = shakespeare.sample(0.10, 1L)
+ val length = sampledCollection.length().value()
+ assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length)
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
index 89959ea..ac2242f 100644
--- a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
+++ b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
@@ -76,6 +76,14 @@ class PCollection[S](val native: JCollection[S]) extends PCollectionLike[S, PCol
def min()(implicit converter: Converter[S, S]) = PObject(Aggregate.min(native))(converter)
+ def sample(acceptanceProbability: Double) = { // Wraps the result of native.sample.
+ wrap(native.sample(acceptanceProbability))
+ }
+
+ def sample(acceptanceProbability: Double, seed: Long) = { // Wraps native.sample with a seed.
+ wrap(native.sample(acceptanceProbability, seed))
+ }
+
def pType = native.getPType()
}