You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2021/06/14 07:40:29 UTC

[GitHub] [beam] robertwb commented on a change in pull request #15002: [BEAM-9547] Dataframe weighted sample.

robertwb commented on a change in pull request #15002:
URL: https://github.com/apache/beam/pull/15002#discussion_r650242075



##########
File path: sdks/python/apache_beam/dataframe/frames_test.py
##########
@@ -1607,6 +1607,30 @@ def test_sample_with_missing_weights(self):
     self.assertEqual(series_result.name, "GDP")
     self.assertEqual(set(series_result.index), set(["Nauru", "Iceland"]))
 
+  def test_sample_with_weights_distribution(self):
+    num_other_elements = 100
+    num_runs = 20
+
+    def sample_many_times(s, weights):
+      all = None
+      for _ in range(num_runs):
+        sampled = s.sample(weights=weights)
+        if all is None:
+          all = sampled
+        else:
+          all = all.append(sampled)
+      return all.sum()
+
+    result = self._run_test(
+        sample_many_times,
+        # The first element is 1, the rest are all 0.  This means that when
+        # we sum all the sampled elements (above), the result should be the
+        # number of times the first element was sampled.
+        pd.Series([1] + [0] * num_other_elements),
+        # Pick the first element about 20% of the time.
+        pd.Series([0.2] + [.8 / num_other_elements] * num_other_elements))
+
+    self.assertTrue(0 < result < num_runs / 2, result)

Review comment:
       Updated the test, PTAL.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org