You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2018/07/09 22:19:23 UTC

[1/2] datafu git commit: DATAFU-127 New macro - sample by keys

Repository: datafu
Updated Branches:
  refs/heads/master 8dae166c0 -> 17f034a07


DATAFU-127 New macro - sample by keys

Signed-off-by: Matthew Hayes <mh...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/datafu/commit/a0d1366d
Tree: http://git-wip-us.apache.org/repos/asf/datafu/tree/a0d1366d
Diff: http://git-wip-us.apache.org/repos/asf/datafu/diff/a0d1366d

Branch: refs/heads/master
Commit: a0d1366d5c3a0beecf07f551d7eaf682b7adfdf1
Parents: 8dae166
Author: Eyal Allweil <ea...@paypal.com>
Authored: Mon Jul 9 15:17:51 2018 -0700
Committer: Matthew Hayes <mh...@apache.org>
Committed: Mon Jul 9 15:17:51 2018 -0700

----------------------------------------------------------------------
 .../main/resources/datafu/sample_by_keys.pig    | 44 ++++++++++++++++++++
 .../datafu/test/pig/sampling/SamplingTests.java | 40 +++++++++++++++++-
 2 files changed, 83 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/datafu/blob/a0d1366d/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/sample_by_keys.pig b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
new file mode 100644
index 0000000..c22ffc7
--- /dev/null
+++ b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Macro for sampling a table by a list of keys: keeps only the rows of the table whose
+ * join key appears in the sample set (duplicate keys in the sample set are collapsed first).
+ * Params:
+ *   - table                    - the table (relation) to sample
+ *   - sample_set               - a set of keys
+ *   - join_key_table           - join column name in the table
+ *   - join_key_sample          - join column name in the sample
+ */
+DEFINE sample_by_keys(table, sample_set, join_key_table, join_key_sample) RETURNS out {
+    t = FOREACH $table GENERATE
+        $join_key_table AS join_key,
+        TOTUPLE(*)      AS original;
+
+    s = FOREACH $sample_set GENERATE $join_key_sample;
+    sd = DISTINCT s;
+
+    joined = JOIN   t       BY join_key,
+                    sd      BY $join_key_sample USING 'replicated';
+
+    flat = FOREACH joined GENERATE FLATTEN(original);
+
+    -- the replicated join above is a map-only job; this ORDER (PARALLEL 1) forces a reduce phase so the result is not spread over many output files
+    $out = ORDER flat BY $join_key_table PARALLEL 1;
+};

http://git-wip-us.apache.org/repos/asf/datafu/blob/a0d1366d/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java b/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
index 418a694..6fdb580 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
@@ -30,7 +30,6 @@ import java.util.Set;
 import junit.framework.Assert;
 
 import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.backend.executionengine.ExecException;
 import org.apache.pig.data.BagFactory;
 import org.apache.pig.data.DataBag;
 import org.apache.pig.data.Tuple;
@@ -520,4 +519,43 @@ public class SamplingTests extends PigTests
       found.add(i);
     }
   }
+
+  private void prepareDataForSampleByKeysTest() throws IOException { // writes the two input files read by the sample_by_keys test script
+      writeLinesToFile("input", // tab-separated rows; the test script loads them as (key1, val, dt)
+        "1\ta\t20140201",
+        "4\td\t20140201", // key 4 occurs twice in the table
+        "2\tb\t20110201",
+        "6\tf\t20140201", // key 6 is not listed in input2, so this row is expected to be filtered out
+        "4\td2\t20140301",
+        "3\tc\t20160201" );
+      writeLinesToFile("input2", "1", "2", "3", "4"); // the keys to sample by, one per line
+  }
+
+  /**
+  import 'datafu/sample_by_keys.pig';
+
+  big_table = LOAD 'input' AS (key1: int, val: chararray, dt: chararray);
+  keys = LOAD 'input2' AS (key2: int);
+
+  data = sample_by_keys(big_table, keys, 'key1', 'key2');
+
+  STORE data INTO 'output';
+   */
+  @Multiline
+  private String sampleByKeysTest;
+
+  @Test
+  public void sampleByKeysTest() throws Exception
+  {
+    prepareDataForSampleByKeysTest();
+
+    PigTest test = createPigTestFromString(sampleByKeysTest); // the script is held in the @Multiline field of the same name
+
+    assertOutput(test, "data", // only rows whose key1 is one of the sampled keys 1-4 are expected
+    "(1,a,20140201)",
+    "(2,b,20110201)",
+    "(3,c,20160201)",
+    "(4,d2,20140301)", // both table rows sharing key 4 are kept
+    "(4,d,20140201)");
+  }
 }


[2/2] datafu git commit: Enable test.single by disabling tests for buildSrc

Posted by mh...@apache.org.
Enable test.single by disabling tests for buildSrc


Project: http://git-wip-us.apache.org/repos/asf/datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/datafu/commit/17f034a0
Tree: http://git-wip-us.apache.org/repos/asf/datafu/tree/17f034a0
Diff: http://git-wip-us.apache.org/repos/asf/datafu/diff/17f034a0

Branch: refs/heads/master
Commit: 17f034a076835e956a4efab7fbc3c7bf4102033f
Parents: a0d1366
Author: Matthew Hayes <mh...@apache.org>
Authored: Mon Jul 9 15:19:17 2018 -0700
Committer: Matthew Hayes <mh...@apache.org>
Committed: Mon Jul 9 15:19:17 2018 -0700

----------------------------------------------------------------------
 buildSrc/build.gradle | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/datafu/blob/17f034a0/buildSrc/build.gradle
----------------------------------------------------------------------
diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle
index cd09b54..02c21af 100644
--- a/buildSrc/build.gradle
+++ b/buildSrc/build.gradle
@@ -25,3 +25,4 @@ dependencies {
   compile 'com.github.rholder:gradle-autojar:1.0.1'
 }
 
+test.enabled=false