You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2018/07/09 22:19:23 UTC
[1/2] datafu git commit: DATAFU-127 New macro - sample by keys
Repository: datafu
Updated Branches:
refs/heads/master 8dae166c0 -> 17f034a07
DATAFU-127 New macro - sample by keys
Signed-off-by: Matthew Hayes <mh...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/datafu/commit/a0d1366d
Tree: http://git-wip-us.apache.org/repos/asf/datafu/tree/a0d1366d
Diff: http://git-wip-us.apache.org/repos/asf/datafu/diff/a0d1366d
Branch: refs/heads/master
Commit: a0d1366d5c3a0beecf07f551d7eaf682b7adfdf1
Parents: 8dae166
Author: Eyal Allweil <ea...@paypal.com>
Authored: Mon Jul 9 15:17:51 2018 -0700
Committer: Matthew Hayes <mh...@apache.org>
Committed: Mon Jul 9 15:17:51 2018 -0700
----------------------------------------------------------------------
.../main/resources/datafu/sample_by_keys.pig | 44 ++++++++++++++++++++
.../datafu/test/pig/sampling/SamplingTests.java | 40 +++++++++++++++++-
2 files changed, 83 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/datafu/blob/a0d1366d/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/sample_by_keys.pig b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
new file mode 100644
index 0000000..c22ffc7
--- /dev/null
+++ b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Macro for sampling a table by a list of keys.
+ *
+ * Params:
+ * - table - table name to sample
+ * - sample_set - a set of keys
+ * - join_key_table - join column name in the table
+ * - join_key_sample - join column name in the sample
+ */
+DEFINE sample_by_keys(table, sample_set, join_key_table, join_key_sample) RETURNS out {
+ t = FOREACH $table GENERATE
+ $join_key_table AS join_key,
+ TOTUPLE(*) AS original;
+
+ s = FOREACH $sample_set GENERATE $join_key_sample;
+ sd = DISTINCT s;
+
+ joined = JOIN t BY join_key,
+ sd BY $join_key_sample USING 'replicated';
+
+ flat = FOREACH joined GENERATE FLATTEN(original);
+
+ -- as the previous step is a map-only job, this ORDER makes sure reducers are used so there won't be many output files
+ $out = ORDER flat BY $join_key_table PARALLEL 1;
+};
http://git-wip-us.apache.org/repos/asf/datafu/blob/a0d1366d/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java b/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
index 418a694..6fdb580 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/sampling/SamplingTests.java
@@ -30,7 +30,6 @@ import java.util.Set;
import junit.framework.Assert;
import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
@@ -520,4 +519,43 @@ public class SamplingTests extends PigTests
found.add(i);
}
}
+
+ private void prepareDataForSampleByKeysTest() throws IOException {
+ writeLinesToFile("input",
+ "1\ta\t20140201",
+ "4\td\t20140201",
+ "2\tb\t20110201",
+ "6\tf\t20140201",
+ "4\td2\t20140301",
+ "3\tc\t20160201" );
+ writeLinesToFile("input2", "1", "2", "3", "4");
+ }
+
+ /**
+ import 'datafu/sample_by_keys.pig';
+
+ big_table = LOAD 'input' AS (key1: int, val: chararray, dt: chararray);
+ keys = LOAD 'input2' AS (key2: int);
+
+ data = sample_by_keys(big_table, keys, 'key1', 'key2');
+
+ STORE data INTO 'output';
+ */
+ @Multiline
+ private String sampleByKeysTest;
+
+ @Test
+ public void sampleByKeysTest() throws Exception
+ {
+ prepareDataForSampleByKeysTest();
+
+ PigTest test = createPigTestFromString(sampleByKeysTest);
+
+ assertOutput(test, "data",
+ "(1,a,20140201)",
+ "(2,b,20110201)",
+ "(3,c,20160201)",
+ "(4,d2,20140301)",
+ "(4,d,20140201)");
+ }
}
[2/2] datafu git commit: Enable test.single by disabling tests for
buildSrc
Posted by mh...@apache.org.
Enable test.single by disabling tests for buildSrc
Project: http://git-wip-us.apache.org/repos/asf/datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/datafu/commit/17f034a0
Tree: http://git-wip-us.apache.org/repos/asf/datafu/tree/17f034a0
Diff: http://git-wip-us.apache.org/repos/asf/datafu/diff/17f034a0
Branch: refs/heads/master
Commit: 17f034a076835e956a4efab7fbc3c7bf4102033f
Parents: a0d1366
Author: Matthew Hayes <mh...@apache.org>
Authored: Mon Jul 9 15:19:17 2018 -0700
Committer: Matthew Hayes <mh...@apache.org>
Committed: Mon Jul 9 15:19:17 2018 -0700
----------------------------------------------------------------------
buildSrc/build.gradle | 1 +
1 file changed, 1 insertion(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/datafu/blob/17f034a0/buildSrc/build.gradle
----------------------------------------------------------------------
diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle
index cd09b54..02c21af 100644
--- a/buildSrc/build.gradle
+++ b/buildSrc/build.gradle
@@ -25,3 +25,4 @@ dependencies {
compile 'com.github.rholder:gradle-autojar:1.0.1'
}
+test.enabled=false