You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2019/01/07 15:03:10 UTC
[1/2] datafu git commit: DATAFU-130 Add three-way left outer join macro
Repository: datafu
Updated Branches:
refs/heads/master f7ec4c7ad -> 0d0842719
DATAFU-130 Add three-way left outer join macro
Signed-off-by: matthew.hayes <mh...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/datafu/commit/e81ea866
Tree: http://git-wip-us.apache.org/repos/asf/datafu/tree/e81ea866
Diff: http://git-wip-us.apache.org/repos/asf/datafu/diff/e81ea866
Branch: refs/heads/master
Commit: e81ea8666467fb89ceba37465e5905b9674633f9
Parents: 857cf16
Author: Eyal Allweil <ey...@apache.org>
Authored: Wed Nov 28 11:50:22 2018 +0200
Committer: matthew.hayes <mh...@apache.org>
Committed: Thu Jan 3 09:29:14 2019 -0800
----------------------------------------------------------------------
.../src/main/resources/datafu/count_macros.pig | 11 +++++
.../src/main/resources/datafu/diff_macros.pig | 8 ++++
.../main/resources/datafu/left_outer_join.pig | 39 ++++++++++++++++++
.../main/resources/datafu/sample_by_keys.pig | 13 +++---
.../src/test/java/datafu/test/pig/PigTests.java | 2 +-
.../java/datafu/test/pig/macros/MacroTests.java | 42 ++++++++++++++++++--
6 files changed, 103 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/count_macros.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/count_macros.pig b/datafu-pig/src/main/resources/datafu/count_macros.pig
index 9bebce4..4b5280b 100644
--- a/datafu-pig/src/main/resources/datafu/count_macros.pig
+++ b/datafu-pig/src/main/resources/datafu/count_macros.pig
@@ -17,11 +17,22 @@
* under the License.
*/
+/**
+ * Counts all the rows in a relation
+ *
+ * alias - the relation to count
+ */
DEFINE count_all_non_distinct(alias) returns res {
grp_all = GROUP $alias ALL;
$res = FOREACH grp_all GENERATE COUNT($alias);
};
+/**
+ * Counts all the distinct keys in a relation
+ *
+ * alias - the relation whose distinct keys are counted
+ * key - the field whose distinct values are counted
+ */
DEFINE count_distinct_keys(alias, key) returns res {
just_key = FOREACH $alias GENERATE $key;
dist_data = DISTINCT just_key;
http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/diff_macros.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/diff_macros.pig b/datafu-pig/src/main/resources/datafu/diff_macros.pig
index cacb1b6..9523fe7 100644
--- a/datafu-pig/src/main/resources/datafu/diff_macros.pig
+++ b/datafu-pig/src/main/resources/datafu/diff_macros.pig
@@ -17,6 +17,14 @@
* under the License.
*/
+/**
+ * Produces a human-readable description of the rows and fields changed between two relations.
+ *
+ * diff_macro_old - the old, baseline relation
+ * diff_macro_new - the new relation to be checked
+ * diff_macro_pk - the key on which to join/compare individual rows
+ * diff_macro_ignored_field - an optional field which can be ignored in the comparison, like a timestamp
+ */
DEFINE diff_macro(diff_macro_old, diff_macro_new, diff_macro_pk, diff_macro_ignored_field) returns diffs {
DEFINE TupleDiff datafu.pig.util.TupleDiff;
http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/left_outer_join.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/left_outer_join.pig b/datafu-pig/src/main/resources/datafu/left_outer_join.pig
new file mode 100644
index 0000000..aeea267
--- /dev/null
+++ b/datafu-pig/src/main/resources/datafu/left_outer_join.pig
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
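+ * Performs a left outer join of three relations: every row of relation1 is kept, and the fields of relation2 and relation3 are null where no matching key exists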
+ *
+ * relation1 - the first relation to join
+ * key1 - the field from the first relation on which to group
+ * relation2 - the second relation to join
+ * key2 - the field from the second relation on which to group
+ * relation3 - the third relation to join
+ * key3 - the field from the third relation on which to group
+ *
+ */
+DEFINE left_outer_join(relation1, key1, relation2, key2, relation3, key3) returns joined {
+ DEFINE EmptyBagToNullFields datafu.pig.bags.EmptyBagToNullFields();
+
+ cogrouped = COGROUP $relation1 BY $key1, $relation2 BY $key2, $relation3 BY $key3;
+ $joined = FOREACH cogrouped GENERATE
+ FLATTEN($relation1),
+ FLATTEN(EmptyBagToNullFields($relation2)),
+ FLATTEN(EmptyBagToNullFields($relation3));
+};
http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/sample_by_keys.pig b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
index c22ffc7..def1b03 100644
--- a/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
+++ b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
@@ -17,14 +17,13 @@
* under the License.
*/
-/*
- * Macro for sampling a table by a list of keys.
+/**
+ * Samples a table by a list of keys.
*
- * Params:
- * - table_name - table name to sample
- * - sample_set - a set of keys
- * - join_key_table - join column name in the table
- * - join_key_sample - join column name in the sample
+ * table - the relation (table) to sample
+ * sample_set - a set of keys
+ * join_key_table - join column name in the table
+ * join_key_sample - join column name in the sample
*/
DEFINE sample_by_keys(table, sample_set, join_key_table, join_key_sample) RETURNS out {
t = FOREACH $table GENERATE
http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/PigTests.java b/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
index b869492..d83ff4f 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
@@ -235,7 +235,7 @@ public abstract class PigTests
protected void assertOutput(PigTest test, String alias, String... expected) throws IOException, ParseException
{
List<Tuple> tuples = getLinesForAlias(test, alias);
- assertEquals(expected.length, tuples.size(), "Mismatch in number of tuples");
+ assertEquals(tuples.size(), expected.length, "Mismatch in number of tuples");
int i=0;
for (String e : expected)
{
http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java b/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
index 17d0af5..7a3e7b9 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
@@ -38,8 +38,7 @@ public class MacroTests extends PigTests
STORE cnt INTO 'output';
*/
- @Multiline
- private String countDistinctTest;
+ @Multiline private static String countDistinctTest;
@Test
public void countDistinctTest() throws Exception
@@ -74,8 +73,7 @@ public class MacroTests extends PigTests
STORE cnt INTO 'output';
*/
- @Multiline
- private String countTest;
+ @Multiline private static String countTest;
@Test
public void countTest() throws Exception
@@ -99,4 +97,40 @@ public class MacroTests extends PigTests
assertOutput(test, "cnt", "(31)");
}
+ /**
+
+ import 'datafu/left_outer_join.pig';
+
+ data1 = LOAD 'first' AS (id:chararray, num1:int);
+ data2 = LOAD 'second' AS (id2:chararray, num2:int);
+ data3 = LOAD 'third' AS (id:chararray, num3:int);
+
+ joined = left_outer_join(data1, id, data2, id2, data3, id);
+ STORE joined INTO 'output';
+
+ */
+ @Multiline private static String leftOuterJoinTest;
+
+ @Test
+ public void leftOuterJoinTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(leftOuterJoinTest);
+
+ writeLinesToFile("first","A1\t1","A2\t2","A3\t3","A4\t4","A5\t5","A6\t6");
+
+ writeLinesToFile("second","A1\t11","B2\t12","A3\t13","A4\t14","B5\t15","B6\t16");
+
+ writeLinesToFile("third","A1\t111","A2\t112","A3\t113","B4\t114","A5\t115", "C6\t116");
+
+ test.runScript();
+
+ assertOutput(test, "joined",
+ "(A1,1,A1,11,A1,111)",
+ "(A2,2,,,A2,112)",
+ "(A3,3,A3,13,A3,113)",
+ "(A4,4,A4,14,,)",
+ "(A5,5,,,A5,115)",
+ "(A6,6,,,,)"
+ );
+ }
}
[2/2] datafu git commit: Merge branch 'DATAFU-130'
Posted by mh...@apache.org.
Merge branch 'DATAFU-130'
Project: http://git-wip-us.apache.org/repos/asf/datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/datafu/commit/0d084271
Tree: http://git-wip-us.apache.org/repos/asf/datafu/tree/0d084271
Diff: http://git-wip-us.apache.org/repos/asf/datafu/diff/0d084271
Branch: refs/heads/master
Commit: 0d0842719c1ecc8b9569d076f7fc1656cc3f36a8
Parents: f7ec4c7 e81ea86
Author: matthew.hayes <mh...@apache.org>
Authored: Mon Jan 7 07:01:04 2019 -0800
Committer: matthew.hayes <mh...@apache.org>
Committed: Mon Jan 7 07:01:04 2019 -0800
----------------------------------------------------------------------
.../src/main/resources/datafu/count_macros.pig | 11 +++++
.../src/main/resources/datafu/diff_macros.pig | 8 ++++
.../main/resources/datafu/left_outer_join.pig | 39 ++++++++++++++++++
.../main/resources/datafu/sample_by_keys.pig | 13 +++---
.../src/test/java/datafu/test/pig/PigTests.java | 2 +-
.../java/datafu/test/pig/macros/MacroTests.java | 42 ++++++++++++++++++--
6 files changed, 103 insertions(+), 12 deletions(-)
----------------------------------------------------------------------